import java.io.IOException; import java.security.SecureRandom; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.ArrayList; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import javax.net.ssl.HostnameVerifier; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSession; import javax.net.ssl.X509TrustManager; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; /* CREATE TABLE `ing` ( `id` int(11) unsigned NOT NULL, `url` varchar(500) DEFAULT NULL, `user` varchar(100) DEFAULT NULL, `date` varchar(30) DEFAULT NULL, `content` varchar(5000) DEFAULT NULL, `lucky` tinyint(4) DEFAULT NULL, `userlink` varchar(500) DEFAULT NULL, `mtime` timestamp NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8; CREATE TABLE `comment` ( `id` int(11) unsigned NOT NULL, `ingid` int(11) DEFAULT NULL, `user` varchar(100) DEFAULT NULL, `content` varchar(5000) DEFAULT NULL, `date` varchar(30) DEFAULT NULL, `userlink` varchar(100) DEFAULT NULL, `mtime` timestamp NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8; * */ public class IngCrawler { static { try { HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() { public boolean verify(String hostname, SSLSession session) { return true; } }); SSLContext context = SSLContext.getInstance("TLS"); context.init(null, new X509TrustManager[] { new X509TrustManager() { public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { } public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { } public 
X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; } } }, new SecureRandom()); HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory()); } catch (Exception e) { e.printStackTrace(); } } public static void main(String[] args) throws IOException { int id = Inserter.getNextId(); int lastestid = Crawler.getLastestId(); for (; id <= lastestid; id++) { Crawler.crawl("https://ing.cnblogs.com/u/1/status/" + id, id); } } static class Ing { int id; String url; String user; String date; String content; boolean lucky; String userlink; List<Comment> comments = new ArrayList<Comment>(); @Override public String toString() { StringBuilder sb = new StringBuilder(id + " - [" + date + "][" + user + "] - " + content); for (Comment c : this.comments) { sb.append("\n\t" + c); } return sb.toString(); } static Ing parseIng(Document doc, String url, int id) { Ing ing = new Ing(); ing.id = id; ing.url = url; if (doc.select(".ing_detail_title").size() == 0) { return ing; } ing.user = doc.select(".ing_item_author").text().trim(); ing.userlink = doc.select(".ing_item_author").attr("href"); ing.date = doc.select(".ing_detail_title").text().trim(); if (ing.date.indexOf(":") != -1) { ing.date = ing.date.substring(ing.date.indexOf(":") + 1).trim(); } ing.content = doc.select("#ing_detail_body").text().trim(); ing.lucky = doc.select(".ing_icon_lucky").size() > 0; for (Element e : doc.select("#comment_block_" + id).get(0).children()) { ing.comments.add(Comment.parseComment(e, id)); } return ing; } static class Comment { int id; int ingid; String user; String content; String date; String userlink; static Comment parseComment(Element e, int ingid) { Comment comment = new Comment(); comment.id = Integer.parseInt(e.id().substring(8)); comment.ingid = ingid; comment.user = e.select("#comment_author_" + comment.id).text().trim(); comment.userlink = e.select("#comment_author_" + comment.id).attr("href"); comment.date = e.select(".text_green").attr("title").trim(); 
e.select("#comment_author_" + comment.id).remove(); e.select(".text_green").remove(); e.select(".gray3").remove(); comment.content = e.select("div").text().trim(); if (comment.content.startsWith(":")) { comment.content = comment.content.substring(1).trim(); } return comment; } @Override public String toString() { return "[" + user + "] - " + content; } } } static class Crawler implements Runnable { static ExecutorService crawler = Executors.newFixedThreadPool(10); String url; int id; public Crawler(String url, int id) { this.url = url; this.id = id; } public static int getLastestId() { return 1054304; } public static void crawl(String url, int id) { crawler.execute(new Crawler(url, id)); } @Override public void run() { System.out.println("crawl for: " + url); try { String cookie = "YOUR COOKIE HERE"; String useragent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"; Inserter.insert(Ing.parseIng(Jsoup.connect(url).header("cookie", cookie).userAgent(useragent).get(), url, id)); } catch (IOException e) { e.printStackTrace(); } } } static class Inserter implements Runnable { static ExecutorService inserter = Executors.newFixedThreadPool(1); static Connection conn; static PreparedStatement pstating, pstatcmt; static { try { Class.forName("com.mysql.jdbc.Driver"); conn = DriverManager.getConnection( "jdbc:mysql://localhost:3306/ing?useUnicode=true&characterEncoding=utf-8&autoReconnect=true", "root", ""); pstating = conn .prepareStatement("insert into ing (id,url,user,date,content,lucky,userlink) values (?,?,?,?,?,?,?)"); pstatcmt = conn .prepareStatement("insert into comment (id,ingid,user,content,date,userlink) values (?,?,?,?,?,?)"); } catch (Exception e) { e.printStackTrace(); } } Ing ing; public Inserter(Ing ing) { this.ing = ing; } public static int getNextId() { try { Statement stat = conn.createStatement(); ResultSet rs = stat.executeQuery("select max(id) as id from ing"); if (rs.next()) { 
return rs.getInt("id") + 1; } } catch (SQLException e) { // ignore } return 1; } static int no = 0; public static void insert(Ing ing) { inserter.execute(new Inserter(ing)); } @Override public void run() { System.out.println(++no + ". " + ing); try { pstating.setInt(1, ing.id); pstating.setString(2, ing.url); pstating.setString(3, ing.user); pstating.setString(4, ing.date); pstating.setString(5, ing.content); pstating.setInt(6, ing.lucky ? 1 : 0); pstating.setString(7, ing.userlink); pstating.executeUpdate(); for (Ing.Comment c : ing.comments) { pstatcmt.setInt(1, c.id); pstatcmt.setInt(2, c.ingid); pstatcmt.setString(3, c.user); pstatcmt.setString(4, c.content); pstatcmt.setString(5, c.date); pstatcmt.setString(6, c.userlink); pstatcmt.executeUpdate(); } } catch (SQLException e) { System.err.println("ERROR - " + e.getMessage() + " - " + ing); } } } }
// Time: 2024-10-06 05:22:53