// IngCrawler

import java.io.IOException;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.X509TrustManager;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/*
CREATE TABLE `ing` (
  `id` int(11) unsigned NOT NULL,
  `url` varchar(500) DEFAULT NULL,
  `user` varchar(100) DEFAULT NULL,
  `date` varchar(30) DEFAULT NULL,
  `content` varchar(5000) DEFAULT NULL,
  `lucky` tinyint(4) DEFAULT NULL,
  `userlink` varchar(500) DEFAULT NULL,
  `mtime` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE `comment` (
  `id` int(11) unsigned NOT NULL,
  `ingid` int(11) DEFAULT NULL,
  `user` varchar(100) DEFAULT NULL,
  `content` varchar(5000) DEFAULT NULL,
  `date` varchar(30) DEFAULT NULL,
  `userlink` varchar(100) DEFAULT NULL,
  `mtime` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 * */
public class IngCrawler {
    /*
     * Installs JVM-wide defaults on HttpsURLConnection that accept every hostname and
     * every certificate chain.
     *
     * WARNING: this switches off TLS server verification for all HttpsURLConnection
     * traffic in this JVM, not only for the crawled site. Any failure while installing
     * the defaults is printed and otherwise ignored.
     */
    static {
        try {
            // Hostname verifier that approves whatever host the session reports.
            HostnameVerifier acceptAnyHost = new HostnameVerifier() {
                @Override
                public boolean verify(String hostname, SSLSession session) {
                    return true;
                }
            };
            HttpsURLConnection.setDefaultHostnameVerifier(acceptAnyHost);

            // Trust manager whose checks never throw, i.e. every chain is accepted.
            X509TrustManager trustEverything = new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    return new X509Certificate[0];
                }
            };

            SSLContext permissive = SSLContext.getInstance("TLS");
            permissive.init(null, new X509TrustManager[] { trustEverything }, new SecureRandom());
            HttpsURLConnection.setDefaultSSLSocketFactory(permissive.getSocketFactory());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Queues one crawl task per status id, from the first id not yet stored up to the
     * latest known id, then waits for both worker pools to drain.
     *
     * <p>Both pools use non-daemon threads, so without an explicit shutdown the JVM
     * would never exit once the work is done.
     *
     * @param args unused
     * @throws IOException not thrown by this body; kept so the signature is unchanged
     */
    public static void main(String[] args) throws IOException {
        int id = Inserter.getNextId();
        int lastestid = Crawler.getLastestId();

        for (; id <= lastestid; id++) {
            Crawler.crawl("https://ing.cnblogs.com/u/1/status/" + id, id);
        }

        try {
            // Crawl tasks hand their results to the insert pool, so the crawl pool must
            // be fully drained before the insert pool stops accepting work.
            Crawler.crawler.shutdown();
            Crawler.crawler.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);

            Inserter.inserter.shutdown();
            Inserter.inserter.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            // Asked to stop early: cancel outstanding work and keep the interrupt flag set.
            Crawler.crawler.shutdownNow();
            Inserter.inserter.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /**
     * One status ("ing") scraped from a status page, together with the comments found
     * under it. Instances are created by {@link #parseIng(Document, String, int)}.
     */
    static class Ing {
        int id;
        String url;
        String user;
        String date;
        String content;
        boolean lucky;
        String userlink;
        List<Comment> comments = new ArrayList<Comment>();

        /**
         * Renders the status on one line, followed by one tab-indented line per comment.
         */
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder(id + " - [" + date + "][" + user + "] - " + content);
            for (Comment c : this.comments) {
                sb.append("\n\t").append(c);
            }
            return sb.toString();
        }

        /**
         * Extracts a status and its comments from a fetched page.
         *
         * <p>When the page has no {@code .ing_detail_title} element, the returned object
         * carries only {@code id} and {@code url}; every other field keeps its default.
         * A page with a title but no {@code #comment_block_<id>} element yields a status
         * with an empty comment list.
         *
         * @param doc the parsed page
         * @param url the address the page was fetched from, stored as-is
         * @param id  the status id, also used to locate the comment block
         * @return the populated status, never {@code null}
         */
        static Ing parseIng(Document doc, String url, int id) {
            Ing ing = new Ing();
            ing.id = id;
            ing.url = url;

            if (doc.select(".ing_detail_title").size() == 0) {
                return ing;
            }

            ing.user = doc.select(".ing_item_author").text().trim();
            ing.userlink = doc.select(".ing_item_author").attr("href");

            // Keep only what follows the first ':' of the title text, if there is one.
            String title = doc.select(".ing_detail_title").text().trim();
            int colon = title.indexOf(":");
            ing.date = colon != -1 ? title.substring(colon + 1).trim() : title;

            ing.content = doc.select("#ing_detail_body").text().trim();
            ing.lucky = doc.select(".ing_icon_lucky").size() > 0;

            // first() yields null when nothing matches, whereas get(0) would throw and
            // discard the whole status.
            Element commentBlock = doc.select("#comment_block_" + id).first();
            if (commentBlock != null) {
                for (Element e : commentBlock.children()) {
                    ing.comments.add(Comment.parseComment(e, id));
                }
            }
            return ing;
        }

        /**
         * A single comment attached to a status.
         */
        static class Comment {
            int id;
            int ingid;
            String user;
            String content;
            String date;
            String userlink;

            /**
             * Builds a comment from one child element of the comment block.
             *
             * <p>The numeric comment id is read from the element's {@code id} attribute
             * after skipping its first 8 characters (presumably a {@code comment_}
             * prefix; confirm against the page markup). The author, {@code .text_green}
             * and {@code .gray3} nodes are removed from {@code e} so that only the
             * comment body remains in the extracted text; the element is therefore
             * modified by this call.
             *
             * @param e     the element holding one comment; modified in place
             * @param ingid id of the status the comment belongs to
             * @return the populated comment
             * @throws NumberFormatException     if the id suffix is not an integer
             * @throws IndexOutOfBoundsException if the element id is shorter than 8 characters
             */
            static Comment parseComment(Element e, int ingid) {
                Comment comment = new Comment();
                comment.id = Integer.parseInt(e.id().substring(8));
                comment.ingid = ingid;

                String authorSelector = "#comment_author_" + comment.id;
                comment.user = e.select(authorSelector).text().trim();
                comment.userlink = e.select(authorSelector).attr("href");
                comment.date = e.select(".text_green").attr("title").trim();

                // Remove everything that is not comment body before reading the text.
                e.select(authorSelector).remove();
                e.select(".text_green").remove();
                e.select(".gray3").remove();

                comment.content = e.select("div").text().trim();
                if (comment.content.startsWith(":")) {
                    comment.content = comment.content.substring(1).trim();
                }

                return comment;
            }

            /**
             * Renders the comment as {@code [user] - content}.
             */
            @Override
            public String toString() {
                return "[" + user + "] - " + content;
            }
        }

    }

    /**
     * Task that downloads one status page, parses it and hands the result to
     * {@code Inserter}. Tasks run on a shared pool of 10 threads.
     */
    static class Crawler implements Runnable {
        static ExecutorService crawler = Executors.newFixedThreadPool(10);

        /** Value sent in the {@code cookie} request header; placeholder to be replaced. */
        private static final String COOKIE = "YOUR COOKIE HERE";

        /** Value sent as the user agent of every request. */
        private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36";

        String url;
        int id;

        /**
         * @param url address of the status page to fetch
         * @param id  status id passed on to the parser
         */
        public Crawler(String url, int id) {
            this.url = url;
            this.id = id;
        }

        /**
         * Returns the highest status id to crawl; currently a hard-coded constant.
         */
        public static int getLastestId() {
            return 1054304;
        }

        /**
         * Queues a crawl of {@code url} on the shared crawler pool.
         *
         * @param url address of the status page to fetch
         * @param id  status id passed on to the parser
         */
        public static void crawl(String url, int id) {
            crawler.execute(new Crawler(url, id));
        }

        /**
         * Fetches and parses the page, then queues the result for insertion.
         *
         * <p>Failures are reported on standard error together with the URL and do not
         * propagate, so one bad page neither terminates the worker thread nor goes
         * unattributed.
         */
        @Override
        public void run() {
            System.out.println("crawl for: " + url);
            try {
                Document page = Jsoup.connect(url).header("cookie", COOKIE).userAgent(USER_AGENT).get();
                Inserter.insert(Ing.parseIng(page, url, id));
            } catch (IOException e) {
                System.err.println("ERROR - failed to fetch " + url);
                e.printStackTrace();
            } catch (RuntimeException e) {
                // Unexpected markup makes the parser throw unchecked exceptions.
                System.err.println("ERROR - failed to process " + url);
                e.printStackTrace();
            }
        }

    }

    /**
     * Task that stores one parsed status and its comments in MySQL.
     *
     * <p>All tasks run on a pool with a single thread, so the shared prepared statements
     * and the {@code no} counter are only touched by that one thread.
     */
    static class Inserter implements Runnable {
        static ExecutorService inserter = Executors.newFixedThreadPool(1);

        static Connection conn;
        static PreparedStatement pstating, pstatcmt;

        static {
            try {
                Class.forName("com.mysql.jdbc.Driver");
                conn = DriverManager.getConnection(
                        "jdbc:mysql://localhost:3306/ing?useUnicode=true&characterEncoding=utf-8&autoReconnect=true", "root", "");

                pstating = conn
                        .prepareStatement("insert into ing (id,url,user,date,content,lucky,userlink) values (?,?,?,?,?,?,?)");
                pstatcmt = conn
                        .prepareStatement("insert into comment (id,ingid,user,content,date,userlink) values (?,?,?,?,?,?)");
            } catch (Exception e) {
                // Without a connection and both statements nothing below can work; fail
                // here with the real cause instead of a later NullPointerException.
                throw new ExceptionInInitializerError(e);
            }
        }

        Ing ing;

        /**
         * @param ing the status to store
         */
        public Inserter(Ing ing) {
            this.ing = ing;
        }

        /**
         * Returns the id to resume crawling from: one more than the largest id stored in
         * the {@code ing} table.
         *
         * @return {@code max(id) + 1}, or 1 when the query fails or returns no row
         */
        public static int getNextId() {
            Statement stat = null;
            ResultSet rs = null;
            try {
                stat = conn.createStatement();
                rs = stat.executeQuery("select max(id) as id from ing");
                if (rs.next()) {
                    return rs.getInt("id") + 1;
                }
            } catch (SQLException e) {
                // Still fall back to 1, but say so: restarting from 1 re-crawls everything.
                System.err.println("ERROR - could not read max(id) from ing, starting from 1 - " + e.getMessage());
            } finally {
                closeQuietly(rs);
                closeQuietly(stat);
            }
            return 1;
        }

        /** Closes a result set, ignoring a null argument or a failure to close. */
        private static void closeQuietly(ResultSet rs) {
            if (rs == null) {
                return;
            }
            try {
                rs.close();
            } catch (SQLException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }

        /** Closes a statement, ignoring a null argument or a failure to close. */
        private static void closeQuietly(Statement stat) {
            if (stat == null) {
                return;
            }
            try {
                stat.close();
            } catch (SQLException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }

        static int no = 0;

        /**
         * Queues {@code ing} for insertion on the single-threaded inserter pool.
         *
         * @param ing the status to store
         */
        public static void insert(Ing ing) {
            inserter.execute(new Inserter(ing));
        }

        /**
         * Prints the status with a running number, inserts it, then inserts each of its
         * comments.
         *
         * <p>If the status row cannot be inserted, its comments are skipped. A comment
         * that cannot be inserted is reported and the remaining comments are still tried.
         */
        @Override
        public void run() {
            System.out.println(++no + ". " + ing);
            try {
                pstating.setInt(1, ing.id);
                pstating.setString(2, ing.url);
                pstating.setString(3, ing.user);
                pstating.setString(4, ing.date);
                pstating.setString(5, ing.content);
                pstating.setInt(6, ing.lucky ? 1 : 0);
                pstating.setString(7, ing.userlink);
                pstating.executeUpdate();
            } catch (SQLException e) {
                System.err.println("ERROR - " + e.getMessage() + " - " + ing);
                return;
            }

            for (Ing.Comment c : ing.comments) {
                try {
                    pstatcmt.setInt(1, c.id);
                    pstatcmt.setInt(2, c.ingid);
                    pstatcmt.setString(3, c.user);
                    pstatcmt.setString(4, c.content);
                    pstatcmt.setString(5, c.date);
                    pstatcmt.setString(6, c.userlink);
                    pstatcmt.executeUpdate();
                } catch (SQLException e) {
                    // One bad comment must not cost the rest of the thread.
                    System.err.println("ERROR - " + e.getMessage() + " - comment " + c.id + " of ing " + c.ingid + " - " + c);
                }
            }
        }

    }

}
// Time: 2024-10-06 05:22:53
//
// Articles related to IngCrawler