项目的目录结构
核心源码:
package cn.edu.zyt.spider; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.Properties; import cn.edu.zyt.spider.model.SpiderParams; import cn.edu.zyt.spider.queue.UrlQueue; import cn.edu.zyt.spider.worker.SpiderWorker; public class SpiderStarter { public static void main(String[] args){ System.setProperty("java.net.useSystemProxies", "true"); System.setProperty("http.proxyHost", "113.128.9.37"); System.setProperty("http.proxyPort", "9999"); System.setProperty("https.proxyHost", "113.128.9.37"); System.setProperty("https.proxyPort", "9999"); // 初始化配置参数 initializeParams(); // 初始化爬取队列 initializeQueue(); // 创建worker线程并启动 for(int i = 1; i <= SpiderParams.WORKER_NUM; i++){ new Thread(new SpiderWorker(i)).start(); } } /** * 初始化配置文件参数 */ private static void initializeParams(){ InputStream in; try { in = new BufferedInputStream(new FileInputStream("conf/spider.properties")); Properties properties = new Properties(); properties.load(in); // 从配置文件中读取参数 SpiderParams.WORKER_NUM = Integer.parseInt(properties.getProperty("spider.threadNum")); SpiderParams.DEYLAY_TIME = Integer.parseInt(properties.getProperty("spider.fetchDelay")); in.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } /** * 准备初始的爬取链接 */ private static void initializeQueue(){ // 例如,需要抓取天下粮仓信息,根据链接规则生成URLs放入带抓取队列http://www.cofeed.com/national_1.html for(int i = 0; i < 3; i += 1){ UrlQueue.addElement("http://www.cofeed.com/national_" + i+".html"); } } }
实现效果图:
由于页面代码较多就不一一粘贴了,获取完整源码可在博客下方留言哈
原文地址:https://www.cnblogs.com/zyt-bg/p/10637350.html
时间: 2024-11-10 01:33:13