今天学到了一个 WebMagic 爬虫的小技巧:想要自定义爬取内容的保存形式,可以不用重写 Pipeline,而是直接在 process() 方法中写上想要的保存操作,多数情况下可以达到相同的效果。我的爬虫程序想要将内容保存到一个 txt 文件中,就是这么实现的,个人感觉简单很多。这个技巧也是看了网上高手的文章才学到的,受益匪浅。
爬取北京市政府信件的爬虫到此就完成了,全部代码如下。我的保存格式是用空格隔开不同的信息,方便导入数据库:
package my.webmagic2;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.utils.HttpConstant;

/**
 * WebMagic page processor that crawls the letter listing of www.beijing.gov.cn
 * and appends one space-separated line per letter detail page to a text file.
 *
 * <p>Listing responses are scanned for {@code letter_type} / {@code original_id}
 * pairs and the matching detail pages are queued. Detail pages are parsed with
 * fixed XPath expressions; two page layouts are supported, which differ only in
 * an extra {@code form} element in the path.
 *
 * <p>Records are written directly from {@link #process(Page)} instead of through
 * a custom pipeline. Because the spider runs with several threads, all file
 * appends are serialized on a class-wide lock.
 */
public class text implements PageProcessor {

    /** Starts at 1 and is incremented once for every record successfully appended. */
    public static int h = 1;

    /** Endpoint that returns one page of the letter listing (queried via POST). */
    private static final String LIST_URL =
            "http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.replyMailList.biz.ext";

    /** Common prefix of every detail-page URL. */
    private static final String DETAIL_URL_PREFIX = "http://www.beijing.gov.cn/hudong/hdjl/";

    /** File that receives the extracted records (append mode). */
    private static final String OUTPUT_FILE = "/home/hadoop/xinjian";

    /** Directory shared by the file pipeline and the file-backed scheduler. */
    private static final String CACHE_DIR = "./xinjian/";

    /** Number of letters requested per listing page. */
    private static final int PAGE_SIZE = 6;

    /** Index of the last listing page requested by {@link #main(String[])}. */
    private static final int LAST_PAGE_INDEX = 5586;

    /** XPath prefix of the detail layout without a wrapping form element. */
    private static final String PLAIN_LAYOUT = "/html/body/div[2]/div/div[2]";

    /** XPath prefix of the detail layout whose content is wrapped in a form element. */
    private static final String FORM_LAYOUT = "/html/body/div[2]/div/div[2]/form";

    private static final Pattern LETTER_TYPE =
            Pattern.compile("\"letter_type\"\\s*:\\s*\"([^\"]*)\"");

    private static final Pattern ORIGINAL_ID =
            Pattern.compile("\"original_id\"\\s*:\\s*\"([^\"]*)\"");

    /** Whitespace in all the forms it can take in the extracted text/HTML. */
    private static final Pattern WHITESPACE = Pattern.compile("(?:&nbsp;|[\\s\\u00A0])+");

    /** Guards appends to {@link #OUTPUT_FILE} and updates of {@link #h}. */
    private static final Object OUTPUT_LOCK = new Object();

    // NOTE(review): the retry sleep time is presumably in milliseconds, which would
    // make 3 a very short pause - confirm whether 3000 was intended.
    private Site site = Site.me().setRetrySleepTime(3).setSleepTime(100);

    /**
     * Number of listing pages this instance has handled. Kept public for
     * compatibility; page dispatch no longer depends on it.
     */
    public int check = 0;

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Handles one downloaded page: listing pages queue their detail pages, detail
     * pages are converted into one record line and appended to the output file.
     *
     * <p>The page kind is decided from the request URL. The previous first-call
     * flag treated whichever page this instance happened to see first as the
     * listing, which depends on processing order.
     *
     * <p>A detail page matching neither known layout is stored under the result
     * field {@code "all"} so the configured pipeline can persist its raw HTML.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getRequest().getUrl();
        if (url != null && url.startsWith(LIST_URL)) {
            synchronized (this) {
                check++;
            }
            queueDetailPages(page);
            return;
        }

        String record = extractRecord(page, PLAIN_LAYOUT);
        if (record == null) {
            record = extractRecord(page, FORM_LAYOUT);
        }
        if (record == null) {
            page.putField("all", page.getHtml().toString());
            return;
        }
        appendRecord(record, url);
    }

    /** Queues the detail page of every letter found in a listing response. */
    private static void queueDetailPages(Page page) {
        String body = page.getRawText();
        if (body == null) {
            return;
        }
        List<String> types = captureAll(LETTER_TYPE, body);
        List<String> ids = captureAll(ORIGINAL_ID, body);
        // Pair by position; stop at the shorter list instead of running out of bounds.
        int count = Math.min(types.size(), ids.size());
        for (int i = 0; i < count; i++) {
            String id = ids.get(i);
            if (!id.isEmpty()) {
                page.addTargetRequest(detailUrl(types.get(i), id));
            }
        }
    }

    /** Returns the first capture group of every match of {@code pattern} in {@code input}. */
    private static List<String> captureAll(Pattern pattern, String input) {
        List<String> values = new ArrayList<String>();
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            values.add(matcher.group(1));
        }
        return values;
    }

    /** Builds the detail URL for a letter; any type other than the first two uses the complaint flow. */
    private static String detailUrl(String letterType, String originalId) {
        String flow;
        if ("咨询".equals(letterType)) {
            flow = "com.web.consult.consultDetail.flow";
        } else if ("建议".equals(letterType)) {
            flow = "com.web.suggest.suggesDetail.flow";
        } else {
            flow = "com.web.complain.complainDetail.flow";
        }
        return DETAIL_URL_PREFIX + flow + "?originalId=" + originalId;
    }

    /**
     * Builds the record line for a detail page using the layout rooted at
     * {@code base}, or returns {@code null} when the page does not use that layout.
     * Missing fields become empty strings rather than raising an exception.
     */
    private static String extractRecord(Page page, String base) {
        // The <strong> element is the marker used to recognise the layout.
        if (select(page, base + "/div[1]/div[1]/div[2]/strong") == null) {
            return null;
        }
        String reply = base + "/div[2]/div/div[1]";
        // The non-zero skip counts drop a fixed number of leading characters,
        // presumably a caption in front of the value - verify against the live page.
        String[] fields = {
            field(page, base + "/div[1]/div[1]/div[2]/strong/text()", 0),
            field(page, base + "/div[1]/div[2]/div[1]/text()", 4),
            field(page, base + "/div[1]/div[2]/div[2]/text()", 3),
            field(page, base + "/div[1]/div[2]/div[3]/label/text()", 0),
            field(page, base + "/div[1]/div[3]/text()", 0),
            field(page, reply + "/div[1]/div[2]/text()", 0),
            field(page, reply + "/div[1]/div[3]/text()", 5),
            replyText(select(page, reply + "/div[2]"))
        };
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < fields.length; i++) {
            if (i > 0) {
                line.append(' ');
            }
            line.append(fields[i]);
        }
        return line.append('\n').toString();
    }

    /** Returns the first XPath match on the page, or {@code null} when nothing matches. */
    private static String select(Page page, String xpath) {
        return page.getHtml().xpath(xpath).toString();
    }

    /**
     * Reads a text field, drops its first {@code skip} characters and removes all
     * whitespace; yields an empty string when the field is missing or too short.
     */
    private static String field(Page page, String xpath, int skip) {
        String value = select(page, xpath);
        if (value == null || value.length() < skip) {
            return "";
        }
        return stripWhitespace(value.substring(skip));
    }

    /** Removes the wrapper/paragraph tags and all whitespace from the reply HTML fragment. */
    private static String replyText(String html) {
        if (html == null) {
            return "";
        }
        String text = html
                .replace("<div class=\"col-xs-12 col-md-12 column p-4 text-muted my-3\">", "")
                .replace("</div>", "")
                .replace("<p>", "")
                .replace("</p>", "");
        return stripWhitespace(text);
    }

    /**
     * Removes spaces, line breaks, non-breaking spaces and {@code &nbsp;} entities so
     * that a field can never contain the space used as the column separator.
     */
    private static String stripWhitespace(String value) {
        return WHITESPACE.matcher(value).replaceAll("");
    }

    /**
     * Appends one record to the output file as UTF-8. The writer is always closed,
     * and appends from concurrent spider threads are serialized.
     */
    private static void appendRecord(String record, String url) {
        synchronized (OUTPUT_LOCK) {
            try (Writer out = new OutputStreamWriter(
                    new FileOutputStream(new File(OUTPUT_FILE), true), StandardCharsets.UTF_8)) {
                out.write(record);
            } catch (IOException e) {
                System.err.println("Failed to append record for " + url + " to " + OUTPUT_FILE);
                e.printStackTrace();
                return;
            }
            h++;
        }
    }

    /** Builds the JSON body that requests {@link #PAGE_SIZE} letters starting at offset {@code begin}. */
    private static String listRequestBody(int begin) {
        // Straight double quotes: the typographic quotes in the published listing are not valid JSON.
        return "{\"PageCond/begin\":" + begin
                + ",\"PageCond/length\":" + PAGE_SIZE
                + ",\"PageCond/isCount\":\"true\""
                + ",\"keywords\":\"\""
                + ",\"orgids\":\"\""
                + ",\"startDate\":\"\""
                + ",\"endDate\":\"\""
                + ",\"letterType\":\"2\""
                + ",\"letterStatue\":\"\"}";
    }

    /**
     * Crawls every listing page in turn. Each listing page gets its own spider run,
     * which blocks until that page and the detail pages it queued are processed.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        for (int pageIndex = 0; pageIndex <= LAST_PAGE_INDEX; pageIndex++) {
            Request request = new Request(LIST_URL);
            request.setMethod(HttpConstant.Method.POST);
            request.setRequestBody(
                    HttpRequestBody.json(listRequestBody(pageIndex * PAGE_SIZE), "utf-8"));

            Spider.create(new text())
                    .addRequest(request)
                    .addPipeline(new FilePipeline(CACHE_DIR))
                    .setScheduler(new FileCacheQueueScheduler(CACHE_DIR))
                    .thread(5)
                    .run();
            System.out.println("完成" + pageIndex);
        }
        System.out.println("全部完成");
    }
}
原文地址:https://www.cnblogs.com/my---world/p/12313824.html
时间: 2024-11-02 05:47:12