Crawling photo galleries with HttpClient

Dependencies: HttpClient 4.2, Jsoup, and Commons IO (for IOUtils).
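
If you pull these with Maven or Gradle, the coordinates are roughly as follows (the exact version numbers are my assumption for a 4.2-era stack, not from the original snippet; adjust as needed):

    org.apache.httpcomponents:httpclient:4.2.5
    org.jsoup:jsoup:1.7.2
    commons-io:commons-io:2.4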

SemeiziCrawler.java

package kidbei.learn.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Example post: http://sejie.wanxun.org/post/2012-09-25/40039413449
 * @author Administrator
 */
public class SemeiziCrawler {
    private static final String BASEHOST = "http://sejie.wanxun.org/";
    private static DefaultHttpClient client = ConnectionManager.getHttpClient();
    static String url = "http://sejie.wanxun.org/post/2012-09-25/40039413449"; // sample post, kept for testing
    private static String IMGPATH = "D:\\sexpicture\\色戒美眉图"+File.separator+StringUtil.getDate();
    static int STARTPAGE = 1;
    static int PAGECOUNT = 100;

    public static void main(String[] args) {
        File f = new File(IMGPATH);
        if(!f.exists()){
            f.mkdirs();
        }
        String host = BASEHOST;
        for (int i = STARTPAGE; i < PAGECOUNT; i++) {
            if (i != 1) {
                host = BASEHOST + "page/" + i;
            }
            System.out.println("Fetching page " + i);
            String pageContext = getResultByUrl(host);
            List<String> articleURLS = getArticleURL(pageContext);
            if (articleURLS == null) { // page failed to load or parse
                continue;
            }
            for (String articleURL : articleURLS) {
                String articleContext = getResultByUrl(articleURL);
                List<String> imgURLS = getImgURLS(articleContext);
                if (imgURLS == null) {
                    continue;
                }
                for (String imgURL : imgURLS) {
                    savepic(imgURL);
                }
            }
        }
    }
    /**
     * Fetch the page at the given URL and return its body as a string.
     * @param url the page URL
     * @return the page HTML, or null if the request failed
     */
    public static String getResultByUrl(String url){
        System.out.println("Opening " + url);
        HttpGet get = new HttpGet(url);
        HttpEntity entity = null;
        HttpResponse response = null;
        try {
            response = client.execute(get);
            entity = response.getEntity();
            if (entity != null) {
                InputStream is = entity.getContent();
                StringWriter sw = new StringWriter();
                IOUtils.copy(is, sw, "UTF-8");
                is.close();
                sw.close();
                return sw.toString();
            }
        } catch (Exception e) {
            get.abort(); // give up on error so a half-read connection is not reused
            System.out.println("Failed to open " + url);
            return null;
        } finally {
            try {
                EntityUtils.consume(entity); // release the connection back to the pool
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }
    /**
     * Extract the URLs of all posts on the current page.
     * @param pageContext the page HTML
     * @return the post URLs, or null if the page could not be parsed
     */
    public static List<String> getArticleURL(String pageContext){
        if (pageContext == null) {
            return null;
        }
        List<String> articleURLS = new ArrayList<String>();
        System.out.println("Looking for posts...");
        try {
            Document doc = Jsoup.parseBodyFragment(pageContext);
            Elements es = doc.select("div.post");
            es = es.select("div[class=post-item type-photo]");
            es = es.select("div.meta a:containsOwn(全文)"); // "全文" is the "full text" link on each post
            for (Element e : es) {
                articleURLS.add(e.attr("href"));
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
        return articleURLS;
    }
    /**
     * Extract the image URLs from a post.
     * @param articleContext the post HTML
     * @return the image URLs, or null if the post could not be fetched
     */
    public static List<String> getImgURLS(String articleContext){
        if (articleContext == null) {
            return null;
        }
        List<String> imgURLS = new ArrayList<String>();
        System.out.println("Collecting image URLs...");
        Document doc = Jsoup.parse(articleContext);
        Elements es = doc.select("a[target=_blank] img[src]");
        for (Element e : es) {
            imgURLS.add(e.attr("src"));
        }
        return imgURLS;
    }
    /**
     * Download one image and save it under IMGPATH.
     * @param imgURL the image URL
     */
    public static void savepic(String imgURL){
        if (imgURL == null) {
            return;
        }
        HttpGet get = new HttpGet(imgURL);
        String[] strs = imgURL.split("/");
        String fileName = strs[strs.length - 1]; // last path segment is the file name
        String savePath = IMGPATH + File.separator + fileName;
        HttpEntity entity = null;
        InputStream is = null;
        OutputStream os = null;
        try {
            HttpResponse response = client.execute(get);
            entity = response.getEntity();
            if (entity == null) {
                return;
            }
            System.out.println("Saving image: " + fileName);
            is = entity.getContent();
            os = new FileOutputStream(savePath);
            IOUtils.copy(is, os);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Failed to save " + fileName);
        } finally {
            IOUtils.closeQuietly(os);
            IOUtils.closeQuietly(is);
        }
    }
}

StringUtil.java

package kidbei.learn.crawler;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;

public class StringUtil {
    /** @return a random six-letter lowercase string */
    public static String getRandomString(){
        StringBuilder generateRandStr = new StringBuilder();
        Random rand = new Random();
        int length = 6;
        for (int i = 0; i < length; i++) {
            char ch = (char) (rand.nextInt(26) + 'a'); // random lowercase letter a-z
            generateRandStr.append(ch);
        }
        return generateRandStr.toString();
    }

    public static String getSavePath(String IMGPATH,String fileName){
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
        String date = sdf.format(new Date()).toString();
        if(!(fileName.endsWith(".jpg"))){
            fileName = fileName + ".jpg";
        }
        String randStr = StringUtil.getRandomString();
        return IMGPATH+File.separator+date+File.separator+randStr+fileName;
    }

    public static String getDate(){
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
        return sdf.format(new Date()).toString();
    }
}
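
For reference, a minimal usage sketch of these helpers (the date and random suffix in the comments are illustrative, and the demo class is mine, not part of the original snippet). Note that SemeiziCrawler builds its save path directly from IMGPATH and the file name, so getSavePath is not actually used by the crawler above:

package kidbei.learn.crawler;

public class StringUtilDemo {
    public static void main(String[] args) {
        System.out.println(StringUtil.getDate());         // e.g. 20120925
        System.out.println(StringUtil.getRandomString()); // e.g. kqzxwa
        System.out.println(StringUtil.getSavePath("D:\\pics", "001"));
        // e.g. D:\pics\20120925\kqzxwa001.jpg (".jpg" is appended when missing)
    }
}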

ConnectionManager.java

package kidbei.learn.crawler;

import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;

public class ConnectionManager {
    static final int TIMEOUT = 20000;    // connection timeout (ms)
    static final int SO_TIMEOUT = 20000; // socket read timeout (ms)
    static String UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1" +
            " (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1";

    public static DefaultHttpClient getHttpClient(){
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(
                new Scheme("http",80,PlainSocketFactory.getSocketFactory()));
        schemeRegistry.register(
                new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

        PoolingClientConnectionManager  cm = new PoolingClientConnectionManager(schemeRegistry);
        cm.setMaxTotal(500);
        cm.setDefaultMaxPerRoute(200);

        HttpParams params = new BasicHttpParams();
        params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,TIMEOUT);
        params.setParameter(CoreConnectionPNames.SO_TIMEOUT, SO_TIMEOUT);
        params.setParameter(CoreProtocolPNames.USER_AGENT, UA);

        DefaultHttpClient client = new DefaultHttpClient(cm,params);
        return client;
    }
}
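
Note: DefaultHttpClient and the HttpParams API used above are specific to the 4.2-era HttpClient and were deprecated in 4.3. For reference only, a roughly equivalent setup against HttpClient 4.3+ (the builder-based API) might look like the following sketch; it is not part of the original snippet:

package kidbei.learn.crawler;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

public class ConnectionManager43 {
    public static CloseableHttpClient getHttpClient() {
        // Pooling connection manager with the same limits as the 4.2 version
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(500);
        cm.setDefaultMaxPerRoute(200);

        // Connect and socket timeouts, formerly set via HttpParams
        RequestConfig rc = RequestConfig.custom()
                .setConnectTimeout(20000)
                .setSocketTimeout(20000)
                .build();

        return HttpClients.custom()
                .setConnectionManager(cm)
                .setDefaultRequestConfig(rc)
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1"
                        + " (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1")
                .build();
    }
}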

Source: http://www.oschina.net/code/snippet_257479_14524#23843
