依赖:HttpClient 4.2、Jsoup、commons-io
SemeiziCrawler.java
package kidbei.learn.crawler; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.StringWriter; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * http://sejie.wanxun.org/post/2012-09-25/40039413449 * @author Administrator * */ public class SemeiziCrawler { private static final String BASEHOST = "http://sejie.wanxun.org/"; private static DefaultHttpClient client = ConnectionManager.getHttpClient(); static String url = "http://sejie.wanxun.org/post/2012-09-25/40039413449"; private static String IMGPATH = "D:\\sexpicture\\色戒美眉图"+File.separator+StringUtil.getDate(); static int STARTPAGE = 1; static int PAGECOUNT = 100; public static void main(String[] args) { File f = new File(IMGPATH); if(!f.exists()){ f.mkdirs(); } String host = BASEHOST ; for(int i=STARTPAGE;i<PAGECOUNT;i++){ if(i != 1){ host = BASEHOST+"page/"+i; } System.out.println("进入第"+i+"页"); String pageContext = getResultByUrl(host); // System.out.println(pageContext); List<String>articleURLS = getArticleURL(pageContext); for(String articleURL:articleURLS){ String articleContext = getResultByUrl(articleURL); List<String> ImgURLS = getImgURLS(articleContext); for(String ImgURL:ImgURLS){ savepic(ImgURL); } } } // String articleContext = getResultByUrl(url); // List<String> strs = getImgURLS(articleContext); // for(String str:strs){ // System.out.println(str); // } } /** * 根据url获取页面 * @param url * @return */ public static String getResultByUrl(String url){ System.out.println("打开网页"+url); HttpGet get = new 
HttpGet(url); HttpEntity entity = null; HttpResponse response = null; try { response = client.execute(get); entity = response.getEntity(); if(entity != null){ InputStream is = entity.getContent(); StringWriter sw = new StringWriter(); IOUtils.copy(is, sw, "UTF-8"); is.close(); sw.close(); return sw.toString(); } } catch (Exception e) { System.out.println("网页打开出错"); return null; }finally{ get.abort(); try { EntityUtils.consume(entity); } catch (IOException e) { e.printStackTrace(); } } return null; } /** * 找出当前页面中所有帖子的地址 * @param pageStr 网页字符串 * @return */ public static List<String> getArticleURL(String pageContext){ if(pageContext == null){ return null; } List<String> articleURLS = new ArrayList<String>(); System.out.println("寻找帖子..........."); try { Document doc = Jsoup.parseBodyFragment(pageContext); Elements es = doc.select("div.post"); es = es.select("div[class=post-item type-photo]"); es = es.select("div.meta a:containsOwn(全文)"); for(Element e:es){ articleURLS.add(e.attr("href")); } } catch (Exception e) { e.printStackTrace(); return null; } return articleURLS; } /** * 获取帖子的图片地址 * @param articleURLS * @return */ public static List<String> getImgURLS(String articleContext){ List<String>ImgURLS = new ArrayList<String>(); if(articleContext == null){ return null; } System.out.println("获取图片地址-----------"); Document doc = Jsoup.parse(articleContext); Elements es = doc.select("a[target=_blank] img[src]"); for(Iterator<Element> i=es.iterator();i.hasNext();){ Element e = i.next(); ImgURLS.add(e.attr("src")); } return ImgURLS; } /** * 保存图片 * @param ImgURL */ public static void savepic(String ImgURL){ if(ImgURL == null){ return ; } HttpGet get = new HttpGet(ImgURL); String[] strs = ImgURL.split("/"); String fileName = strs[strs.length-1]; String savePath = IMGPATH+File.separator+fileName; HttpEntity entity = null; try { HttpResponse response = client.execute(get); entity = response.getEntity(); System.out.println("保存图片>>>>.>>>>>>"+fileName); InputStream is = 
entity.getContent(); OutputStream os = new FileOutputStream(savePath); IOUtils.copy(is, os); IOUtils.closeQuietly(os); IOUtils.closeQuietly(is); } catch (Exception e) { e.printStackTrace(); System.out.println("图片保存失败"); return ; } } }
StringUtil.java
package kidbei.learn.crawler; import java.io.File; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Random; public class StringUtil { public static String getRandomString(){ StringBuffer generateRandStr = new StringBuffer(); Random rand = new Random(); int length = 6; char ch; for(int i=0;i<length;i++) { int randNum = Math.abs(rand.nextInt())%26+97; // 产生97到122的随机数(a-z的键位值) ch = ( char ) randNum; generateRandStr.append( ch ); } return generateRandStr.toString(); } public static String getSavePath(String IMGPATH,String fileName){ SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd"); String date = sdf.format(new Date()).toString(); if(!(fileName.endsWith(".jpg"))){ fileName = fileName + ".jpg"; } String randStr = StringUtil.getRandomString(); return IMGPATH+File.separator+date+File.separator+randStr+fileName; } public static String getDate(){ SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd"); return sdf.format(new Date()).toString(); } }
ConnectionManager.java
package kidbei.learn.crawler; import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.conn.ssl.SSLSocketFactory; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.params.HttpParams; public class ConnectionManager { static final int TIMEOUT = 20000;//连接超时时间 static final int SO_TIMEOUT = 20000;//数据传输超时 static String UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1" + " (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"; public static DefaultHttpClient getHttpClient(){ SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register( new Scheme("http",80,PlainSocketFactory.getSocketFactory())); schemeRegistry.register( new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry); cm.setMaxTotal(500); cm.setDefaultMaxPerRoute(200); HttpParams params = new BasicHttpParams(); params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,TIMEOUT); params.setParameter(CoreConnectionPNames.SO_TIMEOUT, SO_TIMEOUT); params.setParameter(CoreProtocolPNames.USER_AGENT, UA); DefaultHttpClient client = new DefaultHttpClient(cm,params); return client; } }
本文转自:http://www.oschina.net/code/snippet_257479_14524#23843
时间: 2024-10-10 02:52:57