Uses Apache HttpClient to crawl pages and HtmlCleaner to parse the crawled HTML:
package cn.sniper.spider.util;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.json.JSONArray;
import org.json.JSONObject;
import org.junit.Before;
import org.junit.Test;

/**
 * Demo of crawling a page with Apache HttpClient and extracting data from it
 * with HtmlCleaner (XPath) and org.json.
 *
 * <p>Fixes over the original: HTTP client/response and file streams are now
 * closed via try-with-resources (they previously leaked), XPath results are
 * checked before indexing, and the image download uses a buffered copy.
 */
public class SpiderUtil {

    // Raw HTML of the start page, fetched once before each test by init().
    private String pageContent;

    /**
     * Downloads the start page and caches its HTML in {@link #pageContent}.
     * Leaves {@code pageContent} null if the fetch fails.
     */
    @Before
    public void init() {
        HttpClientBuilder builder = HttpClients.custom();
        String url = "http://www.2345.com/";
        HttpGet request = new HttpGet(url);
        // try-with-resources: the original leaked both the client and the response.
        try (CloseableHttpClient client = builder.build();
             CloseableHttpResponse resp = client.execute(request)) {
            HttpEntity entity = resp.getEntity();
            pageContent = EntityUtils.toString(entity);
        } catch (IOException e) {
            // ClientProtocolException is a subclass of IOException, so one catch suffices.
            e.printStackTrace();
        }
    }

    /**
     * Fetches the whole page and prints its raw HTML.
     */
    @Test
    public void testDownload1() {
        HttpClientBuilder builder = HttpClients.custom();
        String url = "http://www.2345.com/";
        HttpGet request = new HttpGet(url);
        try (CloseableHttpClient client = builder.build();
             CloseableHttpResponse resp = client.execute(request)) {
            HttpEntity entity = resp.getEntity();
            String pageContent = EntityUtils.toString(entity);
            System.out.println(pageContent);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts and prints the text content of one element located by XPath.
     */
    @Test
    public void testDownload2() {
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode rootNode = cleaner.clean(pageContent);
        // First h1 under the element with id="name"; if there is only one,
        // //*[@id="name"]h1 would also work.
        String xPathExpression = "//*[@id=\"name\"]h1[1]";
        try {
            Object[] objs = rootNode.evaluateXPath(xPathExpression);
            // Guard against an empty result: the original cast objs[0] blindly
            // and would throw ArrayIndexOutOfBoundsException on a miss.
            if (objs != null && objs.length > 0) {
                TagNode node = (TagNode) objs[0];
                System.out.println(node.getText());
            }
        } catch (XPatherException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads an attribute value by name (the src of an image element) and
     * downloads the referenced image to D:/1.gif.
     */
    @Test
    public void testDownload3() {
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode rootNode = cleaner.clean(pageContent);
        String xPathExpression = "//*[@id=\"j_search_img\"]";
        try {
            Object[] objs = rootNode.evaluateXPath(xPathExpression);
            if (objs == null || objs.length == 0) {
                return; // element not found on the page
            }
            TagNode node = (TagNode) objs[0];
            String src = node.getAttributeByName("src");
            // Note: the URL needs the "http://" prefix, otherwise
            // java.net.MalformedURLException: no protocol.
            URL url = new URL("http://www.2345.com/" + src);
            URLConnection conn = url.openConnection();
            // try-with-resources closes both streams even on error; the
            // original leaked them if the copy threw. Buffered copy replaces
            // the byte-at-a-time loop.
            try (InputStream is = conn.getInputStream();
                 FileOutputStream fos = new FileOutputStream("D:/1.gif")) {
                byte[] buffer = new byte[8192];
                int n;
                while ((n = is.read(buffer)) != -1) {
                    fos.write(buffer, 0, n);
                }
            }
            System.out.println(src);
        } catch (XPatherException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches a page that returns JSON data and prints one field of the
     * first array element.
     */
    @Test
    public void testDownload4() {
        HttpClientBuilder builder = HttpClients.custom();
        String url = "http://www.2345.com/";
        HttpGet request = new HttpGet(url);
        try (CloseableHttpClient client = builder.build();
             CloseableHttpResponse resp = client.execute(request)) {
            HttpEntity entity = resp.getEntity();
            String pageContent = EntityUtils.toString(entity);
            JSONArray jsonArray = new JSONArray(pageContent);
            JSONObject jsonObj = (JSONObject) jsonArray.get(0);
            System.out.println(jsonObj.get("price"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
时间: 2024-10-10 05:08:52