抓下来返回text是这样的:
需要用到的包:下载地址:http://pan.baidu.com/s/1o69myOq
两个类的代码
WeiboCN.java
import java.util.Set;
import cn.edu.hfut.dmic.webcollector.*;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
public class WeiboCN {

    /*
     * Code provided by WebCollector. When used outside of WebCollector, the
     * selenium-related jars must be added to the classpath.
     */

    /**
     * Logs in to weibo.cn through a headless HtmlUnit browser and returns the
     * resulting session cookies as a single {@code name=value;name=value;} string.
     *
     * <p>This works for weibo.cn only, not for weibo.com. The login page is
     * requested over plain HTTP, so the credentials travel unencrypted; use a
     * throwaway account.
     *
     * @param username Sina Weibo user name
     * @param password Sina Weibo password
     * @return every cookie held by the browser after submitting the login form,
     *         each formatted as {@code name=value;} and concatenated
     * @throws Exception if the returned cookies do not contain
     *         {@code gsid_CTandWM} (login is treated as failed), or if the
     *         browser automation itself fails
     */
    public static String getSinaCookie(String username, String password) throws Exception {
        HtmlUnitDriver driver = new HtmlUnitDriver();
        Set<Cookie> cookieSet;
        try {
            driver.setJavascriptEnabled(true);
            driver.get("http://login.weibo.cn/login/");

            WebElement mobile = driver.findElementByCssSelector("input[name=mobile]");
            mobile.sendKeys(username);

            // Prefix match: only the start of the field name is relied upon.
            WebElement pass = driver.findElementByCssSelector("input[name^=password]");
            pass.sendKeys(password);

            WebElement rem = driver.findElementByCssSelector("input[name=remember]");
            rem.click();

            WebElement submit = driver.findElementByCssSelector("input[name=submit]");
            submit.click();

            cookieSet = driver.manage().getCookies();
        } finally {
            // Release the browser even when a lookup or click above throws;
            // previously the driver leaked on every failure path.
            driver.close();
        }

        StringBuilder sb = new StringBuilder();
        for (Cookie cookie : cookieSet) {
            sb.append(cookie.getName()).append('=').append(cookie.getValue()).append(';');
        }

        String result = sb.toString();
        // The presence of this cookie name is used as the login-success marker.
        if (!result.contains("gsid_CTandWM")) {
            throw new Exception("weibo login failed");
        }
        return result;
    }
}
WeiboCrawler.java
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl;
public class WeiboCrawler extends DeepCrawler{
public WeiboCrawler(String crawlPath) throws Exception {
super(crawlPath);
/*获取新浪微博的cookie,账号密码以明文形式传输,请使用小号*/
String cookie=WeiboCN.getSinaCookie("你的微博账号", "密码");
HttpRequesterImpl myRequester=(HttpRequesterImpl) this.getHttpRequester();
myRequester.setCookie(cookie);
}
public Links visitAndGetNextLinks(Page page) {
/*抽取微博*/
Elements weibos=page.getDoc().select("div.c");
for(Element weibo:weibos){
System.out.println(weibo.text());
}
/*如果要爬取评论,这里可以抽取评论页面的URL,返回*/
return null;
}
public static void main(String[] args) throws Exception{
WeiboCrawler crawler=new WeiboCrawler("/home/hu/data/weibo");
crawler.setThreads(3);
/*对某人微博前5页进行爬取*/
for(int i=0;i<5;i++){
crawler.addSeed("http://weibo.cn/vipgcu?vt=4&page="+i);
}
crawler.start(1);
}
}
本文作者:By: 罗坚元 :http://blog.csdn.net/sunyuan_software
时间: 2024-12-24 13:01:30