爬虫平台设置代理ip

首先从国外某个网站爬取免费的代理 IP 信息并存入 MongoDB；接着在代码中进行代理设置：

在爬虫客户端抽象类中添加属性 enableProxy（布尔类型，用于控制是否启用代理，定义见下文完整代码）:

设置代理的代码其实就以下几句:

// 1 selects Firefox's manual proxy configuration, so the host/port preferences below take effect.
firefoxProfile.setPreference("network.proxy.type", 1);
firefoxProfile.setPreference("network.proxy.no_proxies_on", "localhost, 127.0.0.1"); // hosts that bypass the proxy and are reached directly (this is a bypass list, not a fallback for when no proxy is available)

// Proxy used for plain HTTP requests.
firefoxProfile.setPreference("network.proxy.http", proxyHttp.getIp());
firefoxProfile.setPreference("network.proxy.http_port", proxyHttp.getPort());

// Proxy used for HTTPS requests.
firefoxProfile.setPreference("network.proxy.ssl", proxyHttps.getIp());
firefoxProfile.setPreference("network.proxy.ssl_port", proxyHttps.getPort());

以下是具体实现代码:

/**
 * Abstract base class for spider (crawler) clients.
 * Its lifecycle is as follows:
 * setSpiderDao→setRootUrl→setParamsMap→init→runSpider→returnData→destory
 */
public abstract class SpiderClient {

private static final Logger logger = LoggerFactory.getLogger(SpiderClient.class);
// Data-access object; the proxy lookup methods cast it to MongoSpiderDao.
protected SpiderDao spiderDao;
// Result holder; created in init() with an empty id list.
protected SpiderData spiderData;
// Browser driver; created in init() as a FirefoxDriver.
protected WebDriver driver;
protected String rootUrl;
protected Map<String, Object> params;
private String collection;
// When true, init() configures the Firefox profile to use a manually configured proxy.
protected boolean enableProxy;

// getters and setters omitted

/**
 * Initialization step of the client lifecycle.
 *
 * <p>Builds a Firefox profile (no CSS, no images, no Flash, silent image
 * downloads into a {@code material_images} directory under the JVM temp
 * directory), optionally applies proxy settings when {@code enableProxy} is
 * set, starts the {@code FirefoxDriver} with a 30 second implicit wait and
 * creates an empty {@code SpiderData} with an empty id list.
 */
public void init(){

    FirefoxProfile firefoxProfile = new FirefoxProfile();

    // Disable CSS
    firefoxProfile.setPreference("permissions.default.stylesheet", 2);
    // Disable images
    firefoxProfile.setPreference("permissions.default.image", 2);
    // Disable Flash
    firefoxProfile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", false);

    // Default download behaviour
    // Do not show the download progress window
    firefoxProfile.setPreference("browser.download.manager.showWhenStarting", false);
    // browser.download.folderList selects the default download folder:
    // 0 = desktop, 1 = "My Downloads", 2 = custom directory
    firefoxProfile.setPreference("browser.download.folderList", 2);
    // A custom path requires browser.download.folderList to be 2.
    // Join parent and child through File so a separator is inserted when
    // java.io.tmpdir has no trailing one (e.g. "/tmp" must not become
    // "/tmpmaterial_images").
    String downloadDir = new java.io.File(System.getProperty("java.io.tmpdir"), "material_images").getPath();
    firefoxProfile.setPreference("browser.download.dir", downloadDir);
    // MIME types that are saved to disk without asking
    firefoxProfile.setPreference("browser.helperApps.neverAsk.saveToDisk","image/gif,image/png,image/jpeg,image/bmp,image/webp");
    /* Alternative MIME list for document downloads:
    firefoxProfile.setPreference("browser.helperApps.neverAsk.saveToDisk",
    "application/zip,text/plain,application/vnd.ms-excel,text/csv,text/comma-separated-values,application/octet-stream,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    */

    // proxy
    if(enableProxy){
        applyProxyPreferences(firefoxProfile);
    }

    this.driver = new FirefoxDriver(firefoxProfile);
    this.driver.manage().timeouts().implicitlyWait(30, TimeUnit.SECONDS);
    this.spiderData = new SpiderData();
    this.spiderData.setIds(new ArrayList<String>());

}

/**
 * Switches the profile to manual proxy configuration and sets the HTTP and
 * HTTPS proxy host/port from randomly chosen stored proxies. A protocol for
 * which no proxy is found gets no host/port preference and a warning is logged.
 */
private void applyProxyPreferences(FirefoxProfile firefoxProfile){
    // 1 = manual proxy configuration
    firefoxProfile.setPreference("network.proxy.type", 1);
    // Hosts that bypass the proxy and are reached directly
    firefoxProfile.setPreference("network.proxy.no_proxies_on", "localhost, 127.0.0.1");

    ProxyIP proxyHttp = getProxyIPForHttp();
    if(proxyHttp!=null){
        firefoxProfile.setPreference("network.proxy.http", proxyHttp.getIp());
        firefoxProfile.setPreference("network.proxy.http_port", proxyHttp.getPort());
        logger.info("Set http proxy: {}:{}",proxyHttp.getIp(),proxyHttp.getPort());
    } else {
        logger.warn("enableProxy is set but no http proxy was found");
    }

    ProxyIP proxyHttps = getProxyIPForHttps();
    if(proxyHttps!=null){
        firefoxProfile.setPreference("network.proxy.ssl", proxyHttps.getIp());
        firefoxProfile.setPreference("network.proxy.ssl_port", proxyHttps.getPort());
        logger.info("Set https proxy: {}:{}",proxyHttps.getIp(),proxyHttps.getPort());
    } else {
        logger.warn("enableProxy is set but no https proxy was found");
    }
}

// Proxies are taken from the "China" entries (relatively better signal and faster);
// no other country is queried here.

/**
 * Picks a random stored HTTP proxy.
 *
 * @return a randomly chosen proxy, or {@code null} when none is stored
 */
private ProxyIP getProxyIPForHttp(){
    return pickRandomProxy("HTTP");
}

/**
 * Picks a random stored HTTPS proxy.
 *
 * @return a randomly chosen proxy, or {@code null} when none is stored
 */
private ProxyIP getProxyIPForHttps(){
    return pickRandomProxy("HTTPS");
}

/**
 * Queries up to 20 proxy records of the given protocol for "China" from
 * MongoDB and returns one of them at random.
 *
 * <p>{@code spiderDao} is cast to {@code MongoSpiderDao}, so this throws
 * {@code ClassCastException} when a different DAO implementation is set.
 *
 * @param protocol protocol name passed to the DAO query, "HTTP" or "HTTPS"
 * @return a randomly chosen proxy, or {@code null} when the query yields nothing
 */
private ProxyIP pickRandomProxy(String protocol){
    MongoSpiderDao mongoSpiderDao = (MongoSpiderDao) spiderDao;
    List<ProxyIP> candidates = mongoSpiderDao.getProxyIP(protocol, "China", 20);
    if(candidates==null || candidates.isEmpty()){
        return null;
    }
    // Upper bound is exclusive, so the index is always within the list.
    return candidates.get(RandomUtils.nextInt(0, candidates.size()));
}

...

}

时间： 2024-08-27 02:07:51

爬虫平台设置代理ip

爬虫平台设置代理ip的相关文章

java爬虫常用设置代理IP教程

爬虫-设置代理ip

浏览器怎么设置代理IP？四种浏览器设置代理IP的方法

网络爬虫一定用代理IP吗？不用代理IP加快速度会被封吗？

python爬取准备四定义Opener和设置代理IP

【python爬虫】加密代理IP的使用与设置一套session请求头

构建一个给爬虫使用的代理IP池

Python 爬虫抓取代理IP，并检测联通性

python爬虫爬取代理IP