1 抓取的网址是360团购
http://tuan.360.cn/bei_jing/c_0.html?kw=电影&pageno=1#tuanFilter
2 利用firefox的FireBug插件分析其源代码,如下所示:
"//*/h3[@class=‘desc‘]" 匹配电影院名称 "//*/span [@class=‘discount‘]" 匹配原价 "//*/span [@class=‘price‘]" 匹配优惠价 "//*/div [@class=‘other-info clearfix‘]" 匹配备注信息 "//*/div[@class=‘source clearfix‘]" 匹配团购信息来源
3 源代码
## 利用RCurl抓取电影团购信息 library(RCurl) library(XML) library("plyr") page <- 1:5 urlist[page] <- paste("http://tuan.360.cn/bei_jing/c_0.html?kw=电影&pageno=",page,"#tuanFilter",sep="") #伪造报头 myheader=c("User-Agent"="Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ", "Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language"="en-us", "Connection"="keep-alive", "Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7" ) #电影院名称 dyy_name<-c("") #原价 last_price<-c("") #优惠价格 now_price<-c("") #备注 others<-c("") #来源 tg_source<-c("") for(url in urlist){ #下载网址 webpage <- getURL(url,httpheader=myheader,.encoding="utf-8") #转化成XML格式 pagetree <- htmlTreeParse(webpage,encoding="utf-8", error=function(...){}, useInternalNodes = TRUE,trim=TRUE) #利用XPATH筛选 temp_name <- xpathSApply (pagetree, "//*/h3[@class=‘desc‘]", xmlValue) dyy_name<-c(dyy_name,temp_name) temp_price <- xpathSApply (pagetree, "//*/span [@class=‘discount‘]",xmlValue) last_price<-c(last_price,temp_price) temp_now_price <- xpathSApply (pagetree, "//*/span [@class=‘price‘]",xmlValue) now_price<-c(now_price,temp_now_price) temp_others <- xpathSApply (pagetree, "//*/div [@class=‘other-info clearfix‘]",xmlValue) others<-c(others,temp_others) temp_tg_source <- xpathSApply (pagetree, "//*/div[@class=‘source clearfix‘]", xmlValue) tg_source<-c(tg_source,temp_tg_source) } # 删除不必要的信息 tg_source<-laply(as.list(tg_source),function(x){ unlist(strsplit(x,"\n"))[2] }) # 删除不必要的信息 others<-laply(as.list(others),function(x){ unlist(strsplit(x,"\n"))[2] }) #组装成数据库 content<-data.frame(dyy_name,last_price,now_price,others,tg_source) names(content)<-c(‘影院‘,‘原价‘,‘优惠价‘,‘备注‘,‘来源‘) #写入csv文件 write.csv(content,file="电影团购信息.csv")
利用RCurl抓取电影团购信息,布布扣,bubuko.com
时间: 2024-10-05 16:10:36