import pycurl, StringIO, json, time, re, sys, urllib2
from lxml import etree

# headers = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#     "Accept-Encoding": "gzip, deflate, sdch",
#     "Accept-Language": "zh-CN,zh;q=0.8",
#     "Cache-Control": "max-age=0",
#     "Connection": "keep-alive",
#     "Cookie": "Hm_lvt_fa633287999535c3e5f5a63e82308549=1462868485; Hm_lpvt_fa633287999535c3e5f5a63e82308549=1462868485; CNZZDATA5838747=cnzz_eid%3D1693591872-1459152412-http%253A%252F%252Fwww.1396app.com%252F%26ntime%3D1462865237",
#     "Host": "www.1396app.com",
#     "Upgrade-Insecure-Requests": "1",
#     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
# }

reload(sys)
sys.setdefaultencoding('utf-8')

def gethtml(url, headers=None):
    c = pycurl.Curl()                          # build a Curl object
    # c.setopt(pycurl.REFERER, 'http://qy.m.58.com/')  # set the Referer
    c.setopt(pycurl.FOLLOWLOCATION, True)      # follow redirects automatically
    c.setopt(pycurl.MAXREDIRS, 5)              # cap how many redirects to follow
    c.setopt(pycurl.CONNECTTIMEOUT, 60)        # connection timeout
    c.setopt(pycurl.TIMEOUT, 120)              # download timeout
    c.setopt(pycurl.ENCODING, 'gzip,deflate')  # handle gzip: some sites return a gzipped page even when the request never asked for gzip
    # c.setopt(c.PROXY, ip)                    # proxy
    c.fp = StringIO.StringIO()
    c.setopt(pycurl.URL, url)                  # the URL to fetch
    # c.setopt(pycurl.HTTPHEADER, headers)     # pass the request headers
    c.setopt(c.WRITEFUNCTION, c.fp.write)      # write callback into the string buffer
    c.perform()
    code = c.getinfo(c.HTTP_CODE)              # status code
    html = c.fp.getvalue()                     # page source
    return html

write_key = open('key.txt', 'a+')
for list_url in range(0, 441):
    url = 'http://www.icaile.com/tag/gl-45-%s.html' % list_url
    for key in re.findall(r'title="(.*?)"', gethtml(url)):
        key = key.decode('utf-8', 'ignore')
        write_key.write(key + '\n')
        print key
Adding the headers is optional either way, but I'd suggest including them (see the sketch below for how they'd be passed).
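A minimal sketch of passing them, assuming the headers dict commented out at the top. One catch: pycurl's HTTPHEADER option takes a list of "Name: value" strings, not a dict, so it needs converting first (inside gethtml, before c.perform()):

header_list = ['%s: %s' % (k, v) for k, v in headers.items()]  # dict -> list of "Name: value" strings
c.setopt(pycurl.HTTPHEADER, header_list)                       # now pycurl will send them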
Also, pulling out the title="" values with re.findall at the end is a clumsy technique; XPath is more precise... this is just here for my own records.
url_range = etree.HTML(gethtml(url).decode('utf-8', 'ignore'))   # parse the fetched source
dateil_url = url_range.xpath('/html/body/div[3]/div[3]/div[1]/div[1]/div/ul/li[1]/a')[0].text
print dateil_url
This is written pretty roughly; there's no deduplication (a set fixes that, as in the sketch below).
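A minimal dedup sketch of the same loop, tracking already-written keywords in a set; it assumes the fixed gethtml above, which returns the page source:

seen = set()
write_key = open('key.txt', 'a+')
for list_url in range(0, 441):
    url = 'http://www.icaile.com/tag/gl-45-%s.html' % list_url
    for key in re.findall(r'title="(.*?)"', gethtml(url)):
        key = key.decode('utf-8', 'ignore')
        if key in seen:              # skip keywords already written out
            continue
        seen.add(key)
        write_key.write(key + '\n')
        print key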
As for what use this has for SEO: I find keyword scraping the most satisfying part... With a bit more polish it could stand in for the 火车头 (LocoySpider) collector, and it would be far more efficient, especially written multi-threaded (pycurl reportedly has multi-handle support of its own); a threaded sketch follows.
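A minimal threaded sketch under these assumptions: it reuses the gethtml above (which creates a fresh Curl handle per call, so no handle is shared across threads), and the worker count of 8 is an arbitrary choice:

import threading, Queue

def worker(url_queue, lock, out_file):
    while True:
        try:
            url = url_queue.get_nowait()   # grab the next page; quit when the queue drains
        except Queue.Empty:
            return
        for key in re.findall(r'title="(.*?)"', gethtml(url)):
            with lock:                     # serialize writes to the shared file
                out_file.write(key.decode('utf-8', 'ignore') + '\n')

url_queue = Queue.Queue()
for i in range(0, 441):
    url_queue.put('http://www.icaile.com/tag/gl-45-%s.html' % i)

lock = threading.Lock()
out_file = open('key.txt', 'a+')
threads = [threading.Thread(target=worker, args=(url_queue, lock, out_file)) for _ in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()
out_file.close()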