碰到的问题:
1. list 索引越界(IndexError):xpath 查询的结果可能是空列表,直接取 [0] 会越界,所以在查询语句外面加了:
try:
    ...  # xpath 查询并取 [0]
except IndexError: pass
一个简单的爬虫程序
import csv
import os
from urllib.parse import urljoin

import requests
from lxml import etree

# Create the CSV output file; "w+" creates it if missing and truncates it
# otherwise. newline="" is what the csv module expects (prevents blank rows
# on Windows) and an explicit UTF-8 encoding keeps the Chinese text from
# failing on platforms whose default codec cannot represent it.
f = open("house3.csv", "w+", newline="", encoding="utf-8")

# CSV writer shared by every spiderData() call.
csv_file = csv.writer(f)

# The header is written exactly once here (not once per scraped page) and
# has one column per value that spiderData() writes in each row.
csv_file.writerow(["标题", "价格", "地址", "图片"])

# Browser-like request headers sent with every HTTP request.
head = {
    "User-Agent": "User-Agent Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0)"
}

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Characters that cannot (or should not) appear in a file name.
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {ch: "_" for ch in '\\/:*?"<>|\r\n\t'}
)


def getsource(url):
    """Fetch *url* with the shared headers and return the response body as text."""
    sourceHtml = requests.get(url, headers=head, timeout=REQUEST_TIMEOUT)
    return sourceHtml.text


def _safe_filename(title):
    """Return *title* stripped and with file-system-unsafe characters replaced."""
    return title.strip().translate(_UNSAFE_FILENAME_CHARS) or "untitled"


def _save_image(img_url, title):
    """Download *img_url* and store it as ``<title>.png`` in the working directory.

    A failed download is reported and skipped so that one broken image does
    not abort the whole crawl.
    """
    try:
        r = requests.get(img_url, headers=head, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as exc:
        print("image download failed for {!r}: {}".format(img_url, exc))
        return
    # The context manager closes the file even if the write fails.
    with open("{}.png".format(_safe_filename(title)), "wb") as f1:
        f1.write(r.content)


def spiderData(url):
    """Scrape one listing page and record every complete listing.

    For each listing found on the page at *url* this prints the extracted
    fields, downloads the listing image, and appends a row
    ``[title, price + unit, address, image URL]`` to the shared CSV writer.
    Listings missing any of those fields are skipped.
    """
    rtext = getsource(url)
    # Parse the returned markup into an element tree that supports XPath.
    html = etree.HTML(rtext)
    div_list = html.xpath(
        '//div[contains(@class, "js-tips-list")]'
        '/div[contains(@class, "f-list-item")]'
    )
    for item in div_list:
        try:
            title = item.xpath('.//dd[contains(@class, "title")]/a/text()')[0]
            price = item.xpath(
                './/dd[contains(@class,"info")]/div[@class="price"]'
                '/span[@class="num"]/text()'
            )[0]
            yue = item.xpath(
                './/dd[contains(@class,"info")]/div[@class="price"]'
                '/span[@class="yue"]/text()'
            )[0]
            address = item.xpath(
                './/dd[contains(@class, "address")]'
                '//a[@class="address-eara"]/text()'
            )[0]
            # Query relative to this listing; querying the document root
            # would return the first image on the page for every listing.
            img = item.xpath('.//div[@class="img-wrap"]//img/@src')[0]
        except IndexError:
            # A field is missing (empty XPath result): skip this listing
            # instead of falling through with stale or undefined values.
            continue

        # Resolve relative or protocol-relative image paths against the page
        # URL; absolute URLs come back unchanged.
        img = urljoin(url, img)

        print(title, price + yue, address, img)
        _save_image(img, title)
        csv_file.writerow([title, price + yue, address, img])


def main():
    """Crawl listing pages 2 through 4 and close the CSV file when done."""
    base_url = "http://cs.ganji.com/zufang/b2/"
    try:
        # NOTE(review): the range starts at page 2, so page 1 is never
        # crawled - kept as in the original; confirm this is intended.
        for i in range(2, 5):
            # Build each page URL from the fixed base; appending to the
            # previous page's URL would produce an ever-growing invalid URL.
            page_url = base_url + "pn" + str(i) + "/?qq-pf-to=pcqq.group"
            spiderData(page_url)
    finally:
        # Flush and release the CSV file even if a page fails.
        f.close()


if __name__ == "__main__":
    main()
------------恢复内容结束------------
原文地址:https://www.cnblogs.com/industrial-fd-2019/p/12149262.html
时间: 2024-10-09 06:42:38