本案例使用 requests 模块和 XPath 语法,从乌托家网站上爬取家具公司的数据。代码如下:
# Author:K
"""Crawl furniture-company contact details from wutuojia.com.

Each listing page is fetched with ``requests`` and parsed with lxml XPath.
For every product on a listing page the merchant's detail page is fetched,
and the brand name, address and phone number found there are appended to a
text file when the address contains '广东' and the brand name does not
already appear in the file.
"""
import os

import requests
from lxml import etree

HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.108 Safari/537.36'
    )
}

# Persistent storage locations.
OUTPUT_DIR = 'H:/乌托家'
FILE_PATH = 'H:/乌托家/乌托家家具公司.txt'

# Seconds to wait for a response, so one stalled request cannot hang the
# whole crawl.
REQUEST_TIMEOUT = 10


def _fetch_tree(url):
    """Download *url* and return its parsed HTML tree, or None on failure.

    Network errors and parser errors are printed and reported as None so
    that the caller can skip this page and carry on with the next one.
    """
    try:
        response = requests.get(
            url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        return etree.HTML(response.text)
    except (requests.RequestException, etree.LxmlError, ValueError) as e:
        print(e)
        return None


def _save_record(brand_name, addr, phone, file_path=FILE_PATH):
    """Append one company record to *file_path*; return True if written.

    A record is written only when *addr* contains '广东' and *brand_name*
    does not already occur anywhere in the file's current content.

    Raises:
        OSError: if the file cannot be opened, read or written.
    """
    if '广东' not in addr:
        return False
    # 'a+' creates the file when it is missing ('r+' raised
    # FileNotFoundError on a first run) and always writes at the end.
    with open(file_path, 'a+', encoding='utf-8') as fp:
        fp.seek(0)  # append mode starts at EOF; rewind to read the content
        if brand_name in fp.read():
            return False
        fp.write(brand_name + ' ' + addr + ' ' + phone + '\n\n')
    return True


def parse_page(url):
    """Scrape one listing page and store the merchants found on it.

    Args:
        url: address of a product listing page.

    List items without a merchant link, pages that fail to download, and
    detail blocks missing one of the expected fields are skipped (the
    error, if any, is printed) instead of aborting the crawl.
    """
    listing_tree = _fetch_tree(url)
    if listing_tree is None:
        return
    li_list = listing_tree.xpath(
        '//ul[@class="rec-commodity-ul targetElement"]/li'
    )
    for li in li_list:
        hrefs = li.xpath('.//div[@class="impression"]/a/@href')
        if not hrefs:
            continue
        detail_tree = _fetch_tree(hrefs[0])
        if detail_tree is None:
            continue
        for div in detail_tree.xpath('//div[@class="brand-r"]'):
            try:
                brand_name = div.xpath('./div[4]/dl/dd/text()')[0]
                addr = div.xpath('.//p/text()')[0]
                phone = div.xpath('.//dd[2]/text()')[0]
                if _save_record(brand_name, addr, phone):
                    print(brand_name, '爬取成功!!!')
            except (IndexError, OSError) as e:
                # IndexError: an expected node is absent from this block.
                # OSError: the output file could not be read or written.
                print(e)


def get_page(first_page=1, last_page=412):
    """Crawl listing pages *first_page* through *last_page*, inclusive.

    The defaults cover pages 1 to 412.
    """
    for page in range(first_page, last_page + 1):
        url = 'http://www.wutuojia.com/item/list.html?page=' + str(page)
        parse_page(url)


def main():
    """Run the full crawl."""
    get_page()


if __name__ == '__main__':
    # Persistent storage: make sure the output directory exists.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    main()
原文地址:https://www.cnblogs.com/KisInfinite/p/10952938.html
时间: 2024-11-05 13:41:03