"""Fetch the Douban movie chart page and print one dict per listed movie.

Each printed dict has the keys ``title``, ``href``, ``img``,
``comment_num`` and ``rating_num``; a value is ``None`` when the
corresponding node is not present in that movie's table.
"""

from lxml import etree
import requests

url = 'https://movie.douban.com/chart'
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/63.0.3239.84 Safari/537.36"
    ),
}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def _first(nodes, default=None):
    """Return the first XPath result in *nodes*, or *default* if it is empty."""
    return nodes[0] if nodes else default


def fetch_html(page_url, request_headers, timeout=REQUEST_TIMEOUT):
    """Download *page_url* and return the response body as text.

    Args:
        page_url: Address of the page to download.
        request_headers: HTTP headers sent with the request.
        timeout: Seconds to wait before giving up on the request.

    Returns:
        The response body decoded with ``bytes.decode()`` (UTF-8 by default).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(page_url, headers=request_headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    return response.content.decode()


def iter_movies(html_str):
    """Yield one dict of movie data per table found in *html_str*.

    Args:
        html_str: HTML text of the chart page.

    Yields:
        Dicts with the keys ``title``, ``href``, ``img``, ``comment_num``
        and ``rating_num``. Missing nodes produce ``None`` values instead
        of raising ``IndexError``.
    """
    # Use etree to process the data.
    html = etree.HTML(html_str)

    # 1. Group: select one <table> per movie.
    # 2. Extract the fields from each group with relative XPath queries.
    for table in html.xpath("//div[@class='indent']/div/table"):
        title = _first(table.xpath(".//div[@class='pl2']/a/text()"))
        if title is not None:
            # Drop the "/" separators in the link text and trim whitespace.
            title = title.replace("/", "").strip()

        yield {
            'title': title,
            # Movie page URL.
            'href': _first(table.xpath(".//div[@class='pl2']/a/@href")),
            # Movie image URL.
            'img': _first(table.xpath(".//a[@class='nbg']/img/@src")),
            'comment_num': _first(
                table.xpath(".//div[@class='pl2']/div//span[@class='pl']/text()")
            ),
            'rating_num': _first(
                table.xpath(
                    ".//div[@class='pl2']/div//span[@class='rating_nums']/text()"
                )
            ),
        }


def main():
    """Fetch the chart page and print each movie's data dict."""
    for item in iter_movies(fetch_html(url, headers)):
        print(item)


if __name__ == "__main__":
    main()
# Original article: https://www.cnblogs.com/zqrios/p/9017480.html
# Time: 2024-10-01 01:04:11