# Goal: scrape the Maoyan Top-100 board (film title, release time, cast,
# score, poster image, detail-page URL and summary) and save the result to a
# CSV file.
import codecs  # noqa: F401  (kept from the original import list)
import csv
import json  # noqa: F401  (kept from the original import list)
import time

import requests
from lxml import etree


class MaoYanTop100Spider:
    """Scrape the Maoyan Top-100 board and export it to ``maoyan.csv``.

    Workflow (see :meth:`run`):

    1. :meth:`Top100_list` walks the ten board pages and records one dict per
       film in ``film_info_list`` plus its detail-page path in
       ``film_page_url_list`` (the two lists stay index-aligned).
    2. :meth:`film_page` visits one detail page and adds the ``"summary"``
       key to the matching record.
    3. :meth:`save_data` writes all records to a CSV file.
    """

    BASE_URL = "https://maoyan.com"
    BOARD_URL = "https://maoyan.com/board/4"
    CSV_PATH = "maoyan.csv"

    # Pauses (seconds) before each request, to avoid hammering the site.
    LIST_PAGE_DELAY = 5
    DETAIL_PAGE_DELAY = 3
    # requests has no default timeout; without one a stalled connection
    # would hang the crawl forever.
    REQUEST_TIMEOUT = 10

    # Class-level names kept for backward compatibility only. Every instance
    # gets its own lists in __init__, so two spiders no longer share (and
    # keep appending to) the same state.
    film_page_url_list = []  # detail-page URL paths, one per film
    film_info_list = []  # one dict of film fields per film

    def __init__(self):
        self.film_page_url_list = []
        self.film_info_list = []

    @staticmethod
    def _first(results, default=""):
        """Return the first XPath match, or *default* when nothing matched."""
        return results[0] if results else default

    def _fetch_html(self, session, url, headers):
        """GET *url* and return the parsed lxml HTML tree.

        Raises ``requests.HTTPError`` on a 4xx/5xx response instead of
        silently parsing an error page.
        """
        response = session.get(
            url=url, headers=headers, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return etree.HTML(response.content.decode("utf-8"))

    def _parse_entry(self, entry):
        """Build the film-info dict from one ``<dd>`` element of the board.

        Missing nodes yield empty strings rather than raising ``IndexError``.
        """
        first = self._first
        raw_time = first(entry.xpath("div/div/div[1]/p[3]/text()"))
        raw_actors = first(entry.xpath("div/div/div[1]/p[2]/text()"))
        # The score is rendered as two <i> nodes (integer part, fraction
        # part) that are simply concatenated.
        score_int = first(entry.xpath("div/div/div[2]/p/i[1]/text()"))
        score_fraction = first(entry.xpath("div/div/div[2]/p/i[2]/text()"))
        return {
            "name": first(entry.xpath("a/@title")),
            # Drop the leading 5 characters (presumably the field label
            # in front of the date -- confirm against the live page).
            "time": raw_time[5:].strip(),
            # Drop the leading 3 characters after trimming whitespace
            # (presumably the cast label -- confirm against the live page).
            "actors": raw_actors.strip()[3:],
            "score": str(score_int) + str(score_fraction),
            "img": first(entry.xpath("a/img[2]/@data-src")),
            "url": first(entry.xpath("div/div/div[1]/p[1]/a/@href")),
        }

    # 1. Fetch the board (list) pages.
    def Top100_list(self, session, headers):
        """Crawl the board pages and collect the basic data of every film.

        The board is paginated through ``?offset=0, 10, ..., 90``.

        Args:
            session: ``requests.Session``-like object used for the requests.
            headers: HTTP headers sent with every request.
        """
        for offset in range(0, 91, 10):
            final_url = self.BOARD_URL + "?offset=" + str(offset)
            time.sleep(self.LIST_PAGE_DELAY)
            page = self._fetch_html(session, final_url, headers)

            # Iterate over the entries actually present instead of assuming
            # exactly ten <dd> nodes per page.
            entries = page.xpath('//dl[@class="board-wrapper"]/dd')
            if not entries:
                print("no film entries found on " + final_url)
                continue

            for entry in entries:
                film_info = self._parse_entry(entry)
                self.film_info_list.append(film_info)
                # Kept index-aligned with film_info_list (see film_page).
                self.film_page_url_list.append(film_info["url"])

    # 2. Fetch one film detail page.
    def film_page(self, url, session, headers, num):
        """Fetch a detail page and store its summary on record *num*.

        Args:
            url: Site-relative path of the detail page.
            session: ``requests.Session``-like object used for the request.
            headers: HTTP headers sent with the request.
            num: Index into ``film_info_list`` of the film being completed.
        """
        final_url = self.BASE_URL + str(url)
        print(final_url)
        time.sleep(self.DETAIL_PAGE_DELAY)
        page = self._fetch_html(session, final_url, headers)
        summary_nodes = page.xpath('//span[@class="dra"]/text()')
        self.film_info_list[num]["summary"] = self._first(summary_nodes).strip()

    # 3. Save the data to a CSV file.
    def save_data(self):
        """Write all collected records to ``maoyan.csv`` (UTF-8).

        The header is the union of the record keys in first-seen order;
        a record lacking a key gets an empty cell, so columns never shift.
        Nothing is written when no record was collected.
        """
        if not self.film_info_list:
            print("no film data collected; nothing written to " + self.CSV_PATH)
            return

        # dict.fromkeys de-duplicates while preserving insertion order.
        fieldnames = list(
            dict.fromkeys(key for film in self.film_info_list for key in film)
        )
        # newline="" is what the csv module requires of its file object;
        # the context manager closes the file even if a write fails.
        with open(self.CSV_PATH, "w", encoding="utf-8", newline="") as csv_fp:
            csv_writer = csv.DictWriter(csv_fp, fieldnames=fieldnames, restval="")
            csv_writer.writeheader()
            csv_writer.writerows(self.film_info_list)

    # Entry point.
    def run(self):
        """Run the full crawl: board pages, detail pages, then CSV export."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }
        # One session for the whole crawl keeps cookies between requests and
        # is closed when the block exits.
        with requests.Session() as session:
            # 1. Board pages.
            self.Top100_list(session=session, headers=headers)
            # 2. Detail pages.
            for i, film_page_url in enumerate(self.film_page_url_list):
                self.film_page(
                    url=film_page_url, session=session, headers=headers, num=i
                )
                print(self.film_info_list[i])
        # 3. Persist the result.
        self.save_data()


if __name__ == "__main__":
    MaoYanTop100Spider().run()
# Original article: https://www.cnblogs.com/tommyngx/p/11182661.html
# Time: 2024-10-14 22:14:52