需要着重学习的地方:(1)爬取分页数据时,URL 链接的构建;(2)保存 JSON 格式数据到文件时的中文显示问题;(3)进程池(multiprocessing.Pool)的使用;(4)正则表达式的写法。
# Scrape the Maoyan movie board page by page and append every parsed movie
# to a text file as one JSON object per line.
import json
import re
from multiprocessing import Pool

# Board URL without the offset value; main() appends the offset.
BOARD_URL = 'https://maoyan.com/board/4?offset='
# Default output file, opened in append mode by write_to_file().
RESULT_FILE = 'result.txt'

# Matches one <dd>...</dd> entry. Capture groups, in order, feed the keys
# built in parse_one_page(): index, image, title, actor, time, and the two
# halves of the score. Compiled once at import time; raw strings keep `\d`
# from being treated as a string escape. re.S lets `.` span newlines.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    re.S,
)


def get_one_page(url, timeout=10):
    """Download one page and return its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker forever.

    Returns:
        The response body as text when the status code is 200, otherwise
        None (non-200 status or any requests-level failure).
    """
    # Imported here rather than at module level so the parsing and saving
    # helpers stay importable where the third-party HTTP client is missing.
    import requests
    from requests.exceptions import RequestException

    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Args:
        html: Page markup as a string; None or an empty string yields
            nothing instead of raising.

    Yields:
        Dicts with the keys 'index', 'image', 'title', 'actor', 'time'
        and 'score', all holding strings.
    """
    if not html:
        return
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Drops the first 3 characters of the trimmed cast text
            # (presumably a label prefix; verify against the live markup).
            'actor': item[3].strip()[3:],
            # Drops the first 5 characters of the trimmed release text
            # (presumably a label prefix; verify against the live markup).
            'time': item[4].strip()[5:],
            # The score is split into an integer part and a fraction part.
            'score': item[5] + item[6],
        }


def write_to_file(content, path=RESULT_FILE):
    """Append *content* to *path* as a single JSON line.

    Args:
        content: JSON-serialisable object (here: one movie dict).
        path: File to append to; defaults to RESULT_FILE.
    """
    # ensure_ascii=False writes non-ASCII text (e.g. Chinese) as-is instead
    # of \uXXXX escapes; the file is opened as UTF-8 to match.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch the board page at *offset*, then print and save each movie.

    Args:
        offset: Value appended to BOARD_URL as the `offset` query value.
    """
    url = BOARD_URL + str(offset)
    html = get_one_page(url)
    if html is None:
        # Download failed or returned a non-200 status; nothing to parse.
        print('skipped (no content): ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # multiprocessing.Pool is a pool of worker *processes* (not threads).
    # Ten pages are requested with offsets 0, 10, ..., 90. Pages are handled
    # in parallel, so lines in the result file are not in page order.
    # The `with` block releases the workers once map() has returned.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
原文地址:https://www.cnblogs.com/sanduzxcvbnm/p/10185364.html
时间: 2024-12-11 10:51:47