import json
import re

import requests
from bs4 import BeautifulSoup
from requests import RequestException
from multiprocessing import Pool


def get_one_page(url):
    """Fetch *url* and return the HTML text, or None on error/non-200.

    Uses a spider User-Agent and a 5-second timeout; any requests-level
    failure (timeout, connection error, ...) is swallowed and reported
    as None so the caller can skip the page.
    """
    headers = {'User-Agent': 'baiduspider+'}
    try:
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield one dict per movie entry (<dd> element) in the board page.

    Each dict has keys: rank, name, star, releasetime, grade
    (grade is the integer part concatenated with the fraction part).
    """
    for item in BeautifulSoup(html, 'lxml').find_all('dd'):
        rank = item.select('i')[0].text
        name = item.select('p > a')[0].text
        star = item.select('.star')[0].text.strip()
        releasetime = item.select('.releasetime')[0].text
        integer = item.select('.integer')[0].text
        fraction = item.select('.fraction')[0].text
        grade = integer + fraction
        yield {
            'rank': rank,
            'name': name,
            'star': star,
            'releasetime': releasetime,
            'grade': grade,
        }


def write_to_file(content):
    """Append *content* to result.txt as one UTF-8 JSON line."""
    # ensure_ascii=False keeps Chinese characters human-readable in the file;
    # the with-block closes the file, so no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Scrape one board page at the given *offset*; print and persist each entry."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Fetch failed (network error or non-200): skip this page instead of
        # crashing inside BeautifulSoup with a None argument.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == "__main__":
    # Pages are offset 0, 10, 20, ... 90 (10 entries per page).
    for i in range(10):
        main(i * 10)
    # Parallel alternative:
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])
# 时间 (timestamp): 2024-12-11 00:26:35