# -*- coding: utf-8 -*-
"""
Download a web page, extract film titles from its listing table and save
them, one per line, to ``dytt8_hot.txt`` in the current working directory.

Created on Wed Oct 12 16:48:33 2016

@author: fuzzier
"""

import codecs
import os
import re

import requests
from bs4 import BeautifulSoup

URL = 'http://www.xxxxx.net'

# Name of the file the titles are written to (inside the current directory).
OUTPUT_FILENAME = 'dytt8_hot.txt'

# Matches the href of a film detail link. Compiled once at import time; the
# dot before "html" is escaped so it only matches a literal ".html" suffix.
FILM_LINK_RE = re.compile(r'/\w+?/\w+?/\w+?/\d+?/\d+?\.html')

# Browser-like request headers. The header name must be spelled
# "User-Agent" (hyphen); "User_Agent" is not a recognised HTTP header.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1581.2 Safari/537.36',
}


def download_page(url, timeout=10):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The undecoded response body (``bytes``).

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx status code.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    return response.content


def parser_html(data):
    """Extract film titles from the listing table of a downloaded page.

    The table is looked up as ``div.bd3rl > div.co_content8`` and every
    ``<tr>`` inside it is inspected for an ``<a>`` whose ``href`` matches
    ``FILM_LINK_RE``.

    Args:
        data: HTML markup of the page (``bytes`` or text).

    Returns:
        A list with one entry per table row: the link's text, or the
        placeholder string ``'None'`` when the row has no matching link or
        the link has no single text child. An empty list is returned when
        the expected containers are not present in the markup.
    """
    soup = BeautifulSoup(data, 'html.parser')

    outer = soup.find('div', class_='bd3rl')
    if outer is None:
        return []
    container = outer.find('div', class_='co_content8')
    if container is None:
        return []

    films = []
    for row in container.find_all('tr'):
        link = row.find('a', href=FILM_LINK_RE)
        # ``find`` returns None for rows without a film link; guard before
        # touching ``.string`` so such rows yield the placeholder instead
        # of raising AttributeError.
        title = link.string if link is not None else None
        films.append(title if title else 'None')
    return films


def main():
    """Download ``URL``, parse the film titles and write them to disk."""
    html = download_page(URL)
    film_list = parser_html(html)
    # os.path.join keeps the path valid on every platform; a hard-coded
    # backslash only works as a separator on Windows.
    out_path = os.path.join(os.getcwd(), OUTPUT_FILENAME)
    # codecs.open does no newline translation, so '\r\n' is written as-is.
    with codecs.open(out_path, 'w', encoding='utf8') as f:
        f.writelines(title + '\r\n' for title in film_list)


if __name__ == '__main__':
    main()
# Time: 2024-10-12 11:47:09