网址 https://movie.douban.com/top250
一共250部电影,有分页,获取每一部的详细信息
不采用框架,使用 urllib 读取网页,re 进行正则表达式匹配,lxml 进行 xpath 查找
"""Crawl the Douban Top 250 list and print the details of every film.

The list is paginated (10 pages of 25 entries).  Each list page is fetched
with urllib, the rank and detail-page URL of every entry are extracted with a
regular expression, and each detail page is then scraped via ``film.getall``.
"""
import re
import time
from urllib import request

BASE_URL = 'https://movie.douban.com/top250?start='
PAGE_COUNT = 10
PAGE_SIZE = 25
# Pause (seconds) after every film and after every list page.
REQUEST_DELAY = 3

HEADERS = {
    'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Connection': 'keep-alive'
}

# Compiled once; group 1 is the rank, group 2 is the detail-page URL.
ENTRY_PATTERN = re.compile(
    r'<em\sclass="">(\d+)</em>\s*'
    r'<a\shref="(https://movie\.douban\.com/subject/\d+/)">'
)


def page_url(page_index):
    """Return the list-page URL for the zero-based *page_index*.

    The URL is always derived from ``BASE_URL``.  The original code appended
    the offset to the previously built URL, producing ``start=025``,
    ``start=02550`` and so on after the first page.
    """
    return BASE_URL + str(page_index * PAGE_SIZE)


def parse_entries(page):
    """Return ``(rank, detail_url)`` string pairs found in a list page.

    *page* is the decoded HTML of one list page.  An empty list is returned
    when nothing matches.
    """
    return ENTRY_PATTERN.findall(page)


def fetch_page(url):
    """Download *url* with the browser-like headers and return it as text."""
    req = request.Request(url, headers=HEADERS)
    # The context manager closes the HTTP response once it has been read.
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')


def main():
    """Walk every list page, scrape each film and print its details."""
    # Imported here so the pure helpers above stay usable without the
    # project-local film module being importable.
    from film import film

    for page_index in range(PAGE_COUNT):
        url = page_url(page_index)
        print(url)
        for no, film_url in parse_entries(fetch_page(url)):
            entry = film(no, film_url, '', '', '', '', '', '')
            entry.getall()
            entry.detail()
            time.sleep(REQUEST_DELAY)
        time.sleep(REQUEST_DELAY)


if __name__ == '__main__':
    main()
film.py 如果要做数据的持久化,在这里实现
from urllib import request


class film:
    """One entry of the Douban Top 250 list.

    An instance is created with the rank and detail-page URL taken from the
    list page; the remaining fields are filled in by ``getall``.  Data
    persistence, if wanted, belongs in this class.
    """

    _HEADERS = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Connection': 'keep-alive'
    }

    def __init__(self, no, url, name, year, score, director, classification, actor):
        """Store the rank (*no*), detail-page *url* and descriptive fields."""
        self.name = name
        self.year = year
        self.score = score
        self.director = director
        self.classification = classification
        self.actor = actor
        self.url = url
        self.no = no

    def detail(self):
        """Print all fields on one line in ``key:value;`` form."""
        temp = "No:%s;url:%s;片名:%s;年份:%s;分数:%s;导演:%s;分级:%s;演员:%s;" % (
            self.no, self.url, self.name, self.year, self.score,
            self.director, self.classification, self.actor)
        print(temp)

    def getall(self):
        """Download ``self.url`` and fill in the descriptive fields.

        Every field is stored as a string, matching what the constructor
        accepts: multiple XPath hits (e.g. several actors) are joined with
        ``" / "`` and a query with no hits yields ``''``.  The XPath queries
        depend on the layout of the Douban detail page.
        """
        # lxml is only needed for scraping, so it is imported lazily; the
        # class can still be constructed and printed without it installed.
        from lxml import etree

        req = request.Request(self.url, headers=self._HEADERS)
        # The context manager closes the HTTP response once it has been read.
        with request.urlopen(req) as response:
            page = response.read().decode('utf-8')
        selector = etree.HTML(page)

        def text(path):
            # xpath() returns a list of text nodes; collapse it to one string.
            parts = (part.strip() for part in selector.xpath(path))
            return ' / '.join(part for part in parts if part)

        self.name = text('/html/body/div[3]/div[1]/h1/span[1]/text()')
        self.year = text('//*[@id="content"]/h1/span[2]/text()')
        self.score = text('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
        self.director = text('//*[@id="info"]/span[1]/span[2]/a/text()')
        self.classification = text('//*[@id="info"]/span[5]/text()')
        self.actor = text('//*[@id="info"]/span[3]/span[2]/a/text()')
时间: 2024-10-13 12:29:49