# Crawl movie resource links from the whole 电影天堂 (dytt8.net) site
# Features:
# 1. Fetch the resource download links and print them
# 2. Save all links into one CSV file

import time
import requests
from bs4 import BeautifulSoup
import csv

def spider(url):
    global page, No, fobj
    try:
        page += 1
        print("Page {}".format(page))
        # time.sleep(1)
        # Fetch and decode the page
        html = requests.get(url)
        html.encoding = "gbk"
        html = html.text
        # Load the document into BeautifulSoup
        root = BeautifulSoup(html, "lxml")
        # Locate the listing: each movie entry is a <table> inside the <ul>
        tables = root.find("div", attrs={"class": "co_content8"}).find("ul").find_all("table")
        for table in tables:
            name = table.find("a").text
            link = "http://www.dytt8.net" + table.find("a")["href"]
            # Write one row per movie
            writer = csv.writer(fobj)
            writer.writerow([name, link])
            No += 1
            print("No:", No, name, link)
        # Crawl the next page
        # time.sleep(1)
        urls = root.find("div", attrs={"class": "co_content8"}).find("div", attrs={"class": "x"}).find_all("a")
        # Look for the link whose text is "下一页" ("next page")
        for u in urls:
            if u.text == "下一页":
                url = "https://www.dytt8.net/html/gndy/dyzz/" + u["href"]
                print(url)
                # Recurse into the next page
                spider(url)
    except:  # No next page (or a parsing error): stop crawling
        print("finished")

begin_time = time.time()
url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
page = 0
No = 0
fobj = open("movies.csv", "wt", encoding="gbk", newline='')
spider(url)
fobj.close()
end_time = time.time()
elapsed = end_time - begin_time  # avoid shadowing the time module
m, s = divmod(round(elapsed), 60)
print("Elapsed: {}min{}s".format(m, s))
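Since spider() recurses once per listing page, a site with many hundreds of pages could eventually hit Python's default recursion limit. Below is a minimal iterative sketch of the same crawl loop, not the original author's code: it assumes the same dytt8.net page structure and selectors as above, and crawl is a hypothetical helper name. The with-block also guarantees the CSV file is closed even if a request fails mid-crawl.

# A sketch under the assumptions above: same selectors as the original,
# but an explicit while-loop instead of recursion.
import csv
import requests
from bs4 import BeautifulSoup

def crawl(start_url, out_path="movies.csv"):  # hypothetical helper name
    url = start_url
    with open(out_path, "wt", encoding="gbk", newline="") as fobj:
        writer = csv.writer(fobj)
        while url:
            resp = requests.get(url)
            resp.encoding = "gbk"
            root = BeautifulSoup(resp.text, "lxml")
            content = root.find("div", attrs={"class": "co_content8"})
            # One <table> per movie entry, as in the original script
            for table in content.find("ul").find_all("table"):
                a = table.find("a")
                writer.writerow([a.text, "http://www.dytt8.net" + a["href"]])
            # Follow the "下一页" ("next page") link, or stop if absent
            url = None
            for a in content.find("div", attrs={"class": "x"}).find_all("a"):
                if a.text == "下一页":
                    url = "https://www.dytt8.net/html/gndy/dyzz/" + a["href"]
                    break

crawl("https://www.dytt8.net/html/gndy/dyzz/index.html")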
Original post: https://www.cnblogs.com/billie52707/p/12113520.html