# Original author's notes (translated):
#   1. The crawler stalled after ~17 items and sat waiting forever.
#      Cause: once the URL frontier emptied, `while 1` busy-looped on an
#      always-false `if len(newurls)` check. Fixed below: the loop now
#      terminates when no pending URLs remain.
#   2. Selenium is slow because it renders full pages; an AJAX/requests
#      approach would be faster. Partially mitigated here by reusing one
#      browser instance instead of launching Firefox for every page.
# from bs4 import BeautifulSoup
import re, csv, urllib.request, urllib.parse, time, json, pickle, random

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class managerurl(object):
    """URL frontier: tracks pending (newurls) and visited (oldurls) URLs."""

    def __init__(self):
        self.oldurls = set()  # URLs already handed out for crawling
        self.newurls = set()  # URLs queued but not yet crawled

    def add_newurls(self, urls):
        """Queue several URLs, ignoring duplicates and None."""
        if urls:
            for u in urls:
                self.add_newurl(u)

    def add_newurl(self, url):
        """Queue one URL unless it is falsy, pending, or already visited."""
        if url and url not in self.newurls and url not in self.oldurls:
            self.newurls.add(url)

    def has_newurls(self):
        """Pop an arbitrary pending URL, mark it visited, and return it.

        NOTE(review): the name is historical — this *returns* a URL rather
        than a boolean, and raises KeyError on an empty frontier, so callers
        must check ``newurls`` first. Kept as-is for compatibility.
        """
        url = self.newurls.pop()
        self.oldurls.add(url)
        return url


class data_get(object):
    """Douban movie crawler: follows "related movies" links, collecting
    each movie's name and rating."""

    def __init__(self):
        self.manaurl = managerurl()

    def data_gain(self, url):
        """Crawl starting from *url*; stop after ~100 movies or when the
        frontier is exhausted (fixes the original infinite busy-wait).

        Returns the collected list of {'moviename': ..., 'score': ...} dicts
        (the original built this list but never returned it).
        """
        data = []
        num = 0
        self.manaurl.add_newurl(url)
        # One shared browser instance: the original launched (and only
        # close()d) a fresh Firefox per page, which was slow and leaky.
        driver = webdriver.Firefox()
        try:
            while self.manaurl.newurls and num <= 100:
                url1 = self.manaurl.has_newurls()
                driver.get(url1)
                print(num)
                # time.sleep(random.randrange(5, 10))  # politeness delay, if needed
                try:
                    # Straight quotes in the XPath — the original used curly
                    # typographic quotes, which broke both the Python string
                    # syntax and the XPath attribute match.
                    moviescore = driver.find_element_by_xpath(
                        "//strong[@class='ll rating_num']")  # movie rating
                    moviename = driver.find_element_by_xpath(
                        '/html/body/div[3]/div[1]/h1')  # movie title
                    othermovies = driver.find_elements_by_xpath(
                        '/html/body/div[3]/div[1]/div[2]/div[1]/div[7]/div/dl/dd/a')  # related-movie links
                except Exception as exc:
                    # Skip pages that do not match the movie-page template
                    # instead of killing the whole crawl.
                    print('skip %s: %s' % (url1, exc))
                    continue
                print(moviename.text)
                for link in othermovies:
                    self.manaurl.add_newurl(link.get_attribute('href'))
                data.append({'moviename': moviename.text,
                             'score': moviescore.text})
                num += 1
        finally:
            driver.quit()  # always release the browser, even on error
        print(data)
        return data


if __name__ == "__main__":
    b = data_get()
    url = 'https://movie.douban.com/subject/27663742/'
    b.data_gain(url)
原文地址:https://www.cnblogs.com/xuehaiwuya0000/p/10674794.html
时间: 2024-10-01 11:21:17