Continuing from the previous post on scraping conference events from 活动行: the earlier version crawled with a single thread and was fairly slow, so this time the scraping is done with multiple threads.
The scrape is split into two stages. One thread (the producer) searches each keyword, works out the corresponding result URL and number of result pages, and appends them to a shared list. At the same time, worker threads (the consumers) take those URL/page-count/keyword entries off the list and scrape the matching records.
With this setup the whole run took 144.366188 seconds, versus roughly 184 seconds for the single-threaded version, so about 40 seconds are saved.
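Before the full script, here is a minimal sketch (Python 2, to match the script below) of the producer/consumer hand-off it implements. The script itself shares a plain list between threads; this sketch instead uses the thread-safe Queue that the script imports but leaves commented out, and fetch_search_urls / scrape_keyword are stand-ins for the real Selenium logic.

    # Minimal producer/consumer sketch (Python 2). fetch_search_urls and
    # scrape_keyword are placeholders for the real Selenium steps.
    from threading import Thread
    from Queue import Queue

    task_queue = Queue()      # holds "url_pageCount_keyword" strings
    SENTINEL = None           # pushed by the producer when it has finished

    def fetch_search_urls(keyword):
        # Stand-in for the IE-driver search step; returns (url, page count).
        return 'http://example.com/search?kw=%s&pi=' % keyword, 2

    def scrape_keyword(task):
        # Stand-in for the PhantomJS scraping step.
        print 'scraping: %s' % task

    def producer(keywords):
        # Single thread: resolve the result url and page count for every keyword.
        for kw in keywords:
            url, page_count = fetch_search_urls(kw)
            task_queue.put('%s_%d_%s' % (url, page_count, kw))
        task_queue.put(SENTINEL)

    def consumer():
        # Worker threads: take one task at a time and scrape it.
        while True:
            task = task_queue.get()
            if task is SENTINEL:
                task_queue.put(SENTINEL)  # let the remaining workers stop too
                break
            scrape_keyword(task)

    if __name__ == '__main__':
        Thread(target=producer, args=(['keyword1', 'keyword2'],)).start()
        workers = [Thread(target=consumer) for _ in range(3)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()

Using a Queue avoids the busy-wait loop at the bottom of the real script, but the overall producer/consumer structure is the same.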
The full code is as follows:
# coding=utf-8
import os
import re
import time
from threading import Thread
from Queue import Queue

from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select

import IniFile
import LogFile

global url_pageCount_keyword_list
url_pageCount_keyword_list = []
# url_pageCount_keyword_queue = Queue()


# Producer thread: resolves the search-result url and page count for each keyword
class GetUrl_Thread(Thread):
    def __init__(self, IEDriverServer, keywordList, webSearchUrl, pageCountLable):
        '''
        Constructor
        :param IEDriverServer: path to the IE driver, e.g. C:\Program Files\Internet Explorer\IEDriverServer.exe
        :param keywordList: list of keywords to search for
        :param webSearchUrl: url of the site's search page
        :param pageCountLable: xpath of the element holding the result count
        '''
        Thread.__init__(self)
        # list of keywords
        self.keywordList = keywordList
        self.pageCountLable = pageCountLable
        self.urldriver = webdriver.Ie(IEDriverServer)
        self.wait = ui.WebDriverWait(self.urldriver, 20)
        self.urldriver.maximize_window()
        self.urldriver.get(webSearchUrl)

    def run(self):
        # global url_pageCount_keyword_list
        # self.urldriver.implicitly_wait(3)
        time.sleep(3)
        for keyword in self.keywordList:
            if len(keyword) > 0:
                js = "var obj = document.getElementById('mainSearchTextbox');obj.value='" + keyword + "';"
                self.urldriver.execute_script(js)
                # trigger the search by pressing Enter in the search box
                ss_elements = self.urldriver.find_element_by_id("mainSearchTextbox")
                ss_elements.send_keys(Keys.RETURN)
                time.sleep(5)
                current_url = self.urldriver.current_url.replace('pi=1', 'pi=')
                try:
                    elements = self.urldriver.find_elements_by_xpath(self.pageCountLable)
                    # number of pages to crawl
                    strCount = elements[0].text.encode('utf8')
                    pageCount = int(strCount) / 10
                    if int(strCount) % 10 > 0:
                        pageCount = pageCount + 1
                    # my_queue.put(current_url + '_' + str(pageCount))
                    url_pageCount_keyword_list.append(current_url.encode('utf8') + '_' + str(pageCount) + '_' + keyword)
                except Exception, e:
                    print e.message
        self.urldriver.close()
        self.urldriver.quit()


# Consumer thread: scrapes the records for one keyword
class ScrapyData_Thread(Thread):
    def __init__(self, url_pageCount_keyword, htmlLable, OriginalUrlLabel):
        '''
        Constructor
        :param url_pageCount_keyword: search-result url, page count and keyword joined by '_'
        :param htmlLable: xpath of the record elements to scrape
        :param OriginalUrlLabel: xpath of the link element inside each record
        '''
        Thread.__init__(self)
        # search-result url for this keyword
        self.current_url = url_pageCount_keyword.split('_')[0]
        # number of result pages
        self.pageCount = int(url_pageCount_keyword.split('_')[1])
        # the keyword itself
        self.keyword = url_pageCount_keyword.split('_')[2]
        self.htmlLable = htmlLable
        self.OriginalUrlLabel = OriginalUrlLabel
        self.currentDate = time.strftime('%Y-%m-%d')
        self.datePattern = re.compile(r'\d{4}-\d{2}-\d{2}')
        self.driver = webdriver.PhantomJS()
        self.wait = ui.WebDriverWait(self.driver, 20)
        self.driver.maximize_window()

    def compareDate(self, dateLeft, dateRight):
        '''
        Compare two dates
        :param dateLeft: date in the format 2017-03-04
        :param dateRight: date in the format 2017-03-04
        :return: 1 if left > right, 0 if equal, -1 if left < right
        '''
        dls = dateLeft.split('-')
        drs = dateRight.split('-')
        if len(dls) > len(drs):
            return 1
        if int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) == int(drs[2]):
            return 0
        if int(dls[0]) > int(drs[0]):
            return 1
        elif int(dls[0]) == int(drs[0]) and int(dls[1]) > int(drs[1]):
            return 1
        elif int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) > int(drs[2]):
            return 1
        return -1

    def run(self):
        try:
            print ''
            print 'Keyword: %s' % self.keyword
            pageCount = self.pageCount
            recordCount = 0
            if pageCount > 0:
                pageIndex = 0
                while pageCount > 0:
                    url = self.current_url + str(pageIndex)
                    self.driver.get(url)
                    # wait 3 seconds
                    time.sleep(3)
                    # self.driver.implicitly_wait(3)
                    pageCount = pageCount - 1
                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.htmlLable))
                    Elements = self.driver.find_elements_by_xpath(self.htmlLable)
                    # collect the original url of each record
                    urlList = []
                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.OriginalUrlLabel))
                    hrefElements = self.driver.find_elements_by_xpath(self.OriginalUrlLabel)
                    for hrefe in hrefElements:
                        urlList.append(hrefe.get_attribute('href').encode('utf8'))
                    # self.driver.implicitly_wait(2)
                    index = 0
                    strMessage = ' '
                    strsplit = '\n------------------------------------------------------------------------------------\n'
                    # number of useful records on this page
                    usefulCount = 0
                    for element in Elements:
                        txt = element.text.encode('utf8')
                        txts = txt.split('\n')
                        strDate = re.findall(self.datePattern, txt)
                        # a record qualifies only if its date is later than today and the keyword appears in the title
                        if len(strDate) > 0 and self.compareDate(strDate[0], self.currentDate) == 1 and \
                                txts[0].find(self.keyword) > -1:
                            print ' '
                            print txt
                            print 'Event link: ' + urlList[index]
                            print strsplit

                            strMessage = txt + "\n"
                            strMessage += 'Event link: ' + urlList[index] + "\n"
                            strMessage += strsplit
                            strMessage = unicode(strMessage, 'utf8')
                            # log.WriteLog(strMessage)
                            usefulCount = usefulCount + 1
                            recordCount = recordCount + 1
                        index = index + 1

                    pageIndex = pageIndex + 1
                    if usefulCount == 0:
                        break

            print "Pages browsed: %d" % self.pageCount
            print "Matching event records scraped: %d" % recordCount
        except Exception, e:
            print e.message

        self.driver.close()
        self.driver.quit()


if __name__ == '__main__':
    configfile = os.path.join(os.getcwd(), 'MeetingConfig.conf')
    cf = IniFile.ConfigFile(configfile)
    IEDriverServer = cf.GetValue("section", "IEDriverServer")
    os.environ["webdriver.ie.driver"] = IEDriverServer

    keyword = cf.GetValue("section", "keywords")
    keywordList = keyword.split(';')

    webSearchUrl = cf.GetValue("section", "webSearchUrl")
    pageCountLable = cf.GetValue("section", "pageCountLable")
    htmlLable = cf.GetValue("section", "htmlLable")
    OriginalUrlLabel = cf.GetValue("section", "OriginalUrlLabel")

    start = time.clock()
    # producer: collect url + page count + keyword entries
    turl = GetUrl_Thread(IEDriverServer, keywordList, webSearchUrl, pageCountLable)
    turl.start()

    # consumers: scrape each entry as it becomes available
    while True:
        if len(url_pageCount_keyword_list) > 0:
            url_pageCount_keyword = url_pageCount_keyword_list[0]
            url_pageCount_keyword_list.remove(url_pageCount_keyword)
            t = ScrapyData_Thread(url_pageCount_keyword, htmlLable, OriginalUrlLabel)
            t.setDaemon(True)
            t.start()
            t.join()
        else:
            if turl.isAlive():
                time.sleep(1)
            else:
                break

    end = time.clock()
    print "Total elapsed time: %f seconds" % (end - start)
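The script reads its settings from MeetingConfig.conf in the working directory through the author's own IniFile.ConfigFile helper, whose file layout isn't shown here. Judging from the GetValue("section", ...) calls it appears to be a standard INI file; the sketch below uses the key names from the code, but every value is a made-up placeholder that would need to be replaced with the real driver path and xpath expressions.

    [section]
    IEDriverServer = C:\Program Files\Internet Explorer\IEDriverServer.exe
    keywords = keyword1;keyword2;keyword3
    webSearchUrl = http://www.example.com/search?pi=1
    pageCountLable = //span[@id='totalCount']
    htmlLable = //div[@class='result-item']
    OriginalUrlLabel = //div[@class='result-item']//a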