import threading, os, time, requests, pymongo, re
from queue import Queue
from lxml import etree
from bs4 import BeautifulSoup as BP

client = pymongo.MongoClient(host='localhost', port=27017)
mg = client['biquge']


def get_fenlei():
    """
    Crawl all book categories.
    :return: list of category URLs
    """
    collection = mg['biquge_info']
    url = 'http://www.xxbqg5200.com/'
    sp = requests.get(url=url, headers=headers, cookies=cookies)
    soup = BP(sp.text, 'lxml')
    fenlei_url = soup.select('#wrapper > div.nav > ul > li > a')
    list1 = []
    for i in fenlei_url:
        href_url = i['href']
        fenlei_name = i.get_text()
        if ''.join(href_url).split('/')[1] != 'sort':
            continue
        else:
            fenlei_href_url = '{}'.format('http://www.xxbqg5200.com') + str(href_url)
            list1.append(fenlei_href_url)
            try:
                date = {'fenlei_name': fenlei_name, 'fenlei_url': fenlei_href_url}
                collection.insert_one(date)
                print('{}{}'.format(fenlei_name, '》》》》》存储成功'))
            except Exception:
                print('{}{}'.format(fenlei_name, '存储失败'))
    return list1


def get_page():
    """
    Build the pagination links for every category.
    :return: list of listing-page URLs
    """
    list1_url = get_fenlei()
    list_page = []
    a = 0
    for i in list1_url:
        a += 1
        page_url = ''.join(i).split('/')[3]
        page_href_url = '{}'.format('http://www.xxbqg5200.com/') + str(page_url) + str('/') + str(a) + str('_')
        for page in range(1, 190):
            page_url = "{}".format(page_href_url) + str(page) + str('/')
            list_page.append(page_url)
    return list_page


def get_tushu_url():
    """
    Crawl the links of all books, then their chapter links and chapter contents.
    :return:
    """
    global q, lock
    while not q.empty():
        lock.acquire()
        url = q.get()
        lock.release()
        print(url, '###################################')
        collection = mg['biquge_info']
        list1 = []
        sp = requests.get(url=url, headers=headers, cookies=cookies)
        soup = BP(sp.text, 'lxml')
        tushu_url = soup.select('#newscontent > div.l > ul > li > span.s2 > a')
        if tushu_url:
            for tushu_href_url in tushu_url:
                tushu_name_url = tushu_href_url['href']
                tushu_name = tushu_href_url.get_text()
                list1.append(tushu_name_url)
                try:
                    date = {'tushu_name': tushu_name, 'tushu_name_url': tushu_name_url}
                    collection.insert_one(date)
                    print('{}{}'.format(tushu_name, '》》》》》存储成功'))
                except Exception:
                    print('{}{}'.format(tushu_name, '存储失败'))
        else:
            pass

        # Crawl all chapter links of each book
        list2 = []
        for zhang_url in list1:
            response = requests.get(zhang_url, headers=headers, cookies=cookies)
            soup_zhang = BP(response.text, 'lxml')
            zhangjie_url = soup_zhang.select('#list > dl > dd > a')
            for zhang_href in zhangjie_url:
                zhangjie_href = zhang_href['href']
                zhangjie_name = zhang_href.get_text()
                content_url = '{}'.format('http://www.xxbqg5200.com') + str(zhangjie_href)
                list2.append(content_url)
                try:
                    date_zhangjie = {'zhangjie_name': zhangjie_name, 'zhangjie_href': zhangjie_href}
                    collection.insert_one(date_zhangjie)
                    print('{}{}'.format(zhangjie_name, '》》》》》存储成功'))
                except Exception:
                    print('{}{}'.format(zhangjie_name, '存储失败'))

        # Crawl the text content under every chapter
        content_sql = mg['tushu_content']
        for content_list_url in list2:
            response1 = requests.get(content_list_url, headers=headers, cookies=cookies)
            soup_content = BP(response1.text, 'lxml')
            content_nei = soup_content.select('#content')
            for text_content in content_nei:
                # Keep only the Chinese characters, letters and digits from the text
                filter_content = re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+', text_content.get_text(), re.S)
                filter_text_content = "".join(filter_content)
                try:
                    date_content = {'content': filter_text_content}
                    content_sql.insert_one(date_content)
                    print('{}'.format('》》》》》存储成功'))
                except Exception:
                    print('{}'.format('存储失败'))


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Referer': 'http://www.xxbqg5200.com/register.php?do=submit',
    }
    cookies = {
        'Cookie': 'Hm_lvt_bbb2110ecd75330bec79c7868b24e681=1575524043; PHPSESSID=03pt092b5nb8qsdl6pk425kh87; jieqiUserInfo=jieqiUserId%3D1912%2CjieqiUserName%3Dduanyibo%2CjieqiUserGroup%3D3%2CjieqiUserName_un%3Dduanyibo%2CjieqiUserLogin%3D1575524132; jieqiVisitInfo=jieqiUserLogin%3D1575524132%2CjieqiUserId%3D1912; Hm_lpvt_bbb2110ecd75330bec79c7868b24e681=1575524140',
    }
    q = Queue()
    lock = threading.Lock()
    list_url = get_page()
    for i in list_url:
        q.put(i)
    for i in range(10):
        t = threading.Thread(target=get_tushu_url)
        t.start()
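After a run, the records end up in the local `biquge` database. A minimal sketch for spot-checking what was stored, assuming the same local MongoDB instance and the collection names `biquge_info` and `tushu_content` used above:

import pymongo

# Connect to the same local MongoDB instance the spider writes to
client = pymongo.MongoClient(host='localhost', port=27017)
mg = client['biquge']

# Count the stored category/book/chapter records and the chapter contents
print(mg['biquge_info'].count_documents({}))
print(mg['tushu_content'].count_documents({}))

# Show one stored chapter-content document
print(mg['tushu_content'].find_one())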
Original article: https://www.cnblogs.com/duanlinxiao/p/11993911.html