demo:
# coding: utf-8
"""Fetch a search-results page and collect (cite, abstract) text pairs."""
import requests
from bs4 import BeautifulSoup
import bs4
import re


def getHTMLText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded body, or an empty string when the request fails
        (connection error, timeout, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Use the encoding detected from the content rather than the one
        # taken from the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers receive '' instead of an exception.
        return ''


def fillList(ulist, html):
    """Append ``[cite, abstract]`` pairs parsed from *html* to *ulist*.

    Every ``<div class="g">`` is inspected for a ``<cite>`` element and a
    ``<span class="st">`` element; blocks missing either one are skipped.
    The list is printed once parsing is finished.

    Args:
        ulist: List extended in place with two-element ``[cite, abstract]``
            lists.
        html: Markup to parse. An empty string yields no entries.
    """
    soup = BeautifulSoup(html, 'lxml')
    # NOTE(review): the 'g' / 'st' class names presumably match the
    # search-result markup of the target page -- confirm against a live
    # response, since that markup is outside this script's control.
    for node in soup.find_all('div', {'class': 'g'}):
        cite_node = node.find('cite')
        abstract_node = node.find('span', {'class': 'st'})
        if cite_node is None or abstract_node is None:
            # find() returns None when the tag is absent; skip the block
            # rather than failing on `.text`.
            continue
        ulist.append([cite_node.text, abstract_node.text])
    print(ulist)


uinfo = []
url = "https://www.google.com.hk/search?safe=strict&source=hp&ei=mQltW6O1CLe60PEP-_eY-AQ&q=%E6%98%8E%E7%95%A5%E6%95%B0%E6%8D%AECTO&oq=%E6%98%8E%E7%95%A5%E6%95%B0%E6%8D%AECTO&gs_l=psy-ab.3...7917.11610.0.12024.14.12.0.0.0.0.896.1417.5-1j1.2.0....0...1c.1j4.64.psy-ab..12.2.1416...0j0i30k1j0i5i30k1.0.uovOOEULNls"
html = getHTMLText(url)
fillList(uinfo, html)
原文地址:https://www.cnblogs.com/elpsycongroo/p/9454551.html
时间: 2024-09-29 06:36:43