爬取 http://bj.58.com/pbdn/0/pn2/ 中除转转、推广商品以外的产品信息。因为转转和推广商品的详情页结构不规范,需要另外写一个方法来处理,后期补上。详情页如下:
这周学习了爬虫,但是遇到一些由 JS 动态生成的内容(比如浏览量),requests 无法渲染,所以结合 selenium + PhantomJS 渲染网页后再获取信息。
上代码,注释中详细解释:
import re

from bs4 import BeautifulSoup
from selenium import webdriver

# User agent sent by the headless browser.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
)


def _first_text(soup, selector):
    """Return the text of the first element matching *selector*, or None."""
    matches = soup.select(selector)
    return matches[0].get_text() if matches else None


class GetPageInfo(object):
    """Template for page scrapers.

    Defines the three crawling steps; a concrete scraper overrides the
    methods it needs.
    """

    def index_page(self):
        """Collect the URL of every listing (index) page.

        Only a single page is crawled for now, so this step is unused.
        """
        pass

    def detail_page(self):
        """From an index page, collect the URL of every record on it."""
        pass

    def domain_page(self):
        """From a record URL, extract the record's detailed information."""


class TongChengFirstHomework(GetPageInfo):
    """Scraper for the 58.com second-hand tablet listings (sz.58.com/pbdn)."""

    def __init__(self, browser=None):
        """Store the selenium browser used to render pages.

        Args:
            browser: a selenium WebDriver instance.
        """
        self.browser = browser

    def detail_page(self, whoshell=0, page=1):
        """Return the detail-page URLs found on one listing page.

        Zhuanzhuan items and promoted ("jump") links are skipped because
        their detail pages have a different layout.

        Args:
            whoshell: 0 for private sellers, 1 for merchants.
            page: listing page number.

        Returns:
            list of detail-page URL strings.
        """
        # e.g. http://sz.58.com/pbdn/0/pn1/
        url = 'http://sz.58.com/pbdn/{}/pn{}/'.format(str(whoshell), str(page))
        # Use the injected browser, not a module-level global.
        self.browser.get(url)
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        detail_url_list = []
        for anchor in soup.select('#infolist a.t'):
            href = anchor.get('href')
            # Anchors without an href cannot be followed; skip them instead
            # of failing on the substring tests below.
            if not href:
                continue
            if 'Mzhuanzhuan' not in href and 'jump' not in href:
                detail_url_list.append(href)
        print(detail_url_list)
        return detail_url_list

    def domain_page(self, detail_url):
        """Open one detail page and return its fields as a dict.

        Args:
            detail_url: URL of the record's detail page.

        Returns:
            dict with keys "provice", "title", "date", "views", "price",
            "condition", "area" and "seller". A field whose element is not
            found on the page is None.
        """
        self.browser.get(detail_url)
        soup = BeautifulSoup(self.browser.page_source, 'lxml')

        desc_product = soup.select(
            'div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span'
        )
        # A '-' anywhere in the matched span markup is treated as
        # "no condition given"; an empty match is treated the same way.
        if desc_product and '-' not in str(desc_product):
            condition = list(desc_product[0].stripped_strings)
        else:
            condition = None

        if soup.find_all('span', 'c_25d'):
            area = list(soup.select('.c_25d')[0].stripped_strings)
        else:
            area = None

        detail = {
            # NOTE: the key spelling "provice" is kept for compatibility.
            "provice": _first_text(soup, '.crb_i > a'),
            "title": _first_text(
                soup,
                '#content > div.person_add_top.no_ident_top > '
                'div.per_ad_left > div.col_sub.mainTitle > h1',
            ),
            "date": _first_text(soup, '.time'),
            "views": _first_text(soup, '#totalcount'),
            "price": _first_text(soup, 'span.price.c_f50'),
            "condition": condition,
            "area": area,
            "seller": _first_text(soup, '#divContacter > ul > ul > li > a'),
        }
        print(detail)
        return detail


def main():
    """Crawl page 2 of the listing and print every record's details."""
    # Copy the capabilities so the shared selenium default dict is not
    # mutated.
    cap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
    cap['phantomjs.page.settings.loadImages'] = False  # skip images
    cap['phantomjs.page.settings.userAgent'] = USER_AGENT
    cap['phantomjs.page.settings.diskCache'] = True  # enable disk cache

    browser = None
    try:
        # NOTE(review): webdriver.PhantomJS is only available in older
        # selenium releases - confirm the installed version supports it.
        browser = webdriver.PhantomJS(desired_capabilities=cap)
        tongcheng = TongChengFirstHomework(browser)
        for detail_url in tongcheng.detail_page(page=2):
            tongcheng.domain_page(detail_url)
            print(detail_url)
    finally:
        # quit() also ends the driver process; guard against the case where
        # the browser never started.
        if browser is not None:
            browser.quit()


if __name__ == '__main__':
    main()
时间: 2024-10-08 21:06:16