#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/12/30 10:51
# @Site :
# @File : jd_iphone.py
# @Software: PyCharm
"""Scrape iPhone listings from JD.com search pages and their detail pages.

``get_page_detail`` renders search result pages in headless Chrome and
collects the listings whose highlighted keywords mention Apple/iPhone.
``product_detail`` then downloads each product page with ``requests`` and
extracts title and option information.
"""
import json
import logging
import time

import requests
import urllib3
from pyquery import PyQuery
from selenium import webdriver

# Suppress the InsecureRequestWarning triggered by verify=False requests
# (the original author used this while capturing traffic with Fiddler).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    "Referer": "https://search.jd.com/",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    ),
}

# Console log output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger_page = logging.getLogger("jd_iphone_page")
logger_detail = logging.getLogger("jd_iphone_detail")

# Search URL pieces; the page number is inserted between them.
_SEARCH_URL_PREFIX = 'https://search.jd.com/Search?keyword=iphoneapple&page='
_SEARCH_URL_SUFFIX = '&click=0'

# Highlighted keywords that mark a listing as an Apple phone (exact match).
_APPLE_KEYWORDS = (u'苹果', u'iphone', u'Apple', u'apple')

# Seconds to let the scrolling script run before reading the page source.
_SCROLL_WAIT_SECONDS = 5

# Seconds before a product detail request is abandoned.
_DETAIL_TIMEOUT_SECONDS = 10

# Scrolls the page down in small steps every 20 ms until 90% of the
# document height is passed, so that lazily loaded items get rendered.
_SCROLL_JS = """
timer = setInterval(function(){
    var scrollTop = document.documentElement.scrollTop || document.body.scrollTop;
    var ispeed = Math.floor(document.body.scrollHeight / 100);
    if (scrollTop > document.body.scrollHeight * 90 / 100) {
        clearInterval(timer);
    }
    console.log('scrollTop:' + scrollTop);
    console.log('scrollHeight:' + document.body.scrollHeight);
    window.scrollTo(0, scrollTop + ispeed);
}, 20)
"""


def _new_headless_driver():
    """Return a Chrome WebDriver running without a visible window."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    # `options=` replaces the deprecated `chrome_options=` keyword.
    driver = webdriver.Chrome(options=chrome_options)
    driver.implicitly_wait(10)
    return driver


def _strip_scheme(href):
    """Return *href* without a leading ``http:``/``https:`` scheme."""
    for scheme in ('https:', 'http:'):
        if href.startswith(scheme):
            return href[len(scheme):]
    return href


def _is_apple_listing(keywords):
    """Return True if any Apple keyword is exactly one of *keywords*."""
    return any(keyword in keywords for keyword in _APPLE_KEYWORDS)


def get_page_detail(maxp):
    """Collect Apple phone listings from the first *maxp* search pages.

    JD search uses odd ``page`` values (1, 3, 5, ...) for consecutive
    visible pages, so visible page ``n`` is requested as ``2 * n - 1``.

    :param maxp: number of search result pages to scan.
    :return: list of ``[href, price, shop, tag]`` lists, where ``href`` has
        its ``http:``/``https:`` scheme removed.
    """
    product_list = []
    if maxp < 1:
        # Nothing to scan; avoid starting a browser at all.
        return product_list

    p_no = 0
    # One browser is reused for every page and always shut down afterwards,
    # instead of leaking a new Chrome process per page.
    driver = _new_headless_driver()
    try:
        for page in range(1, 2 * maxp, 2):
            page_no = (page + 1) // 2
            driver.get(_SEARCH_URL_PREFIX + str(page) + _SEARCH_URL_SUFFIX)
            # Scroll so the lazily loaded second half of the results appears.
            driver.execute_script(_SCROLL_JS)
            time.sleep(_SCROLL_WAIT_SECONDS)

            # parser="html" is required: per the original author, parsing
            # fails without it because the page source is handled as XHTML.
            doc = PyQuery(driver.page_source, parser="html")
            logger_page.info("正在获取%s页数据......", page_no)

            for item in doc("#J_goodsList li").items():
                data_sku = item.attr('data-sku')
                price = item.find(".J_%s" % data_sku).text()
                shop = item.find('.J_im_icon').text()
                tag = item.find("#J_pro_%s" % data_sku).text()

                link = item.find(".gl-i-wrap div a")
                href = link.attr('href')
                # Highlighted search terms, used to drop non-iPhone results.
                keywords = [font.text() for font in link.find('font').items()]
                if href is None or not _is_apple_listing(keywords):
                    continue

                product_list.append([_strip_scheme(href), price, shop, tag])
                p_no += 1
                logger_page.info(
                    '正在获取%s页,第%s个产品信息......', page_no, p_no)
    finally:
        driver.quit()
    return product_list


def product_detail(list):  # noqa: A002 - name kept for caller compatibility
    """Fetch the detail page of every listing and extract its attributes.

    :param list: listings as returned by ``get_page_detail``; each entry is
        ``[href, price, shop, tag]`` with a scheme-less ``href``.
    :return: list of dicts with the keys ``title``, ``jd_price``, ``shop``,
        ``tag``, ``colour``, ``ram`` and ``style_buy``.
    """
    product_info = []
    for no, link in enumerate(list, 1):
        url = 'http:' + link[0]
        logger_detail.info("正在获取第%s条信息......", no)
        # Send the browser-like headers defined above and bound the wait so
        # a stalled connection cannot hang the whole run.
        detail_html = requests.get(
            url,
            headers=headers,
            verify=False,
            timeout=_DETAIL_TIMEOUT_SECONDS,
        )
        doc = PyQuery(detail_html.text, parser="html")
        product_info.append({
            "title": doc(".itemInfo-wrap div.sku-name").text(),
            "jd_price": link[1],
            "shop": link[2],
            "tag": link[3],
            "colour": doc("#choose-attr-1 div.item").text(),
            "ram": doc("#choose-attr-2 div.item").text(),
            "style_buy": doc("#choose-attr-3 div.item").text(),
        })
    return product_info


def main():
    """Scrape the first search page and print the details as JSON."""
    products = get_page_detail(1)
    result = product_detail(products)
    # ensure_ascii=False keeps Chinese text readable in the output.
    print(json.dumps(result, ensure_ascii=False))


if __name__ == '__main__':
    main()
# Source: https://www.cnblogs.com/East-fence/p/12129371.html
# Time: 2024-11-10 03:43:30