# coding:utf-8
"""Selenium crawler that searches suning.com for phones and prints listings.

The script drives a local Chrome instance, types the search keyword, and for
each result page pulls the URL, name, price and description out of every
product card with regular expressions, printing them to stdout.

NOTE(review): this file was recovered from a paste whose line breaks and
indentation were lost and whose quotes had been replaced by typographic
quotes. The loop nesting below is a reconstruction -- confirm it against the
original script.
"""
import datetime
import json
import logging.handlers
import os
import pickle
import platform
import re
import sys
import time

import redis
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

session = requests.session()

# Python 2 only: force UTF-8 as the implicit str/unicode codec. Neither
# ``reload`` nor ``sys.setdefaultencoding`` exists on Python 3, where text is
# already unicode, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

# NOTE(review): host and password are hard-coded credentials committed to
# source; move them to configuration/environment and rotate the password.
r = redis.Redis(host="123.56.74.190", port=6379, password="ZBHRwlb1608")

sysStr = platform.system()
if sysStr == "Windows":
    LOG_FILE_check = 'C:\\log\\wlb\\crawler\\cic.log'
else:
    LOG_FILE_check = '/log/wlb/crawler/cic.log'

# Rotating file handler: 128 MiB per file, at most ten backup files.
handler = logging.handlers.RotatingFileHandler(
    LOG_FILE_check, maxBytes=128 * 1024 * 1024, backupCount=10)
fmt = '\n' + '%(asctime)s - %(filename)s:%(lineno)s - %(message)s'
formatter = logging.Formatter(fmt)
handler.setFormatter(formatter)

# Logger named 'check' that writes through the rotating handler.
logger = logging.getLogger('check')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)

# Each field is extracted with a primary pattern and a looser fallback, tried
# in this order against the flattened HTML of one product card.
_PRICE_PATTERNS = (
    'pricefn="priceCenterShow"><i>¥</i>(.*?)<i>.*?</i></span>',
    '<i>¥</i>(.*?)<i>.*?</i></span>',
)
_NAME_PATTERNS = (
    '<i class=".*?" style=".*?"></i>(.*?)</b></a>',
    'target="_blank" title="(.*?)"><i class=',
)
_URL_PATTERNS = (
    'class=".*?" href="(.*?)" name',
    '<a class=".*?" href="(.*?)" id=',
)
_DESC_PATTERNS = (
    '<span><i></i><em>(.*?)</em><b></b></span>',
    '<em>(.*?)</em>',
)


def _first_match(patterns, text):
    """Return the first capture of the first pattern that matches *text*.

    Patterns are tried in order. Raises IndexError when none of them match,
    which is the same exception the original chained
    ``re.findall(...)[0]`` lookups raised when the fallback also failed.
    """
    for pattern in patterns:
        found = re.findall(pattern, text)
        if found:
            return found[0]
    raise IndexError("no pattern matched: %r" % (patterns,))


def spider():
    """Search suning.com for phones and print every product on each page.

    Opens Chrome through the chromedriver at a fixed Windows path, enters the
    keyword, then repeatedly clicks the search and next-page controls and
    prints the URL, name, price and description of each product card.

    Raises:
        IndexError: if the results container is missing from the page, or a
            product card matches neither pattern for one of its fields.
    """
    # Raw string: same value as before, without relying on unknown escapes.
    chromedriver = (
        r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
    os.environ["webdriver.chrome.driver"] = chromedriver
    browser = webdriver.Chrome(chromedriver)
    try:
        # URL the browser opens first.
        url = "https://www.suning.com/"
        browser.get(url)
        time.sleep(5)

        browser.find_element_by_id("searchKeywords").send_keys(u'手机')
        time.sleep(2)

        for _page in range(1, 100):
            # NOTE(review): the search button is clicked on every iteration,
            # as in the original statement order; confirm this is intended
            # rather than a single click before the loop.
            browser.find_element_by_name("index1_none_search_ss1").click()
            browser.find_element_by_id("nextPage").click()

            result = browser.page_source
            soup = BeautifulSoup(result, 'html.parser')
            result_ul = soup.find_all('div', attrs={"id": "filter-results"})[0]
            result_list = result_ul.find_all('div', attrs={"class": "li-bg"})
            print(len(result_list))
            # Debug sample; guarded so a short page does not abort the crawl.
            if len(result_list) > 1:
                print(result_list[1])

            for item in result_list:
                # Flatten the card to one line so the patterns can span it.
                item = (str(item)
                        .replace('\n', '')
                        .replace('\r', '')
                        .replace('\t', ''))
                print("==" * 30)
                print(item)

                sold_price = _first_match(_PRICE_PATTERNS, item)
                item_name = _first_match(_NAME_PATTERNS, item)
                item_url = _first_match(_URL_PATTERNS, item)
                item_desc = _first_match(_DESC_PATTERNS, item)

                print(item_url)
                print(item_name)
                print(sold_price)
                print(item_desc)

        # NOTE(review): placement of this pause is reconstructed; it is
        # assumed to run once after all pages -- confirm.
        time.sleep(500)
    finally:
        # Always shut down Chrome and chromedriver, even after a failure.
        browser.quit()


spider()
# Original article: https://www.cnblogs.com/xuchunlin/p/8326007.html
# Time: 2024-10-10 03:27:21