import requests
import re
import json
import time
import logging

logger = logging.getLogger(__name__)


class WPHSpider(object):
    """Scrape product name, thumbnail URL and price from vip.com listing pages.

    For each listing page the product ids are pulled out of the static HTML
    with a regular expression; every id is then looked up through the site's
    JSONP endpoint and the resulting name / image / price are printed.
    """

    # Listing page URL; the page number is interpolated, the query string is
    # supplied separately through ``start_param`` so it is sent only once.
    LIST_URL = "https://category.vip.com/search-2-0-%s.html"
    # JSONP endpoint that returns the details of a single product.
    DETAIL_URL = "https://category.vip.com/ajax/mapi.php"
    # Number of listing pages to crawl (pages 1..PAGE_COUNT).
    PAGE_COUNT = 5
    # Upper bound on the number of products looked up per listing page.
    MAX_PRODUCTS_PER_PAGE = 50
    # Seconds before a request is abandoned instead of hanging forever.
    REQUEST_TIMEOUT = 10

    # Compiled once instead of on every loop iteration.
    _PRODUCT_IDS_RE = re.compile(r'.*productIds":(.*),"cateName".*')
    _PRODUCT_DATA_RE = re.compile(r'.*"data":(.*)')

    def __init__(self, proxies=None):
        """Set up request headers, proxy configuration and listing query.

        Args:
            proxies: Optional ``requests``-style proxy mapping. When omitted
                the built-in HTTPS proxy is used; pass ``{}`` to connect
                directly.
        """
        # Browser-like User-Agent header.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
        }
        # Proxy mapping. The key must be the bare scheme ("https"); the
        # previous "https:" key never matched, so the proxy was not applied.
        if proxies is None:
            proxies = {
                "https": "https://36.248.129.249:26392",
            }
        self.proxies = proxies
        # Query string of the listing page.
        self.start_param = {
            "q": "3|29970||",
            "rp": "30073|30079",
        }

    def get_page(self, url, data_for_get):
        """Issue a GET request and return the response body as text.

        Args:
            url: Target URL (without query string).
            data_for_get: Mapping sent as the query string.

        Returns:
            The decoded response body.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        response = requests.get(
            url,
            params=data_for_get,
            headers=self.headers,
            proxies=self.proxies,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.text

    def _extract_product_ids(self, html):
        """Return the product ids embedded in a listing page, or ``[]``."""
        matches = self._PRODUCT_IDS_RE.findall(html)
        if not matches:
            return []
        # Strip the three leading/trailing wrapper characters of the captured
        # array text, then split on the quote-comma-quote separator.
        # NOTE(review): the slice widths depend on the live page format,
        # which is not visible here -- confirm against a real response.
        return matches[0][3:-3].split('","')

    def _parse_product(self, jsonp_text):
        """Extract ``(file_name, image_url, price)`` from a JSONP response.

        Returns ``None`` when the response does not have the expected shape.
        """
        try:
            # Trim the wrapper around the JSON object, then decode it.
            # NOTE(review): the [13:-4] offsets depend on the live response
            # format -- confirm against a real response.
            payload = self._PRODUCT_DATA_RE.findall(jsonp_text)[0][13:-4]
            product = json.loads(payload)
            return (
                product["productName"] + ".jpg",
                product["smallImage"],
                product["vipshopPrice"],
            )
        except (IndexError, ValueError, KeyError, TypeError) as exc:
            logger.warning("Skipping unparseable product response: %r", exc)
            return None

    def run(self):
        """Crawl ``PAGE_COUNT`` listing pages and print each product found."""
        for page in range(1, self.PAGE_COUNT + 1):
            # 1. Fetch the listing page.
            list_html = self.get_page(self.LIST_URL % page, self.start_param)
            # 2. Pull the product ids out of the static response.
            product_ids = self._extract_product_ids(list_html)
            if not product_ids:
                logger.warning("No product ids found on page %s", page)
            for pro_id in product_ids[:self.MAX_PRODUCTS_PER_PAGE]:
                # 3. Build the parameters of the ajax request.
                query_data = {
                    "service": "product_info",
                    "callback": "categoryMerchandiseInfo1",
                    "productIds": pro_id,
                    "functions": "brandShowName,surprisePrice,pcExtra",
                    "warehouse": "VIP_SH",
                    "mobile_platform": "1",
                    "app_name": "shop_pc",
                    "app_version": "4.0",
                    "mars_cid": "1526974867740_7faa89da4de8a00c3547899d727a48f4",
                    "fdc_area_id": "103103101",
                    "_": "1527053746100",
                }
                # 4. Request the product data.
                detail_text = self.get_page(self.DETAIL_URL, query_data)
                # 5/6. Parse the JSON payload into Python values.
                parsed = self._parse_product(detail_text)
                if parsed is None:
                    continue
                # 7. File name, image source and price of the product.
                productName, smallImage, vipshopPrice = parsed
                print(productName, smallImage, vipshopPrice)


if __name__ == "__main__":
    wph_spider = WPHSpider()
    wph_spider.run()
# -*- coding: utf-8 -*-
import scrapy
import re
import json
import time
from urllib.parse import urlencode


class WphSpiderSpider(scrapy.Spider):
    """Scrapy spider that collects product name, thumbnail and price from vip.com.

    ``parse`` extracts the product ids from each listing page and schedules
    one request per id against the JSONP detail endpoint; ``parse_detail``
    decodes that response, appends the record to ``data.json`` and prints it.
    """

    name = 'wph_spider'
    allowed_domains = ['vip.com']
    start_urls = [
        "https://category.vip.com/search-2-0-%s.html?q=3|29970||&rp=30073|30079" % i
        for i in range(5)
    ]

    # JSONP endpoint that returns the details of a single product.
    DETAIL_URL = "https://category.vip.com/ajax/mapi.php"

    # Compiled once instead of on every callback invocation.
    _PRODUCT_IDS_RE = re.compile(r'.*productIds":(.*),"cateName".*')
    _PRODUCT_DATA_RE = re.compile(r'.*"data":(.*)')

    def parse(self, response):
        """Yield one detail request per product id found on a listing page.

        Args:
            response: Scrapy response for a listing page.

        Yields:
            ``scrapy.Request`` objects handled by ``parse_detail``.
        """
        matches = self._PRODUCT_IDS_RE.findall(response.text)
        if not matches:
            self.logger.warning("No product ids found in %s", response.url)
            return
        # Strip the three leading/trailing wrapper characters of the captured
        # array text, then split on the quote-comma-quote separator.
        # NOTE(review): the slice widths depend on the live page format,
        # which is not visible here -- confirm against a real response.
        for pro_id in matches[0][3:-3].split('","'):
            query_data = {
                "service": "product_info",
                "callback": "categoryMerchandiseInfo1",
                "productIds": pro_id,
                "functions": "brandShowName,surprisePrice,pcExtra",
                "warehouse": "VIP_SH",
                "mobile_platform": "1",
                "app_name": "shop_pc",
                "app_version": "4.0",
                "mars_cid": "1526974867740_7faa89da4de8a00c3547899d727a48f4",
                "fdc_area_id": "103103101",
            }
            # urlencode escapes reserved characters that the previous manual
            # "key=value&..." concatenation left raw.
            detail_url = self.DETAIL_URL + "?" + urlencode(query_data)
            yield scrapy.Request(detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Decode a product detail response, persist it and print it.

        Each record is appended to ``data.json`` as one JSON object per line.
        Responses that do not have the expected shape are logged and skipped.

        Args:
            response: Scrapy response from the JSONP detail endpoint.
        """
        matches = self._PRODUCT_DATA_RE.findall(response.text)
        if not matches:
            self.logger.warning("No product data found in %s", response.url)
            return
        try:
            # Trim the wrapper around the JSON object, then decode it.
            # NOTE(review): the [13:-4] offsets depend on the live response
            # format -- confirm against a real response.
            product = json.loads(matches[0][13:-4])
            productName = product["productName"] + ".jpg"
            smallImage = product["smallImage"]
            vipshopPrice = product["vipshopPrice"]
        except (ValueError, KeyError, TypeError) as exc:
            self.logger.warning(
                "Skipping unparseable product response %s: %r", response.url, exc
            )
            return
        record = {
            "productName": productName,
            "smallImage": smallImage,
            "vipshopPrice": vipshopPrice,
        }
        # Explicit UTF-8 because ensure_ascii=False emits non-ASCII text; the
        # trailing newline keeps appended records separable (JSON Lines).
        with open("data.json", "a+", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
        print(productName, smallImage, vipshopPrice)
scrapy实现抓取
原文地址:https://www.cnblogs.com/pymkl/p/9089780.html
时间: 2024-10-11 19:09:51