"""Taobao search-listing scraper with a small Tkinter front end.

The form collects a keyword and a page range. For every page in the range
the search result HTML is downloaded, item fields are pulled out with
regular expressions, and the items are inserted into a MySQL table.

``requests``, ``pymysql`` and ``tkinter`` are imported at their point of use
so that the parsing logic can be imported without a display, a network
client, or a database driver being available.
"""

import re
from urllib.parse import quote

# Step applied to the ``s`` query parameter for each page number.
PAGE_STEP = 44

# Connection settings passed verbatim to ``pymysql.connect``.
DB_CONFIG = {
    "host": "127.0.0.1",
    "user": "root",
    "password": "root",
    "database": "data_list",
    "port": 3306,
    "charset": "utf8",
}

# Column order shared by the INSERT statement and the row tuples.
COLUMNS = (
    "name",
    "goods_link",
    "img_link",
    "title",
    "price",
    "company",
    "deal_count",
    "comment_count",
)

# Placeholders are left unquoted so the driver escapes every value itself.
INSERT_SQL = (
    "insert into data(name,goods_link,img_link,title,price,company,"
    "deal_count,comment_count) values(%s,%s,%s,%s,%s,%s,%s,%s)"
)

# One compiled pattern per scraped field, built once instead of per page.
IMG_URL_RE = re.compile(r'"pic_url":"(//.*?)"')
GOODS_ID_RE = re.compile(r'"nid":"(.*?)"')
TITLE_RE = re.compile(r'"raw_title":"(.*?)"')
COMPANY_RE = re.compile(r'"nick":"(.*?)"')
PRICE_RE = re.compile(r'"view_price":"(.*?)"')
DEAL_COUNT_RE = re.compile(r'"view_sales":"(.*?)"')
COMMENT_COUNT_RE = re.compile(r'"comment_count":"(.*?)"')


class Taobao(object):
    """Download, parse and store Taobao search results for one keyword."""

    def __init__(self, name, to_page, w_page):
        """Prepare the request headers and the list of page URLs.

        Args:
            name: Search keyword; it is percent-encoded before use.
            to_page: First page number (inclusive).
            w_page: Last page number (inclusive).
        """
        # Encoding the keyword keeps characters such as '&', '#', spaces and
        # braces from corrupting the query string or the format template.
        self.url = 'https://s.taobao.com/search?q=' + quote(name, safe='') + '&s={}'
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'br' is not advertised: requests only decodes brotli when an
            # optional brotli package is installed, otherwise the body would
            # come back undecoded.
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }
        # NOTE(review): page N maps to offset N * 44, so entering 1 requests
        # s=44 - confirm whether the form is meant to be 0-based.
        self.url_list = [
            self.url.format(str(page * PAGE_STEP))
            for page in range(to_page, w_page + 1)
        ]

    def get_data(self, url):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Fully built search URL.

        Returns:
            The body decoded as UTF-8; undecodable bytes are replaced
            instead of raising ``UnicodeDecodeError``.
        """
        import requests

        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode('utf-8', errors='replace')

    def parse_data(self, data):
        """Extract item records from the page source.

        Args:
            data: Page source as a string.

        Returns:
            A list of dicts with the keys ``name``, ``img_link``,
            ``goods_link``, ``price``, ``title``, ``company``,
            ``deal_count`` and ``comment_count``.
        """
        # zip() stops at the shortest list, so a field that is missing for
        # some items can no longer raise IndexError.
        matches = zip(
            IMG_URL_RE.findall(data),
            GOODS_ID_RE.findall(data),
            TITLE_RE.findall(data),
            COMPANY_RE.findall(data),
            PRICE_RE.findall(data),
            DEAL_COUNT_RE.findall(data),
            COMMENT_COUNT_RE.findall(data),
        )
        data_list = []
        for img, goods_id, title, company, price, deal_count, comment_count in matches:
            # The comment count may be present but empty; store it as 0.
            if comment_count == "":
                comment_count = 0
            data_list.append({
                'name': '淘宝',
                'img_link': "http:" + img,
                'goods_link': "https://detail.tmall.com/item.htm?id=" + goods_id,
                'price': price,
                'title': title,
                'company': company,
                'deal_count': deal_count,
                'comment_count': comment_count,
            })
        return data_list

    def save_data(self, data_list):
        """Insert the parsed items into the ``data`` table.

        Any exception raised while connecting or writing is appended to
        ``log.txt`` instead of propagating, so one failing page does not
        stop the remaining pages.

        Args:
            data_list: Item dicts as returned by ``parse_data``.
        """
        if not data_list:
            return

        import pymysql

        rows = [tuple(item[column] for column in COLUMNS) for item in data_list]
        conn = None
        try:
            conn = pymysql.connect(**DB_CONFIG)
            with conn.cursor() as cursor:
                # Parameterised insert: values are escaped by the driver, so
                # quotes in a title cannot break or inject into the SQL.
                count = cursor.executemany(INSERT_SQL, rows)
            print(count)
            conn.commit()
        except Exception as e:
            # Deliberate best-effort: record the error and carry on.
            with open('log.txt', 'a', encoding='utf-8') as f:
                f.write(repr(e) + '\n')
        finally:
            # conn stays None when connecting itself failed.
            if conn is not None:
                conn.close()

    def run(self):
        """Fetch, parse and store every page in ``self.url_list``."""
        for url in self.url_list:
            data = self.get_data(url)
            data_list = self.parse_data(data)
            self.save_data(data_list)


def main():
    """Button callback: read the form fields and run the scraper.

    Reads the module-level ``name``, ``to_page`` and ``w_page`` variables
    that are created when the file is run as a script.
    """
    keyword = str(name.get())
    first_page = int(to_page.get())
    last_page = int(w_page.get())
    scraper = Taobao(keyword, first_page, last_page)
    scraper.run()


if __name__ == '__main__':
    # The form is built only when run as a script, so importing this module
    # does not open a window.
    from tkinter import Button, Entry, Label, StringVar, Tk

    window = Tk()
    window.title("淘宝列表商品采集")
    window.geometry('200x180')

    Label(window, text='关键字采集').pack()
    name = StringVar()
    Entry(window, textvariable=name).pack()

    Label(window, text='采集起始页').pack()
    to_page = StringVar()
    Entry(window, textvariable=to_page).pack()

    Label(window, text='采集结束页').pack()
    w_page = StringVar()
    Entry(window, textvariable=w_page).pack()

    Button(window, text="确定", relief='groove', width=9, height=1, bd=4, command=main).pack()
    window.mainloop()
# Original article: https://www.cnblogs.com/bkylkh/p/8629410.html
# Time: 2024-09-30 11:19:00