公司研发不给力,我就自己写了一个,专门爬关键词对应的商品ID。
其中还学会了用Python操作DB2数据库、Python发送邮件、写日志文件、处理浏览器访问限制。
#!/usr/bin/python# -*- encoding:utf-8 -*- import requestsfrom lxml import etreeimport ibm_dbimport loggingimport sysimport timeimport smtplib #配置写入日志logging.basicConfig(level=logging.INFO, format=‘%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s‘, datefmt=‘%a, %d %b %Y %H:%M:%S‘, filename=‘keywords_weekly.log‘, filemode=‘a‘) #编码reload(sys)sys.setdefaultencoding(‘utf-8‘) # 解决服务器限制访问问题def get_url_data(url,headers,max_tries=10): remaining_tries = max_tries while remaining_tries > 0: try: return requests.get(url,headers=headers) except requests.exceptions: time.sleep(60) remaining_tries = remaining_tries - 1 raise Exception("Couldn‘t get the url_data.") #写入db2def write_db2(resultdict): rank=resultdict[‘rank‘] #由于中文编码问题,关键词直接用update的方法更新 # keywords=resultdict[‘keywords‘] uv=resultdict[‘uv‘] frequency=resultdict[‘frequency‘] goods_id=resultdict[‘goods_id‘] sql_in="insert into T_KEYWORDS_weekly(K_RANK,UV,FREQUENCY,GOODS_ID,week_YEAR)" \ " values (%r,%r,%r,%r,year(current date)||‘-‘||WEEK_ISO(current date))" % (rank,uv,frequency,goods_id) ibm_db.exec_immediate(conn, sql_in) ibm_db.commit(conn) # #翻页def get_html(keywords): #keywords="沙发" user_agent = ‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/31.0.1650.63 Safari/537.36‘ headers = { ‘User-Agent‘ : user_agent } #网址被我隐藏了哈,可以猜猜 url=‘http://www.XXX.com/category-9999/list-p1/?fl=q&keywords=%s‘%keywords html = get_url_data(url,headers) html_list=[html] selector = etree.HTML(html.text) #page_f="http://www.meilele.com" #得到共有多少页,循环得到各个页面的url page_e=selector.xpath(‘/html/body/div[@class="page-panel"]/div/div/span[@class="p-info"]/b/text()‘) if page_e: for i in range(2,int(page_e[0])+1): url_temp=‘http://www.meilele.com/category-9999/list-p%d/?fl=q&keywords=%s‘%(i,keywords) html_temp=requests.get(url_temp,headers=headers) html_list.append(html_temp) return html_list # #获取内容def get_id(dictionary): keywords=dictionary[1] html_list=get_html(keywords) logging.info("get the html_list %s successfully" %keywords) for each in html_list: html=each selector = etree.HTML(html.text) result={} try: content_field = selector.xpath(‘//*[@id="JS_list_panel"]/div[@class="w list-wrap"]/ul[@class="list-goods clearfix"]‘)[0] except: content_field=[] result[‘rank‘]=str(dictionary[0]) # result[‘keywords‘]=str(keywords) result[‘uv‘]=str(dictionary[2]) result[‘frequency‘]=str(dictionary[3]) result[‘goods_id‘]=str(‘‘) write_db2(result) else: for i in range(1,len(content_field)+1): goods_id = content_field.xpath(‘li[%d]/@data-goods-id‘%i)[0] #return goods_id result[‘rank‘]=str(dictionary[0]) # result[‘keywords‘]=str(keywords) result[‘uv‘]=str(dictionary[2]) result[‘frequency‘]=str(dictionary[3]) result[‘goods_id‘]=str(goods_id) write_db2(result) if __name__ == "__main__": #把密码也隐藏起来 conn=ibm_db.connect("DATABASE=aedw;HOSTNAME=miranda;PORT=50000;PROTOCOL=TCPIP;UID=miranda; PWD=miranda;", "", "") #测试连接 try: conn logging.info("connect to DB2 successfully") except: logging.info("couldn‘t connect to DB2") # #创建表 # sql_create=‘create table T_keywords_weekly_TEMP like V_keywords_weekly‘ # stmt_create = ibm_db.exec_immediate(conn, sql_create) # try: # stmt_create # logging.info("create table T_keywords_weekly_TEMP successfully") # except: # logging.info("couldn‘t create table T_keywords_weekly_TEMP") # #插入表 # sql_insert="insert into T_keywords_weekly_TEMP select * from V_keywords_weekly where rank>=100" # stmt_insert = ibm_db.exec_immediate(conn, sql_insert) # try: # stmt_insert # logging.info("insert into T_keywords_weekly_TEMP successfully") # except: # logging.info("couldn‘t insert into table T_keywords_weekly_TEMP")
sql_select="select * from T_keywords_weekly_TEMP where rank>=162"stmt_select = ibm_db.exec_immediate(conn, sql_select)try: stmt_select logging.info("get the data from T_keywords_weekly_TEMP")except: logging.info("couldn‘t get the data from T_keywords_weekly_TEMP")else: dictionary = ibm_db.fetch_both(stmt_select) while dictionary != False: logging.info (‘rank:‘+str(dictionary[0])+ ‘ keywords:‘+str(dictionary[1])) get_id(dictionary) dictionary = ibm_db.fetch_both(stmt_select) #这一句不能少啊# 更新关键字sql_update=‘‘‘MERGE INTO T_KEYWORDS_weekly as tkmUSING T_keywords_weekly_TEMP as tkmtON tkm.K_RANK=tkmt.RANKand tkm.week_YEAR=year(current date)||‘-‘||WEEK_ISO(current date)WHEN MATCHEDTHEN UPDATE SET tkm.KEYWORDS=tkmt.KEYWORDSELSE IGNORE‘‘‘stmt_update=ibm_db.exec_immediate(conn, sql_update)try: stmt_update logging.info("update the keywords")except: logging.info("couldn‘t update the keywords") # sql_drop="drop table T_keywords_weekly_TEMP"# stmt_drop = ibm_db.exec_immediate(conn, sql_drop)# try:# stmt_drop# logging.info("drop table T_keywords_weekly_TEMP successfully")# except:# logging.info("couldn‘t drop table T_keywords_weekly_TEMP")## ibm_db.close(conn)# logging.info("close the connect!") # 配置发送邮件sender = ‘[email protected]‘receivers = [‘[email protected]‘]SUBJECT = "Successfully update T_KEYWORDS_weekly"TEXT = ‘‘‘ Dear miranda, your python script of update T_KEYWORDS_weekly of last month have successed. may you have a good mood‘‘‘ message = """\From: %sTo: %sSubject: %s %s""" % (sender, ", ".join(receivers), SUBJECT, TEXT) try: smtpObj = smtplib.SMTP(‘mail.meilele.com‘, 25) smtpObj.sendmail(sender, receivers, message) logging.info("Successfully sent email")except: logging.info("Error: unable to send email" ) print(‘finish‘)
时间: 2024-11-06 19:16:33