import json

import psycopg2

from news_project.config.sql_log import log
from news_project.middlewares import Deal_Content


class NewsProjectPipeline(object):
    """Item pipeline that writes news items into PostgreSQL.

    The hook names (``open_spider`` / ``process_item`` / ``close_spider``)
    match Scrapy's item-pipeline interface. A single database connection is
    opened when the spider starts, reused for every item, and closed when
    the spider finishes.
    """

    # Target table of every INSERT.
    _TABLE = "bjzs_big_data.baoji_news"

    # (table column, item key) pairs in insert order. The column name
    # "lable" is spelled the way the INSERT statement has always used it.
    _BASE_FIELDS = (
        ("type_cn", "type_cn"),
        ("source", "news"),
        ("level2", "id"),
        ("level1", "pid"),
        ("event_time", "time"),
        ("title", "title"),
        ("url", "title_url"),
        ("content", "content"),
        ("lable", "tags"),
        ("type_no", "type_no"),
    )

    def _connect(self):
        """Open a database connection from the ``log()`` settings object.

        Stores the settings object on ``self.l`` and the connection on
        ``self.conn``.
        """
        settings = self.l = log()
        self.conn = psycopg2.connect(
            database=settings.database,
            user=settings.user,
            password=settings.password,
            host=settings.host,
            port=settings.port,
        )

    def open_spider(self, spider):
        """Open the shared database connection when the spider starts."""
        self._connect()

    def process_item(self, item, spider):
        """Normalize one item and insert it into the news table.

        Args:
            item: Mapping-like item; it is copied into a plain ``dict``.
            spider: The running spider (unused).

        Returns:
            The normalized item as a ``dict``, or ``0`` when the item has
            no content and is therefore not stored.

        Raises:
            KeyError: If a required item key is missing.
            psycopg2.Error: If the insert fails; the transaction is rolled
                back before the error propagates.
        """
        item = dict(item)

        # Normalize the time format.
        item['time'] = Deal_Content().handleTime(item['time'], item['title_url'])
        print("item*************************************///////////////////////", item['time'])

        # Store empty strings as NULL. Only existing keys are reassigned,
        # so mutating the dict while iterating is safe.
        for key, value in item.items():
            if value == "":
                item[key] = None

        if item['type_cn'] is None:
            item['type_cn'] = "行业新闻"
        if item['type_no'] is None:
            item['type_no'] = 16
        if item['content'] is None:
            # NOTE(review): Scrapy's convention for discarding an item is to
            # raise DropItem; ``return 0`` is kept so existing behavior does
            # not change. Nothing has been opened yet, so nothing leaks.
            return 0

        # Two storage shapes: with or without an association id.
        fields = list(self._BASE_FIELDS)
        if item.get('association_id'):
            fields.append(("association_id", "association_id"))

        columns = ",".join(column for column, _ in fields)
        placeholders = ",".join(["%s"] * len(fields))
        sql = "INSERT INTO {}({}) VALUES ({})".format(
            self._TABLE, columns, placeholders)
        params = tuple(item[key] for _, key in fields)

        # Reconnect lazily so the pipeline still works if open_spider was
        # not called or the connection has been closed in the meantime.
        if getattr(self, "conn", None) is None or self.conn.closed:
            self._connect()

        # ``with conn`` commits on success and rolls back on error without
        # closing the connection; ``with cursor`` always closes the cursor.
        with self.conn:
            with self.conn.cursor() as cur:
                cur.execute(sql, params)

        return item

    def close_spider(self, spider):
        """Close the shared database connection when the spider finishes."""
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()
# Original article: https://www.cnblogs.com/yuanjia8888/p/10233834.html
# Time: 2024-10-13 08:49:27