其实很简单,却因为一些小问题,折腾不少时间,简要记录一下,以备后需。
>> scrapy startproject lagou >> cd lagou >> scrapy genspider lagoujd www.lagou.com
定义item
在items.py中继续完善定义:
# -*- coding: utf-8 -*-
#
# Item models for the lagou project.
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class LagouItem(scrapy.Item):
    """One scraped job posting from lagou.com."""
    job_title = scrapy.Field()        # job name extracted from the page <title>
    job_description = scrapy.Field()  # requirement lines kept from the raw JD
    job_url = scrapy.Field()          # URL of the job page
完善爬虫
# -*- coding: utf-8 -*-
"""Spider that crawls lagou.com job pages and extracts requirement text.

Fixes vs. the pasted original: the curly quotes (a SyntaxError) are
replaced with ASCII quotes, and the three regexes are compiled once at
class level instead of on every response.
"""
import re
import sys

from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from bs4 import BeautifulSoup

from lagou.items import LagouItem

if sys.version_info[0] == 2:
    # Python-2-only hack: make utf-8 the default codec so implicit
    # str/unicode conversions do not raise UnicodeDecodeError.
    reload(sys)
    sys.setdefaultencoding('utf-8')


class LagoujdSpider(CrawlSpider):
    """Follow /jobs/<id>.html links and parse every job description page."""

    name = "lagoujd"
    allowed_domains = ["lagou.com"]
    start_urls = (
        'http://www.lagou.com/jobs/787409.html',
    )
    rules = [
        # The callback must NOT be named "parse": CrawlSpider uses parse()
        # internally to drive its rules.
        Rule(SgmlLinkExtractor(allow=r'jobs/\d+\.html'),
             callback='parse_lagou', follow=True),
    ]

    # Compiled once here instead of on every parse_lagou() call.
    SPLIT_DEMAND = re.compile(u'(要求|资格|条件)[::;\r\n]?')
    SPLIT_LINE = re.compile(u'[;;。\r\n]')
    DEMAND = re.compile(u'具备|熟悉|具有|熟练|掌握|良好的|能够|丰富的|以上学历|优秀的|有深入研究|有很强的|工作 经历|工作经验|善于|懂得|优先|不少于|不超过|喜欢|较强的.{2,8}能力|相关专业|相关学历|开发经验|实习经验|\d年以上')

    def parse_lagou(self, response):
        """Build a LagouItem (title, cleaned description, url) from one page."""
        item = LagouItem()
        sel = Selector(response)
        try:
            # Page title is "<job>-<company>..."; keep the job part and
            # drop the last two characters (site suffix) -- TODO confirm.
            item["job_title"] = sel.xpath("//title/text()").extract()[0].split('-')[0][:-2].strip()
            job_des = sel.xpath('//*[@id="container"]/div[1]/div[1]/dl[1]/dd[2]').extract()[0]
            job_des = BeautifulSoup(job_des).get_text()
            item["job_description"] = self.get_demand(job_des)
            item["job_url"] = response.url
            print(item['job_title'])
        except Exception as e:
            # Best effort: report and still return the (possibly partial) item.
            print(e)
        return item

    def get_demand(self, jdstr):
        """Return only the requirement lines following a 要求/资格/条件 header."""
        res = []
        header = self.SPLIT_DEMAND.search(jdstr)
        if header:
            pos = header.span()[1]
            for line in self.SPLIT_LINE.split(jdstr[pos:]):
                if len(line) < 5:
                    continue
                if re.match(r'\d', line.strip()):
                    res.append(line)      # numbered requirement line
                elif self.DEMAND.search(line):
                    res.append(line)      # line containing a demand keyword
                else:
                    break                 # left the requirements section
        return '\n'.join(res)
存储抓取的数据为json格式
# -*- coding: utf-8 -*-
"""Item pipeline that appends each scraped item to lagou_jd.json.

Don't forget to register the pipeline in ITEM_PIPELINES in settings.py.
See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""
import codecs
import json


class LagouPipeline(object):
    """Serialize every item as one UTF-8 JSON object per line."""

    def __init__(self):
        self.file = codecs.open('lagou_jd.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps the Chinese text human-readable.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # BUG FIX: Scrapy invokes close_spider() on pipelines; the original
        # method was named spider_closed() and was never called, so the
        # output file handle was never closed.
        self.file.close()
在settings.py 中注册pipeline
# -*- coding: utf-8 -*-
"""Scrapy settings for the lagou project.

For simplicity, this file contains only the most important settings by
default. All the other settings are documented here:
http://doc.scrapy.org/en/latest/topics/settings.html
"""

BOT_NAME = 'lagou'

SPIDER_MODULES = ['lagou.spiders']
NEWSPIDER_MODULE = 'lagou.spiders'

# Register the JSON-export pipeline (lower value = earlier in the chain).
ITEM_PIPELINES = {
    'lagou.pipelines.LagouPipeline': 300,
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lagou (+http://www.yourdomain.com)'
运行,各种抓!!!
>> scrapy crawl lagoujd 或者 >> scrapy crawl lagoujd -o item.json -t json
demo:
[email protected]:~/workplace/spiders/lagou$ more lagou_jd.json {"job_url": "http://www.lagou.com/jobs/1102051.html", "job_description": "1、具有2年以上互联网产品经验,优秀的交互设计 能力,对产品设计有极高的要求,追求极致的用户体验 2、善于观察与学习,具有宽广的视野、对于整体产品规划有自己的见解和理念 3、有优秀缜密的逻辑与思维能力,良好的协调能力、分析、计划及项目管理能力,具备良好的团队精神,沟通能力强 4、熟练使用 Axu re 、 visio 、 office 等软件\n 5、有成熟的O2O平台类产品设计经验者优先", "job_title": "产品经理"} {"job_url": "http://www.lagou.com/jobs/917776.html", "job_description": "1、有2年以上互联网产品规划和体验设计相关经验, 熟悉互联网或软件产品整体实现过程,包括从需求分析到产品发布\n2、有完整策划至少2个以上成功、目前尚在运营中的互联网产品设 计案例\n3、能通过数据分析等系统性方法深刻理解用户需求并予以满足\n4、执行力强,善于组织协调并推动项目进展\n5、对工作充满 热情,富有创新精神,能承受较大的工作压力\n6、有良好的学习能力、良好的沟通能力和团队合作精神,出色的组织能力", "job_titl e": "产品经理"}
新建脚本文件preprocess.py,进一步预处理
# coding=utf-8 import simplejson as json import re import sys,codecs reload(sys) sys.setdefaultencoding(‘utf-8‘) from collections import defaultdict from simhash import Simhash def clean_text(fname=‘./lagou_jd.json‘): res = defaultdict(str) i=1 for line in codecs.open(fname): jd = json.loads(line) if not re.match(u‘\d‘,jd[‘job_description‘].strip()) or len(jd["job_description"])<8 or len(jd["job_title"])<2: continue if re.search(u‘公司简介|职业描述|福利|职责|描述‘,jd[‘job_description‘]):continue if res.has_key(jd["job_title"]) and Simhash(jd[‘job_title‘]).distance(Simhash(res[jd[‘job_title‘]]))<0.1: jd[‘job_title‘] = res[jd[‘job_title‘]] jd["job_description"] = re.sub(ur"\xa0","",jd["job_description"].decode(‘utf-8‘)) res[jd[‘job_title‘]] = res.get(jd[‘job_title‘],‘‘)+‘\n‘+‘\n‘.join(map(lambda x:re.sub("\s+|\t"," ",x).strip(),( filter(lambda x:len(x)>3 and re.match(r‘\d‘,x),re.split(u‘[;:。\r\n]‘,jd[‘job_description‘].strip()))))) # print ‘\n\n===\n\n‘,jd[‘job_title‘],‘\t‘,res[jd[‘job_title‘]] i += 1 if i%100==0: print i print i,"done" print len(res) json.dump(res,open(‘lagou_jd_clean.json‘,‘w‘)) def get_jds(fname=‘./lagou_jd_clean.json‘): res = json.load(codecs.open(fname)) i = 1 for k,v in res.iteritems(): if len(k)>5: print k,v print "\n\n===\n\n" i += 1 if i>5: break if __name__ == "__main__": clean_text() get_jds()
时间: 2024-10-29 15:32:00