一、爬取电影信息
http://www.imdb.cn/nowplaying/{num} #页面规则
http://www.imdb.cn/title/tt{num} #某部电影信息
获取电影url和title
新建项目
scrapy startproject imdb
修改items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

# fix: the original did `import scrapy` but then used Item/Field unqualified,
# which raises NameError — import both names explicitly.
from scrapy import Item, Field


class ImdbItem(Item):
    """One movie scraped from www.imdb.cn."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    video_title = Field()     # page heading (h3)
    video_rating = Field()    # rating score shown next to the title
    video_name = Field()      # film name (片名)
    video_alias = Field()     # alternative titles (别名), '|'-joined
    video_director = Field()  # directors, '|'-joined
    video_actor = Field()     # leading actors, '|'-joined
    video_length = Field()    # runtime text
    video_language = Field()
    video_year = Field()      # release date (上映时间)
    video_type = Field()      # genre
    video_color = Field()     # colour / black-and-white marker
    video_area = Field()      # country/region (国家)
    video_voice = Field()     # audio language
    video_summary = Field()   # plot summary
    video_url = Field()       # detail-page URL
在spiders目录下新建爬虫文件movie.py
# -*- coding: utf-8 -*-
"""Spider that seeds imdb.cn 'now playing' listing pages and parses movie detail pages."""
import logging

from scrapy import Request  # fix: Request lives in scrapy, not scrapy.spiders
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor  # fix: module name is 'linkextractors'

from imdb.items import ImdbItem

logger = logging.getLogger(__name__)

# Repeated XPath prefix of the movie-info <ul> box on a detail page.
BOX = '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li'


class ImdbSpider(CrawlSpider):
    name = 'imdb'
    allowed_domains = ['www.imdb.cn']
    rules = (
        # Follow movie detail pages, e.g. /title/tt1234567
        Rule(LinkExtractor(allow=r"/title/tt\d+$"), callback="parse_imdb", follow=True),
    )

    def start_requests(self):
        # Seed the crawl with the first 19 "now playing" listing pages.
        for i in range(1, 20):
            url = "http://www.imdb.cn/nowplaying/" + str(i)
            yield Request(url=url, callback=self.parse)

    def _parse_release_li(self, response, item, n):
        """Fill year / colour / genre from the release-info <li> at 1-based position *n*.

        Layout (observed): a[1] = release date, middle anchors = "label:genre"
        list, last anchor = colour marker.
        """
        item['video_year'] = "".join(
            response.xpath(BOX + '[%d]/a[1]/text()' % n).extract())
        anchors = response.xpath(BOX + '[%d]/a' % n).extract()
        last = len(anchors) - 1
        texts = response.xpath(BOX + '[%d]/a/text()' % n).extract()
        try:
            item['video_color'] = "".join(texts[last])
        except Exception:
            item['video_color'] = ""
        try:
            # The middle anchors hold the genres; strip a leading "xxx:" label.
            genre = "|".join(texts[1:last])
            parts = genre.split(":")
            item['video_type'] = parts[0] if parts else ""
        except Exception:
            item['video_type'] = ""

    def parse_imdb(self, response):
        """Parse one movie detail page into an ImdbItem.

        The original hard-coded each label's <li> position (i==0 -> li[1],
        i==1 -> li[2], ...); the pattern is always li[i+1], so the index is
        computed instead of enumerated case by case.
        """
        item = ImdbItem()
        try:
            item['video_title'] = "".join(response.xpath(
                '//*[@class="fk-3"]/div[@class="hdd"]/h3/text()').extract())
            item['video_rating'] = "".join(response.xpath(
                '//*[@class="fk-3"]/div[@class="hdd"]/span/i/text()').extract())

            rows = response.xpath(BOX).extract()
            for i, row in enumerate(rows):
                n = i + 1  # XPath positions are 1-based
                if "片名" in row:
                    item['video_name'] = "".join(
                        response.xpath(BOX + '[%d]/a/text()' % n).extract())
                if "别名" in row:
                    item['video_alias'] = "|".join(
                        response.xpath(BOX + '[%d]/a/text()' % n).extract())
                if "导演" in row:
                    item['video_director'] = "|".join(
                        response.xpath(BOX + '[%d]/a/text()' % n).extract())
                if "主演" in row:
                    item['video_actor'] = "|".join(
                        response.xpath(BOX + '[%d]/a/text()' % n).extract())
                if "上映时间" in row:
                    self._parse_release_li(response, item, n)
                if "国家" in row:
                    item['video_area'] = "|".join(
                        response.xpath(BOX + '[%d]/a[1]/text()' % n).extract())
                    item['video_voice'] = "|".join(
                        response.xpath(BOX + '[%d]/a[2]/text()' % n).extract())

            item['video_length'] = "".join(response.xpath(
                BOX + '[@class="nolink"]/text()').extract()).replace(" ", "")
            item['video_language'] = "".join(response.xpath(
                BOX + '[@class="nolink"]/a/text()').extract())
            item['video_summary'] = "".join(response.xpath(
                '//*[@class="fk-4 clear"]/div[@class="bdd clear"]/i/text()'
            ).extract()).strip().replace("<br>", "")
            item['video_url'] = response.url
            yield item
        except Exception as error:
            # fix: the original called the undefined name `log`; use stdlib logging.
            logger.exception(error)
在spiders目录下新建run.py启动文件
vim run.py
# coding:utf-8
"""Launcher: run the `imdb` spider through Scrapy's command-line API."""
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "imdb"])
二、有限深度爬取
新建项目
scrapy startproject douban
scrapy中,我们在settings.py设置深度使用DEPTH_LIMIT,例如:DEPTH_LIMIT = 5,该深度是相对于初始请求url的深度
修改settings.py
DEPTH_LIMIT = 4
#豆瓣有反爬虫机制,因此设置延时DOWNLOAD_DELAY
DOWNLOAD_DELAY = 2
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36' #设置用户代理（User-Agent），伪装成浏览器
items.py
from scrapy import Item, Field


# music record
class MusicItem(Item):
    """One album/record scraped from music.douban.com."""
    music_name = Field()
    music_alias = Field()
    music_singer = Field()
    music_time = Field()
    music_rating = Field()
    music_votes = Field()
    music_tags = Field()
    music_url = Field()


# music review
class MusicReviewItem(Item):
    """One user review of a record."""
    review_title = Field()
    review_content = Field()
    review_author = Field()
    review_music = Field()
    review_time = Field()
    review_url = Field()
爬虫文件music.py
# -*- coding: utf-8 -*-
"""Crawl user reviews of one douban music subject (depth-limited via settings)."""
import logging

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from douban.items import MusicItem, MusicReviewItem

logger = logging.getLogger(__name__)


class ReviewSpider(CrawlSpider):
    name = 'review'
    allowed_domains = ['music.douban.com']
    start_urls = ['https://music.douban.com/subject/1406522/']
    rules = (
        # Review listing / pagination pages: followed but not parsed.
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time\&start=\d+$")),
        # Individual review pages yield items.
        Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
    )

    def parse_review(self, response):
        """Extract one review page into a MusicReviewItem."""
        try:
            item = MusicReviewItem()
            item['review_title'] = "".join(
                response.xpath('//*[@property="v:summary"]/text()').extract())
            content = "".join(response.xpath(
                '//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
            item['review_content'] = content.strip().replace('\n', " ")
            item['review_author'] = "".join(
                response.xpath('//*[@property="v:reviewer"]/text()').extract())
            item['review_music'] = "".join(
                response.xpath('//*[@class="main-hd"]/a[2]/text()').extract())
            item['review_time'] = "".join(
                response.xpath('//*[@class="main-hd"]/p/text()').extract())
            item['review_url'] = response.url
            yield item
        except Exception as error:
            # fix: `from scrapy import log; log(error)` called the module object
            # (TypeError) and scrapy.log is gone in modern Scrapy — use logging.
            logger.exception(error)
启动命令文件run.py
# --*-- coding: utf-8 --*--
"""Launcher: run the `review` spider and export the items to review.json."""
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "review", "-o", "review.json"])
-o 参数导出结果到review.json文件
多个爬虫组合
我们现在有这么个需求,既要爬取音乐详情又要爬取乐评,既要爬取电影详情又要爬取影评,这个要怎么搞,难道是每一个需求就要创建一个项目么,如果按这种方式,我们就要创建四个项目,分别来爬取音乐、乐评、电影、影评,显然这么做的话,代码不仅有很多重合的部分,而且还不容易维护爬虫
新建项目
scrapy startproject multi
修改settings.py
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
DOWNLOAD_DELAY = 2
修改items.py
from scrapy import Item, Field


# music record
class MusicItem(Item):
    music_name = Field()
    music_alias = Field()
    music_singer = Field()
    music_time = Field()
    music_rating = Field()
    music_votes = Field()
    music_tags = Field()
    music_url = Field()


# music review
class MusicReviewItem(Item):
    review_title = Field()
    review_content = Field()
    review_author = Field()
    review_music = Field()
    review_time = Field()
    review_url = Field()


# movie / tv show
class VideoItem(Item):
    video_name = Field()
    video_alias = Field()
    video_actor = Field()
    video_year = Field()
    video_time = Field()
    video_rating = Field()
    video_votes = Field()
    video_tags = Field()
    video_url = Field()
    video_director = Field()
    video_type = Field()
    video_bigtype = Field()
    video_area = Field()
    video_language = Field()
    video_length = Field()
    video_writer = Field()
    video_desc = Field()
    video_episodes = Field()


# movie review
class VideoReviewItem(Item):
    review_title = Field()
    review_content = Field()
    review_author = Field()
    review_video = Field()
    review_time = Field()
    review_url = Field()

spiders目录下新建两个爬虫文件
videospider.py
# -*- coding: utf-8 -*-
"""Crawl douban movie/TV detail pages and their reviews."""
import logging
import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from multi.items import VideoItem, VideoReviewItem

logger = logging.getLogger(__name__)

# Regexes that pull plain-text fields out of the raw #info HTML block.
AREA = re.compile(r"制片国家/地区:</span> (.+?)<br>")
ALIAS = re.compile(r"又名:</span> (.+?)<br>")
LANGUAGE = re.compile(r"语言:</span> (.+?)<br>")
EPISODES = re.compile(r"集数:</span> (.+?)<br>")
LENGTH = re.compile(r"单集片长:</span> (.+?)<br>")


class VideoSpider(CrawlSpider):
    name = 'video'
    allowed_domains = ['movie.douban.com']
    start_urls = [
        'https://movie.douban.com/tag/',
        'https://movie.douban.com/tag/?view=cloud',
    ]
    rules = (
        # Tag listing pages and their pagination: followed, not parsed.
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))$")),
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))\?start=\d+\&type=T$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?start=\d+$")),
        # Detail and review pages yield items.
        Rule(LinkExtractor(allow=r"/subject/\d+/$"), callback="parse_video", follow=True),
        Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
    )

    def parse_video(self, response):
        """Extract one movie/TV detail page into a VideoItem."""
        item = VideoItem()
        try:
            item["video_url"] = response.url
            item["video_name"] = ''.join(response.xpath(
                '//*[@id="content"]/h1/span[@property="v:itemreviewed"]/text()').extract())
            try:
                item["video_year"] = ''.join(response.xpath(
                    '//*[@id="content"]/h1/span[@class="year"]/text()'
                ).extract()).replace("(", "").replace(")", "")
            except Exception as e:
                logger.warning('year missing: %s', e)
                item['video_year'] = ''

            # Prefer the v:summary node; fall back to any span in #link-report.
            introduction = response.xpath(
                '//*[@id="link-report"]/span[@property="v:summary"]/text()').extract()
            if introduction:
                item["video_desc"] = ''.join(introduction).strip().replace("\r\n", " ")
            else:
                item["video_desc"] = ''.join(response.xpath(
                    '//*[@id="link-report"]/span/text()').extract()).strip().replace("\r\n", " ")

            item["video_director"] = "|".join(response.xpath(
                '//*[@id="info"]/span/span/a[@rel="v:directedBy"]/text()').extract())
            item["video_writer"] = "|".join(response.xpath(
                '//*[@id="info"]/span[2]/span[2]/a/text()').extract())
            item["video_actor"] = "|".join(
                response.xpath("//a[@rel='v:starring']/text()").extract())
            item["video_type"] = "|".join(response.xpath(
                '//*[@id="info"]/span[@property="v:genre"]/text()').extract())

            # fix: the original extracted the #info block three separate times;
            # extract once and reuse it for every regex field.
            info = "".join(response.xpath("//div[@id='info']").extract())

            area = AREA.search(info)
            item["video_area"] = (
                "|".join(part.strip() for part in area.group(1).split("/"))
                if area is not None else '')

            alias = ALIAS.search(info)
            item["video_alias"] = (
                "|".join(part.strip() for part in alias.group(1).split("/"))
                if alias is not None else "")

            language = LANGUAGE.search(info)
            episodes = EPISODES.search(info)
            length = LENGTH.search(info)

            if language is not None:
                item["video_language"] = "|".join(
                    part.strip() for part in language.group(1).split("/"))
            else:
                item['video_language'] = ''

            if length is not None:
                item["video_length"] = "|".join(
                    part.strip() for part in length.group(1).split("/"))
            else:
                item["video_length"] = "".join(response.xpath(
                    '//*[@id="info"]/span[@property="v:runtime"]/text()').extract())

            item['video_time'] = "/".join(response.xpath(
                '//*[@id="info"]/span[@property="v:initialReleaseDate"]/text()').extract())

            # An episode-count field only exists for TV series.
            if episodes is not None:
                item['video_bigtype'] = "电视剧"
                item["video_episodes"] = "|".join(
                    part.strip() for part in episodes.group(1).split("/"))
            else:
                item['video_bigtype'] = "电影"
                item['video_episodes'] = ''

            item['video_tags'] = "|".join(response.xpath(
                '//*[@class="tags"]/div[@class="tags-body"]/a/text()').extract())
            try:
                item['video_rating'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/strong/text()').extract())
                item['video_votes'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/div/div[@class="rating_sum"]/a/span/text()').extract())
            except Exception as error:
                item['video_rating'] = '0'
                item['video_votes'] = '0'
                # fix: `log(error)` called an undefined/module name; use logging.
                logger.exception(error)
            yield item
        except Exception as error:
            logger.exception(error)

    def parse_review(self, response):
        """Extract one review page into a VideoReviewItem."""
        try:
            item = VideoReviewItem()
            item['review_title'] = "".join(
                response.xpath('//*[@property="v:summary"]/text()').extract())
            content = "".join(response.xpath(
                '//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
            item['review_content'] = content.strip().replace("\n", " ")
            item['review_author'] = "".join(
                response.xpath('//*[@property = "v:reviewer"]/text()').extract())
            item['review_video'] = "".join(
                response.xpath('//*[@class="main-hd"]/a[2]/text()').extract())
            item['review_time'] = "".join(
                response.xpath('//*[@class="main-hd"]/p/text()').extract())
            item['review_url'] = response.url
            yield item
        except Exception as error:
            logger.exception(error)

新建musicspider.py
# -*- coding: utf-8 -*-
"""Crawl douban music detail pages and their reviews."""
import logging
import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from multi.items import MusicItem, MusicReviewItem

logger = logging.getLogger(__name__)


class MusicSpider(CrawlSpider):
    name = "music"
    allowed_domains = ['music.douban.com']
    start_urls = [
        'https://music.douban.com/tag/',
        'https://music.douban.com/tag/?view=cloud',
    ]
    rules = (
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))$")),
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))\?start=\d+\&type=T$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time\&start=\d+$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/$"), callback="parse_music", follow=True),
        Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
    )

    def parse_music(self, response):
        """Extract one record detail page into a MusicItem.

        The original enumerated i==0/1/2 cases by hand; the pattern is always
        "the matching node at position i+1", so the index is computed instead.
        """
        item = MusicItem()
        try:
            item['music_name'] = response.xpath(
                '//*[@id="wrapper"]/h1/span/text()').extract()[0]
            content = "".join(response.xpath('//*[@id="info"]').extract())
            info = response.xpath('//*[@id="info"]/span').extract()
            item['music_alias'] = ""
            item['music_singer'] = ""
            item['music_time'] = ""
            for i, row in enumerate(info):
                if "又名" in row:
                    # The alias text node sits right after the i-th <span>.
                    texts = response.xpath('//*[@id="info"]/text()').extract()
                    if i + 1 < len(texts):
                        item['music_alias'] = texts[i + 1].replace(
                            "\xa0", "").replace("\n", "").rstrip()
                if "表演者" in row:
                    item['music_singer'] = "|".join(response.xpath(
                        '//*[@id="info"]/span[%d]/span/a/text()' % (i + 1)).extract())
                if "发行时间" in row:
                    released = re.findall(
                        r"<span class=\"pl\">发行时间:</span>(.*?)<br>", content, re.S)
                    item['music_time'] = "".join(released).replace(
                        "\xa0", "").replace("\n", "").replace(" ", "")
            try:
                item['music_rating'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/strong/text()').extract())
                item['music_votes'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/div/div[@class="rating_sum"]/a/span/text()').extract())
            except Exception as error:
                item['music_rating'] = '0'
                item['music_votes'] = '0'
                logger.exception(error)
            item['music_tags'] = "|".join(response.xpath(
                '//*[@id="db-tags-section"]/div/a/text()').extract())
            item['music_url'] = response.url
            yield item
        except Exception as error:
            # fix: `log(error)` called the scrapy.log module object; use logging.
            logger.exception(error)

    def parse_review(self, response):
        """Extract one review page into a MusicReviewItem."""
        try:
            item = MusicReviewItem()
            item['review_title'] = "".join(
                response.xpath('//*[@property="v:summary"]/text()').extract())
            content = "".join(response.xpath(
                '//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
            item['review_content'] = content.strip().replace("\n", " ")
            item['review_author'] = "".join(
                response.xpath('//*[@property = "v:reviewer"]/text()').extract())
            item['review_music'] = "".join(
                response.xpath('//*[@class="main-hd"]/a[2]/text()').extract())
            item['review_time'] = "".join(
                response.xpath('//*[@class="main-hd"]/p/text()').extract())
            item['review_url'] = response.url
            yield item
        except Exception as error:
            logger.exception(error)

新建启动文件run.py
# -*- coding: utf-8 -*-
"""Launcher: run both spiders one after the other."""
import subprocess

# fix: cmdline.execute() never returns (it exits the process), so with the
# original code the second crawl would never start; run each spider in its
# own subprocess instead.
subprocess.call("scrapy crawl music".split())
subprocess.call("scrapy crawl video".split())
原文地址:http://blog.51cto.com/haoyonghui/2061715