Problem 0013: Write an image scraper in Python that downloads the photos of Japanese girls from this link :-)
Full code
Approach:
This could actually be done without Scrapy; regular-expression matching plus requests would be enough (a minimal sketch of that alternative appears just below). I wanted to practice Scrapy, though, so I used it for this task.
The task only requires scraping images from a single page, so there is no need to write any follow rules, which keeps things simple. Inspecting the image tags on the linked page shows that images posted in Baidu Tieba carry the BDE_Image class, which makes the job easy: use XPath to pull out every img tag with the BDE_Image class, and those are exactly the images we need. Put the required fields into an item and hand it to the pipeline.
In the pipeline I first check that the item's fields are complete, then check whether the image has already been downloaded. If it has, skip it; otherwise download it. For convenience, after saving the image I also store its metadata (name and file path) in MongoDB.
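As mentioned, the whole task could be handled without Scrapy. Here is a minimal sketch of that regex-plus-requests alternative, assuming the same page and the same BDE_Image class; the output directory name and the exact regex are illustrative, not taken from the original code:

# -*- coding: utf-8 -*-
import os
import re
import requests

url = 'http://tieba.baidu.com/p/2166231880'
html = requests.get(url).text
# Pull the src of every img tag carrying the BDE_Image class;
# this assumes class appears before src in the tag, adjust if needed
img_urls = re.findall(r'<img[^>]*class="BDE_Image"[^>]*src="([^"]+)"', html)
out_dir = 'meizi_simple'  # example output directory
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
for img_url in img_urls:
    # Reuse the URL path segments as a file name
    file_name = '_'.join(img_url.split('/')[3:])
    with open(os.path.join(out_dir, file_name), 'wb') as f:
        f.write(requests.get(img_url).content)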
Steps:
Create a Scrapy project named baidutieba: scrapy startproject baidutieba
Enter the project directory: cd baidutieba
Generate a spider named meizi: scrapy genspider meizi baidu.com
Write the code (see the project layout sketched below, then the files themselves)
Run it: scrapy crawl meizi
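After the two generator commands, the project should have Scrapy's standard layout, roughly:

baidutieba/
    scrapy.cfg            # deploy configuration
    baidutieba/
        __init__.py
        items.py          # item definitions
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/
            __init__.py
            meizi.py      # the spider created by genspider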
Code:
spider:
meizi.py
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import Selector
from baidutieba.items import BaidutiebaItem
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 workaround so UTF-8 strings are handled cleanly


class MeiziSpider(CrawlSpider):
    name = "meizi"
    allowed_domains = ["baidu.com"]
    print "Starting to crawl the images"  # runs once, when the class is defined
    start_urls = (
        'http://tieba.baidu.com/p/2166231880',
    )

    # parse() receives the downloaded page
    def parse(self, response):
        # Select every img tag whose class is BDE_Image
        AllImg = Selector(response).xpath('//img[@class="BDE_Image"]')
        for img in AllImg:
            item = BaidutiebaItem()
            # The bdwater attribute serves as the image name
            item['Img_name'] = img.xpath('@bdwater').extract()[0]
            item['Img_url'] = img.xpath('@src').extract()[0]
            yield item
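For reference, the img tags this XPath matches look roughly like the following; the attribute values here are illustrative, not copied from the page. The bdwater attribute appears to hold tieba's watermark text, which is why it doubles as the image name:

<img class="BDE_Image" src="http://imgsrc.baidu.com/forum/pic/item/example.jpg" bdwater="example-watermark">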
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

import pymongo
import requests
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log


class ImageDownloadAndMongoDBPipeline(object):
    def __init__(self):
        # Open the MongoDB connection
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        valid = True
        # Drop the item if any field is missing
        for field in item:
            if not item.get(field):
                valid = False
                raise DropItem("Missing {0}!".format(field))
        if valid:
            # Build the target directory path
            dir_path = '%s/%s' % (settings['IMAGES_STORE'], spider.name)
            # Create the directory if it does not exist
            if not os.path.exists(dir_path):
                log.msg("Directory missing, creating it",
                        level=log.DEBUG, spider=spider)
                os.makedirs(dir_path)
            image_url = item['Img_url']
            # Build the file name from the URL path segments
            us = image_url.split('/')[3:]
            image_file_name = '_'.join(us)
            file_path = '%s/%s' % (dir_path, image_file_name)
            if not os.path.exists(file_path):
                # Not downloaded yet: fetch the image in 1 KB chunks
                with open(file_path, 'wb') as handle:
                    response = requests.get(image_url, stream=True)
                    for block in response.iter_content(1024):
                        if block:
                            handle.write(block)
                item['File_path'] = file_path
                log.msg("Image downloaded!",
                        level=log.DEBUG, spider=spider)
                # Record the image metadata in MongoDB
                self.collection.insert(dict(item))
                log.msg("Saved to database!",
                        level=log.DEBUG, spider=spider)
            else:
                log.msg("Image already downloaded, skipping",
                        level=log.DEBUG, spider=spider)
        return item


class ImageDownloadPipeline(object):
    def process_item(self, item, spider):
        print item
        return item
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class BaidutiebaItem(scrapy.Item):
    Img_name = scrapy.Field()   # image name (taken from the bdwater attribute)
    Img_url = scrapy.Field()    # image source URL
    File_path = scrapy.Field()  # local path after download
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for baidutieba project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'baidutieba'
SPIDER_MODULES = ['baidutieba.spiders']
NEWSPIDER_MODULE = 'baidutieba.spiders'
ITEM_PIPELINES = {'baidutieba.pipelines.ImageDownloadAndMongoDBPipeline': 1}
# Where downloaded images are stored
IMAGES_STORE = '/home/bill/Pictures'
# MongoDB settings
MONGODB_SERVER = "localhost"
MONGODB_PORT = 27017
MONGODB_DB = "meizidb"
MONGODB_COLLECTION = "meizi"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'baidutieba (+http://www.yourdomain.com)'
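Once the crawl finishes, a quick way to spot-check what landed in MongoDB is to query the collection with pymongo. A short sketch, reusing the connection settings from settings.py above:

# -*- coding: utf-8 -*-
import pymongo

connection = pymongo.MongoClient("localhost", 27017)
collection = connection["meizidb"]["meizi"]
# List the name and local path of every recorded image
for doc in collection.find():
    print doc['Img_name'], doc['File_path']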
The crawl in progress (screenshot omitted)
The database contents (screenshot omitted)
The scraped images (screenshot omitted)