#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:Aiker Zhao
@file:douban3.py
@time:上午10:34
"""
import json
import os
import re
from multiprocessing import Pool
import requests
from requests.exceptions import RequestException
dir = ‘z:\\douban\\‘
def get_web(url):
try:
rq = requests.get(url)
if rq.status_code == 200:
return rq.text
return None
except RequestException:
return None
def parse_web(html):
pattern = re.compile(‘<li\sclass="">.*?cover".*?href="(.*?)"\stitle="(.*?)".*?img\***c="(.*?)"‘ +
‘.*?class="author">(.*?)<.*?year">(.*?)<.*?publisher">(.*?)<.*?</li>‘, re.S)
results = re.findall(pattern, html)
# print(results)
for i in results:
# url, title, img, author, yeah, publisher = i
# author = re.sub(‘\s‘, ‘‘, author)
# yeah = re.sub(‘\s‘, ‘‘, yeah)
# publisher = re.sub(‘\s‘, ‘‘, publisher)
# print(url, title, img, author, yeah, publisher)
yield {
‘title‘: i[1],
‘url‘: i[0],
‘img‘: i[2],
‘author‘: i[3].strip(),
‘yeah‘: i[4].strip(),
‘publisher‘: i[5].strip()
}
# print(url, title, img, author, yeah, publisher)
# return img,title
def save_image(title, img):
images = dir + title + ‘.jpg‘
if os.path.exists(images):
pass
else:
with open(images, ‘wb‘) as f:
f.write(requests.get(img).content)
f.close()
def save_info(content):
info = dir + ‘info.txt‘
with open(info, ‘a‘, encoding=‘utf-8‘) as fd: #防止出现ascII
fd.write(json.dumps(content, ensure_ascii=False) + ‘\n‘) ##防止出现ascII
fd.close()
def main():
url = ‘https://book.douban.com/‘
html = get_web(url)
# parse_web(html)
for i in parse_web(html):
print(i)
save_info(i)
save_image(i.get(‘title‘), i.get(‘img‘))
if __name__ == ‘__main__‘:
main()
- 心得:
- 需要注意正则的匹配规则的准确度,否则会没有响应,或者无限超时
原文地址:https://blog.51cto.com/m51cto/2373119
时间: 2024-10-28 20:19:00