import requests
import re
import uuid
from concurrent.futures import ThreadPoolExecutor
# Shared thread pool (at most 50 worker threads) that every task in this
# script is submitted to: detail-page fetches and video downloads.
pool = ThreadPoolExecutor(50)
# The three steps of a crawler
# 1. Send the request
def get_html(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response.

    Args:
        url: Address to request.
        timeout: Seconds ``requests`` waits for the connection and for each
            read before raising a timeout error. Without a timeout a stalled
            server would block the calling worker thread indefinitely.

    Returns:
        The ``requests`` response object. The HTTP status is not checked.
    """
    print(f'start: {url}...')
    return requests.get(url, timeout=timeout)
# 2. Parse the data
# Parse the home page and collect the ids of the video detail pages
def parse_index(response):
    """Extract the unique video ids linked from the home page.

    Every ``<a href="video_<id>"`` occurrence in the page contributes its
    ``<id>`` part (for example ``video_1637397`` yields ``"1637397"``).

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A list of id strings without duplicates, in order of first
        appearance in the page.
    """
    movie_id_list = re.findall(
        '<a href="video_(.*?)"',  # e.g. video_1637397
        response.text,
        re.S,
    )
    # dict.fromkeys removes duplicates while keeping first-seen order;
    # list(set(...)) returned the ids in an arbitrary, run-dependent order.
    return list(dict.fromkeys(movie_id_list))
# Parse a video detail page and extract the real video URL
def parse_detail(res):
    """Extract the video URL from a fetched detail page and queue its download.

    Intended to receive the future of a ``get_html`` task (for example as a
    done-callback).

    Args:
        res: A completed ``concurrent.futures.Future`` whose result is the
            detail-page response (an object with a ``text`` attribute).

    Returns:
        None. When the page contains ``srcUrl="..."``, a ``save_movie`` task
        for that URL is submitted to the shared pool. A cancelled or failed
        fetch, or a page without ``srcUrl``, is reported and skipped instead
        of raising.
    """
    if res.cancelled():
        return
    error = res.exception()
    if error is not None:
        # res.result() would re-raise here; report and skip this page instead.
        print(f'detail request failed: {error!r}')
        return
    response = res.result()

    # Indexing findall(...)[0] raised IndexError when the page had no srcUrl.
    match = re.search('srcUrl="(.*?)"', response.text, re.S)
    if match is None:
        print('no srcUrl found in detail page, skipping')
        return
    movie_url = match.group(1)
    print('是否到此处了')
    # Submit the download asynchronously so parsing is not blocked by it.
    pool.submit(save_movie, movie_url)
# 3. Save the data
def save_movie(movie_url):
    """Download ``movie_url`` and store it as ``<uuid4>.mp4``.

    The file is created relative to the current working directory under a
    freshly generated UUID name.

    Args:
        movie_url: Address of the video file to download.
    """
    print('start')
    movie_response = get_html(movie_url)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        # iter_content() defaults to chunk_size=1, which means one write call
        # per byte; use a sizeable chunk instead.
        for chunk in movie_response.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
    print('end...')
if __name__ == '__main__':
    import time
    from concurrent.futures import as_completed

    index_url = 'https://www.pearvideo.com/'
    response = get_html(index_url)

    # 1. Parse the home page and collect the id of every linked video.
    movie_id_list = parse_index(response)

    # 2. Fetch all detail pages concurrently, pausing briefly between
    #    submissions.
    detail_futures = []
    for movie_id in movie_id_list:
        detail_url = 'https://www.pearvideo.com/video_' + movie_id
        time.sleep(0.1)
        detail_futures.append(pool.submit(get_html, detail_url))

    # 3. Hand each finished fetch to parse_detail from the main thread.
    #    With add_done_callback the main thread could finish first; the
    #    callback's pool.submit(...) then failed with "cannot schedule new
    #    futures after interpreter shutdown" and the download was lost.
    for future in as_completed(detail_futures):
        try:
            parse_detail(future)
        except Exception as exc:  # one bad page must not stop the others
            print(f'skipping detail page: {exc!r}')

    # 4. Block until every queued download has finished.
    pool.shutdown(wait=True)
# Original article: https://www.cnblogs.com/chanyuli/p/12135616.html
# Timestamp: 2024-10-30 08:06:36