# Minimal urlopen() example: fetch a page and inspect the response object.
from urllib.request import urlopen
from http.client import HTTPResponse

# urlopen() returns an http.client.HTTPResponse; using it as a context
# manager guarantees the connection is closed even if a print fails.
# The timeout keeps the script from hanging forever on a dead host.
with urlopen('http://www.baidu.com', timeout=20) as response:
    print(type(response))
    print(1, response.status)
    print(2, response.reason)
    print(3, response.geturl())
    print(4, response.info())  # response headers
    print(5, response.read())

# urlopen() only accepts a URL and data; it cannot build a customized HTTP
# request by itself, which is what the Request class is for.
# Request's initializer builds a request object: an optional headers dict can
# be supplied, and the data argument decides whether it is a GET or a POST.
# add_header(key, value) can also be used to add one key/value pair to headers.
# Build a Request with a randomly chosen User-Agent header and open it.
from urllib.request import Request, urlopen
import random

url = 'http://www.bing.com'
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

req = Request(url)  # an instance of urllib.request.Request
req.add_header('User-agent', ua)
print('type_req', type(req))

# urlopen() returns an instance of http.client.HTTPResponse.
response = urlopen(req, timeout=20)
print(type(response))
with response:
    print(1, response.status, response.getcode(), response.reason)
    print(2, response.geturl())
    print(3, response.info())  # headers of the response
    print(4, response.read())

# Read the header back from the request; the lookup uses the same
# 'User-agent' spelling that str.capitalize() produces on the last line.
print(5, req.get_header('User-agent'))
print(6, 'user-agent'.capitalize())
# The urllib.parse module: URL-encoding and decoding of query strings.
from urllib import parse

u = {
    'url': 'http://www.baidu.com',
    'p_url': 'http://www.baidu.com',
}
x = parse.urlencode(u)  # dict -> percent-encoded "key=value&key=value" string
print(x)

u = parse.urlencode({'wd': '中国'})  # encode: non-ASCII text is percent-encoded
print(u)
url = "https://www.baidu.com/s?{}".format(u)
print(url)
print('中国'.encode('utf-8'))  # the raw UTF-8 bytes, for comparison with the %XX form

print(parse.unquote(u))  # decode
print(parse.unquote(url))

# Requirement for the next example: search Bing for a keyword and save the
# returned result to an HTML file.
# Search Bing for a keyword typed by the user and save the result page to 1.html.
from urllib.request import Request, urlopen
from urllib.parse import urlencode
import random

keyword = input("请输入关键字")
data = urlencode({'q': keyword})  # percent-encode the keyword for the query string
base_url = 'http://cn.bing.com/search'
url = '{}?{}'.format(base_url, data)
print(url)

ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)
req = Request(url, headers={'User-agent': ua})

# Both the HTTP response and the output file are closed when the block exits.
# The file is opened in binary mode because response.read() returns bytes.
with urlopen(req, timeout=20) as response, open('1.html', 'wb') as f:
    f.write(response.read())
print("success")
# POST method: submit form data with urlopen(req, data=...).
from urllib.request import Request, urlopen
from urllib.parse import urlencode
import simplejson
import random

ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

# The host was misspelled as "httonin.org"; httpbin.org is the intended host.
req = Request('http://httpbin.org/post')
req.add_header('User-agent', ua)

data = urlencode({'name': '张三,@=/&*', 'age': '6'})
print(data)

# Passing data= to urlopen() turns the request into a POST.
# res1 sends the body without URL-encoding; res2 sends the properly encoded
# form. Both responses are opened in one `with` so that res1, whose body is
# deliberately not printed, is still closed instead of leaking its socket.
with urlopen(req, data='name=张三,@=/&*,&age=6'.encode(), timeout=20) as res1, \
        urlopen(req, data=data.encode(), timeout=20) as res2:
    print(res2.read())
# Douban: https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=20&page_start=0
# Send the same parameters once as a POST body and once as a GET query string.
from urllib.request import Request, urlopen
from urllib.parse import urldefrag, urlencode
import random

ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

url = 'https://movie.douban.com/explore#!'
# Everything after '#' is a fragment, which is never sent to the server.
# Appending "?query" to the URL above would place the query inside the
# fragment, so the fragment is stripped before the URL is used.
base_url = urldefrag(url).url

data = urlencode({
    'type': 'movie',
    'tag': '热门',
    'sort': 'rank',
    'page_limit': 8,
    'page_start': 10,
})

# POST: the encoded parameters travel in the request body.
req = Request(base_url, data=data.encode())
req.add_header('User-agent', ua)
with urlopen(req, timeout=20) as res:
    print(req.get_method())
    print(1, res.read().decode())

# GET: the encoded parameters travel in the query string. A Request object is
# used here as well so the chosen User-Agent is sent on this call too.
get_req = Request('{}?{}'.format(base_url, data), headers={'User-agent': ua})
with urlopen(get_req, timeout=20) as res:
    print(get_req.get_method())
    print(2, res.read().decode())
# HTTPS request to a site whose certificate is not trusted by default.
from urllib.request import Request, urlopen
import ssl
import random

ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

request = Request('https://www.12306.cn/mormhweb/')
request.add_header('User-agent', ua)

# Accept untrusted certificates, built from public ssl APIs rather than the
# private ssl._create_unverified_context().
# check_hostname must be disabled before verify_mode can be set to CERT_NONE.
# SECURITY: this disables certificate verification entirely; use it only for
# demos against hosts you already trust.
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE

with urlopen(request, context=context, timeout=20) as res:
    print(request.get_method())
    print(res.geturl())
    print(res.read().decode())

# The standard-library urllib lacks some key features; third-party libraries
# provide them, for example connection-pool management.
# urllib3 example: issue a GET through a PoolManager.
import urllib3
import random

url = 'https://movie.douban.com'
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

# Using the PoolManager as a context manager releases it when the block exits.
with urllib3.PoolManager() as http:
    response = http.request('GET', url, headers={'User-agent': ua}, timeout=20.0)
    print(type(response))  # an instance of urllib3.response.HTTPResponse
    print(response.status, response.reason)
    print(response.headers)
    print(response.data)

# The requests library is built on top of urllib3 and offers a friendlier API.
# requests example: GET a page, inspect it, and save the body to movie.html.
import requests
import random

url = 'https://movie.douban.com'
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

# requests applies no timeout unless one is passed explicitly.
response = requests.request('GET', url, headers={'User-Agent': ua}, timeout=20)
with response:
    print(type(response))
    print(response.url)
    print(response.status_code)
    print(response.request.headers)  # request headers
    print(response.headers)  # response headers
    print(response.text)
    with open('movie.html', 'w', encoding='utf-8') as f:
        f.write(response.text)

# requests uses a Session object by default; a Session keeps session
# information across multiple interactions with the server:
# Using a requests Session directly, so several requests share one session.
import requests
import random

ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)
urls = ['https://www.baidu.com', 'https://www.baidu.com']

session = requests.Session()
print(type(session))
with session:
    # The same URL is requested twice so the headers and cookies printed for
    # the first and second request can be compared.
    for url in urls:
        # requests applies no timeout unless one is passed explicitly.
        response = session.get(url, headers={'User-agent': ua}, timeout=20)
        with response:
            print(type(response))  # an instance of requests.models.Response
            print(response.url)
            print(response.status_code)
            print('headers', response.request.headers)
            print('cookie', response.cookies)
            print(response.text[:20])
# Original article: https://www.cnblogs.com/qyan-blog/p/12153645.html
# Time: 2024-10-03 17:24:37