1. The Requests Library
Reference: http://www.python-requests.org/en/master/user/quickstart/#make-a-request
Requests is a very practical Python HTTP client library, used all the time when writing crawlers and testing how a server responds. It fully covers the needs of today's web.
It is normally installed with pip install requests.
In [1]: import requests

In [2]: response = requests.get('https://api.github.com/events')

In [3]: print(response)
<Response [200]>

In [4]: response = requests.post('http://httpbin.org/post', data={'key1': 'values1'})   # used when submitting a form

In [5]: print(response)
<Response [200]>

In [7]: response = requests.put('http://httpbin.org/put', data={'key1': 'values1'})

In [8]: print(response)
<Response [200]>

In [10]: response = requests.delete('http://httpbin.org/delete')

In [11]: print(response)
<Response [200]>

In [13]: response = requests.head('http://httpbin.org/get')

In [14]: print(response)
<Response [200]>

In [15]: response = requests.options('http://httpbin.org/get')

In [16]: print(response)
<Response [200]>

In [17]: payload = {'key1': 'value1', 'key2': 'value2'}

In [18]: response = requests.get('http://httpbin.org/get', params=payload)   # send a GET request carrying query parameters

In [19]: print(response)
<Response [200]>

In [20]: print(response.text)
{
  "args": {
    "key1": "value1",
    "key2": "value2"
  },
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "close",
    "Host": "httpbin.org",
    "User-Agent": "python-requests/2.18.4"
  },
  "origin": "103.215.2.233",
  "url": "http://httpbin.org/get?key1=value1&key2=value2"
}

In [22]: print(response.url)
http://httpbin.org/get?key1=value1&key2=value2

In [23]: payload = {'key1': 'value1', 'key2': ['value2', 'value3']}

In [24]: response = requests.get('http://httpbin.org/get', params=payload)

In [25]: print(response.url)
http://httpbin.org/get?key1=value1&key2=value2&key2=value3

In [27]: response = requests.get('http://api.github.com/events')

In [28]: response.encoding            # character encoding of the response
Out[28]: 'utf-8'

In [29]: print(response.text)         # body as text
[{"id":"6850814749","type":"CreateEvent","actor":{"id":679017,"login":......

In [30]: print(response.content)      # body as bytes
b'[{"id":"6850814749","type":"CreateEvent","actor":{"id":679017,"login":".....

In [34]: response.json()              # body parsed as JSON

In [36]: response.status_code         # HTTP status code
Out[36]: 200

In [38]: headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    ...:            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    ...:            'Accept-Encoding': 'gzip, deflate, br',
    ...:            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    ...:            'Connection': 'keep-alive'}

In [39]: response = requests.get('https://api.github.com/events', headers=headers)

In [40]: print(response.headers)
{'Server': 'GitHub.com', 'Date': 'Tue, 14 Nov 2017 06:10:31 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Status': '200 OK', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '58', 'X-RateLimit-Reset': '1510642339', 'Cache-Control': 'public, max-age=60, s-maxage=60', 'Vary': 'Accept', 'ETag': 'W/"34b51a08c5a8f4fa2400dd5c0d89221b"', 'Last-Modified': 'Tue, 14 Nov 2017 06:10:31 GMT', 'X-Poll-Interval': '60', 'X-GitHub-Media-Type': 'unknown, github.v3', 'Link': '<https://api.github.com/events?page=2>; rel="next", <https://api.github.com/events?page=10>; rel="last"', 'Access-Control-Expose-Headers': 'ETag, Link, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval', 'Access-Control-Allow-Origin': '*', 'Content-Security-Policy': "default-src 'none'", 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-XSS-Protection': '1; mode=block', 'X-Runtime-rack': '0.104190', 'Content-Encoding': 'gzip', 'X-GitHub-Request-Id': 'D528:C0F5:6BAAA:E4CB6:5A0A88D6'}

In [43]: print(response.headers['Content-Type'])
application/json; charset=utf-8

In [44]: print(response.headers.get('Content-Type'))
application/json; charset=utf-8

In [45]: url = 'http://www.baidu.com'

In [46]: response = requests.get(url, headers=headers)   # Baidu returns cookies; some sites set none

In [47]: print(response.cookies)                         # the whole cookie jar
<RequestsCookieJar[<Cookie H_PS_PSSID=1425_21088_24880 for .baidu.com/>, <Cookie BDSVRTM=0 for www.baidu.com/>, <Cookie BD_HOME=0 for www.baidu.com/>]>

In [48]: for k, v in response.cookies.get_dict().items():   # iterate over the cookies
    ...:     print(k, v)
    ...:
H_PS_PSSID 1425_21088_24880
BDSVRTM 0
BD_HOME 0

In [49]: cookies = {'c1': 'v1', 'c2': 'v2'}

In [50]: response = requests.get('http://httpbin.org/cookies', cookies=cookies)   # send a request carrying cookies

In [52]: print(response.text)
{
  "cookies": {
    "c1": "v1",
    "c2": "v2"
  }
}

In [53]: jar = requests.cookies.RequestsCookieJar()

In [54]: jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
Out[54]: Cookie(version=0, name='tasty_cookie', value='yum', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/cookies', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [55]: jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
Out[55]: Cookie(version=0, name='gross_cookie', value='blech', port=None, port_specified=False, domain='httpbin.org', domain_specified=True, domain_initial_dot=False, path='/elsewhere', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [56]: url = 'http://httpbin.org/cookies'

In [57]: response = requests.get(url, cookies=jar)

In [58]: print(response.text)
{
  "cookies": {
    "tasty_cookie": "yum"
  }
}
Cookies are returned in a RequestsCookieJar, which acts like a dict but also offers a more complete interface, suitable for use over multiple domains or paths. Cookie jars can also be passed in to requests
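A minimal sketch of that dict-like behaviour, reusing the jar built above (attaching the jar to a Session at the end is an extra illustration, not part of the original example):

print(jar.get('tasty_cookie'))            # 'yum' -- dict-style lookup on the jar
print(dict(jar))                          # {'tasty_cookie': 'yum', 'gross_cookie': 'blech'}
s = requests.Session()                    # a jar can also be attached to a Session,
s.cookies = jar                           # so every request made through it carries the cookies
print(s.get('http://httpbin.org/cookies').text)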
In [62]: url = 'http://github.com'

In [64]: response = requests.get(url, allow_redirects=True)

In [65]: print(response.url)
https://github.com/

In [66]: response.history
Out[66]: [<Response [301]>]

In [69]: url = 'http://httpbin.org/post'

In [70]: files = {'file': open('test.txt', 'rb')}

In [71]: response = requests.post(url, files=files)   # upload a file with a POST request

In [72]: response.text
Out[72]: '...contents of the file...'

In [73]: response = requests.get('https://github.com', timeout=5)   # request timeout
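The last line above only sets a 5-second limit; a small sketch of actually handling the error Requests raises when that limit is exceeded (the URL and timeout value are just illustrative):

try:
    response = requests.get('https://github.com', timeout=5)    # give up if the server is silent for 5 seconds
    response.raise_for_status()                                  # turn 4xx/5xx responses into exceptions
except requests.exceptions.Timeout:
    print('request timed out')
except requests.exceptions.RequestException as e:                # base class of all Requests errors
    print('request failed:', e)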
import json
import requests
from io import BytesIO
from PIL import Image
#1 Images: handling an image response
r = requests.get('http://img.jrjimg.cn/2013/11/20131105065502114.jpg')
image = Image.open(BytesIO(r.content))   # build an image from the binary content of the response
image.save('mm.jpg')
#2 JSON: handling a JSON response
r = requests.get('https://github.com/timeline.json')
print(type(r.json()))
print(r.json())
print(r.text)
#3 Raw data: streaming the raw response body to a file
r = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1508166336374&di=ef1073a52a7582f29ffa27c47e95e74e&imgtype=0&src=http%3A%2F%2Fp3.gexing.com%2FG1%2FM00%2F3F%2FDD%2FrBACE1MaezngiEoIAADSr3bccSw151.jpg')
with open('mm2.jpg', 'wb+') as f:
    for chunk in r.iter_content(1024):
        f.write(chunk)
#4 Forms: submitting form data
form = {'username': 'user', 'password': 'pwd'}
r = requests.post('http://httpbin.org/post', data=form)               # sent form-encoded
print(r.text)
r = requests.post('http://httpbin.org/post', data=json.dumps(form))   # sent as a raw JSON string in the body
print(r.text)
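In the first call the dict is form-encoded and httpbin echoes it back under "form"; in the second it is sent as a raw JSON string and shows up under "data". Requests also accepts a json= keyword that serializes the dict for you; a small sketch using the same form dict:

r = requests.post('http://httpbin.org/post', json=form)   # Content-Type: application/json is set automatically
print(r.json()['json'])                                    # httpbin echoes the parsed body: {'username': 'user', 'password': 'pwd'}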
2. Scraping the Douban Top 250 movie list and ratings with Requests
import requests
from lxml import etree

sess = requests.Session()
for id in range(0, 250, 25):
    url = 'https://movie.douban.com/top250/?start=' + str(id)
    r = sess.get(url)
    r.encoding = 'utf-8'
    # fname = "movie" + str(id) + ".txt"
    # with open(fname, "wb+") as f:
    #     f.write(r.content)
    root = etree.HTML(r.content)
    items = root.xpath('//ol/li/div[@class="item"]')
    # print(len(items))
    for item in items:
        title = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        name = title[0].encode('gb2312', 'ignore').decode('gb2312')
        # rank = item.xpath('./div[@class="pic"]/em/text()')[0]
        rating = item.xpath('.//div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
        print(name, rating)
3. BeautifulSoup
The BeautifulSoup module takes an HTML or XML string and parses it into a tree, after which its methods can be used to locate specific elements quickly, which makes searching HTML or XML documents simple. Beautiful Soup supports the HTML parser in the Python standard library as well as several third-party parsers; if no third-party parser is installed, the built-in one is used. The lxml parser is more powerful and faster, so installing it is recommended.
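As a small illustration of choosing a parser explicitly (the snippet and variable names below are made up for the example; lxml has to be installed separately with pip install lxml):

from bs4 import BeautifulSoup

html = '<p class="title"><b>hello</b></p>'
soup_std = BeautifulSoup(html, 'html.parser')    # standard-library parser, no extra dependency
soup_lxml = BeautifulSoup(html, 'lxml')          # third-party lxml parser, faster and more lenient
print(soup_std.b.string)    # hello
print(soup_lxml.b.string)   # hello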
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('test.html'), 'lxml')   # this form is handy for parsing a local HTML file
print(soup.prettify())                            # pretty-print the parse tree
#1 Tag: working with tags
print(type(soup.title))
print(soup.title)
print(soup.title.name)
#2 String
print(type(soup.title.string))
print(soup.title.string)
#3 Comment
print(type(soup.a.string))
print(soup.a.string)
for item in soup.body.contents:
    print(item.name)
#4 CSS query
print(soup.select('.sister'))
print(soup.select('#link1'))
print(soup.select('head > title'))
a_s = soup.select('a')
for a in a_s:
    print(a)
Example:
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, features="lxml")
tag1 = soup.find(name='a')        # find the first <a> tag
tag2 = soup.find_all(name='a')    # find all <a> tags
tag3 = soup.select('#link2')      # find the element with id="link2"
print(tag1.name)              # prints: a
print(tag1.attrs)             # prints the attribute dict: {'class': ['sister0'], 'id': 'link1'}
tag1.attrs['id'] = 'link01'
print(tag1.attrs)             # prints: {'class': ['sister0'], 'id': 'link01'}
print(tag1.has_attr('id'))    # prints: True
print(tag1.get_text('id'))    # prints: Elsidfidie
tag1.name = 'soup'            # set the tag name
print(tag2)                   # prints: [<a class="sister0" id="link1">Els<span>f</span>ie</a>, ......]
print(tag2[0].name)           # prints: soup
# decode converts the tag to a string (including the tag itself); decode_contents converts only its contents (without the tag)
print(tag2[1])                    # prints: <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
print(type(tag2[1]))              # prints: <class 'bs4.element.Tag'>
print(tag2[1].decode())           # prints: <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
print(type(tag2[1].decode()))     # prints: <class 'str'>
# encode converts the tag to bytes (including the tag itself); encode_contents converts only its contents (without the tag)
print(tag2[1].encode())           # prints: b'<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>'
print(type(tag2[1].encode()))     # prints: <class 'bytes'>
print(tag2[1].get_text())         # prints: Lacie
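The _contents variants mentioned in the comments above behave the same way but leave out the enclosing tag itself; a short sketch using the same tag2[1]:

print(tag2[1].decode_contents())    # prints: Lacie    (a str, without the surrounding <a> tag)
print(tag2[1].encode_contents())    # prints: b'Lacie' (bytes, without the surrounding <a> tag)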
body = soup.find(name='body')     # direct children of the tag
childs = body.children
print(childs)                     # prints: <list_iterator object at 0x10349b9e8>
for tag in childs:
    print(tag)
body = soup.find(name='body')     # all descendants, recursively
descs = body.descendants
print(descs)                      # prints: <generator object descendants at 0x106327360>
for des in descs:
    print(des)
body = soup.find(name='body')
body.clear()                      # empty out everything inside the tag, keeping the tag itself
print(soup)
body = soup.find(name='body')
body.decompose()                  # recursively remove the tag and everything inside it
print(soup)
body = soup.find(name='body')
d = body.extract()                # recursively remove the tag and its contents, returning what was removed
print(soup)
print(d)
body = soup.find(name='body')
index = body.index(body.find('div'))   # position of a child tag within its parent; prints 1
print(index)
br = soup.find(name='br')
test = br.is_empty_element        # True for void elements such as 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
print(test)                       # prints: True
span = soup.find('span')
print(span)                       # prints: <span>f</span>
print(span.string)                # prints: f
span.string = 'yeecall.com'       # set the string
print(span.string)                # prints: yeecall.com
body = soup.find(name='body')
texts = body.stripped_strings     # recursively yield the text of all nested tags, whitespace stripped
print(texts)                      # prints: <generator object stripped_strings at 0x107311360>
for text in texts:
    print(text)
# Examples of CSS selectors used with select()
soup.select("title")
soup.select("p:nth-of-type(3)")
soup.select("body a")
soup.select("html head title")
tag = soup.select("span,a")
soup.select("head > title")
soup.select("p > a")
soup.select("p > a:nth-of-type(2)")
soup.select("p > #link1")
soup.select("body > a")
soup.select("#link1 ~ .sister")
soup.select("#link1 + .sister")
soup.select(".sister")
soup.select("[class~=sister]")
soup.select("#link1")
soup.select("a#link2")
soup.select('a[href]')
soup.select('a[href="http://example.com/elsie"]')
soup.select('a[href^="http://example.com/"]')
soup.select('a[href$="tillie"]')
soup.select('a[href*=".com/el"]')
4. Logging in to Douban with Requests and BeautifulSoup
import requests
import html5lib
import re
from bs4 import BeautifulSoup

sess = requests.Session()
url_login = 'https://accounts.douban.com/login'
formdata = {
    'redir': 'https://www.douban.com',
    'source': 'index_nav',
    'form_email': '******@*****.com',
    'form_password': '*********',
    'login': u'登录'
}
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}

r = sess.post(url_login, data=formdata, headers=headers)
content = r.text
# print(content)
soup = BeautifulSoup(content, 'html5lib')
captcha = soup.find('img', id='captcha_image')   # a captcha image is only present on some login attempts
if captcha:
    print(captcha)
    captcha_url = captcha['src']
    # re_captcha_id = r'id="(.*?)"&'
    # captcha_id = re.findall(re_captcha_id, captcha)
    captcha_id = re.findall(r'(id=)(.*)(&)', captcha_url)
    captcha_id = captcha_id[0][1]
    print(captcha_url)
    print(captcha_id)
    captcha_text = input('Please input the captcha:')
    formdata['captcha-solution'] = captcha_text
    formdata['captcha-id'] = captcha_id
    print(formdata)
    r = sess.post(url_login, data=formdata, headers=headers)
# print(r.text)
with open('contacts.txt', 'w+', encoding='utf-8') as f:
    f.write(r.text)
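The script above only prints captcha_url, so the captcha still has to be viewed somehow before it can be typed in. A hedged sketch of saving it locally through the same session (the file name captcha.jpg is arbitrary); it would go inside the if captcha: branch, before the input() call:

    img = sess.get(captcha_url, headers=headers)   # download the captcha image with the same session cookies
    with open('captcha.jpg', 'wb') as f:
        f.write(img.content)                       # open this file to read the captcha characters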