Finding the login POST address
- Look in the form tag for the URL in its `action` attribute
  - The POST data is a dict whose keys are the `name` attributes of the `input` tags and whose values are the real username and password; the POST URL is the URL from `action` (see the sketch after this list)
- Capture packets and look for the login URL
  - Check the "Preserve log" option so the URL is not lost when the page redirects
  - Find the POST data and work out its parameters
    - Parameters that never change can be used as-is, e.g. when the password is not dynamically encrypted
    - Parameters that do change
      - may appear in the current response
      - may be generated by JS
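A minimal sketch of posting a login form with `requests`, assuming a hypothetical form whose `action` is `/login` and whose inputs are named `username` and `password`; the real URL and field names have to be read from the page or from the captured request:

```
import requests

# hypothetical login form:
#   <form action="/login" method="post">
#     <input name="username"> <input name="password">
#   </form>
login_url = "http://example.com/login"   # the action URL (hypothetical)
post_data = {
    "username": "my_user",       # key = the input tag's name attribute
    "password": "my_password",   # value = the real credential
}
headers = {"User-Agent": "Mozilla/5.0"}

# a Session keeps the cookies set at login for the requests that follow
session = requests.Session()
response = session.post(login_url, data=post_data, headers=headers)
print(response.status_code)
```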
Locating the JS you need
- Select the button that triggers the JS event, click Event Listeners, and jump to the location of the JS
- Use Chrome's "Search all files" to search for a keyword from the URL
- Add breakpoints to watch what the JS does, then reproduce the same operations in Python (a hedged sketch follows)
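For the last point, a hedged sketch of redoing a JS step in Python: the assumption here is purely illustrative, namely that the page signs the request by MD5-hashing the query plus a timestamp, which is the kind of logic you would read off at the breakpoint:

```
import hashlib
import time

def make_sign(query):
    # hypothetical signing scheme: md5(query + millisecond timestamp)
    ts = str(int(time.time() * 1000))
    sign = hashlib.md5((query + ts).encode()).hexdigest()
    return ts, sign

ts, sign = make_sign("hello")
print(ts, sign)   # these would go into the POST data alongside the query
```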
Installing third-party modules
- `pip install retrying`
- Download the source, extract it, enter the extracted directory and run `python setup.py install`
- A `***.whl` file is installed with `pip install ***.whl`
Notes on using JSON
- Strings in JSON are wrapped in double quotes
- If they are not double quotes:
  - `eval`: can convert simple strings into Python types
  - `replace`: replace the single quotes with double quotes
- A file that several JSON strings have been written to is no longer a single JSON string and cannot be loaded in one go
  - Write one JSON string per line and read the file line by line (see the sketch after this list)
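A minimal sketch of the one-JSON-string-per-line approach; the file name and sample data are made up:

```
import json

items = [{"name": "a", "score": 1}, {"name": "b", "score": 2}]

# write: one JSON string per line
with open("items.txt", "w", encoding="utf-8") as f:
    for item in items:
        f.write(json.dumps(item, ensure_ascii=False))
        f.write("\n")

# read: parse the file line by line
with open("items.txt", "r", encoding="utf-8") as f:
    loaded = [json.loads(line) for line in f if line.strip()]
print(loaded)
```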
```
# coding=utf-8
import requests
import json
import sys


class BaiduFanyi:
    def __init__(self, trans_str):
        self.trans_str = trans_str
        self.lang_detect_url = "http://fanyi.baidu.com/langdetect"
        self.trans_url = "http://fanyi.baidu.com/basetrans"
        self.headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}

    def parse_url(self, url, data):  # send the POST request and get the response
        response = requests.post(url, data=data, headers=self.headers)
        return json.loads(response.content.decode())

    def get_ret(self, dict_response):  # extract the translation result
        ret = dict_response["trans"][0]["dst"]
        print("result is :", ret)

    def run(self):  # main logic
        # 1. detect the language type
        # 1.1 prepare the POST url and post_data
        lang_detect_data = {"query": self.trans_str}
        # 1.2 send the POST request and get the response
        lang = self.parse_url(self.lang_detect_url, lang_detect_data)["lan"]
        # 1.3 extract the language type
        # 2. prepare the POST data
        trans_data = {"query": self.trans_str, "from": "zh", "to": "en"} if lang == "zh" \
            else {"query": self.trans_str, "from": "en", "to": "zh"}
        # 3. send the request and get the response
        dict_response = self.parse_url(self.trans_url, trans_data)
        # 4. extract the translation result
        self.get_ret(dict_response)


if __name__ == "__main__":
    trans_str = sys.argv[1]
    baidu_fanyi = BaiduFanyi(trans_str)
    baidu_fanyi.run()
```
```
# coding=utf-8
import json
import requests
from parse_url import parse_url
from pprint import pprint

url = "https://m.douban.com/rexxar/api/v2/subject_collection/movie_showing/items?start=0&count=18&loc_id=108288"
html_str = parse_url(url)

# json.loads converts a JSON string into a Python type
ret1 = json.loads(html_str)
# pprint(ret1)
# print(type(ret1))

# json.dumps converts a Python type into a JSON string
with open("douban.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(ret1, ensure_ascii=False, indent=4))
    # f.write(str(ret1))

# with open("douban.json", "r", encoding="utf-8") as f:
#     ret2 = f.read()
#     ret3 = json.loads(ret2)
#     print(ret3)
#     print(type(ret3))

# json.load extracts data from a file-like object
with open("douban.json", "r", encoding="utf-8") as f:
    ret4 = json.load(f)
    print(ret4)
    print(type(ret4))

# json.dump writes a Python type into a file-like object
with open("douban1.json", "w", encoding="utf-8") as f:
    json.dump(ret1, f, ensure_ascii=False, indent=2)
```
```
# coding=utf-8
import re
import json
from parse_url import parse_url

url = "http://36kr.com/"
html_str = parse_url(url)

# the JSON data is embedded in a <script> tag; capture it with a regex
ret = re.findall("<script>var props=(.*?),locationnal=", html_str)[0]

with open("36kr.json", "w", encoding="utf-8") as f:
    f.write(ret)

ret = json.loads(ret)
print(ret)
```
```
import requests
import sys
import json


class Automatic():
    def __init__(self, translade_word):
        self.translade_word = translade_word
        self.langdetect_headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36"}
        self.langdetect_parpams = {"query": translade_word}
        self.langdetect_url = "http://fanyi.baidu.com/langdetect"
        self.translated_url = "http://fanyi.baidu.com/basetrans"

    def langdetect(self):
        response = requests.post(self.langdetect_url, headers=self.langdetect_headers, data=self.langdetect_parpams)
        return json.loads(response.content.decode())["lan"]

    def get_data_language(self, language_word):
        # if "zh" == language_word:
        #     translade_data = {"query": self.translade_word, "from": "zh", "to": "en"}
        # else:
        #     translade_data = {"query": self.translade_word, "from": language_word, "to": "zh"}
        return {"query": self.translade_word, "from": "zh", "to": "en"} if "zh" == language_word \
            else {"query": self.translade_word, "from": language_word, "to": "zh"}

    def translade(self, translade_data):
        response = requests.post(self.translated_url, data=translade_data, headers=self.langdetect_headers)
        response_data = json.loads(response.text)
        # print("1111111111", response_data)
        return response_data

    def get_ret(self, response_data):
        data = response_data["trans"][0]["dst"]
        print("{} translated result: {}".format(self.translade_word, data))

    def run(self):
        language_word = self.langdetect()
        translade_data = self.get_data_language(language_word)
        response_data = self.translade(translade_data)
        self.get_ret(response_data)


if __name__ == "__main__":
    translade_word = sys.argv[1]
    automatic = Automatic(translade_word)
    automatic.run()
```
```
# coding=utf-8
import requests
from retrying import retry

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}


@retry(stop_max_attempt_number=3)
def _parse_url(url, method, data, proxies):
    print("*" * 20)
    if method == "POST":
        response = requests.post(url, data=data, headers=headers, proxies=proxies)
    else:
        response = requests.get(url, headers=headers, timeout=3, proxies=proxies)
    assert response.status_code == 200
    return response.content.decode()


def parse_url(url, method="GET", data=None, proxies={}):
    try:
        html_str = _parse_url(url, method, data, proxies)
    except:
        html_str = None
    return html_str


if __name__ == "__main__":
    url = "http://www.baidu.com"
    print(parse_url(url))
```
Notes on using regular expressions
- `re.findall("a(.*?)b", "str")` returns what the parentheses captured; the text before and after the parentheses only locates and filters
- Raw strings (`r"..."`): when the string to match contains backslashes, the `r` prefix stops them from being interpreted as escapes
- By default `.` does not match `\n`
- `\s` matches whitespace characters, not only spaces but also `\t`, `\r` and `\n` (a short sketch of these points follows)
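A short sketch of these points; the sample strings are made up:

```
import re

html = "<div>\nhello\n</div>"
# "." does not match "\n" by default, so this finds nothing
print(re.findall(r"<div>(.*?)</div>", html))        # []
# re.S lets "." match "\n" as well
print(re.findall(r"<div>(.*?)</div>", html, re.S))  # ['\nhello\n']

# raw string: the r prefix keeps the backslashes literal in the pattern
path = r"C:\new\table"
print(re.findall(r"\\new", path))                   # ['\\new']

# \s matches spaces, tabs and newlines
print(re.findall(r"a\sb", "a b\ta\tb\na\nb"))       # ['a b', 'a\tb', 'a\nb']
```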
Key points for learning xpath
- xpath helper and Chrome's "Copy XPath" both work off the Elements panel, but the crawler gets the raw response of the URL, which is often different from Elements
- Getting text (see the sketch after this list)
  - `a/text()` gets the text directly under `a`
  - `a//text()` gets the text of every tag under `a`
  - `//a[text()='下一页']` selects the `a` tags whose text is exactly 下一页 ("next page")
- The `@` symbol
  - `a/@href`
  - `//ul[@id="detail-list"]`
- `//`
  - At the start of an xpath it means selecting from anywhere in the current HTML
  - `li//a` matches `a` tags at any depth under `li`
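A short sketch of `text()`, `//text()` and `@href`; the HTML snippet is made up, and the fuller lxml example further down shows the grouping approach:

```
from lxml import etree

html = etree.HTML('<div><a href="next.html">click <span>me</span></a></div>')

print(html.xpath("//a/text()"))    # ['click ']       - text directly under a
print(html.xpath("//a//text()"))   # ['click ', 'me'] - text of every tag under a
print(html.xpath("//a/@href"))     # ['next.html']
```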
Notes on using lxml
- lxml can repair broken HTML, but it may repair it incorrectly
- Use `etree.tostring` to see what the repaired HTML looks like, and write your xpath against that repaired string
- lxml accepts both bytes and str strings
- Approach for extracting page data
  - Group first: get a list of the elements that wrap each group
  - Iterate over the groups and extract the fields from each one, so the fields of a record cannot get mismatched
```
# coding=utf-8
import requests
import json


class DoubanSpider:
    def __init__(self):
        self.url_temp_list = [
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
                "country": "US"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?start={}&count=18&loc_id=108288",
                "country": "UK"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
                "country": "CN"
            }
        ]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}

    def parse_url(self, url):  # send the request and get the response
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, json_str):  # extract the data
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        total = dict_ret["total"]
        return content_list, total

    def save_content_list(self, content_list, country):  # save
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["country"] = country
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")  # write a newline so every record takes one line
        print("saved successfully")

    def run(self):  # main logic
        for url_temp in self.url_temp_list:
            num = 0
            total = 100  # assume there is at least a first page
            while num < total + 18:
                # 1. start_url
                url = url_temp["url_temp"].format(num)
                # 2. send the request and get the response
                json_str = self.parse_url(url)
                # 3. extract the data
                content_list, total = self.get_content_list(json_str)
                # 4. save
                self.save_content_list(content_list, url_temp["country"])
                # if len(content_list) < 18:
                #     break
                # 5. build the next page's url and loop
                num += 18


if __name__ == "__main__":
    douban_spider = DoubanSpider()
    douban_spider.run()
```
```
# coding=utf-8
import requests
import re
import json


class Neihan:
    def __init__(self):
        self.start_url = "http://neihanshequ.com/"
        self.next_url_temp = "http://neihanshequ.com/joke/?is_json=1&app_name=neihanshequ_web&max_time={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

    def parse_url(self, url):  # send the request
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_first_page_content_list(self, html_str):  # extract the first page's data
        content_list = re.findall(r"<h1 class=\"title\">.*?<p>(.*?)</p>", html_str, re.S)
        max_time = re.findall("max_time: '(.*?)',", html_str)[0]
        return content_list, max_time

    def save_content_list(self, content_list):  # save
        with open("neihan.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("saved successfully")

    def get_content_list(self, json_str):  # extract the data from the JSON of page 2 onwards
        dict_ret = json.loads(json_str)
        data = dict_ret["data"]["data"]
        content_list = [i["group"]["content"] for i in data]
        max_time = dict_ret["data"]["max_time"]
        has_more = dict_ret["data"]["has_more"]
        return content_list, max_time, has_more

    def run(self):  # main logic
        # 1. start_url
        # 2. send the request and get the response
        html_str = self.parse_url(self.start_url)
        # 3. extract the data
        content_list, max_time = self.get_first_page_content_list(html_str)
        # 4. save
        self.save_content_list(content_list)
        has_more = True  # assume there is a second page
        while has_more:  # the site uses has_more to signal whether there is a next page
            # 5. build the next page's url
            next_url = self.next_url_temp.format(max_time)
            # 6. send the request and get the response
            json_str = self.parse_url(next_url)
            # 7. extract the data and the new max_time
            content_list, max_time, has_more = self.get_content_list(json_str)
            # 8. save
            self.save_content_list(content_list)
            # 9. repeat steps 5-8


if __name__ == "__main__":
    neihan = Neihan()
    neihan.run()
```
```
# coding=utf-8
from lxml import etree

# note: the last li is deliberately left unclosed; lxml will repair it
text = '''
<div>
    <ul>
        <li class="item-1"><a>first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
</div>
'''

html = etree.HTML(text)
print(html)
# inspect the string contained in the Element object
# print(etree.tostring(html).decode())

# get the href of the a under every li whose class is item-1
ret1 = html.xpath("//li[@class='item-1']/a/@href")
print(ret1)

# get the text of the a under every li whose class is item-1
ret2 = html.xpath("//li[@class='item-1']/a/text()")
print(ret2)

# each li is one news item: combine the url and the text into a dict
# (this pairing can get mismatched when some a tags have no href,
#  which is why the grouping approach below is preferred)
for href in ret1:
    item = {}
    item["href"] = href
    item["title"] = ret2[ret1.index(href)]
    print(item)

print("*" * 100)

# group by li tag first, then continue writing xpath relative to each group
ret3 = html.xpath("//li[@class='item-1']")
print(ret3)
for i in ret3:
    item = {}
    item["title"] = i.xpath("./a/text()")[0] if len(i.xpath("./a/text()")) > 0 else None
    item["href"] = i.xpath("./a/@href")[0] if len(i.xpath("./a/@href")) > 0 else None
    print(item)
```
```
# _*_ coding: utf-8 _*_
import json
import sys
import re
from parse_url import parse_url


class douban:
    def __init__(self, url):
        self.url = url  # url template with {} placeholders for start and count
        self.L_url = []
        self.start = 0
        self.html_str = ""
        self.ret = {}

    def get_total(self):  # request the first page to find out the total number of items
        html_str = parse_url(self.url.format(0, 18))
        # json.loads converts the JSON string into a Python type
        ret1 = json.loads(html_str)
        total = ret1["total"]
        return total

    def get_url(self, total):  # build the list of page urls, 50 items per page
        while self.start < total + 50:
            url = self.url.format(self.start, 50)
            self.L_url.append(url)
            self.start += 50

    def get_name(self):  # derive the output file name from the collection name in the url
        reg = r'https://m.douban.com/rexxar/api/v2/subject_collection/(.*?)/'
        name = re.findall(reg, self.url)
        return name[0] + ".json"

    def data(self, name):  # download every page and append it to the output file
        for url in self.L_url:
            self.html_str = parse_url(url)
            ret = json.loads(self.html_str)
            with open(name, "a", encoding="utf-8") as f:
                f.write(json.dumps(ret, ensure_ascii=False, indent=4))

    def run(self):
        total = self.get_total()
        self.get_url(total=total)
        name = self.get_name()
        self.data(name=name)


if __name__ == "__main__":
    url_dict = {
        "美国": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?os=android&start={}&count={}&loc_id=108288",
        "英国": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?os=android&start={}&count={}&loc_id=108288",
        "韩国": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_korean_drama_hot/items?os=android&start={}&count={}&loc_id=108288",
        "中国": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?os=android&start={}&count={}&loc_id=108288"
    }
    for url_name in sys.argv[1:]:
        url = url_dict[url_name]
        print(url)
        spider = douban(url)  # use a separate name so the class is not rebound in the loop
        spider.run()
```
Original article: https://www.cnblogs.com/MR-allen/p/10584063.html