--首先,感谢黑板客老师做了这个爬虫闯关系列,让大家学习到不少知识。
第一关:将网页提示的数字加在网址后面
解题思路:
1、找到数字对应的html标签,用正则匹配标签内容。
2、将数字提取出来加到第一关网址后面,获取新的数字。
解题过程:
第二关:
解题思路:从0~30依次尝试登录。
第三关:在第二关的基础上加了两层保护
1、访问第三关时会跳转到一个登录页面,必须先登录(测试帐号:username:test;password:test123)。
2、登录时有一个CSRF参数。
解题思路:
1、先发一个GET请求获取登录页面,将服务器返回的csrftoken保存。
2、尝试用username:test;password:test123;csrftoken登录。登录成功后保存服务器返回的新csrftoken。
3、尝试用username:test;password:(0~30);csrftoken登录。
解题过程:
#!/usr/bin/python # codeing:utf-8 # Be hxs import re import time from threading import Thread try: import requests except ImportError: print "import requests error" exit(0) def print_run_time(func): """ 装饰器函数,输出运行时间 """ def wrapper(self, *args, **kw): local_time = time.time() # print args),kw func(self) print ‘run time is {:.2f}:‘.format(time.time() - local_time) return wrapper class hbk_crawler(object): """黑板客爬虫闯关""" def __init__(self): pass #super(hbk_exxx, self).__init__() def login(self, level): """登录函数 input:第几关""" self.url = ‘http://www.heibanke.com/lesson/crawler_ex‘ + level self.login_url = ‘http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex‘ + level self.s = requests.session() print u"正在登录第{}关....".format(int(level)+1) try: self.csrftoken = self.s.get(self.login_url).cookies[‘csrftoken‘] except: print u"网络连接错误,请重试..." exit() self.payload = {‘username‘: ‘test‘, ‘password‘: ‘test123‘, ‘csrfmiddlewaretoken‘: self.csrftoken} self.payload[‘csrfmiddlewaretoken‘] = self.s.post( self.login_url, self.payload).cookies[‘csrftoken‘] print u"登录成功...." return None @print_run_time def ex01(self, *args, **kw): """ 第1关:找密码""" url = ‘http://www.heibanke.com/lesson/crawler_ex00/‘ num = ‘‘ while True: content = requests.get(url + str(num)).text pattern = r‘<h3>(.*)</h3>‘ result = re.findall(pattern, content) try: num = int( ‘‘.join(map(lambda n: n if n.isdigit() else ‘‘, result[0]))) except: break print result[0] print result[0] @print_run_time def ex02(self, *args, **kw): """ 第2关:猜密码 """ url = ‘http://www.heibanke.com/lesson/crawler_ex01/‘ payload = {‘username‘: ‘test‘, ‘password‘: 0} for n in range(30): payload[‘password‘] = n content = requests.post(url, payload).text pattern = r‘<h3>(.*)</h3>‘ result = re.findall(pattern, content) print "try enter {} ...".format(n), result[0] if u"错误" not in result[0]: break @print_run_time def ex03(self, *args, **kw): """ 第3关:猜密码,加入了登录验证,CSRF保护 """ self.login(‘02‘) for n in range(30): self.payload[‘password‘] = n content = self.s.post(self.url, self.payload).text pattern = r‘<h3>(.*)</h3>‘ result = re.findall(pattern, content) print "try enter {} ...".format(n), result[0] if u"错误" not in result[0]: break def parseurl(self, url): """分析网页,查找密码位置和值""" while self.count < 100: response = self.s.get(url) if response.ok: content = response.text pos_pattern = r‘_pos.>(.*)</td>‘ val_pattern = r‘_val.>(.*)</td>‘ pos_list = re.findall(pos_pattern, content) val_list = re.findall(val_pattern, content) for pos, val in zip(pos_list, val_list): if pos not in self.pw_dict: self.pw_dict[pos] = val self.count = self.count + 1 print str(self.count) + ‘%‘ + self.count // 2 * ‘*‘ @print_run_time def ex04(self, *args, **kw): """ 第4关:找密码,加入了登录验证,CSRF保护,密码长度100位,响应时间增加 """ self.count = 0 self.login(‘03‘) self.pw_dict = {} pw_url = (‘http://www.heibanke.com/lesson/crawler_ex03/pw_list‘,) # 线程数,黑板客服务器15秒内最多响应2个请求,否则返回404. n = 2 threads = [Thread(target=self.parseurl, args=( pw_url)) for i in xrange(n)] for t in threads: print t.name, ‘start...‘ t.start() for t in threads: t.join() self.pw_list = [‘‘ for n in range(101)] for pos in self.pw_dict.keys(): self.pw_list[int(pos)] = self.pw_dict[pos] password = int(‘‘.join(self.pw_list)) self.payload[‘password‘] = password response = self.s.post(self.url, self.payload) pattern = r‘<h3>(.*)</h3>‘ result = re.findall(pattern, response.text) print result[0] if __name__ == ‘__main__‘: Hbk_crawler = hbk_crawler() Hbk_crawler.ex01() Hbk_crawler.ex02() Hbk_crawler.ex03() Hbk_crawler.ex04()
时间: 2024-10-08 10:04:16