最近发现 吾志 上用户的头像都很个性,另外,对于没有把日记设为私密的用户,当天的日记是公开的,谁都可以查看。
所以,如果每天把所有可查看的日记爬一遍,那么~~ 哈哈
我以前对爬虫只是了解一点点,没有真的玩过。既然今晚兴致来了,那就随便学一下咯~
参考 http://cuiqingcai.com/1052.html
# coding=utf-8
"""Download the .jpg images referenced on https://wuzhi.me/last.

The script fetches the page, extracts every ``src="....jpg" alt=`` image
URL and saves the images as ``0.jpg``, ``1.jpg``, ... in the current
directory.  It is written to run on Python 2 (the original target) as
well as on Python 3.
"""
import contextlib
import os
import re
import urllib

try:  # Python 2 module names, as used by the original script
    import cookielib  # kept from the original import list; currently unused
    import urllib2
    from urllib import urlencode
except ImportError:  # Python 3 moved the same APIs into urllib.* / http.*
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode


def mkdir(path):
    """Create directory *path* (and missing parents) if it does not exist.

    Surrounding whitespace and trailing backslashes are removed from
    *path* first.  If nothing is left after that, nothing is created.

    Returns:
        The normalized path that was (or already had been) created.
    """
    # Remove surrounding whitespace, then any trailing "\" characters.
    path = path.strip().rstrip("\\")
    if not path:
        # os.makedirs("") raises; an empty path means "nothing to create".
        return path

    try:
        os.makedirs(path)
    except OSError:
        # Tolerate an already existing path (including one created by
        # someone else between a check and the call); re-raise real errors.
        if not os.path.exists(path):
            raise

    return path


def save_file(path, file_name, data):
    """Write *data* to ``path/file_name`` in binary mode.

    The directory is created when missing.  Nothing happens when *data*
    is None, which is what get_file() returns for a failed download.
    """
    if data is None:
        return

    # Use the path mkdir() normalized and created, not the raw argument;
    # otherwise " dir " would create "dir" but try to write into " dir /".
    directory = mkdir(path)
    with open(os.path.join(directory, file_name), "wb") as f:
        f.write(data)


user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/41.0.2272.118 Safari/537.36')
headers = {'User-Agent': user_agent}
values = {}
# Passing any non-None body (even an empty one) turns the request into a
# POST, so only encode a body when there really are form values to send.
data = urlencode(values).encode('utf-8') if values else None

# Matches the URL in: src="<url>.jpg" alt=
# [^"] keeps a match inside one attribute value, so a preceding non-jpg
# <img> tag can no longer be swallowed into the captured URL.
IMG_SRC_RE = re.compile(r'src="([^"]+\.jpg)" alt=')


def getHtml(url):
    """Fetch *url* with the module-level headers and return the raw body.

    Uses a 10 second timeout.  Network and HTTP errors propagate to the
    caller.
    """
    req = urllib2.Request(url, data, headers)
    # closing() guarantees the response is released even if read() fails
    # (Python 2 response objects are not context managers themselves).
    with contextlib.closing(urllib2.urlopen(req, timeout=10)) as page:
        return page.read()


def get_file(url):
    """Download *url* and return its content, or None on any failure.

    Failures are reported on stdout instead of being raised so that one
    broken image does not abort the whole crawl.
    """
    try:
        req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Same timeout as getHtml() so a stalled server cannot hang the run.
        with contextlib.closing(urllib2.urlopen(req, timeout=10)) as resp:
            return resp.read()
    except Exception as exc:  # best effort by design; Ctrl-C still works
        print("failed to download %s: %s" % (url, exc))
        return None


def find_image_urls(html):
    """Return the .jpg image URLs found in *html*, in document order.

    *html* may be text or the raw bytes returned by getHtml().
    """
    if isinstance(html, bytes) and not isinstance(html, str):
        # Python 3 only: a text pattern cannot search bytes.
        # NOTE(review): assumes the page is UTF-8 - confirm; undecodable
        # bytes are replaced rather than raising.
        html = html.decode('utf-8', 'replace')
    return IMG_SRC_RE.findall(html)


def getImg(html):
    """Download every image found in *html* into the current directory.

    Images are saved as ``<index>.jpg`` where index is the position of
    the URL in the page; failed downloads are skipped.

    Returns:
        The number of image URLs found (not the number saved).
    """
    urls = find_image_urls(html)
    for index, imgurl in enumerate(urls):
        save_file('.', '%s.jpg' % index, get_file(imgurl))
    return len(urls)


def main():
    """Crawl the public "last" page and print how many images it listed."""
    html = getHtml("https://wuzhi.me/last")
    print(getImg(html))


if __name__ == "__main__":
    main()
十分简陋,哈哈~
时间: 2024-10-24 18:45:33