"""Crawl the HRBUST online-judge rating list and download user avatars.

When run as a script it logs in, fetches the rating-rank page, follows the
links found on rank-list pages, and saves every ``large_avatar`` image it
encounters as ``<dest_dir>/<counter>.<ext>``.

The code runs on Python 2 (the original target) and on Python 3.
"""
import os
import re

try:  # Python 2 module names
    import Queue
    from urllib import urlretrieve
except ImportError:  # Python 3 equivalents
    import queue as Queue
    from urllib.request import urlretrieve

BASE_URL = "http://acm.hrbust.edu.cn/"
LOGIN_URL = BASE_URL + "index.php?m=User&a=login"
USER_INFO_URL = BASE_URL + "index.php?m=User&a=userInfo&user_name=1404020214"
RANK_PATH = "index.php?m=Ranklist&a=showRatingrank"
START_URL = BASE_URL + RANK_PATH

# NOTE(security): credentials are hard-coded in this script. They are kept
# as-is so the script behaves as before, but they should be moved out of the
# source (environment variable, config file) before this is shared.
LOGIN_DATA = {"user_name": "1304020306", "password": "123456"}

# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 30

# Crawl state shared by putUrl/getImg and the main loop.
alreadyImg = set()                 # absolute image URLs already handled
already = set()                    # absolute page URLs already queued
urllist = Queue.Queue(maxsize=-1)  # pages waiting to be fetched (unbounded)
x = 0                              # number of images saved so far

httpre = re.compile(r'a href="(.+?)"')
# Two groups: the full image path and its extension (png or jpg).
imgre = re.compile(r'img class="large_avatar" src="([^>]+?\.(png|jpg))>?"')


def _absolute(link):
    """Return *link* unchanged if it is already absolute, else site-rooted."""
    if link.startswith("http"):
        return link
    return BASE_URL + link


def putUrl(html):
    """Queue every not-yet-seen ``a href`` target found in *html*.

    Relative links are resolved against ``BASE_URL``. De-duplication is done
    on the absolute URL, so the relative and absolute spellings of the same
    page are queued only once.
    """
    for link in httpre.findall(html):
        realurl = _absolute(link)
        if realurl in already:
            continue
        already.add(realurl)
        urllist.put(realurl)


def getImg(html, dest_dir="C:/"):
    """Download every new ``large_avatar`` image referenced in *html*.

    Files are written to *dest_dir* as ``<n>.<ext>``, where ``n`` is the
    module-level counter ``x`` and ``ext`` is the extension matched in the
    page (``png`` or ``jpg``). The counter only advances when a download
    succeeds; failed downloads are reported and skipped.
    """
    global x
    for src, ext in imgre.findall(html):
        img_url = _absolute(src)
        if img_url in alreadyImg:
            continue
        alreadyImg.add(img_url)
        print(src)
        target = os.path.join(dest_dir, "%s.%s" % (x, ext))
        try:
            urlretrieve(img_url, target)
        except IOError as err:
            # urlretrieve signals network and file errors as IOError
            # (OSError on Python 3, which URLError derives from).
            print("failed to download %s: %s" % (img_url, err))
            continue
        x += 1


def main():
    """Log in, then crawl from the rating-rank page until the queue is empty."""
    # Imported here so the parsing helpers above can be imported (and tested)
    # without the third-party package being installed.
    import requests

    session = requests.session()
    session.post(LOGIN_URL, data=LOGIN_DATA, timeout=REQUEST_TIMEOUT)
    r = session.get(USER_INFO_URL, timeout=REQUEST_TIMEOUT)
    print(r.text)

    # Mark the start page as visited so a self-link does not re-queue it.
    already.add(START_URL)
    urllist.put(START_URL)

    while not urllist.empty():
        url = urllist.get()
        print(url)
        try:
            html = session.get(url, timeout=REQUEST_TIMEOUT).text
        except requests.RequestException as err:
            # One unreachable page must not abort the whole crawl.
            print("failed to fetch %s: %s" % (url, err))
            continue
        # Only rank-list pages are expanded for further links; every fetched
        # page is scanned for avatars.
        if RANK_PATH in url:
            putUrl(html)
        getImg(html)


if __name__ == "__main__":
    main()
# Time: 2024-12-24 16:03:40