# NOTE(review): the three lines below are a line-collapsed paste of a Python 2
# script. The original line numbers (1..152) are fused into the text and the
# string quotes are typographic, so the text does not parse as written.
#
# What the visible code does:
#   * opens 1.xlsx / Sheet1 with xlrd and indexes its rows starting at i = 2;
#   * for each row whose column 3 equals the string 女, calls login() with
#     column 1 as the username and characters [11:17] of column 13 as the
#     password, posting them to the CAS form at login_url;
#   * when login() sets flag to 1, get_message() parses the profile page into
#     the global `student` dict and download() writes student['pic'] to
#     <major>/<name><student id>.jpg under the current directory;
#   * calls session.cookies.clear() at the end (whether this is inside the
#     loop cannot be recovered from the collapsed text).
#
# NOTE(review): this signs in as each listed person using credentials taken
# from the spreadsheet and stores that person's profile photo. The code is
# intentionally left unchanged; confirm explicit authorization from the system
# owner and the account holders before repairing, running or extending it.
1 #-*- coding=utf-8 -*- 2 import requests 3 import re 4 import json 5 import time 6 from PIL import Image 7 import cStringIO 8 import cookielib 9 import urllib 10 import os 11 import xlrd 12 13 from requests.packages.urllib3.exceptions import InsecureRequestWarning,InsecurePlatformWarning 14 requests.packages.urllib3.disable_warnings(InsecureRequestWarning) 15 requests.packages.urllib3.disable_warnings(InsecurePlatformWarning) 16 17 data=xlrd.open_workbook(‘1.xlsx‘) 18 table=data.sheet_by_name(u‘Sheet1‘) 19 20 message_url=‘https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT‘ 21 login_url=‘https://matrix.dean.swust.edu.cn/cas/login‘ 22 topic_url=‘‘ 23 flag=0 24 25 student = {} 26 student = { 27 ‘学号‘:‘‘, 28 ‘姓名‘:‘‘, 29 ‘性别‘:‘‘, 30 ‘生日‘:‘‘, 31 ‘pic‘:‘‘, 32 ‘民族‘:‘‘, 33 ‘行政班‘:‘‘, 34 ‘专业‘:‘‘, 35 } 36 37 headers={ 38 ‘User-Agent‘:‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36‘ 39 } 40 41 session=requests.Session() 42 session.headers=headers 43 session.cookies = cookielib.LWPCookieJar(filename=‘cookies‘) 44 # try: 45 # session.cookies.load(ignore_discard=True) 46 # except: 47 # print u"未登陆过,需先登录" 48 49 50 def get_lt(url="https://matrix.dean.swust.edu.cn/cas/login"): 51 ‘‘‘‘‘_lt 是一个动态变化的参数‘‘‘ 52 global session 53 index_url = url 54 index_page = session.get(index_url,verify=False) 55 html = index_page.content 56 pattern = r‘name="lt" type="hidden" value="(.*?)"‘ 57 lt = re.findall(pattern, html) 58 return lt[0] 59 60 def login(username,password): 61 global session 62 global topic_url 63 global flag 64 data={ 65 ‘lt‘:get_lt(), 66 ‘username‘:username, 67 ‘password‘:password, 68 ‘service‘:‘https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentPortal:DEFAULT_EVENT‘, 69 } 70 loginurl=login_url 71 try: 72 login_page=session.post(loginurl,data=data) 73 login_code=login_page.content 74 pattern=r‘<a class="btn btn-primary" href="(.*?)"‘ 75 
real_url=re.findall(pattern, login_code) 76 topic_url=real_url[0] 77 flag=1 78 except: 79 print ‘error01‘ 80 session.cookies.save() 81 82 83 def isLogin(): 84 global session 85 url = "https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT" 86 login_code = session.get(url, allow_redirects=False).status_code 87 if int(x=login_code) == 200: 88 return True 89 else: 90 return False 91 92 def get_message(): 93 global session 94 global topic_url 95 global message_url 96 global student 97 98 html=session.get(topic_url) 99 html=session.get(message_url).text 100 101 pattern_ming=r‘<td>(.*?)</td>‘ 102 pattern_id=r‘<span class="number">(.*?)</span>‘ 103 pattern_pic=r‘<td style="padding:0;" width="135" height="180" valign="middle" align="center" rowspan="6"><img width="135" height="180" align="middle" src="(.*?)" /></td>‘ 104 message_name=re.findall(pattern_ming, html) 105 message_pic=re.findall(pattern_pic, html) 106 try: 107 student[‘学号‘]=re.findall(r‘<span class="number">(\d*?)</span>‘, message_name[2])[0] 108 except: 109 pass 110 111 student[‘姓名‘]=message_name[4] 112 student[‘性别‘]=message_name[6] 113 student[‘专业‘]=message_name[37] 114 #student[‘生日‘]=re.findall(r‘<span class="number">(.*?)</span>‘, message_name[8])[0] 115 #student[‘民族‘]=message_name[10] 116 student[‘行政班‘]=message_name[27] 117 student[‘pic‘]=‘https://matrix.dean.swust.edu.cn/acadmicManager/student/profile/‘+student[‘学号‘]+‘.jpg‘ 118 119 120 def download(): 121 global student 122 global session 123 basepath=os.path.abspath(‘.‘) 124 savepath=os.path.join(basepath,student[‘专业‘]) 125 if not os.path.exists(savepath): 126 os.mkdir(savepath) 127 try: 128 picpath=os.path.join(savepath,student[‘姓名‘]+student[‘学号‘]+‘.jpg‘) 129 r=session.get(student[‘pic‘]) 130 with open(picpath, "wb") as pic: 131 pic.write(r.content) 132 print u‘>>>>>>>>>成功抓取>>>>>>>>>>>>>>>‘+student[‘姓名‘] 133 except Exception, e: 134 pass 135 136 137 if __name__ == ‘__main__‘: 138 count=table.nrows 139 i=2 140 
while(count>0): 141 if(table.col_values(3)[i]==u‘女‘): 142 try: 143 login(str(int(table.col_values(1)[i])), str(table.col_values(13)[i])[11:17]) 144 except: 145 pass 146 if(flag==1): 147 get_message() 148 download() 149 flag=0 150 count=count-1 151 i=i+1 152 session.cookies.clear()
总结:
python处理excel>> http://www.cnblogs.com/lhj588/archive/2012/01/06/2314181.html
session释放>> http://stackoverflow.com/questions/23816139/clear-cookies-from-requests-python
注明:
文件 1.xlsx 是提供学生资料的 Excel 表格。
各处异常处理之间的取舍关系(哪些错误可以忽略、哪些必须中断)需要事先计划好。
时间: 2024-10-13 03:02:58