# -*- coding: utf-8 -*-
# search.py
"""Scrape the shixin.court.gov.cn person listing into ``search.csv``.

Python 2 script (it relies on ``urllib2`` and ``sgmllib``).  Every listing
page is parsed for ``<tr style="height:28px;">`` rows; for each such row the
detail record is requested as JSON and one CSV line is produced.
"""
import csv
import httplib
import io
import json
import socket
import sys
import time
import urllib2
from contextlib import closing
from sgmllib import SGMLParser

LIST_URL = 'http://shixin.court.gov.cn/personMore.do?currentPage={}'
DETAIL_URL = "http://shixin.court.gov.cn/detail?id={}"
DETAIL_KEYS = ["age", "sexy", "cardNum", "areaName", "courtName", "gistId",
               "regDate", "gistUnit", "duty", "performance",
               "disruptTypeName"]

CSV_PATH = 'search.csv'
FIRST_PAGE = 1
LAST_PAGE = 79865

# Network policy: the original retried forever with no delay and no timeout.
MAX_ATTEMPTS = 5
RETRY_DELAY = 2.0       # seconds to wait between two attempts
REQUEST_TIMEOUT = 30    # seconds before a stalled request is abandoned

# Default output stream used by GetIdList.print_data(); opened by main().
csv_file = None


def _warn(message):
    """Write a diagnostic line to stderr (stdout carries progress output)."""
    sys.stderr.write(message + '\n')


def _fetch(url):
    """Return the body of *url*, or None once MAX_ATTEMPTS tries have failed.

    Only network-level errors are retried, so Ctrl-C and programming errors
    are no longer swallowed the way a bare ``except:`` swallowed them.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            with closing(urllib2.urlopen(url, timeout=REQUEST_TIMEOUT)) as resp:
                return resp.read()
        except (urllib2.URLError, socket.error, httplib.HTTPException) as exc:
            _warn('fetch failed ({0}/{1}) {2}: {3!r}'.format(
                attempt, MAX_ATTEMPTS, url, exc))
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY)
    return None


def _to_unicode(value):
    """Convert one cell to ``unicode`` without touching the default encoding.

    Replaces the ``reload(sys); sys.setdefaultencoding('utf-8')`` hack.
    Byte strings are decoded as UTF-8 (the encoding the original assumed);
    undecodable bytes are replaced instead of aborting the whole crawl.
    """
    if isinstance(value, unicode):
        return value
    if isinstance(value, str):  # Python 2 byte string
        return value.decode('utf-8', 'replace')
    return unicode(value)


def _csv_line(fields):
    """Format unicode *fields* as one CSV record, without a line terminator.

    The csv module quotes cells containing commas, quotes or line breaks,
    which a plain ``','.join`` would turn into a corrupt row.
    """
    terminator = '\r\n'
    buf = io.BytesIO()
    # The Python 2 csv module works on byte strings, hence encode/decode.
    writer = csv.writer(buf, lineterminator=terminator)
    writer.writerow([field.encode('utf-8') for field in fields])
    return buf.getvalue().decode('utf-8')[:-len(terminator)]


class GetIdList(SGMLParser):
    """Collect one CSV line per ``<tr style="height:28px;">`` row.

    Attributes:
        all_data: finished CSV lines (unicode, no trailing newline).
        IDlist: cells captured so far for the row being parsed.
        flag: True while inside a matching ``<tr>``.
        getdata: True while inside a ``<td>`` of a matching row.
    """

    def reset(self):
        """Initialise parser state; SGMLParser calls this on construction."""
        self.all_data = []
        self.IDlist = []
        self.flag = False
        self.getdata = False
        SGMLParser.reset(self)

    def start_tr(self, attrs):
        """Start capturing when the row carries style="height:28px;"."""
        for key, value in attrs:
            if key == 'style' and value == 'height:28px;':
                self.flag = True
                return

    def end_tr(self):
        """Close the current row and, if it captured cells, finish it."""
        self.flag = False
        # get_detail() reads IDlist[1] and IDlist[2]; a shorter row used to
        # raise IndexError, so it is dropped instead.
        if len(self.IDlist) > 2:
            self.get_detail(self.IDlist[1])
        else:
            self.IDlist = []

    def start_a(self, attrs):
        """Record the ``id`` attribute of a link found inside a captured cell."""
        if not self.getdata:
            return
        for key, value in attrs:
            if key == 'id':
                self.IDlist.append(value)

    def start_td(self, attrs):
        """Begin collecting text, but only inside a matching row."""
        if self.flag:
            self.getdata = True

    def end_td(self):
        """Stop collecting text at the end of the cell."""
        self.getdata = False

    def handle_data(self, text):
        """Append text found inside a captured cell to the current row."""
        if self.getdata:
            self.IDlist.append(text)

    def get_detail(self, pid):
        """Fetch the detail record for *pid* and store the finished CSV line.

        The last four captured cells are dropped and replaced by the values
        of DETAIL_KEYS taken from the JSON detail record.  If the record
        cannot be fetched or parsed, those values are left empty so the row
        is still written.  Reads ``self.IDlist[2]`` for the progress line,
        so the current row must hold at least three cells.
        """
        sys.stdout.write('{0} {1}\n'.format(pid, self.IDlist[2]))

        detail = {}
        raw = _fetch(DETAIL_URL.format(pid))
        if raw is not None:
            try:
                parsed = json.loads(raw)
            except ValueError as exc:
                _warn('invalid detail JSON for id {0}: {1!r}'.format(pid, exc))
            else:
                if isinstance(parsed, dict):
                    detail = parsed

        row = self.IDlist[:-4]
        row.extend(detail.get(key, '') for key in DETAIL_KEYS)
        self.all_data.append(_csv_line([_to_unicode(cell) for cell in row]))
        self.IDlist = []

    def print_data(self, out=None):
        """Write every collected line, newline-terminated, to a text stream.

        Args:
            out: stream accepting unicode (e.g. from ``io.open``); defaults
                to the module-level ``csv_file``.

        Raises:
            ValueError: if no stream is given and ``csv_file`` is not open.
        """
        target = csv_file if out is None else out
        if target is None:
            raise ValueError('no output stream: pass out= or run main()')
        for line in self.all_data:
            target.write(line + u'\n')


def main():
    """Crawl every listing page and append its rows to CSV_PATH."""
    global csv_file
    csv_file = io.open(CSV_PATH, 'w', encoding='utf-8')
    try:
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            content = _fetch(LIST_URL.format(page))
            if not content:
                continue  # page unavailable after all retries; move on
            parser = GetIdList()
            parser.feed(content)
            parser.print_data()
    finally:
        # Close even on Ctrl-C so already scraped rows reach the disk.
        csv_file.close()


if __name__ == '__main__':
    main()
# Time: 2025-01-01 23:36:14