时间:2017-8-3 23:30
Url:http://www.court.gov.cn
py3.4 + mysql + win7
"""Scrape the case-release listing of www.court.gov.cn into a MySQL table.

Every listing page of ``fabu-gengduo-15.html`` is downloaded, the title,
link and date of each entry are extracted with regular expressions, and the
rows are stored in the ``lawcase`` table of the ``PeopleCourt`` database.
"""
import re
import urllib.request
from time import sleep

SITE = 'http://www.court.gov.cn'
LIST_URL = SITE + '/fabu-gengduo-15.html?page='
HEADERS = {'User-Agent': 'Mozilla/5.0'}
HTTP_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the run
PAGE_DELAY = 2     # seconds to pause after each listing page

DB_NAME = 'PeopleCourt'
DB_SETTINGS = {
    'host': '127.0.0.1',
    'user': 'root',
    'passwd': 'root',
    # Titles are Chinese text; set the connection charset explicitly so the
    # statements encode correctly regardless of the driver's default.
    'charset': 'utf8mb4',
}

# Raw strings: "\." and "\?" are regex escapes, not Python string escapes.
LAST_PAGE_RE = re.compile(
    r'<li class="last"><a href="/fabu-gengduo-15\.html\?page=(\d+)">'
)
# re.S lets ".*?" run across the line breaks between the link and its date.
ITEM_RE = re.compile(
    r'<a title="(.*?)" target="_blank" href="(.*?)">.*?</a>'
    r'.*?<i class="date">(.*?)</i>',
    re.S
)

CREATE_TABLE_SQL = (
    'create TABLE lawcase(title char(100),url char(100),time char(50))'
)
# Placeholders are filled in by the driver, which escapes every value.
INSERT_SQL = 'insert INTO lawcase(title,url,time) VALUES (%s,%s,%s)'


def parse_last_page(html):
    """Return the number of the last listing page advertised in *html*.

    Falls back to 1 when the "last" pagination link is not present, so the
    crawl still covers the page that was already downloaded.
    NOTE(review): assumes a missing link means a single-page listing; confirm
    against the live site.
    """
    match = LAST_PAGE_RE.search(html)
    return int(match.group(1)) if match else 1


def parse_items(html):
    """Extract ``(title, url, date)`` tuples from one listing page.

    Line breaks inside a title are removed and the site-relative link is
    turned into an absolute URL.
    """
    return [
        (title.replace('\n', ''), SITE + href, date)
        for title, href, date in ITEM_RE.findall(html)
    ]


def fetch_page(page):
    """Download listing page number *page* and return it as decoded text."""
    req = urllib.request.Request(LIST_URL + str(page), headers=HEADERS)
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as res:
        return res.read().decode('utf-8')


def open_database():
    """Connect to MySQL and make sure the database and table exist.

    Returns an open connection with ``PeopleCourt`` selected. Connection
    and SQL errors propagate to the caller instead of being swallowed.
    """
    # Imported lazily so the parsing helpers can be used (and tested)
    # on machines where the MySQL driver is not installed.
    import pymysql

    con = pymysql.connect(**DB_SETTINGS)
    try:
        with con.cursor() as cur:
            cur.execute('create database if not exists ' + DB_NAME)
        con.select_db(DB_NAME)
        with con.cursor() as cur:
            # execute() returns the row count: non-zero means the table exists.
            if cur.execute("show tables like 'lawcase'"):
                print('Table existed')
            else:
                cur.execute(CREATE_TABLE_SQL)
    except Exception:
        con.close()
        raise
    return con


def save_items(con, items):
    """Insert *items* (``(title, url, date)`` tuples) and commit them."""
    if not items:
        return
    with con.cursor() as cur:
        cur.executemany(INSERT_SQL, items)
    # PyMySQL does not autocommit by default; without this the rows are
    # rolled back when the connection closes on a transactional engine.
    con.commit()


def main():
    """Crawl every listing page and store its entries in MySQL."""
    con = open_database()
    try:
        html = fetch_page(1)
        last_page = parse_last_page(html)
        print('page:' + str(last_page))
        for page in range(1, last_page + 1):
            print('Grab page:' + str(page))
            if page > 1:  # page 1 was already downloaded to read the count
                html = fetch_page(page)
            save_items(con, parse_items(html))
            sleep(PAGE_DELAY)
    finally:
        con.close()
    print('Ok')


if __name__ == '__main__':
    main()
数据库截图:
时间: 2024-10-03 13:29:52