"""Scrape course/teacher pairs from the course-offering site into MySQL.

The script walks every academy page, follows each course link found there,
extracts the course name and the assigned teachers with regular expressions,
and stores one ``(course_name, teacher)`` row per teacher in the
``course_to_teacher`` table.
"""
import re
from typing import List, Optional

import requests
import pymysql

base_url = "http://202.192.240.54/kkcx/"

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Patterns are compiled once here instead of on every loop iteration.
ACADEMY_URL_RE = re.compile(r'yxkk.aspx\?id=\d{2}')
COURSE_URL_RE = re.compile(r'yxkk_view.*id=\d{7}')
COURSE_NAME_RE = re.compile(r"课程名:(.*)学")
TEACHER_RE = re.compile(r"<span.*cnameLabel.*>(.*)</span>")

# Placeholder value the site shows when no teacher has been assigned.
UNASSIGNED_TEACHER = "未指定"

# Parameterized statement: the driver escapes the values, so scraped text
# containing quotes cannot break or inject into the query.
INSERT_SQL = "insert into course_to_teacher(course_name,teacher) values(%s,%s)"

# Connect to MySQL and recreate the target table from scratch.
db = pymysql.connect(host="localhost", user="root", passwd="root", db="course_teacher", charset="utf8")
cursor = db.cursor()
sql = "drop table if exists course_to_teacher"
cursor.execute(sql)
sql = "create table course_to_teacher(id int auto_increment,course_name varchar(80),teacher varchar(80),primary key(id))"
cursor.execute(sql)


def get_academy_urls(url: str = "http://202.192.240.54/kkcx/yxkk.aspx") -> List[str]:
    """Return the relative URLs of every academy page linked from *url*."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return ACADEMY_URL_RE.findall(response.text)


def get_all_course_url(url: str) -> None:
    """Collect every course URL on the academy page *url* and process them.

    The course links found on the page are handed to
    ``get_teacher_course``, which stores the results in the database.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    urls = COURSE_URL_RE.findall(response.text)
    get_teacher_course(urls)


def get_teacher_course(urls: List[str]) -> None:
    """Fetch each course page in *urls* and insert its teachers into MySQL.

    Each entry of *urls* is appended to ``base_url``. Pages on which no
    course name can be found are skipped, as are empty teacher entries and
    the "unassigned" placeholder.
    """
    for url in urls:
        response = requests.get(base_url + url, timeout=REQUEST_TIMEOUT)
        page = response.text

        # Course name.
        course_name = get_course_name(COURSE_NAME_RE.findall(page))
        if course_name is None:
            # Nothing to attach the teachers to; skip this page.
            continue

        # Teachers assigned to the course.
        inserted = False
        for teacher in TEACHER_RE.findall(page):
            if teacher and teacher != UNASSIGNED_TEACHER:
                cursor.execute(INSERT_SQL, (course_name, teacher))
                inserted = True
        if inserted:
            db.commit()


def get_course_name(course_info: List[str]) -> Optional[str]:
    """Return the course name taken from the regex matches *course_info*.

    The last match is used, with leading and trailing ideographic spaces
    (U+3000) stripped. Returns ``None`` when *course_info* is empty.
    """
    if not course_info:
        return None
    return course_info[-1].strip("\u3000")


if __name__ == "__main__":
    try:
        for acadmy_course_url in get_academy_urls():
            get_all_course_url(base_url + acadmy_course_url)
    finally:
        # Release the database resources even if the scrape fails midway.
        cursor.close()
        db.close()

# TODO: optimize, optimize, optimize
# Deduplicate by copying into a new table:
# CREATE TABLE course_to_teacher_copy LIKE course_to_teacher
# INSERT INTO course_to_teacher_copy(course_name,teacher) SELECT DISTINCT(course_name),teacher FROM course_to_teacher
# Time: 2024-10-08 22:10:21