怎么说呢,这个虽然是装逼的产物,但是还是有一定实用价值的。
先说说我看到这只鸡腿的时候,是怎么吃的:
整体内容可以先分为三部分,抓取数据,整理数据,插入到表。
抓取数据:
拿到网址后,进去看了一下需要的数据都在什么位置,结构了解清楚后就可以准备开始把它撸出来了。
想要数据当然要先发个请求咯:
#定义一个内置浏览器session s = requests.session() url = 'http://www.310win.com/jingcaizuqiu/rangqiushengpingfu/kaijiang_jc_all.html' # header把爬虫伪装的像是正常的访问 header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 ', 'Connection': 'keep-alive' } # 用requests库的get方法与服务器进行连接 w = s.get(url, headers=header)
请求之后再对返回值做一定的处理:
在这里需要为获取数据做一个动作,因为html中的换行符<br>是单个的,后面提取时会有问题,所以在这里先把它去掉。
# 用chardet库的detect方法获取网页编码格式,具体的编码格式在encoding这个键对应的值中 w.encoding = chardet.detect(w.content)['encoding'] # 设置好编码格式后,用text方法返回为字符串供beautifulSoup处理 e = w.text # 替换掉其中的换行符(注意这里不能替换'-',不然比分数据会缺失) z=(e.replace('<br>','')).replace('<br/>','') # 使用BeautifulSoup函数把z转化为一个BeautifulSoup对象,解析器的类型是lxml soup = BeautifulSoup(z, 'lxml')
然后就是想想办法把需要的数据搞出来了:
首先根据数据位置与标签结构,用class属性获取到需要的内容,
之后我这里用了个笨办法(大佬请略过),循环把需要的数据写进 list 中,
然后顺便把一些数据处理成插sql时需要的格式。
# 获取class属性为ni,ni2中的td list = soup.select('tr.ni td') list2 = soup.select('tr.ni2 td') # 用string方法获取td标签中的数据,写进list a = 0 while a < len(list2): for z1 in list[a]: MATCH_ID.append('2018'+z1.string[5:10].replace('-','')+'%s'%z1.string[2:5]) for x1 in list[a+1]: saishi.append(x1.string) for c1 in list[a+3]: BF.append(c1.string) for v1 in list[a+4]: kedui.append(v1.string) for b1 in list[a+5]: BF_HALF.append(b1.string) for n1 in list[a-2]: pankou.append(n1.string) for m1 in list[a-1]: shuju.append(m1.string) for z2 in list2[a]: MATCH_ID.append('2018'+z2.string[5:10].replace('-','')+'%s'%z2.string[2:5]) for x2 in list2[a+1]: saishi.append(x2.string) for c2 in list2[a+3]: BF.append(c2.string) for v2 in list2[a+4]: kedui.append(v2.string) for b2 in list2[a+5]: BF_HALF.append(b2.string) for n2 in list2[a-2]: pankou.append(n2.string) for m2 in list2[a-1]: shuju.append(m2.string) a = a+13
这个时候需要的数据就拿到了,然后就可以开始吃sql部分了,数据就在爬取和插入之间的小步骤中做处理。
插入到表:
装好pymysql模块,调用之后连接到数据库,
这里可以先转换数据格式,再插入,我是在插入的时候转的格式,都一样。
# 连接数据库 connect = pymysql.Connect( host = '****', port = ****, user = '****', passwd = '****', db = '****', charset = 'utf8' ) # 也可以先转换数据格式,再插入变量 # label = list((1, 2, 3, 4, 5, 6, 7, 8, 9, "10")) # label_A = [str(i) for i in label] # print(label_A) # 获取游标 cursor = connect.cursor()
最后写条insert into的sql,循环插入所有的数据就ok啦。
这里还有一个问题,就是爬到的数据可能有为 None 的情况,insert 时要跳过为 None 的字段。我在写判断时犯了一个低级错误:条件里把 None 写成了字符串 'None',而实际上没有抓到数据时得到的 None 是 Python 的空值对象,应该直接和 None 比较(用 is None 判断)。罪过罪过,善哉善哉。
到这里鸡腿就算是吃完了,还剩骨头。
有兴趣的可以建个表舔一下0.0
import requests
from bs4 import BeautifulSoup
import chardet
import pymysql.cursors
import time


class Reptilian(object):
    """Scrape Jingcai football match results from 310win.com into MySQL.

    Fixes over the pasted original:
    - straight ASCII quotes (the pasted source used curly quotes and would
      not parse) and restored indentation
    - ``data()`` is called once in ``insert()`` instead of twelve times
      (one HTTP request instead of twelve)
    - the trailing commas that turned each unpacked list into a 1-tuple
      (and forced ``X[0][Value]`` indexing) are gone
    - parameterized SQL instead of ``%`` string interpolation (no SQL
      injection / quoting bugs); missing odds become NULL instead of six
      near-duplicate INSERT branches
    - ``while Value <= len(...)`` off-by-one / potential infinite loop
      replaced by a plain ``for`` loop
    - no shadowing of the builtin ``list`` or of the class name
    - cursor/connection closed in ``finally``; unused ``decimal`` wildcard
      import removed
    """

    def data(self):
        """Fetch and parse the results page.

        Returns twelve parallel lists:
        (MATCH_ID, BF_HALF, BF, BF_ODDS, SPF, SPF_ODDS,
         RQ, RQ_ODDS, JQ, JQ_ODDS, BQC, BQC_ODDS)
        """
        # Columns scraped but not returned (kept from the original script
        # for reference/debugging).
        saishi = []   # league name
        kedui = []    # away team
        pankou = []   # handicap
        shuju = []    # misc cell data

        # Columns consumed by insert().
        MATCH_ID = []
        BF_HALF = []
        BF = []
        BF_ODDS = []
        SPF = []
        SPF_ODDS = []
        RQ = []
        RQ_ODDS = []
        JQ = []
        JQ_ODDS = []
        BQC = []
        BQC_ODDS = []

        session = requests.session()
        url = 'http://www.310win.com/jingcaizuqiu/rangqiushengpingfu/kaijiang_jc_all.html'
        # Browser-like headers so the request is not rejected as a bot.
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 ',
            'Connection': 'keep-alive'
        }
        response = session.get(url, headers=header)

        # Detect the page encoding before decoding the body to text.
        response.encoding = chardet.detect(response.content)['encoding']

        # Strip <br> tags first: a score split across a line break would
        # otherwise lose data when .string is read.
        html = response.text.replace('<br>', '').replace('<br/>', '')
        soup = BeautifulSoup(html, 'lxml')

        def collect(cells, index, out, transform=None):
            # The original iterated each child of the cell and appended its
            # .string; kept as-is (in practice a cell has one text child).
            for child in cells[index]:
                value = child.string
                out.append(transform(value) if transform else value)

        def make_match_id(text):
            # Cell text looks like '周X MM-DD ...' -> '2018' + 'MMDD' + text[2:5].
            # NOTE(review): the year is hard-coded to 2018, as in the original.
            return '2018' + text[5:10].replace('-', '') + '%s' % text[2:5]

        # <td> cells: 13 per match row, alternating row classes ni / ni2.
        tds = soup.select('tr.ni td')
        tds2 = soup.select('tr.ni2 td')
        a = 0
        while a < len(tds2):
            for row in (tds, tds2):
                collect(row, a, MATCH_ID, make_match_id)
                collect(row, a + 1, saishi)
                collect(row, a + 3, BF)
                collect(row, a + 4, kedui)
                collect(row, a + 5, BF_HALF)
                # a-2 / a-1 are negative on the first pass and wrap to the
                # end of the list — identical to the original's behavior.
                collect(row, a - 2, pankou)
                collect(row, a - 1, shuju)
            a += 13

        # <a> cells: 8 per match row (the odds links).
        links = soup.select('tr.ni a')
        links2 = soup.select('tr.ni2 a')
        b = 2
        while b < len(links2):
            for row in (links, links2):
                collect(row, b, BF_ODDS)
                collect(row, b - 1, SPF_ODDS)
                collect(row, b - 2, RQ_ODDS)
                collect(row, b + 1, JQ_ODDS)
                collect(row, b + 2, BQC_ODDS)
            b += 8

        # <span> cells: 5 per match row (the result labels).
        spans = soup.select('tr.ni span')
        spans2 = soup.select('tr.ni2 span')
        c = 1
        while c < len(spans2):
            for row in (spans, spans2):
                collect(row, c, SPF)
                collect(row, c - 1, RQ)
                collect(row, c + 2, JQ)
                collect(row, c + 3, BQC)
            c += 5

        return (MATCH_ID, BF_HALF, BF, BF_ODDS, SPF, SPF_ODDS,
                RQ, RQ_ODDS, JQ, JQ_ODDS, BQC, BQC_ODDS)

    def insert(self):
        """Scrape once via data() and insert every match row into MySQL.

        Rows with missing odds get NULL in the corresponding column
        instead of being routed through per-column INSERT variants.
        """
        now = time.strftime('%Y%m%d', time.localtime())

        # One scrape for all twelve lists (the original issued one HTTP
        # request per list).
        (MATCH_ID, BF_HALF, BF, BF_ODDS, SPF, SPF_ODDS,
         RQ, RQ_ODDS, JQ, JQ_ODDS, BQC, BQC_ODDS) = self.data()

        # TODO(review): fill in real connection credentials.
        connect = pymysql.Connect(
            host='',
            port=3306,
            user='',
            passwd='',
            db='',
            charset='utf8'
        )
        cursor = connect.cursor()

        # Parameterized statement: pymysql quotes/escapes each value and
        # turns Python None into SQL NULL, so no string formatting and no
        # None-skipping branches are needed.
        sql = (
            "INSERT INTO bdwork_test.T_LOT_FOOT_MATCH_RESULT"
            "(MATCH_ID,BF_HALF,BF,SPF,RQ,JQ,BQC,SUB_TIME,UPDATE_TIME,"
            "BF_ODDS,SPF_ODDS,RQ_ODDS,JQ_ODDS,BQC_ODDS)"
            "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )

        def to_float(value):
            # Odds arrive as strings, or None when the site had no data.
            return float(value) if value is not None else None

        try:
            for i in range(len(MATCH_ID)):
                cursor.execute(sql, (
                    MATCH_ID[i], BF_HALF[i], BF[i], SPF[i], RQ[i], JQ[i],
                    BQC[i], now, now,
                    to_float(BF_ODDS[i]), to_float(SPF_ODDS[i]),
                    to_float(RQ_ODDS[i]), to_float(JQ_ODDS[i]),
                    to_float(BQC_ODDS[i]),
                ))
                connect.commit()
                print('插入第%s条数据' % (i + 1))
        finally:
            # Always release the cursor and connection, even on failure.
            cursor.close()
            connect.close()
        return print('执行完成')


if __name__ == '__main__':
    # Use a distinct instance name — the original shadowed the class name.
    reptilian = Reptilian()
    reptilian.insert()
原文地址:https://www.cnblogs.com/lvzhenhua/p/9668901.html
时间: 2024-10-01 00:29:06