在学过正则表达式后,跟着老师实战了“淘宝商品比价定向爬虫”和“股票数据定向爬虫”这两个实例。
一开始我想了下,思路很迷糊,不知道从哪下手。在看了老师的教程后,才慢慢地理解了。下面是代码与效果。
淘宝商品比价定向爬虫:
"""Targeted crawler that lists prices and titles from Taobao search results."""
import json
import re
from urllib.parse import quote

# Each Taobao result page is addressed by an item offset that grows by 44.
_ITEMS_PER_PAGE = 44

# Capture groups pull the value out directly, so no split()/eval() is needed.
_PRICE_RE = re.compile(r'"view_price":"([\d.]*)"')
_TITLE_RE = re.compile(r'"raw_title":"(.*?)"')


def getHTMLText(url):
    """Fetch *url* and return the decoded body, or "" if the request fails.

    Any ``requests`` error (connection problem, timeout, non-2xx status) is
    reported as an empty string so callers can simply skip the page.
    """
    # Imported here so the parsing/printing helpers stay importable in
    # environments where the third-party ``requests`` package is missing.
    import requests

    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        return ""
    r.encoding = r.apparent_encoding
    return r.text


def _decodeJSONString(raw):
    """Decode the escapes of a JSON string body; return *raw* if malformed."""
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw


def parsePage(ilt, html):
    """Append ``[price, title]`` pairs found in *html* to the list *ilt*.

    Prices and titles are matched independently and paired in order of
    appearance; if the two counts differ, the surplus entries are ignored.
    Both values are stored as strings. Titles have their JSON escapes
    decoded without ever evaluating scraped text as Python code.
    """
    if not html:
        return
    prices = _PRICE_RE.findall(html)
    titles = (_decodeJSONString(raw) for raw in _TITLE_RE.findall(html))
    ilt.extend([price, title] for price, title in zip(prices, titles))


def printGoodsList(ilt):
    """Print the collected ``[price, title]`` pairs as a numbered table."""
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    for count, (price, title) in enumerate(ilt, start=1):
        print(tplt.format(count, price, title))


def main(goods='acm', depth=2):
    """Crawl *depth* result pages for the keyword *goods* and print them."""
    start_url = 'https://s.taobao.com/search?q=' + quote(goods)
    infoList = []
    for page in range(depth):
        url = start_url + '&s=' + str(_ITEMS_PER_PAGE * page)
        # A failed download yields "", which parsePage treats as "no items".
        parsePage(infoList, getHTMLText(url))
    printGoodsList(infoList)


if __name__ == "__main__":
    main()
下面是效果:
股票数据定向爬虫:
"""Targeted crawler that saves per-stock quote details to a text file."""
import re
import sys
import traceback  # retained from the original listing; currently unused

# Matches an exchange prefix ("sh" or "sz") followed by a six-digit code.
_STOCK_CODE_RE = re.compile(r"[s][hz]\d{6}")

# Seconds to wait on one request before giving up instead of hanging.
_REQUEST_TIMEOUT = 30


def getHTMLText(url, code="utf-8"):
    """Fetch *url* and return its body decoded as *code*, or "" on failure.

    Any ``requests`` error (connection problem, timeout, non-2xx status) is
    reported as an empty string so callers can simply skip the page.
    """
    # Imported here so this module can be imported without the third-party
    # ``requests`` package being installed.
    import requests

    try:
        r = requests.get(url, timeout=_REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException:
        return ""
    r.encoding = code
    return r.text


def _makeSoup(html):
    """Parse *html* with BeautifulSoup's built-in ``html.parser`` backend."""
    from bs4 import BeautifulSoup

    return BeautifulSoup(html, 'html.parser')


def getStockList(lst, stockURL):
    """Append every stock code linked from the page at *stockURL* to *lst*.

    Links without an ``href`` attribute, or whose ``href`` contains no stock
    code, are skipped. Codes are appended in page order; duplicates are kept.
    """
    # The site's encoding is known, so pass it in directly; detecting it via
    # r.apparent_encoding would take longer.
    html = getHTMLText(stockURL, "GB2312")
    for link in _makeSoup(html).find_all('a'):
        href = link.attrs.get('href')
        if not isinstance(href, str):
            continue
        match = _STOCK_CODE_RE.search(href)
        if match:
            lst.append(match.group(0))


def _parseStockInfo(html):
    """Return the stock's name/field dict parsed from *html*, or None.

    None means the page lacks the expected ``stock-bets`` block or a usable
    name inside it.
    """
    stockInfo = _makeSoup(html).find('div', attrs={'class': 'stock-bets'})
    if stockInfo is None:
        return None
    names = stockInfo.find_all(attrs={'class': 'bets-name'})
    nameParts = names[0].text.split() if names else []
    if not nameParts:
        return None

    infoDict = {'股票名称': nameParts[0]}
    # <dt> holds the field label and the matching <dd> holds its value.
    for key, val in zip(stockInfo.find_all('dt'), stockInfo.find_all('dd')):
        infoDict[key.text] = val.text
    return infoDict


def getStockInfo(lst, stockURL, fpath):
    """Download each stock page in *lst* and append its details to *fpath*.

    One ``str(dict)`` line is written per stock that parses successfully.
    Stocks whose page cannot be fetched or parsed are skipped, but still
    count towards the progress percentage so it always finishes at 100%.
    A failed write is reported on stderr and the crawl continues.
    """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        html = getHTMLText(stockURL + stock + ".html")
        infoDict = _parseStockInfo(html) if html else None
        if infoDict is not None:
            try:
                # Re-opened per record so finished rows survive an abort.
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(infoDict) + '\n')
            except OSError as err:
                print("\n{}: {}".format(fpath, err), file=sys.stderr)
        print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")


def main(stock_list_url='http://quote.eastmoney.com/stocklist.html',
         stock_info_url='https://gupiao.baidu.com/stock/',
         output_file='E:/BaiduStockInfo.txt'):
    """Collect all stock codes, then save each stock's details to a file."""
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == "__main__":
    main()
效果的话先放这个吧,爬取时间有点慢
时间: 2024-10-07 07:20:49