首先从东方财富网获取股票代码
再从网易财经下载股票历史数据
import json
import os
import random
import re
import time

import redis
import requests
from bs4 import BeautifulSoup as bs

# NOTE(review): the credential/host part of this URL ("[email protected]") looks
# like it was mangled by e-mail obfuscation when the snippet was published.
# Replace it with a real "password@host" (or pass redis_url explicitly)
# before calling get_stock_names().
REDIS_URL = 'redis://:[email protected]:6379'

# Seconds to wait on any single HTTP request before giving up; without a
# timeout a stalled connection would block the whole crawl forever.
REQUEST_TIMEOUT = 30

USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
)

# Paged Eastmoney JSONP listing endpoint; "{}" is the 1-based page number.
# Built from adjacent literals so that no whitespace ends up inside the query
# string (a multi-line literal would embed line breaks in the URL).
STOCK_LIST_URL = (
    'http://1.push2.eastmoney.com/api/qt/clist/get'
    '?cb=jQuery112405721872315676919_1566176986516&pn={}'
    '&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3'
    '&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2'
    '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,'
    'f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
    '&_=1566176986517'
)

# Matches "callback( ...payload... )" with an optional trailing semicolon.
_JSONP_RE = re.compile(r'^\s*[\w$.]+\((.*)\)\s*;?\s*$', re.S)


def get_stock_names(redis_url=REDIS_URL):
    """Scrape stock names/codes from Eastmoney into Redis and a text file.

    Every anchor text found in the listing page is appended to the Redis list
    ``stock_names`` (database 1) and written, one per line, to
    ``stock_names.txt`` in the current working directory.

    Args:
        redis_url: Connection URL handed to ``redis.from_url``. Defaults to
            the module-level ``REDIS_URL``.
    """
    rds = redis.from_url(redis_url, db=1, decode_responses=True)  # Redis db 1
    url = "http://quote.eastmoney.com/stocklist.html"
    headers = {
        'Referer': 'http://quote.eastmoney.com/center/gridlist.html',
        'User-Agent': USER_AGENT,
    }
    # NOTE(review): the original comment said the site is GBK-encoded, yet the
    # code decodes as UTF-8; the decoding is kept as written - confirm which
    # one the page really uses.
    page = requests.get(
        url, headers=headers, timeout=REQUEST_TIMEOUT
    ).content.decode('utf-8')
    soup = bs(page, 'lxml')
    # The original author expected two ul elements inside this container.
    all_ul = soup.find('div', id='table_wrapper-table').find_all('ul')
    with open('stock_names.txt', 'w+', encoding='utf-8') as f:
        for ul in all_ul:
            for a in ul.find_all('a'):
                # rpush appends the anchor text to the tail of the Redis list.
                rds.rpush('stock_names', a.text)
                f.write(a.text + '\n')


def _netease_code(stock_code):
    """Return the NetEase download code for *stock_code*, or None to skip it.

    Per the original author's notes: Shanghai codes start with 6 or 9 and get
    a leading "0"; Shenzhen codes start with 0, 2 or 3 and get a leading "1".
    Codes beginning with 201/202/203/204 are funds with no downloadable data
    and are skipped, as is anything with another leading character.
    """
    first = stock_code[:1]
    if first in ('6', '9'):
        return '0' + stock_code
    if first in ('0', '2', '3') and stock_code[:3] not in ('201', '202', '203', '204'):
        return '1' + stock_code
    return None


def get_data(stocklist, outfile=r'D:\PycharmProjects\web_scraping\stockdata'):
    """Download the full price history of each stock from NetEase Finance.

    For every supported stock the history page is fetched to read the
    earliest and latest available dates, then the CSV export for that range
    is saved as ``<outfile>/<stock_code>.csv``. A failure on one stock is
    printed and the loop moves on to the next one.

    Args:
        stocklist: Iterable of ``(stock_code, stock_name)`` pairs.
        outfile: Directory that receives the CSV files; created if missing.
    """
    headers = {
        'Referer': 'http://quotes.money.163.com/',
        'User-Agent': USER_AGENT,
    }
    os.makedirs(outfile, exist_ok=True)
    for stock_code, stock_name in stocklist:
        try:
            stock_code_new = _netease_code(stock_code)
            if stock_code_new is None:
                continue
            # The history page is addressed by the plain code; only the CSV
            # export endpoint needs the exchange-prefixed code.
            stock_url = 'http://quotes.money.163.com/trade/lsjysj_{}.html'.format(stock_code)
            page = requests.get(
                stock_url, headers=headers, timeout=REQUEST_TIMEOUT
            ).text
            soup = bs(page, 'lxml')
            # Earliest date for which data is available.
            start_time = soup.find(
                'input', {'name': 'date_start_type'}
            ).get('value').replace('-', '')
            # Latest date for which data is available.
            end_time = soup.find(
                'input', {'name': 'date_end_type'}
            ).get('value').replace('-', '')
            time.sleep(random.choice([1, 2]))  # pause 1-2 s between requests
            download_url = (
                "http://quotes.money.163.com/service/chddata.html"
                "?code={}&start={}&end={}"
                "&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;"
                "VOTURNOVER;VATURNOVER;TCAP;MCAP"
            ).format(stock_code_new, start_time, end_time)
            file_name = os.path.join(outfile, '{}.csv'.format(stock_code))
            with requests.get(
                download_url,
                headers=headers,
                stream=True,
                timeout=REQUEST_TIMEOUT,
            ) as data:
                # Do not save an HTTP error page as if it were CSV data.
                data.raise_for_status()
                with open(file_name, 'wb') as f:
                    for chunk in data.iter_content(chunk_size=10000):
                        if chunk:
                            f.write(chunk)
            print("{}数据已下载".format(stock_code))
        except Exception as e:
            # Per-stock boundary: report the failure and keep crawling.
            print("{}({})数据下载报错".format(stock_name, stock_code))
            print(e)


def _strip_jsonp(text):
    """Return the JSON payload wrapped inside a ``callback(...)`` response.

    Raises:
        ValueError: If *text* does not look like a JSONP response.
    """
    match = _JSONP_RE.match(text)
    if match is None:
        raise ValueError('response is not a JSONP payload')
    return match.group(1)


def fetch_stock_list(max_page=189):
    """Collect ``[code, name]`` pairs from the Eastmoney listing endpoint.

    Pages are requested from 1 up to *max_page*; paging stops early as soon
    as a page carries no data, so an over-estimated *max_page* is harmless.

    Args:
        max_page: Upper bound on the number of 20-item pages to request.

    Returns:
        A list of ``[code, name]`` lists taken from fields ``f12`` and ``f14``.
    """
    stocks = []
    for page in range(1, max_page + 1):
        response = requests.get(
            STOCK_LIST_URL.format(page), timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        payload = json.loads(_strip_jsonp(response.content.decode('utf-8')))
        data = payload.get('data')
        if not data or not data.get('diff'):
            # Past the last page the endpoint returns no data; stop here
            # instead of failing on a missing "diff" list.
            break
        for item in data['diff']:
            stocks.append([item['f12'], item['f14']])
    return stocks


if __name__ == "__main__":
    # Step 1: fetch stock codes and names from Eastmoney. The original author
    # observed 3769 entries, all with codes starting with 0, 3 or 6.
    max_page = 189
    stocklist = fetch_stock_list(max_page)
    # Step 2: download the price history of every stock from NetEase Finance.
    get_data(stocklist, outfile=r'D:\PycharmProjects\web_scraping\stockdata')
参考资料:
Python爬虫(5):比Selenium快100倍的方法爬东方财富网财务报表
原文地址:https://www.cnblogs.com/iupoint/p/11375932.html
时间: 2024-10-21 07:30:34