这个可以作为 XPath 的练手项目:爬取股吧 2016 年 6 月到 2016 年 12 月的文章标题和发帖时间。
代码如下:
import csv

import requests
from lxml import etree
from requests.exceptions import RequestException


def get_one_index_page(url):
    """Fetch the HTML source of one list page.

    :param url: absolute URL of the page to download
    :return: the response body as text, or None when the request raises a
        RequestException or the status code is not 200
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
    try:
        # Without a timeout a stalled connection would block the crawl
        # forever; timeouts surface as RequestException and are handled below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page(text):
    """Extract (title, post time) pairs from the source of a list page.

    :param text: HTML source of a list page; None or an empty string
        yields no rows
    :return: an iterator of (title, time) tuples, each time prefixed
        with '2016-'
    """
    if not text:
        # get_one_index_page returns None on failure: nothing to parse.
        return zip()
    html = etree.HTML(text)
    if html is None:
        # NOTE(review): lxml may hand back None for input without any
        # parsable markup -- treated here as "no rows"; confirm.
        return zip()
    titles = html.xpath("//span[@class='l3 a3']/a/text()")
    times = html.xpath("//span[@class='l5 a5']/text()")
    # The first time match is skipped -- presumably it belongs to the table
    # header row, which has no title link; verify against the live page.
    dated_times = ['2016-' + stamp for stamp in times[1:]]
    return zip(titles, dated_times)


def change_page(number):
    """Build the URL of a list page.

    :param number: page number (integer) to request
    :return: the full URL of that page
    """
    base_url = 'http://guba.eastmoney.com/'
    return base_url + 'list,zssh000016,f_%d.html' % number


def save_to_csv(result, filename):
    """Append one row to a CSV file.

    :param result: sequence of values forming the row
    :param filename: path of the CSV file; it is created if missing
    :return: None
    """
    # newline='' is what the csv module expects (otherwise blank lines show
    # up between rows on Windows); utf-8 keeps non-ASCII titles writable
    # regardless of the platform's default encoding.
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerow(result)


def main():
    """Crawl list pages 265 to 271 and append every row to data_new.csv."""
    for page in range(265, 272):
        url = change_page(page)
        text = get_one_index_page(url)
        if text is None:
            # Download failed for this page; carry on with the next one.
            continue
        for row in parse_page(text):
            save_to_csv(row, 'data_new.csv')


if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/lattesea/p/11746486.html
时间: 2024-11-09 04:58:41