雪球数据的爬取

import requests
from lxml import etree
import json
import pymongo

# Connect to the local MongoDB server; crawled articles are stored in the
# "xueqiu" collection of the "xueqiu" database.
client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client.xueqiu
collection = db.xueqiu

url = 'https://xueqiu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
# Shared session for every request in this script. The home page is fetched
# once up front so the session holds whatever cookies the site sets
# (presumably required by the JSON endpoint -- confirm against the site).
session = requests.Session()
session.get(url=url, headers=headers)

def get_page_list():
    """Fetch the public timeline feed and crawl every article it lists.

    Requests 10 entries from the public-timeline JSON endpoint through the
    shared module-level ``session``, decodes the JSON payload embedded in
    each entry and passes the resulting article URL to ``parse_detail``.

    Raises:
        requests.RequestException: if the feed request times out, fails, or
            is answered with an HTTP error status.
    """
    # The query string is supplied through ``params`` only. Keeping it in the
    # URL as well made requests append every parameter a second time.
    url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
    params = {
        "since_id": "-1",
        "max_id": "-1",
        "count": "10",
        "category": "-1",
    }
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = session.get(url=url, headers=headers, params=params, timeout=10)
    response.raise_for_status()

    for entry in response.json()["list"]:
        # "data" is itself a JSON-encoded string, hence the second decode.
        per_info = json.loads(entry["data"])
        # "target" is appended to the site root to form the article URL.
        detail_url = "https://xueqiu.com" + per_info["target"]
        parse_detail(detail_url)

def parse_detail(url):
    """Download one article page, extract its title and body, and store it.

    The title and the text of every non-empty ``<p>`` in the article body are
    pulled out with XPath and written to the module-level MongoDB
    ``collection`` as a single document with ``title`` and ``content`` keys.

    Args:
        url: Absolute URL of the article detail page.

    Raises:
        requests.RequestException: if the page request times out, fails, or
            is answered with an HTTP error status.
    """
    response = session.get(url=url, headers=headers, timeout=10)
    # Refuse to parse an error page; it would be stored as an empty article.
    response.raise_for_status()
    tree = etree.HTML(response.text)

    title = tree.xpath('//div[@class="container article__container"]//h1[@class="article__bd__title"]/text()')
    print(title)
    print("==" * 20)

    paragraphs = []
    for p in tree.xpath('//div[@class="article__bd__detail"]/p'):
        # Only direct text and text inside <b> children is collected.
        text = "".join(p.xpath('./text()|./b/text()')).strip()
        if text:
            paragraphs.append(text)

    data_dict = {
        # xpath() returns a list of text nodes; it is stored unchanged so the
        # document shape stays the same as for previously saved records.
        "title": title,
        "content": "".join(paragraphs),
    }
    # Collection.insert() is deprecated in PyMongo 3 and removed in PyMongo 4.
    collection.insert_one(data_dict)

def main():
    """Script entry point: run a single crawl pass over the timeline feed."""
    get_page_list()

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

优化成redis增量式获取数据

import requests
from lxml import etree
import json
from redis import Redis
import pymongo
import time
import datetime

# MongoDB: crawled articles are stored in the "xueqiu" collection of the
# "xueqiu" database on the local server.
client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client.xueqiu
collection = db.xueqiu

# Redis: holds the set of article URLs that have already been crawled.
conn = Redis(host='127.0.0.1', port=6379)

url = 'https://xueqiu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
# Shared session for every request in this script. The home page is fetched
# once up front so the session holds whatever cookies the site sets
# (presumably required by the JSON endpoint -- confirm against the site).
session = requests.Session()
session.get(url=url, headers=headers)

def get_page_list():
    """Fetch the public timeline feed and crawl only articles not seen before.

    Requests 10 entries from the public-timeline JSON endpoint through the
    shared module-level ``session``. Each article URL is added to the Redis
    set ``news_urls``; only URLs that were not already in the set are passed
    to ``parse_detail``.

    Raises:
        requests.RequestException: if a request times out, fails, or is
            answered with an HTTP error status.
    """
    # The query string is supplied through ``params`` only. Keeping it in the
    # URL as well made requests append every parameter a second time.
    url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
    params = {
        "since_id": "-1",
        "max_id": "-1",
        "count": "10",  # fetch 10 entries
        "category": "-1",
    }
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = session.get(url=url, headers=headers, params=params, timeout=10)
    response.raise_for_status()

    for entry in response.json()["list"]:
        # "data" is itself a JSON-encoded string, hence the second decode.
        per_info = json.loads(entry["data"])
        detail_url = "https://xueqiu.com" + per_info["target"]

        # SADD returns 0 when the member already exists, i.e. the article
        # was crawled on an earlier pass.
        if conn.sadd('news_urls', detail_url) == 0:
            print('暂无最新数据可爬取......')
            continue

        print('有最新数据的更新......')
        try:
            parse_detail(detail_url)
        except Exception:
            # Un-mark the URL so an article whose download or insert failed
            # is retried on the next pass instead of being skipped forever.
            conn.srem('news_urls', detail_url)
            raise

def parse_detail(url):
    """Download one article page, extract its title and body, and store it.

    The title and the text of every non-empty ``<p>`` in the article body are
    pulled out with XPath and written to the module-level MongoDB
    ``collection`` as a single document with ``title`` and ``content`` keys.

    Args:
        url: Absolute URL of the article detail page.

    Raises:
        requests.RequestException: if the page request times out, fails, or
            is answered with an HTTP error status.
    """
    response = session.get(url=url, headers=headers, timeout=10)
    # Refuse to parse an error page; it would be stored as an empty article.
    response.raise_for_status()
    tree = etree.HTML(response.text)

    title = tree.xpath('//div[@class="container article__container"]//h1[@class="article__bd__title"]/text()')
    print(title)
    print("==" * 20)

    paragraphs = []
    for p in tree.xpath('//div[@class="article__bd__detail"]/p'):
        # Only direct text and text inside <b> children is collected.
        text = "".join(p.xpath('./text()|./b/text()')).strip()
        if text:
            paragraphs.append(text)

    data_dict = {
        # xpath() returns a list of text nodes; it is stored unchanged so the
        # document shape stays the same as for previously saved records.
        "title": title,
        "content": "".join(paragraphs),
    }
    # Collection.insert() is deprecated in PyMongo 3 and removed in PyMongo 4.
    collection.insert_one(data_dict)

def main(interval_seconds=300, initial_delay_seconds=6):
    """Poll the timeline feed forever, crawling new articles periodically.

    Waits ``initial_delay_seconds`` once, then repeats: sleep
    ``interval_seconds``, print the current time, run ``get_page_list``.
    The defaults reproduce the original cadence. The initial wait was a
    CPU-spinning loop and the rescheduling branch was never reached after
    the first crawl, so both are replaced by plain sleeps.

    Args:
        interval_seconds: Pause before each crawl pass, in seconds.
        initial_delay_seconds: One-off pause before the first cycle starts.
    """
    time.sleep(initial_delay_seconds)
    while True:
        time.sleep(interval_seconds)
        # Print the time the pass actually starts, not a pre-sleep timestamp.
        print(datetime.datetime.now())
        try:
            get_page_list()
        except requests.RequestException as exc:
            # A transient network failure must not stop a long-running
            # poller; report it and try again on the next cycle.
            print('本次爬取失败, 将在下个周期重试:', exc)

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

原文地址：https://www.cnblogs.com/kenD/p/11123726.html

时间： 2024-10-05 21:18:36

雪球数据的爬取的相关文章

使用 Chrome 浏览器插件 Web Scraper 10分钟轻松实现网页数据的爬取

本文标签: WebScraper Chrome浏览器插件网页数据的爬取使用Chrome 浏览器插件 Web Scraper 可以轻松实现网页数据的爬取,不写代码,鼠标操作,点哪爬哪,还不用考虑爬虫中的登陆.验证码.异步加载等复杂问题. Web Scraper插件 Web Scraper 官网中的简介: Web Scraper Extension (Free!)Using our extension you can create a plan (sitemap) how a web site

python爬虫入门之移动端数据的爬取

第七章移动端数据的爬取基于某一款抓包工具 : fiddler ,青花瓷 ,mitmproxy 7.1 fiddler 基本配置 7.1.1fiddler简介和安装什么是Fiddler? Fiddler是位于客户端和服务器端的HTTP代理,也是目前最常用的http抓包工具之一 . 它能够记录客户端和服务器之间的所有 HTTP请求,可以针对特定的HTTP请求,分析请求数据.设置断点.调试web应用.修改请求的数据,甚至可以修改服务器返回的数据,功能非常强大,是web调试的利器. Fiddler

python爬虫---CrawlSpider实现的全站数据的爬取,分布式,增量式,所有的反爬机制

CrawlSpider实现的全站数据的爬取新建一个工程 cd 工程创建爬虫文件:scrapy genspider -t crawl spiderName www.xxx.com 连接提取器LinkExtractor 可以根据指定的规则对指定的连接进行提取提取的规则就是构造方法中的allow('正则表达式')参数决定规则解析器Rule 可以将将连接提取器提取到的连接进行请求发送,可以根据指定的规则(callback)对请求到的数据进行解析 follow=True:将连接提取器继续作用到

爬虫入门五（移动端数据的爬取）

常用的抓包工具常用的抓包工具:本质就是一款服务器,适用于实现请求和响应的拦截 fiddler 默认只可以抓取http协议的请求和响应 https: tools - options - https - decrypt https traffic 含义:将fiddler的证书安装到本地青花瓷(charles) mitmproxy 移动端数据的爬取配置相关的环境 fiddler的配置: tools->options->connections->allow remote conxxx 查看

【实例】--股票数据定向爬取

从股票列表网页获取股票代码根据股票代码去股票详情页面获取股票详细信息 1. 股票列表页面凤凰网财经—股票信息 http://app.finance.ifeng.com/list/stock.php?t=ha&f=chg_pct&o=desc&p=1 2. 股票详细信息老虎社区—股票详情 https://www.laohu8.com/stock/600210 股票数据定向爬取思路 1. 查看网站robots协议,查看网站是否可以爬取 2. 查看网页源代码,查看网页信息是否可以直

药大贴吧用户数据资料爬取与简单分析

使用python爬虫连接到药大贴吧的首页,然后爬取每个话题的链接.将链接记录到一个列表中.打开列表中的链接,读取第一页页的用户的主页链接和话题下的帖子页数.将用户的主页连接记录到一个集合中.如果发现有多页,就记录每一页的连接,再从这些连接中读取用户的主页连接记录到集合中.这样可爬取首页下所有用户的主页url. 依次从集合中取出URL,打开主页,记录用户名称,性别,粉丝数,关注者的信息,发帖量等资料. #coding:utf-8 import urllib2 import re from bs4

基于java的网络爬虫框架(实现京东数据的爬取，并将插入数据库)

原文地址http://blog.csdn.net/qy20115549/article/details/52203722 本文为原创博客,仅供技术学习使用.未经允许,禁止将其复制下来上传到百度文库等平台. 目录网络爬虫框架网络爬虫的逻辑顺序网络爬虫实例教学 model main util parse db 再看main方法爬虫效果展示网络爬虫框架写网络爬虫,一个要有一个逻辑顺序.本文主要讲解我自己经常使用的一个顺序,并且本人经常使用这个框架来写一些简单的爬虫,复杂的爬虫,也是在这个基

【个人】爬虫实践，利用xpath方式爬取数据之爬取虾米音乐排行榜

实验网站:虾米音乐排行榜网站地址:http://www.xiami.com/chart 难度系数:★☆☆☆☆ 依赖库:requests、lxml的etree (安装lxml:pip install lxml) IDEA开发工具:PyCharm_2017.3 Python版本:Python3 期望结果:爬取出排行榜歌名以及对应歌手运行效果图: 音乐排行榜: 爬取数据结果图: 像这种简单的爬取就没必要使用Scrapy框架进行处理,实在有点大材小用,不过如果你刚开始学Scrapy的话,拿这些简单的练

某鱼直播数据全站爬取

前言本次爬取使用了代理IP,爬取全站为1个小时,当然也可以不用代理proxy,但是要设置爬取速度 time.sleep(5) 先附上完整代码,下面有详解 import csv from fake_useragent import UserAgent import json from lxml import etree import requests # 代理服务器 proxyHost = "http-dyn.abuyun.com" proxyPort = "9020&quo