【学习】爬糗事百科，可自动翻页。

  1 namespace HTML
  2 {
  3     class Program
  4     {
  5         const string qsbkMainUrl = "http://www.qiushibaike.com";
  6
  7         private static string GetWBJokeUrl(int pageIndex)
  8         {
  9
 10             StringBuilder url = new StringBuilder();
 11
 12             url.Append(qsbkMainUrl);
 13
 14             url.Append("/textnew/page/");
 15
 16             url.Append(pageIndex.ToString());
 17
 18             url.Append("/?s=4869039");
 19
 20             return url.ToString();
 21
 22         }
 23
 24         //根据网页的url获取网页的html源码
 25
 26
 27         private static string GetUrlContent(string url)
 28         {
 29             try
 30             {
 31
 32                 HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
 33
 34                 request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36";
 35
 36                 request.Method = "GET";
 37
 38                 request.ContentType = "text/html;charset=UTF-8";
 39
 40                 HttpWebResponse response = (HttpWebResponse)request.GetResponse();
 41
 42                 Stream myResponseStream = response.GetResponseStream();
 43
 44                 StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("utf-8"));//因为知道糗百网页的编码方式为utf-8
 45
 46                 string retString = myStreamReader.ReadToEnd();
 47
 48                 myStreamReader.Close();
 49
 50                 myResponseStream.Close();
 51
 52                 return retString;
 53
 54             }
 55
 56             catch { return null; }
 57
 58         }
 59         static void Main(string[] args)
 60         {
 61             System.Threading.Timer threadTimer = new System.Threading.Timer(new System.Threading.TimerCallback(Method3), null, 0, 5000);
 62             while (true)
 63             {
 64                 Thread.Sleep(1000);
 65             }
 66         }
 67
 68        static int first = 1;
 69        static int curNum=1;
 70        static void Method3(Object state)
 71         {
 72             List<JokeItem> a = GetJokeList(first == 1 ? curNum : first);
 73             int i = 1;
 74             Console.Clear();
 75             foreach (JokeItem item in a)
 76             {
 77                 Console.WriteLine("笑话" + i + ":" + item.JokeContent + "\n");
 78                 i++;
 79             }
 80             curNum++;
 81         }
 82         public class JokeItem
 83         {
 84
 85             private string nickName;
 86
 87             /// <summary>
 88
 89             /// 昵称
 90
 91             /// </summary>
 92
 93             public string NickName
 94             {
 95
 96                 get { return nickName; }
 97
 98                 set { nickName = value; }
 99
100             }
101
102
103
104             private Image headImage;
105
106             /// <summary>
107
108             /// 头像
109
110             /// </summary>
111
112             public Image HeadImage
113             {
114
115                 get { return headImage; }
116
117                 set { headImage = value; }
118
119             }
120
121             private string jokeContent;
122
123             /// <summary>
124
125             /// 笑话内容
126
127             /// </summary>
128
129             public string JokeContent
130             {
131
132                 get { return jokeContent; }
133
134                 set { jokeContent = value; }
135
136             }
137
138
139
140             private string jokeUrl;
141
142             /// <summary>
143
144             /// 笑话地址
145
146             /// </summary>
147
148             public string JokeUrl
149             {
150
151                 get { return jokeUrl; }
152
153                 set { jokeUrl = value; }
154
155             }
156
157         }
158
159         /// <summary>
160
161         /// 获取笑话列表
162
163         /// </summary>
164
165         /// <param name="htmlContent"></param>
166
167         public static List<JokeItem> GetJokeList(int pageIndex)
168         {
169
170             string htmlContent = GetUrlContent(GetWBJokeUrl(pageIndex));
171             List<JokeItem> jokeList = new List<JokeItem>();
172             Regex rg = new Regex("<div class=\"content\">\\s*((.*|<br/>)*)", RegexOptions.IgnoreCase);
173
174             JokeItem joke;
175
176             MatchCollection matchResults = rg.Matches(htmlContent);
177
178
179
180             foreach (Match result in matchResults)
181             {
182                 joke = new JokeItem();
183                 joke.JokeContent = result.Groups[0].Value.Replace("<div class=\"content\">", "").Replace("</span>", "").Replace("<span>", "").Replace("<br/>","");
184                 joke.JokeContent = Regex.Replace(joke.JokeContent, @"(\r\n)+|(\r)+", "");//去掉多余的空行
185                 joke.JokeContent = Regex.Replace(joke.JokeContent, @"(\n)+", "\n");
186                 jokeList.Add(joke);
187             }
188
189             return jokeList;
190
191         }
192
193
194
195
196     }
197 }

控制台代码

原文地址：https://www.cnblogs.com/Zhengxue/p/8777981.html

时间： 2024-10-09 20:33:21

【学习】爬糗事百科，可自动翻页。的相关文章

第一个爬虫：爬糗事百科笑话

前排提示: Python 3.5; 没有分布式队列,没有查重,没有Scrapy-Redis框架,没有效率。参考资料(前排拜谢): 网友静觅、CSDN专栏、Jecvay Notes、知乎大神,言简意赅。第一步: 能爬就行 import urllib import urllib.request url = "http://news.dbanotes.net" html = urllib.request.urlopen(url) data = html.read().decode('UTF-8'

爬取糗事百科1到5页的图片并下载到本地

思路如下: 首先找到图片的节点 <div class="thumb"> <a href="/article/121672165" target="_blank"> <img src="//pic.qiushibaike.com/system/pictures/12167/121672165/medium/NTDNQY3EJKUSRZ2X.jpg" alt="糗事#121672165&qu

Python爬虫实战（一）：爬糗事百科段子

代码: # _*_ coding:utf-8 _*_ import urllib2 import re from datetime import datetime class QSBK: def __init__(self): self.pageIndex = 1 self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' self.headers = {'User-Agent':self.user_agent} self

python简单爬虫-----爬糗事百科段子

#-*-coding:utf-8 -*- import urllib2 import sys import re reload(sys) sys.setdefaultencoding('utf-8') url='http://www.qiushibaike.com/hot/page/1/' header={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64)'} try: page=1 while True:

python爬糗事百科段子

#!/usr/bin/env python # coding: UTF-8 # -*- coding: utf-8 -*- import requests from bs4 import BeautifulSoup #import pandas newurl='http://www.qiushibaike.com/text/page/1/' headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.

Android实战——jsoup实现网络爬虫，糗事百科项目的起步

Android实战--jsoup实现网络爬虫,爬糗事百科主界面。本篇文章包括以下内容: 前言、jsoup的简介、jsoup的配置、jsoup的使用、结语。前言: 对于Android初学者想要做项目时,最大的烦恼是什么?毫无疑问是数据源的缺乏,当然可以选择第三方接口提供数据,也可以使用网络爬虫获取数据,这样就不用第三方数据作为支持。本来是打算爬一些购物网站的数据,由于他们的反爬做得好,所以没办法爬到数据,只能爬取糗事百科的数据,或许聪明的你会想到可以高仿个糗事百科作为自己的练手项目,利用jsoup是

python 糗事百科实例

爬取糗事百科段子,假设页面的URL是 http://www.qiushibaike.com/8hr/page/1 要求: 使用requests获取页面信息,用XPath / re 做数据提取获取每个帖子里的用户头像链接.用户姓名.段子内容.点赞次数和评论次数保存到 json 文件内参考代码 #qiushibaike.py #import urllib #import re #import chardet import requests from lxml import etree page

Python爬虫(十七)_糗事百科案例

糗事百科实例爬取糗事百科段子,假设页面的URL是: http://www.qiushibaike.com/8hr/page/1 要求: 使用requests获取页面信息,用XPath/re做数据提取获取每个帖子里的用户头像连接.用户姓名.段子内容.点赞次数和评论次数保存到json文件内参考代码 #-*- coding:utf-8 -*- import requests from lxml import etree page = 1 url = 'http://www.qiushibaik

糗事百科的段子

#!/usr/bin/env python # -*- coding: utf-8 -*- '''1. 抓取糗事百科热门段子 2. 过滤带有图片的段子 3. 实现每按一次回车显示一个段子的发布时间,发布人,段子内容,点赞数.''' import urllib2 import re class QSBK(object): def __init__(self): self.pageIndex = 1 self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Win