CSDN爬虫

仅做技术交流。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Text.RegularExpressions;
using DotNet.Utilities;
using System.Xml;
using System.Net;

namespace CSDNEpt
{
    /// <summary>
    /// Main window of the exporter. Clicking <c>button1</c> downloads the blog whose
    /// address is typed into <c>textBox1</c>, walks every article category found on the
    /// page, and saves each article as an XML file in an "article" folder next to the
    /// executable. Progress messages are pushed to the top of <c>listBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        // Page size used to turn a category's article count into a number of listing pages.
        private const int ArticlesPerPage = 20;

        public Form1()
        {
            InitializeComponent();
        }

        // Whole <div id="panel_Category" class="panel"> ... </div> block of the blog page.
        public const string Category = "<div\\s*id=\"panel_Category\"\\s*class=\"panel\">[\\w\\W]*?</div>";
        // Digits that precede `" onclick=` inside an <li><a ...> entry of the category block.
        public const string CategoryId = "(?<=<li(.*)\\s*<a(.*))\\d+(?=\"\\sonclick=(.*)\\s*</li>)";
        // Link text of an <li><a ...>text</a><span> entry of the category block.
        public const string CategoryName="(?<=<li(.*)\\s*<a(.*)\">)(.*)(?=</a><span>(.*)\\s*</li>)";
        // Digits between the parentheses of the <span>(n)</span> that follows a category link.
        public const string ArticleCount = "(?<=<li(.*)\\s*<a(.*)</a><span>\\()\\d*(?=\\)</span>\\s*</li>)";
        // Link text inside <div class="article_title">.
        public const string ArticleName = "(?<=<div\\s*class=\"article_title\">[\\w\\W]*\">\\s*)(.*)(?=\\s*</a></span>\\s*(.*)\\s*</div>)";
        // Text of <span class="link_postdate">.
        public const string PostDate = "(?<=<span\\s*class=\"link_postdate\">)(.*)(?=</span>)";
        // Text of <span class="link_view" title="阅读次数"> up to the "人阅读" suffix.
        public const string ReadCount = "(?<=<span\\s*class=\"link_view\"\\s*title=\"阅读次数\">)(.*)(?=人阅读</span>)";
        // Content of <div id="article_content" class="article_content"> up to the first </div>.
        public const string ArticleContent = "(?<=<div\\s*id=\"article_content\"\\s*class=\"article_content\">)[\\w\\W]*?(?=</div>)";
        // Digits after "details/" in the link inside <span class="link_title">.
        public const string ArticleId = "(?<=<span\\s*class=\"link_title\">(.*)details/)\\d*(?=\">[\\w\\W]*?</a></span>)";
        // Remainder of the class attribute of <span class="ico ...">; tells whether the post is original.
        public const string IsOriginal="(?<=<span\\s*class=\"ico\\s*)(.*)(?=\"></span>)";

        /// <summary>
        /// Returns the text of every match of <paramref name="regexStr"/> in
        /// <paramref name="matchStr"/>, in order of appearance (case-insensitive).
        /// </summary>
        /// <param name="regexStr">Regular expression pattern.</param>
        /// <param name="matchStr">Text to search; <c>null</c> yields an empty list.</param>
        /// <returns>A new list with one entry per match; empty when nothing matches.</returns>
        public List<string> MatchStr(string regexStr,string matchStr)
        {
            List<string> lt = new List<string>();
            if (matchStr == null)
            {
                // A failed download may hand us no text at all; treat it as "no matches".
                return lt;
            }

            Regex reg = new Regex(regexStr, RegexOptions.IgnoreCase);
            for (Match m = reg.Match(matchStr); m.Success; m = m.NextMatch())
            {
                lt.Add(m.Value);
            }
            return lt;
        }

        /// <summary>
        /// Returns the first match of <paramref name="regexStr"/> in <paramref name="input"/>,
        /// or <c>null</c> when the input is <c>null</c> or nothing matches.
        /// </summary>
        private static string FirstMatch(string regexStr, string input)
        {
            if (input == null)
            {
                return null;
            }
            Match m = Regex.Match(input, regexStr, RegexOptions.IgnoreCase);
            return m.Success ? m.Value : null;
        }

        /// <summary>
        /// Same as <see cref="FirstMatch"/> but trimmed, and an empty string instead of <c>null</c>.
        /// </summary>
        private static string FirstMatchTrimmed(string regexStr, string input)
        {
            string value = FirstMatch(regexStr, input);
            return value == null ? "" : value.Trim();
        }

        /// <summary>
        /// Replaces every character that is not allowed in a file name with '_'.
        /// Returns <paramref name="fallback"/> when nothing usable is left.
        /// </summary>
        private static string ToSafeFileName(string name, string fallback)
        {
            char[] invalid = Path.GetInvalidFileNameChars();
            StringBuilder sb = new StringBuilder(name.Length);
            foreach (char c in name)
            {
                sb.Append(Array.IndexOf(invalid, c) >= 0 ? '_' : c);
            }
            string safe = sb.ToString().Trim();
            return safe.Length == 0 ? fallback : safe;
        }

        /// <summary>Creates <paramref name="elementName"/> with the given text and appends it to <paramref name="parent"/>.</summary>
        private static void AppendTextElement(XmlDocument xml, XmlElement parent, string elementName, string text)
        {
            XmlElement child = xml.CreateElement(elementName);
            child.InnerText = text;
            parent.AppendChild(child);
        }

        /// <summary>
        /// Downloads one article page, extracts its fields and writes them to
        /// "&lt;outputDir&gt;\&lt;title&gt;.xml".
        /// </summary>
        /// <returns><c>false</c> when the title could not be found in the page (nothing is written).</returns>
        private bool TryExportArticle(HttpHelper http, HttpItem item, string articleUrl, string articleId, string outputDir)
        {
            item.URL = articleUrl;
            string articleHtml = http.GetHtml(item).Html;

            string rawName = FirstMatch(ArticleName, articleHtml);
            if (rawName == null)
            {
                // Without a title the page is most likely an error page or has an unknown layout.
                listBox1.Items.Insert(0, "【" + articleUrl + "】文章解析失败,已跳过!");
                return false;
            }

            string articleName_txt = rawName.Trim().Replace("\r", "");
            string postDate_txt = FirstMatchTrimmed(PostDate, articleHtml);
            string readCount_txt = FirstMatchTrimmed(ReadCount, articleHtml);
            string articleContent_txt = FirstMatch(ArticleContent, articleHtml) ?? "";
            string isOriginal_txt = FirstMatchTrimmed(IsOriginal, articleHtml);

            listBox1.Items.Insert(0, "正在抓取【" + articleName_txt + "】文章...");

            // Build the XML document that stores the article.
            XmlDocument xml = new XmlDocument();
            XmlDeclaration xmldecl = xml.CreateXmlDeclaration("1.0", "gb2312", null);
            XmlElement root = xml.CreateElement("Article");
            AppendTextElement(xml, root, "Name", articleName_txt);
            AppendTextElement(xml, root, "URL", articleUrl);
            AppendTextElement(xml, root, "IsOriginal", isOriginal_txt == "ico_type_Original" ? "Y" : "N");
            AppendTextElement(xml, root, "PostDate", postDate_txt);
            AppendTextElement(xml, root, "ReadCount", readCount_txt);
            AppendTextElement(xml, root, "ArticleContent", articleContent_txt);
            xml.AppendChild(xmldecl);
            xml.AppendChild(root);

            // Titles routinely contain characters such as '/', ':' or '?' that are illegal in file names.
            string fileName = ToSafeFileName(articleName_txt, "article_" + articleId) + ".xml";
            xml.Save(Path.Combine(outputDir, fileName));

            listBox1.Items.Insert(0, "【" + articleName_txt + "】文章抓取成功!");
            return true;
        }

        /// <summary>
        /// Click handler: exports every article of every category of the blog at the typed address.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string startUrl = textBox1.Text.Trim();
            if (startUrl.Length == 0)
            {
                listBox1.Items.Insert(0, "请先输入博客地址!");
                return;
            }
            // Sub-URLs are built as base + "/article/...", so a trailing slash must not be doubled.
            string baseUrl = startUrl.TrimEnd('/');

            string path = Path.Combine(Application.StartupPath, "article");
            Directory.CreateDirectory(path); // no-op when the folder already exists

            HttpHelper http = new HttpHelper();
            HttpItem item = new HttpItem();
            item.URL = startUrl;
            item.Referer = startUrl;
            item.ProxyIp = "ieproxy";
            item.Encoding = Encoding.GetEncoding("utf-8");
            string html = http.GetHtml(item).Html;

            List<string> categoryBlocks = MatchStr(Category, html);
            if (categoryBlocks.Count == 0)
            {
                listBox1.Items.Insert(0, "未找到文章分类,请检查博客地址!");
                return;
            }
            string categoryHtml = categoryBlocks[0];
            List<string> lt_CategoryId = MatchStr(CategoryId, categoryHtml);
            List<string> lt_CategoryName = MatchStr(CategoryName, categoryHtml);
            List<string> lt_ArticleCount = MatchStr(ArticleCount, categoryHtml);
            List<string> lt_ArticleId = new List<string>();

            // Walk every category and collect the articles listed under it.
            for (int i = 0; i < lt_CategoryId.Count; i++)
            {
                // The three lists come from independent patterns and may differ in length.
                string categoryName = i < lt_CategoryName.Count ? lt_CategoryName[i] : lt_CategoryId[i];
                listBox1.Items.Insert(0, "正在获取【" + categoryName + "】分类...");

                // A missing or non-numeric count leaves 0, i.e. no listing pages are requested.
                int count = 0;
                if (i < lt_ArticleCount.Count)
                {
                    int.TryParse(lt_ArticleCount[i], out count);
                }
                int page = (count + ArticlesPerPage - 1) / ArticlesPerPage;

                lt_ArticleId.Clear();
                for (int k = 1; k <= page; k++)
                {
                    item.URL = baseUrl + "/article/category/" + lt_CategoryId[i] + "/" + k;
                    string pageHtml = http.GetHtml(item).Html;
                    lt_ArticleId.AddRange(MatchStr(ArticleId, pageHtml));
                }

                foreach (string articleId in lt_ArticleId)
                {
                    string articleUrl = baseUrl + "/article/details/" + articleId;
                    TryExportArticle(http, item, articleUrl, articleId, path);
                    // Everything runs on the UI thread; pump messages so the list box repaints.
                    Application.DoEvents();
                }
                listBox1.Items.Insert(0, "【" + categoryName + "】分类获取完毕!");
            }
        }

    }
}

时间： 2024-10-28 01:07:15

CSDN爬虫的相关文章

Python爬虫Csdn系列I

Python爬虫Csdn系列I By 白熊花田(http://blog.csdn.net/whiterbear) 说明: 我会在这个系列介绍如何利用python写一个csdn爬虫,并将给定的Csdn用户的博客的所有文章保存起来.嗯,实用性貌似不是很大,写着玩,这个系列后,会有更好玩的更高级的爬虫出现. 原因: 本来想学cookie的,后来发现爬取csdn的文章伪装成浏览器去访问就行了. 本次目标: 爬取csdn某用户的文章列表.这里以我的blog为例,仅仅打开第一列文章列表,不做任何分析,只是验

Python3 爬虫（八） -- BeautifulSoup之再次爬取CSDN博文

序 我的Python3爬虫(五)博文使用urllib基本函数以及正则表达式技术实现了爬取csdn全部博文信息的任务. 链接:Python3 爬虫(五) -- 单线程爬取我的CSDN全部博文 上一篇,我们学习了BeautifulSoup这样一个优秀的Python库,必须有效利用起来.那么我们就利用BeautifulSoup4重新实现一次爬取csdn博文的任务. 由于我修改了博客配置,首页主题换了一下,我们基于新的主题查看网页,如下图所示: 同样的,确认要提取的信息,以及博文总页数. 分析网页源码

从零起步系统入门Python爬虫工程师

课程目录及大纲: 第1章从零开始系统入门python爬虫工程师-课程导学获取课程资料链接:点击这里获取这是一门专门为爬虫初学者打造的教程,从零起步的系统化教程,课程内容从理论到实践,一层一层深入讲解,尤其是课程实战环节:一步一步带你进行多场景项目实践 ,让你能够举一反三从容面对以后的数据抓取问题,最后关于就业部分,重点,难点,针对性讲解,轻松应对面试,最终达到就业水准.... 1-1 从零开始系统入门python爬虫工程师-课程导学试看第2章彻底解决让人头疼的环境搭建问题视频教程

Python3爬虫实战：实战源码+博客讲解

Python Spider 贵有恒,何必三更起五更睡:最无益,只怕一日暴十寒. Python3爬虫实战:实战源码+博客讲解个人网站 CSDN博客 CSDN爬虫专栏学习交流群[328127489] 声明代码.教程仅限于学习交流,请勿用于任何商业用途! 文章首发声明文章在自己的个人网站首发,其他平台文章均属转发,如想获得最新更新进展,欢迎关注我的个人网站:http://cuijiahua.com/ 目录爬虫小工具文件下载小助手爬虫实战笔趣看小说下载百度文库免费文章下载助手_rev1

python爬虫爬取csdn博客专家所有博客内容

Hello Python!用python写一个抓取CSDN博客文章的简单爬虫

网络上一提到python,总会有一些不知道是黑还是粉的人大喊着:python是世界上最好的语言.最近利用业余时间体验了下python语言,并写了个爬虫爬取我csdn上关注的几个大神的博客,然后利用leancloud一站式后端云服务器存储数据,再写了一个android app展示数据,也算小试了一下这门语言,给我的感觉就是,像python这类弱类型的动态语言相比于java来说,开发者不需要分太多心去考虑编程问题,能够把精力集中于业务上,思考逻辑的实现.下面分享一下我此次写爬虫的一些小经验,抛砖引玉

[python爬虫] Selenium爬取CSDN博客摘要及问题

本文主要是采用Selenium来爬取CSDN的博文摘要,为后面对CSDN的热点技术.近几年专家发表的博客作数据分析.由于采用BeautifulSoup爬取该网站会报错"HTTPError: Forbidden",所以作者采用Selenium爬取.同时,在爬取过程中遇到了局部动态更新的问题,无法定位换页的问题,作者采用Firebug进行分析,也希望读者提出更好的方法.代码下载地址: 一. CSDN博客网站分析及问题本文主要爬取CSDN专家的博客,因为专家的论文水平相对高点,同时专栏较多

CSDN Android客户端开发(二):详解如何基于Java用Jsoup爬虫HTML数据

本文参考链接详细介绍如何使用Jsoup包抓取HTML数据,是一个纯java工程,并将其打包成jar包.希望了解如何用java语言爬取网页的可以看下. 杂家前文就有介绍用HTTP访问百度主页得到html的string字符串,但html的文本数据如果不经过处理就是个文本字符串没有任何效果的.所谓的浏览器就是负责将文本的html"翻译"成看到的界面.在前文有介绍,这个csdn的客户端app分首页.业界.移动.研发.程序员.云计算五大类.以业界为例,http://news.csdn.net/

python爬虫CSDN文章抓取

CSDN原则上不让非人浏览访问,正常爬虫无法从这里爬取文章,需要进行模拟人为浏览器访问. 使用:输入带文章的CSDN链接自动生成正文的HTML,文件名为标题名 #!/usr/bin/env python # coding=utf-8 ######################################### #> File Name: CSDN_article.py #> Author: nealgavin #> Mail: [email protected] #> Cre