MongoDB CRUD 操作,采集部分代码

using System;
using System.Collections.Generic;
using System.ComponentModel.Design;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using CDPWIB.DAL;
using CDPWIB.Data;
using CommonUtility;
using HtmlAgilityPack;
using MongoDB.Driver;
using MongoDB.Driver.Builders;
using MongoDB.Driver.Linq;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using WebKit;

namespace CDPWIB.WebCollection
{
    internal class QiDianCol : INovalCollect
    {
        // Integer value of NovalSource.QiDian; every query and record written by this class
        // uses it as the Source value.
        private int Source = Convert.ToInt32(NovalSource.QiDian);

        // NovalTempBase collection handle obtained from MongoConnectionFactory with the name "Noval".
        // NOTE(review): no method visible in this class reads this field (they each fetch their
        // own handle) - confirm whether it is still needed.
        private readonly MongoCollection<NovalTempBase> Novalcol =
            MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof (NovalTempBase));

        /// <summary>
        /// Downloads the QiDian category scripts, extracts the top-level categories and their
        /// sub-categories, and replaces this source's documents in the NovalTypeTemp and
        /// NovalSubType collections with the freshly parsed data.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the category arrays cannot be located or parsed in the downloaded scripts.
        /// Nothing is removed from either collection in that case.
        /// </exception>
        public void GetNovalTypeTemp()
        {
            var typecol = MongoConnectionFactory.GetMongoCollction<NovalTypeTemp>("Noval", typeof (NovalTypeTemp));
            var subcol = MongoConnectionFactory.GetMongoCollction<NovalSubType>("Noval", typeof (NovalSubType));

            // Top-level categories: http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917
            // Whitespace and a few separator characters are stripped so the regex below can match
            // the array literal as one contiguous run of text.
            string typeshtml =
                NetHelper.HttpGet("http://www.qidian.com/Javascript/qidian.bookstore.js?t=20130917")
                    .Replace("/", "")
                    .Replace("&nbsp", "")
                    .Replace("\r", "")
                    .Replace("\n", "")
                    .Replace("\t", "")
                    .Replace("|", "")
                    .Replace(" ", "");

            // Sub-categories (the original URL carried an accidental trailing space).
            string subtypes =
                NetHelper.HttpGet("http://script.cmfu.com/script/BookStore.js")
                    .Replace("&nbsp", "")
                    .Replace("\r", "")
                    .Replace("\n", "")
                    .Replace("\t", "")
                    .Replace("|", "")
                    .Replace(" ", "");

            Match mtype = Regex.Match(typeshtml, "CategoryArr:(.*?)]]",
                RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline);
            if (!mtype.Success)
            {
                throw new InvalidOperationException("CategoryArr was not found in qidian.bookstore.js.");
            }

            // The lazy match stops right before the closing "]]", so it is appended again.
            string typesstring = mtype.Groups[1].Value + "]]";
            JArray typearr = JsonConvert.DeserializeObject(typesstring) as JArray;
            if (typearr == null)
            {
                throw new InvalidOperationException("CategoryArr could not be parsed as a JSON array.");
            }

            Match msubtype = Regex.Match(subtypes, "SubCategoryArr=(.*?);",
                RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline);
            if (!msubtype.Success)
            {
                throw new InvalidOperationException("SubCategoryArr was not found in BookStore.js.");
            }

            JArray subarr = JsonConvert.DeserializeObject(msubtype.Groups[1].Value) as JArray;
            if (subarr == null)
            {
                throw new InvalidOperationException("SubCategoryArr could not be parsed as a JSON array.");
            }

            // CategoryArr looks like: [["全部", "-1"], ["玄幻", "21"], ["奇幻", "1"], ...]
            // i.e. [name, number]; the "-1" entry stands for "all" and is not a real category.
            List<NovalTypeTemp> lstypes = new List<NovalTypeTemp>(10);
            for (int i = 0; i < typearr.Count; i++)
            {
                if (typearr[i][1].ToString() != "-1")
                {
                    lstypes.Add(new NovalTypeTemp()
                    {
                        WebNum = typearr[i][1].ToString().ToInt(),
                        Name = typearr[i][0].ToString(),
                        Source = Source
                    });
                }
            }

            if (lstypes.Count == 0)
            {
                // Refuse to wipe the stored categories when the download yielded nothing usable.
                throw new InvalidOperationException("No categories were parsed from qidian.bookstore.js.");
            }

            // Sub-category entries are read as [parent number, own number, name]; they are
            // collected grouped by parent, in the parents' order.
            List<NovalSubType> subtypels = new List<NovalSubType>(300);
            foreach (var parentType in lstypes)
            {
                string parentNum = parentType.WebNum.ToString();
                for (int i = 0; i < subarr.Count; i++)
                {
                    var obj = subarr[i];
                    if (obj[0].ToString() == parentNum)
                    {
                        subtypels.Add(new NovalSubType()
                        {
                            Name = obj[2].ToString(),
                            ParentWebNum = parentType.WebNum,
                            WebNum = obj[1].ToString().ToInt(),
                            Source = Source
                        });
                    }
                }
            }

            // All parsing succeeded: only now replace the stored data for this source.
            typecol.Remove(Query<NovalTypeTemp>.EQ(p => p.Source, Source));
            typecol.InsertBatch(lstypes);

            subcol.Remove(Query<NovalSubType>.EQ(p => p.Source, Source));
            if (subtypels.Count > 0)
            {
                // An empty batch has nothing to write, so the call is skipped.
                subcol.InsertBatch(subtypels);
            }
        }

        /// <summary>
        /// Collects novels from pages 1 to 10 of the QiDian click-ranking list and passes the
        /// de-duplicated result to PublicMethod.InsertAndUpdateNovalTmp.
        /// </summary>
        /// <remarks>
        /// Pages whose list element is missing and rows with missing cells are skipped instead of
        /// aborting the whole run.
        /// </remarks>
        public void GetNovals()
        {
            HtmlDocument htmldocc = new HtmlDocument();
            List<NovalTempBase> qdls = new List<NovalTempBase>(500);
            // Book numbers already collected; the same book may show up on more than one page.
            HashSet<int> seenBookNums = new HashSet<int>();

            for (int j = 1; j < 11; j++)
            {
                string sourcehtml =
                    NetHelper.HttpGet("http://top.qidian.com/Book/TopDetail.aspx?TopType&Time=2&PageIndex=" + j);
                htmldocc.LoadHtml(sourcehtml);

                var doc = htmldocc.GetElementbyId("textlist");
                if (doc == null)
                {
                    // List element not present on this page; nothing to read.
                    continue;
                }

                // XPath positions are 1-based. The loop starts at row 2 and reads up to 50 rows.
                for (int i = 2; i < 52; i++)
                {
                    var trdoc = doc.SelectSingleNode("tr[" + i + "]");
                    if (trdoc == null)
                    {
                        // Fewer than 50 rows on this page.
                        break;
                    }

                    var tdtype = trdoc.SelectSingleNode("td[2]/a");
                    var tdbook = trdoc.SelectSingleNode("td[3]/a[1]");
                    var tdclick = trdoc.SelectSingleNode("td[4]");
                    var tdauth = trdoc.SelectSingleNode("td[5]/a");
                    if (tdtype == null || tdbook == null || tdclick == null || tdauth == null)
                    {
                        // Row does not have the expected cells.
                        continue;
                    }

                    // The category link is terminated by a single quote; the previous pattern ended
                    // in a typographic quote character and therefore could never match.
                    Match typematch = Regex.Match(tdtype.OuterHtml, "ChannelId=(\\d*?)&SubCategoryId=(\\d*?)'");
                    Match bookmatck = Regex.Match(tdbook.OuterHtml, "Book/(\\d*?).aspx");
                    Match authmatch = Regex.Match(tdauth.OuterHtml, "id=(\\d*?)\"");

                    int booknum = bookmatck.Groups[1].Value.ToInt();
                    if (!seenBookNums.Add(booknum))
                    {
                        // Already collected from an earlier row or page.
                        continue;
                    }

                    // Cover image, e.g. http://image.cmfu.com/books/3127618/3127618.jpg
                    string titleimg = "http://image.cmfu.com/books/" + booknum + "/" + booknum + ".jpg";

                    qdls.Add(new NovalTempBase()
                    {
                        AuthName = tdauth.InnerText.Trim(),
                        AuthId = authmatch.Groups[1].Value.ToInt(),
                        SubType = typematch.Groups[2].Value.ToInt(),
                        TitleImg = titleimg,
                        Title = tdbook.InnerText.Trim(),
                        TotalClick = tdclick.InnerText.ToInt(),
                        TotalComment = 0,
                        Type = typematch.Groups[1].Value.ToInt(),
                        SourceWebNum = booknum,
                        Source = Source
                    });
                }
            }

            PublicMethod.InsertAndUpdateNovalTmp(qdls, Source);
        }

        //public void GetNovalsByType()
        //{
        //}
        /// <summary>
        /// Collects the chapters (and, where the source has them, volumes) of every stored novel
        /// that belongs to this source by calling GetSingleNovalChapers once per novel.
        /// </summary>
        public void GetNovalChapers()
        {
            MongoCollection<NovalTempBase> collection =
                MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof (NovalTempBase));

            // Materialise the novels of this source first, then process them one by one.
            List<NovalTempBase> storedNovels = collection.AsQueryable().Where(p => p.Source == Source).ToList();
            for (int index = 0; index < storedNovels.Count; index++)
            {
                GetSingleNovalChapers(storedNovels[index].SourceWebNum);
            }
        }

        /// <summary>
        /// Collects the volume and chapter list of one novel from its QiDian reader index page,
        /// replaces the novel's stored volumes and passes the chapters to
        /// PublicMethod.InsertChapterTempToSQL.
        /// </summary>
        /// <param name="novalwebnum">Book number used by the source site.</param>
        /// <remarks>
        /// Best effort: a failure is reported through System.Diagnostics.Trace and the method
        /// returns without throwing, so one broken book does not stop a batch run.
        /// </remarks>
        public void GetSingleNovalChapers(int novalwebnum)
        {
            // Matches the volumes previously stored for this book and this source.
            IMongoQuery q2 = Query<NovalVolumeTemp>.EQ(p => p.Source, Source);
            IMongoQuery q1 = Query<NovalVolumeTemp>.EQ(p => p.NovalWebNum, novalwebnum);
            IMongoQuery query = Query.And(new[] { q1, q2 });

            var volumecol = MongoConnectionFactory.GetMongoCollction<NovalVolumeTemp>("Noval", typeof (NovalVolumeTemp));
            List<NovalChapterTemp> lschapters = new List<NovalChapterTemp>(1000);
            List<NovalVolumeTemp> lsvolumes = new List<NovalVolumeTemp>(10);
            int chapterorder = 1;
            int volumeorder = 1;
            HtmlDocument htmldocc = new HtmlDocument();

            // Index page, e.g. http://read.qidian.com/BookReader/3127618.aspx
            string url = "http://read.qidian.com/BookReader/" + novalwebnum + ".aspx";
            try
            {
                string sourcehtml = NetHelper.HttpGet(url);
                htmldocc.LoadHtml(sourcehtml);
                var doc = htmldocc.GetElementbyId("content");
                if (doc == null)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetSingleNovalChapers: element 'content' not found for book {0} ({1}).", novalwebnum, url);
                    return;
                }

                // The page is read as pairs of sibling divs: div[i] is a volume header and
                // div[i + 1] holds that volume's chapter links.
                int i = 1;
                var topdoc = doc.SelectSingleNode("div[" + i + "]");
                while (topdoc != null)
                {
                    // Volume header link; for VIP chapters this anchor does not exist, in which
                    // case the chapters are stored with VolumeId 0.
                    var topa = topdoc.SelectSingleNode("div/a");
                    int topnum = 0;
                    if (topa != null)
                    {
                        // href="http://www.qidian.com/BookReader/vol,107580,486625.aspx"
                        Match m = Regex.Match(topa.OuterHtml, ",(\\d*?).aspx");
                        topnum = m.Groups[1].Value.ToInt();

                        var topaname = topdoc.SelectSingleNode("div/b");
                        string topname = topaname == null ? string.Empty : topaname.InnerText.Trim();

                        // After dropping "&nbsp" the text is split on ';' and the second part is
                        // taken as the volume name; fall back to the whole text when there is no ';'.
                        string[] nameparts = topname.Replace("&nbsp", "").Split(';');
                        topname = nameparts.Length > 1 ? nameparts[1] : nameparts[0];

                        lsvolumes.Add(new NovalVolumeTemp()
                        {
                            Sort = volumeorder,
                            WebNum = topnum,
                            Name = topname,
                            NovalWebNum = novalwebnum,
                            Source = Source
                        });
                        volumeorder++;
                    }

                    // Chapter links look like:
                    //   <a itemprop='url' href="http://read.qidian.com/BookReader/107580,20901221.aspx" title='...'>
                    //     <span itemprop='headline'>...</span></a>
                    // SelectNodes yields null (not an empty collection) when nothing matches.
                    var contextdoc = doc.SelectSingleNode("div[" + (i + 1) + "]");
                    var chaperas = contextdoc == null ? null : contextdoc.SelectNodes("div/ul/li/a");
                    if (chaperas != null)
                    {
                        for (int x = 0; x < chaperas.Count; x++)
                        {
                            var chapera = chaperas[x];
                            Match chapmatchwebnum = Regex.Match(chapera.OuterHtml, ",(\\d*?).aspx");
                            lschapters.Add(new NovalChapterTemp()
                            {
                                Name = chapera.InnerText.Trim(),
                                Sort = chapterorder,
                                WebNum = chapmatchwebnum.Groups[1].Value.ToInt(),
                                VolumeId = topnum,
                                NovalWebNum = novalwebnum,
                                Source = Source
                            });
                            chapterorder++;
                        }
                    }

                    i += 2;
                    topdoc = doc.SelectSingleNode("div[" + i + "]");
                }

                // Replace the stored volumes of this book, then hand over the chapters.
                volumecol.Remove(query);
                if (lsvolumes.Count > 0)
                {
                    // Books without any volume header produce an empty list; there is nothing to insert.
                    volumecol.InsertBatch(lsvolumes);
                }
                PublicMethod.InsertChapterTempToSQL(lschapters, Source, novalwebnum);
            }
            catch (Exception ex)
            {
                // Deliberately best effort, but no longer silent.
                System.Diagnostics.Trace.TraceError(
                    "GetSingleNovalChapers failed for book {0} ({1}): {2}", novalwebnum, url, ex);
            }
        }

        /// <summary>
        /// Refreshes TotalClick for every stored novel of this source from the novel's QiDian
        /// book page and saves the updated document.
        /// </summary>
        /// <remarks>
        /// Only the click total is updated; no comment count is collected here. A book whose page
        /// lacks the expected node or whose click text is not an integer is skipped (with a trace
        /// warning) so the remaining books are still processed.
        /// </remarks>
        public void GetNovalCilckComment()
        {
            var novalcol = MongoConnectionFactory.GetMongoCollction<NovalTempBase>("Noval", typeof(NovalTempBase));
            var books = novalcol.AsQueryable().Where(p => p.Source == Source).ToList();
            HtmlDocument htmldocc = new HtmlDocument();

            foreach (var novalTempBase in books)
            {
                // Book page, e.g. http://www.qidian.com/Book/3106580.aspx
                string url = "http://www.qidian.com/Book/" + novalTempBase.SourceWebNum + ".aspx";
                string sourcehtml = NetHelper.HttpGet(url);
                htmldocc.LoadHtml(sourcehtml);

                var cliclickdiv = htmldocc.GetElementbyId("contentdiv");
                var clicknode = cliclickdiv == null
                    ? null
                    : cliclickdiv.SelectSingleNode("div/div[1]/table/tr/td[1]");
                if (clicknode == null)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetNovalCilckComment: click-count cell not found for book {0} ({1}).",
                        novalTempBase.SourceWebNum, url);
                    continue;
                }

                // The cell text carries a "总点击" (total clicks) label and a colon around the number.
                string clickcount = clicknode.InnerText.Replace("总点击", "").Replace(":", "").Trim();

                int click;
                if (!int.TryParse(clickcount, out click))
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "GetNovalCilckComment: click count '{0}' is not an integer for book {1}.",
                        clickcount, novalTempBase.SourceWebNum);
                    continue;
                }

                novalTempBase.TotalClick = click;
                novalcol.Save(novalTempBase);
            }
        }

    }
}
时间: 2024-11-08 22:01:50

与"MongoDB CRUD 操作,采集部分代码"相关的文章

jquery操作单选钮代码示例

jquery操作单选钮代码示例:radio单选按钮是最重要的表单元素之一,下面介绍一下常用的几个jquery对radio单选按钮操作.一.取消选中: $(".theclass").each(function(){ if($(this).attr('checked')) { $(this).attr('checked',false); } }); 以上代码可以将class属性值为theclass的被选中单选按钮取消选中.二.获取被选中的单选按钮的值: var val=$('.thecla

ios多线程操作(七)—— GCD延迟操作与一次性代码

使用GCD函数可以进行延时操作,该函数为 dispatch_after(dispatch_time(DISPATCH_TIME_NOW, (int64_t)(delayInSeconds * NSEC_PER_SEC)), dispatch_get_main_queue(), ^{ }); 现在我们来分解一下参数 dispatch_time(DISPATCH_TIME_NOW, (int64_t)(delayInSeconds * NSEC_PER_SEC)) : NSEC_PER_SEC在头文

30 个 php 操作 redis 常用方法代码例子

这篇文章主要介绍了 30 个 php 操作 redis 常用方法代码例子 , 本文其实不止 30 个方法 , 可以操作 string 类型. list 类型和 set 类型的数据 , 需要的朋友可以参考下redis 的操作很多的,以前看到一个比较全的博客,但是现在找不到了.查个东西搜半天,下面整理一下php 处理 redis 的例子,个人觉得常用一些例子.下面的例子都是基于 php-redis 这个扩展的.1 , connect描述:实例连接到一个 Redis.参数: host: string

AD帐户操作C#示例代码(一)——导入用户信息

最近写了一个AD帐户导入的小工具(为啥写作“帐”户呢?),跟大家分享下相关代码,欢迎各位高手指教! 首先,我准备一个这样的Excel文件作为导入模版,并添加了一些测试数据. 然后,我打开Visual Studio 2012,新建一个Windows窗体应用程序.在主窗体界面,我放了一些Label.TextBox.Button控件,还有一个ProgressBar. 开始写代码.首先写从Excel里读取数据的方法. private static async Task<DataTable> GetTa

无法执行添加/移除操作,因为代码元素 是只读的

刚刚学习用MFC编写嵌入式软件,各种问题接踵而来啊,在资源选项卡里面新建一个dialog后拖进去一个button按钮,想要添加这个空间的时间相应却怎么也不成功.会出现 提示框 “无法执行添加/移除操作,因为代码元素**是只读的”.根据提示去查看对应的.cpp和.h文件,发现并没有只读属性,没办法,求助于网络,发现这个问题还是挺普遍的,参考这篇文章后,保存现有工程后,在目录里面删掉.ncb文件后重新打开解决方案,问题解决. 另外还碰到一个情况,就是在属性栏点击“控件事件”后列表为空,不应该啊,比对

AD帐户操作C#示例代码(二)——检查密码将过期的用户

本文接着和大家分享AD帐户操作,这次开发一个简单的检查密码将过期用户的小工具. 首先,新建一个用户实体类,属性是我们要取的用户信息. public class UserInfo { /// <summary> /// sAM帐户名称 /// </summary> public string SamAccountName { get; set; } /// <summary> /// 名称 /// </summary> public string Name {

Linux下互斥量加锁与解锁操作的C代码实现

一.概述 在实际的软件程序中,由于代码量较大,函数之间的调用关系较为复杂,因此对于某些全局变量的操作要格外小心.在程序中,一般采用互斥量加锁的方式来保证对全局变量的操作的唯一性. 本文详细介绍了Linux下互斥量加锁与解锁操作的C代码实现,为相关的软件开发工作的开展提供了有益的参考. 二.加锁与解锁函数及时间结构体介绍 1.加锁函数pthread_mutex_timedlock 函数原型:int pthread_mutex_timedlock(pthread_mutex_t *restrict

Installshield停止操作系统进程的代码--IS5版本适用

原文:Installshield停止操作系统进程的代码--IS5版本适用 出处:http://www.installsite.org/pages/en/isp_ext.htm这个地址上有不少好东西,有空要好好研究下里面的“List and Shut Down Running Applications”就是演示了Installshield如何停止操作系统进程 Code/*****************************************************************

Installshield停止操作系统进程的代码 --IS6及以上版本适用

原文:Installshield停止操作系统进程的代码 --IS6及以上版本适用 setup.rul的代码 Code //////////////////////////////////////////////////////////////////////////////////                                                                            //  IIIIIII SSSSSS               

c#/ASP.NET操作cookie(读写)代码示例

Cookie是存在硬盘上,IE存cookie的地方和Firefox存cookie的地方不一样.不同的操作系统也可能存cookie的地方不一样. 不同的浏览器会在各自的独立空间存放Cookie, 互不干涉以我的windows7, IE8为例,  cookie存在这: C:\Users\xiaoj\AppData\Local\Microsoft\Windows\Temporary Internet Files 注意: 缓存文件和cookie文件,是存在一起的, 都在这个目录下. 你也可以这样找, 打