需要引用:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using Fizzler;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
/// <summary>
/// Helpers that load HTML (from a URL or from a string) into an HtmlAgilityPack
/// document and select nodes from it with a CSS selector via Fizzler.
/// </summary>
public class FizzlerHelper
{
    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns the nodes that match
    /// <paramref name="cssLoad"/>.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="cssLoad">CSS selector evaluated against the downloaded document.</param>
    /// <returns>
    /// The matching nodes, or an empty sequence when the download failed
    /// (<see cref="HttpGet{T}"/> reports failure by returning <c>null</c>).
    /// </returns>
    public static IEnumerable<HtmlNode> GetUrlInfo(string url, string cssLoad)
    {
        string html = HttpGet<string>(url);
        return SelectNodes(html, cssLoad);
    }

    /// <summary>
    /// Parses <paramref name="html"/> and returns the nodes that match <paramref name="cssLoad"/>.
    /// </summary>
    /// <param name="html">HTML markup to parse.</param>
    /// <param name="cssLoad">CSS selector evaluated against the parsed markup.</param>
    /// <returns>The matching nodes, or an empty sequence when <paramref name="html"/> is <c>null</c>.</returns>
    public static IEnumerable<HtmlNode> GetHtmlInfo(string html, string cssLoad)
    {
        return SelectNodes(html, cssLoad);
    }

    /// <summary>
    /// Shared implementation: builds the document with the options both public entry
    /// points use, loads the markup and runs the selector.
    /// </summary>
    private static IEnumerable<HtmlNode> SelectNodes(string html, string cssLoad)
    {
        // LoadHtml rejects null, and a failed download surfaces here as null,
        // so treat "no markup" as "no matches" instead of throwing.
        if (html == null)
        {
            return Enumerable.Empty<HtmlNode>();
        }

        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument
        {
            OptionAddDebuggingAttributes = false,
            OptionAutoCloseOnEnd = true,
            OptionFixNestedTags = true,
            OptionReadEncoding = true
        };
        htmlDoc.LoadHtml(html);

        return htmlDoc.DocumentNode.QuerySelectorAll(cssLoad);
    }

    #region GET request

    /// <summary>
    /// Issues an HTTP GET, reads the response body as UTF-8 text and converts it to
    /// <typeparamref name="T"/> with <see cref="Convert.ChangeType(object, Type)"/>.
    /// </summary>
    /// <typeparam name="T">Type the response text is converted to.</typeparam>
    /// <param name="url">Address to request.</param>
    /// <returns>
    /// The converted response body, or <c>default(T)</c> when the request, the read or
    /// the conversion fails. Failures are written to the trace listeners rather than thrown.
    /// </returns>
    public static T HttpGet<T>(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";

            string body;
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
            {
                body = reader.ReadToEnd();
            }

            return (T)Convert.ChangeType(body, typeof(T));
        }
        catch (Exception ex)
        {
            // Best-effort contract kept for existing callers: never throw, return default(T).
            // The failure is traced so it is no longer completely silent.
            System.Diagnostics.Trace.TraceWarning("HttpGet failed for {0}: {1}", url, ex.Message);
            return default(T);
        }
    }

    #endregion
}
下面的示例实现对透明售房网的数据抓取。Fizzler 主要通过 CSS 选择器(标签名、样式类等)在 HTML 中定位并获取数据,从而避免编写复杂的正则表达式。
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Services;
using System.Web.UI;
using System.Web.UI.WebControls;
using DataCollectionCommon;
using DataCollectionDAL;
using DataCollectionModel;
using Fizzler;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
using Newtonsoft.Json;

namespace DataCollectionDemo
{
    /// <summary>
    /// Demo page that scrapes a table from a remote page and re-renders it as a plain HTML table.
    /// </summary>
    public partial class WebForm1 : System.Web.UI.Page
    {
        public static FizzlerHelper fizzlerHelper = new FizzlerHelper();

        /// <summary>HTML produced by the scrape on first page load.</summary>
        public string resultHtml = string.Empty;

        /// <summary>
        /// Matches the empty <c>span</c> elements found inside a cell and captures each
        /// span's class attribute in the group named "url".
        /// </summary>
        private static readonly Regex SpanClassRegex =
            new Regex(@"<span class=""(?<url>.+?)""></span>", RegexOptions.Singleline);

        /// <summary>Maps a span class name to the character it stands for.</summary>
        private static readonly Dictionary<string, char> CharBySpanClass =
            new Dictionary<string, char>(StringComparer.Ordinal)
            {
                { "numbdor", '.' },
                { "numbzero", '0' },
                { "numbone", '1' },
                { "numbtwo", '2' },
                { "numbthree", '3' },
                { "numbfour", '4' },
                { "numbfive", '5' },
                { "numbsix", '6' },
                { "numbseven", '7' },
                { "numbeight", '8' },
                { "numbnine", '9' }
            };

        /// <summary>
        /// Runs the scrape once, on the initial (non-postback) request.
        /// </summary>
        protected void Page_Load(object sender, EventArgs e)
        {
            if (!IsPostBack)
            {
                // Hangzhou
                resultHtml = StartDataCollection("http://www.tmsf.com/daily.htm");
            }
        }

        /// <summary>
        /// Starts the data collection: fetches the page, reads the rows selected by
        /// "div.datanowin table tr" and rebuilds them as an HTML table.
        /// </summary>
        /// <param name="url">Address of the page to scrape.</param>
        /// <returns>An HTML <c>table</c> with a fixed header row followed by one row per scraped row.</returns>
        [WebMethod]
        public static string StartDataCollection(string url)
        {
            // NOTE(review): this is exposed as a [WebMethod] and fetches whatever URL it is
            // given; consider restricting it to known hosts if it is reachable by clients.
            StringBuilder table = new StringBuilder();
            table.Append("<table width=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"0\">");
            table.Append("<tr><td>楼盘名称</td><td>城区</td><td>签约套数</td><td>预定套数</td><td>签约面积</td><td>签约均价</td></tr>");

            List<HtmlNode> rows = FizzlerHelper.GetUrlInfo(url, "div.datanowin table tr").ToList();
            if (rows.Count > 0)
            {
                // Drop the source table's own header row.
                rows.RemoveAt(0);
            }

            foreach (HtmlNode row in rows)
            {
                // Re-parse the row's inner markup to pick out its cells.
                List<HtmlNode> cells = FizzlerHelper.GetHtmlInfo(row.InnerHtml, "td").ToList();

                table.Append("<tr>");
                foreach (HtmlNode cell in cells)
                {
                    // Characters encoded as class-named empty spans are decoded first,
                    // then the cell's plain text is appended after them.
                    MatchCollection spans = SpanClassRegex.Matches(cell.InnerHtml);
                    string decoded = GetValueBySpanClass(spans);
                    table.AppendFormat("<td>{0}{1}</td>", decoded, cell.InnerText);
                }
                table.Append("</tr>");
            }

            table.Append("</table>");
            return table.ToString();
        }

        /// <summary>
        /// Decodes a sequence of span class names into the characters they represent.
        /// </summary>
        /// <param name="mcc_span">Matches whose "url" group holds a span class name.</param>
        /// <returns>The decoded characters in match order; unrecognised class names are skipped.</returns>
        private static string GetValueBySpanClass(MatchCollection mcc_span)
        {
            StringBuilder decoded = new StringBuilder(mcc_span.Count);
            foreach (Match match in mcc_span)
            {
                char ch;
                if (CharBySpanClass.TryGetValue(match.Groups["url"].Value, out ch))
                {
                    decoded.Append(ch);
                }
            }
            return decoded.ToString();
        }
    }
}
时间: 2024-11-25 10:56:31