/// <summary>
/// Entry point: downloads one fang.com listing page, extracts the title,
/// specification and address of every listing on it, and stores each listing
/// as one database row via <see cref="Add"/>.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
public static void Main(string[] args)
{
    string url = "https://ly.esf.fang.com/house-a010204-b012374/"; // page to crawl
    string data = GetWebContent(url);
    if (string.IsNullOrEmpty(data))
    {
        // GetWebContent returns "" when the download fails; there is nothing to parse.
        return;
    }

    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(data);

    // One <dl id="..."> element per listing inside the result-list container.
    // XPath primer: https://www.cnblogs.com/GmrBrian/p/6201237.html
    HtmlNodeCollection listings = htmlDoc.DocumentNode.SelectNodes("//div[@class='shop_list shop_list_4']/dl[@id]");
    if (listings == null)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        return;
    }

    // Read all three fields from the SAME listing node so that a listing with a
    // missing field cannot shift the values of every listing that follows it.
    foreach (var listing in listings)
    {
        // The leading "." keeps each query relative to the current listing;
        // a bare "//" would search the whole document again.
        var titleNode = listing.SelectSingleNode(".//dd/h4[@class='clearfix']/a/span");
        if (titleNode == null)
        {
            // Without a title the row would be meaningless; skip this listing.
            continue;
        }

        var specNode = listing.SelectSingleNode(".//dd/p[@class='tel_shop']");
        var addressNode = listing.SelectSingleNode(".//dd/p[@class='add_shop']/span");

        string title = titleNode.InnerText;
        string spec = specNode == null ? "" : specNode.InnerText.Trim();
        string address = addressNode == null ? "" : addressNode.InnerText;

        Add(title, spec, address);
    }
}

/// <summary>
/// Inserts one row into the <c>CountPC</c> table using the connection string
/// stored under the <c>constring</c> app setting.
/// </summary>
/// <param name="da1">Value for the first column (listing title).</param>
/// <param name="da2">Value for the second column (listing specification).</param>
/// <param name="da3">Value for the third column (listing address).</param>
public static void Add(string da1, string da2, string da3)
{
    // The values are scraped page text, i.e. untrusted input: pass them as
    // parameters instead of concatenating them into the SQL statement.
    using (SqlConnection conn = new SqlConnection(ConfigurationManager.AppSettings["constring"]))
    using (SqlCommand cmd = new SqlCommand("insert into CountPC values(@da1, @da2, @da3)", conn))
    {
        // null is stored as an empty string, matching what string concatenation
        // produced in the previous implementation.
        cmd.Parameters.AddWithValue("@da1", da1 ?? "");
        cmd.Parameters.AddWithValue("@da2", da2 ?? "");
        cmd.Parameters.AddWithValue("@da3", da3 ?? "");

        conn.Open();
        cmd.ExecuteNonQuery();
    } // disposing closes the connection even when ExecuteNonQuery throws
}

/// <summary>
/// Downloads the page at <paramref name="Url"/> and returns its body decoded
/// as GB2312 text.
/// </summary>
/// <param name="Url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The page content, or an empty string when the request or decoding fails
/// (the failure is reported on standard error).
/// </returns>
public static string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        request.Timeout = 30000; // connection timeout in milliseconds
        request.Headers.Set("Pragma", "no-cache");
        // Let the framework negotiate and undo compression. Wrapping the stream
        // in GZipStream unconditionally fails whenever the server answers with
        // an uncompressed body.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        Encoding encoding = Encoding.GetEncoding("GB2312"); // encoding used by the target site

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader read = new StreamReader(streamReceive, encoding))
        {
            strResult = read.ReadToEnd();
        }
    }
    catch (System.Exception ex)
    {
        // Keep the best-effort contract (callers receive "" on failure) but do
        // not hide the cause any more.
        System.Console.Error.WriteLine("GetWebContent failed for " + Url + ": " + ex.Message);
    }

    return strResult;
}
}
// Original article: https://www.cnblogs.com/wxc-love/p/10746404.html
// Time: 2024-11-08 06:21:02