借鉴C#网页爬虫抓取行政区划,从国家统计局获取了最新行政区域数据。
以下为代码片段:
数据库类:
/// <summary>
/// One administrative region (country, province, city or county).
/// </summary>
public class City
{
    /// <summary>Numeric identifier of this region.</summary>
    public decimal ID { get; set; }

    /// <summary>Display name of the region.</summary>
    public string Name { get; set; }

    /// <summary>Administrative division code of the region.</summary>
    public string Code { get; set; }

    /// <summary>
    /// Hierarchy level as text: "1" = whole country, "2" = province,
    /// "3" = city, "4" = county.
    /// </summary>
    public string Org_Level { get; set; }

    /// <summary>Division code of the parent region.</summary>
    public string ParentCode { get; set; }

    /// <summary>Identifier of the parent region.</summary>
    public decimal ParentID { get; set; }

    /// <summary>Country name. The spelling is part of the public interface.</summary>
    public string Contry { get; set; }

    /// <summary>Presumably the X coordinate of the region; never assigned in this file (TODO confirm).</summary>
    public string Loc_x { get; set; }

    /// <summary>Presumably the Y coordinate of the region; never assigned in this file (TODO confirm).</summary>
    public string Loc_y { get; set; }
}
获取网页帮助类:
/// <summary>
/// Helper for downloading a web page as text.
/// </summary>
public class HttpHelper
{
    private static ILog log = log4net.LogManager.GetLogger(typeof(HttpHelper));

    /// <summary>
    /// Downloads <paramref name="url"/> with an HTTP request and decodes the
    /// response body using <paramref name="encod"/>.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <param name="encod">Encoding used to decode the response stream.</param>
    /// <returns>
    /// The decoded body, or an empty string when the request fails, the status
    /// is not 200 OK, or the body cannot be read. Failures are logged and
    /// never propagate to the caller.
    /// </returns>
    public static string DownloadHtml(string url, Encoding encod)
    {
        string html = string.Empty;
        try
        {
            // Configure the request. A direct cast is used so that a non-HTTP
            // URL is reported as an InvalidCastException by the catch below
            // instead of as an opaque NullReferenceException.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 10 * 1000; // 10 s timeout
            request.ContentType = "text/html;charset=utf-8";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";

            // Fetch the response.
            using (HttpWebResponse resp = (HttpWebResponse)request.GetResponse())
            {
                if (resp.StatusCode != HttpStatusCode.OK)
                {
                    log.Fatal(string.Format("抓取{0}地址返回失败,response.StatusCode = {1}", url, resp.StatusCode));
                }
                else
                {
                    try
                    {
                        // The using block releases the reader even when ReadToEnd throws.
                        using (StreamReader sr = new StreamReader(resp.GetResponseStream(), encod))
                        {
                            html = sr.ReadToEnd();
                        }
                    }
                    catch (Exception e)
                    {
                        log.Fatal(string.Format("DownLoadHtml抓取html{0}保存失败", url), e);
                    }
                }
            }
        }
        catch (Exception e)
        {
            // Best effort: log and fall through to return the empty string.
            log.Fatal(e);
        }
        return html;
    }
}
数据库保存帮助类:
/// <summary>
/// Database helper that persists scraped regions.
/// </summary>
public class SQLHelper
{
    /// <summary>
    /// Inserts every entry of <paramref name="cityList"/> into the
    /// <c>base_city</c> table, one parameterized INSERT per entry, using the
    /// connection string named "DBContext" from the application configuration.
    /// </summary>
    /// <param name="cityList">Regions to insert. Null or empty inserts nothing.</param>
    /// <returns>Total number of rows reported as affected by the INSERTs.</returns>
    /// <remarks>
    /// A failure on one row is written to the console and the loop moves on
    /// to the next row. Null string properties are sent as empty strings;
    /// the <see cref="City"/> objects themselves are not modified.
    /// </remarks>
    public static int ExecuteNonQueryForCity(List<City> cityList)
    {
        if (cityList == null || cityList.Count == 0)
        {
            return 0;
        }

        int count = 0;
        var connectionString = System.Configuration.ConfigurationManager.ConnectionStrings["DBContext"].ConnectionString;

        // Both objects are disposed (and the connection closed) when the block exits.
        using (SqlConnection connection = new SqlConnection(connectionString))
        using (SqlCommand cmd = new SqlCommand())
        {
            connection.Open();

            cmd.Connection = connection;
            cmd.CommandText = "insert into base_city(ID,name,Code,Contry,Loc_x,Loc_y,Org_Level,ParentCode,ParentID,state) values(@ID,@name,@Code,@Contry,@Loc_x,@Loc_y,@Org_Level,@ParentCode,@ParentID,@state)";

            foreach (var city in cityList)
            {
                try
                {
                    cmd.Parameters.Add(new SqlParameter("@ID", city.ID));
                    cmd.Parameters.Add(new SqlParameter("@name", OrEmpty(city.Name)));
                    cmd.Parameters.Add(new SqlParameter("@Code", OrEmpty(city.Code)));
                    cmd.Parameters.Add(new SqlParameter("@Contry", OrEmpty(city.Contry)));
                    cmd.Parameters.Add(new SqlParameter("@Loc_x", OrEmpty(city.Loc_x)));
                    cmd.Parameters.Add(new SqlParameter("@Loc_y", OrEmpty(city.Loc_y)));
                    cmd.Parameters.Add(new SqlParameter("@Org_Level", OrEmpty(city.Org_Level)));
                    cmd.Parameters.Add(new SqlParameter("@ParentCode", OrEmpty(city.ParentCode)));
                    cmd.Parameters.Add(new SqlParameter("@ParentID", city.ParentID));
                    cmd.Parameters.Add(new SqlParameter("@state", "1"));

                    int retval = cmd.ExecuteNonQuery();
                    if (retval == 0)
                    {
                        Console.WriteLine("插入错误:");
                    }
                    count += retval;
                }
                catch (Exception e)
                {
                    // Best effort: report the row and keep going.
                    Console.WriteLine("插入错误:" + e.Message);
                }
                finally
                {
                    // Reset the parameters so the command can be reused for the next row.
                    cmd.Parameters.Clear();
                }
            }
        }
        return count;
    }

    /// <summary>Returns <paramref name="value"/>, or an empty string when it is null.</summary>
    private static string OrEmpty(string value)
    {
        return value ?? string.Empty;
    }
}
抓取数据:
/// <summary>
/// Downloads the administrative-division page at <see cref="UrlStr"/> and
/// flattens it into <see cref="SaveList"/> as a parent-linked list of
/// <see cref="City"/> rows (country, province, city, county).
/// </summary>
/// <remarks>
/// All downloading and parsing happens in the constructor. The parser relies
/// on rows arriving in province, city, county order, so the parent of a new
/// row is always the most recently added row of the level above it.
/// </remarks>
public class 省市县数据抓取
{
    private ILog log = log4net.LogManager.GetLogger(typeof(省市县数据抓取));

    public const string UrlStr = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html";

    public List<City> SaveList = new List<City>();

    // A division code is six ASCII digits. Matching the digits directly makes
    // the parser independent of whatever padding (spaces, "&nbsp;", ...)
    // surrounds the code in the markup.
    private static readonly System.Text.RegularExpressions.Regex CodePattern =
        new System.Text.RegularExpressions.Regex("[0-9]{6}");

    /// <summary>
    /// Fetches the page and fills <see cref="SaveList"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The page contains no <c>p.MsoNormal</c> rows (for example because the
    /// download failed and returned an empty document), or a row appears
    /// before any row of its parent level.
    /// </exception>
    public 省市县数据抓取()
    {
        try
        {
            log.Info("抓取数据");
            string html = HttpHelper.DownloadHtml(UrlStr, Encoding.UTF8);
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // Root of the hierarchy; every province hangs off this row.
            City root = new City() { ID = 1, Name = "全国", Code = "100000", Contry = "China", Org_Level = "1" };
            SaveList.Add(root);

            // XPath string literals must use straight quotes.
            HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//p[@class='MsoNormal']");
            if (rows == null)
            {
                // SelectNodes yields null, not an empty collection, when nothing matches.
                throw new InvalidOperationException("No p.MsoNormal rows found in the page downloaded from " + UrlStr);
            }

            foreach (HtmlNode item in rows)
            {
                HtmlNode firstNode = item.FirstChild;
                if (firstNode == null)
                {
                    continue;
                }

                if (firstNode.Name == "b")
                {
                    // Province rows are the ones rendered in bold.
                    GetProvince(item);
                    continue;
                }

                if (item.ChildNodes.Count < 3)
                {
                    continue; // not a "<indent><code><name>" data row
                }

                string code = ExtractCode(item.ChildNodes[1]);
                if (code == null)
                {
                    continue; // no division code: not a data row
                }

                // NOTE(review): the original told city and county rows apart by
                // comparing the indent text to two whitespace literals that are
                // indistinguishable in this file. The code itself is used instead:
                // in the GB/T 2260 layout a code ending in "00" is prefecture
                // level and any other code is county level. Confirm against the
                // live page.
                if (code.EndsWith("00", StringComparison.Ordinal))
                {
                    GetCity(item);
                }
                else
                {
                    GetCounty(item);
                }
            }
        }
        catch (Exception e)
        {
            City last = SaveList.LastOrDefault();
            log.Info("last child code:" + (last == null ? string.Empty : last.Code));
            log.Error(e);
            throw; // keep the original stack trace
        }
    }

    /// <summary>Adds a county row (level "4") under the latest city row.</summary>
    private void GetCounty(HtmlNode item)
    {
        AddRegion(ExtractCode(item.ChildNodes[1]), item.ChildNodes[2].InnerText.Trim(), "4", "3");
    }

    /// <summary>Adds a city row (level "3") under the latest province row.</summary>
    private void GetCity(HtmlNode item)
    {
        AddRegion(ExtractCode(item.ChildNodes[1]), item.ChildNodes[2].InnerText.Trim(), "3", "2");
    }

    /// <summary>
    /// Adds a province row (level "2") under the country row. Bold rows that
    /// do not carry a code and a name are ignored.
    /// </summary>
    private void GetProvince(HtmlNode item)
    {
        if (item.ChildNodes.Count < 2)
        {
            return;
        }

        HtmlNode codeNode = item.ChildNodes[0].FirstChild;
        HtmlNode nameNode = item.ChildNodes[1].FirstChild;
        if (codeNode == null || nameNode == null)
        {
            return;
        }

        string code = ExtractCode(codeNode);
        if (code == null)
        {
            return;
        }

        AddRegion(code, nameNode.InnerText.Trim(), "2", "1");
    }

    /// <summary>
    /// Appends a region whose ID is one more than the last row's ID and whose
    /// parent is the most recently added row at <paramref name="parentLevel"/>.
    /// Throws <see cref="InvalidOperationException"/> when no such parent exists.
    /// </summary>
    private void AddRegion(string code, string name, string level, string parentLevel)
    {
        City parent = SaveList.Last(i => i.Org_Level == parentLevel);

        City c = new City();
        c.Code = code;
        c.Name = name;
        c.Org_Level = level;
        c.ID = SaveList.Last().ID + 1;
        c.ParentCode = parent.Code;
        c.ParentID = parent.ID;
        c.Contry = "China";
        SaveList.Add(c);
    }

    /// <summary>
    /// Returns the first run of six digits in the node's text, or null when
    /// the node is null or holds no such run.
    /// </summary>
    private static string ExtractCode(HtmlNode node)
    {
        if (node == null)
        {
            return null;
        }

        System.Text.RegularExpressions.Match m = CodePattern.Match(node.InnerText);
        return m.Success ? m.Value : null;
    }

    /// <summary>Writes every row in <see cref="SaveList"/> to the database.</summary>
    public void Save()
    {
        log.Info("保存数据");
        SQLHelper.ExecuteNonQueryForCity(SaveList);
    }
}
全国 Org_Level =1
省 Org_Level =2
市 Org_Level =3
县 Org_Level =4
SaveList 首先添加了一个代表“全国”的根节点,Org_Level =1
因为网页数据读取是从 省->市->县 ->省->市->县 这样循环读取的,所以在获取省、市、县的父级时,可以直接从SaveList 获取最后一个上一级别的对象即可
执行类:
// Constructing the crawler downloads and parses the page; Save() then writes
// the collected rows to the database.
var crawler = new 省市县数据抓取();
crawler.Save();
获取的数据如下:
时间: 2024-10-17 15:33:47