因为工作需要,要在京东地址和国标地址之间做转换,所以特意采集了最新的国标地址库,以方便进行数据映射。
因为前端技术不太好,只能一点一点地分析页面(大神勿笑)。现在整理出来,一是为了保存下来方便以后再用,二是为了方便同样需要这部分数据的小伙伴。
/// <summary>
/// Scraper for the national-standard (GB) administrative-division address list.
///
/// Author: 南丘伟
/// Date: 2017-06-21
/// Version: V1.0.0
/// Version notes:
/// The patterns below are tied to the markup of one specific page. If the page
/// layout changes they will stop matching, so treat this class as a starting
/// point and adjust the patterns for the next scrape.
/// </summary>
public class GbAddressHelper
{
    // Error payloads returned in place of the area array when scraping fails.
    private const string PageFormatChangedJson = "{\"status\":\"n\",\"info\":\"采集页面数据格式变动,请核查采集源码!\"}";
    private const string BlockMovedJson = "{\"status\":\"n\",\"info\":\"页面数据标签位置变动,请核查采集源码!\"}";
    private const string ProvinceMissingJson = "{\"status\":\"n\",\"info\":\"省份代码为空!,请核查采集源码!\"}";

    // The <div> that holds the whole address table.
    private const string AddressBlockPattern = "<div class=\"xilan_con\"([\\w\\W]+)</span></b></p></div>";

    // Opening markup of a non-bold row; used only as a first-pass split delimiter.
    private const string PlainRowPrefix = "<p class=\"MsoNormal\"><span lang=\"EN-US\">";

    // Opening markup of a bold (province) row. It contains no regex
    // metacharacters, so it doubles as split delimiter and literal prefix.
    private const string ProvinceRowPrefix = "<p class=\"MsoNormal\"><b><span lang=\"EN-US\">";

    private const string ProvinceIdPattern = "<p class=\"MsoNormal\"><b><span lang=\"EN-US\">([\\d]+)<span>";

    // Province name inside the bold row; optional whitespace around the name is tolerated.
    private const string ProvinceNamePattern = "<b><span style=\"font-family: 宋体\">\\s*([\u4e00-\u9fa5]+)\\s*</span></b></p>";

    // Delimiter that precedes every city row inside a province chunk.
    private const string CityRowDelimiter = "<p class=\"MsoNormal\"><span style=\"font-family: 宋体\"> </span>";

    // Group 1 = city code, group 4 = city name.
    private const string CityPattern = "<span lang=\"EN-US\">([\\d]+)<span> (\\s*)</span></span><span style=\"font-family: 宋体\">(\\s*)([\u4e00-\u9fa5]+)</span></p>";

    // Group 1 = county code, group 3 = county name.
    private const string CountyPattern = "<span lang=\"EN-US\">([\\d]+)<span> </span></span><span style=\"font-family: 宋体\">(\\s*)([\u4e00-\u9fa5]+)</span></p>";

    private const RegexOptions MatchOptions = RegexOptions.IgnoreCase | RegexOptions.Multiline;

    public GbAddressHelper()
    {
    }

    /// <summary>
    /// Downloads the National Bureau of Statistics division-code page and
    /// flattens it into a JSON array of areas.
    /// Reference page: http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html
    /// Each element has the shape {"PId":"..","AreaId":"..","AreaName":".."}.
    /// Provinces use PId "0", cities use the province code, and counties use
    /// the city code. Elements appear in page order (province, city, county).
    /// </summary>
    /// <param name="gbUrl">URL of the page to scrape.</param>
    /// <returns>
    /// The JSON array described above ("[]" when no rows are recognised), or a
    /// JSON object {"status":"n","info":".."} when the page cannot be parsed.
    /// </returns>
    public string DownLoadGbAddress(string gbUrl)
    {
        string html = HttpUtilitys.SendHttpRequest(gbUrl, "");

        // NOTE(review): a null/empty response is treated like a changed page
        // instead of letting html.Contains throw; confirm SendHttpRequest's contract.
        if (string.IsNullOrEmpty(html) || !html.Contains("xilan_con"))
        {
            return PageFormatChangedJson;
        }

        // Step 1: isolate the block of the page that contains the address rows.
        MatchCollection addressBlocks = Regex.Matches(html, AddressBlockPattern, MatchOptions);
        if (addressBlocks.Count != 1)
        {
            return BlockMovedJson;
        }

        string addressHtml = addressBlocks[0].Groups[0].Value.Trim();
        var json = new StringBuilder("[");

        // Step 2: cut the block into per-province chunks.
        foreach (string segment in Regex.Split(addressHtml, PlainRowPrefix, RegexOptions.IgnoreCase))
        {
            foreach (string provinceTail in Regex.Split(segment, ProvinceRowPrefix, RegexOptions.IgnoreCase))
            {
                // Splitting consumed the row prefix; restore it so the province patterns can match.
                string provinceHtml = ProvinceRowPrefix + provinceTail;

                MatchCollection provinceIds = Regex.Matches(provinceHtml, ProvinceIdPattern, MatchOptions);
                if (provinceIds.Count == 0)
                {
                    continue;
                }

                int provinceId = Utils.ObjToInt(provinceIds[0].Groups[1].Value.Trim(), 0);
                if (provinceId <= 0)
                {
                    continue;
                }

                // The name needs its own pattern: reusing the ID pattern here
                // would emit the numeric code as the province name.
                MatchCollection provinceNames = Regex.Matches(provinceHtml, ProvinceNamePattern, MatchOptions);
                if (provinceNames.Count == 0)
                {
                    // Stop right away so the error object is returned intact
                    // rather than being mixed into a half-built array.
                    return ProvinceMissingJson;
                }

                AppendArea(json, 0, provinceId, provinceNames[0].Groups[1].Value.Trim());

                // Step 3: cut the province chunk into per-city chunks.
                foreach (string cityHtml in Regex.Split(provinceHtml, CityRowDelimiter, RegexOptions.IgnoreCase))
                {
                    MatchCollection cities = Regex.Matches(cityHtml, CityPattern, MatchOptions);
                    if (cities.Count == 0)
                    {
                        continue;
                    }

                    // Only the first match is the city itself.
                    int cityId = Utils.ObjToInt(cities[0].Groups[1].Value.Trim(), 0);
                    AppendArea(json, provinceId, cityId, cities[0].Groups[4].Value.Trim());

                    // Every county row found in the same chunk belongs to this city.
                    foreach (Match county in Regex.Matches(cityHtml, CountyPattern, MatchOptions))
                    {
                        int countyId = Utils.ObjToInt(county.Groups[1].Value.Trim(), 0);
                        AppendArea(json, cityId, countyId, county.Groups[3].Value.Trim());
                    }
                }
            }
        }

        // Drop the trailing comma only when at least one element was written,
        // so an empty scrape yields "[]" instead of a lone "]".
        if (json.Length > 1 && json[json.Length - 1] == ',')
        {
            json.Length--;
        }

        json.Append("]");
        return json.ToString();
    }

    /// <summary>
    /// Appends one {"PId","AreaId","AreaName"} object followed by a comma.
    /// </summary>
    private static void AppendArea(StringBuilder json, int parentId, int areaId, string areaName)
    {
        json.Append("{");
        json.Append("\"PId\":\"").Append(parentId).Append("\",");
        json.Append("\"AreaId\":\"").Append(areaId).Append("\",");
        json.Append("\"AreaName\":\"").Append(areaName).Append("\"},");
    }
}
时间: 2024-11-10 16:33:20