using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Net;
using System.Web;
using System.Security.Cryptography;
using System.Text.RegularExpressions;
using System.Web.Script.Serialization;
using System.Data;
using System.Collections;
using System.Runtime.Serialization.Json;
using System.Configuration;
using System.Reflection;

#region Fetch the HTML content of a web page

/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encoding">
/// Encoding used to decode the downloaded bytes. Pass <c>null</c> to have the
/// encoding detected from a <c>charset=</c> declaration inside the content;
/// UTF-8 is used when no usable declaration is found.
/// </param>
/// <returns>The decoded text of the page.</returns>
public static string GetHtml(string url, Encoding encoding)
{
    byte[] buf;

    // WebClient is IDisposable: release it as soon as the download has finished.
    using (WebClient client = new WebClient())
    {
        buf = client.DownloadData(url);
    }

    if (encoding != null)
    {
        return encoding.GetString(buf);
    }

    // Provisional UTF-8 decode, used only to look for the charset declaration.
    // The declaration itself is plain ASCII, so it is readable here for any
    // ASCII-compatible page encoding.
    string html = Encoding.UTF8.GetString(buf);
    encoding = GetEncoding(html);

    // Compare by code page rather than by reference, so that any UTF-8
    // encoding instance reuses the text that was already decoded above.
    if (encoding == null || encoding.CodePage == Encoding.UTF8.CodePage)
    {
        return html;
    }

    return encoding.GetString(buf);
}

/// <summary>
/// Extracts the encoding named by the first <c>charset=</c> declaration found in
/// <paramref name="html"/>.
/// </summary>
/// <param name="html">Page text to scan; may be <c>null</c> or empty.</param>
/// <returns>
/// The matching <see cref="Encoding"/>, or <c>null</c> when the text is null or
/// empty, contains no charset declaration, or names an encoding that
/// <see cref="Encoding.GetEncoding(string)"/> rejects.
/// </returns>
public static Encoding GetEncoding(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    // Optional whitespace and an optional opening quote allow both
    // `content="text/html; charset=gbk"` and `<meta charset="gbk">`.
    const string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
    Match match = Regex.Match(html, pattern);
    if (!match.Success)
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(match.Groups["charset"].Value);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset name: report "not detected".
        return null;
    }
}

#endregion
时间: 2024-11-16 02:21:00