ASP.NET 网页编码自动识别代码

代码如下:

  using System;

  using System.Net;

  using System.Text;

  using System.Text.RegularExpressions;

  /// <summary>
  /// Console sample that downloads a web page and decodes it to a string,
  /// detecting the text encoding from the page's own <c>charset</c> declaration
  /// when the caller does not supply one.
  /// </summary>
  class Program
  {
      // Matches a charset declaration in either the legacy form
      //   <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
      // or the HTML5 form, where the value may be quoted and surrounded by spaces
      //   <meta charset="utf-8">
      // Cached in a static field so the pattern is parsed only once.
      static readonly Regex CharsetPattern = new Regex(
          @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)");

      /// <summary>
      /// Downloads the page at <paramref name="url"/> and decodes it using the
      /// encoding named by the page's charset declaration (UTF-8 when none is found).
      /// </summary>
      /// <param name="url">Address of the page to download.</param>
      /// <returns>The decoded HTML text.</returns>
      static string GetHtml(string url)
      {
          return GetHtml(url, null);
      }

      /// <summary>
      /// Downloads the page at <paramref name="url"/> and decodes it with the given encoding.
      /// </summary>
      /// <param name="url">Address of the page to download.</param>
      /// <param name="encoding">
      /// Encoding used to decode the response bytes, or <c>null</c> to detect it
      /// from the page content.
      /// </param>
      /// <returns>The decoded HTML text.</returns>
      /// <remarks>
      /// Any exception thrown by <see cref="WebClient.DownloadData(string)"/> propagates to the caller.
      /// </remarks>
      static string GetHtml(string url, Encoding encoding)
      {
          byte[] buf;

          // WebClient is IDisposable; release it as soon as the bytes are in memory.
          using (WebClient client = new WebClient())
          {
              buf = client.DownloadData(url);
          }

          if (encoding != null)
          {
              return encoding.GetString(buf);
          }

          // Trial decode as UTF-8 so the charset declaration can be searched as text.
          string html = Encoding.UTF8.GetString(buf);
          Encoding detected = GetEncoding(html);

          // Compare by code page rather than by reference: the trial decode is already
          // the final answer whenever the declared encoding is UTF-8, regardless of
          // which Encoding instance the lookup returned.
          if (detected == null || detected.CodePage == Encoding.UTF8.CodePage)
          {
              return html;
          }

          return detected.GetString(buf);
      }

      /// <summary>
      /// Extracts the encoding named by the first charset declaration in the given HTML text.
      /// </summary>
      /// <param name="html">HTML text to search; may be <c>null</c> or empty.</param>
      /// <returns>
      /// The matching <see cref="Encoding"/>, or <c>null</c> when the text contains no
      /// charset declaration or names an encoding the runtime does not recognise.
      /// </returns>
      static Encoding GetEncoding(string html)
      {
          if (string.IsNullOrEmpty(html))
          {
              return null;
          }

          Match match = CharsetPattern.Match(html);
          if (!match.Success)
          {
              // No declaration at all: report "unknown" without relying on
              // Encoding.GetEncoding("") throwing.
              return null;
          }

          try
          {
              return Encoding.GetEncoding(match.Groups["charset"].Value);
          }
          catch (ArgumentException)
          {
              // The declared name is not an encoding this runtime knows about.
              return null;
          }
      }

      /// <summary>
      /// Program entry point: prints the HTML of a sample page, then waits for input
      /// so the console window stays open.
      /// </summary>
      static void Main()
      {
          Console.WriteLine(GetHtml("http://www.glzy8.com"));
          Console.Read();
      }
  }