管理资源吧首页>>>教程>>>编程>>>ASP.NET教程>>>

提取HTML代码中文字的C#函数

　　/// <summary>

　　/// 去除HTML标记

　　/// </summary>

　　/// <param name="strHtml">包括HTML的源码 </param>

　　/// <returns>已经去除后的文字</returns>

　　public static string StripHTML(string strHtml)

　　{

　　string [] aryReg ={

　　@"<script[^>]*?>.*?</script>",

　　@"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",

　　@"([\r\n])[\s]+",

　　@"&(quot|#34);",

　　@"&(amp|#38);",

　　@"&(lt|#60);",

　　@"&(gt|#62);",

　　@"&(nbsp|#160);",

　　@"&(iexcl|#161);",

　　@"&(cent|#162);",

　　@"&(pound|#163);",

　　@"&(copy|#169);",

　　@"&#(\d+);",

　　@"-->",

@"<!--.*\n"

　　};

　　string [] aryRep = {

　　"",

　　"\"",

　　"&",

　　"<",

　　">",

　　" ",

　　"\xa1",//chr(161),

　　"\xa2",//chr(162),

　　"\xa3",//chr(163),

　　"\xa9",//chr(169),

　　"",

　　"\r\n",

　　""

　　};

　　string newReg =aryReg[0];

　　string strOutput=strHtml;

　　for(int i = 0;i<aryReg.Length;i++)

　　{

　　Regex regex = new Regex(aryReg[i],RegexOptions.IgnoreCase );

　　strOutput = regex.Replace(strOutput,aryRep[i]);

　　}

　　strOutput.Replace("<","");

　　strOutput.Replace(">","");

　　strOutput.Replace("\r\n","");

　　return strOutput;

　　}

教程首页 | 更多教程