当前位置: 移动技术网 > IT编程>开发语言>.net > c#中过滤html的正则表达式

C# 中过滤 HTML 的正则表达式

2018年04月28日  | 移动技术网IT编程  | 我要评论

实现代码

/// <summary>
/// Strips HTML markup from a string and returns the remaining text, HTML-encoded.
/// </summary>
/// <param name="htmlstring">Source text that may contain HTML markup. Null or empty input yields an empty string.</param>
/// <returns>
/// The input with script blocks, tags, comment markers and angle brackets removed,
/// a fixed set of character entities decoded, and the result HTML-encoded and trimmed.
/// </returns>
public static string nohtml(string htmlstring)
{
  if (string.IsNullOrEmpty(htmlstring))
  {
    return string.Empty;
  }

  // Remove script blocks. Singleline lets '.' cross line breaks so that the body of a
  // multi-line script is removed together with its tags instead of leaking into the text.
  htmlstring = Regex.Replace(htmlstring, @"<script[^>]*?>.*?</script>", "",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);

  // Remove the remaining tags.
  htmlstring = Regex.Replace(htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

  // Remove a line break together with the whitespace that follows it.
  htmlstring = Regex.Replace(htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

  // Remove comment leftovers: a closing marker, then an unterminated opening marker
  // up to the end of its line.
  htmlstring = Regex.Replace(htmlstring, @"-->", "", RegexOptions.IgnoreCase);
  htmlstring = Regex.Replace(htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

  // Decode a fixed set of named/numeric character entities.
  htmlstring = Regex.Replace(htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
  htmlstring = Regex.Replace(htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
  htmlstring = Regex.Replace(htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
  htmlstring = Regex.Replace(htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
  htmlstring = Regex.Replace(htmlstring, @"&(nbsp|#160);", "  ", RegexOptions.IgnoreCase);
  htmlstring = Regex.Replace(htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
  htmlstring = Regex.Replace(htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
  htmlstring = Regex.Replace(htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
  htmlstring = Regex.Replace(htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);

  // Any other numeric entity is dropped.
  htmlstring = Regex.Replace(htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

  // Strings are immutable: the result of Replace must be assigned, otherwise the call
  // has no effect. This also removes angle brackets produced by the decoding above.
  htmlstring = htmlstring.Replace("<", "").Replace(">", "").Replace("\r\n", "");

  // WebUtility.HtmlEncode does not need an active HTTP request, unlike
  // HttpContext.Current.Server, which is null outside of one.
  return System.Net.WebUtility.HtmlEncode(htmlstring).Trim();
}

C# 过滤 HTML 标签及空格

/// <summary>
/// Removes every HTML tag and every space character from a string.
/// </summary>
/// <param name="htmlstr">Text that may contain HTML tags; may be null.</param>
/// <returns>The filtered text, or an empty string when the input is null or empty.</returns>
public static string filterhtml(string htmlstr)
{
    if (string.IsNullOrEmpty(htmlstr))
    {
        return "";
    }

    // First alternative matches a whole tag, second alternative matches a single space.
    return System.Text.RegularExpressions.Regex.Replace(htmlstr, "<[^>]*>| ", "");
}

写一个静态方法移除 HTML 标签

#region
/// <summary>
/// Removes HTML tags from a string, leaving the text between them untouched.
/// </summary>
/// <param name="htmlstr">Text that may contain HTML tags; may be null.</param>
/// <returns>The text without tags, or an empty string when the input is null or empty.</returns>
public static string parsetags(string htmlstr)
{
 // Regex.Replace throws on a null input, so answer that case directly.
 if (string.IsNullOrEmpty(htmlstr))
 {
  return "";
 }

 return System.Text.RegularExpressions.Regex.Replace(htmlstr, "<[^>]*>", "");
}
#endregion

取出文本中的图片地址

#region
/// <summary>
/// Extracts the address of the first image referenced in a piece of HTML.
/// </summary>
/// <param name="htmlstr">Text that may contain an img tag; may be null.</param>
/// <returns>
/// The value of the src attribute of the first img tag that has one, with its original
/// letter case and without surrounding quotes; an empty string when there is none.
/// </returns>
public static string getimgurl(string htmlstr)
{
 if (string.IsNullOrEmpty(htmlstr))
 {
  return string.Empty;
 }

 // The src value may be double-quoted, single-quoted or unquoted; all three
 // alternatives feed the same named group. Matching is case-insensitive through
 // RegexOptions.IgnoreCase rather than by lower-casing the input, so the URL keeps
 // its original case.
 const string pattern =
  @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))";

 Match m = Regex.Match(htmlstr, pattern, RegexOptions.IgnoreCase);
 return m.Success ? m.Groups["url"].Value : string.Empty;
}
#endregion

提取 HTML 代码中文字的 C# 函数

using System;
using System.Text.RegularExpressions;

/// <summary>
/// Console demo that extracts the plain text from an HTML document.
/// </summary>
public class striphtmltest
{
    // Ordered rewrite rules: { pattern, replacement }. Each rule is applied to the
    // output of the previous one, so the order matters (scripts before tags, tags
    // before entity decoding).
    private static readonly string[][] rules =
    {
        // Script blocks; (?s) lets '.' cross line breaks so multi-line scripts are removed.
        new[] { @"(?s)<script[^>]*?>.*?</script>", "" },
        // Opening/closing tags with optional attributes. \7 outside the character
        // class refers back to the quote captured by group 7.
        // NOTE(review): inside the class, [^\7] is presumably read as an octal
        // character escape, not a backreference - confirm against the .NET regex reference.
        new[] { @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "" },
        // A line break together with the whitespace that follows it.
        new[] { @"([\r\n])[\s]+", "" },
        // Character entities.
        new[] { @"&(quot|#34);", "\"" },
        new[] { @"&(amp|#38);", "&" },
        new[] { @"&(lt|#60);", "<" },
        new[] { @"&(gt|#62);", ">" },
        new[] { @"&(nbsp|#160);", "  " },
        new[] { @"&(iexcl|#161);", "\u00A1" },
        new[] { @"&(cent|#162);", "\u00A2" },
        new[] { @"&(pound|#163);", "\u00A3" },
        new[] { @"&(copy|#169);", "\u00A9" },
        // Any other numeric entity is dropped.
        new[] { @"&#(\d+);", "" },
        // Comment leftovers.
        new[] { @"-->", "\r\n" },
        new[] { @"<!--.*\n", "" }
    };

    // Built once instead of on every call; declared after `rules` because static
    // field initializers run in textual order.
    private static readonly Regex[] matchers =
        Array.ConvertAll(rules, rule => new Regex(rule[0], RegexOptions.IgnoreCase));

    /// <summary>
    /// Runs the stripper on a sample document and prints the result.
    /// </summary>
    public static void Main()
    {
        string s = striphtml(
            "<html><head><title>中国石龙信息平台</title></head><body>faddfs龙信息平台</body></html>");
        Console.WriteLine(s);
    }

    /// <summary>
    /// Extracts the text contained in HTML source.
    /// </summary>
    /// <param name="strhtml">Source text that may contain HTML markup; may be null.</param>
    /// <returns>
    /// The text with scripts, tags, comment markers, angle brackets and CR/LF pairs removed
    /// and a fixed set of entities decoded; an empty string for null or empty input.
    /// </returns>
    public static string striphtml(string strhtml)
    {
        if (string.IsNullOrEmpty(strhtml))
        {
            return string.Empty;
        }

        string stroutput = strhtml;
        for (int i = 0; i < matchers.Length; i++)
        {
            stroutput = matchers[i].Replace(stroutput, rules[i][1]);
        }

        // Strings are immutable: the result of Replace has to be assigned to take effect.
        return stroutput.Replace("<", "").Replace(">", "").Replace("\r\n", "");
    }
}

tempcontent 表示包含有html的字符串;
// "<[^>]+>" requires at least one character between the angle brackets.
tempcontent = System.Text.RegularExpressions.Regex.Replace(tempcontent, "<[^>]+>", "");
// "<[^>]*>" accepts any number of characters between the angle brackets, including none.
tempcontent = System.Text.RegularExpressions.Regex.Replace(tempcontent, "<[^>]*>", "");

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网