当前位置: 移动技术网 > IT编程>开发语言>c# > C#实现抓取和分析网页类实例

C#实现抓取和分析网页类实例

2019年07月18日  | 移动技术网IT编程  | 我要评论
本文实例讲述了C#实现抓取和分析网页的类,分享给大家供大家参考。具体分析如下: 这里介绍了抓取和分析网页的类。 其主要功能有: 1、提取网页的纯文本,去除所有HTML标签和JavaScript代码……

本文实例讲述了C#实现抓取和分析网页的类,分享给大家供大家参考。具体分析如下:

这里介绍了抓取和分析网页的类。

其主要功能有:

1、提取网页的纯文本,去除所有HTML标签和JavaScript代码
2、提取网页的链接,包括a标签的href以及frame和iframe的src
3、提取网页的title等(其它标签可依此类推,正则表达式的写法是一样的)
4、可以实现简单的表单提交及Cookie保存

/*
* author:sunjoy at ccnu
* 如果您改进了这个类请发一份代码给我(ccnusjy 在gmail.com)
*/
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
/// <summary>
/// Fetches a web page over HTTP and exposes its title, hyperlinks and plain text.
/// </summary>
/// <remarks>
/// The page is downloaded synchronously inside the constructor. Download and
/// parsing failures are reported through <see cref="isgood"/> instead of being
/// thrown. One cookie container per host is kept in a static table and reused
/// by every instance created afterwards.
/// </remarks>
public class webpage
{
    #region Constants and shared patterns
    // Largest response body, in bytes, that will be downloaded (4 MiB).
    private const int MaxPageBytes = 1 << 22;

    // Number of undecodable characters at which a sniffed charset is considered wrong.
    private const int BadCharThreshold = 100;

    private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

    private const RegexOptions HtmlOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    // URL suffixes that are never downloaded.
    private static readonly string[] SkippedExtensions = { ".rar", ".dat", ".msi" };

    // [\s\S] matches any character including line breaks, so element bodies
    // containing spaces, punctuation or newlines are matched as well.
    private static readonly Regex AnchorPattern = new Regex("(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>[\\s\\S]*?)</", HtmlOptions);
    private static readonly Regex FramePattern = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", HtmlOptions);
    private static readonly Regex LinkTextNoisePattern = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", HtmlOptions);
    private static readonly Regex ScriptPattern = new Regex(@"(?m)<script[^>]*>[\s\S]*?</script[^>]*>", HtmlOptions);
    private static readonly Regex StylePattern = new Regex(@"(?m)<style[^>]*>[\s\S]*?</style[^>]*>", HtmlOptions);
    private static readonly Regex SelectPattern = new Regex(@"(?m)<select[^>]*>[\s\S]*?</select[^>]*>", HtmlOptions);
    private static readonly Regex AnchorBlockPattern = new Regex(@"(?m)<a[^>]*>[\s\S]*?</a[^>]*>", HtmlOptions);
    private static readonly Regex TagOrNbspPattern = new Regex("(<[^>]+?>)|&nbsp;", HtmlOptions);
    private static readonly Regex WhitespacePattern = new Regex("(\\s)+", HtmlOptions);
    private static readonly Regex TitlePattern = new Regex(@"(?m)<title[^>]*>(?<title>[\s\S]*?)</title[^>]*>", HtmlOptions);

    // Accepts both <meta charset="x"> and <meta ... content="text/html; charset=x">.
    private static readonly Regex MetaCharsetPattern = new Regex("charset\\s*=\\s*[\"']?(?<cding>[\\w\\-]+)", RegexOptions.IgnoreCase);

    // A run of characters outside U+0000..U+00FF (for example Chinese text in a URL).
    private static readonly Regex NonLatinPattern = new Regex(@"[^\x00-\xff]+");
    #endregion

    #region Private members
    private Uri m_uri;              // address of the page; null when the URL could not be parsed
    private List<link> m_links;     // links found on the page
    private bool m_linksparsed;     // true once m_links has been filled from m_html
    private string m_title;         // page title, "" until found
    private string m_html;          // raw HTML of the page
    private string m_outstr;        // cached plain text including link text; null until computed
    private string m_outstrnolink;  // cached plain text without link text; null until computed
    private bool m_good;            // whether the page was fetched successfully
    private int m_pagesize;         // length of m_html in characters

    // Cookie containers shared by all instances, keyed by host name.
    // The dictionary itself is the lock object for every access.
    private static readonly Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

    private string m_post;          // POST data sent to the login page
    private string m_loginurl;      // address of the login page
    #endregion

    #region Private methods
    /// <summary>
    /// Extracts the links from the page HTML on first use and returns the cached list afterwards.
    /// </summary>
    /// <returns>The links of the page; empty when there are none or the page has no valid address.</returns>
    private List<link> getlinks()
    {
        if (!m_linksparsed)
        {
            m_linksparsed = true;
            collectlinks(AnchorPattern, true);
            collectlinks(FramePattern, false);
        }
        return m_links;
    }

    /// <summary>
    /// Appends one link for every match of <paramref name="pattern"/> in the page HTML.
    /// </summary>
    /// <param name="pattern">Pattern with a "url" group and, when <paramref name="hastext"/> is true, a "text" group.</param>
    /// <param name="hastext">Whether the link text is taken from the "text" group; otherwise it is empty.</param>
    private void collectlinks(Regex pattern, bool hastext)
    {
        if (m_uri == null)
        {
            return;
        }
        for (Match match = pattern.Match(m_html); match.Success; match = match.NextMatch())
        {
            // Resolve relative addresses against the page address; skip anything unparsable.
            Uri target;
            if (!Uri.TryCreate(m_uri, match.Groups["url"].Value, out target))
            {
                continue;
            }
            string caption = "";
            if (hastext)
            {
                // Drop nested tags, all whitespace, &nbsp;, ampersands and double quotes.
                caption = LinkTextNoisePattern.Replace(match.Groups["text"].Value, "");
            }
            m_links.Add(new link(target.AbsoluteUri, caption));
        }
    }

    /// <summary>
    /// Returns the plain text of the page with scripts, styles, select boxes and tags removed.
    /// </summary>
    /// <param name="withlink">Whether the text inside anchor elements is kept.</param>
    /// <returns>The whole plain text; each variant is computed once and cached separately.</returns>
    private string getplaintext(bool withlink)
    {
        string cached = withlink ? m_outstr : m_outstrnolink;
        if (cached != null)
        {
            return cached;
        }

        string plain = ScriptPattern.Replace(m_html, "");
        plain = StylePattern.Replace(plain, "");
        plain = SelectPattern.Replace(plain, "");
        if (!withlink)
        {
            plain = AnchorBlockPattern.Replace(plain, "");
        }
        plain = TagOrNbspPattern.Replace(plain, "");
        plain = WhitespacePattern.Replace(plain, " ");

        if (withlink)
        {
            m_outstr = plain;
        }
        else
        {
            m_outstrnolink = plain;
        }
        return plain;
    }

    /// <summary>
    /// Returns at most the first <paramref name="firstn"/> characters of the plain text.
    /// </summary>
    /// <param name="firstn">Maximum number of characters; zero or less yields an empty string.</param>
    /// <param name="withlink">Whether the text inside anchor elements is kept.</param>
    /// <returns>The leading part of the plain text.</returns>
    private string getfirstnchar(int firstn, bool withlink)
    {
        if (firstn <= 0)
        {
            return "";
        }
        string plain = getplaintext(withlink);
        return plain.Length > firstn ? plain.Substring(0, firstn) : plain;
    }

    /// <summary>
    /// Converts an IPv4 address to its 32-bit big-endian unsigned value.
    /// </summary>
    /// <param name="x">An IPv4 address.</param>
    /// <returns>The address as an unsigned integer, first octet most significant.</returns>
    /// <exception cref="ArgumentException"><paramref name="x"/> is not an IPv4 address.</exception>
    private static uint getuintfromip(IPAddress x)
    {
        if (x.AddressFamily != System.Net.Sockets.AddressFamily.InterNetwork)
        {
            throw new ArgumentException("Only IPv4 addresses are supported.", "x");
        }
        byte[] bt = x.GetAddressBytes();
        return ((uint)bt[0] << 24) | ((uint)bt[1] << 16) | ((uint)bt[2] << 8) | bt[3];
    }

    /// <summary>
    /// Resolves the host of <paramref name="address"/> to its first IPv4 address.
    /// </summary>
    /// <param name="address">Absolute URL whose host is resolved.</param>
    /// <param name="resolved">Per-call cache of lookups, keyed by host; failed lookups are stored as null.</param>
    /// <returns>The IPv4 address, or null when the URL has no host or the lookup fails.</returns>
    private static IPAddress resolveipv4(string address, Dictionary<string, IPAddress> resolved)
    {
        Uri target;
        if (!Uri.TryCreate(address, UriKind.Absolute, out target) || target.Host.Length == 0)
        {
            // Host-less schemes such as mailto: must not be resolved:
            // an empty host name resolves to the local machine.
            return null;
        }

        IPAddress found;
        if (resolved.TryGetValue(target.Host, out found))
        {
            return found;
        }

        found = null;
        try
        {
            foreach (IPAddress candidate in Dns.GetHostEntry(target.Host).AddressList)
            {
                if (candidate.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork)
                {
                    found = candidate;
                    break;
                }
            }
        }
        catch (System.Net.Sockets.SocketException)
        {
            found = null;
        }
        catch (ArgumentException)
        {
            found = null;
        }
        resolved[target.Host] = found;
        return found;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or text matches <paramref name="pattern"/>.
    /// </summary>
    /// <param name="pattern">Regular expression, applied case-insensitively in multiline mode.</param>
    /// <param name="count">Maximum number of links returned.</param>
    /// <param name="byurl">True to test the link URL, false to test the link text.</param>
    /// <returns>The matching links in page order.</returns>
    private List<link> filterlinks(string pattern, int count, bool byurl)
    {
        Regex matcher = new Regex(pattern, HtmlOptions);
        List<link> speciallinks = new List<link>();
        foreach (link candidate in getlinks())
        {
            if (speciallinks.Count >= count)
            {
                break;
            }
            if (matcher.IsMatch(byurl ? candidate.url : candidate.text))
            {
                speciallinks.Add(candidate);
            }
        }
        return speciallinks;
    }

    /// <summary>
    /// Puts every per-page field into its initial state.
    /// </summary>
    private void reset()
    {
        m_links = new List<link>();
        m_linksparsed = false;
        m_html = "";
        m_outstr = null;
        m_outstrnolink = null;
        m_title = "";
        m_good = true;
        m_pagesize = 0;
    }

    /// <summary>
    /// Unescapes the URL and percent-encodes every run of non-Latin characters.
    /// </summary>
    /// <param name="_url">The URL as supplied by the caller.</param>
    /// <returns>The normalised URL.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="_url"/> is null.</exception>
    private static string normalizeurl(string _url)
    {
        if (_url == null)
        {
            throw new ArgumentNullException("_url");
        }
        try
        {
            _url = Uri.UnescapeDataString(_url);
        }
        catch (UriFormatException)
        {
            // Best effort: keep the URL as given when it cannot be unescaped.
        }
        return NonLatinPattern.Replace(_url, new MatchEvaluator(encodenonlatin));
    }

    /// <summary>
    /// Percent-encodes one run of non-Latin characters using GB2312, or UTF-8
    /// when GB2312 is not available on the platform.
    /// </summary>
    private static string encodenonlatin(Match run)
    {
        Encoding cding;
        try
        {
            cding = Encoding.GetEncoding("gb2312");
        }
        catch (ArgumentException)
        {
            cding = Encoding.UTF8;
        }
        catch (NotSupportedException)
        {
            cding = Encoding.UTF8;
        }
        return HttpUtility.UrlEncode(run.Value, cding);
    }

    /// <summary>
    /// Tells whether the URL ends with one of the extensions that are never downloaded.
    /// </summary>
    private static bool isskipped(string _url)
    {
        foreach (string extension in SkippedExtensions)
        {
            if (_url.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Returns the cookie container to use for a host and records it in the shared table.
    /// </summary>
    /// <param name="hostname">Host the request is sent to.</param>
    /// <param name="cookies">Container to register for the host, or null to reuse or create one.</param>
    private static CookieContainer getcookies(string hostname, CookieContainer cookies)
    {
        lock (webcookies)
        {
            if (cookies == null && !webcookies.TryGetValue(hostname, out cookies))
            {
                cookies = new CookieContainer();
            }
            webcookies[hostname] = cookies;
            return cookies;
        }
    }

    /// <summary>
    /// Reads a stream to its end.
    /// </summary>
    /// <param name="sm">Stream to read.</param>
    /// <param name="limit">Maximum number of bytes accepted.</param>
    /// <returns>The bytes read, or null when the stream holds more than <paramref name="limit"/> bytes.</returns>
    private static byte[] readbody(Stream sm, int limit)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = sm.Read(chunk, 0, chunk.Length)) > 0)
            {
                if (buffer.Length + read > limit)
                {
                    return null;
                }
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Looks up an encoding by name; undecodable bytes are decoded as U+FFFD.
    /// </summary>
    /// <returns>The encoding, or null when the name is empty or unknown.</returns>
    private static Encoding findencoding(string name)
    {
        if (name == null || name.Length == 0)
        {
            return null;
        }
        try
        {
            return Encoding.GetEncoding(name, EncoderFallback.ReplacementFallback, new DecoderReplacementFallback("\uFFFD"));
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Extracts the charset parameter of a Content-Type header value.
    /// </summary>
    /// <returns>The charset name without quotes, or null when the header carries none.</returns>
    private static string headercharset(string contenttype)
    {
        int ix = contenttype.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
        if (ix == -1)
        {
            return null;
        }
        string name = contenttype.Substring(ix + "charset=".Length);
        int end = name.IndexOf(';');
        if (end != -1)
        {
            name = name.Substring(0, end);
        }
        return name.Trim(' ', '\t', '"', '\'');
    }

    /// <summary>
    /// Decodes bytes with the given encoding. A byte order mark, when present,
    /// takes precedence over the encoding and is not part of the result.
    /// </summary>
    private static string decodewith(byte[] raw, Encoding cding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(raw), cding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Counts the occurrences of a character in a string.
    /// </summary>
    private static int countchar(string subject, char wanted)
    {
        int total = 0;
        foreach (char c in subject)
        {
            if (c == wanted)
            {
                total++;
            }
        }
        return total;
    }

    /// <summary>
    /// Decodes a response body, taking the charset from the Content-Type header,
    /// else from a charset declaration inside the document, else the system default.
    /// </summary>
    /// <param name="raw">Undecoded response body.</param>
    /// <param name="contenttype">Value of the Content-Type response header.</param>
    private static string decodehtml(byte[] raw, string contenttype)
    {
        Encoding cding = findencoding(headercharset(contenttype));
        if (cding != null)
        {
            return decodewith(raw, cding);
        }

        // The declaration itself is plain ASCII, so an ASCII view of the bytes is enough to find it.
        Match declared = MetaCharsetPattern.Match(Encoding.ASCII.GetString(raw));
        if (declared.Success)
        {
            cding = findencoding(declared.Groups["cding"].Value);
        }
        if (cding == null)
        {
            return decodewith(raw, Encoding.Default);
        }

        string decoded = decodewith(raw, cding);
        if (countchar(decoded, '\uFFFD') >= BadCharThreshold)
        {
            // Too many undecodable bytes: the declared charset is most likely wrong.
            return decodewith(raw, Encoding.Default);
        }
        return decoded;
    }

    /// <summary>
    /// Downloads the page and fills the fields of this instance.
    /// </summary>
    /// <param name="_url">Normalised address of the page.</param>
    /// <param name="cookies">Container to use and register for the host, or null to use the shared one.</param>
    /// <remarks>
    /// Never throws for download problems: the message is written to the console
    /// and the page is marked as not good. Only text/* responses of at most
    /// 4 MiB are accepted.
    /// </remarks>
    private void load(string _url, CookieContainer cookies)
    {
        // Reset first so the object is in a usable state even if the URL cannot be parsed.
        reset();
        try
        {
            m_uri = new Uri(_url);
            if (isskipped(_url))
            {
                m_good = false;
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = UserAgent;
            rqst.KeepAlive = true;
            rqst.Timeout = 30000;
            rqst.CookieContainer = getcookies(m_uri.Host, cookies);

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contenttype = rsps.ContentType ?? "";
                if (!contenttype.StartsWith("text/", StringComparison.OrdinalIgnoreCase) || rsps.ContentLength > MaxPageBytes)
                {
                    m_good = false;
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    // ContentLength is -1 when the server does not announce it,
                    // so the limit is enforced again while reading.
                    raw = readbody(sm, MaxPageBytes);
                }
                if (raw == null)
                {
                    m_good = false;
                    return;
                }

                m_html = decodehtml(raw, contenttype);
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri; // address after redirects
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message + (m_uri == null ? _url : m_uri.ToString()));
            m_good = false;
        }
    }

    /// <summary>
    /// Downloads the page using the cookie container shared for its host.
    /// </summary>
    /// <param name="_url">Normalised address of the page.</param>
    private void init(string _url)
    {
        load(_url, null);
    }

    /// <summary>
    /// Posts the login form and collects the cookies of the response.
    /// </summary>
    /// <param name="_loginurl">Address of the login page.</param>
    /// <param name="_post">Form data, already URL-encoded.</param>
    /// <returns>The cookies received, or null when the login request failed.</returns>
    private static CookieContainer login(string _loginurl, string _post)
    {
        byte[] bytes = Encoding.Default.GetBytes(_post);
        CookieContainer jar = new CookieContainer();
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(_loginurl);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.AllowAutoRedirect = false;
            request.UserAgent = UserAgent;
            request.Timeout = 60000;
            request.KeepAlive = true;
            request.ContentLength = bytes.Length;
            request.CookieContainer = jar;

            using (Stream body = request.GetRequestStream())
            {
                body.Write(bytes, 0, bytes.Length);
            }
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                foreach (Cookie ck in response.Cookies)
                {
                    jar.Add(ck);
                }
            }
            return jar;
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message + _loginurl);
            return null;
        }
    }
    #endregion

    #region Public methods
    /// <summary>
    /// Returns the leading part of the page's plain text, link text included.
    /// </summary>
    /// <param name="firstn">Maximum number of characters returned.</param>
    /// <returns>At most <paramref name="firstn"/> characters of plain text.</returns>
    public string getcontext(int firstn)
    {
        return getfirstnchar(firstn, true);
    }

    /// <summary>
    /// Returns the leading part of the page's plain text, link text excluded.
    /// </summary>
    /// <param name="firstn">Maximum number of characters returned.</param>
    /// <returns>At most <paramref name="firstn"/> characters of plain text.</returns>
    public string getcontextwithoutlink(int firstn)
    {
        return getfirstnchar(firstn, false);
    }

    /// <summary>
    /// Returns links of this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression, applied case-insensitively in multiline mode.</param>
    /// <param name="count">Maximum number of links returned.</param>
    /// <returns>The matching links in page order.</returns>
    public List<link> getspeciallinksbyurl(string pattern, int count)
    {
        return filterlinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links of this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression, applied case-insensitively in multiline mode.</param>
    /// <param name="count">Maximum number of links returned.</param>
    /// <returns>The matching links in page order.</returns>
    public List<link> getspeciallinksbytext(string pattern, int count)
    {
        return filterlinks(pattern, count, false);
    }

    /// <summary>
    /// Returns the links whose host resolves to an IPv4 address inside an inclusive range.
    /// </summary>
    /// <param name="_ip_start">First IPv4 address of the range.</param>
    /// <param name="_ip_end">Last IPv4 address of the range.</param>
    /// <returns>The matching links; links whose host cannot be resolved to IPv4 are skipped.</returns>
    /// <exception cref="FormatException">A bound is not a valid IP address.</exception>
    /// <exception cref="ArgumentException">A bound is not an IPv4 address.</exception>
    public List<link> getspeciallinksbyip(string _ip_start, string _ip_end)
    {
        uint low = getuintfromip(IPAddress.Parse(_ip_start));
        uint high = getuintfromip(IPAddress.Parse(_ip_end));

        List<link> speciallinks = new List<link>();
        // Each host is looked up once per call, however many links point to it.
        Dictionary<string, IPAddress> resolved = new Dictionary<string, IPAddress>(StringComparer.OrdinalIgnoreCase);
        foreach (link candidate in getlinks())
        {
            IPAddress ip = resolveipv4(candidate.url, resolved);
            if (ip == null)
            {
                continue;
            }
            uint numeric = getuintfromip(ip);
            if (numeric >= low && numeric <= high)
            {
                speciallinks.Add(candidate);
            }
        }
        return speciallinks;
    }

    /// <summary>
    /// Searches the page's plain text (link text included) with a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression whose first capturing group is the wanted text.</param>
    /// <returns>
    /// The value of the first capturing group of the first match; an empty
    /// string when nothing matches or the pattern has no capturing group.
    /// </returns>
    public string getspecialwords(string pattern)
    {
        Match mc = new Regex(pattern, HtmlOptions).Match(getplaintext(true));
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }
    #endregion

    #region Constructors
    /// <summary>
    /// Downloads the page at the given address.
    /// </summary>
    /// <param name="_url">Address of the page; non-Latin characters are percent-encoded automatically.</param>
    /// <exception cref="ArgumentNullException"><paramref name="_url"/> is null.</exception>
    public webpage(string _url)
    {
        init(normalizeurl(_url));
    }

    /// <summary>
    /// Logs in by posting a form, then downloads the page with the cookies received.
    /// </summary>
    /// <param name="_url">Address of the page; non-Latin characters are percent-encoded automatically.</param>
    /// <param name="_loginurl">Address the login form is posted to.</param>
    /// <param name="_post">URL-encoded form data for the login.</param>
    /// <remarks>
    /// The login is skipped when the login address or form data is blank, when
    /// cookies for the page's host are already stored, or when the page address
    /// is invalid. If the login request fails the page is still downloaded,
    /// without login cookies.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="_url"/> is null.</exception>
    public webpage(string _url, string _loginurl, string _post)
    {
        _url = normalizeurl(_url);

        Uri target;
        bool skiplogin = !Uri.TryCreate(_url, UriKind.Absolute, out target)
            || _loginurl == null || _loginurl.Trim().Length == 0
            || _post == null || _post.Trim().Length == 0;
        if (!skiplogin)
        {
            lock (webcookies)
            {
                skiplogin = webcookies.ContainsKey(target.Host);
            }
        }
        if (skiplogin)
        {
            init(_url);
            return;
        }

        m_post = _post;
        m_loginurl = _loginurl;
        // A null result (failed login) makes load fall back to the shared container.
        load(_url, login(_loginurl, _post));
    }
    #endregion

    #region Properties
    /// <summary>
    /// Address of the page after redirects; empty when the address could not be parsed. Read-only.
    /// </summary>
    public string url
    {
        get
        {
            return m_uri == null ? "" : m_uri.AbsoluteUri;
        }
    }

    /// <summary>
    /// Trimmed content of the page's title element; empty when there is none. Read-only.
    /// </summary>
    public string title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitlePattern.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// All links found on the page: anchors first, then frames and iframes. Read-only.
    /// </summary>
    public List<link> links
    {
        get
        {
            return getlinks();
        }
    }

    /// <summary>
    /// The whole plain text of the page, link text included. Read-only.
    /// </summary>
    public string context
    {
        get
        {
            return getplaintext(true);
        }
    }

    /// <summary>
    /// Size of the page, measured in characters of decoded HTML.
    /// </summary>
    public int pagesize
    {
        get
        {
            return m_pagesize;
        }
    }

    /// <summary>
    /// The links that point to the same host as this page, over http or https.
    /// </summary>
    public List<link> insitelinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<link>();
            }
            // Escape the host so its dots are literal, and require the host to end
            // right after it so that "host.example.org" does not count as "host".
            return getspeciallinksbyurl("^https?://" + Regex.Escape(m_uri.Host) + "(?=[:/?#]|$)", int.MaxValue);
        }
    }

    /// <summary>
    /// Whether the page was downloaded successfully.
    /// </summary>
    public bool isgood
    {
        get
        {
            return m_good;
        }
    }

    /// <summary>
    /// Host name of the page; empty when the address could not be parsed.
    /// </summary>
    public string host
    {
        get
        {
            return m_uri == null ? "" : m_uri.Host;
        }
    }

    /// <summary>
    /// POST data used for the login page; null when no login was attempted.
    /// </summary>
    public string poststr
    {
        get
        {
            return m_post;
        }
    }

    /// <summary>
    /// Address of the login page; null when no login was attempted.
    /// </summary>
    public string loginurl
    {
        get
        {
            return m_loginurl;
        }
    }
    #endregion
}
/// <summary>
/// A hyperlink found on a page: its target address and its visible text.
/// </summary>
public class link
{
    /// <summary>Address the link points to.</summary>
    public string url;

    /// <summary>Text displayed for the link.</summary>
    public string text;

    /// <summary>
    /// Creates a link from a target address and a caption.
    /// </summary>
    /// <param name="_url">Address the link points to.</param>
    /// <param name="_text">Text displayed for the link.</param>
    public link(string _url, string _text)
    {
        this.text = _text;
        this.url = _url;
    }
}

希望本文所述对大家的C#程序设计有所帮助。

如您对本文有疑问或者有任何想说的,请 点击进行留言回复,万千网友为您解惑!

相关文章:

验证码:
移动技术网