当前位置: 移动技术网 > IT编程>开发语言>.net > 用DOM实现文章采集--采集到网页源码

用DOM实现文章采集--采集到网页源码

2019年04月19日  | 移动技术网IT编程  | 我要评论

湖南省新宁县,高田三七,李攀教你一招

先给出一段采集网页源码的 C# 代码。
[csharp]
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Text;
namespace topwincms.common 

    public class nethelper 
    { 
 
        //private string _http_user_agent = "mozilla/4.0+(compatible;+msie+6.0;+windows+nt+5.2;+sv1;+.net+clr+1.1.4322;+.net+clr+2.0.50727)"; 
        private string _useragent = "googlebot/2.1 (+https://www.google.com/bot.html)"; 
        private encoding _httpencoding = null; 
        private string _proxyhost = string.empty; 
        private int _proxyint = 8080; 
        private int _timeout = 200000; 
 
        #region 属性 
        /// <summary> 
        /// 设置useragent 
        /// </summary> 
        public string useragent 
        { 
            get 
            { 
                return this._useragent; 
            } 
            set 
            { 
                this._useragent = value; 
            } 
        } 
        /// <summary> 
        /// 设置编码 
        /// </summary> 
        public encoding httpencoding 
        { 
            get 
            { 
                return this._httpencoding; 
            } 
            set 
            { 
                this._httpencoding = value; 
            } 
        } 
        /// <summary> 
        /// 设置代理服务器 
        /// </summary> 
        public string proxyhost 
        { 
            get 
            { 
                return this._proxyhost; 
            } 
            set 
            { 
                this._proxyhost = value; 
            } 
        } 
        /// <summary> 
        /// 设置代理服务器端口 
        /// </summary> 
        public int proxyint 
        { 
            get 
            { 
                return this._proxyint; 
            } 
            set 
            { 
                this._proxyint = value; 
            } 
        } 
        /// <summary> 
        /// 设置默认超时时间 
        /// </summary> 
        public int timeout 
        { 
            get 
            { 
                return this._timeout; 
            } 
            set 
            { 
                this._timeout = value; 
            } 
        } 
        #endregion 
 
        public remoteres get(string uri) 
        { 
            return get(new uri(uri)); 
        } 
        public remoteres get(uri uri) 
        { 
            remoteres info = new remoteres(); 
 
            httpwebrequest request = (httpwebrequest)webrequest.create(uri); 
            request.timeout = this._timeout; 
            request.useragent = this._useragent; 
            request.method = "get";  
            request.referer = string.concat("https://", uri.host); 
 
            if (this._proxyhost.length > 0) 
            { 
                request.proxy = new webproxy(this._proxyhost, this._proxyint); 
            } 
            httpwebresponse response = null; 
            stream responsestream = null; 
            try 
            { 
                encoding encoding; 
                response = (httpwebresponse)request.getresponse(); 
                responsestream = response.getresponsestream(); 
               
                if (response.headers["accept-encoding"] != null) 
                { 
                    if (mycollections.contain(response.headers["accept-encoding"], "*", "gzip", "x-gzip")) 
                    { 
                        responsestream = new gzipstream(responsestream, compressionmode.decompress); 
                    } 
                } 
                else if (response.headers["content-encoding"] != null) 
                { 
                    if (mycollections.contain(response.headers["content-encoding"], "*", "gzip", "x-gzip")) 
                    { 
                        responsestream = new gzipstream(responsestream, compressionmode.decompress); 
                    } 
                } 
                
                if (this._httpencoding == null) 
                { 
                    string str = response.characterset.tolower(); 
                    if (str.length > 3) 
                    { 
                        if (str.substring(0, 3) == "iso") 
                        { 
                            encoding = encoding.default; 
                        } 
                        else 
                        { 
                            encoding = encoding.getencoding(response.characterset); 
                        } 
                    } 
                    else 
                    { 
                        encoding = encoding.getencoding(response.characterset); 
                    } 
                    if (str.length == 0) 
                    { 
                        encoding = encoding.utf8; 
                    } 
                } 
                else 
                { 
                    encoding = this._httpencoding; 
                } 
                info.html = new streamreader(responsestream, encoding).readtoend(); 
                info.contenttype = response.contenttype; 
                info.statuscode = response.statuscode; 
 
            } 
            catch (webexception we) 
            { 
                if (we.response != null) 
                { 
                    info.statuscode = (we.response as httpwebresponse).statuscode; 
                } 
                else 
                { 
                    info.statuscode = httpstatuscode.serviceunavailable; 
                } 
                info.code = "错误:" + we.message; 
 
            } 
            catch (exception ex) 
            { 
                info.code = "错误:" + ex.message; 
                info.statuscode = httpstatuscode.internalservererror; 
            } 
            finally 
            { 
                if (responsestream != null) 
                    responsestream.close(); 
                if (response != null) 
                    response.close(); 
            } 
 
            return info; 
        } 
 
        #region 取得远程资源 
        /// <summary> 
        /// 取得远程资源   
        /// </summary> 
        /// <param name="strurl">要取的url</param> 
        /// <returns>网页源代码</returns> 
        public remoteres getremoteresource(string strurl) 
        { 
            httpwebresponse response = null; 
            stream stream = null; 
            remoteres info = new remoteres(); 
            try 
            { 
                httpwebrequest request = (httpwebrequest)webrequest.create(strurl); 
                request.allowautoredirect = true; 
                request.useragent = "mozilla/4.0 (compatible; msie 7.0; windows nt 6.0; slcc1; .net clr 2.0.50727; .net clr 3.0.04506)"; 
                request.referer = "https://" + new uri(strurl).host; 
                response = request.getresponse() as httpwebresponse; 
                stream = response.getresponsestream(); 
                info.contenttype = response.contenttype; 
                memorystream ms = new memorystream(); 
 
                byte[] buffer = new byte[256]; 
 
                int c = stream.read(buffer, 0, buffer.length); 
 
                while (c > 0) 
                { 
                    ms.write(buffer, 0, c); 
                    c = stream.read(buffer, 0, buffer.length); 
                } 
                stream.close(); 
 
                info.statuscode = response.statuscode; 
 
                info.bytes = ms.toarray(); 
 
            } 
            catch (webexception we) 
            { 
                if (we.response != null) 
                { 
                    info.statuscode = (we.response as httpwebresponse).statuscode; 
                } 
                else 
                { 
                    info.statuscode = httpstatuscode.serviceunavailable; 
                } 
 
                return null; 
            } 
            catch 
            { 
                info.statuscode = httpstatuscode.internalservererror; 
 
                return null; 
            } 
            finally 
            { 
                if (stream != null) 
                    stream.close(); 
 
                if (response != null) 
                    response.close(); 
            } 
            return info; 
        } 
        #endregion 
 
 
        public remoteres post(string strurl, string postdata) 
        { 
            remoteres info = new remoteres(); 
            stream responsestream = null; 
            httpwebresponse response = null; 
            try 
            { 
                byte[] bytes = this._httpencoding.getbytes(postdata); 
                httpwebrequest request = (httpwebrequest)webrequest.create(strurl); 
                request.method = "post"; 
                request.contenttype = "application/x-www-form-urlencoded"; 
                request.contentlength = bytes.length; 
                request.timeout = this._timeout; 
                request.useragent = this._useragent;  
                //request.referer = string.concat("https://", uri.host); 
                if (this._proxyhost.length > 0) 
                { 
                    request.proxy = new webproxy(this._proxyhost, this._proxyint); 
                } 
                using (stream requeststream = request.getrequeststream()) 
                { 
                    requeststream.write(bytes, 0, bytes.length); 
                    requeststream.close(); 
                } 
                try 
                { 
                    encoding encoding; 
                    response = (httpwebresponse)request.getresponse(); 
                    responsestream = response.getresponsestream(); 
                    if (this._httpencoding == null) 
                    { 
                        string str = response.characterset.tolower(); 
                        if (str.length > 3) 
                        { 
                            if (str.substring(0, 3) == "iso") 
                            { 
                                encoding = encoding.default; 
                            } 
                            else 
                            { 
                                encoding = encoding.getencoding(response.characterset); 
                            } 
                        } 
                        else 
                        { 
                            encoding = encoding.getencoding(response.characterset); 
                        } 
                        if (str.length == 0) 
                        { 
                            encoding = encoding.default; 
                        } 
                    } 
                    else 
                    { 
                        encoding = this._httpencoding; 
                    } 
                    info.html = new streamreader(responsestream, encoding).readtoend(); 
                    info.statuscode = httpstatuscode.ok; 
 
                    responsestream.close(); 
                    response.close(); 
                    return info; 
                } 
                catch (exception ex) 
                { 
                    info.html = "错误:" + ex.message; 
                } 
 
            } 
            catch (exception ex) 
            { 
                info.html = "错误:" + ex.message; 
            } 
            finally 
            { 
                if (responsestream != null) 
                    responsestream.close(); 
                if (response != null) 
                    response.close(); 
            } 
            return info; 
        } 
 
        #region 检查链接 
        /// <summary> 
        /// 检查链接是否存在 
        /// </summary> 
        /// <param name="surl"></param> 
        /// <param name="allowbadnum"></param> 
        public bool urlexist(string strurl) 
        { 
            httpwebrequest request = (httpwebrequest)webrequest.create(strurl); 
            request.method = "head"; 
            request.allowautoredirect = false; 
            request.useragent = "mozilla/4.0 (compatible; msie 7.0; windows nt 6.0; slcc1; .net clr 2.0.50727; .net clr 3.0.04506; .net clr 3.5.21022; .net clr 1.0.3705; .net clr 1.1.4322)"; 
            httpwebresponse response = (httpwebresponse)request.getresponse(); 
            if (response.statuscode != httpstatuscode.ok) 
            { 
                response.close(); 
                return false; 
            } 
            else 
            { 
                return true; 
            } 
 
 
        } 
        /// <summary> 
        /// 检查死链接是否在能容忍的数量内 
        /// </summary> 
        /// <param name="urls"></param> 
        /// <param name="allowbadnum"></param> 
        /// <returns></returns> 
        public bool urlexist(list<string> urls, int allowbadnum) 
        { 
            //如果图片的数量小于能容忍的数量就不用检查了。 
            if (urls.count <= allowbadnum) 
            { 
                return true; 
            } 
            int inttemp = 0; 
            foreach (string strurl in urls) 
            { 
                if (urlexist(strurl) == false) 
                { 
                    inttemp++; 
                    if (inttemp > allowbadnum) 
                    { 
                        return false; 
                    } 
                } 
            } 
            return true; 
        } 
        #endregion 
    } 
 
    public class remoteres 
    { 
        private string _code; 
        private string _html; 
        private byte[] _bytes; 
        private string _contenttype; 
        private httpstatuscode _statuscode; 
        /// <summary> 
        /// 返回信息的代码 
        /// </summary> 
        public string code 
        { 
            get 
            { 
                return this._code; 
            } 
            set 
            { 
                this._code = value; 
            } 
        } 
        /// <summary> 
        /// 信息 
        /// </summary> 
        public string html 
        { 
            get 
            { 
                return this._html; 
            } 
            set 
            { 
                this._html = value; 
            } 
        } 
        /// <summary> 
        /// 远程资源 
        /// </summary> 
        public byte[] bytes 
        { 
            get 
            { 
                return this._bytes; 
            } 
            set 
            { 
                this._bytes = value; 
            } 
        } 
        /// <summary> 
        /// 内容类型 
        /// </summary> 
        public string contenttype 
        { 
            get 
            { 
                return this._contenttype; 
            } 
            set 
            { 
                this._contenttype = value; 
            } 
        } 
        /// <summary> 
        /// 状态代码 
        /// </summary> 
        public httpstatuscode statuscode 
        { 
            get 
            { 
                return this._statuscode; 
            } 
            set 
            { 
                this._statuscode = value; 
            } 
        } 
    } 




摘自 winner2050的专栏

如对本文有疑问,请在下方留言讨论,热心网友会与你互动!点击进行留言回复

相关文章:

验证码:
移动技术网