当前位置: 移动技术网 > IT编程>开发语言>.net > 采集网页图片代码

采集网页图片代码

2019年04月19日  | 移动技术网IT编程  | 我要评论
采集网页上图片的主要关键是在怎么解析出页面代码里那些img标签的src属性,在网上找了下大多都是通过字符串操作找出img标签,这种方式操作起来比较麻烦,而且代码看起来比较累。这里我

采集网页上的图片,关键在于如何解析出页面代码中各个 img 标签的 src 属性。网上的做法大多是通过字符串操作找出 img 标签,这种方式操作起来比较麻烦,代码读起来也比较累。这里我用的方法是通过 WebBrowser 控件来加载页面,然后用 HtmlDocument 类来操作,省去了字符串操作的步骤:直接调用 GetElementsByTagName("img"),把页面上所有的 img 元素返回到一个 HtmlElementCollection 对象里,再逐个读取它们的 src 属性。
代码如下:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
namespace windowsformsapplication1
{
    public class gatherpic
    {
        private string savepath;
        private string geturl;
        private webbrowser wb;
        private int iimgcount;
        //初始化参数
        public gatherpic(string sweburl, string ssavepath)
        {
            this.geturl = sweburl;
            this.savepath = ssavepath;
        }
        //开始采集
        public bool start()
        {
            if (geturl.trim().equals(""))
            {
                messagebox.show("哪来的虾米连网址都没输!");
                return false;
            }
            this.wb = new webbrowser();
            this.wb.navigate(geturl);
            //委托事件
            this.wb.documentcompleted += new system.windows.forms.webbrowserdocumentcompletedeventhandler(documentcompleted);
            return true;
        }
        //webbrowser.documentcompleted委托事件
        private void documentcompleted(object sender, webbrowserdocumentcompletedeventargs e)
        {
            //页面里框架iframe加载完成不掉用searchimglist()
            if (e.url != wb.document.url) return;
            searchimglist();
        }
        //检查出所有图片并采集到本地
        public void searchimglist()
        {
            string simgurl;
            //取得所有图片地址
            htmlelementcollection elemcoll = this.wb.document.getelementsbytagname("img");
            this.iimgcount = elemcoll.count;
            foreach (htmlelement elem in elemcoll)
            {
                simgurl = elem.getattribute("src");
                //调用保存远程图片函数
                saveimagefromweb(simgurl, this.savepath);
            }
        }
        //保存远程图片函数
        public int saveimagefromweb(string imgurl, string path)
        {
            string imgname = imgurl.tostring().substring(imgurl.tostring().lastindexof("/") + 1);
            path = path + "\\" + imgname;
            string defaulttype = ".jpg";
            string[] imgtypes = new string[] { ".jpg", ".jpeg", ".png", ".gif", ".bmp" };
            string imgtype = imgurl.tostring().substring(imgurl.tostring().lastindexof("."));
            foreach (string it in imgtypes)
            {
                if (imgtype.tolower().equals(it))
                    break;
                if (it.equals(".bmp"))
                    imgtype = defaulttype;
            }
            try
            {
                httpwebrequest request = (httpwebrequest)webrequest.create(imgurl);
                request.useragent = "mozilla/6.0 (msie 6.0; windows nt 5.1; natas.robot)";
                request.timeout = 10000;
                webresponse response = request.getresponse();
                stream stream = response.getresponsestream();
                if (response.contenttype.tolower().startswith("image/"))
                {
                    byte[] arraybyte = new byte[1024];
                    int imglong = (int)response.contentlength;
                    int l = 0;
                    // createdirectory(path);
                    filestream fso = new filestream(path, filemode.create);
                    while (l < imglong)
                    {
                        int i = stream.read(arraybyte, 0, 1024);
                        fso.write(arraybyte, 0, i);
                        l += i;
                    }
                    fso.close();
                    stream.close();
                    response.close();
                    return 1;
                }
                else
                {
                    return 0;
                }
            }
            catch (webexception)
            {
                return 0;
            }
            catch (uriformatexception)
            {
                return 0;
            }
        }
    }
}
//----------------- Usage --------------------
// Plain double quotes are required; the verbatim string keeps "\t" from being read as a tab escape.
gatherpic gatherpic = new gatherpic("https://www.baidu.com", @"c:\test");
// Make sure the folder c:\test already exists before running.
gatherpic.start();

 

 

摘自 与时俱进

如您对本文有疑问或者有任何想说的,请点击进行留言回复,万千网友为您解惑!

相关文章:

验证码:
移动技术网