最近对爬虫很感兴趣,稍微研究了一下,利用 HtmlAgilityPack 制作了一个十分简单的爬虫。这个简易爬虫只能获取静态页面的 HTML。
HtmlAgilityPack 是一个解析速度很快的开源 HTML 解析工具,支持使用 XPath 解析 HTML,让我们解析 HTML 文档就像解析 XML 文档一样轻松、方便。
// Load the page over HTTP and parse it into a document tree.
string url = "https://www.bilibili.com";
HtmlWeb web = new HtmlWeb();
HtmlDocument hd = web.Load(url);
需要 using System.Net
和 using System.IO
/// <summary>
/// Image downloader.
/// </summary>
public class imgdownloader
{
    /// <summary>
    /// Downloads one image into <paramref name="folderpath"/>, creating the folder first
    /// when it does not exist. Download failures are reported on the console rather than
    /// thrown, so one bad URL does not stop the caller's loop.
    /// </summary>
    /// <param name="webclient">Client used to perform the download.</param>
    /// <param name="url">
    /// Image URL. A URL without an http/https scheme (for example the protocol-relative
    /// form "//host/path") gets "https:" prepended. Site-relative paths are not resolved
    /// here because the page address is not known to this method.
    /// </param>
    /// <param name="folderpath">Destination folder.</param>
    /// <param name="filename">File name to save the image as.</param>
    public static void downloadimg(WebClient webclient, string url, string folderpath, string filename)
    {
        // CreateDirectory does nothing when the folder already exists,
        // so no separate existence check is needed.
        Directory.CreateDirectory(folderpath);

        // Only the start of the URL decides whether a scheme is present; searching the
        // whole string would be fooled by "http:" appearing inside a query parameter.
        bool hasscheme = url.StartsWith("https:", StringComparison.OrdinalIgnoreCase)
                      || url.StartsWith("http:", StringComparison.OrdinalIgnoreCase);
        if (!hasscheme)
        {
            url = "https:" + url;
        }

        try
        {
            // Path.Combine inserts the separator when folderpath lacks a trailing one.
            webclient.DownloadFile(url, Path.Combine(folderpath, filename));
            Console.WriteLine(filename + "下载成功");
        }
        catch (Exception ex)
        {
            // Best effort: report the failure together with the offending URL and carry on.
            Console.Write(ex.Message);
            Console.WriteLine(url);
        }
    }
}
string imgpath = "//img"; // XPath selecting every <img> element
int imgnum = 0;           // running image number, used as the file name

// Download the image referenced by the src attribute of each <img> element.
// SelectNodes returns null (not an empty collection) when nothing matches,
// so it must be checked before iterating.
HtmlNodeCollection imgnodes = hd.DocumentNode.SelectNodes(imgpath);
if (imgnodes != null)
{
    foreach (HtmlNode node in imgnodes)
    {
        HtmlAttribute src = node.Attributes["src"];
        if (src == null || string.IsNullOrWhiteSpace(src.Value))
        {
            continue;
        }

        string imgurl = src.Value.Trim();

        // Take the extension from the last path segment only, ignoring any query
        // string or fragment; a URL without an extension yields an empty suffix
        // instead of throwing.
        string urlpath = imgurl.Split('?', '#')[0];
        int dot = urlpath.LastIndexOf('.');
        string extension = dot > urlpath.LastIndexOf('/') ? urlpath.Substring(dot) : "";

        imgnum++;
        string filename = imgnum + extension;
        imgdownloader.downloadimg(wc, imgurl, "images/", filename);
    }
}
// Download background images declared in inline style attributes.
string bgimgpath = "//*[@style]"; // XPath selecting every element that has a style attribute

// Captures the argument of url(...) belonging to background-image, tolerating
// optional whitespace and optional single/double quotes around the URL.
Regex bgurlregex = new Regex(
    @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")]+)['""]?\s*\)",
    RegexOptions.IgnoreCase);

// SelectNodes returns null (not an empty collection) when nothing matches.
HtmlNodeCollection stylenodes = hd.DocumentNode.SelectNodes(bgimgpath);
if (stylenodes != null)
{
    foreach (HtmlNode node in stylenodes)
    {
        Match match = bgurlregex.Match(node.Attributes["style"].Value);
        if (!match.Success)
        {
            continue;
        }

        string bgimgurl = match.Groups["url"].Value.Trim();
        if (bgimgurl.Length == 0)
        {
            continue;
        }

        // Take the extension from the last path segment only, ignoring any query
        // string or fragment; a URL without an extension yields an empty suffix.
        string urlpath = bgimgurl.Split('?', '#')[0];
        int dot = urlpath.LastIndexOf('.');
        string extension = dot > urlpath.LastIndexOf('/') ? urlpath.Substring(dot) : "";

        // Number the image only once a usable URL has been found.
        imgnum++;
        string filename = imgnum + extension;
        imgdownloader.downloadimg(wc, bgimgurl, "images/bgcimg/", filename);
    }
}
using System;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;

namespace webcrawlerdemo
{
    class program
    {
        /// <summary>
        /// Downloads the page, then saves every image referenced by an &lt;img&gt; tag
        /// and every inline-style background image into the "images" folders.
        /// </summary>
        static void Main(string[] args)
        {
            // WebClient is disposable; release it once all downloads are finished.
            using (WebClient wc = new WebClient())
            {
                string url = "https://www.bilibili.com";
                HtmlWeb web = new HtmlWeb();
                HtmlDocument hd = web.Load(url); // download and parse the HTML page

                string imgpath = "//img"; // XPath selecting every <img> element
                int imgnum = 0;           // running image number, used as the file name

                // Images referenced by <img src="...">.
                // SelectNodes returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection imgnodes = hd.DocumentNode.SelectNodes(imgpath);
                if (imgnodes != null)
                {
                    foreach (HtmlNode node in imgnodes)
                    {
                        HtmlAttribute src = node.Attributes["src"];
                        if (src == null || string.IsNullOrWhiteSpace(src.Value))
                        {
                            continue;
                        }

                        string imgurl = src.Value.Trim();

                        // Extension comes from the last path segment only, ignoring any
                        // query string or fragment; no extension yields an empty suffix.
                        string urlpath = imgurl.Split('?', '#')[0];
                        int dot = urlpath.LastIndexOf('.');
                        string extension = dot > urlpath.LastIndexOf('/') ? urlpath.Substring(dot) : "";

                        imgnum++;
                        string filename = imgnum + extension;
                        imgdownloader.downloadimg(wc, imgurl, "images/", filename);
                    }
                }

                // Background images declared in inline style attributes.
                string bgimgpath = "//*[@style]"; // XPath selecting every element with a style attribute

                // Captures the argument of url(...) belonging to background-image, tolerating
                // optional whitespace and optional single/double quotes around the URL.
                Regex bgurlregex = new Regex(
                    @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")]+)['""]?\s*\)",
                    RegexOptions.IgnoreCase);

                HtmlNodeCollection stylenodes = hd.DocumentNode.SelectNodes(bgimgpath);
                if (stylenodes != null)
                {
                    foreach (HtmlNode node in stylenodes)
                    {
                        Match match = bgurlregex.Match(node.Attributes["style"].Value);
                        if (!match.Success)
                        {
                            continue;
                        }

                        string bgimgurl = match.Groups["url"].Value.Trim();
                        if (bgimgurl.Length == 0)
                        {
                            continue;
                        }

                        string urlpath = bgimgurl.Split('?', '#')[0];
                        int dot = urlpath.LastIndexOf('.');
                        string extension = dot > urlpath.LastIndexOf('/') ? urlpath.Substring(dot) : "";

                        // Number the image only once a usable URL has been found.
                        imgnum++;
                        string filename = imgnum + extension;
                        imgdownloader.downloadimg(wc, bgimgurl, "images/bgcimg/", filename);
                    }
                }
            }

            Console.WriteLine("----------end----------");
            Console.ReadKey();
        }
    }

    /// <summary>
    /// Image downloader.
    /// </summary>
    public class imgdownloader
    {
        /// <summary>
        /// Downloads one image into <paramref name="folderpath"/>, creating the folder first
        /// when it does not exist. Download failures are reported on the console rather than
        /// thrown, so one bad URL does not stop the caller's loop.
        /// </summary>
        /// <param name="webclient">Client used to perform the download.</param>
        /// <param name="url">
        /// Image URL. A URL without an http/https scheme (for example the protocol-relative
        /// form "//host/path") gets "https:" prepended. Site-relative paths are not resolved
        /// here because the page address is not known to this method.
        /// </param>
        /// <param name="folderpath">Destination folder.</param>
        /// <param name="filename">File name to save the image as.</param>
        public static void downloadimg(WebClient webclient, string url, string folderpath, string filename)
        {
            // CreateDirectory does nothing when the folder already exists,
            // so no separate existence check is needed.
            Directory.CreateDirectory(folderpath);

            // Only the start of the URL decides whether a scheme is present; searching the
            // whole string would be fooled by "http:" appearing inside a query parameter.
            bool hasscheme = url.StartsWith("https:", StringComparison.OrdinalIgnoreCase)
                          || url.StartsWith("http:", StringComparison.OrdinalIgnoreCase);
            if (!hasscheme)
            {
                url = "https:" + url;
            }

            try
            {
                // Path.Combine inserts the separator when folderpath lacks a trailing one.
                webclient.DownloadFile(url, Path.Combine(folderpath, filename));
                Console.WriteLine(filename + "下载成功");
            }
            catch (Exception ex)
            {
                // Best effort: report the failure together with the offending URL and carry on.
                Console.Write(ex.Message);
                Console.WriteLine(url);
            }
        }
    }
}
如对本文有疑问, 点击进行留言回复!!
使用Visual Studio2019创建C#项目(窗体应用程序、控制台应用程序、Web应用程序)
C#实现获取本地内网(局域网)和外网(公网)IP地址的方法分析
网友评论