当前位置：移动技术网 > IT编程>开发语言>c# > C#通过正则表达式实现提取网页中的图片

C#通过正则表达式实现提取网页中的图片

2019年07月18日 | 移动技术网IT编程 | 我要评论

目前在做项目中有处理图片的部分，参考了一下网上案例，自己写了一个获取内容中的图片地址的方法。

一般来说一个 html 文档有很多标签，比如“<html>”、“<body>”、“<table>”等，想把文档中的 img 标签提取出来并不是一件容易的事。由于 img 标签样式变化多端，使提取的时候用程序寻找并不容易。于是想要寻找它们就必须写一个非常健全的正则表达式，不然有可能会找得不全，或者找出来的不是正确的 img 标签。

我们可以从 html 标签的格式去想应该怎么建这个正则表达式。首先要想一下 img 标签有几种写法，忽略大小写不看的话，下面列出 img 标签可能出现的几种情况。
<img> <img/> <img src=/>

这一些标签不用考虑，因为没有图片资源地址。
<img src = /images/pic.jpg/ > <img src =" /images/pic.jpg" > <img src= '/images/pic.jpg ' / >

这一些标签都有图片资源地址，另外还有一个特点就是有引号对，可能为单引号，也可能为双引号。因为不需要同时匹配引号对，所以正则表达式可以这么写：@"<img\s*src\s*=\s*[""']?\s*(?<imgurl>[^\s""'<>]*)\s*/?\s*>"
<img width="320" height="240" src=/images/pic.jpg onclick="window.open('/images/pic.jpg')">

因为 img 和 src 之间可能会有其他的参数，所以“<img”要有个单词结束，比如说不能是“<imgabc”，同样 src 前面也是一样，使用单词结束符“\b”有一个好处就是省去了表示空格的“\s*”。另外由于 img 标签中不可以出现“<”、“>”这样的符号，所以要改写前面的正则表达式：@"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgurl>[^\s""'<>]*)[^<>]*?/?\s*>"
<img width="320" height="240" src = "
/images/pic.jpg" />

像这种可能会用回车符折行的问题有时候会出现，所以在有空格分开的地方要包含回车换行和 tab 字符，另外在图片地址中不能出现空格、tab、回车和换行字符。

所以上面的正则表达式可以改成：@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgurl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>"

下面写出取得html中所有图片地址的类hvthtmlimage：

using System.Text.RegularExpressions;
namespace hovertree.hovertreeframe.hvtimage
{
    /// <summary>
    /// Extracts image URLs from HTML markup by matching <c>img</c> tags with a regular expression.
    /// </summary>
    public class hvthtmlimage
    {
        // Built once and reused for every call instead of being re-created per invocation.
        // Matches an <img ...> tag (case-insensitively), tolerates other attributes before and
        // after src, optional single/double quotes, and whitespace or line breaks around '='.
        // The URL itself is captured in the named group "imgurl".
        private static readonly Regex s_imgTagRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgurl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Gets the URLs of all images referenced by <c>img</c> tags in the given HTML.
        /// </summary>
        /// <param name="shtmltext">The HTML markup to scan. May be null or empty.</param>
        /// <returns>
        /// One entry per matched <c>img</c> tag, in document order (duplicates are kept).
        /// An empty array when <paramref name="shtmltext"/> is null or empty.
        /// </returns>
        public static string[] gethvtimgurls(string shtmltext)
        {
            // Regex.Matches throws on null input; treat "no markup" as "no images".
            if (string.IsNullOrEmpty(shtmltext))
            {
                return new string[0];
            }

            MatchCollection matches = s_imgTagRegex.Matches(shtmltext);
            string[] urls = new string[matches.Count];
            for (int i = 0; i < urls.Length; i++)
            {
                urls[i] = matches[i].Groups["imgurl"].Value;
            }
            return urls;
        }
    }
}

下面我们再来看一个例子

/// <summary>
/// Collects the image URLs referenced by <c>src="..."</c> and <c>href="..."</c> attributes.
/// </summary>
/// <param name="html">The HTML markup to scan. May be null or empty.</param>
/// <param name="com">
/// Prefix (scheme and host) prepended to every URL that does not start with "http".
/// </param>
/// <returns>
/// A <c>string[]</c> of distinct, lower-cased URLs ending in .jpg, .png, .gif, .bmp or .ico:
/// all <c>src</c> matches first, then <c>href</c> matches, each in document order.
/// </returns>
public Array matchhtml(string html, string com)
{
    List<string> urls = new List<string>();
    if (string.IsNullOrEmpty(html))
    {
        return urls.ToArray();
    }

    // The markup is lower-cased so attribute names and extensions match regardless of case.
    // The invariant culture avoids locale-specific mappings (e.g. Turkish dotless i).
    // NOTE(review): this also lower-cases URL paths, which can break case-sensitive servers;
    // kept because existing callers compare the extension in lower case.
    html = html.ToLowerInvariant();

    CollectImageUrls(html, "src", com, urls);
    CollectImageUrls(html, "href", com, urls);
    return urls.ToArray();
}

/// <summary>
/// Appends to <paramref name="urls"/> every not-yet-seen image URL found in
/// <c>attribute="..."</c> occurrences inside <paramref name="html"/>.
/// </summary>
private static void CollectImageUrls(string html, string attribute, string com, List<string> urls)
{
    // The extension list must be an alternation: a character class such as
    // [(.jpg)(.png)] would accept any value ending in one of those characters (".php", ".htm").
    // The URL is taken from a capture group so text inside the URL is never stripped.
    Regex regex = new Regex(attribute + "=\"([^\"]*\\.(?:jpg|png|gif|bmp|ico))\"");
    foreach (Match m in regex.Matches(html))
    {
        string url = m.Groups[1].Value;

        // Only a URL that begins with the scheme is absolute; "http" appearing later
        // (for example in a query string) must not suppress the prefix.
        if (!url.StartsWith("http", StringComparison.Ordinal))
        {
            url = com + url;
        }

        if (!urls.Contains(url))
        {
            urls.Add(url);
        }
    }
}

// Win32 console API imports. The export names in kernel32.dll are case-sensitive,
// so the method names must be spelled exactly as the exports.
[DllImport("kernel32.dll", SetLastError = true)]
    static extern bool SetConsoleMode(IntPtr hConsoleHandle, int mode);
    [DllImport("kernel32.dll", SetLastError = true)]
    static extern bool GetConsoleMode(IntPtr hConsoleHandle, out int mode);
    [DllImport("kernel32.dll", SetLastError = true)]
    static extern IntPtr GetStdHandle(int handle);

    // Value passed to GetStdHandle to obtain the standard input handle (Win32 STD_INPUT_HANDLE).
    const int std_input_handle = -10;

    // Win32 ENABLE_QUICK_EDIT_MODE (0x40) combined with ENABLE_EXTENDED_FLAGS (0x80).
    const int enable_quick_edit_mode = 0x40 | 0x80;

    /// <summary>
    /// Turns on the quick-edit bits in the input mode of the current console.
    /// Does nothing when the current input mode cannot be read.
    /// </summary>
    public static void enablequickeditmode()
    {
        IntPtr handle = GetStdHandle(std_input_handle);

        int mode;
        if (!GetConsoleMode(handle, out mode))
        {
            // The mode could not be queried; writing back an unknown value would be unsafe.
            return;
        }

        SetConsoleMode(handle, mode | enable_quick_edit_mode);
    }
    /// <summary>
    /// Interactive loop: asks for a page URL and a save directory, downloads the page,
    /// extracts its image URLs and saves every image that can be decoded.
    /// Files are named with a running counter persisted in a text file so names never repeat.
    /// The loop ends when standard input is closed.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        enablequickeditmode();
        Console.Title = "takeimagefrominternet";
        string path = "e:\\download\\loading\\";
        // Text file holding the running counter used for unique file names.
        string countFile = "e:\\countfile.txt";

        // Cleared once up front; clearing on every pass would erase the previous
        // pass's summary and error messages before they can be read.
        Console.Clear();
        while (true)
        {
            // Restore the counter; a missing or unparsable file restarts at 0.
            int cursor = 0;
            if (File.Exists(countFile))
            {
                int stored;
                if (int.TryParse(File.ReadAllText(countFile), out stored))
                {
                    cursor = stored;
                }
            }
            // Always taken from this pass's starting value so the reported count is never stale.
            int oldCount = cursor;

            Console.Write("please input a url:");
            string url = "http://www.baidu.com/";
            string temp = Console.ReadLine();
            if (temp == null)
            {
                // Input stream closed: stop instead of looping forever on the default URL.
                return;
            }
            if (!string.IsNullOrEmpty(temp))
            {
                url = temp;
            }

            Uri pageUri;
            if (!Uri.TryCreate(url, UriKind.Absolute, out pageUri))
            {
                Console.WriteLine("invalid url:" + url);
                continue;
            }
            // Scheme + host (+ port) of the page, used as prefix for relative image URLs.
            string com = pageUri.GetLeftPart(UriPartial.Authority);

            Console.Write("please input a save path:");
            temp = Console.ReadLine();
            if (Directory.Exists(temp))
            {
                path = temp;
            }
            Console.WriteLine();

            using (WebClient client = new WebClient())
            {
                string html;
                try
                {
                    byte[] htmlData = client.DownloadData(pageUri);
                    using (StreamReader reader = new StreamReader(new MemoryStream(htmlData)))
                    {
                        html = reader.ReadToEnd();
                    }
                }
                catch (WebException ex)
                {
                    Console.WriteLine(ex.Message);
                    continue;
                }

                bool canSave = Directory.Exists(path);
                if (!canSave)
                {
                    Console.WriteLine("save path not found:" + path);
                }

                Array urls = new matchhtmlimageurl().matchhtml(html, com);
                foreach (string imageUrl in urls)
                {
                    Console.WriteLine(imageUrl);

                    // Extension = everything from the last '.', lower-cased for the lookup.
                    int dot = imageUrl.LastIndexOf('.');
                    string ext = dot >= 0 ? imageUrl.Substring(dot).ToLowerInvariant() : "";
                    ImageFormat format = GetImageFormat(ext);
                    if (format == null || !canSave)
                    {
                        continue;
                    }

                    try
                    {
                        byte[] imageData = client.DownloadData(imageUrl);
                        if (imageData == null || imageData.Length == 0)
                        {
                            continue;
                        }

                        using (MemoryStream ms = new MemoryStream(imageData))
                        using (Image image = new Bitmap(ms))
                        {
                            image.Save(Path.Combine(path, cursor + ext), format);
                        }

                        // Advance only after a successful save so the summary counts saved files.
                        cursor++;
                    }
                    catch (Exception ex)
                    {
                        // Best effort: report the failure and carry on with the next image.
                        Console.WriteLine(ex.Message);
                    }
                }
            }

            File.WriteAllText(countFile, cursor.ToString(), Encoding.UTF8);
            Console.WriteLine("take done...image count:" + (cursor - oldCount).ToString());
        }
    }

    /// <summary>
    /// Maps a lower-case file extension (including the leading dot) to the image format
    /// used when saving; returns null for unsupported extensions.
    /// </summary>
    static ImageFormat GetImageFormat(string ext)
    {
        switch (ext)
        {
            case ".jpg":
                return ImageFormat.Jpeg;
            case ".bmp":
                return ImageFormat.Bmp;
            case ".png":
                return ImageFormat.Png;
            case ".gif":
                return ImageFormat.Gif;
            case ".ico":
                return ImageFormat.Icon;
            default:
                return null;
        }
    }

您可能感兴趣的文章:

如对本文有疑问，点击进行留言回复！！

深入了解c# 匿名类型

一、什么叫做匿名类？　　匿名类就是没有名字的类。匿名类不能被引用，只能在创建的时候用new语句来声明。二、匿名类的优势以及应用场景；　　1、匿名类型提供了一种方... [阅读全文]
C# 对PDF文档加密、解密（基于Spire.Cloud.SDK for .NET）

spire.cloud.sdk for .net提供了接口pdfsecurityapi可用于加密、解密pdf文档。本文将通过c#代码演示具体加密及解密方法。使用... [阅读全文]
C# 实现俄罗斯方块（附源码）

概述俄罗斯方块（tetris）是一款由俄罗斯人阿列克谢·帕基特诺夫发明的休闲游戏，帕基特诺夫爱玩拼图，从拼图游戏里得到灵感，设计出了俄罗斯方块。由于上手简单、老... [阅读全文]
浅析c# 接口

接口：是指定一组函数成员而不是实现它们的引用类型。所以只能由类和结构来实现接口，在继承该接口的类里面必须要实现接口的所有方法。接口的特点：继承于接口的类，必须要实... [阅读全文]
c# 接口使用实例

用接口实现一个简单的物件的入库，出库如定义一个物流类接口，包含物件所属快递公司名称属性，物件单号属性及信息显示方法。通过物件出库类信息和物件入库类信息继承该接口... [阅读全文]
详解C# 泛型中的数据类型判定与转换

提到类型转换，首先要明确c#中的数据类型，主要分为值类型和引用类型：1.常用的值类型有：（struct）整型家族：int，byte，char，short，lon... [阅读全文]
Unity通用泛型单例设计模式（普通型和继承自MonoBehaviour）

单例模式是设计模式中最为常见的，不多解释了。但应该尽量避免使用，一般全局管理类才使用单例。普通泛型单例：public abstract class single... [阅读全文]
WindowsForm实现警告消息框的实例代码

警告消息框主要是用来向用户展示诸如警告、异常、完成和提示消息。一般实现的效果就是从系统窗口右下角弹出，然后加上些简单的显示和消失的动画。创建警告框窗口首先我们... [阅读全文]
WindowsForm移动一个没有标题栏的窗口的方法

在winform程序中，要移动没有标题栏的窗口，基本的实现思路是监听需要拖动窗口内的控件的鼠标事件，然后将鼠标位置发送给窗口进行相应的位移就可以了。通过借用wi... [阅读全文]
快速了解c# 常量(整数常量，字符常量，定义常量)

常量是固定值，程序执行期间不会改变。常量可以是任何基本数据类型，比如整数常量、浮点常量、字符常量或者字符串常量，还有枚举常量。常量可以被当作常规的变量，只是它们... [阅读全文]

网友评论


验证码：

C#通过正则表达式实现提取网页中的图片

2019年07月18日 | 移动技术网IT编程 | 我要评论

您可能感兴趣的文章:

相关文章:

网友评论