C#实现将HTML转换成纯文本的方法_c#

本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下：

使用方法：

// Usage: create a converter and hand it the HTML to be stripped.
// NOTE(review): textbox1 / textbox2 are not defined in this snippet; presumably
// they are text-box controls supplied by the caller's form or page - confirm.
htmltotext convert = new htmltotext();

// The plain-text result of the conversion is assigned to textbox2.text.
textbox2.text = convert.convert(textbox1.text);

C#代码如下：

/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// Requires the System, System.Collections.Generic, System.Text and System.Web
/// namespaces. <see cref="convert"/> keeps its working state in instance fields,
/// so a single instance should not be used from several threads at once.
/// </remarks>
class htmltotext
{
  // Static data tables.
  // Maps a lower-case tag name (closing tags carry a leading '/') to the text
  // written to the output in place of that tag.
  protected static Dictionary<string, string> _tags;
  // Tags whose inner content is dropped from the output entirely.
  protected static HashSet<string> _ignoretags;
  // Instance variables: output buffer, input being parsed, read position.
  protected textbuilder _text;
  protected string _html;
  protected int _pos;
  // Static constructor (one time only): fills the lookup tables.
  static htmltotext()
  {
    _tags = new Dictionary<string, string>
    {
      { "address", "\n" },
      { "blockquote", "\n" },
      { "div", "\n" },
      { "dl", "\n" },
      { "fieldset", "\n" },
      { "form", "\n" },
      { "h1", "\n" },
      { "/h1", "\n" },
      { "h2", "\n" },
      { "/h2", "\n" },
      { "h3", "\n" },
      { "/h3", "\n" },
      { "h4", "\n" },
      { "/h4", "\n" },
      { "h5", "\n" },
      { "/h5", "\n" },
      { "h6", "\n" },
      { "/h6", "\n" },
      { "p", "\n" },
      { "/p", "\n" },
      { "table", "\n" },
      { "/table", "\n" },
      { "ul", "\n" },
      { "/ul", "\n" },
      { "ol", "\n" },
      { "/ol", "\n" },
      { "/li", "\n" },
      { "br", "\n" },
      { "/td", "\t" },
      { "/tr", "\n" },
      { "/pre", "\n" }
    };
    _ignoretags = new HashSet<string> { "script", "noscript", "style", "object" };
  }
  /// <summary>
  /// Converts the given HTML to plain text and returns the result.
  /// </summary>
  /// <param name="html">HTML to be converted; null is treated as an empty string</param>
  /// <returns>Resulting plain text, with HTML entities decoded</returns>
  public string convert(string html)
  {
    // Initialize state variables. A null input would otherwise fail on the
    // first length check, so it is normalized to empty here.
    _text = new textbuilder();
    _html = html ?? string.Empty;
    _pos = 0;
    // Process input one character / tag at a time.
    while (!endoftext)
    {
      char c = peek();
      if (c == '<')
      {
        // HTML tag
        bool selfclosing;
        string tag = parsetag(out selfclosing);
        // Handle special tag cases
        if (tag == "body")
        {
          // Discard content before <body>
          _text.clear();
        }
        else if (tag == "/body")
        {
          // Discard content after </body>
          _pos = _html.Length;
        }
        else if (tag == "pre")
        {
          // Enter preformatted mode
          _text.preformatted = true;
          eatwhitespacetonextline();
        }
        else if (tag == "/pre")
        {
          // Exit preformatted mode
          _text.preformatted = false;
        }
        // Emit the replacement text for this tag, if it has one
        string value;
        if (_tags.TryGetValue(tag, out value))
          _text.write(value);
        // Skip everything inside tags whose content is never shown
        if (_ignoretags.Contains(tag))
          eatinnercontent(tag);
      }
      else if (char.IsWhiteSpace(c))
      {
        // Whitespace (treated as a plain space outside <pre>)
        _text.write(_text.preformatted ? c : ' ');
        moveahead();
      }
      else
      {
        // Other text
        _text.write(c);
        moveahead();
      }
    }
    // Entities are decoded last so that "&lt;" in text is never parsed as a tag
    return HttpUtility.HtmlDecode(_text.ToString());
  }
  /// <summary>
  /// Eats all characters that are part of the tag (or comment) at the current
  /// position and returns its lower-case name.
  /// </summary>
  /// <param name="selfclosing">set to true when the last non-whitespace
  /// character before the closing '>' is '/'</param>
  /// <returns>the tag name, with a leading '/' for closing tags; "!--" for a
  /// comment; an empty string when the current character is not '&lt;'</returns>
  protected string parsetag(out bool selfclosing)
  {
    string tag = string.Empty;
    selfclosing = false;
    if (peek() != '<')
      return tag;
    // Comments are skipped as a unit: their body may contain '>' or quote
    // characters that must not be interpreted as tag syntax.
    if (string.CompareOrdinal(_html, _pos, "<!--", 0, 4) == 0)
    {
      // Searching from _pos + 2 also terminates the short forms "<!-->" and "<!--->"
      int end = _html.IndexOf("-->", _pos + 2, StringComparison.Ordinal);
      _pos = (end < 0) ? _html.Length : end + 3;
      return "!--";
    }
    moveahead();
    // Parse tag name
    eatwhitespace();
    int start = _pos;
    if (peek() == '/')
      moveahead();
    while (!endoftext && !char.IsWhiteSpace(peek()) &&
      peek() != '/' && peek() != '>')
      moveahead();
    // Invariant lower-casing keeps table lookups independent of the current culture
    tag = _html.Substring(start, _pos - start).ToLowerInvariant();
    // Parse rest of tag
    while (!endoftext && peek() != '>')
    {
      char c = peek();
      if (c == '"' || c == '\'')
      {
        // A quoted value after a '/' means that '/' was not the tag terminator
        selfclosing = false;
        eatquotedvalue();
      }
      else
      {
        // Only a '/' that is followed by nothing but whitespace counts
        if (c == '/')
          selfclosing = true;
        else if (!char.IsWhiteSpace(c))
          selfclosing = false;
        moveahead();
      }
    }
    // Step over the closing '>'
    moveahead();
    return tag;
  }
  /// <summary>
  /// Consumes the inner content of the given tag, up to and including its
  /// matching end tag (or the end of the input when there is none).
  /// </summary>
  /// <param name="tag">lower-case name of the opening tag already consumed</param>
  protected void eatinnercontent(string tag)
  {
    string endtag = "/" + tag;
    if (tag == "script" || tag == "style")
    {
      // Script and style bodies are raw text: a '<' inside them is not markup,
      // so look for the literal end tag instead of parsing tags.
      int close = _html.IndexOf("<" + endtag, _pos, StringComparison.OrdinalIgnoreCase);
      if (close < 0)
      {
        _pos = _html.Length;
        return;
      }
      _pos = close;
      bool ignored;
      parsetag(out ignored);
      return;
    }
    // Count nesting of the same tag only; other tags (which may never be
    // closed, e.g. <param>) must not affect where the content ends.
    int depth = 1;
    while (!endoftext)
    {
      if (peek() == '<')
      {
        // Consume a tag
        bool selfclosing;
        string inner = parsetag(out selfclosing);
        if (inner == endtag)
        {
          depth--;
          if (depth == 0)
            return;
        }
        else if (inner == tag && !selfclosing)
        {
          depth++;
        }
      }
      else moveahead();
    }
  }
  // Returns true if the current position is at the end of
  // the string
  protected bool endoftext
  {
    get { return (_pos >= _html.Length); }
  }
  // Safely returns the character at the current position
  // ('\0' when the position is past the end)
  protected char peek()
  {
    return (_pos < _html.Length) ? _html[_pos] : (char)0;
  }
  // Safely advances the current position to the next character,
  // never moving beyond the end of the input
  protected void moveahead()
  {
    _pos = Math.Min(_pos + 1, _html.Length);
  }
  // Moves the current position to the next non-whitespace
  // character.
  protected void eatwhitespace()
  {
    while (char.IsWhiteSpace(peek()))
      moveahead();
  }
  // Moves the current position to the next non-whitespace
  // character or the start of the next line, whichever
  // comes first
  protected void eatwhitespacetonextline()
  {
    while (char.IsWhiteSpace(peek()))
    {
      char c = peek();
      moveahead();
      if (c == '\n')
        break;
    }
  }
  // Moves the current position past a quoted value. The value ends at the
  // matching quote or at a line break, whichever comes first.
  protected void eatquotedvalue()
  {
    char quote = peek();
    if (quote == '"' || quote == '\'')
    {
      // Opening quote
      moveahead();
      // Find end of value
      _pos = _html.IndexOfAny(new char[] { quote, '\r', '\n' }, _pos);
      if (_pos < 0)
        _pos = _html.Length;
      else
        moveahead();  // Closing quote (or the line break)
    }
  }
  /// <summary>
  /// A StringBuilder wrapper that helps eliminate excess whitespace.
  /// </summary>
  protected class textbuilder
  {
    // Completed output
    private StringBuilder _text;
    // Line currently being assembled (unused while preformatted)
    private StringBuilder _currline;
    // Number of consecutive empty lines flushed so far
    private int _emptylines;
    private bool _preformatted;
    // Construction
    public textbuilder()
    {
      _text = new StringBuilder();
      _currline = new StringBuilder();
      _emptylines = 0;
      _preformatted = false;
    }
    /// <summary>
    /// Normally, extra whitespace characters are discarded.
    /// If this property is set to true, they are passed
    /// through unchanged.
    /// </summary>
    public bool preformatted
    {
      get
      {
        return _preformatted;
      }
      set
      {
        if (value)
        {
          // Flush the pending line before switching to preformatted mode,
          // because preformatted characters bypass the line buffer
          if (_currline.Length > 0)
            flushcurrline();
          _emptylines = 0;
        }
        _preformatted = value;
      }
    }
    /// <summary>
    /// Clears all current text.
    /// </summary>
    public void clear()
    {
      _text.Length = 0;
      _currline.Length = 0;
      _emptylines = 0;
    }
    /// <summary>
    /// Writes the given string to the output buffer.
    /// </summary>
    /// <param name="s">text to write, one character at a time</param>
    public void write(string s)
    {
      foreach (char c in s)
        write(c);
    }
    /// <summary>
    /// Writes the given character to the output buffer.
    /// </summary>
    /// <param name="c">character to write</param>
    public void write(char c)
    {
      if (_preformatted)
      {
        // Write preformatted character unchanged
        _text.Append(c);
        return;
      }
      if (c == '\r')
      {
        // Ignore carriage returns. We'll process
        // '\n' if it comes next
        return;
      }
      if (c == '\n')
      {
        // Flush current line
        flushcurrline();
      }
      else if (char.IsWhiteSpace(c))
      {
        // Collapse any run of whitespace into a single space character
        int len = _currline.Length;
        if (len == 0 || !char.IsWhiteSpace(_currline[len - 1]))
          _currline.Append(' ');
      }
      else
      {
        // Add character to current line
        _currline.Append(c);
      }
    }
    // Appends the current line (trimmed) to the output buffer. Empty lines at
    // the very start of the output are dropped, and of several consecutive
    // empty lines only the first is kept.
    protected void flushcurrline()
    {
      // Get current line
      string line = _currline.ToString().Trim();
      if (line.Length == 0)
      {
        // An empty line
        _emptylines++;
        if (_emptylines < 2 && _text.Length > 0)
          _text.AppendLine(line);
      }
      else
      {
        // A non-empty line
        _emptylines = 0;
        _text.AppendLine(line);
      }
      // Reset current line
      _currline.Length = 0;
    }
    /// <summary>
    /// Returns the current output as a string, flushing any pending line first.
    /// </summary>
    public override string ToString()
    {
      if (_currline.Length > 0)
        flushcurrline();
      return _text.ToString();
    }
  }
}

希望本文所述对大家的C#程序设计有所帮助。

您可能感兴趣的文章:

如您对本文有疑问或者有任何想说的，请点击进行留言回复，万千网友为您解惑！

C#实现将HTML转换成纯文本的方法

2019年07月18日 | 移动技术网IT编程 | 我要评论

您可能感兴趣的文章:

相关文章:

网友评论


验证码：