当前位置: 移动技术网 > IT编程>开发语言>Java > java使用htmlparser提取网页纯文本例子

java使用htmlparser提取网页纯文本例子

2019年07月22日  | 移动技术网IT编程  | 我要评论

复制代码 代码如下:

package com.test;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;

/**
* 标题:利用htmlparser提取网页纯文本的例子
*/
public class testhtmlparser {
  public static void testhtml() {
    try {
        string scurrentline;
        string stotalstring;
        scurrentline = "";
        stotalstring = "";
        java.io.inputstream l_urlstream;
        java.net.url l_url = new java.net.url("http://www.ideagrace.com/html/doc/2006/07/04/00929.html");
        java.net.httpurlconnection l_connection = (java.net.httpurlconnection) l_url.openconnection();
        l_connection.connect();
        l_urlstream = l_connection.getinputstream();
        java.io.bufferedreader l_reader = new java.io.bufferedreader(new java.io.inputstreamreader(l_urlstream));
        while ((scurrentline = l_reader.readline()) != null) {
          stotalstring += scurrentline+"/r/n";
        //  system.out.println(stotalstring);
        }
        string testtext = extracttext(stotalstring);
        system.out.println( testtext );

    } catch (exception e) {
        e.printstacktrace();
    }

  }

  public static string extracttext(string inputhtml) throws exception {
    stringbuffer text = new stringbuffer();
    parser parser = parser.createparser(new string(inputhtml.getbytes(),"gbk"), "gbk");
    // 遍历所有的节点
    nodelist nodes = parser.extractallnodesthatmatch(new nodefilter() {
        public boolean accept(node node) {
          return true;
        }
    });

    system.out.println(nodes.size()); //打印节点的数量
    for (int i=0;i<nodes.size();i++){
         node nodet = nodes.elementat(i);
         //system.out.println(nodet.gettext());
        text.append(new string(nodet.toplaintextstring().getbytes("gbk"))+"/r/n");         
    }
    return text.tostring();
  }

  public static void test5(string resource) throws exception {
    parser myparser = new parser(resource);
    myparser.setencoding("gbk");
    string filterstr = "table";
    nodefilter filter = new tagnamefilter(filterstr);
    nodelist nodelist = myparser.extractallnodesthatmatch(filter);
    tabletag tabletag = (tabletag) nodelist.elementat(11);

  }

  public static void main(string[] args) throws exception {
    // test5("http://www.google.com");
    testhtml();
  }
}

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网