源码网商城,靠谱的源码在线交易网站 我的订单 购物车 帮助

源码网商城

java使用htmlparser提取网页纯文本例子

  • 时间:2021-04-07 03:18 编辑: 来源: 阅读:
  • 扫一扫,手机访问
摘要:java使用htmlparser提取网页纯文本例子
复制代码 代码如下:
package com.test;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;

/**
 * Example: extracting the plain text of a web page with htmlparser.
 *
 * <p>{@link #testHtml()} downloads a fixed page and prints its text,
 * {@link #extractText(String)} does the HTML-to-text conversion, and
 * {@link #test5(String)} is a small experiment that filters {@code <table>} tags.
 */
public class TestHTMLParser {

  /** Charset used to decode the downloaded page and passed to the parser. */
  private static final String PAGE_CHARSET = "GBK";

  /** Separator appended after every downloaded line and every extracted node text. */
  private static final String LINE_BREAK = "\r\n";

  /** Position (0-based) of the table that {@link #test5(String)} looks up. */
  private static final int TABLE_INDEX = 11;

  /**
   * Downloads the sample page, extracts its text with {@link #extractText(String)}
   * and prints the result to standard output.
   *
   * <p>Any failure (network, decoding, parsing) is reported with a stack trace and
   * otherwise ignored, so this method never throws.
   */
  public static void testHtml() {
    java.net.HttpURLConnection connection = null;
    java.io.BufferedReader reader = null;
    try {
      java.net.URL url =
          new java.net.URL("http://www.ideagrace.com/html/doc/2006/07/04/00929.html");
      connection = (java.net.HttpURLConnection) url.openConnection();
      connection.connect();

      // Decode the bytes once, with the page's charset, instead of decoding with the
      // platform default and trying to repair the string afterwards.
      reader = new java.io.BufferedReader(
          new java.io.InputStreamReader(connection.getInputStream(), PAGE_CHARSET));

      // readLine() strips the terminator, so put a real CR/LF back after each line.
      StringBuilder html = new StringBuilder();
      String line;
      while ((line = reader.readLine()) != null) {
        html.append(line).append(LINE_BREAK);
      }

      String testText = extractText(html.toString());
      System.out.println(testText);
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (java.io.IOException ignored) {
          // Nothing useful can be done if closing fails; the page was already read.
        }
      }
      if (connection != null) {
        connection.disconnect();
      }
    }
  }

  /**
   * Converts an HTML document into plain text.
   *
   * <p>Every node found by the parser contributes its plain-text form followed by
   * CR/LF. The number of nodes is printed to standard output as a side effect.
   * NOTE(review): nodes are collected with an accept-all filter, so text nested
   * inside container tags is presumably emitted once per enclosing node as well —
   * confirm against the htmlparser version in use.
   *
   * @param inputHtml the HTML markup, already decoded into a Java string
   * @return the concatenated plain text of all nodes, one node per CR/LF-terminated line
   * @throws Exception if the parser cannot process the markup
   */
  public static String extractText(String inputHtml) throws Exception {
    // The input is already a decoded string; re-encoding it through the platform
    // charset (as the previous version did) only corrupts non-ASCII characters.
    Parser parser = Parser.createParser(inputHtml, PAGE_CHARSET);

    // Walk every node: the filter accepts everything.
    NodeList nodes = parser.extractAllNodesThatMatch(new NodeFilter() {
      public boolean accept(Node node) {
        return true;
      }
    });
    System.out.println(nodes.size()); // print the node count

    StringBuilder text = new StringBuilder();
    for (int i = 0; i < nodes.size(); i++) {
      Node node = nodes.elementAt(i);
      text.append(node.toPlainTextString()).append(LINE_BREAK);
    }
    return text.toString();
  }

  /**
   * Parses the given resource, collects its {@code table} tags and looks up the
   * one at index {@value #TABLE_INDEX}.
   *
   * <p>The method has no observable result; it returns quietly when the resource
   * contains too few tables instead of reading past the end of the list.
   *
   * @param resource the resource string handed to the {@link Parser} constructor
   * @throws Exception if the resource cannot be opened or parsed
   */
  public static void test5(String resource) throws Exception {
    Parser myParser = new Parser(resource);
    myParser.setEncoding(PAGE_CHARSET);

    String filterStr = "table";
    NodeFilter filter = new TagNameFilter(filterStr);
    NodeList nodeList = myParser.extractAllNodesThatMatch(filter);

    if (nodeList.size() <= TABLE_INDEX) {
      return; // fewer tables than expected; nothing to look up
    }
    // Kept from the original experiment; nothing is done with the tag yet.
    TableTag tabletag = (TableTag) nodeList.elementAt(TABLE_INDEX);
  }

  /**
   * Runs the page-download example. Call {@code test5("http://www.google.com")}
   * instead to try the table filter.
   *
   * @param args unused
   * @throws Exception declared for the alternative {@link #test5(String)} entry point
   */
  public static void main(String[] args) throws Exception {
    testHtml();
  }
}
  • 全部评论(0)
联系客服
客服电话:
400-000-3129
微信版

扫一扫进微信版
返回顶部