/**
 * Parses an HTML page and returns it as a {@code SearchHtmlPage}.
 *
 * <p>The parser is configured with the GBK encoding. If a
 * {@code ParserException} is raised, it is logged and the result is built
 * from whatever title/body values were assigned up to that point (both start
 * out as empty strings).
 *
 * @param resource file path or URL of the page to parse
 * @return a page object built from the page title and the combined text of
 *         the body nodes
 */
public static SearchHtmlPage parseHtmlPage(String resource) {
    String title = "";
    String body = "";
    try {
        Parser myParser = new Parser(resource);

        // Set the encoding; adjust it to match the actual content.
        myParser.setEncoding("GBK");

        HtmlPage visitor = new HtmlPage(myParser);
        myParser.visitAllNodesWith(visitor);

        title = visitor.getTitle();
        body = combineNodeText(visitor.getBody().toNodeArray());
    } catch (ParserException e) {
        // Append the exception so the reason for the failure is not lost;
        // only the single-argument LogMan.error form is used in this file.
        LogMan.error("Parse Html Page " + resource + " Error! " + e);
    }

    return new SearchHtmlPage(title, body);
}
/**
 * Parses HTML content and extracts the content of its text nodes and links.
 *
 * <p>The content is parsed with the GBK charset. Only {@code TextNode} and
 * {@code LinkTag} nodes are selected; their content is merged by
 * {@code combineNodeText}.
 *
 * @param content the HTML content to parse
 * @return the merged content, or an empty string when {@code content} is
 *         {@code null}, parsing fails, or the parser yields no node list
 */
public static String parseHtmlContent(String content) {
    // Nothing to parse: return the same empty result used for parse failures
    // instead of handing null to the parser.
    if (content == null) {
        return "";
    }

    Parser myParser = Parser.createParser(content, "GBK");

    NodeFilter textFilter = new NodeClassFilter(TextNode.class);
    NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);

    // Meta tags are not handled for now.

    // Accept a node when either of the two filters matches.
    OrFilter lastFilter = new OrFilter();
    lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter });

    NodeList nodeList;
    try {
        nodeList = myParser.parse(lastFilter);
    } catch (ParserException e) {
        LogMan.warn("Parse Content Error", e);
        // Parsing bailed out midway; there is nothing to combine.
        return "";
    }

    if (nodeList == null) {
        return "";
    }

    return combineNodeText(nodeList.toNodeArray());
}
//合并节点的有效内容 private static String combineNodeText(Node[] nodes) { StringBuffer result = new StringBuffer();
for (int i = 0; i < nodes.length; i++) { Node anode = (Node) nodes[i];
String line = ""; if (anode instanceof TextNode) { TextNode textnode = (TextNode) anode; //line = textnode.toPlainTextString().trim(); line = textnode.getText(); } else if (anode instanceof LinkTag) { LinkTag linknode = (LinkTag) anode;
line = linknode.getLink(); //过滤jsp标签 line = StringFunc.replace(line, "<%.*%>", ""); }
if (StringFunc.isTrimEmpty(line)) continue;
result.append(" ").append(line); }
return result.toString(); } |