HtmlCleaner cleaner = new HtmlCleaner(); TagNode node = cleaner.clean(new URL("http://finance.sina.com.cn/money/nmetal/20091209/10157077895.shtml")); //按tag取. Object[] ns = node.getElementsByName("title", true); //标题 if(ns.length > 0) { System.out.println("title="+((TagNode)ns[0]).getText()); } // /html/body/div[2]/div[4]/div/div/div/div[2]/p ns = node.evaluateXPath("//div[@class=\"blkContainerSblkCon\"]/p"); //选取class为指定blkContainerSblkCon的div下面的所有p标签 for (int i = 0; i < ns.length; i++) { String in = cleaner.getInnerHtml((TagNode)ns[i]); System.out.println("<p>"+in + "</p>"); } String in = cleaner.getInnerHtml((TagNode)ns[0]); System.out.println(in); System.out.println(((TagNode)ns[0]).getText());
HtmlCleaner cleaner = new HtmlCleaner(); String url = "http://finance.sina.com.cn/nmetal/hjfx.html"; URL _url = new URL(url); TagNode node = cleaner.clean(_url); //按tag取. Object[] ns = node.getElementsByName("title", true); //标题 if(ns.length > 0) { System.out.println("title="+((TagNode)ns[0]).getText()); } ns = node.evaluateXPath("//*[@class=‘Frame-Row3-01-C‘]/table[2]/tbody/tr/td/a"); //选取class为指定blkContainerSblkCon的div下面的所有p for (int i = 0; i < ns.length; i++) { //取链接文本 // String in = cleaner.getInnerHtml((TagNode)ns[i]); // System.out.println(in); //获取链接的 TagNode n = (TagNode) ns[i]; // System.out.println(n.getAttributeByName("href")); System.out.println(new URL(_url,n.getAttributeByName("href")).toString()); } // String in = cleaner.getInnerHtml((TagNode)ns[0]); // System.out.println(in); // System.out.println(((TagNode)ns[0]).getText()); // System.out.println("ul/li:"); // //按xpath取 // ns = node.evaluateXPath("//div[@class=‘d_1‘]//li"); // for(Object on : ns) { // TagNode n = (TagNode) on; // System.out.println("\ttext="+n.getText()); // } // System.out.println("a:"); // //按属性值取 // ns = node.getElementsByAttValue("name", "my_href", true, true); // for(Object on : ns) { // TagNode n = (TagNode) on; // System.out.println("\thref="+n.getAttributeByName("href")+", text="+n.getText()); // }
本文转载于:http://gstarwd.iteye.com/blog/644502
xpath 参考教材:http://www.w3school.com.cn/xpath/xpath_syntax.asp
时间: 2024-10-08 15:06:02