import cn.wanghaomiao.xpath.exception.NoSuchAxisException; import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException; import cn.wanghaomiao.xpath.model.JXDocument; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import java.io.File; import java.io.IOException; import java.util.List; /** * Created by Administrator on 2017/5/27. */ public class JsoupXpathTest { // http://www.cnblogs.com/ 为例 // "//a/@href"; //"//div[@id=‘paging_block‘]/div/a[text()=‘Next >‘]/@href"; //"//div[@id=‘paging_block‘]/div/a[text()*=‘Next‘]/@href"; //"//h1/text()"; //"//h1/allText()"; //"//h1//text()"; //"//div/a"; //"//div[@id=‘post_list‘]/div[position()<3]/div/h3/allText()"; //"//div[@id=‘post_list‘]/div[first()]/div/h3/allText()"; //"//div[@id=‘post_list‘]/div[1]/div/h3/allText()"; //"//div[@id=‘post_list‘]/div[last()]/div/h3/allText()"; ////查找评论大于1000的条目(当然只是为了演示复杂xpath了,谓语中可以各种嵌套,这样才能测试的更全面嘛) //"//div[@id=‘post_list‘]/div[./div/div/span[@class=‘article_view‘]/a/num()>1000]/div/h3/allText()"; ////轴支持 //"//div[@id=‘post_list‘]/div[self::div/div/div/span[@class=‘article_view‘]/a/num()>1000]/div/h3/allText()"; //"//div[@id=‘post_list‘]/div[2]/div/p/preceding-sibling::h3/allText()"; //"//div[@id=‘post_list‘]/div[2]/div/p/preceding-sibling::h3/allText()|//div[@id=‘post_list‘]/div[1]/div/h3/allText()"; public static void main(String[] args) throws IOException{ // String xpath="//div[text()=‘工商注册‘]/text()"; // String xpath="//div[@id=‘post_list‘]"; String xpath="//div/span[text()=‘获投信息‘]/parent::*/following-sibling::*[1]/div[1]/div[2]/table[1]/tr[position()>=1]"; // String xpath="//span[@class=‘details_1221_d05_d02_s01‘]/text()|//span[@class=‘details_1221_d05_d02_s02‘]/text()"; // String xpath="//span[@class=‘details_1221_d05_d02_s01‘]|//span[@class=‘details_1221_d05_d02_s02‘]"; // String xpath="//div[@id=‘post_list‘]/div[./div/div/span[@class=‘article_view‘]/a/num()>1000]/div/h3/allText()"; //通过URL数据源 /*Document doc = 
Jsoup.connect("http://www.cnblogs.com/").get(); JXDocument jxDocument = new JXDocument(doc); List<Object> rs = null; try { rs = jxDocument.sel(xpath); } catch (XpathSyntaxErrorException e) { e.printStackTrace(); } for (Object o:rs){ if (o instanceof Element){ int index = ((Element) o).siblingIndex(); System.out.println(index); } System.out.println("\n"+o.toString()+"\n"); }*/ //通过本地数据源 // Document doc1 = Jsoup.parse(new File("D:\\Test\\228.html"), "UTF-8"); Document doc1 = Jsoup.parse(new File("D:\\Test\\It桔子\\2.html"), "UTF-8"); JXDocument jxDocument = new JXDocument(doc1); List<Object> rs1 = null; try { rs1 = jxDocument.sel(xpath); } catch (XpathSyntaxErrorException e) { e.printStackTrace(); } //System.out.println("\n"+rs1.toString()+"\n"); for (Object o:rs1){ if (o instanceof Element){ int index = ((Element) o).siblingIndex(); System.out.println(index); } System.out.println(o.toString()); } } }
// Time: 2025-01-01 21:25:34