简介
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。
添加依赖
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
查找DOM元素
使用Jsoup提供的API,可以通过标签名,Id,Class样式等来搜索DOM
常用API
getElementById(String id) // 根据id来查询DOM
getElementsByTag(String tagName) // 根据tag名称来查询DOM
getElementsByClass(String className) // 根据样式名称来查询DOM
getElementsByAttribute(String key) // 根据属性名来查询DOM
getElementsByAttributeValue(String key,String value) // 根据属性名和属性值来查询DOM
实例代码
package cn.cslg.Jsoup; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class JsoupDemo { public static void main(String[] args)throws Exception { CloseableHttpClient httpclient = HttpClients.createDefault(); // 创建httpclient实例 HttpGet httpget = new HttpGet("http://www.cnblogs.com/"); // 创建httpget实例 CloseableHttpResponse response = httpclient.execute(httpget); // 执行get请求 HttpEntity entity=response.getEntity(); // 获取返回实体 String content=EntityUtils.toString(entity, "utf-8"); response.close(); // 关闭流和释放系统资源 Document doc=Jsoup.parse(content);// 解析网页 得到文档对象 Element navTopElement=doc.getElementById("site_nav_top"); // 根据id来查询DOM String navTop=navTopElement.text(); // 返回元素的文本 System.out.println("口号:"+navTop); Elements titleElements=doc.getElementsByTag("title"); // 根据tag名称来查询DOM Element titleElement=titleElements.get(0); // 获取第1个元素 String title=titleElement.text(); // 返回元素的文本 System.out.println("网页标题是:"+title); Elements postItemElements=doc.getElementsByClass("post_item "); // 根据样式名称来查询DOM for(Element e:postItemElements){ System.out.println(e.html()); System.out.println("================"); } Elements widthElements=doc.getElementsByAttribute("width"); // 根据属性名来查询DOM for(Element e:widthElements){ System.out.println(e.toString()); System.out.println("================"); } System.out.println("target-_blank"); Elements targetElements=doc.getElementsByAttributeValue("target", "_blank"); // 根据属性名和属性值来查询DOM for(Element e:targetElements){ System.out.println(e.toString()); System.out.println("================"); } } }
使用选择器查找DOM元素
对于层次结构有规律的多级标签,可以使用Jsoup的选择器语法查找DOM元素
选择器语法的一般形式为 "#id名 .class名 .class名 标签 标签",各部分以空格分隔,表示逐级向下匹配后代元素
实例代码
package cn.cslg.Jsoup; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class JsoupDemo { public static void main(String[] args)throws Exception { CloseableHttpClient httpclient = HttpClients.createDefault(); // 创建httpclient实例 HttpGet httpget = new HttpGet("http://www.cnblogs.com/"); // 创建httpget实例 CloseableHttpResponse response = httpclient.execute(httpget); // 执行get请求 HttpEntity entity=response.getEntity(); // 获取返回实体 String content=EntityUtils.toString(entity, "utf-8"); response.close(); // 关闭流和释放系统资源 Document doc=Jsoup.parse(content); // 解析网页 得到文档对象 Elements linkElements=doc.select("#post_list .post_item .post_item_body h3 a"); //通过选择器查找所有博客链接DOM for(Element e:linkElements){ System.out.println("博客标题:"+e.text()); } System.out.println("==============="); Elements hrefElements=doc.select("a[href]"); // 带有href属性的a元素 for(Element e:hrefElements){ System.out.println(e.toString()); } System.out.println("==============="); Elements imgElements=doc.select("img[src$=.png]"); // 查找扩展名为.png的图片DOM节点 for(Element e:imgElements){ System.out.println(e.toString()); } Element element=doc.getElementsByTag("title").first(); // 获取tag是title的第一个DOM元素 String title=element.text(); // 返回元素的文本 System.out.println("网页标题是:"+title); } }
获取DOM属性值
例如获取href的属性值
package cn.cslg.Jsoup; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class JsoupDemo { public static void main(String[] args)throws Exception { CloseableHttpClient httpclient = HttpClients.createDefault(); // 创建httpclient实例 HttpGet httpget = new HttpGet("http://www.cnblogs.com/"); // 创建httpget实例 CloseableHttpResponse response = httpclient.execute(httpget); // 执行get请求 HttpEntity entity=response.getEntity(); // 获取返回实体 String content=EntityUtils.toString(entity, "utf-8"); response.close(); // 关闭流和释放系统资源 Document doc=Jsoup.parse(content); // 解析网页 得到文档对象 Elements linkElements=doc.select("#post_list .post_item .post_item_body h3 a"); //通过选择器查找所有博客链接DOM for(Element e:linkElements){ System.out.println("博客标题:"+e.text()); System.out.println("博客地址:"+e.attr("href")); System.out.println("target:"+e.attr("target")); } Element linkElement=doc.select("#friend_link").first(); System.out.println("纯文本:"+linkElement.text()); // 获取纯文本 System.out.println("html:"+linkElement.html()); // 获取整个html } }
时间: 2024-10-05 16:39:56