import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.File; import java.io.IOException; /** * Created by Administrator on 2017/5/26. */ public class JsoupTest { public static void main(String args[]) { // Parse HTML String using JSoup library String HTMLSTring = "<!DOCTYPE html>" + "<html>" + "<head>" + "<title>JSoup Example</title>" + "</head>" + "<body>" // + "<table>" // + "<tr>" // + "<td>" + "<h1>" + "HelloWorld" + "<h1>" // + "<td>" // + "<tr>" // + "<table>" + "</body>" + "</html>"; Document html = Jsoup.parse(HTMLSTring); String title = html.title(); String h1 = html.body().getElementsByTag("h1").text(); System.out.println("Input HTML String to JSoup :" + HTMLSTring); System.out.println("After parsing, Title : " + title); System.out.println("Afte parsing, Heading : " + h1); // JSoup Example 2 - Reading HTML page from URL Document doc = null; // String h = null; try { doc = Jsoup.connect("http://www.baidu.com/").get(); title = doc.title(); // h = doc.html(); } catch (IOException e) { e.printStackTrace(); } System.out.println("Input HTML String to JSoup :" + doc.html()); System.out.println("Jsoup Can read HTML page from URL, title : " + title); // JSoup Example 3 - Parsing an HTML file in Java //Document htmlFile = Jsoup.parse("login.html", "ISO-8859-1"); // wrong Document htmlFile = null; try { htmlFile = Jsoup.parse(new File("D:\\Test\\228.html"), "UTF-8"); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } // right title = htmlFile.title(); Element div = htmlFile.getElementById("team_box"); //测试1 /*Elements div1 = htmlFile.getElementsContainingText("工商注册"); Elements div2 = htmlFile.getElementsMatchingText("法定代表人");*/ // String cssClass = div.className(); // getting class form HTML element System.out.println("Jsoup can also parse HTML file directly"); System.out.println("title : " + title); // System.out.println("class of div tag : " + 
cssClass); //增加内容 Elements spans = htmlFile.select("[span=‘注册号:‘]"); System.out.println("\n" + spans.toString() + "\n"); //测试1 /*System.out.println("found: " + div1); System.out.println("found: " + div2);*/ } }
// Time: 2024-11-05 02:38:29