package parser;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
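/**
* Fetching web page content with HtmlParser: the most convenient way to grab a page's
* content is StringBean, which exposes a few options that control how the content is
* rendered; they are explained in the code below. The HtmlParser package also ships an
* example class, StringExtractor, which offers a method that returns the content directly
* and is itself built on StringBean. Alternatively, the page can be processed tag by tag
* using Parser directly.
*
* @author chenguoyong
*
*/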
public class GetContent {
/**
 * Fetches the page at {@code url} with a {@link StringBean} and prints its text
 * content to standard output.
 *
 * <p>The whitespace options below are normally switched on to get tidy output;
 * switch them off to keep the page's original layout (for example the
 * indentation on a page that shows source code).
 *
 * @param url address of the page to fetch
 */
public void getContentUsingStringBean(String url) {
    StringBean sb = new StringBean();
    sb.setLinks(true); // whether the page's hyperlinks are shown in the output
    sb.setCollapse(true); // if true, a run of whitespace characters is replaced by a single one
    sb.setReplaceNonBreakingSpaces(true); // if true, non-breaking spaces become regular spaces
    // Use the caller-supplied address; the previous code ignored the parameter
    // and always fetched one hard-coded page.
    sb.setURL(url);
    System.out.println("TheContent is :\n" + sb.getStrings());
}
/**
 * Fetches the page at {@code url} with a {@link StringExtractor} and prints its
 * text content to standard output.
 *
 * <p>Per the original author's note, StringExtractor works the same way as
 * StringBean internally and is only a thin wrapper around it.
 *
 * <p>A {@link ParserException} raised during extraction is not propagated; its
 * stack trace is printed instead.
 *
 * @param url  address of the page to fetch
 * @param link forwarded unchanged to {@code extractStrings}; presumably controls
 *             whether hyperlinks are included in the text (confirm against the
 *             StringExtractor documentation)
 */
public void getContentUsingStringExtractor(String url, boolean link) {
    StringExtractor se = new StringExtractor(url);
    try {
        String text = se.extractStrings(link);
        System.out.println("Thecontent is :\n" + text);
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
publicvoid getContentUsingParser(String url) {
NodeListnl;
try{
Parserp = new Parser(url);
nl= p.parse(new NodeClassFilter(BodyTag.class));
BodyTagbt = (BodyTag) nl.elementAt(0);
System.out.println(bt.toPlainTextString());// 保留原来的内容格式. 包含js代码
}catch (ParserException e) {
e.printStackTrace();
}
}
/**
* @param args
*/
publicstatic void main(String[] args) {
Stringurl = "http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html";
//newGetContent().getContentUsingParser(url);
//--------------------------------------------------
newGetContent().getContentUsingStringBean(url);
http://c.tieba.baidu.com/p/3408749050
http://c.tieba.baidu.com/p/3408749395
http://c.tieba.baidu.com/p/3408869872
http://c.tieba.baidu.com/p/3408889389
http://c.tieba.baidu.com/p/3408905730
http://c.tieba.baidu.com/p/3408983919
http://c.tieba.baidu.com/p/3408987713
http://c.tieba.baidu.com/p/3409238829
http://c.tieba.baidu.com/p/3409302576
http://c.tieba.baidu.com/p/3409324206
http://c.tieba.baidu.com/p/3409328563
http://c.tieba.baidu.com/p/3409332883
http://c.tieba.baidu.com/p/3409337269
http://c.tieba.baidu.com/p/3409341558
http://c.tieba.baidu.com/p/3409345894
http://c.tieba.baidu.com/p/3409350213
http://c.tieba.baidu.com/p/3409354458
http://c.tieba.baidu.com/p/3409358652
http://c.tieba.baidu.com/p/3409358652
http://c.tieba.baidu.com/p/3409363045
http://c.tieba.baidu.com/p/3409367533
http://c.tieba.baidu.com/p/3409371860
http://c.tieba.baidu.com/p/3409376337
http://c.tieba.baidu.com/p/3409380701
http://c.tieba.baidu.com/p/3409389603
http://c.tieba.baidu.com/p/3409394100
http://c.tieba.baidu.com/p/3409398551
http://c.tieba.baidu.com/p/3409403048
http://c.tieba.baidu.com/p/3409412676
http://c.tieba.baidu.com/p/3409407844
http://c.tieba.baidu.com/p/3409417793
http://c.tieba.baidu.com/p/3409422741
http://c.tieba.baidu.com/p/3409432831
http://c.tieba.baidu.com/p/3409437768
http://c.tieba.baidu.com/p/3409442408
http://c.tieba.baidu.com/p/3409447140
http://c.tieba.baidu.com/p/3409451830
http://c.tieba.baidu.com/p/3409456819
http://c.tieba.baidu.com/p/3409461659
http://c.tieba.baidu.com/p/3409461659
http://c.tieba.baidu.com/p/3409466665
http://c.tieba.baidu.com/p/3409471467
http://c.tieba.baidu.com/p/3409476139
http://c.tieba.baidu.com/p/3409480662
http://c.tieba.baidu.com/p/3409485140
http://c.tieba.baidu.com/p/3409490104
http://c.tieba.baidu.com/p/3409494880
http://c.tieba.baidu.com/p/3409500048
http://c.tieba.baidu.com/p/3409538997
http://c.tieba.baidu.com/p/3409543296
http://c.tieba.baidu.com/p/3409548124
http://c.tieba.baidu.com/p/3409552702
http://c.tieba.baidu.com/p/3409557518
http://c.tieba.baidu.com/p/3409562457
http://c.tieba.baidu.com/p/3409567386
http://c.tieba.baidu.com/p/3409572148
http://c.tieba.baidu.com/p/3409576791
http://c.tieba.baidu.com/p/3409581593
http://c.tieba.baidu.com/p/3409586354
http://c.tieba.baidu.com/p/3409626383
http://c.tieba.baidu.com/p/3409385259
http://c.tieba.baidu.com/p/3409767728
http://c.tieba.baidu.com/p/3409787667
http://c.tieba.baidu.com/p/3409791516
http://c.tieba.baidu.com/p/3409795327
http://c.tieba.baidu.com/p/3409866665
http://c.tieba.baidu.com/p/3409873864
http://c.tieba.baidu.com/p/3409879998
http://c.tieba.baidu.com/p/3409884553
http://c.tieba.baidu.com/p/3409895642
http://c.tieba.baidu.com/p/3409900207
http://c.tieba.baidu.com/p/3409903862
http://c.tieba.baidu.com/p/3409912381
http://c.tieba.baidu.com/p/3409908113
http://c.tieba.baidu.com/p/3409991219
http://c.tieba.baidu.com/p/3410010420
http://c.tieba.baidu.com/p/3410018434
http://c.tieba.baidu.com/p/3410178761
http://c.tieba.baidu.com/p/3410147170
http://c.tieba.baidu.com/p/3410141093
http://c.tieba.baidu.com/p/3410131727
http://c.tieba.baidu.com/p/3410122313
http://c.tieba.baidu.com/p/3410112662
http://c.tieba.baidu.com/p/3410103121
http://c.tieba.baidu.com/p/3410097950
http://c.tieba.baidu.com/p/3410093865
http://c.tieba.baidu.com/p/3410088684
http://c.tieba.baidu.com/p/3410052996
http://c.tieba.baidu.com/p/3410046741
http://c.tieba.baidu.com/p/3408925683
http://c.tieba.baidu.com/p/3410196625