最近在做一个手机APP,通过一个新闻抓取程序抓取新闻,然后通过APP展示新闻。后来发现手机端不支持Style标签,如果网页中有Style标签,则标签内的内容会直接显示出来,非常影响页面美观。于是就写了一个用NekoHTML来清除Style标签的工具类。
html.filter.properties 配置文件,配置允许的标签和要删除的标签及标签内的属性
attributes=style,id,name,class,width,height,src,oldsrc,complete,align,alt,title
acceptTags=div,span,a,li,ul,nav,br,p,img,font,b,strong,table,tr,td
removeTags=style
PropertiesUtils 读取Properties
package com.tiamaes.gjds.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import org.springframework.core.io.ClassPathResource;

/**
 * <p>Description: reads values from a properties file located on the classpath.</p>
 * <p>Author: Wang Chengwei (王成委)</p>
 * <p>Created: 2015-01-28 11:23:27</p>
 * <p>Copyright: © 2015 Tiamaes</p>
 */
public class PropertiesUtils {

    /** Loaded key/value pairs; stays empty when the resource could not be read. */
    private final Properties properties = new Properties();

    /**
     * Loads the properties file found at the given classpath location.
     *
     * <p>Loading is best-effort: if the resource cannot be opened or read, the
     * stack trace is printed and this instance simply holds no entries, so
     * {@link #get(String)} returns {@code null} for every key.</p>
     *
     * @param path classpath-relative location of the properties file
     */
    public PropertiesUtils(String path) {
        InputStream in = null;
        try {
            in = new ClassPathResource(path).getInputStream();
            properties.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Properties.load leaves the stream open, so it has to be closed here.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * Looks up a single property.
     *
     * @param key property name
     * @return the configured value, or {@code null} when the key is not present
     */
    public String get(String key) {
        return this.properties.getProperty(key);
    }
}
过滤HTML中的标签
package com.tiamaes.gjds.util;

import java.io.CharArrayReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.filters.Writer;
import org.cyberneko.html.parsers.DOMParser;
import org.xml.sax.InputSource;

/**
 * <p>Description: filters tags and attributes out of HTML using NekoHTML.</p>
 * <p>Author: Wang Chengwei (王成委)</p>
 * <p>Created: 2015-01-29 10:45:02</p>
 * <p>Copyright: © 2015 Tiamaes</p>
 *
 * <p>The initial lists of allowed attributes, accepted tags and removed tags
 * are read from {@code html.filter.properties}; they can be adjusted per
 * instance through the add/remove methods below.</p>
 */
public class HtmlFilterUtils {

    /** Classpath location of the filter configuration. */
    private static final String CONFIG_PATH = "html.filter.properties";
    private static final String ATTRIBUTE_FIELD = "attributes";
    private static final String ACCEPT_TAGS_FIELD = "acceptTags";
    private static final String REMOVE_TAGS_FIELD = "removeTags";

    /** Configuration shared by every instance; loaded by the first constructor call. */
    private static PropertiesUtils properties = null;

    /**
     * Shared instance. Declared volatile because {@link #getInstance(boolean)}
     * reads it outside the lock taken by {@link #syncInit()}.
     */
    private static volatile HtmlFilterUtils filter = null;

    /** Attribute names kept on accepted tags. */
    private final List<String> attributes = new ArrayList<String>();
    /** Tags that are kept in the output. */
    private final List<String> acceptTags = new ArrayList<String>();
    /** Tags registered with the remover for removal. */
    private final List<String> removeTags = new ArrayList<String>();

    /** Creates the shared instance if it does not exist yet. */
    private static synchronized void syncInit() {
        if (filter == null) {
            filter = new HtmlFilterUtils();
        }
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the shared filter
     */
    public static HtmlFilterUtils getInstance() {
        return getInstance(false);
    }

    /**
     * Returns either the shared instance or a fresh, independent one.
     *
     * @param createNew {@code true} to get a new instance whose tag and
     *                  attribute lists can be changed without affecting the
     *                  shared one; {@code false} to get the shared instance
     * @return the requested filter
     */
    public static HtmlFilterUtils getInstance(boolean createNew) {
        if (createNew) {
            return new HtmlFilterUtils();
        }
        if (filter == null) {
            syncInit();
        }
        return filter;
    }

    /** Seeds the three lists from the configuration file. */
    private HtmlFilterUtils() {
        if (properties == null) {
            properties = new PropertiesUtils(CONFIG_PATH);
        }
        this.addToList(attributes, properties.get(ATTRIBUTE_FIELD));
        this.addToList(acceptTags, properties.get(ACCEPT_TAGS_FIELD));
        this.addToList(removeTags, properties.get(REMOVE_TAGS_FIELD));
    }

    /**
     * Adds an attribute name to the allowed attributes.
     * (The misspelled method name is kept for existing callers.)
     *
     * @param attrName attribute name to allow
     */
    public void addAtributes(String attrName) {
        this.attributes.add(attrName);
    }

    /**
     * Removes an attribute name from the allowed attributes.
     * (The misspelled method name is kept for existing callers.)
     *
     * @param attrName attribute name to stop allowing
     */
    public void removeAtributes(String attrName) {
        this.attributes.remove(attrName);
    }

    /**
     * Adds a tag to the list of tags to remove.
     * (The misspelled method name is kept for existing callers.)
     *
     * @param tagName tag name to remove
     */
    public void addRmoveTag(String tagName) {
        this.removeTags.add(tagName);
    }

    /**
     * Takes a tag off the list of tags to remove.
     * (The misspelled method name is kept for existing callers.)
     *
     * @param tagName tag name to stop removing
     */
    public void removeRmoveTag(String tagName) {
        this.removeTags.remove(tagName);
    }

    /**
     * Adds a tag to the list of accepted tags.
     *
     * @param tagName tag name to accept
     */
    public void addAcceptTag(String tagName) {
        this.acceptTags.add(tagName);
    }

    /**
     * Takes a tag off the list of accepted tags.
     *
     * @param tagName tag name to stop accepting
     */
    public void removeAcceptTag(String tagName) {
        this.acceptTags.remove(tagName);
    }

    /**
     * Splits a comma-separated configuration value and appends the entries to
     * the given list. A missing value ({@code null}) adds nothing; entries are
     * trimmed and blank entries are skipped.
     *
     * @param list    destination list
     * @param sources comma-separated names, may be {@code null}
     */
    private void addToList(List<String> list, String sources) {
        if (sources == null) {
            return;
        }
        for (String item : sources.split(",")) {
            String name = item.trim();
            if (name.length() > 0) {
                list.add(name);
            }
        }
    }

    /**
     * Runs the given HTML through the configured element remover and returns
     * the serialized result.
     *
     * <p>Filtering is best-effort: if parsing or filtering fails, the stack
     * trace is printed and the input is returned unchanged.</p>
     *
     * @param htmlCode HTML text to filter; {@code null} is returned as-is
     * @return the filtered HTML, or {@code htmlCode} itself when filtering fails
     */
    public String doFilter(String htmlCode) {
        if (htmlCode == null) {
            return null;
        }

        ElementRemover remover = new ElementRemover();
        // The attribute array is the same for every accepted tag; build it once.
        String[] allowedAttributes = attributes.toArray(new String[attributes.size()]);
        for (String tag : acceptTags) {
            remover.acceptElement(tag, allowedAttributes);
        }
        for (String tag : removeTags) {
            remover.removeElement(tag);
        }

        StringWriter filteredDescription = new StringWriter();
        CharArrayReader reader = new CharArrayReader(htmlCode.toCharArray());
        try {
            Writer writer = new Writer(filteredDescription, "UTF-8");
            // Filter order matters: remove elements first, then serialize what is left.
            XMLDocumentFilter[] filters = {remover, writer};
            DOMParser parser = new DOMParser();
            parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
            parser.parse(new InputSource(reader));
            return filteredDescription.toString();
        } catch (Exception e) {
            // Deliberately broad: any failure falls back to the unfiltered input.
            e.printStackTrace();
            return htmlCode;
        } finally {
            reader.close();
        }
    }
}
调用doFilter可以过滤HTML的内容
时间: 2024-10-21 22:22:47