场景介绍:
在处理输入的文本时,需要将 http://bit.ly/3ynriE 等短链接转换为真实链接 lucene.apache.org/solr
1,实现TokenFilter
package com.url.plugin;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.regex.Pattern;

/**
 * Token filter that replaces tokens matching a shortened-URL pattern with the
 * URL they resolve to. Tokens that do not match the pattern, or that cannot be
 * resolved, pass through unchanged.
 */
public class ResolveUrlTokenFilter extends TokenFilter {

    /** Term text of the token currently flowing through the stream. */
    private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);

    /** Pattern a whole token must match to be treated as a shortened URL. */
    private final Pattern patternToMatchShortenedUrls;

    /**
     * @param input                       upstream token stream to wrap
     * @param patternToMatchShortenedUrls pattern that a complete token must match
     *                                    in order to be resolved
     */
    public ResolveUrlTokenFilter(TokenStream input, Pattern patternToMatchShortenedUrls) {
        super(input);
        this.patternToMatchShortenedUrls = patternToMatchShortenedUrls;
    }

    /**
     * Advances to the next token and, if its text matches the shortened-URL
     * pattern, overwrites the term text with the resolved URL.
     *
     * @return {@code false} when the upstream stream is exhausted, {@code true} otherwise
     * @throws IOException if the upstream stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }

        // buffer() exposes the attribute's internal array, which can be longer
        // than the current term; only the first length() chars are valid.
        // Using the array's own length would drag stale characters into the token.
        String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length());

        // Only rewrite tokens that match the configured pattern in full.
        if (patternToMatchShortenedUrls.matcher(token).matches()) {
            charTermAttribute.setEmpty().append(resolveUrlToken(token));
        }
        return true;
    }

    /**
     * Maps a shortened URL to its target. Currently a hard-coded lookup.
     *
     * @param token the shortened URL
     * @return the resolved URL, or {@code token} itself when it is not known
     */
    private String resolveUrlToken(String token) {
        // TODO: replace the hard-coded lookup with real resolution as required.
        try {
            if ("http://bit.ly/3ynriE".equals(token)) {
                return "lucene.apache.org/solr";
            } else if ("http://bit.ly/15tzw".equals(token)) {
                return "manning.com";
            }
        } catch (Exception exc) {
            // Rather than failing analysis if you can't resolve the URL,
            // report the error and fall through to return the unresolved value.
            exc.printStackTrace();
        }
        return token;
    }
}
2,实现TokenFilterFactory
package com.url.plugin;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

import java.util.Map;
import java.util.regex.Pattern;

/**
 * Factory that builds {@link ResolveUrlTokenFilter} instances, configured by
 * the {@code shortenedUrlPattern} attribute of the filter declaration.
 */
public class ResolveUrlTokenFilterFactory extends TokenFilterFactory {

    /** Compiled form of the {@code shortenedUrlPattern} argument. */
    private Pattern patternToMatchShortenedUrls;

    /**
     * @param args filter arguments taken from the Solr configuration; the
     *             {@code shortenedUrlPattern} entry is read through
     *             {@code require} and compiled as a regular expression
     */
    public ResolveUrlTokenFilterFactory(Map<String, String> args) {
        super(args);
        assureMatchVersion();
        // Pull the regular expression out of the configuration and compile it once.
        this.patternToMatchShortenedUrls = Pattern.compile(require(args, "shortenedUrlPattern"));
    }

    /**
     * Wraps the given stream in a new {@link ResolveUrlTokenFilter} that uses
     * the configured pattern.
     *
     * @param tokenStream the stream to filter
     * @return the filtering stream
     */
    @Override
    public TokenFilter create(TokenStream tokenStream) {
        return new ResolveUrlTokenFilter(tokenStream, this.patternToMatchShortenedUrls);
    }
}
3,将其打成jar包
4,在solr的schema文件中添加如下内容
<fieldType name="text_plugin" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <!-- The URL filter below only rewrites a token that matches the pattern in full,
         so the tokenizer must emit each URL as ONE token. StandardTokenizer splits
         "http://bit.ly/3ynriE" at ':' and '/', which would prevent the filter from
         ever matching; the UAX29 URL/Email tokenizer keeps URLs intact. -->
    <tokenizer class="solr.UAX29URLEmailTokenizerFactory"/>
    <filter class="com.url.plugin.ResolveUrlTokenFilterFactory" shortenedUrlPattern="http:\/\/bit.ly\/[\w\-]+" />
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>
5,在solr的根目录下创建plugins文件夹(位置与dist、contrib文件夹同级),并将第3步生成的jar放入其中
6,在solrconfig.xml中添加
<!-- Adds every file whose name matches the regex (i.e. ends in .jar) under the given directory to Solr's plugin classpath. -->
<!-- NOTE(review): the relative dir is presumably resolved against the core's instance directory; confirm it reaches the plugins folder. -->
<lib dir="../../../plugins/" regex=".*\.jar" />
7,java -jar start.jar
时间: 2024-11-05 14:39:37