Converting queries into filters: Lucene ships a QueryWrapperFilter, but its performance is poor, so for the most part we had to write our own filters: TermFilter, ExactPhraseFilter, ConjunctionFilter, and DisjunctionFilter.
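For illustration, a minimal sketch of what such a TermFilter can look like against the Lucene 4.0 API used in the code below (a sketch under my assumptions, not the exact production class): it hands the raw postings back as a DocIdSetIterator, so tf-idf scoring never runs.

import java.io.IOException;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;

// Minimal sketch: matches every document containing a single term, no scoring.
public class TermFilter extends Filter {
    private final Term term;

    public TermFilter(Term term) {
        this.term = term;
    }

    @Override
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        Terms terms = context.reader().fields().terms(term.field());
        if (terms == null) {
            return null; // field absent in this segment: no matches
        }
        TermsEnum te = terms.iterator(null);
        if (!te.seekExact(term.bytes(), true)) {
            return null; // term absent: no matches
        }
        final DocsEnum docs = te.docs(acceptDocs, null, 0);
        return new DocIdSet() {
            @Override
            public DocIdSetIterator iterator() {
                return docs; // DocsEnum is already a DocIdSetIterator
            }
        };
    }
}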
After several days of testing, OR still shows the clearest win: with 4 TermFilters and 4,508 hits returned, performance on my machine improved by about one third. ExactPhraseFilter also improved slightly (5%-10%).
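The OR comparison boils down to a BooleanQuery of SHOULD TermQuerys versus the same terms behind a filter wrapped in ConstantScoreQuery. A hypothetical harness along those lines (DisjunctionFilter is our own class and is not listed in this post; assume it exposes an add(Filter) method, and TermFilter is the sketch above):

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class OrComparison {
    // Scored baseline: a BooleanQuery of SHOULD TermQuerys.
    static Query scoredOr(String field, String... words) {
        BooleanQuery bq = new BooleanQuery();
        for (String w : words) {
            bq.add(new TermQuery(new Term(field, w)), Occur.SHOULD);
        }
        return bq;
    }

    // Unscored variant: the same terms behind a filter, so scoring never runs.
    static Query filteredOr(String field, String... words) {
        DisjunctionFilter df = new DisjunctionFilter();
        for (String w : words) {
            df.add(new TermFilter(new Term(field, w)));
        }
        return new ConstantScoreQuery(df);
    }

    // Crude timing of one search, in microseconds.
    static long micros(IndexSearcher searcher, Query q) throws IOException {
        long t0 = System.nanoTime();
        searcher.search(q, 20);
        return (System.nanoTime() - t0) / 1000;
    }
}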
The most puzzling case is AND. I originally assumed the outcome would track the result count and the number of sub-queries, but across several test runs it almost always got slower.
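For reference, the doc-matching loop a ConjunctionFilter runs is the standard leapfrog intersection sketched below (my ConjunctionFilter is not listed in this post; this is a simplified stand-in). Notably, BooleanQuery's conjunction scorer performs essentially the same intersection with scoring on top, which may be one reason AND has less to gain here.

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;

// Sketch of a leapfrog intersection over any number of iterators.
public class LeapfrogIntersection {
    // Advances all iterators to the next doc they share; NO_MORE_DOCS when exhausted.
    public static int nextCommonDoc(DocIdSetIterator[] its) throws IOException {
        int doc = its[0].nextDoc();
        while (doc != DocIdSetIterator.NO_MORE_DOCS) {
            int i = 1;
            for (; i < its.length; ++i) {
                int d = its[i].docID();
                if (d < doc) {
                    d = its[i].advance(doc); // leapfrog: jump straight to the candidate
                }
                if (d > doc) {
                    // overshoot: the lead iterator catches up and we retry
                    doc = (d == DocIdSetIterator.NO_MORE_DOCS) ? d : its[0].advance(d);
                    break;
                }
            }
            if (i == its.length) {
                return doc; // every iterator agrees on this doc
            }
        }
        return DocIdSetIterator.NO_MORE_DOCS;
    }
}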
The ExactPhraseFilter and its unit test are attached below:
import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;

// A greatly simplified take on Lucene's phrase query, exposed as a Filter
// so that no tf-idf scoring is performed.
public class ExactPhraseFilter extends Filter {
    protected final ArrayList<Term> terms = new ArrayList<Term>();
    protected final ArrayList<Integer> positions = new ArrayList<Integer>();
    protected String fieldName;

    public void add(Term term) {
        if (terms.size() == 0) {
            fieldName = term.field();
        } else {
            assert fieldName.equals(term.field());
        }
        positions.add(Integer.valueOf(terms.size()));
        terms.add(term);
    }

    @Override
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        return new ExactPhraseDocIdSet(context, acceptDocs);
    }

    static class PostingAndFreq implements Comparable<PostingAndFreq> {
        DocsAndPositionsEnum posEnum;
        int docFreq;
        int position;       // offset of this term within the phrase
        boolean useAdvance; // advance() instead of repeated nextDoc() for frequent terms
        int posFreq = 0;    // freq() of the current doc
        int pos = -1;       // current position, shifted by the in-phrase offset
        int posTime = 0;    // positions consumed so far in the current doc

        public PostingAndFreq(DocsAndPositionsEnum posEnum, int docFreq, int position, boolean useAdvance) {
            this.posEnum = posEnum;
            this.docFreq = docFreq;
            this.position = position;
            this.useAdvance = useAdvance;
        }

        @Override
        public int compareTo(PostingAndFreq other) {
            if (docFreq != other.docFreq) {
                return docFreq - other.docFreq;
            }
            if (position != other.position) {
                return position - other.position;
            }
            return 0;
        }
    }

    protected class ExactPhraseDocIdSet extends DocIdSet {
        protected final AtomicReaderContext context;
        protected final Bits acceptDocs;
        protected final PostingAndFreq[] postings;
        protected boolean noDocs = false;

        public ExactPhraseDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
            this.context = context;
            this.acceptDocs = acceptDocs;

            Terms fieldTerms = context.reader().fields().terms(fieldName);
            postings = new PostingAndFreq[terms.size()];
            if (fieldTerms == null) { // field absent in this segment
                noDocs = true;
                return;
            }

            TermsEnum te = fieldTerms.iterator(null);
            for (int i = 0; i < terms.size(); ++i) {
                final Term t = terms.get(i);
                if (!te.seekExact(t.bytes(), true)) {
                    noDocs = true;
                    return;
                }
                if (i == 0) {
                    postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0),
                            te.docFreq(), positions.get(i), false);
                } else {
                    // Terms much more frequent than the rarest one are cheaper to advance().
                    postings[i] = new PostingAndFreq(te.docsAndPositions(acceptDocs, null, 0),
                            te.docFreq(), positions.get(i), te.docFreq() > 5 * postings[0].docFreq);
                }
            }

            // Rarest term first: it drives the iteration.
            ArrayUtil.mergeSort(postings);
            for (int i = 1; i < terms.size(); ++i) {
                postings[i].posEnum.nextDoc();
            }
        }

        @Override
        public DocIdSetIterator iterator() throws IOException {
            if (noDocs) {
                return EMPTY_DOCIDSET.iterator();
            } else {
                return new ExactPhraseDocIdSetIterator(context, acceptDocs);
            }
        }

        protected class ExactPhraseDocIdSetIterator extends DocIdSetIterator {
            protected int docID = -1;

            public ExactPhraseDocIdSetIterator(AtomicReaderContext context, Bits acceptDocs) throws IOException {
            }

            @Override
            public int nextDoc() throws IOException {
                while (true) {
                    // first (rarest) term
                    final int doc = postings[0].posEnum.nextDoc();
                    if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                        return docID = doc;
                    }

                    // non-first terms: catch up to the candidate doc
                    int i = 1;
                    while (i < postings.length) {
                        final PostingAndFreq pf = postings[i];
                        int doc2 = pf.posEnum.docID();
                        if (pf.useAdvance) {
                            if (doc2 < doc) {
                                doc2 = pf.posEnum.advance(doc);
                            }
                        } else {
                            int iter = 0;
                            while (doc2 < doc) {
                                // fall back to advance() after 50 fruitless nextDoc() calls
                                if (++iter == 50) {
                                    doc2 = pf.posEnum.advance(doc);
                                } else {
                                    doc2 = pf.posEnum.nextDoc();
                                }
                            }
                        }
                        if (doc2 > doc) {
                            break;
                        }
                        ++i;
                    }

                    if (i == postings.length) {
                        // all terms are on this doc; verify the positions
                        docID = doc;
                        if (containsPhrase()) {
                            return docID;
                        }
                    }
                }
            }

            @Override
            public int advance(int target) throws IOException {
                // Not needed here: ConstantScoreQuery drives us via nextDoc().
                throw new UnsupportedOperationException();
            }

            // All terms share the current doc; check whether their positions line up.
            private boolean containsPhrase() throws IOException {
                int index = -1;
                int i = 0;
                PostingAndFreq pf;

                // init.
                for (i = 0; i < postings.length; ++i) {
                    postings[i].posFreq = postings[i].posEnum.freq();
                    postings[i].pos = postings[i].posEnum.nextPosition() - postings[i].position;
                    postings[i].posTime = 1;
                }

                while (true) {
                    // first term.
                    pf = postings[0];
                    while (pf.pos < index && pf.posTime < pf.posFreq) {
                        pf.pos = pf.posEnum.nextPosition() - pf.position;
                        ++pf.posTime;
                    }
                    if (pf.pos >= index) {
                        index = pf.pos;
                    } else if (pf.posTime == pf.posFreq) {
                        return false;
                    }

                    // other terms.
                    for (i = 1; i < postings.length; ++i) {
                        pf = postings[i];
                        while (pf.pos < index && pf.posTime < pf.posFreq) {
                            pf.pos = pf.posEnum.nextPosition() - pf.position;
                            ++pf.posTime;
                        }
                        if (pf.pos > index) {
                            index = pf.pos;
                            break;
                        }
                        if (pf.pos == index) {
                            continue;
                        }
                        if (pf.posTime == pf.posFreq) {
                            return false;
                        }
                    }
                    if (i == postings.length) {
                        return true;
                    }
                }
            }

            @Override
            public int docID() {
                return docID;
            }
        }
    }
}
Unit test:
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.testng.annotations.AfterTest;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;

import com.dp.arts.lucenex.codec.Dp10Codec;

public class ExactPhraseFilterTest {
    final Directory dir = new RAMDirectory();

    @BeforeTest
    public void setUp() throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setCodec(Codec.forName(Dp10Codec.DP10_CODEC_NAME));
        IndexWriter writer = new IndexWriter(dir, iwc);
        addDocument(writer, "新疆烧烤");     // 0
        addDocument(writer, "啤酒");         // 1
        addDocument(writer, "烤烧");         // 2
        addDocument(writer, "烧烧烧");       // 3
        addDocument(writer, "烤烧中华烧烤"); // 4
        writer.close();
    }

    private void addDocument(IndexWriter writer, String str) throws IOException {
        Document doc = new Document();
        doc.add(new TextField("searchkeywords", str, Store.YES));
        writer.addDocument(doc, new StandardAnalyzer(Version.LUCENE_40));
    }

    @AfterTest
    public void tearDown() throws IOException {
        this.dir.close();
    }

    @Test
    public void test1() throws IOException {
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);

        ExactPhraseFilter pf = new ExactPhraseFilter();
        pf.add(new Term("searchkeywords", "烧"));
        pf.add(new Term("searchkeywords", "烤"));
        Query query = new ConstantScoreQuery(pf);

        // Only docs 0 and 4 contain the exact phrase "烧烤".
        TopDocs results = searcher.search(query, 20);
        assert results.totalHits == 2;
        assert results.scoreDocs[0].doc == 0;
        assert results.scoreDocs[1].doc == 4;

        searcher.getIndexReader().close();
    }
}