--------------------------------------------------
IndexReader的设计
--------------------------------------------------
IndexReader的创建开销很大,需要消耗大量内存空间,
所以将IndexReader设计成类的属性(成员变量),只创建一次,
这样整个项目周期中就只有一个IndexReader
1.// IndexReader的设计
private static IndexReader reader = null;
2.在构造方法中对IndexReader进行初始化
// 创建indexReader
reader = IndexReader.open(directory);
3.创建getSearch()方法,返回IndexSearcher
// Builds a new IndexSearcher around the shared static reader on every call.
// The reader is not opened or refreshed here; it must already be initialized.
private static IndexSearcher getSearch() {
return new IndexSearcher(reader);
}
4.最后在使用完毕后,只需要关闭IndexSearcher(共享的IndexReader不要关闭)
// 最后只需要关闭search
search.close();
PS:
因为此IndexReader属于单例模式,在IndexReader使用过程中,如果通过IndexWriter改变了索引,IndexReader所search出的数据将不会改变,除非重新构建一个新的IndexReader
代码优化:
// Optimized body for getSearch(): reuse the shared reader and reopen it only
// when the index has actually changed.
try {
    if (reader == null) {
        reader = IndexReader.open(directory);
        // Alternative: open a reader that is not read-only.
        // reader = IndexReader.open(directory, false);
    } else {
        // openIfChanged returns a new reader when the index has changed,
        // otherwise it returns null.
        IndexReader read = IndexReader.openIfChanged(reader);
        if (read != null) {
            // Close the stale reader before switching to the fresh one.
            reader.close();
            reader = read;
        }
    }
    return new IndexSearcher(reader);
} catch (CorruptIndexException e) {
    e.printStackTrace();
} catch (IOException e) {
    e.printStackTrace();
}
// Reached only when opening or reopening the reader failed.
return null;
//因为此时Reader已经为全局范围,用reader也能删除文档
/*
* 使用reader删除,会立即更新索引信息(但不建议)
*/
// reader.deleteDocuments(new Term("id","1")); // reader.close();
有时候整个项目周期中只有一个IndexWriter
这时候,IndexWriter就不能关闭
那么怎么提交呢?
使用IndexWriter.commit()方法提交对索引操作后的数据
--------------------------------------------------------
Directory的几种操作方式
--------------------------------------------------------
FSDirectory.open():系统会根据具体运行环境选择最佳方式打开一个Directory
new RAMDirectory():将索引存储在内存中。好处:速度快。坏处:不能持久化
new RAMDirectory(Directory dir):也可以将一个已持久化的directory加载到内存中。
-------------------------------------------------------
lucene的搜索_TermRange等基本搜索
-------------------------------------------------------
1.创建IndexSearcher
/*
 * Returns an IndexSearcher over the shared reader.
 * The reader is opened on first use; on later calls it is swapped for a
 * fresh one only when IndexReader.openIfChanged reports a change.
 * Returns null (after printing the stack trace) if the index cannot be read.
 */
public IndexSearcher getSearch() {
    try {
        if (reader != null) {
            IndexReader refreshed = IndexReader.openIfChanged(reader);
            if (refreshed != null) {
                // The index changed: drop the stale reader and keep the new one.
                reader.close();
                reader = refreshed;
            }
        } else {
            reader = IndexReader.open(directory);
        }
        return new IndexSearcher(reader);
    } catch (IOException e) {
        // CorruptIndexException is an IOException subclass, so one handler covers both.
        e.printStackTrace();
        return null;
    }
}
2.查询的几种
======精确查询:
// Exact-match lookup: wrap the field/value pair in a Term and search for it.
IndexSearcher search = getSearch();
Term term = new Term(field, name);
Query query = new TermQuery(term);
TopDocs tds = search.search(query, num);
for (ScoreDoc sdc : tds.scoreDocs) {
    // Load the stored document behind each hit.
    Document doc = search.doc(sdc.doc);
}
search.close();
======基于字符串的范围查询(TermRange)
IndexSearcher search = getSearch();
// The two trailing booleans make the lower and upper bounds inclusive (closed interval).
Query query = new TermRangeQuery(field, start, end, true, true);
TopDocs tds = search.search(query, num);
System.out.println("一共查询了:" + tds.totalHits);
// Release the searcher once the results have been consumed.
search.close();
======基于数字的范围查询(NumericRangeQuery.new...)
IndexSearcher search = getSearch(); // true表示闭区间 Query query = NumericRangeQuery.newIntRange(field, start, end,true, true); TopDocs tds = search.search(query, num); System.out.println("一共查询了:" + tds.totalHits);
======PS:tds.totalHits是命中的总记录数,与我们传入的num(最多返回的条数)没有任何关系
eg:
// 基于范围的查询(参数:传入的field,开始字符,结束字符,显示数目)
public void SearchByTermRange(String field, String start, String end,int num) { try { IndexSearcher search = getSearch(); // 范围查询 // true表示闭区间(是否包含开始字符和结束字符,默认为true) Query query = new TermRangeQuery(field, start, end, true, true); TopDocs tds = search.search(query, num); System.out.println("一共查询了:" + tds.totalHits); for (ScoreDoc sdc : tds.scoreDocs) { Document doc = search.doc(sdc.doc); System.out.println(sdc.doc + doc.get("name") + "[" + doc.get("email") + "," + doc.get("id") + "," + doc.get("attach") + "]"); } search.close(); } catch (IOException e) { e.printStackTrace(); } }
-----------------------------------------------------------------
lucene的搜索_其他常用Query搜索
-----------------------------------------------------------------
======前缀搜索(prefixquery)
Query query = new PrefixQuery(new Term(field, value));
======通配符搜索(wildcardquery)
Query query = new WildcardQuery(new Term(field, value));
//使用方法
sutil.SearchByWildCard("name", "l*", 3);
在传入的value中可以使用通配符? 和 *
?表示匹配一个字符,*表示匹配任意多个字符。可以在任何位置使用。
=======可以连接多个条件(BooleanQuery)
BooleanQuery query = new BooleanQuery(); // Occur.Must表必须 //Occur.SHOULD表示可有可无 //Occur.MUST_NOT表示必须没有 query.add(new TermQuery(new Term("name", "lili")), Occur.MUST); query.add(new TermQuery(new Term("content", "hello")), Occur.MUST);
=======短语查询(phrasequery)
PhraseQuery query = new PhraseQuery(); // setSlop()设置跳数,及两个单词之间有几个单词 query.setSlop(1); // 设置field字段,即哪两个单词 // 第一个term query.add(new Term("content", "i")); // 产生距离后的第二个term query.add(new Term("content", "basketball"));
======模糊查询(FuzzyQuery)
//会匹配有一个字符出错的情况
Query query=new FuzzyQuery(new Term("name", "mirk"));
-------------------------------------------------------------
lucene的搜索_基于QueryParser的搜索
-------------------------------------------------------------
//基于字符串操作 public void SearchByQueryParse(Query query,int num){ try { IndexSearcher search = getSearch(); TopDocs tds = search.search(query, num); System.out.println("一共查询了:" + tds.totalHits); for (ScoreDoc sdc : tds.scoreDocs) { Document doc = search.doc(sdc.doc); System.out.println(sdc.doc + doc.get("name") + "[" + doc.get("email") + "," + doc.get("id") + "," + doc.get("attach") + "]"); } search.close(); } catch (IOException e) { e.printStackTrace(); } }
//使用query查询(创建queryparser,再通过queryparser创建query) // 1.创建Parse对象(设置默认搜索域为content) QueryParser parse = new QueryParser(Version.LUCENE_35, "content",new StandardAnalyzer(Version.LUCENE_35)); // 改变空格的默认操作(改为AND型) parse.setDefaultOperator(Operator.AND); // 开启第一个字符的通配符匹配(*xxx,?xxx),默认关闭,因为效率比较低 parse.setAllowLeadingWildcard(true); // 2.通过parse生成query(搜索content域中包含有like的) Query query = parse.parse("like"); // 能够一直加条件(空格默认就是OR) query = parse.parse("basketball i"); // 改变搜索域(域:值) query = parse.parse("name:mark"); // 同样能进行*或?的通配符匹配(通配符默认不能放在首位) query = parse.parse("name:*i"); // name中不包含mark,但是content中包含basketball(-和+必须放在域说明的前面) query = parse.parse("- name:mark + basketball"); // id的1~3(TO表示一个闭区间,TO必须是大写的) query = parse.parse("id:[1 TO 3]"); // {}表示1~3的开区间匹配 query = parse.parse("id:{1 TO 3}"); // name域值是lili或mark,默认域值是game query = parse.parse("name:(lili OR mark) AND game"); // 两个‘’号表示短语匹配 query = parse.parse("'i like basketball'"); // 表示i basketball之间有一个单词遗漏的匹配 query = parse.parse("\"i basketball\"~1"); // 加个~就能模糊查询mark query = parse.parse("name:mirk~"); // 没有办法匹配数字范围(自己扩展parse) query = parse.parse("attach:[1 TO 3]"); sutil.SearchByQueryParse(query, 5);
------------------------------------------------------------
简单分页搜索
------------------------------------------------------------
Lucene通过再查询的方式实现分页:每次重新查询取出数据,再按起止位置截取出当前页
3.5及以后的版本还可以使用searchAfter
/**
 * First paging approach: re-run the query, fetch the top hits up to the end of
 * the requested page, and print only the slice that belongs to that page.
 * A page beyond the last hit prints nothing.
 *
 * @param query     query string, parsed against the "content" field
 * @param pageIndex 1-based page number
 * @param pageSize  number of hits per page
 * @throws IllegalArgumentException if pageIndex or pageSize is less than 1
 */
public void searchPage(String query, int pageIndex, int pageSize) {
    if (pageIndex < 1 || pageSize < 1) {
        throw new IllegalArgumentException("pageIndex and pageSize must be >= 1, got pageIndex="
                + pageIndex + ", pageSize=" + pageSize);
    }
    IndexSearcher searcher = null;
    try {
        Directory dir = FileIndexUtils.getDirectory();
        searcher = getSearcher(dir);
        QueryParser parser = new QueryParser(Version.LUCENE_35, "content",
                new StandardAnalyzer(Version.LUCENE_35));
        Query q = parser.parse(query);
        int start = (pageIndex - 1) * pageSize;
        int end = pageIndex * pageSize;
        // Fetch exactly as many hits as the page needs instead of a fixed 500,
        // so pages past the 500th hit are reachable.
        TopDocs tds = searcher.search(q, end);
        ScoreDoc[] sds = tds.scoreDocs;
        // The last page may be short (or empty): never index past the hits returned.
        int last = Math.min(end, sds.length);
        for (int i = start; i < last; i++) {
            Document doc = searcher.doc(sds[i].doc);
            System.out.println(sds[i].doc + ":" + doc.get("path") + "-->" + doc.get("filename"));
        }
    } catch (org.apache.lucene.queryParser.ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the searcher even when parsing or searching failed.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
-----------------------------------------------------------
lucene的搜索_基于searchAfter的实现(Lucene3.5之后)
-----------------------------------------------------------
/**
* 根据页码和分页大小获取上一次的最后一个ScoreDoc
*/
private ScoreDoc getLastScoreDoc(int pageIndex,int pageSize,Query query,IndexSearcher searcher) throws IOException { if(pageIndex==1)return null;//如果是第一页就返回空 int num = pageSize*(pageIndex-1);//获取上一页的数量 //每次只取上面所有的元素 TopDocs tds = searcher.search(query, num); return tds.scoreDocs[num-1]; } public void searchPageByAfter(String query,int pageIndex,int pageSize) { try { Directory dir = FileIndexUtils.getDirectory(); IndexSearcher searcher = getSearcher(dir); QueryParser parser = new QueryParser(Version.LUCENE_35,"content",new StandardAnalyzer(Version.LUCENE_35)); Query q = parser.parse(query); //先获取上一页的最后一个元素 ScoreDoc lastSd = getLastScoreDoc(pageIndex, pageSize, q, searcher); //通过最后一个元素搜索下页的pageSize个元素 TopDocs tds = searcher.searchAfter(lastSd,q, pageSize); for(ScoreDoc sd:tds.scoreDocs) { Document doc = searcher.doc(sd.doc); System.out.println(sd.doc+":"+doc.get("path")+"-->"+doc.get("filename")); } searcher.close(); } catch (org.apache.lucene.queryParser.ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } }
代码片段
package test.lucene.index;

import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

/**
 * Demo helper that builds a small in-memory index of six documents and runs
 * the basic Lucene query types against it, printing the hits to stdout.
 *
 * One IndexReader is kept per instance and reopened only when the index has
 * changed; every search method obtains a fresh IndexSearcher from it.
 */
public class SearchUtil {

    /*
     * Sample data for six documents; element i of every array belongs to document i.
     */
    private String[] ids = { "1", "2", "3", "4", "5", "6" };
    // NOTE(review): these values look like redacted placeholders rather than real
    // addresses; they are kept verbatim because they are indexed as-is.
    private String[] emails = { "[email protected]", "[email protected]", "[email protected]", "[email protected]", "[email protected]", "[email protected]" };
    private String[] contents = { "hello boy,i like pingpang", "like boy",
            "xx bye i like swim", "hehe, i like basketball",
            "dd fsfs, i like movie", "hello xxx,i like game" };
    private int[] attachs = { 2, 3, 1, 4, 5, 5 };
    private String[] names = { "lili", "wangwu", "lisi", "jack", "tom", "mark" };

    // Boost per e-mail domain. Nothing in this class populates it, so every
    // document currently receives the fallback boost in index().
    private Map<String, Float> scores = new HashMap<String, Float>();

    private Directory directory;
    private IndexReader reader;

    /** Creates the helper on top of an empty in-memory directory. */
    public SearchUtil() {
        directory = new RAMDirectory();
    }

    /**
     * Rebuilds the index from the sample arrays. Existing documents are deleted
     * first. I/O failures are printed and otherwise ignored.
     */
    public void index() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            // Start from an empty index on every call.
            writer.deleteAll();
            for (int i = 0; i < ids.length; i++) {
                Document document = new Document();
                document.add(new Field("id", ids[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));
                // Indexed as a single term, not tokenized.
                document.add(new Field("email", emails[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                // Tokenized for full-text search, but not stored.
                document.add(new Field("content", contents[i],
                        Field.Store.NO, Field.Index.ANALYZED));
                document.add(new Field("name", names[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                // Numeric field, so it can be searched with NumericRangeQuery.
                document.add(new NumericField("attach", Field.Store.YES, true)
                        .setIntValue(attachs[i]));

                // Document boost: looked up by the text after the last '@',
                // falling back to 0.5 when there is no entry.
                String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
                if (scores.containsKey(et)) {
                    document.setBoost(scores.get(et));
                } else {
                    document.setBoost(0.5f);
                }
                writer.addDocument(document);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns an IndexSearcher over the shared reader. The reader is opened on
     * first use and replaced only when IndexReader.openIfChanged reports a change.
     *
     * @return a new searcher, or null (after printing the stack trace) if the
     *         index cannot be read
     */
    public IndexSearcher getSearch() {
        try {
            if (reader == null) {
                reader = IndexReader.open(directory);
            } else {
                IndexReader read = IndexReader.openIfChanged(reader);
                if (read != null) {
                    // The index changed: close the stale reader and keep the new one.
                    reader.close();
                    reader = read;
                }
            }
            return new IndexSearcher(reader);
        } catch (IOException e) {
            // CorruptIndexException is an IOException subclass, so one handler covers both.
            e.printStackTrace();
        }
        return null;
    }

    /** Exact match: documents whose {@code field} contains exactly the term {@code name}. */
    public void SearchByTerm(String field, String name, int num) {
        printHits(new TermQuery(new Term(field, name)), num);
    }

    /** Range query over string terms; both {@code start} and {@code end} are inclusive. */
    public void SearchByTermRange(String field, String start, String end,
            int num) {
        printHits(new TermRangeQuery(field, start, end, true, true), num);
    }

    /** Range query over an int NumericField; both {@code start} and {@code end} are inclusive. */
    public void SearchByNumricRange(String field, int start, int end, int num) {
        printHits(NumericRangeQuery.newIntRange(field, start, end, true, true), num);
    }

    /** Prefix query: terms of {@code field} that start with {@code value}. */
    public void SearchByPrefix(String field, String value, int num) {
        printHits(new PrefixQuery(new Term(field, value)), num);
    }

    /** Wildcard query: {@code value} may contain ? (one character) and * (any number of characters). */
    public void SearchByWildCard(String field, String value, int num) {
        printHits(new WildcardQuery(new Term(field, value)), num);
    }

    /** Combined query: name must be "lili" and content must contain "hello". */
    public void SearchByBoolean(int num) {
        BooleanQuery query = new BooleanQuery();
        // Occur.MUST = required, Occur.SHOULD = optional, Occur.MUST_NOT = forbidden.
        query.add(new TermQuery(new Term("name", "lili")), Occur.MUST);
        query.add(new TermQuery(new Term("content", "hello")), Occur.MUST);
        printHits(query, num);
    }

    /** Phrase query: "i" followed by "basketball" in content, with a slop of 1. */
    public void SearchByPhrase(int num) {
        PhraseQuery query = new PhraseQuery();
        // Slop: how many positions the terms may be apart.
        query.setSlop(1);
        query.add(new Term("content", "i"));
        query.add(new Term("content", "basketball"));
        printHits(query, num);
    }

    /** Fuzzy query: names similar to the misspelled "mirk". */
    public void SearchByFuzzy(int num) {
        printHits(new FuzzyQuery(new Term("name", "mirk")), num);
    }

    /** Runs a query that was built elsewhere, typically by a QueryParser. */
    public void SearchByQueryParse(Query query, int num) {
        printHits(query, num);
    }

    /**
     * Executes {@code query}, prints the total hit count and then one line per
     * returned document. The searcher is always closed afterwards.
     *
     * @param query the query to run
     * @param num   maximum number of documents to return; does not limit totalHits
     * @throws IllegalStateException if getSearch() could not provide a searcher
     */
    private void printHits(Query query, int num) {
        IndexSearcher search = getSearch();
        if (search == null) {
            // Fail with a clear message instead of a bare NullPointerException.
            throw new IllegalStateException("index could not be opened");
        }
        try {
            TopDocs tds = search.search(query, num);
            System.out.println("一共查询了:" + tds.totalHits);
            for (ScoreDoc sdc : tds.scoreDocs) {
                Document doc = search.doc(sdc.doc);
                System.out.println(sdc.doc + doc.get("name") + "["
                        + doc.get("email") + "," + doc.get("id") + ","
                        + doc.get("attach") + "]");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                search.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
package test.lucene.index; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.QueryParser.Operator; import org.apache.lucene.search.Query; import org.apache.lucene.util.Version; import org.junit.Before; import org.junit.Test; public class SearchTest { private SearchUtil sutil; @Before public void init() throws Exception { sutil = new SearchUtil(); } @Test public void searchByterm() { sutil.index(); sutil.SearchByTerm("name", "mark", 3); } @Test public void searchByRangeTerm() { sutil.index(); sutil.SearchByTermRange("id", "1", "3", 10); // 查询name以a开头和s结尾的 sutil.SearchByTermRange("name", "a", "s", 10); // 由于attach是数字类型,使用termrange无法查询 sutil.SearchByTermRange("attach", "1", "5", 10); } @Test public void searchByNumricRange() { sutil.index(); // 由于attach是数字类型,使用NumricRange进行查询 sutil.SearchByNumricRange("attach", 2, 5, 10); } @Test public void searchByPrefix() { sutil.index(); // 前缀搜索 sutil.SearchByPrefix("name", "l", 3); } @Test public void searchByWildCard() { sutil.index(); // 通配符搜索 sutil.SearchByWildCard("name", "l*", 3); } @Test public void searchByBoolean() { sutil.index(); // 多条件查询 sutil.SearchByBoolean(3); } @Test public void searchByPhrase() { sutil.index(); // 短语查询 sutil.SearchByPhrase(5); } @Test public void searchByFuzzy() { sutil.index(); // 模糊查询 sutil.SearchByFuzzy(5); } @Test public void searchByqueryParse() throws Exception { sutil.index(); // 1.创建Parse对象(设置默认搜索域为content) QueryParser parse = new QueryParser(Version.LUCENE_35, "content", new StandardAnalyzer(Version.LUCENE_35)); // 改变空格的默认操作(改为AND型) parse.setDefaultOperator(Operator.AND); // 开启第一个字符的通配符匹配(*xxx,?xxx),默认关闭,因为效率比较低 parse.setAllowLeadingWildcard(true); // 2.通过parse生成query(搜索content域中包含有like的) Query query = parse.parse("like"); // 能够一直加条件(空格默认就是OR) query = parse.parse("basketball i"); // 改变搜索域(域:值) query = parse.parse("name:mark"); // 
同样能进行*或?的通配符匹配(通配符默认不能放在首位) query = parse.parse("name:*i"); // name中不包含mark,但是content中包含basketball(-和+必须放在域说明的前面) query = parse.parse("- name:mark + basketball"); // id的1~3(TO表示一个闭区间,TO必须是大写的) query = parse.parse("id:[1 TO 3]"); // {}表示1~3的开区间匹配 query = parse.parse("id:{1 TO 3}"); // name域值是lili或mark,默认域值是game query = parse.parse("name:(lili OR mark) AND game"); // 两个‘’号表示短语匹配 query = parse.parse("'i like basketball'"); // 表示i basketball之间有一个单词遗漏的匹配 query = parse.parse("\"i basketball\"~1"); // 加个~就能模糊查询mark query = parse.parse("name:mirk~"); // 没有办法匹配数字范围(自己扩展parse) query = parse.parse("attach:[1 TO 3]"); sutil.SearchByQueryParse(query, 5); } }