近期用Lucene做了个比较简单的站内检索,在这里和大家做个交流。全文检索的实现,从检索的数据源来分有两种:一种是数据库,另一种是已生成的文件(doc,html,txt......)。
无论哪一种方式,实现原理都是一样的。主要分为两大步:
一、将数据源转换为Lucene索引文件,保存到设定的索引目录下
private static String filePath = "D:\\rookie\\date\\";// directory holding the source files to be indexed
private static String indexPath = "D:\\rookie\\source";// directory where the index is stored
/**
 * Rebuilds the Lucene index in {@code indexPath} from the {@code .txt} files
 * found directly inside {@code filePath} (sub-directories are not visited).
 *
 * <p>Each file is decoded as GBK and stored as one document with three
 * fields: {@code id}, {@code path} and {@code contents}.
 *
 * @param args unused
 * @throws Exception if the source directory cannot be listed, a file cannot
 *         be read, or the index cannot be written
 */
public static void main(String[] args) throws Exception {
    // Directory containing the files to index.
    File fileDir = new File(filePath);
    // Directory in which the index files are written.
    File indexDir = new File(indexPath);

    // List the source files BEFORE opening the writer: listFiles() returns
    // null for a missing/unreadable directory, and opening the writer with
    // create=true would already have wiped the existing index by then.
    File[] textFiles = fileDir.listFiles();
    if (textFiles == null) {
        throw new IllegalStateException(
                "Source directory does not exist or cannot be read: " + fileDir.getPath());
    }

    Analyzer luceneAnalyzer = new StandardAnalyzer();
    // Last argument is the "create" flag: true rebuilds the index from
    // scratch; pass false to append to / update an existing index.
    IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
    long startTime = new Date().getTime();
    try {
        // Add one document per matching file.
        for (int i = 0; i < textFiles.length; i++) {
            File textFile = textFiles[i];
            // Only plain-text (.txt) files are indexed.
            if (textFile.isFile() && textFile.getName().endsWith(".txt")) {
                String temp = FileReaderAll(textFile.getCanonicalPath(), "GBK");
                Document document = new Document();
                // Storing an id per document is strongly recommended so it can
                // later be updated or deleted by Term.
                // NOTE(review): "12345" is a placeholder shared by every
                // document; replace it with a real unique key before relying
                // on id-based update/delete.
                Field idField = new Field("id", "12345", Field.Store.YES, Field.Index.UN_TOKENIZED);
                Field pathField = new Field("path", textFile.getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED);
                Field bodyField = new Field("contents", temp, Field.Store.YES, Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS);
                document.add(idField);
                document.add(pathField);
                document.add(bodyField);
                indexWriter.addDocument(document);
            }
        }
        // optimize() optimizes the index.
        indexWriter.optimize();
    } finally {
        // Always close the writer, even on failure, so the index write lock
        // is released.
        indexWriter.close();
    }
    // Report how long indexing took.
    long endTime = new Date().getTime();
    // Print the index directory here (the original printed the source directory
    // under the "index path" label).
    System.out.println("索引已经添加到文档中,共花费了" + (endTime - startTime) + " 毫秒! 索引路径是:" + indexDir.getPath());
}
/**
 * Reads a whole text file (html, txt, ...) into a single string.
 *
 * <p>Line terminators are discarded: the file's lines are concatenated with
 * nothing between them, so {@code "a\nb"} is returned as {@code "ab"}.
 *
 * @param FileName path of the file to read
 * @param charset  name of the charset used to decode the file, e.g. "GBK"
 * @return the concatenated lines; an empty string for an empty file
 * @throws IOException if the file cannot be opened or read, or the charset
 *         name is not supported
 * @author rookie_d
 */
public static String FileReaderAll(String FileName, String charset)
        throws IOException {
    FileInputStream in = new FileInputStream(FileName);
    try {
        // The reader is created inside the try so the stream is released even
        // when the charset name is rejected.
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
        // StringBuilder avoids the quadratic copying of repeated String +=.
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line);
        }
        return content.toString();
    } finally {
        // Closing the underlying stream releases the file handle on every path.
        in.close();
    }
}
二、从Lucene索引文件中进行检索
/**
 * Searches the "contents" field of the index for a fixed query string and
 * collects the stored "path" value of every matching document.
 *
 * <p>Progress and the hit count are printed to standard output.
 *
 * @return the paths (as {@code String}) of all matching documents; empty when
 *         nothing matches, the query cannot be parsed, or the index cannot be
 *         read
 * @author rookie_d
 */
public static List luceneSearcher() {
    String queryString = "好"; // the text to search for
    String indexPath = "D:\\rookie\\source"; // where the index is stored
    List list = new ArrayList();
    IndexSearcher searcher = null;
    try {
        searcher = new IndexSearcher(indexPath);
        Analyzer analyzer = new StandardAnalyzer();
        QueryParser qp = new QueryParser("contents", analyzer);
        System.out.println(qp.getField());

        Query query;
        try {
            query = qp.parse(queryString);
        } catch (org.apache.lucene.queryParser.ParseException e) {
            e.printStackTrace();
            // Without a valid query there is nothing to search for; do not
            // hand a null query to the searcher.
            return list;
        }
        System.out.println(query);

        Hits hits = searcher.search(query);
        // Check for null before touching hits at all.
        int total = (hits == null) ? 0 : hits.length();
        System.out.println(total);
        if (total > 0) {
            System.out.println("共找到:" + total + "个结果!");
            for (int i = 0; i < total; i++) {
                Document document = hits.doc(i);
                String path = document.get("path");
                // Round-trip through File, as before, so the stored value is
                // returned in the platform's path form.
                list.add(new File(path).getPath());
            }
        } else {
            System.out.println("*****no result find*****");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the index files held by the searcher on every path.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return list;
}
在开发过程中遇到了更新索引的小难题,下面也给段转来的代码,作为菜鸟认为这段代码还是比较有用的
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
/**
 * Small demo of adding, updating and searching a document in a Lucene index.
 */
public class UpdateDocument {
    /** Location of the index directory. */
    private static String path = "d:/index";

    /**
     * Updates the demo document and then searches for two user names.
     *
     * @param args unused
     */
    public static void main(String[] args){
        // Run addIndex() once beforehand to create the index that
        // updateIndex() opens.
        // addIndex();
        updateIndex();
        search("李四");
        search("王五");
    }

    /**
     * Creates a new index (replacing any existing one) holding a single
     * document with the fields id, userName and comefrom.
     */
    public static void addIndex(){
        IndexWriter write = null;
        try {
            // create=true: start from an empty index.
            write = new IndexWriter(path, new StandardAnalyzer(), true);
            Document doc = new Document();
            doc.add(new Field("id", "123456", Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("userName", "张三", Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("comefrom", "北京", Field.Store.YES, Field.Index.TOKENIZED));
            write.addDocument(doc);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(write);
        }
    }

    /**
     * Replaces the document whose id is "123456" with a new document.
     *
     * <p>updateDocument looks up the documents matching the given term; if
     * any exist they are replaced by the new document (when several match,
     * only one remains afterwards), otherwise the new document is added.
     * Unlike a database, where a single column can be updated, Lucene can
     * only replace the whole row. The new document built here has no
     * "comefrom" field, so that value is gone after the update.
     */
    public static void updateIndex(){
        IndexWriter write = null;
        try {
            // create=false: open the existing index instead of recreating it.
            write = new IndexWriter(path, new StandardAnalyzer(), false);
            Document docNew = new Document();
            docNew.add(new Field("id", "123456", Field.Store.YES, Field.Index.UN_TOKENIZED));
            docNew.add(new Field("userName", "王五", Field.Store.YES, Field.Index.TOKENIZED));
            Term term = new Term("id", "123456");
            write.updateDocument(term, docNew);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The writer must be closed here, even when the update fails.
            closeWriter(write);
        }
    }

    /**
     * Parses a query string against the "userName" field.
     *
     * @param str the query text
     * @return the parsed query, or {@code null} if parsing failed (the
     *         failure is printed to standard error)
     */
    public static Query queryParser(String str){
        QueryParser queryParser = new QueryParser("userName", new StandardAnalyzer());
        try {
            return queryParser.parse(str);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Searches the index for the given user name and prints every hit.
     *
     * @param str the query text
     */
    public static void search(String str){
        IndexSearcher search = null;
        try {
            search = new IndexSearcher(path);
            Query query = queryParser(str);
            if (query == null) {
                // Parsing failed and was already reported by queryParser.
                return;
            }
            Hits hits = search.search(query);
            if (hits == null) {
                return;
            }
            if (hits.length() == 0) {
                System.out.println(" 没有搜索到‘" + str+"‘");
                return;
            }
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                System.out.println("id = " + hits.id(i));
                System.out.println("own id = " + doc.get("id"));
                System.out.println("userName = " + doc.get("userName"));
                System.out.println("come from = " + doc.get("comefrom"));
                System.out.println("");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the index files held by the searcher on every path.
            if (search != null) {
                try {
                    search.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Closes the writer if it was opened, reporting (not propagating) close failures. */
    private static void closeWriter(IndexWriter write){
        if (write == null) {
            return;
        }
        try {
            write.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
最后再给一段删除索引的代码:
// Delete the matching documents from the Lucene index.
File indexDir = new File(indexPath); // location of the index files
File[] textFiles = indexDir.listFiles();
Analyzer luceneAnalyzer = new StandardAnalyzer();
// Create a new index only when the directory is missing or empty; otherwise
// open the existing index so its contents are preserved.
boolean create = (textFiles == null || textFiles.length <= 0);
// Build the term before opening the writer so a failure in news.getId()
// cannot leave an open writer behind.
Term term = new Term("id", news.getId());
IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, create);
try {
    indexWriter.deleteDocuments(term);
    indexWriter.optimize(); // optimize() optimizes the index
} finally {
    // Always close the writer, even when the delete or optimize step fails.
    indexWriter.close();
}
在删除和更新索引时要注意:如果索引已经存在,new IndexWriter(indexDir, luceneAnalyzer, false) 的最后一个参数应为false;若传入true会重新创建索引,已有的索引内容将被清空。
关于全文检索的内容还有许多需要学习,写这篇文章来帮助新手和自己来熟悉Lucene,希望对你有一点帮助!