lucene示例

搭建环境

搭建Lucene的开发环境只需要加入Lucene的Jar包，要加入的jar包至少要有：

lucene-core-3.0.1.jar（核心包）

contrib\analyzers\common\lucene-analyzers-3.0.1.jar（分词器）

contrib\highlighter\lucene-highlighter-3.0.1.jar（高亮）

contrib\memory\lucene-memory-3.0.1.jar（高亮功能所依赖的包）

Article.java

 1 package cn.itcast._domain;
 2 public class Article {
 3
 4     private Integer id;
 5     private String title;
 6     private String content;
 7
 8     public Integer getId() {
 9         return id;
10     }
11
12     public void setId(Integer id) {
13         this.id = id;
14     }
15
16     public String getTitle() {
17         return title;
18     }
19
20     public void setTitle(String title) {
21         this.title = title;
22     }
23
24     public String getContent() {
25         return content;
26     }
27
28     public void setContent(String content) {
29         this.content = content;
30     }
31
32 }

HelloWorld.java

  1 package cn.itcast.helloworld;
  2
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.Version;
import org.junit.Test;

import cn.itcast._domain.Article;
 27
 28 public class HelloWorld {
 29
 30     private static Directory directory; // 索引库目录
 31     private static Analyzer analyzer; // 分词器
 32
 33     static {
 34         try {
 35             directory = FSDirectory.open(new File("./indexDir"));
 36             analyzer = new StandardAnalyzer(Version.LUCENE_30);
 37         } catch (IOException e) {
 38             throw new RuntimeException(e);
 39         }
 40     }
 41
 42     // 建立索引
 43     @Test
 44     public void testCreateIndex() throws Exception {
 45         // 准备数据
 46         Article article = new Article();
 47         article.setId(2);
 48         article.setTitle("准备Lucene的开发环境");
 49         article.setContent("如果信息检索系统在用户发出了检索请求后再去互联网上找答案，根本无法在有限的时间内返回结果。");
 50
 51         // 放到索引库中
 52         // 1, 把Article转为Document
 53         Document doc = new Document();
 54         String idStr = article.getId().toString();        //这个使用的话效率降低 被遗弃了
 55         String idStr = NumericUtils.intToPrefixCoded(article.getId()); // 一定要使用Lucene的工具类把数字转为字符串！
 56                         //目录区域  和 数据区
 57         doc.add(new Field("id", idStr, Store.YES, Index.ANALYZED));
 58         doc.add(new Field("title", article.getTitle(), Store.YES, Index.ANALYZED));
 59         doc.add(new Field("content", article.getContent(), Store.NO, Index.ANALYZED));
 60
 61         // 2, 把Document放到索引库中                                            在目录中的长度 源码 Integer.Max_Value
 62         IndexWriter indexWriter = new IndexWriter(directory, analyzer,  MaxFieldLength.UNLIMITED);
 63         indexWriter.addDocument(doc);
 64         indexWriter.close();
 65     }
 66
 67     // 搜索
 68     @Test
 69     public void testSearch() throws Exception {
 70         // 准备查询条件
 71         String queryString = "lucene的";
 72         // String queryString = "hibernate";
 73
 74         // 执行搜索
 75         List<Article> list = new ArrayList<Article>();
 76
 77         // ==========================================================================================
 78
 79         // 1，把查询字符串转为Query对象（默认只从title中查询）
 80         QueryParser queryParser = new QueryParser(Version.LUCENE_30, "title", analyzer);
 81         Query query = queryParser.parse(queryString);
 82
 83         // 2，执行查询，得到中间结果
 84         IndexSearcher indexSearcher = new IndexSearcher(directory); // 指定所用的索引库
 85         TopDocs topDocs = indexSearcher.search(query, 100); // 最多返回前n条结果
 86
 87         int count = topDocs.totalHits;
 88         ScoreDoc[] scoreDocs = topDocs.scoreDocs;
 89
 90         // 3，处理结果
 91         for (int i = 0; i < scoreDocs.length; i++) {
 92             ScoreDoc scoreDoc = scoreDocs[i];
 93             float score = scoreDoc.score; // 相关度得分
 94             int docId = scoreDoc.doc; // Document的内部编号
 95
 96             // 根据编号拿到Document数据
 97             Document doc = indexSearcher.doc(docId);
 98
 99             // 把Document转为Article
100             String idStr = doc.get("id"); //
101             String title = doc.get("title");
102             String content = doc.get("content"); // 等价于 doc.getField("content").stringValue();
103
104             Article article = new Article();                Integer id = NumericUtils.prefixCodedToInt(doc.get("id")); // 一定要使用Lucene的工具类把字符串转为数字！
105             article.setId(id);
106             article.setTitle(title);
107             article.setContent(content);
108
109             list.add(article);
110         }
111         indexSearcher.close();
112
113         // ==========================================================================================
114
115         // 显示结果
116         System.out.println("总结果数：" + list.size());
117         for (Article a : list) {
118             System.out.println("------------------------------");
119             System.out.println("id = " + a.getId());
120             System.out.println("title = " + a.getTitle());
121             System.out.println("content = " + a.getContent());
122         }
123     }
124 }

1-_搜索互联网资源的程序结构.PNG

索引库的内部结构

建立索引的执行过程

搜索的执行过程

分词器要保持一致

lucene示例,布布扣,bubuko.com

时间： 2024-11-03 21:08:41

lucene示例的相关文章

Lucene入门程序-Java API的简单使用

Lucene入门程序准备环境 JDK: 1.8.0_162 IDE: Eclipse Neon.3 数据库: MySQL 5.7.20 Lucene: 4.10.4(已经很稳定了,高版本对部分分词器支持不好) 准备数据 SET FOREIGN_KEY_CHECKS=0; -------------------------------- Table structure for `book` -------------------------------- DROP TABLE IF EXISTS

（29）ElasticSearch分片和副本机制以及单节点环境中创建index解析

1.分片和副本机制 1.index包含多个shard 2.每个shard都是一个最小工作单元,承担部分数据:每个shard都是一个Lucene实例,有完整的建立索引和处理请求的能力 3.增减节点时,shard会自动在nodes中负载均衡 4.primary shard和replica shard,每个document只存在于某个primary shard以及其对应的replica shard中,不可能存在于多个primary shard 5.replica shard是primary shard

【转载】Lucene.Net入门教程及示例

本人看到这篇非常不错的Lucene.Net入门基础教程,就转载分享一下给大家来学习,希望大家在工作实践中可以用到. 一.简单的例子 //索引Private void Index(){ IndexWriter writer = new IndexWriter(@"E:\Index", new StandardAnalyzer()); Document doc = new Document(); doc.Add(new Field("Text",&qu

【Lucene】三个高亮显示模块的简单示例-Highlighter

Lucene针对高亮显示功能提供了两种实现方式,分别是Highlighter和FastVectorHighlighter 这里的三个示例都是使用Highlighter: 示例代码: package com.tan.code; import java.io.File; import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.TokenStream; import org.a

一步一步跟我学习lucene（16）---lucene搜索之facet查询查询示例（2）

本篇是接一步一步跟我学习lucene(14)---lucene搜索之facet索引原理和facet查询实例(http://blog.csdn.net/wuyinggui10000/article/details/45973769),上篇主要是统计facet的dim和每个种类对应的数量,个人感觉这个跟lucene的group不同的在于facet的存储类似于hash(key-field-value)形式的,而group则是单一的map(key-value)形式的,虽然都可以统计某一品类的数量,显然f

一步一步跟我学习lucene（8）---lucene搜索之索引的查询原理和查询工具类示例

昨天我们了解了lucene搜索之IndexSearcher构建过程(http://blog.csdn.net/wuyinggui10000/article/details/45698667),对lucene的IndexSearcher有一个大体的了解,知道了怎么创建IndexSearcher,就要开始学会使用IndexSearcher进行索引的搜索,本节我们学习索引的查询原理和根据其相关原理写索引查询的工具类的编写: IndexSearcher提供了几个常用的方法: IndexSearcher.

一步一步跟我学习lucene（18）---lucene索引时join和查询时join使用示例

了解sql的朋友都知道,我们在查询的时候可以采用join查询,即对有一定关联关系的对象进行联合查询来对多维的数据进行整理.这个联合查询的方式挺方便的,跟我们现实生活中的托人找关系类似,我们想要完成一件事,先找自己的熟人,然后通过熟人在一次找到其他,最终通过这种手段找到想要联系到的人.有点类似于"世间万物皆有联系"的感觉. lucene的join包提供了索引时join和查询时join的功能: Index-time join 大意是索引时join提供了查询时join的支持,且IndexWr

一步一步跟我学习lucene（15）---lucene搜索之正则表达式查询RegExQuery和手机邮箱查询示例

今天快下班的时候收到了一个群友的问题,大意是读取文本文件中的内容,找出文件中的手机号和邮箱,我自己写了一个读取文档的内容的正则查询示例,用于匹配文件中是否含有邮箱或者手机号,这个等于是对之前的文本处理工具的一个梳理,同时结合lucene内部提供的正则匹配查询RegexQuery: 废话不多说了,直接上代码,这里先对文件内容读取分类处理,分为pdf word excel 和普通文本四类,不同的种类读取文本内容不一样 pdf利用pdfbox读取内容,word和excel利用poi进行读取内容,文本文

Lucene分类统计示例

需求在检索系统中,遇到了分组统计(Grouping/GroupBy)的需求,比如将搜索结果按照栏目分类,统计每个栏目下各有多少条结果.以前的做法很愚蠢,先发起一次search统计出有多少组,然后在每个组里发起一次search:这样在有N组的情况下一共执行了N+1次搜索,效率低下.改进最近发现Lucene提供了分组的功能,是通过Collector实现的,最多可以在2次search的时候得出结果,如果内存够用,CachingCollector还可以节约一次查询.两次检索第一次第一次的目的是收集符合条