lucene创建索引

1.导入jar包

2.创建实体Bean

package com.zhishang.lucene;

/**
 * Plain data holder for one parsed HTML page: its title, its extracted
 * text content and the location it was read from.
 *
 * Created by Administrator on 2017/7/8.
 */
public class HtmlBean {
    /** Page title. */
    private String title;
    /** Text content of the page. */
    private String content;
    /** Location of the page. */
    private String url;

    /** @return the page title, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the page content, or {@code null} if none has been set */
    public String getContent() {
        return this.content;
    }

    /** @param content the page content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the page location, or {@code null} if none has been set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page location to store */
    public void setUrl(String url) {
        this.url = url;
    }
}

3.创建工具Bean

package com.zhishang.lucene;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

/**
 * Created by Administrator on 2017/7/8.
 */
public class HtmlBeanUtil {

    public static HtmlBean parseHtml(File file){
        try {
            Source sc = new Source(file);
            Element element = sc.getFirstElement(HTMLElementName.TITLE);
            if (element == null || element.getTextExtractor() == null){
                return null;
            }

            HtmlBean htmlBean = new HtmlBean();
            htmlBean.setTitle(element.getTextExtractor().toString());
            htmlBean.setContent(sc.getTextExtractor().toString());
            htmlBean.setUrl(file.getAbsolutePath());

            return htmlBean;
        } catch (IOException e) {
            e.printStackTrace();
        }

        return null;
    }
}

4.创建操作Bean

package com.zhishang.lucene;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.io.IOException;
import java.util.Collection;

/**
 * Created by Administrator on 2017/7/7.
 */
public class CreateIndex {
    public static final String indexDir = "G:/index";
    public static final String dataDir = "G:/data";

    public void createIndex(){
        try {
            Directory dir = FSDirectory.open(new File(indexDir));
            //分词器
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9,analyzer);
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            IndexWriter writer = new IndexWriter(dir,config);
            File file = new File(dataDir);

            RAMDirectory ramdir = new RAMDirectory();
            Analyzer analyzer1 = new IKAnalyzer();
            IndexWriterConfig config1 = new IndexWriterConfig(Version.LUCENE_4_9,analyzer1);
            IndexWriter ramWriter = new IndexWriter(ramdir,config1);

            Collection<File> files = FileUtils.listFiles(file, TrueFileFilter.INSTANCE,TrueFileFilter.INSTANCE);
            int count = 0;
            for(File f:files){
                HtmlBean bean =  HtmlBeanUtil.parseHtml(f);
                if(bean != null){
                    Document document = new Document();
                    document.add(new StringField("title",bean.getTitle(), Field.Store.YES));
                    document.add(new TextField("content",bean.getContent(), Field.Store.YES));
                    document.add(new StringField("url",bean.getUrl(), Field.Store.YES));
                    ramWriter.addDocument(document);
                    count++;
                    if (count == 50){
                        ramWriter.close();
                        writer.addIndexes(ramdir);
                        ramdir = new RAMDirectory();
                        Analyzer analyzer2 = new IKAnalyzer();
                        IndexWriterConfig config2 = new IndexWriterConfig(Version.LUCENE_4_9,analyzer2);
                        ramWriter = new IndexWriter(ramdir,config2);
                        count = 0;
                    }

                }
            }
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }
}

5.创建测试Bean

package com.zhishang.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import java.io.File;

/**
 * Created by Administrator on 2017/7/8.
 */
public class LuceneBean {

    /*
    创建索引
     */
    @Test
    public void createIndex(){
        File file = new File(CreateIndex.indexDir);
        if (file.exists()){
            file.delete();
            file.mkdirs();
        }
        CreateIndex createIndex = new CreateIndex();
        createIndex.createIndex();
    }
}

6.查看生成的索引文件

时间： 2024-12-12 11:40:30

lucene创建索引的相关文章

lucene创建索引以及索引文件合并

1 package test; 2 3 import java.io.File; 4 import java.io.IOException; 5 import java.nio.file.Path; 6 import java.util.ArrayList; 7 import java.util.List; 8 import java.util.Map; 9 10 import org.apache.lucene.analysis.standard.StandardAnalyzer; 11 im

lucene创建索引的几种方式(一)

什么是索引: 根据你输入的值去找,这个值就是索引. 第一种创建索引的方式: 根据文件来生成索引,如后缀为.txt等的文件. 步骤: 第一步:FSDirectory.open(Paths.get(url));根据路径获取存储索引的目录. FSDirectory:表示对文件系统目录的操作. RAMDirectory:内存中的目录操作. Paths为NIO(new io)的一个类; Path 类是 java.io.File 类的升级版, File file=new File("index.html")

使用Lucene对预处理后的文档进行创建索引（可执行）

时间: 2015/3/18 杨鑫newlife 对于文档的预处理后,就要开始使用Lucene来处理相关的内容了. 这里使用的Lucene的过程如下: 首先要为处理对象建立索引, 二是构建查询对象, 三是在索引中查找. 这里的代码是处理创建索引的部分代码: package ch2.lucenedemo.process; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import jav

Lucene 4.7 --创建索引

Lucene的最新版本和以前的语法或者类名,类规定都相差甚远 0.准备工作: 1). Lucene官方API http://lucene.apache.org/core/4_7_0/index.html 2). 我用到的常用JAR包下载:http://download.csdn.net/detail/yangxy81118/8062269 3). 所用到的jar包 lucene-analyzers-common-4.7.0.jar lucene-analyzers-smartcn-4.7.0.j

lucene学习-创建索引

本文的lucene是基于lucene3.5版本. 使用lucene实现搜索引擎开发,核心的部分是建立索引和搜索.本节主要是记录创建索引部分的内容. 创建的索引结构如图所示. 创建索引的步骤分为以下几个步骤: 1.建立索引器IndexWriter 2.创建文档对象Document 3.建立信息对象字段Field 4.将Field对象添加到Document 5.将Document对象添加到IndexWriter对象中下面简要介绍几个核心对象. (1).创建IndexWriter对象. IndexW

Lucene.net 从创建索引到搜索的代码范例

关于Lucene.Net的介绍网上已经很多了在这里就不多介绍Lucene.Net主要分为建立索引,维护索引和搜索索引Field.Store的作用是通过全文检查就能返回对应的内容,而不必再通过id去DB中加载.Field.Store.YES:存储字段值(未分词前的字段值)Field.Store.NO:不存储,存储与索引没有关系Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损Field.Index.ANALYZED:分词建索引 Field.Index.ANALYZE

全文检索之lucene的优化篇--创建索引库

在上一篇HelloWorld的基础上,建立一个directory的包,添加一个DirectoryTest的测试类,用来根据指定的索引目录创建目录存放指引. DirectoryTest类中的代码如下,基本上就是在HelloWorld的基础上改改就可以了. 里面一共三个方法,testDirectory(),测试创建索引库;testDirectoryFSAndRAM(),结合方法1的两种创建方式,优化;testDirectoryOptimize(),在方法2个基础上,研究索引的优化创建,减少创建的索引

基于lucene的案例开发：创建索引

转载请注明出处:http://blog.csdn.net/xiaojimanman/article/details/42872711 从这篇博客开始,不论是API介绍还是后面的案例开发,都是基于 lucene4.3.1 这个版本,Lucene4.3.1 下载请点击这里, Lucene其他版本下载请点击这里,Lucene4.3.1官方API文档请点击这里. 创建索引demo 在开始介绍之前,先看一个简单的索引创建demo程序: /** *@Description: 索引创建demo */ pack

使用Lucene对预处理后的文档进行创建索引（可运行）

时间: 2015/3/18 杨鑫newlife 对于文档的预处理后,就要开始使用Lucene来处理相关的内容了. 这里使用的Lucene的步骤如下: 首先要为处理对象建立索引, 二是构建查询对象, 三是在索引中查找. 这里的代码是处理创建索引的部分代码: package ch2.lucenedemo.process; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.