1.导入jar包（lucene-core、lucene-analyzers-common 4.9 版本，以及 IKAnalyzer、jericho-html、commons-io、junit）
2.创建实体Bean
package com.zhishang.lucene; /** * Created by Administrator on 2017/7/8. */ public class HtmlBean { private String title; private String content; private String url; public void setTitle(String title) { this.title = title; } public void setContent(String content) { this.content = content; } public void setUrl(String url) { this.url = url; } public String getTitle() { return title; } public String getContent() { return content; } public String getUrl() { return url; } }
3.创建工具Bean
package com.zhishang.lucene;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

/**
 * Created by Administrator on 2017/7/8.
 *
 * Helper that turns an HTML file into an {@link HtmlBean}.
 */
public class HtmlBeanUtil {

    /**
     * Parses the given file and copies its title text, its full extracted
     * text and its absolute path into a new {@link HtmlBean}.
     *
     * @param file the HTML file to read
     * @return the populated bean, or {@code null} when the document has no
     *         {@code <title>} element or when reading the file raises an
     *         {@link IOException} (the stack trace is printed in that case)
     */
    public static HtmlBean parseHtml(File file) {
        HtmlBean result = null;
        try {
            Source source = new Source(file);
            Element titleElement = source.getFirstElement(HTMLElementName.TITLE);
            // Only documents that carry a title element produce a bean.
            if (titleElement != null && titleElement.getTextExtractor() != null) {
                String titleText = titleElement.getTextExtractor().toString();
                String bodyText = source.getTextExtractor().toString();
                String location = file.getAbsolutePath();

                HtmlBean parsed = new HtmlBean();
                parsed.setTitle(titleText);
                parsed.setContent(bodyText);
                parsed.setUrl(location);
                result = parsed;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return result;
    }
}
4.创建操作Bean
package com.zhishang.lucene;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.io.IOException;
import java.util.Collection;

/**
 * Created by Administrator on 2017/7/7.
 *
 * Builds a Lucene index under {@link #indexDir} from every file found
 * (recursively) under {@link #dataDir}. Documents are first written to an
 * in-memory index and merged into the on-disk index in batches.
 */
public class CreateIndex {
    public static final String indexDir = "G:/index";
    public static final String dataDir = "G:/data";

    /** Number of documents buffered in memory before they are merged into the on-disk index. */
    private static final int BATCH_SIZE = 50;

    /**
     * Indexes every file under {@link #dataDir} that {@link HtmlBeanUtil#parseHtml(File)}
     * can turn into a bean. Each document gets three stored fields: {@code title}
     * and {@code url} as {@link StringField}s and {@code content} as a {@link TextField}.
     *
     * <p>An {@link IOException} raised while indexing is printed to standard error
     * and not rethrown. All writers and the index directory are closed on every
     * exit path.
     */
    public void createIndex() {
        Directory dir = null;
        IndexWriter writer = null;
        IndexWriter ramWriter = null;
        try {
            dir = FSDirectory.open(new File(indexDir));
            // Analyzer for the on-disk writer
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            writer = new IndexWriter(dir, config);

            File file = new File(dataDir);
            RAMDirectory ramdir = new RAMDirectory();
            ramWriter = newRamWriter(ramdir);

            Collection<File> files = FileUtils.listFiles(file, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE);
            int count = 0;
            for (File f : files) {
                HtmlBean bean = HtmlBeanUtil.parseHtml(f);
                if (bean == null) {
                    continue;
                }
                Document document = new Document();
                document.add(new StringField("title", bean.getTitle(), Field.Store.YES));
                document.add(new TextField("content", bean.getContent(), Field.Store.YES));
                document.add(new StringField("url", bean.getUrl(), Field.Store.YES));
                ramWriter.addDocument(document);
                count++;
                if (count == BATCH_SIZE) {
                    // The in-memory writer must be closed before its directory is merged.
                    ramWriter.close();
                    ramWriter = null;
                    writer.addIndexes(ramdir);
                    ramdir = new RAMDirectory();
                    ramWriter = newRamWriter(ramdir);
                    count = 0;
                }
            }

            // Merge the final, partially filled batch; without this step the last
            // (files % BATCH_SIZE) documents would never reach the on-disk index.
            ramWriter.close();
            ramWriter = null;
            if (count > 0) {
                writer.addIndexes(ramdir);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeAndReport(ramWriter);
            closeAndReport(writer);
            closeAndReport(dir);
        }
    }

    /** Opens a writer over the given in-memory directory, using a fresh IKAnalyzer. */
    private static IndexWriter newRamWriter(RAMDirectory ramdir) throws IOException {
        Analyzer analyzer = new IKAnalyzer();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
        return new IndexWriter(ramdir, config);
    }

    /** Closes the resource if it is non-null, printing (not propagating) any IOException. */
    private static void closeAndReport(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
5.创建测试Bean
package com.zhishang.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import java.io.File;

/**
 * Created by Administrator on 2017/7/8.
 *
 * JUnit entry point that rebuilds the index via {@link CreateIndex}.
 */
public class LuceneBean {

    /**
     * Create the index.
     *
     * <p>If the index directory already exists it is emptied first and then
     * recreated, so that a rerun does not append to a previous index.
     *
     * @throws IllegalStateException if an existing file or directory under the
     *         index directory cannot be deleted
     */
    @Test
    public void createIndex() {
        File file = new File(CreateIndex.indexDir);
        if (file.exists()) {
            // File.delete() only removes empty directories, so the old index
            // files have to be removed one by one before the directory itself.
            deleteRecursively(file);
            file.mkdirs();
        }
        CreateIndex createIndex = new CreateIndex();
        createIndex.createIndex();
    }

    /** Deletes the given file, or the given directory together with everything below it. */
    private static void deleteRecursively(File target) {
        File[] children = target.listFiles(); // null when target is not a directory
        if (children != null) {
            for (File child : children) {
                deleteRecursively(child);
            }
        }
        if (!target.delete() && target.exists()) {
            throw new IllegalStateException("Could not delete " + target.getAbsolutePath());
        }
    }
}
6.查看生成的索引文件
时间: 2024-12-12 11:40:30