package com.lin.util; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.wltea.analyzer.lucene.IKAnalyzer; public class LuceneUtil { private Log log = LogFactory.getLog(LuceneUtil.class); private IndexWriter writer; private IndexReader reader; private static Tika tika = new Tika(); /** * 建立索引 * @param srcDriectory 需要建立索引的文件位置 * @param indexDirectory 索引放置位置 * @param analyzer 解析器 * @param version lucene版本 * @param openMode 打开方式(1.创建,2追加,3创建或追加) * @throws IOException * @throws TikaException */ @SuppressWarnings("deprecation") public void diskIndex(File srcDriectory,File indexDirectory, Analyzer analyzer, Version version,OpenMode openMode ) throws IOException, TikaException { if(!indexDirectory.exists()){ indexDirectory.mkdirs(); } FSDirectory fsd = FSDirectory.open(indexDirectory); IndexWriterConfig config = new IndexWriterConfig(version, analyzer); config.setOpenMode(openMode); writer = new IndexWriter(fsd, config); List<File> 
files = FileUtil.listFile(srcDriectory); Document doc = null; for (File file : files) { doc = new Document(); doc.add(new Field("name", file.getName(), Store.YES, Index.ANALYZED)); doc.add(new Field("path", file.getAbsolutePath(), Store.YES, Index.NO)); doc.add(new Field("content", tikaParseFileToString(file), Store.YES, Index.ANALYZED)); writer.addDocument(doc); } writer.commit(); } /** * 获取查询把柄 * @param indexDirectory * @return * @throws IOException */ public IndexSearcher getIndexSearch(File indexDiretory) throws IOException{ Directory directory = FSDirectory.open(indexDiretory); return new IndexSearcher(reader.open(directory)); } public String search(File indexDirectory,String word,Analyzer analyzer) throws IOException, ParseException{ IndexSearcher indexSearch = getIndexSearch(indexDirectory); QueryParser parser = new QueryParser( "content",analyzer); Query query = parser.parse(word); TopDocs docs = indexSearch.search(query, 10); ScoreDoc[] sds = docs.scoreDocs; for(ScoreDoc sd:sds){ Document document = indexSearch.doc(sd.doc); System.out.println("name==========="+document.get("name")+"path==========="+document.get("path") ); } return null; } public String tikaParseFileToString(File file) throws IOException, TikaException{ return tika.parseToString(file); } public static void main(String[] args)throws Exception { //new LuceneUtil().diskIndex(new File("d:\\lucene"), new File("d:\\luceneIndex"), new IKAnalyzer(), Version.LUCENE_4_10_2, OpenMode.CREATE); new LuceneUtil().search(new File("d:\\luceneIndex"),"接口",new IKAnalyzer()); Tika tika = new Tika(); String str = tika.parseToString(new FileInputStream("d:\\lucene\\IKAnalyzer中文分词器V2012_FF使用手册.pdf")); System.out.println(str); } }
项目依赖使用 Maven 管理,pom.xml 如下:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.lin.project</groupId>
    <artifactId>learn</artifactId>
    <packaging>war</packaging>
    <version>0.0.1-SNAPSHOT</version>
    <name>mybatis Maven Webapp</name>
    <url>http://maven.apache.org</url>

    <!-- Versions shared by several artifacts are declared once here. -->
    <properties>
        <redis.clients.version>2.6.0</redis.clients.version>
        <spring.data.redis.version>1.4.0.RELEASE</spring.data.redis.version>
        <spring.version>4.0.6.RELEASE</spring.version>
        <lucene.version>4.10.2</lucene.version>
    </properties>

    <dependencies>
        <!-- Testing -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>

        <!-- Logging -->
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>

        <!-- Connection pool and MyBatis -->
        <dependency>
            <groupId>commons-dbcp</groupId>
            <artifactId>commons-dbcp</artifactId>
            <version>1.4</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.2.7</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis-spring</artifactId>
            <version>1.2.2</version>
        </dependency>

        <!-- Spring Framework (all modules share spring.version) -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-beans</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-tx</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-aop</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-webmvc</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-web</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context-support</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-orm</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-test</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.aspectj</groupId>
            <artifactId>aspectjweaver</artifactId>
            <version>1.8.2</version>
        </dependency>

        <!-- JSP tag libraries -->
        <dependency>
            <groupId>jstl</groupId>
            <artifactId>jstl</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>taglibs</groupId>
            <artifactId>standard</artifactId>
            <version>1.1.2</version>
        </dependency>

        <!-- JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.32</version>
        </dependency>

        <!-- Scheduling -->
        <dependency>
            <groupId>org.quartz-scheduler</groupId>
            <artifactId>quartz</artifactId>
            <version>2.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.quartz-scheduler</groupId>
            <artifactId>quartz-jobs</artifactId>
            <version>2.2.1</version>
        </dependency>

        <!-- JSON -->
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-core-asl</artifactId>
            <version>1.9.13</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-mapper-asl</artifactId>
            <version>1.9.13</version>
        </dependency>

        <!-- File upload -->
        <dependency>
            <groupId>commons-fileupload</groupId>
            <artifactId>commons-fileupload</artifactId>
            <version>1.3.1</version>
        </dependency>

        <!-- Redis -->
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>${redis.clients.version}</version>
            <type>jar</type>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework.data</groupId>
            <artifactId>spring-data-redis</artifactId>
            <version>${spring.data.redis.version}</version>
        </dependency>

        <!-- Lucene (all modules share lucene.version) -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <!-- Tika is left disabled here; the Tika and IKAnalyzer jars are expected
             to be added to the classpath manually. -->
        <!-- <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-app</artifactId> <version>1.6</version> </dependency> -->
    </dependencies>

    <build>
        <finalName>learn</finalName>
    </build>
</project>
另外需要手动添加 Tika 和 IKAnalyzer 的 jar 包(pom 中未包含这两个依赖),下载地址如下:
http://pan.baidu.com/s/1o69fCeQ 提取码:122b
http://pan.baidu.com/s/1hq6AalY 提取码:k3xp
时间: 2024-12-07 21:54:52