分词系统建立完毕,这是基础也是核心,后面我们建立索引要用到分词系统。
下面依次讲解索引的建立,索引的查找。
分词系统建立完毕,这是基础也是核心,后面我们建立索引要用到分词系统。下面依次讲解索引的建立,索引的查找。
索引的建立采用的是倒排索引,原理就是遍历所有的文本,对其进行分词,然后用分出的词汇建立索引表。形式类似如下:
词汇 出现词汇的篇章1,篇章2,篇章3……
建立索引的时候要注意Document和Field这两个术语。Document代表的是一个文档,它里面包含一个或者多个Field;Field表示的是一种域,你可以在一个Document里面添加各种各样的域,名字自己起,但是文档的内容一定要加进去,方式如下所示:
// Adds the document text as the "contents" field. The flags request that the
// value be stored (Store.YES), analyzed/tokenized (Index.ANALYZED), and that
// term vectors with positions and offsets be kept (used later for highlighting).
doc.Add(new
Field("contents", str,
Field.Store.YES,
Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS));
整个索引的建立如下所示:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Documents;
using Lucene.Net.Search;
using Lucene.Net.Analysis.DChinese;
using Version = Lucene.Net.Util.Version;
using FSDirectory = Lucene.Net.Store.FSDirectory;
using NativeFSLockFactory = Lucene.Net.Store.NativeFSLockFactory;

namespace WebApplication6
{
    /// <summary>
    /// Builds a Lucene.Net index from the text files found under a directory,
    /// tokenizing their contents with <c>DChineseAnalyzer</c> and a stop-word list.
    /// </summary>
    public class IndexFiles
    {
        /// <summary>
        /// Creates (overwriting any existing) index in <paramref name="IndexDir"/> from the
        /// file or directory tree at <paramref name="docDir"/>.
        /// </summary>
        /// <param name="docDir">A directory to index recursively, or a path to a single file.</param>
        /// <param name="IndexDir">Directory in which the index is written.</param>
        /// <returns>
        /// <c>false</c> when <paramref name="docDir"/> does not exist on disk; otherwise <c>true</c>
        /// after the index has been optimized and committed.
        /// </returns>
        public static bool CreateIndexFromFile(DirectoryInfo docDir, DirectoryInfo IndexDir)
        {
            // Stop words live in "UserDictionary" next to the application binaries.
            string dictionaryDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "UserDictionary");
            HashSet<string> stopWords = LoadStopWords(
                Path.Combine(dictionaryDir, "StopWords.txt"),
                Path.Combine(dictionaryDir, "StopTest.txt"));

            bool isSingleFile = File.Exists(docDir.FullName);
            if (!isSingleFile && !Directory.Exists(docDir.FullName))
            {
                return false;
            }

            Analyzer analyzer = new DChineseAnalyzer(Version.LUCENE_30, stopWords);

            // Nested using blocks guarantee the directory is released even when
            // constructing the writer throws.
            using (FSDirectory indexDirectory = FSDirectory.Open(IndexDir, new NativeFSLockFactory()))
            using (IndexWriter writer = new IndexWriter(indexDirectory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
            {
                if (isSingleFile)
                {
                    // The path names a file, so enumerating sub-directories would throw.
                    IndexDocs(writer, new FileInfo(docDir.FullName));
                }
                else
                {
                    IndexDirectory(writer, docDir);
                }

                writer.Optimize();
                writer.Commit();
            }

            return true;
        }

        /// <summary>
        /// Reads whitespace-separated stop words from <paramref name="stopWordsPath"/> and
        /// echoes every word read, one per line, to <paramref name="dumpPath"/>.
        /// </summary>
        /// <param name="stopWordsPath">Path of the stop-word list to read.</param>
        /// <param name="dumpPath">Path of the file that receives a copy of each word.</param>
        /// <returns>The set of distinct, non-empty stop words.</returns>
        private static HashSet<string> LoadStopWords(string stopWordsPath, string dumpPath)
        {
            HashSet<string> stopWords = new HashSet<string>();

            using (StreamReader reader = new StreamReader(stopWordsPath))
            using (StreamWriter dump = new StreamWriter(dumpPath))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // A null separator splits on any whitespace; empty entries are dropped
                    // so blank lines never register "" as a stop word.
                    string[] words = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                    foreach (string word in words)
                    {
                        stopWords.Add(word);
                        dump.WriteLine(word);
                    }
                }
            }

            return stopWords;
        }

        /// <summary>
        /// Recursively indexes every file under <paramref name="directory"/>,
        /// visiting sub-directories before the directory's own files.
        /// </summary>
        /// <param name="writer">Open index writer that receives the documents.</param>
        /// <param name="directory">Root of the tree to index.</param>
        internal static void IndexDirectory(IndexWriter writer, DirectoryInfo directory)
        {
            foreach (DirectoryInfo subDirectory in directory.GetDirectories())
            {
                IndexDirectory(writer, subDirectory);
            }

            foreach (FileInfo file in directory.GetFiles())
            {
                IndexDocs(writer, file);
            }
        }

        /// <summary>
        /// Adds a single file to the index. I/O and access-denied failures are
        /// reported on the console and the file is skipped.
        /// </summary>
        /// <param name="writer">Open index writer that receives the document.</param>
        /// <param name="file">File to index.</param>
        internal static void IndexDocs(IndexWriter writer, FileInfo file)
        {
            Console.Out.WriteLine("adding " + file);

            try
            {
                writer.AddDocument(Document(file));
            }
            catch (UnauthorizedAccessException ex)
            {
                // Best effort: an unreadable file must not abort the whole run.
                Console.Out.WriteLine("skipping " + file + ": " + ex.Message);
            }
            catch (IOException ex)
            {
                // Also covers FileNotFoundException, which some temporary files raise on Windows.
                Console.Out.WriteLine("skipping " + file + ": " + ex.Message);
            }
        }

        /// <summary>
        /// Builds the Lucene document for a file with three fields:
        /// "path", "modified" and "contents".
        /// </summary>
        /// <param name="f">File to convert.</param>
        /// <returns>The populated document.</returns>
        public static Document Document(FileInfo f)
        {
            Document doc = new Document();

            // Path: stored and indexed as a single untokenized term.
            doc.Add(new Field("path", f.FullName, Field.Store.YES, Field.Index.NOT_ANALYZED));

            // Last-modified time at minute resolution. The full DateTime is passed;
            // LastWriteTime.Millisecond would only be the 0-999 millisecond component.
            doc.Add(new Field("modified",
                DateTools.DateToString(f.LastWriteTime, DateTools.Resolution.MINUTE),
                Field.Store.YES, Field.Index.NOT_ANALYZED));

            // Contents: stored and analyzed, with term vectors carrying positions and offsets.
            string text = File.ReadAllText(f.FullName);
            doc.Add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));

            return doc;
        }
    }
}
查找的实现:
Lucene.net中有多种多样的查询类,但是如果要实现多条件查询,就要使用PhraseQuery类。通过搜索函数把搜索结果放到容器里面。
最后呈现结果的时候,我们把搜索结果放到列表里面;如果还要让关键词高亮显示,就需要做一点额外的工作。在这里我是通过ColorWord这个类实现的。具体的搜索代码如下所示:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.DChinese;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Index;
using Lucene.Net.Search;
using FSDirectory = Lucene.Net.Store.FSDirectory;
using NoLockFactory = Lucene.Net.Store.NoLockFactory;
using Version = Lucene.Net.Util.Version;

namespace WebApplication6
{
    /// <summary>
    /// Runs phrase searches against the "contents" field of an existing index.
    /// </summary>
    public static class SearchFiles
    {
        /// <summary>Slop applied to the phrase query built from the search terms.</summary>
        private const int PhraseSlop = 100;

        /// <summary>Upper bound on the number of hits collected per search.</summary>
        private const int MaxHits = 1000;

        /// <summary>
        /// Searches the index in <paramref name="dirIndex"/> for documents whose "contents"
        /// field matches a sloppy phrase made of <paramref name="termList"/>.
        /// </summary>
        /// <param name="dirIndex">Directory holding the index.</param>
        /// <param name="termList">Terms to search for; each is added to the phrase query as-is.</param>
        /// <returns>
        /// One <c>ItemList</c> per hit, carrying the stored path and the contents passed through
        /// <c>ColorWord.addColor</c>. Empty when <paramref name="termList"/> is null or empty.
        /// </returns>
        public static List<ItemList> SearchIndex(DirectoryInfo dirIndex, List<string> termList)
        {
            List<ItemList> results = new List<ItemList>();

            // Nothing to search for: skip opening the index.
            if (termList == null || termList.Count == 0)
            {
                return results;
            }

            PhraseQuery query = new PhraseQuery();
            foreach (string word in termList)
            {
                query.Add(new Term("contents", word));
            }
            query.Slop = PhraseSlop;

            // The directory, reader and searcher hold file handles, so each is released
            // deterministically once the hits have been read.
            using (FSDirectory dirFS = FSDirectory.Open(dirIndex, new NoLockFactory()))
            using (IndexReader reader = IndexReader.Open(dirFS, true))
            using (IndexSearcher searcher = new IndexSearcher(reader))
            {
                TopScoreDocCollector collector = TopScoreDocCollector.Create(MaxHits, true);
                searcher.Search(query, collector);

                foreach (ScoreDoc hit in collector.TopDocs().ScoreDocs)
                {
                    Document doc = searcher.Doc(hit.Doc);

                    ItemList item = new ItemList();
                    item.ItemContent = ColorWord.addColor(doc.Get("contents"), termList);
                    item.ItemPath = doc.Get("path");
                    results.Add(item);
                }
            }

            return results;
        }
    }
}
最终的效果展示如下所示:
最终的代码下载地址:下载
基于lucene.net 和ICTCLAS2014的站内搜索的实现2,布布扣,bubuko.com
时间: 2024-10-19 21:08:26