├── .gitattributes ├── .gitignore ├── Integration.LuceneNet.Sample ├── App.config ├── Integration.LuceneNet.Sample.csproj ├── NewsData.cs ├── NewsSearcher.cs ├── Program.cs ├── Properties │ └── AssemblyInfo.cs └── packages.config ├── Integration.LuceneNet ├── Integration.LuceneNet.csproj ├── JiebaAnalyzer.cs ├── JiebaTokenizer.cs ├── Properties │ └── AssemblyInfo.cs └── packages.config ├── LICENSE ├── README.md └── jiebaForLuceneNet.sln /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is needed for earlier builds of msysgit that do not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. 
To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | x64/ 19 | x86/ 20 | build/ 21 | bld/ 22 | [Bb]in/ 23 | [Oo]bj/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | 28 | # MSTest test Results 29 | [Tt]est[Rr]esult*/ 30 | [Bb]uild[Ll]og.* 31 | 32 | # NUNIT 33 | *.VisualState.xml 34 | TestResult.xml 35 | 36 | # Build Results of an ATL Project 37 | [Dd]ebugPS/ 38 | [Rr]eleasePS/ 39 | dlldata.c 40 | 41 | *_i.c 42 | *_p.c 43 | *_i.h 44 | *.ilk 45 | *.meta 46 | *.obj 47 | *.pch 48 | *.pdb 49 | *.pgc 50 | *.pgd 51 | *.rsp 52 | *.sbr 53 | *.tlb 54 | *.tli 55 | *.tlh 56 | *.tmp 57 | *.tmp_proj 58 | *.log 59 | *.vspscc 60 | *.vssscc 61 | .builds 62 | *.pidb 63 | *.svclog 64 | *.scc 65 | 66 | # Chutzpah Test files 67 | _Chutzpah* 68 | 69 | # Visual C++ cache files 70 | ipch/ 71 | *.aps 72 | *.ncb 73 | *.opensdf 74 | *.sdf 75 | *.cachefile 76 | 77 | # Visual Studio profiler 78 | *.psess 79 | *.vsp 80 | *.vspx 81 | 82 | # TFS 2012 Local Workspace 83 | $tf/ 84 | 85 | # Guidance Automation Toolkit 86 | *.gpState 87 | 88 | # ReSharper is a .NET coding add-in 89 | _ReSharper*/ 90 | *.[Rr]e[Ss]harper 91 | *.DotSettings.user 92 | 93 | # JustCode is a .NET coding add-in 94 | .JustCode 95 | 96 | # TeamCity is a build add-in 97 | _TeamCity* 98 | 99 | # DotCover is a Code Coverage Tool 100 | *.dotCover 101 | 102 | # NCrunch 103 | _NCrunch_* 104 | .*crunch*.local.xml 105 | 106 | # MightyMoose 107 | *.mm.* 108 | AutoTest.Net/ 109 | 110 | # Web workbench (sass) 111 | .sass-cache/ 112 | 113 | # Installshield output folder 114 | [Ee]xpress/ 115 | 116 | # DocProject is a documentation generator add-in 117 | DocProject/buildhelp/ 118 | DocProject/Help/*.HxT 119 | DocProject/Help/*.HxC 120 | DocProject/Help/*.hhc 121 | DocProject/Help/*.hhk 122 | 
DocProject/Help/*.hhp 123 | DocProject/Help/Html2 124 | DocProject/Help/html 125 | 126 | # Click-Once directory 127 | publish/ 128 | 129 | # Publish Web Output 130 | *.[Pp]ublish.xml 131 | *.azurePubxml 132 | # TODO: Comment the next line if you want to checkin your web deploy settings 133 | # but database connection strings (with potential passwords) will be unencrypted 134 | *.pubxml 135 | *.publishproj 136 | 137 | # NuGet Packages 138 | *.nupkg 139 | # The packages folder can be ignored because of Package Restore 140 | **/packages/* 141 | # except build/, which is used as an MSBuild target. 142 | !**/packages/build/ 143 | # Uncomment if necessary however generally it will be regenerated when needed 144 | #!**/packages/repositories.config 145 | 146 | # Windows Azure Build Output 147 | csx/ 148 | *.build.csdef 149 | 150 | # Windows Store app package directory 151 | AppPackages/ 152 | 153 | # Others 154 | *.[Cc]ache 155 | ClientBin/ 156 | [Ss]tyle[Cc]op.* 157 | ~$* 158 | *~ 159 | *.dbmdl 160 | *.dbproj.schemaview 161 | *.pfx 162 | *.publishsettings 163 | node_modules/ 164 | bower_components/ 165 | 166 | # RIA/Silverlight projects 167 | Generated_Code/ 168 | 169 | # Backup & report files from converting an old project file 170 | # to a newer Visual Studio version. 
Backup files are not needed, 171 | # because we have git ;-) 172 | _UpgradeReport_Files/ 173 | Backup*/ 174 | UpgradeLog*.XML 175 | UpgradeLog*.htm 176 | 177 | # SQL Server files 178 | *.mdf 179 | *.ldf 180 | 181 | # Business Intelligence projects 182 | *.rdl.data 183 | *.bim.layout 184 | *.bim_*.settings 185 | 186 | # Microsoft Fakes 187 | FakesAssemblies/ 188 | 189 | # Node.js Tools for Visual Studio 190 | .ntvs_analysis.dat 191 | 192 | # Visual Studio 6 build log 193 | *.plg 194 | 195 | # Visual Studio 6 workspace options file 196 | *.opt 197 | -------------------------------------------------------------------------------- /Integration.LuceneNet.Sample/App.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Integration.LuceneNet.Sample/Integration.LuceneNet.Sample.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {54E868F6-9151-4088-9302-2C48DC48280E} 8 | Exe 9 | Properties 10 | JiebaNet.Integration.LuceneNet.Sample 11 | JiebaNet.Integration.LuceneNet.Sample 12 | v4.5 13 | 512 14 | 15 | 16 | AnyCPU 17 | true 18 | full 19 | false 20 | bin\Debug\ 21 | DEBUG;TRACE 22 | prompt 23 | 4 24 | 25 | 26 | AnyCPU 27 | pdbonly 28 | true 29 | bin\Release\ 30 | TRACE 31 | prompt 32 | 4 33 | 34 | 35 | 36 | ..\packages\SharpZipLib.0.86.0\lib\20\ICSharpCode.SharpZipLib.dll 37 | 38 | 39 | False 40 | ..\packages\jieba.NET.0.38.2\lib\net45\JiebaNet.Analyser.dll 41 | 42 | 43 | False 44 | ..\packages\jieba.NET.0.38.2\lib\net45\JiebaNet.Segmenter.dll 45 | 46 | 47 | ..\packages\Lucene.Net.3.0.3\lib\NET40\Lucene.Net.dll 48 | 49 | 50 | False 51 | ..\packages\Newtonsoft.Json.8.0.3\lib\net45\Newtonsoft.Json.dll 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 
{F2744A8B-08B2-4208-9051-2AF4E8841E92} 74 | Integration.LuceneNet 75 | 76 | 77 | 78 | 85 |
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/NewsData.cs:
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.Linq;

namespace JiebaNet.Integration.LuceneNet.Sample
{
    /// <summary>
    /// A single sample record (a book blurb) that the demo indexes and searches.
    /// </summary>
    public class News
    {
        /// <summary>Identifier of the record; used as the lookup key by <see cref="NewsRepository.Get"/>.</summary>
        public int Id { get; set; }

        /// <summary>Title of the record.</summary>
        public string Title { get; set; }

        /// <summary>Body text of the record.</summary>
        public string Content { get; set; }

        /// <summary>
        /// Formats the record as "Id[TAB]Title: Content".
        /// </summary>
        public override string ToString()
        {
            return string.Format("{0}\t{1}: {2}", Id, Title, Content);
        }
    }

    /// <summary>
    /// In-memory source of the sample records.
    /// </summary>
    public static class NewsRepository
    {
        /// <summary>
        /// Returns the record with the given id, or <c>null</c> when no record has that id.
        /// </summary>
        /// <param name="id">Identifier to look up.</param>
        /// <exception cref="System.InvalidOperationException">
        /// Thrown by <c>SingleOrDefault</c> when more than one record shares the id.
        /// </exception>
        public static News Get(int id)
        {
            // Plain integer comparison; avoids the boxing-free but noisier Equals() call.
            return GetAll().SingleOrDefault(x => x.Id == id);
        }

        /// <summary>
        /// Builds and returns a fresh list containing every sample record.
        /// A new list (and new <see cref="News"/> objects) is created on each call.
        /// </summary>
        public static List<News> GetAll()
        {
            return new List<News>
            {
                new News {Id = 1, Title = "自私的基因", Content = "道金斯在《自私的基因》中的突破性贡献在于,把根据自然选择的社会学说的这一重要部分,用简明通俗的形式,妙趣横生的语言介绍给大家,这是第一次。他惊世骇俗地在《自私的基因》中提出:我们生来是自私的。人类窥见了社会关系中基本的对称性和逻辑性,在我们有了更充分的理解之后,我们的政治见解当会重新获得活力,并对心理学的科学研究提供理论上的支柱。在这一过程中,我们也必将对我们受苦受难的许多根源有一个更深刻的理解。"},
                new News {Id = 2, Title = "Rust Essentials Book", Content = "This book is intended for software developers interested in systems level and application programming, and are looking for a quick entry into using Rust and understanding the core features of the framework."},
                new News {Id = 3, Title = "Learning From Data", Content = "Machine learning allows computational systems to adaptively improve their performance with experience accumulated from the observed data. Its techniques are widely applied in engineering, science, finance, and commerce. This book is designed for a short course on machine learning. It is a short course, not a hurried course."},
                new News {Id = 4, Title = "人类简史-从动物到上帝", Content = "尤瓦尔·赫拉利,1976年生,牛津大学历史学博士,现为耶路撒冷希伯来大学的历史系教授,青年怪才,全球瞩目的新锐历史学家。他擅长世界历史和宏观历史进程研究。在学术领域和大众出版领域都有很大的兴趣。"},
                new News {Id = 5, Title = "这就是搜索引擎", Content = "搜索引擎作为互联网发展中至关重要的一种应用,已经成为互联网各个领域的制高点,其重要性不言而喻。搜索引擎领域也是互联网应用中不多见的以核心技术作为其命脉的领域,搜索引擎各个子系统是如何设计的?这成为广大技术人员和搜索引擎优化人员密切关注的内容。"},
                new News {Id = 6, Title = "社会性动物", Content = "《社会性动物》是“美国社会心理学的《圣经》”(Revue des Questions Scientifiques的评价),“仍然是最好的……是一部杰作”(Contemporary Psychology的评价)。本书从1972年出版第一版以来,在世界范围内畅销数千万册,是社会心理学领域内最具影响的著作。"},
                new News {Id = 7, Title = "动物庄园", Content = "故事《动物庄园(买中文版送英文版)》内容为:小说不属于人们所熟悉的蕴含教训的传统寓言,而是对现代政治神话的一种寓言式解构。“它是一部革命史,但它误入歧途,而且第一次偏离都那么有理由。”作者在1945年该书出版是如是说。奥威尔的传世之作,欧美15所名名牌大学投票选出“影响我成长的十《动物庄园(买中文版送英文版)》”之一,世界文坛最著名的政治讽喻小说。买中文版送英文版。乔治·奥威尔(George Orwell,1903-1950),原名埃里克·阿瑟·布莱尔(Eric Arthur Blair),英国作家、新闻记者、社会评论家,著名的英语文体作家。他在小说中创造的“老大哥”、“双重思想”、“新话”等词汇都已收入权威的英语词典,甚至有他的名字衍生的一个形容词“奥威尔式”不断出现在报道国际新闻的记者笔下,足以见其作品在英语国家影响之深远。“多一个人看奥威尔,就多了一份自由的保障”,有评论家如是说。"},
                new News {Id = 8, Title = "动物故事集", Content = "在《动物故事集》里,常见和不常见的狐、蛇、兔、黄鼬……披着角色的外衣纷纷登场。名为“动物故事”,其实说到底还是“动物化”的人事。在小说的世界里,有点质朴,又有点神秘,就像一大群人深夜围在火堆边讲故事时讲出的故事,似真似假,似有似无,但又总是让人忍不住地听下去。"},
                new News {Id = 9, Title = "计算理论导引", Content = "本书是计算理论领域的经典著作,被国外多所大学选用为教材。本书以注重思路、深入引导为特色,系统地介绍计算理论的三大主要内容:自动机与语言、可计算性理论和计算复杂性理论。同时,对可计算性和计算复杂性理论中的某些高级内容作了重点讲解。全书通过启发性的问题、精彩的结果和待解决问题来引导读者挑战此领域中的高层次问题。新版的一大亮点是增加了更多习题、教辅资料和部分习题解答,更加有利于教学。全书叙述由浅入深、详略得当,重点突出,不拘泥于技术细节。可作为计算机专业高年级本科生和研究生的教材,也可作为相关专业教师和研究人员的参考书。"},
                new News {Id = 10, Title = "计算机系统要素", Content = "本书通过展现简单但功能强大的计算机系统之构建过程,为读者呈现了一幅完整、严格的计算机应用科学大图景。本书作者认为,理解计算机工作原理的最好方法就是亲自动手,从零开始构建计算机系统。通过12个章节和项目来引领读者从头开始,本书逐步地构建一个基本的硬件平台和现代软件阶层体系。在这个过程中,读者能够获得关于硬件体系结构、操作系统、编程语言、编译器、数据结构、算法以及软件工程的详实知识。通过这种逐步构造的方法,本书揭示了计算机科学知识中的重要成分,并展示其它课程中所介绍的理论和应用技术如何融入这幅全局大图景当中去。全书基于“先抽象再实现”的阐述模式,每一章都介绍一个关键的硬件或软件抽象,一种实现方式以及一个实际的项目。完成这些项目所必要的计算机科学知识在本书中都有涵盖,只要求读者具备程序设计经验。本书配套的支持网站提供了书中描述的用于构建所有硬件和软件系统"},
                new News {Id = 11, Title = "数据挖掘导论", Content = "本书全面介绍了数据挖掘,涵盖了五个主题:数据、分类、关联分析、聚类和异常检测。除异常检测外,每个主题都有两章。前一章涵盖基本概念、代表性算法和评估技术,而后一章讨论高级概念和算法。这样读者在透彻地理解数据挖掘的基础的同时,还能够了解更多重要的高级主题。本书是明尼苏达大学和密歇根州立大学数据挖掘课程的教材,由于独具特色,正式出版之前就已经被斯坦福大学、得克萨斯大学奥斯汀分校等众多名校采用。"},
                new News {Id = 12, Title = "机器学习实战", Content = "机器学习是人工智能研究领域中一个极其重要的研究方向,在现今的大数据时代背景下,捕获数据并从中萃取有价值的信息或模式,成为各行业求生存、谋发展的决定性手段,这使得这一过去为分析师和数学家所专属的研究领域越来越为人们所瞩目。本书第一部分主要介绍机器学习基础,以及如何利用算法进行分类,并逐步介绍了多种经典的监督学习算法,如k近邻算法、朴素贝叶斯算法、Logistic回归算法、支持向量机、AdaBoost集成方法、基于树的回归算法和分类回归树(CART)算法等。第三部分则重点介绍无监督学习及其一些主要算法:k均值聚类算法、Apriori算法、FP-Growth算法。第四部分介绍了机器学习算法的一些附属工具。"},
                new News {Id = 13, Title = "机器学习", Content = "《机器学习》展示了机器学习中核心的算法和理论,并阐明了算法的运行过程。《机器学习》综合了许多的研究成果,例如统计学、人工智能、哲学、信息论、生物学、认知科学、计算复杂性和控制论等,并以此来理解问题的背景、算法和其中的隐含假定。《机器学习》可作为计算机专业 本科生、研究生教材,也可作为相关领域研究人员、教师的参考书。"},
                new News {Id = 14, Title = "机器学习", Content = "机器学习是计算机科学和人工智能中非常重要的一个研究领域,近年来,机器学习不但在计算机科学的众多领域中大显身手,而且成为一些交叉学科的重要支撑技术。本书比较全面系统地介绍了机器学习的方法和技术,不仅详细阐述了许多经典的学习方法,还讨论了一些有生命力的新理论、新方法。全书案例既有分类问题,也有回归问题;既包含监督学习,也涵盖无监督学习。本书讨论的案例从分类讲到回归,然后讨论了聚类、降维、最优化问题等。这些案例包括分类:垃圾邮件识别,排序:智能收件箱,回归模型:预测网页访问量,正则化:文本回归,最优化:密码破解,无监督学习:"},
                new News {Id = 15, Title = "统计学习方法", Content = "《统计学习方法》是计算机及其应用领域的一门重要的学科。《统计学习方法》全面系统地介绍了统计学习的主要方法,特别是监督学习方法,包括感知机、k近邻法、朴素贝叶斯法、决策树、逻辑斯谛回归与最大熵模型、支持向量机、提升方法、EM算法、隐马尔可夫模型和条件随机场等。除第1章概论和最后一章总结外,每章介绍一种方法。叙述从具体问题或实例入手,由浅入深,阐明思路,给出必要的数学推导,便于读者掌握统计学习方法的实质,学会运用。为满足读者进一步学习的需要,书中还介绍了一些相关研究,给出了少量习题,列出了主要参考文献。"},
                new News {Id = 16, Title = "多外语学习的语言习得原理", Content = "《多外语学习的语言习得原理、认知规律及学习方法研究》从认知和心理语言学的角度探讨了多外语学习的特点,对比分析了二语习得与三语习得过程的异同,尤其是对中介语进行了深入的分析。《多外语学习的语言习得原理、认知规律及学习方法研究》引用了大量的已有的实验研究成果和观点,同时又以中国学习者为对象采集了第一手数据,进行了实证研究,可以说是间接性研究和原始性研究相结合、定性研究与定量研究相结合,体现了该学科多元化研究方法的特点。"},
                // Disabled sample entry; note that it reuses Id 16, which would make Get(16) throw if enabled.
                //new News {Id = 16, Title = "语言学家", Content = "语言学家"},
            };
        }
    }
}
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/NewsSearcher.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using JiebaNet.Segmenter;
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Version = Lucene.Net.Util.Version;

namespace JiebaNet.Integration.LuceneNet.Sample
{
    /// <summary>
    /// Sample helper that indexes <see cref="News"/> records into an on-disk Lucene index
    /// (folder "lucene_index") with <see cref="JiebaAnalyzer"/> and searches them.
    /// </summary>
    public static class NewsSearcher
    {
        private static readonly string LuceneDir = "lucene_index";
        private static FSDirectory _directoryTemp;

        /// <summary>
        /// Lazily opened index directory.
        /// NOTE(review): every access force-releases the index write lock and deletes a leftover
        /// "write.lock" file. That recovers from a crashed earlier run of this sample, but it is
        /// unsafe if another writer may legitimately hold the lock. Behavior kept as in the original.
        /// </summary>
        private static FSDirectory Directory
        {
            get
            {
                if (_directoryTemp == null)
                {
                    _directoryTemp = FSDirectory.Open(new DirectoryInfo(LuceneDir));
                }
                if (IndexWriter.IsLocked(_directoryTemp))
                {
                    IndexWriter.Unlock(_directoryTemp);
                }

                var lockFilePath = Path.Combine(LuceneDir, "write.lock");
                if (File.Exists(lockFilePath))
                {
                    File.Delete(lockFilePath);
                }

                return _directoryTemp;
            }
        }

        /// <summary>Creates the analyzer used for both indexing and query parsing.</summary>
        private static Analyzer GetAnalyzer()
        {
            return new JiebaAnalyzer();
        }

        #region Add & Update Index

        /// <summary>
        /// Replaces the index entry for <paramref name="data"/>: deletes any document with the
        /// same "Id" term, then adds a new document with Id, Title and Content fields.
        /// </summary>
        private static void AddToLuceneIndex(News data, IndexWriter writer)
        {
            var id = data.Id.ToString();

            // remove older index entry
            writer.DeleteDocuments(new TermQuery(new Term("Id", id)));

            // add new index entry; null text is indexed as empty because Field rejects null values
            var doc = new Document();
            doc.Add(new Field("Id", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("Title", data.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("Content", data.Content ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));

            writer.AddDocument(doc);
        }

        /// <summary>
        /// Adds every record in <paramref name="data"/> to the index, replacing entries that
        /// already exist with the same Id.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public static void UpdateLuceneIndex(IEnumerable<News> data)
        {
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }

            // Stacked usings dispose the writer first and the analyzer last, even on failure.
            using (var analyzer = GetAnalyzer())
            using (var writer = new IndexWriter(Directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                // replaces older entry if any
                foreach (var item in data)
                {
                    AddToLuceneIndex(item, writer);
                }
            }
        }

        /// <summary>Adds or replaces a single record in the index.</summary>
        public static void UpdateLuceneIndex(News data)
        {
            UpdateLuceneIndex(new[] { data });
        }

        #endregion

        #region Clear Index

        /// <summary>Deletes the document whose "Id" field equals <paramref name="recordId"/>.</summary>
        public static void ClearLuceneIndexRecord(int recordId)
        {
            using (var analyzer = GetAnalyzer())
            using (var writer = new IndexWriter(Directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                writer.DeleteDocuments(new TermQuery(new Term("Id", recordId.ToString())));
            }
        }

        /// <summary>
        /// Recreates the index and removes all documents.
        /// </summary>
        /// <returns><c>true</c> on success; <c>false</c> if any exception occurred.</returns>
        public static bool ClearLuceneIndex()
        {
            try
            {
                using (var analyzer = GetAnalyzer())
                using (var writer = new IndexWriter(Directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
                {
                    writer.DeleteAll();
                }
            }
            catch (Exception e)
            {
                // Still best-effort (callers only get a bool), but leave a trace instead of
                // discarding the failure reason entirely.
                System.Diagnostics.Trace.TraceError("ClearLuceneIndex failed: {0}", e);
                return false;
            }

            return true;
        }

        #endregion

        #region Optimize Index

        /// <summary>Merges index segments via <c>IndexWriter.Optimize()</c>.</summary>
        public static void OptimizeLuceneIndex()
        {
            using (var analyzer = GetAnalyzer())
            using (var writer = new IndexWriter(Directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                // The analyzer is now released only after the writer has finished
                // (the original closed it before calling Optimize()).
                writer.Optimize();
            }
        }

        #endregion

        #region Mappers

        /// <summary>Converts a stored Lucene document back into a <see cref="News"/> record.</summary>
        private static News MapDataToModel(Document doc)
        {
            return new News()
            {
                Id = int.Parse(doc.Get("Id")),
                Title = doc.Get("Title"),
                Content = doc.Get("Content"),
            };
        }

        /// <summary>Maps already-loaded documents to records (materialized eagerly).</summary>
        private static IEnumerable<News> MapLuceneToDataList(IEnumerable<Document> hits)
        {
            return hits.Select(MapDataToModel).ToList();
        }

        /// <summary>
        /// Loads each hit's document through <paramref name="searcher"/> and maps it to a record.
        /// Materialized eagerly so the result stays valid after the searcher is disposed.
        /// </summary>
        private static IEnumerable<News> MapLuceneToDataList(IEnumerable<ScoreDoc> hits, IndexSearcher searcher)
        {
            return hits.Select(hit => MapDataToModel(searcher.Doc(hit.Doc))).ToList();
        }

        #endregion

        #region Search

        /// <summary>
        /// Segments <paramref name="keywords"/> with jieba (search mode) and joins the
        /// non-blank words with single spaces.
        /// </summary>
        private static string GetKeyWordsSplitBySpace(string keywords, JiebaSegmenter segmenter)
        {
            var result = new StringBuilder();

            foreach (var word in segmenter.Tokenize(keywords, TokenizerMode.Search))
            {
                if (string.IsNullOrWhiteSpace(word.Word))
                {
                    continue;
                }

                result.AppendFormat("{0} ", word.Word);
            }

            return result.ToString().Trim();
        }

        /// <summary>
        /// Parses the query text; if it is not valid query syntax, retries with the text
        /// (plus a trailing "*") escaped so it is treated literally.
        /// </summary>
        private static Query ParseQuery(string searchQuery, QueryParser parser)
        {
            Query query;
            try
            {
                query = parser.Parse(searchQuery.Trim());
            }
            catch (ParseException)
            {
                query = parser.Parse(QueryParser.Escape(searchQuery.Trim() + "*"));
            }

            return query;
        }

        /// <summary>
        /// Runs <paramref name="searchQuery"/> against one field, or against Id/Title/Content
        /// when <paramref name="searchField"/> is empty. Returns at most 1000 hits.
        /// </summary>
        private static IEnumerable<News> SearchQuery(string searchQuery, string searchField = "")
        {
            // A query consisting only of wildcards matches nothing useful.
            if (string.IsNullOrEmpty(searchQuery.Replace("*", "").Replace("?", "")))
            {
                return new List<News>();
            }

            const int hitsLimit = 1000;

            using (var searcher = new IndexSearcher(Directory, false))
            using (var analyzer = GetAnalyzer())
            {
                if (!string.IsNullOrEmpty(searchField))
                {
                    var parser = new QueryParser(Version.LUCENE_30, searchField, analyzer);
                    var query = ParseQuery(searchQuery, parser);
                    var hits = searcher.Search(query, hitsLimit).ScoreDocs;
                    return MapLuceneToDataList(hits, searcher);
                }
                else
                {
                    var parser = new MultiFieldQueryParser(Version.LUCENE_30, new[] { "Id", "Title", "Content" }, analyzer);
                    var query = ParseQuery(searchQuery, parser);
                    var hits = searcher.Search(query, null, hitsLimit, Sort.RELEVANCE).ScoreDocs;
                    return MapLuceneToDataList(hits, searcher);
                }
            }
        }

        /// <summary>
        /// Segments <paramref name="input"/> with jieba, turns every word into a prefix term
        /// ("word*") and searches for them.
        /// </summary>
        /// <param name="input">Free-text user input.</param>
        /// <param name="fieldName">Field to search; empty searches Id, Title and Content.</param>
        /// <returns>Matching records, or an empty list for null/empty input.</returns>
        public static IEnumerable<News> Search(string input, string fieldName = "")
        {
            if (string.IsNullOrEmpty(input))
            {
                return new List<News>();
            }

            // Segment once with the segmenter directly; going through JiebaTokenizer segmented
            // the text twice (constructor + Tokenize) and left the tokenizer undisposed.
            var kwords = GetKeyWordsSplitBySpace(input, new JiebaSegmenter());

            var terms = kwords.Trim().Replace("-", " ").Split(' ')
                .Where(x => !string.IsNullOrEmpty(x)).Select(x => x.Trim() + "*");
            input = string.Join(" ", terms);

            return SearchQuery(input, fieldName);
        }

        /// <summary>
        /// Searches with <paramref name="input"/> passed to the query parser as-is
        /// (no jieba pre-segmentation, no added wildcards).
        /// </summary>
        public static IEnumerable<News> SearchDefault(string input, string fieldName = "")
        {
            return string.IsNullOrEmpty(input) ? new List<News>() : SearchQuery(input, fieldName);
        }

        #endregion

        /// <summary>
        /// All the data indexed.
        /// </summary>
        /// <returns>Every document in the index; an empty list when the index folder is missing or empty.</returns>
        public static IEnumerable<News> GetAllData()
        {
            // EnumerateFiles throws DirectoryNotFoundException on a missing folder, so check first.
            if (!System.IO.Directory.Exists(LuceneDir) || !System.IO.Directory.EnumerateFiles(LuceneDir).Any())
            {
                return new List<News>();
            }

            var directory = Directory;
            var docs = new List<Document>();

            using (var searcher = new IndexSearcher(directory, false))
            using (var reader = IndexReader.Open(directory, false))
            using (var term = reader.TermDocs())
            {
                // An unpositioned TermDocs enumerates the documents of the index.
                while (term.Next())
                {
                    docs.Add(searcher.Doc(term.Doc));
                }
            }

            return MapLuceneToDataList(docs);
        }
    }
}
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/Program.cs:
--------------------------------------------------------------------------------
using System;
using JiebaNet.Segmenter;

namespace JiebaNet.Integration.LuceneNet.Sample
{
    /// <summary>Console entry point of the sample.</summary>
    class Program
    {
        static void Main(string[] args)
        {
            TestNewsData();
        }

        /// <summary>
        /// Rebuilds the index from <see cref="NewsRepository"/> and prints the records
        /// matching the query "进".
        /// </summary>
        private static void TestNewsData()
        {
            // Register a custom word with the segmenter before indexing.
            var seg = new JiebaSegmenter();
            seg.AddWord("机器学习");

            NewsSearcher.ClearLuceneIndex();

            var data = NewsRepository.GetAll();
            NewsSearcher.UpdateLuceneIndex(data);

            var results = NewsSearcher.Search("进");
            foreach (var result in results)
            {
                Console.WriteLine(result);
            }
        }
    }
}
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 
8 | [assembly: AssemblyTitle("Integration.LuceneNet.Sample")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Microsoft")] 12 | [assembly: AssemblyProduct("Integration.LuceneNet.Sample")] 13 | [assembly: AssemblyCopyright("Copyright © Microsoft 2015")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("9780c9f0-464c-4538-9b3d-57ca0b0a4828")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("0.37.2.0")] 36 | [assembly: AssemblyFileVersion("0.37.2.0")] 37 | -------------------------------------------------------------------------------- /Integration.LuceneNet.Sample/packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /Integration.LuceneNet/Integration.LuceneNet.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {F2744A8B-08B2-4208-9051-2AF4E8841E92} 8 | Library 9 | Properties 10 | JiebaNet.Integration.LuceneNet 11 | JiebaNet.Integration.LuceneNet 12 | v4.5 13 | 512 14 | 15 | 16 | true 17 | full 18 | false 19 | bin\Debug\ 20 | 
DEBUG;TRACE 21 | prompt 22 | 4 23 | 24 | 25 | pdbonly 26 | true 27 | bin\Release\ 28 | TRACE 29 | prompt 30 | 4 31 | 32 | 33 | 34 | ..\packages\SharpZipLib.0.86.0\lib\20\ICSharpCode.SharpZipLib.dll 35 | 36 | 37 | False 38 | ..\packages\jieba.NET.0.38.2\lib\net45\JiebaNet.Analyser.dll 39 | 40 | 41 | False 42 | ..\packages\jieba.NET.0.38.2\lib\net45\JiebaNet.Segmenter.dll 43 | 44 | 45 | ..\packages\Lucene.Net.3.0.3\lib\NET40\Lucene.Net.dll 46 | 47 | 48 | False 49 | ..\packages\Newtonsoft.Json.8.0.3\lib\net45\Newtonsoft.Json.dll 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 75 |
--------------------------------------------------------------------------------
/Integration.LuceneNet/JiebaAnalyzer.cs:
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.IO;
using JiebaNet.Segmenter;
using Lucene.Net.Analysis;

namespace JiebaNet.Integration.LuceneNet
{
    /// <summary>
    /// Lucene.NET analyzer that segments text with jieba, lower-cases the tokens and
    /// removes stop words.
    /// </summary>
    public class JiebaAnalyzer : Analyzer
    {
        /// <summary>Fallback stop words (Lucene's English set) used when no stop-word file exists.</summary>
        protected static readonly ISet<string> DefaultStopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

        private static readonly ISet<string> StopWords;

        static JiebaAnalyzer()
        {
            StopWords = LoadStopWords();
        }

        /// <summary>
        /// Reads one stop word per line from jieba's configured stop-word file; falls back to
        /// <see cref="DefaultStopWords"/> when that file does not exist.
        /// </summary>
        private static ISet<string> LoadStopWords()
        {
            var stopWordsFile = Path.GetFullPath(JiebaNet.Analyser.ConfigManager.StopWordsFile);
            if (!File.Exists(stopWordsFile))
            {
                return DefaultStopWords;
            }

            var words = new HashSet<string>();
            foreach (var line in File.ReadAllLines(stopWordsFile))
            {
                var word = line.Trim();
                // Blank lines would otherwise register the empty string as a stop word.
                if (word.Length > 0)
                {
                    words.Add(word);
                }
            }

            return words;
        }

        /// <summary>
        /// Builds the analysis chain: JiebaTokenizer -> LowerCaseFilter -> StopFilter.
        /// The whole reader is consumed when the tokenizer is constructed.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            var seg = new JiebaSegmenter();
            TokenStream result = new JiebaTokenizer(seg, reader);
            // This filter is necessary, because the parser converts the queries to lower case.
            result = new LowerCaseFilter(result);
            result = new StopFilter(true, result, StopWords);
            return result;
        }
    }
}
--------------------------------------------------------------------------------
/Integration.LuceneNet/JiebaTokenizer.cs:
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.IO;
using System.Linq;
using JiebaNet.Segmenter;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

namespace JiebaNet.Integration.LuceneNet
{
    /// <summary>
    /// Lucene.NET tokenizer backed by a jieba segmenter. The complete input is segmented
    /// (search mode) in the constructor; <see cref="IncrementToken"/> then replays the tokens.
    /// </summary>
    public class JiebaTokenizer : Tokenizer
    {
        private readonly JiebaSegmenter segmenter;
        private readonly ITermAttribute termAtt;
        private readonly IOffsetAttribute offsetAtt;
        private readonly ITypeAttribute typeAtt;

        // Fully qualified: Lucene.Net.Analysis also declares a type named Token.
        private readonly List<JiebaNet.Segmenter.Token> tokens;
        private readonly int finalOffset;
        private int position = -1;

        /// <summary>Reads <paramref name="input"/> to its end and tokenizes the text.</summary>
        public JiebaTokenizer(JiebaSegmenter seg, TextReader input):this(seg, input.ReadToEnd()) { }

        /// <summary>Tokenizes <paramref name="input"/>; a null string is treated as empty.</summary>
        /// <exception cref="System.ArgumentNullException"><paramref name="seg"/> is null.</exception>
        public JiebaTokenizer(JiebaSegmenter seg, string input)
        {
            if (seg == null)
            {
                throw new System.ArgumentNullException("seg");
            }

            segmenter = seg;
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            var text = input ?? string.Empty;
            finalOffset = text.Length;
            tokens = segmenter.Tokenize(text, TokenizerMode.Search).ToList();
        }

        /// <summary>
        /// Advances to the next jieba token and publishes its term, offsets and type ("Jieba").
        /// </summary>
        /// <returns><c>false</c> once all tokens have been returned.</returns>
        public override bool IncrementToken()
        {
            ClearAttributes();

            // Stop advancing at the end so repeated calls keep returning false cheaply.
            if (position + 1 >= tokens.Count)
            {
                return false;
            }

            position++;
            var token = tokens[position];
            termAtt.SetTermBuffer(token.Word);
            offsetAtt.SetOffset(token.StartIndex, token.EndIndex);
            typeAtt.Type = "Jieba";
            return true;
        }

        /// <summary>
        /// Called by the consumer after the last token; reports the end of the text as the
        /// final offset.
        /// </summary>
        public override void End()
        {
            offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>Rewinds the stream so the same tokens can be consumed again.</summary>
        public override void Reset()
        {
            base.Reset();
            position = -1;
        }

        /// <summary>
        /// Segments arbitrary text with this tokenizer's segmenter, independent of the
        /// stream state.
        /// </summary>
        public IEnumerable<JiebaNet.Segmenter.Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
        {
            return segmenter.Tokenize(text, mode);
        }
    }
}
--------------------------------------------------------------------------------
/Integration.LuceneNet/Properties/AssemblyInfo.cs:
-------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("Integration.LuceneNet")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Microsoft")] 12 | [assembly: AssemblyProduct("Integration.LuceneNet")] 13 | [assembly: AssemblyCopyright("Copyright © Microsoft 2015")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 
20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("ef4d97f3-be8f-4e71-9e44-32943be0f792")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("0.37.2.0")] 36 | [assembly: AssemblyFileVersion("0.37.2.0")] 37 | -------------------------------------------------------------------------------- /Integration.LuceneNet/packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 andersc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### jiebaForLuceneNet 2 | Integrate jieba.NET segmenter with Lucene.NET. 3 | 4 | ### 一、jiebaForLuceneNet的使用 5 | 6 | * `JiebaAnalyzer`:与Lucene.NET集成的主接口,添加索引和搜索时使用此类的实例作为analyzer参数 7 | * `JiebaTokenizer`:为JiebaAnalyzer提供分词功能 8 | * `Integration.LuceneNet.Sample`项目中有示例,演示如何通过jieba分词添加索引和搜索 9 | 10 | ### 二、若对Lucene.NET不甚熟悉,请先看: 11 | 12 | #### Lucene.NET的基本用法 13 | 14 | * 看codeproject上的文章:[Lucene.Net ultra fast search for MVC or WebForms site](http://www.codeproject.com/Articles/320219/Lucene-Net-ultra-fast-search-for-MVC-or-WebForms?msg=4643090#xx4643090xx) 15 | 16 | #### 如何自定义Tokenizer和Analyzer 17 | 18 | * [Lucene.Net – Custom Synonym Analyzer](http://www.codeproject.com/Articles/32201/Lucene-Net-Custom-Synonym-Analyzer) 19 | * https://github.com/JimLiu/Lucene.Net.Analysis.PanGu 20 | * https://github.com/JimLiu/Lucene.Net.Analysis.MMSeg 21 | 22 | #### 应用自定义Analyzer 23 | 24 | * http://pangusegment.codeplex.com/wikipage?title=PanGu4Lucene 25 | 26 | #### 调试:通过Luke来Look Lucene.NET的索引 27 | 28 | * http://luke.codeplex.com/releases/view/82033 29 | 30 | -------------------------------------------------------------------------------- /jiebaForLuceneNet.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.21005.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Integration.LuceneNet", 
"Integration.LuceneNet\Integration.LuceneNet.csproj", "{F2744A8B-08B2-4208-9051-2AF4E8841E92}" 7 | EndProject 8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Integration.LuceneNet.Sample", "Integration.LuceneNet.Sample\Integration.LuceneNet.Sample.csproj", "{54E868F6-9151-4088-9302-2C48DC48280E}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Any CPU = Debug|Any CPU 13 | Release|Any CPU = Release|Any CPU 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}.Release|Any CPU.ActiveCfg = Release|Any CPU 19 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}.Release|Any CPU.Build.0 = Release|Any CPU 20 | {54E868F6-9151-4088-9302-2C48DC48280E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 21 | {54E868F6-9151-4088-9302-2C48DC48280E}.Debug|Any CPU.Build.0 = Debug|Any CPU 22 | {54E868F6-9151-4088-9302-2C48DC48280E}.Release|Any CPU.ActiveCfg = Release|Any CPU 23 | {54E868F6-9151-4088-9302-2C48DC48280E}.Release|Any CPU.Build.0 = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | EndGlobal 29 | --------------------------------------------------------------------------------