├── .gitattributes
├── .gitignore
├── Integration.LuceneNet.Sample
│   ├── App.config
│   ├── Integration.LuceneNet.Sample.csproj
│   ├── NewsData.cs
│   ├── NewsSearcher.cs
│   ├── Program.cs
│   ├── Properties
│   │   └── AssemblyInfo.cs
│   └── packages.config
├── Integration.LuceneNet
│   ├── Integration.LuceneNet.csproj
│   ├── JiebaAnalyzer.cs
│   ├── JiebaTokenizer.cs
│   ├── Properties
│   │   └── AssemblyInfo.cs
│   └── packages.config
├── LICENSE
├── README.md
└── jiebaForLuceneNet.sln
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is needed for earlier builds of msysgit that do not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.userosscache
8 | *.sln.docstates
9 |
10 | # User-specific files (MonoDevelop/Xamarin Studio)
11 | *.userprefs
12 |
13 | # Build results
14 | [Dd]ebug/
15 | [Dd]ebugPublic/
16 | [Rr]elease/
17 | [Rr]eleases/
18 | x64/
19 | x86/
20 | build/
21 | bld/
22 | [Bb]in/
23 | [Oo]bj/
24 |
25 | # Visual Studio 2015 cache/options directory
26 | .vs/
27 |
28 | # MSTest test Results
29 | [Tt]est[Rr]esult*/
30 | [Bb]uild[Ll]og.*
31 |
32 | # NUNIT
33 | *.VisualState.xml
34 | TestResult.xml
35 |
36 | # Build Results of an ATL Project
37 | [Dd]ebugPS/
38 | [Rr]eleasePS/
39 | dlldata.c
40 |
41 | *_i.c
42 | *_p.c
43 | *_i.h
44 | *.ilk
45 | *.meta
46 | *.obj
47 | *.pch
48 | *.pdb
49 | *.pgc
50 | *.pgd
51 | *.rsp
52 | *.sbr
53 | *.tlb
54 | *.tli
55 | *.tlh
56 | *.tmp
57 | *.tmp_proj
58 | *.log
59 | *.vspscc
60 | *.vssscc
61 | .builds
62 | *.pidb
63 | *.svclog
64 | *.scc
65 |
66 | # Chutzpah Test files
67 | _Chutzpah*
68 |
69 | # Visual C++ cache files
70 | ipch/
71 | *.aps
72 | *.ncb
73 | *.opensdf
74 | *.sdf
75 | *.cachefile
76 |
77 | # Visual Studio profiler
78 | *.psess
79 | *.vsp
80 | *.vspx
81 |
82 | # TFS 2012 Local Workspace
83 | $tf/
84 |
85 | # Guidance Automation Toolkit
86 | *.gpState
87 |
88 | # ReSharper is a .NET coding add-in
89 | _ReSharper*/
90 | *.[Rr]e[Ss]harper
91 | *.DotSettings.user
92 |
93 | # JustCode is a .NET coding add-in
94 | .JustCode
95 |
96 | # TeamCity is a build add-in
97 | _TeamCity*
98 |
99 | # DotCover is a Code Coverage Tool
100 | *.dotCover
101 |
102 | # NCrunch
103 | _NCrunch_*
104 | .*crunch*.local.xml
105 |
106 | # MightyMoose
107 | *.mm.*
108 | AutoTest.Net/
109 |
110 | # Web workbench (sass)
111 | .sass-cache/
112 |
113 | # Installshield output folder
114 | [Ee]xpress/
115 |
116 | # DocProject is a documentation generator add-in
117 | DocProject/buildhelp/
118 | DocProject/Help/*.HxT
119 | DocProject/Help/*.HxC
120 | DocProject/Help/*.hhc
121 | DocProject/Help/*.hhk
122 | DocProject/Help/*.hhp
123 | DocProject/Help/Html2
124 | DocProject/Help/html
125 |
126 | # Click-Once directory
127 | publish/
128 |
129 | # Publish Web Output
130 | *.[Pp]ublish.xml
131 | *.azurePubxml
132 | # TODO: Comment the next line if you want to checkin your web deploy settings
133 | # but database connection strings (with potential passwords) will be unencrypted
134 | *.pubxml
135 | *.publishproj
136 |
137 | # NuGet Packages
138 | *.nupkg
139 | # The packages folder can be ignored because of Package Restore
140 | **/packages/*
141 | # except build/, which is used as an MSBuild target.
142 | !**/packages/build/
143 | # Uncomment if necessary however generally it will be regenerated when needed
144 | #!**/packages/repositories.config
145 |
146 | # Windows Azure Build Output
147 | csx/
148 | *.build.csdef
149 |
150 | # Windows Store app package directory
151 | AppPackages/
152 |
153 | # Others
154 | *.[Cc]ache
155 | ClientBin/
156 | [Ss]tyle[Cc]op.*
157 | ~$*
158 | *~
159 | *.dbmdl
160 | *.dbproj.schemaview
161 | *.pfx
162 | *.publishsettings
163 | node_modules/
164 | bower_components/
165 |
166 | # RIA/Silverlight projects
167 | Generated_Code/
168 |
169 | # Backup & report files from converting an old project file
170 | # to a newer Visual Studio version. Backup files are not needed,
171 | # because we have git ;-)
172 | _UpgradeReport_Files/
173 | Backup*/
174 | UpgradeLog*.XML
175 | UpgradeLog*.htm
176 |
177 | # SQL Server files
178 | *.mdf
179 | *.ldf
180 |
181 | # Business Intelligence projects
182 | *.rdl.data
183 | *.bim.layout
184 | *.bim_*.settings
185 |
186 | # Microsoft Fakes
187 | FakesAssemblies/
188 |
189 | # Node.js Tools for Visual Studio
190 | .ntvs_analysis.dat
191 |
192 | # Visual Studio 6 build log
193 | *.plg
194 |
195 | # Visual Studio 6 workspace options file
196 | *.opt
197 |
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/App.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/Integration.LuceneNet.Sample.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | {54E868F6-9151-4088-9302-2C48DC48280E}
8 | Exe
9 | Properties
10 | JiebaNet.Integration.LuceneNet.Sample
11 | JiebaNet.Integration.LuceneNet.Sample
12 | v4.5
13 | 512
14 |
15 |
16 | AnyCPU
17 | true
18 | full
19 | false
20 | bin\Debug\
21 | DEBUG;TRACE
22 | prompt
23 | 4
24 |
25 |
26 | AnyCPU
27 | pdbonly
28 | true
29 | bin\Release\
30 | TRACE
31 | prompt
32 | 4
33 |
34 |
35 |
36 | ..\packages\SharpZipLib.0.86.0\lib\20\ICSharpCode.SharpZipLib.dll
37 |
38 |
39 | False
40 | ..\packages\jieba.NET.0.38.2\lib\net45\JiebaNet.Analyser.dll
41 |
42 |
43 | False
44 | ..\packages\jieba.NET.0.38.2\lib\net45\JiebaNet.Segmenter.dll
45 |
46 |
47 | ..\packages\Lucene.Net.3.0.3\lib\NET40\Lucene.Net.dll
48 |
49 |
50 | False
51 | ..\packages\Newtonsoft.Json.8.0.3\lib\net45\Newtonsoft.Json.dll
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}
74 | Integration.LuceneNet
75 |
76 |
77 |
78 |
85 |
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/NewsData.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 |
4 | namespace JiebaNet.Integration.LuceneNet.Sample
5 | {
/// <summary>
/// A single sample record (id, title and body text) used for indexing and searching.
/// </summary>
public class News
{
    public int Id { get; set; }

    public string Title { get; set; }

    public string Content { get; set; }

    /// <summary>
    /// Renders the record as the id, a tab character, the title, ": " and the content.
    /// </summary>
    public override string ToString()
    {
        var head = string.Format("{0}\t{1}", Id, Title);
        return string.Format("{0}: {1}", head, Content);
    }
}
17 |
/// <summary>
/// In-memory source of the sample <see cref="News"/> records.
/// </summary>
public static class NewsRepository
{
    /// <summary>
    /// Looks up a single record by id.
    /// </summary>
    /// <param name="id">Id of the record to find.</param>
    /// <returns>The matching record, or null when no record has that id.</returns>
    public static News Get(int id)
    {
        return GetAll().SingleOrDefault(x => x.Id.Equals(id));
    }

    /// <summary>
    /// Builds a fresh list containing every sample record.
    /// </summary>
    /// <returns>A new list on every call; callers may modify it freely.</returns>
    public static List<News> GetAll()
    {
        return new List<News>()
        {
            new News {Id = 1, Title = "自私的基因", Content = "道金斯在《自私的基因》中的突破性贡献在于,把根据自然选择的社会学说的这一重要部分,用简明通俗的形式,妙趣横生的语言介绍给大家,这是第一次。他惊世骇俗地在《自私的基因》中提出:我们生来是自私的。人类窥见了社会关系中基本的对称性和逻辑性,在我们有了更充分的理解之后,我们的政治见解当会重新获得活力,并对心理学的科学研究提供理论上的支柱。在这一过程中,我们也必将对我们受苦受难的许多根源有一个更深刻的理解。"},
            new News {Id = 2, Title = "Rust Essentials Book", Content = "This book is intended for software developers interested in systems level and application programming, and are looking for a quick entry into using Rust and understanding the core features of the framework."},
            new News {Id = 3, Title = "Learning From Data", Content = "Machine learning allows computational systems to adaptively improve their performance with experience accumulated from the observed data. Its techniques are widely applied in engineering, science, finance, and commerce. This book is designed for a short course on machine learning. It is a short course, not a hurried course."},
            new News {Id = 4, Title = "人类简史-从动物到上帝", Content = "尤瓦尔·赫拉利,1976年生,牛津大学历史学博士,现为耶路撒冷希伯来大学的历史系教授,青年怪才,全球瞩目的新锐历史学家。他擅长世界历史和宏观历史进程研究。在学术领域和大众出版领域都有很大的兴趣。"},
            new News {Id = 5, Title = "这就是搜索引擎", Content = "搜索引擎作为互联网发展中至关重要的一种应用,已经成为互联网各个领域的制高点,其重要性不言而喻。搜索引擎领域也是互联网应用中不多见的以核心技术作为其命脉的领域,搜索引擎各个子系统是如何设计的?这成为广大技术人员和搜索引擎优化人员密切关注的内容。"},
            new News {Id = 6, Title = "社会性动物", Content = "《社会性动物》是“美国社会心理学的《圣经》”(Revue des Questions Scientifiques的评价),“仍然是最好的……是一部杰作”(Contemporary Psychology的评价)。本书从1972年出版第一版以来,在世界范围内畅销数千万册,是社会心理学领域内最具影响的著作。"},
            new News {Id = 7, Title = "动物庄园", Content = "故事《动物庄园(买中文版送英文版)》内容为:小说不属于人们所熟悉的蕴含教训的传统寓言,而是对现代政治神话的一种寓言式解构。“它是一部革命史,但它误入歧途,而且第一次偏离都那么有理由。”作者在1945年该书出版是如是说。奥威尔的传世之作,欧美15所名名牌大学投票选出“影响我成长的十《动物庄园(买中文版送英文版)》”之一,世界文坛最著名的政治讽喻小说。买中文版送英文版。乔治·奥威尔(George Orwell,1903-1950),原名埃里克·阿瑟·布莱尔(Eric Arthur Blair),英国作家、新闻记者、社会评论家,著名的英语文体作家。他在小说中创造的“老大哥”、“双重思想”、“新话”等词汇都已收入权威的英语词典,甚至有他的名字衍生的一个形容词“奥威尔式”不断出现在报道国际新闻的记者笔下,足以见其作品在英语国家影响之深远。“多一个人看奥威尔,就多了一份自由的保障”,有评论家如是说。"},
            new News {Id = 8, Title = "动物故事集", Content = "在《动物故事集》里,常见和不常见的狐、蛇、兔、黄鼬……披着角色的外衣纷纷登场。名为“动物故事”,其实说到底还是“动物化”的人事。在小说的世界里,有点质朴,又有点神秘,就像一大群人深夜围在火堆边讲故事时讲出的故事,似真似假,似有似无,但又总是让人忍不住地听下去。"},
            new News {Id = 9, Title = "计算理论导引", Content = "本书是计算理论领域的经典著作,被国外多所大学选用为教材。本书以注重思路、深入引导为特色,系统地介绍计算理论的三大主要内容:自动机与语言、可计算性理论和计算复杂性理论。同时,对可计算性和计算复杂性理论中的某些高级内容作了重点讲解。全书通过启发性的问题、精彩的结果和待解决问题来引导读者挑战此领域中的高层次问题。新版的一大亮点是增加了更多习题、教辅资料和部分习题解答,更加有利于教学。全书叙述由浅入深、详略得当,重点突出,不拘泥于技术细节。可作为计算机专业高年级本科生和研究生的教材,也可作为相关专业教师和研究人员的参考书。"},
            new News {Id = 10, Title = "计算机系统要素", Content = "本书通过展现简单但功能强大的计算机系统之构建过程,为读者呈现了一幅完整、严格的计算机应用科学大图景。本书作者认为,理解计算机工作原理的最好方法就是亲自动手,从零开始构建计算机系统。通过12个章节和项目来引领读者从头开始,本书逐步地构建一个基本的硬件平台和现代软件阶层体系。在这个过程中,读者能够获得关于硬件体系结构、操作系统、编程语言、编译器、数据结构、算法以及软件工程的详实知识。通过这种逐步构造的方法,本书揭示了计算机科学知识中的重要成分,并展示其它课程中所介绍的理论和应用技术如何融入这幅全局大图景当中去。全书基于“先抽象再实现”的阐述模式,每一章都介绍一个关键的硬件或软件抽象,一种实现方式以及一个实际的项目。完成这些项目所必要的计算机科学知识在本书中都有涵盖,只要求读者具备程序设计经验。本书配套的支持网站提供了书中描述的用于构建所有硬件和软件系统"},
            new News {Id = 11, Title = "数据挖掘导论", Content = "本书全面介绍了数据挖掘,涵盖了五个主题:数据、分类、关联分析、聚类和异常检测。除异常检测外,每个主题都有两章。前一章涵盖基本概念、代表性算法和评估技术,而后一章讨论高级概念和算法。这样读者在透彻地理解数据挖掘的基础的同时,还能够了解更多重要的高级主题。本书是明尼苏达大学和密歇根州立大学数据挖掘课程的教材,由于独具特色,正式出版之前就已经被斯坦福大学、得克萨斯大学奥斯汀分校等众多名校采用。"},
            new News {Id = 12, Title = "机器学习实战", Content = "机器学习是人工智能研究领域中一个极其重要的研究方向,在现今的大数据时代背景下,捕获数据并从中萃取有价值的信息或模式,成为各行业求生存、谋发展的决定性手段,这使得这一过去为分析师和数学家所专属的研究领域越来越为人们所瞩目。本书第一部分主要介绍机器学习基础,以及如何利用算法进行分类,并逐步介绍了多种经典的监督学习算法,如k近邻算法、朴素贝叶斯算法、Logistic回归算法、支持向量机、AdaBoost集成方法、基于树的回归算法和分类回归树(CART)算法等。第三部分则重点介绍无监督学习及其一些主要算法:k均值聚类算法、Apriori算法、FP-Growth算法。第四部分介绍了机器学习算法的一些附属工具。"},
            new News {Id = 13, Title = "机器学习", Content = "《机器学习》展示了机器学习中核心的算法和理论,并阐明了算法的运行过程。《机器学习》综合了许多的研究成果,例如统计学、人工智能、哲学、信息论、生物学、认知科学、计算复杂性和控制论等,并以此来理解问题的背景、算法和其中的隐含假定。《机器学习》可作为计算机专业 本科生、研究生教材,也可作为相关领域研究人员、教师的参考书。"},
            new News {Id = 14, Title = "机器学习", Content = "机器学习是计算机科学和人工智能中非常重要的一个研究领域,近年来,机器学习不但在计算机科学的众多领域中大显身手,而且成为一些交叉学科的重要支撑技术。本书比较全面系统地介绍了机器学习的方法和技术,不仅详细阐述了许多经典的学习方法,还讨论了一些有生命力的新理论、新方法。全书案例既有分类问题,也有回归问题;既包含监督学习,也涵盖无监督学习。本书讨论的案例从分类讲到回归,然后讨论了聚类、降维、最优化问题等。这些案例包括分类:垃圾邮件识别,排序:智能收件箱,回归模型:预测网页访问量,正则化:文本回归,最优化:密码破解,无监督学习:"},
            new News {Id = 15, Title = "统计学习方法", Content = "《统计学习方法》是计算机及其应用领域的一门重要的学科。《统计学习方法》全面系统地介绍了统计学习的主要方法,特别是监督学习方法,包括感知机、k近邻法、朴素贝叶斯法、决策树、逻辑斯谛回归与最大熵模型、支持向量机、提升方法、EM算法、隐马尔可夫模型和条件随机场等。除第1章概论和最后一章总结外,每章介绍一种方法。叙述从具体问题或实例入手,由浅入深,阐明思路,给出必要的数学推导,便于读者掌握统计学习方法的实质,学会运用。为满足读者进一步学习的需要,书中还介绍了一些相关研究,给出了少量习题,列出了主要参考文献。"},
            new News {Id = 16, Title = "多外语学习的语言习得原理", Content = "《多外语学习的语言习得原理、认知规律及学习方法研究》从认知和心理语言学的角度探讨了多外语学习的特点,对比分析了二语习得与三语习得过程的异同,尤其是对中介语进行了深入的分析。《多外语学习的语言习得原理、认知规律及学习方法研究》引用了大量的已有的实验研究成果和观点,同时又以中国学习者为对象采集了第一手数据,进行了实证研究,可以说是间接性研究和原始性研究相结合、定性研究与定量研究相结合,体现了该学科多元化研究方法的特点。"},
            //new News {Id = 16, Title = "语言学家", Content = "语言学家"},
        };
    }
}
49 | }
50 |
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/NewsSearcher.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using JiebaNet.Segmenter;
7 | using Lucene.Net.Analysis;
8 | using Lucene.Net.Documents;
9 | using Lucene.Net.Index;
10 | using Lucene.Net.QueryParsers;
11 | using Lucene.Net.Search;
12 | using Lucene.Net.Store;
13 | using Version = Lucene.Net.Util.Version;
14 |
15 | namespace JiebaNet.Integration.LuceneNet.Sample
16 | {
17 | public static class NewsSearcher
18 | {
19 | private static readonly string LuceneDir = "lucene_index";
20 | private static FSDirectory _directoryTemp;
21 |
/// <summary>
/// The index directory, opened on first access and cached afterwards.
/// Every access also force-releases any write lock left on the directory
/// and deletes a leftover "write.lock" file.
/// </summary>
private static FSDirectory Directory
{
    get
    {
        // Open the directory once; later accesses reuse the cached instance.
        var dir = _directoryTemp ?? (_directoryTemp = FSDirectory.Open(new DirectoryInfo(LuceneDir)));

        // Release a lock that Lucene still reports on the directory.
        if (IndexWriter.IsLocked(dir))
        {
            IndexWriter.Unlock(dir);
        }

        // Remove the lock file itself if it is still on disk.
        var staleLock = Path.Combine(LuceneDir, "write.lock");
        if (File.Exists(staleLock))
        {
            File.Delete(staleLock);
        }

        return dir;
    }
}
44 |
/// <summary>
/// Creates the analyzer used for both indexing and querying.
/// </summary>
/// <returns>A new <see cref="JiebaAnalyzer"/> instance.</returns>
private static Analyzer GetAnalyzer()
{
    Analyzer jieba = new JiebaAnalyzer();
    return jieba;
}
49 |
50 | #region Add & Update Index
51 |
/// <summary>
/// Replaces the index entry for one record: deletes any document with the same
/// "Id" term, then adds a new document built from the record.
/// </summary>
/// <param name="data">Record to index.</param>
/// <param name="writer">Open index writer that receives the delete and the add.</param>
private static void AddToLuceneIndex(News data, IndexWriter writer)
{
    var idText = data.Id.ToString();

    // Drop the previous version of this record, if any.
    writer.DeleteDocuments(new TermQuery(new Term("Id", idText)));

    // "Id" is stored verbatim so it can be matched exactly; the text fields are analyzed.
    var fresh = new Document();
    fresh.Add(new Field("Id", idText, Field.Store.YES, Field.Index.NOT_ANALYZED));
    fresh.Add(new Field("Title", data.Title, Field.Store.YES, Field.Index.ANALYZED));
    fresh.Add(new Field("Content", data.Content, Field.Store.YES, Field.Index.ANALYZED));

    writer.AddDocument(fresh);
}
66 |
/// <summary>
/// Adds the given records to the index, replacing any existing entry with the same id.
/// </summary>
/// <param name="data">Records to index.</param>
public static void UpdateLuceneIndex(IEnumerable<News> data)
{
    // Stacked usings: the writer is disposed first, then the analyzer, and both are
    // released even if indexing one of the records throws.
    using (var analyzer = GetAnalyzer())
    using (var writer = new IndexWriter(Directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        foreach (var item in data)
        {
            // Replaces the older entry, if any.
            AddToLuceneIndex(item, writer);
        }
    }
}
83 |
/// <summary>
/// Adds or replaces a single record in the index.
/// </summary>
/// <param name="data">Record to index.</param>
public static void UpdateLuceneIndex(News data)
{
    // Wrap the record in a one-element array so the collection overload does the work.
    News[] single = { data };
    UpdateLuceneIndex(single);
}
88 |
89 | #endregion
90 |
91 | #region Clear Index
92 |
/// <summary>
/// Removes the document whose "Id" term equals <paramref name="recordId"/> from the index.
/// </summary>
/// <param name="recordId">Id of the record to remove.</param>
public static void ClearLuceneIndexRecord(int recordId)
{
    // Stacked usings release both the writer and the analyzer even if the delete throws.
    using (var analyzer = GetAnalyzer())
    using (var writer = new IndexWriter(Directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        writer.DeleteDocuments(new TermQuery(new Term("Id", recordId.ToString())));
    }
}
105 |
/// <summary>
/// Removes every document from the index.
/// </summary>
/// <returns>true when the index was cleared; false when any exception occurred.</returns>
public static bool ClearLuceneIndex()
{
    try
    {
        // The writer is opened with create = true and then asked to delete everything.
        // Stacked usings release the writer and the analyzer on every path.
        using (var analyzer = GetAnalyzer())
        using (var writer = new IndexWriter(Directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            writer.DeleteAll();
        }
    }
    catch (Exception)
    {
        // Best effort by design: failures are reported through the return value.
        return false;
    }

    return true;
}
126 |
127 | #endregion
128 |
129 | #region Optimize Index
130 |
/// <summary>
/// Runs Lucene's index optimization on the index.
/// </summary>
public static void OptimizeLuceneIndex()
{
    // The analyzer stays alive for as long as the writer that was built with it,
    // and both are released even if optimization throws.
    using (var analyzer = GetAnalyzer())
    using (var writer = new IndexWriter(Directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        writer.Optimize();
    }
}
141 |
142 | #endregion
143 |
144 | #region Mappers
145 |
/// <summary>
/// Converts a stored Lucene document back into a <see cref="News"/> record
/// by reading its "Id", "Title" and "Content" fields.
/// </summary>
/// <param name="doc">Document read from the index.</param>
/// <returns>The record rebuilt from the stored field values.</returns>
private static News MapDataToModel(Document doc)
{
    var news = new News();
    news.Id = int.Parse(doc.Get("Id"));
    news.Title = doc.Get("Title");
    news.Content = doc.Get("Content");
    return news;
}
155 |
/// <summary>
/// Converts a sequence of Lucene documents into records.
/// </summary>
/// <param name="hits">Documents to convert.</param>
/// <returns>A materialized list with one record per document, in the same order.</returns>
private static IEnumerable<News> MapLuceneToDataList(IEnumerable<Document> hits)
{
    return hits.Select(MapDataToModel).ToList();
}
160 |
/// <summary>
/// Loads the document behind each search hit and converts it into a record.
/// </summary>
/// <param name="hits">Scored hits returned by a search.</param>
/// <param name="searcher">Searcher used to load each hit's stored document.</param>
/// <returns>
/// A materialized list, so the result stays usable after <paramref name="searcher"/> is disposed.
/// </returns>
private static IEnumerable<News> MapLuceneToDataList(IEnumerable<ScoreDoc> hits, IndexSearcher searcher)
{
    return hits.Select(hit => MapDataToModel(searcher.Doc(hit.Doc))).ToList();
}
165 |
166 | #endregion
167 |
168 | #region Search
169 |
/// <summary>
/// Segments <paramref name="keywords"/> with the tokenizer and joins the
/// non-blank words with single spaces.
/// </summary>
/// <param name="keywords">Raw user input.</param>
/// <param name="tokenizer">Tokenizer whose Tokenize method performs the segmentation.</param>
/// <returns>The space-separated words, trimmed of leading and trailing whitespace.</returns>
private static string GetKeyWordsSplitBySpace(string keywords, JiebaTokenizer tokenizer)
{
    var pieces = tokenizer.Tokenize(keywords)
        .Where(token => !string.IsNullOrWhiteSpace(token.Word))
        .Select(token => token.Word);

    return string.Join(" ", pieces).Trim();
}
188 |
/// <summary>
/// Parses the query text, falling back to an escaped form when the raw text
/// is not valid query syntax.
/// </summary>
/// <param name="searchQuery">Query text; it is trimmed before parsing.</param>
/// <param name="parser">Parser to use.</param>
/// <returns>The parsed query.</returns>
private static Query ParseQuery(string searchQuery, QueryParser parser)
{
    try
    {
        return parser.Parse(searchQuery.Trim());
    }
    catch (ParseException)
    {
        // NOTE(review): the "*" is appended before escaping, so it is passed through
        // QueryParser.Escape together with the rest of the text - confirm this is intended.
        return parser.Parse(QueryParser.Escape(searchQuery.Trim() + "*"));
    }
}
203 |
/// <summary>
/// Runs a query against the index and returns the matching records.
/// </summary>
/// <param name="searchQuery">Query text in Lucene query syntax.</param>
/// <param name="searchField">
/// Field to search. When empty, the "Id", "Title" and "Content" fields are all
/// searched and hits are sorted by relevance.
/// </param>
/// <returns>
/// At most 1000 matching records; an empty list when the query is null, empty
/// or consists only of wildcard characters.
/// </returns>
private static IEnumerable<News> SearchQuery(string searchQuery, string searchField = "")
{
    // Nothing to search for once the wildcard characters are removed.
    if (string.IsNullOrEmpty(searchQuery) ||
        string.IsNullOrEmpty(searchQuery.Replace("*", "").Replace("?", "")))
    {
        return new List<News>();
    }

    const int hitsLimit = 1000;

    // Stacked usings release the analyzer and the searcher on every path, including
    // parse or search failures.
    using (var searcher = new IndexSearcher(Directory, false))
    using (var analyzer = GetAnalyzer())
    {
        IEnumerable<ScoreDoc> hits;

        if (!string.IsNullOrEmpty(searchField))
        {
            // Single-field search.
            var parser = new QueryParser(Version.LUCENE_30, searchField, analyzer);
            var query = ParseQuery(searchQuery, parser);
            hits = searcher.Search(query, hitsLimit).ScoreDocs;
        }
        else
        {
            // Search all indexed fields, sorted by relevance.
            var parser = new MultiFieldQueryParser(Version.LUCENE_30, new[] { "Id", "Title", "Content" }, analyzer);
            var query = ParseQuery(searchQuery, parser);
            hits = searcher.Search(query, null, hitsLimit, Sort.RELEVANCE).ScoreDocs;
        }

        // The mapper materializes the list, so it is safe to return it from inside the usings.
        return MapLuceneToDataList(hits, searcher);
    }
}
252 |
/// <summary>
/// Segments the input with jieba, turns every resulting word into a prefix
/// query ("word*") and searches the index.
/// </summary>
/// <param name="input">Raw user input.</param>
/// <param name="fieldName">Field to search; empty to search all fields.</param>
/// <returns>The matching records; an empty list when the input is null or empty.</returns>
public static IEnumerable<News> Search(string input, string fieldName = "")
{
    if (string.IsNullOrEmpty(input))
    {
        return new List<News>();
    }

    var kwords = input;
    kwords = GetKeyWordsSplitBySpace(kwords, new JiebaTokenizer(new JiebaSegmenter(), kwords));

    // Treat "-" as a separator too, drop empty pieces and append the wildcard to each word.
    var terms = kwords.Trim().Replace("-", " ").Split(' ')
        .Where(x => !string.IsNullOrEmpty(x)).Select(x => x.Trim() + "*");
    input = string.Join(" ", terms);

    return SearchQuery(input, fieldName);
}
269 |
/// <summary>
/// Searches the index with the input used as-is, without jieba segmentation
/// or added wildcards.
/// </summary>
/// <param name="input">Query text in Lucene query syntax.</param>
/// <param name="fieldName">Field to search; empty to search all fields.</param>
/// <returns>The matching records; an empty list when the input is null or empty.</returns>
public static IEnumerable<News> SearchDefault(string input, string fieldName = "")
{
    if (string.IsNullOrEmpty(input))
    {
        return new List<News>();
    }

    return SearchQuery(input, fieldName);
}
274 |
275 | #endregion
276 |
/// <summary>
/// All the data indexed.
/// </summary>
/// <returns>
/// One record per document enumerated from the index; an empty list when the
/// index directory is missing or contains no files.
/// </returns>
public static IEnumerable<News> GetAllData()
{
    // EnumerateFiles throws when the directory does not exist, so check for it first.
    if (!System.IO.Directory.Exists(LuceneDir) ||
        !System.IO.Directory.EnumerateFiles(LuceneDir).Any())
    {
        return new List<News>();
    }

    var docs = new List<Document>();

    // Stacked usings release the reader and then the searcher even if enumeration throws.
    using (var searcher = new IndexSearcher(Directory, false))
    using (var reader = IndexReader.Open(Directory, false))
    {
        var term = reader.TermDocs();
        while (term.Next())
        {
            docs.Add(searcher.Doc(term.Doc));
        }
    }

    return MapLuceneToDataList(docs);
}
302 | }
303 | }
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using JiebaNet.Segmenter;
3 |
4 | namespace JiebaNet.Integration.LuceneNet.Sample
5 | {
/// <summary>
/// Console entry point of the sample: rebuilds the index from the sample
/// records and prints the results of one search.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        TestNewsData();
    }

    /// <summary>
    /// Adds a custom word to the segmenter, clears and re-populates the index,
    /// then writes every hit for the sample query to the console.
    /// </summary>
    private static void TestNewsData()
    {
        var segmenter = new JiebaSegmenter();
        segmenter.AddWord("机器学习");

        // Start from an empty index, then index every sample record.
        NewsSearcher.ClearLuceneIndex();
        NewsSearcher.UpdateLuceneIndex(NewsRepository.GetAll());

        foreach (var hit in NewsSearcher.Search("进"))
        {
            Console.WriteLine(hit);
        }
    }
}
30 | }
31 |
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General Information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("Integration.LuceneNet.Sample")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("Microsoft")]
12 | [assembly: AssemblyProduct("Integration.LuceneNet.Sample")]
13 | [assembly: AssemblyCopyright("Copyright © Microsoft 2015")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to false makes the types in this assembly not visible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID is for the ID of the typelib if this project is exposed to COM
23 | [assembly: Guid("9780c9f0-464c-4538-9b3d-57ca0b0a4828")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major Version
28 | // Minor Version
29 | // Build Number
30 | // Revision
31 | //
32 | // You can specify all the values or you can default the Build and Revision Numbers
33 | // by using the '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("0.37.2.0")]
36 | [assembly: AssemblyFileVersion("0.37.2.0")]
37 |
--------------------------------------------------------------------------------
/Integration.LuceneNet.Sample/packages.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/Integration.LuceneNet/Integration.LuceneNet.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}
8 | Library
9 | Properties
10 | JiebaNet.Integration.LuceneNet
11 | JiebaNet.Integration.LuceneNet
12 | v4.5
13 | 512
14 |
15 |
16 | true
17 | full
18 | false
19 | bin\Debug\
20 | DEBUG;TRACE
21 | prompt
22 | 4
23 |
24 |
25 | pdbonly
26 | true
27 | bin\Release\
28 | TRACE
29 | prompt
30 | 4
31 |
32 |
33 |
34 | ..\packages\SharpZipLib.0.86.0\lib\20\ICSharpCode.SharpZipLib.dll
35 |
36 |
37 | False
38 | ..\packages\jieba.NET.0.38.2\lib\net45\JiebaNet.Analyser.dll
39 |
40 |
41 | False
42 | ..\packages\jieba.NET.0.38.2\lib\net45\JiebaNet.Segmenter.dll
43 |
44 |
45 | ..\packages\Lucene.Net.3.0.3\lib\NET40\Lucene.Net.dll
46 |
47 |
48 | False
49 | ..\packages\Newtonsoft.Json.8.0.3\lib\net45\Newtonsoft.Json.dll
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
75 |
--------------------------------------------------------------------------------
/Integration.LuceneNet/JiebaAnalyzer.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.IO;
3 | using JiebaNet.Segmenter;
4 | using Lucene.Net.Analysis;
5 |
6 | namespace JiebaNet.Integration.LuceneNet
7 | {
/// <summary>
/// Lucene analyzer that segments text with jieba, lower-cases the tokens and
/// removes stop words.
/// </summary>
public class JiebaAnalyzer : Analyzer
{
    /// <summary>Stop words used when the jieba stop-words file is not found.</summary>
    protected static readonly ISet<string> DefaultStopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    // Assigned once by the static constructor and shared by every token stream.
    private static readonly ISet<string> StopWords;

    /// <summary>
    /// Loads the stop words from the file configured in jieba's ConfigManager,
    /// one trimmed line per word, or falls back to <see cref="DefaultStopWords"/>
    /// when that file does not exist.
    /// </summary>
    static JiebaAnalyzer()
    {
        var stopWordsFile = Path.GetFullPath(JiebaNet.Analyser.ConfigManager.StopWordsFile);
        if (File.Exists(stopWordsFile))
        {
            var lines = File.ReadAllLines(stopWordsFile);
            var words = new HashSet<string>();
            foreach (var line in lines)
            {
                words.Add(line.Trim());
            }

            StopWords = words;
        }
        else
        {
            StopWords = DefaultStopWords;
        }
    }

    /// <summary>
    /// Builds the analysis chain for a field: jieba tokenizer, then lower-casing,
    /// then stop-word removal.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used).</param>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <returns>The filtered token stream.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var seg = new JiebaSegmenter();
        TokenStream result = new JiebaTokenizer(seg, reader);
        // This filter is necessary, because the parser converts the queries to lower case.
        result = new LowerCaseFilter(result);
        result = new StopFilter(true, result, StopWords);
        return result;
    }
}
42 | }
43 |
--------------------------------------------------------------------------------
/Integration.LuceneNet/JiebaTokenizer.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.IO;
3 | using System.Linq;
4 | using JiebaNet.Segmenter;
5 | using Lucene.Net.Analysis;
6 | using Lucene.Net.Analysis.Tokenattributes;
7 |
8 | namespace JiebaNet.Integration.LuceneNet
9 | {
/// <summary>
/// Lucene tokenizer backed by the jieba segmenter. The whole input is segmented
/// in the constructor; <see cref="IncrementToken"/> then replays the tokens one by one.
/// </summary>
public class JiebaTokenizer : Tokenizer
{
    private readonly JiebaSegmenter segmenter;
    private readonly ITermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;
    private readonly ITypeAttribute typeAtt;

    // Fully qualified: both JiebaNet.Segmenter and Lucene.Net.Analysis declare a type named Token.
    private readonly List<JiebaNet.Segmenter.Token> tokens;

    // Index of the token most recently returned; -1 before the first call.
    private int position = -1;

    /// <summary>
    /// Creates a tokenizer over all the text that remains in <paramref name="input"/>.
    /// </summary>
    /// <param name="seg">Segmenter that splits the text.</param>
    /// <param name="input">Reader that is read to its end immediately.</param>
    public JiebaTokenizer(JiebaSegmenter seg, TextReader input) : this(seg, input.ReadToEnd()) { }

    /// <summary>
    /// Creates a tokenizer over the given text and segments it immediately in search mode.
    /// </summary>
    /// <param name="seg">Segmenter that splits the text.</param>
    /// <param name="input">Text to segment.</param>
    public JiebaTokenizer(JiebaSegmenter seg, string input)
    {
        segmenter = seg;
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();

        tokens = segmenter.Tokenize(input, TokenizerMode.Search).ToList();
    }

    /// <summary>
    /// Advances to the next token and copies its text, offsets and type into the attributes.
    /// </summary>
    /// <returns>true when a token was produced; false once the tokens are exhausted.</returns>
    public override bool IncrementToken()
    {
        ClearAttributes();
        position++;
        if (position < tokens.Count)
        {
            var token = tokens[position];
            termAtt.SetTermBuffer(token.Word);
            offsetAtt.SetOffset(token.StartIndex, token.EndIndex);
            typeAtt.Type = "Jieba";
            return true;
        }

        End();
        return false;
    }

    /// <summary>
    /// Rewinds the stream so the already segmented tokens can be consumed again.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        position = -1;
    }

    /// <summary>
    /// Segments arbitrary text with this tokenizer's segmenter, independently of the
    /// token stream state.
    /// </summary>
    /// <param name="text">Text to segment.</param>
    /// <param name="mode">Segmentation mode; search mode by default.</param>
    /// <returns>The tokens produced by the segmenter.</returns>
    public IEnumerable<JiebaNet.Segmenter.Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
    {
        return segmenter.Tokenize(text, mode);
    }
}
55 | }
--------------------------------------------------------------------------------
/Integration.LuceneNet/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General Information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("Integration.LuceneNet")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("Microsoft")]
12 | [assembly: AssemblyProduct("Integration.LuceneNet")]
13 | [assembly: AssemblyCopyright("Copyright © Microsoft 2015")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to false makes the types in this assembly not visible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID is for the ID of the typelib if this project is exposed to COM
23 | [assembly: Guid("ef4d97f3-be8f-4e71-9e44-32943be0f792")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major Version
28 | // Minor Version
29 | // Build Number
30 | // Revision
31 | //
32 | // You can specify all the values or you can default the Build and Revision Numbers
33 | // by using the '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("0.37.2.0")]
36 | [assembly: AssemblyFileVersion("0.37.2.0")]
37 |
--------------------------------------------------------------------------------
/Integration.LuceneNet/packages.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 andersc
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### jiebaForLuceneNet
2 | Integrates the jieba.NET segmenter with Lucene.NET.
3 |
4 | ### 一、jiebaForLuceneNet的使用
5 |
6 | * `JiebaAnalyzer`:与Lucene.NET集成的主接口,添加索引和搜索时使用此类的实例作为analyzer参数
7 | * `JiebaTokenizer`:为JiebaAnalyzer提供分词功能
8 | * `Integration.LuceneNet.Sample`项目中有示例,演示如何通过jieba分词添加索引和搜索
9 |
10 | ### 二、若对Lucene.NET不甚熟悉,请先看:
11 |
12 | #### Lucene.NET的基本用法
13 |
14 | * 参阅 CodeProject 上的文章:[Lucene.Net ultra fast search for MVC or WebForms site](http://www.codeproject.com/Articles/320219/Lucene-Net-ultra-fast-search-for-MVC-or-WebForms?msg=4643090#xx4643090xx)
15 |
16 | #### 如何自定义Tokenizer和Analyzer
17 |
18 | * [Lucene.Net – Custom Synonym Analyzer](http://www.codeproject.com/Articles/32201/Lucene-Net-Custom-Synonym-Analyzer)
19 | * https://github.com/JimLiu/Lucene.Net.Analysis.PanGu
20 | * https://github.com/JimLiu/Lucene.Net.Analysis.MMSeg
21 |
22 | #### 应用自定义Analyzer
23 |
24 | * http://pangusegment.codeplex.com/wikipage?title=PanGu4Lucene
25 |
26 | #### 调试:通过Luke查看Lucene.NET的索引
27 |
28 | * http://luke.codeplex.com/releases/view/82033
29 |
30 |
--------------------------------------------------------------------------------
/jiebaForLuceneNet.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 2013
4 | VisualStudioVersion = 12.0.21005.1
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Integration.LuceneNet", "Integration.LuceneNet\Integration.LuceneNet.csproj", "{F2744A8B-08B2-4208-9051-2AF4E8841E92}"
7 | EndProject
8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Integration.LuceneNet.Sample", "Integration.LuceneNet.Sample\Integration.LuceneNet.Sample.csproj", "{54E868F6-9151-4088-9302-2C48DC48280E}"
9 | EndProject
10 | Global
11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
12 | Debug|Any CPU = Debug|Any CPU
13 | Release|Any CPU = Release|Any CPU
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
17 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}.Debug|Any CPU.Build.0 = Debug|Any CPU
18 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}.Release|Any CPU.ActiveCfg = Release|Any CPU
19 | {F2744A8B-08B2-4208-9051-2AF4E8841E92}.Release|Any CPU.Build.0 = Release|Any CPU
20 | {54E868F6-9151-4088-9302-2C48DC48280E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
21 | {54E868F6-9151-4088-9302-2C48DC48280E}.Debug|Any CPU.Build.0 = Debug|Any CPU
22 | {54E868F6-9151-4088-9302-2C48DC48280E}.Release|Any CPU.ActiveCfg = Release|Any CPU
23 | {54E868F6-9151-4088-9302-2C48DC48280E}.Release|Any CPU.Build.0 = Release|Any CPU
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | EndGlobal
29 |
--------------------------------------------------------------------------------