├── .gitattributes
├── .gitignore
├── Analyser
│   ├── .DS_Store
│   ├── Analyser.csproj
│   ├── ConfigManager.cs
│   ├── IdfLoader.cs
│   ├── KeywordExtractor.cs
│   ├── Resources
│   │   ├── idf.txt
│   │   └── stopwords.txt
│   ├── TextRankExtractor.cs
│   ├── TfidfExtractor.cs
│   └── UndirectWeightedGraph.cs
├── ConsoleApp1
│   ├── Article.cs
│   ├── ConsoleApp1.csproj
│   └── Program.cs
├── EasyLuceneNET
│   ├── EasyLuceneNET.csproj
│   ├── EasyLuceneNetDefaultProvider.cs
│   ├── EasyLuceneNetExtensions.cs
│   └── IEasyLuceneNet.cs
├── LICENSE
├── README.md
├── Segmenter
│   ├── .DS_Store
│   ├── Common
│   │   ├── Extensions.cs
│   │   ├── FileExtension.cs
│   │   └── Trie.cs
│   ├── ConfigManager.cs
│   ├── Constants.cs
│   ├── DefaultDictionary.cs
│   ├── FinalSeg
│   │   ├── IFinalSeg.cs
│   │   └── Viterbi.cs
│   ├── JiebaSegmenter.cs
│   ├── Node.cs
│   ├── Pair.cs
│   ├── PosSeg
│   │   ├── Pair.cs
│   │   ├── PosSegmenter.cs
│   │   └── Viterbi.cs
│   ├── Resources
│   │   ├── char_state_tab.json
│   │   ├── dict.txt
│   │   ├── pos_prob_emit.json
│   │   ├── pos_prob_start.json
│   │   ├── pos_prob_trans.json
│   │   ├── prob_emit.json
│   │   └── prob_trans.json
│   ├── Segmenter.csproj
│   ├── Spelling
│   │   └── SpellChecker.cs
│   ├── Token.cs
│   ├── WordDictionary.cs
│   └── WordInfo.cs
├── Test
│   ├── SegmentTest.cs
│   └── Test.csproj
├── jieba.NET.sln
└── jieba.NET
    ├── .DS_Store
    ├── JieBaAnalyzer.cs
    ├── JieBaTokenizer.cs
    ├── Resources
    │   └── stopwords.txt
    └── jieba.NET.csproj
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Build results
17 | [Dd]ebug/
18 | [Dd]ebugPublic/
19 | [Rr]elease/
20 | [Rr]eleases/
21 | x64/
22 | x86/
23 | bld/
24 | [Bb]in/
25 | [Oo]bj/
26 | [Ll]og/
27 |
28 | # Visual Studio 2015/2017 cache/options directory
29 | .vs/
30 | # Uncomment if you have tasks that create the project's static files in wwwroot
31 | #wwwroot/
32 |
33 | # Visual Studio 2017 auto generated files
34 | Generated\ Files/
35 |
36 | # MSTest test Results
37 | [Tt]est[Rr]esult*/
38 | [Bb]uild[Ll]og.*
39 |
40 | # NUNIT
41 | *.VisualState.xml
42 | TestResult.xml
43 |
44 | # Build Results of an ATL Project
45 | [Dd]ebugPS/
46 | [Rr]eleasePS/
47 | dlldata.c
48 |
49 | # Benchmark Results
50 | BenchmarkDotNet.Artifacts/
51 |
52 | # .NET Core
53 | project.lock.json
54 | project.fragment.lock.json
55 | artifacts/
56 |
57 | # StyleCop
58 | StyleCopReport.xml
59 |
60 | # Files built by Visual Studio
61 | *_i.c
62 | *_p.c
63 | *_h.h
64 | *.ilk
65 | *.meta
66 | *.obj
67 | *.iobj
68 | *.pch
69 | *.pdb
70 | *.ipdb
71 | *.pgc
72 | *.pgd
73 | *.rsp
74 | *.sbr
75 | *.tlb
76 | *.tli
77 | *.tlh
78 | *.tmp
79 | *.tmp_proj
80 | *_wpftmp.csproj
81 | *.log
82 | *.vspscc
83 | *.vssscc
84 | .builds
85 | *.pidb
86 | *.svclog
87 | *.scc
88 |
89 | # Chutzpah Test files
90 | _Chutzpah*
91 |
92 | # Visual C++ cache files
93 | ipch/
94 | *.aps
95 | *.ncb
96 | *.opendb
97 | *.opensdf
98 | *.sdf
99 | *.cachefile
100 | *.VC.db
101 | *.VC.VC.opendb
102 |
103 | # Visual Studio profiler
104 | *.psess
105 | *.vsp
106 | *.vspx
107 | *.sap
108 |
109 | # Visual Studio Trace Files
110 | *.e2e
111 |
112 | # TFS 2012 Local Workspace
113 | $tf/
114 |
115 | # Guidance Automation Toolkit
116 | *.gpState
117 |
118 | # ReSharper is a .NET coding add-in
119 | _ReSharper*/
120 | *.[Rr]e[Ss]harper
121 | *.DotSettings.user
122 |
123 | # JustCode is a .NET coding add-in
124 | .JustCode
125 |
126 | # TeamCity is a build add-in
127 | _TeamCity*
128 |
129 | # DotCover is a Code Coverage Tool
130 | *.dotCover
131 |
132 | # AxoCover is a Code Coverage Tool
133 | .axoCover/*
134 | !.axoCover/settings.json
135 |
136 | # Visual Studio code coverage results
137 | *.coverage
138 | *.coveragexml
139 |
140 | # NCrunch
141 | _NCrunch_*
142 | .*crunch*.local.xml
143 | nCrunchTemp_*
144 |
145 | # MightyMoose
146 | *.mm.*
147 | AutoTest.Net/
148 |
149 | # Web workbench (sass)
150 | .sass-cache/
151 |
152 | # Installshield output folder
153 | [Ee]xpress/
154 |
155 | # DocProject is a documentation generator add-in
156 | DocProject/buildhelp/
157 | DocProject/Help/*.HxT
158 | DocProject/Help/*.HxC
159 | DocProject/Help/*.hhc
160 | DocProject/Help/*.hhk
161 | DocProject/Help/*.hhp
162 | DocProject/Help/Html2
163 | DocProject/Help/html
164 |
165 | # Click-Once directory
166 | publish/
167 |
168 | # Publish Web Output
169 | *.[Pp]ublish.xml
170 | *.azurePubxml
171 | # Note: Comment the next line if you want to checkin your web deploy settings,
172 | # but database connection strings (with potential passwords) will be unencrypted
173 | *.pubxml
174 | *.publishproj
175 |
176 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
177 | # checkin your Azure Web App publish settings, but sensitive information contained
178 | # in these scripts will be unencrypted
179 | PublishScripts/
180 |
181 | # NuGet Packages
182 | *.nupkg
183 | # The packages folder can be ignored because of Package Restore
184 | **/[Pp]ackages/*
185 | # except build/, which is used as an MSBuild target.
186 | !**/[Pp]ackages/build/
187 | # Uncomment if necessary however generally it will be regenerated when needed
188 | #!**/[Pp]ackages/repositories.config
189 | # NuGet v3's project.json files produces more ignorable files
190 | *.nuget.props
191 | *.nuget.targets
192 |
193 | # Microsoft Azure Build Output
194 | csx/
195 | *.build.csdef
196 |
197 | # Microsoft Azure Emulator
198 | ecf/
199 | rcf/
200 |
201 | # Windows Store app package directories and files
202 | AppPackages/
203 | BundleArtifacts/
204 | Package.StoreAssociation.xml
205 | _pkginfo.txt
206 | *.appx
207 |
208 | # Visual Studio cache files
209 | # files ending in .cache can be ignored
210 | *.[Cc]ache
211 | # but keep track of directories ending in .cache
212 | !*.[Cc]ache/
213 |
214 | # Others
215 | ClientBin/
216 | ~$*
217 | *~
218 | *.dbmdl
219 | *.dbproj.schemaview
220 | *.jfm
221 | *.pfx
222 | *.publishsettings
223 | orleans.codegen.cs
224 |
225 | # Including strong name files can present a security risk
226 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
227 | #*.snk
228 |
229 | # Since there are multiple workflows, uncomment next line to ignore bower_components
230 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
231 | #bower_components/
232 |
233 | # RIA/Silverlight projects
234 | Generated_Code/
235 |
236 | # Backup & report files from converting an old project file
237 | # to a newer Visual Studio version. Backup files are not needed,
238 | # because we have git ;-)
239 | _UpgradeReport_Files/
240 | Backup*/
241 | UpgradeLog*.XML
242 | UpgradeLog*.htm
243 | ServiceFabricBackup/
244 | *.rptproj.bak
245 |
246 | # SQL Server files
247 | *.mdf
248 | *.ldf
249 | *.ndf
250 |
251 | # Business Intelligence projects
252 | *.rdl.data
253 | *.bim.layout
254 | *.bim_*.settings
255 | *.rptproj.rsuser
256 |
257 | # Microsoft Fakes
258 | FakesAssemblies/
259 |
260 | # GhostDoc plugin setting file
261 | *.GhostDoc.xml
262 |
263 | # Node.js Tools for Visual Studio
264 | .ntvs_analysis.dat
265 | node_modules/
266 |
267 | # Visual Studio 6 build log
268 | *.plg
269 |
270 | # Visual Studio 6 workspace options file
271 | *.opt
272 |
273 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
274 | *.vbw
275 |
276 | # Visual Studio LightSwitch build output
277 | **/*.HTMLClient/GeneratedArtifacts
278 | **/*.DesktopClient/GeneratedArtifacts
279 | **/*.DesktopClient/ModelManifest.xml
280 | **/*.Server/GeneratedArtifacts
281 | **/*.Server/ModelManifest.xml
282 | _Pvt_Extensions
283 |
284 | # Paket dependency manager
285 | .paket/paket.exe
286 | paket-files/
287 |
288 | # FAKE - F# Make
289 | .fake/
290 |
291 | # JetBrains Rider
292 | .idea/
293 | *.sln.iml
294 |
295 | # CodeRush personal settings
296 | .cr/personal
297 |
298 | # Python Tools for Visual Studio (PTVS)
299 | __pycache__/
300 | *.pyc
301 |
302 | # Cake - Uncomment if you are using it
303 | # tools/**
304 | # !tools/packages.config
305 |
306 | # Tabs Studio
307 | *.tss
308 |
309 | # Telerik's JustMock configuration file
310 | *.jmconfig
311 |
312 | # BizTalk build output
313 | *.btp.cs
314 | *.btm.cs
315 | *.odx.cs
316 | *.xsd.cs
317 |
318 | # OpenCover UI analysis results
319 | OpenCover/
320 |
321 | # Azure Stream Analytics local run output
322 | ASALocalRun/
323 |
324 | # MSBuild Binary and Structured Log
325 | *.binlog
326 |
327 | # NVidia Nsight GPU debugger configuration file
328 | *.nvuser
329 |
330 | # MFractors (Xamarin productivity tool) working folder
331 | .mfractor/
332 |
333 | # Local History for Visual Studio
334 | .localhistory/
335 |
--------------------------------------------------------------------------------
/Analyser/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolqingcheng/EasyLuceneNET/60d445d1e91e1864b31c7c4013fe105e70544f8f/Analyser/.DS_Store
--------------------------------------------------------------------------------
/Analyser/Analyser.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 |   <PropertyGroup>
4 |     <TargetFramework>netstandard2.0</TargetFramework>
5 |     <AssemblyName>Lucene.JIEba.Analyzer</AssemblyName>
6 |     <Version>1.0.0</Version>
7 |     <Authors>SilentCC</Authors>
8 |     <Description>JIEba.Lucene.Net is an analyzer tools for lucene.net which is kind to chinese</Description>
9 |     <PackageRequireLicenseAcceptance>false</PackageRequireLicenseAcceptance>
10 |     <PackageProjectUrl>https://github.com/SilentCC/JIEba-netcore2.0/</PackageProjectUrl>
11 |     <Copyright>Copyright 2019 (c) AgileLabs. All rights reserved.</Copyright>
12 |     <PackageTags>Analyzer Segment JIEba.net core2.0</PackageTags>
13 |     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
14 |   </PropertyGroup>
15 |
16 |   <!-- Element names reconstructed from the stripped values above (the mapping of the
17 |        false/true flags to specific properties is an assumption); the original item
18 |        groups (embedded resources, package references) did not survive the dump. -->
19 |
20 | </Project>
--------------------------------------------------------------------------------
/Analyser/ConfigManager.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using System;
3 |
4 | namespace JiebaNet.Analyser
5 | {
6 | public class ConfigManager
7 | {
8 | // TODO: duplicate codes.
9 | public static string ConfigFileBaseDir
10 | {
11 | get
12 | {
13 | return "Resources";
14 | }
15 | }
16 |
17 | public static string IdfFile
18 | {
19 | get { return Path.Combine(ConfigFileBaseDir, "idf.txt"); }
20 | }
21 |
22 | public static string StopWordsFile
23 | {
24 | get { return Path.Combine(ConfigFileBaseDir, "stopwords.txt"); }
25 | }
26 | }
27 | }
--------------------------------------------------------------------------------
/Analyser/IdfLoader.cs:
--------------------------------------------------------------------------------
1 | using JiebaNet.Segmenter.Common;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Reflection;
6 | using System.Text;
7 |
8 | namespace JiebaNet.Analyser
9 | {
10 | public class IdfLoader
11 | {
12 | internal string IdfFilePath { get; set; }
13 | internal IDictionary<string, double> IdfFreq { get; set; }
14 | internal double MedianIdf { get; set; }
15 |
16 | public IdfLoader(string idfPath = null)
17 | {
18 | IdfFilePath = string.Empty;
19 | IdfFreq = new Dictionary<string, double>();
20 | MedianIdf = 0.0;
21 | if (!string.IsNullOrWhiteSpace(idfPath))
22 | {
23 | SetNewPath(idfPath);
24 | }
25 | }
26 |
27 | public void SetNewPath(string newIdfPath)
28 | {
29 | var idfPath = newIdfPath;
30 | if (IdfFilePath != idfPath)
31 | {
32 | IdfFilePath = idfPath;
33 | var lines = FileExtension.ReadEmbeddedAllLines(idfPath, Encoding.UTF8);
34 | IdfFreq = new Dictionary<string, double>();
35 | foreach (var line in lines)
36 | {
37 | var parts = line.Trim().Split(' ');
38 | var word = parts[0];
39 | var freq = double.Parse(parts[1]);
40 | IdfFreq[word] = freq;
41 | }
42 |
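    | // The median IDF is kept as the fallback weight for words missing from the IDF file.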
43 | MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2];
44 | }
45 | }
46 | }
47 | }
--------------------------------------------------------------------------------
/Analyser/KeywordExtractor.cs:
--------------------------------------------------------------------------------
1 | using JiebaNet.Segmenter.Common;
2 | using Microsoft.Extensions.FileProviders;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Reflection;
6 |
7 | namespace JiebaNet.Analyser
8 | {
9 | public abstract class KeywordExtractor
10 | {
11 | protected static readonly List<string> DefaultStopWords = new List<string>()
12 | {
13 | "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
14 | "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
15 | "this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
16 | };
17 |
18 | protected virtual ISet<string> StopWords { get; set; }
19 |
20 | public void SetStopWords(string stopWordsFile)
21 | {
22 | StopWords = new HashSet<string>();
23 | var lines = FileExtension.ReadEmbeddedAllLines(stopWordsFile);
24 | foreach (var line in lines)
25 | {
26 | StopWords.Add(line.Trim());
27 | }
28 | }
29 |
30 | public abstract IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null);
31 | public abstract IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null);
32 | }
33 | }
--------------------------------------------------------------------------------
/Analyser/Resources/stopwords.txt:
--------------------------------------------------------------------------------
1 | i
2 | me
3 | my
4 | myself
5 | we
6 | our
7 | ours
8 | ourselves
9 | you
10 | your
11 | yours
12 | yourself
13 | yourselves
14 | he
15 | him
16 | his
17 | himself
18 | she
19 | her
20 | hers
21 | herself
22 | it
23 | its
24 | itself
25 | they
26 | them
27 | their
28 | theirs
29 | themselves
30 | what
31 | which
32 | who
33 | whom
34 | this
35 | that
36 | these
37 | those
38 | am
39 | is
40 | are
41 | was
42 | were
43 | be
44 | been
45 | being
46 | have
47 | has
48 | had
49 | having
50 | do
51 | does
52 | did
53 | doing
54 | a
55 | an
56 | the
57 | and
58 | but
59 | if
60 | or
61 | because
62 | as
63 | until
64 | while
65 | of
66 | at
67 | by
68 | for
69 | with
70 | about
71 | against
72 | between
73 | into
74 | through
75 | during
76 | before
77 | after
78 | above
79 | below
80 | to
81 | from
82 | up
83 | down
84 | in
85 | out
86 | on
87 | off
88 | over
89 | under
90 | again
91 | further
92 | then
93 | once
94 | here
95 | there
96 | when
97 | where
98 | why
99 | how
100 | all
101 | any
102 | both
103 | each
104 | few
105 | more
106 | most
107 | other
108 | some
109 | such
110 | no
111 | nor
112 | not
113 | only
114 | own
115 | same
116 | so
117 | than
118 | too
119 | very
120 | s
121 | t
122 | can
123 | will
124 | just
125 | don
126 | should
127 | now
128 | 一番
129 | 一直
130 | 一个
131 | 一些
132 | 许多
133 | 种
134 | 有的是
135 | 也就是说
136 | 阿
137 | 哎呀
138 | 哎哟
139 | 俺
140 | 俺们
141 | 按
142 | 按照
143 | 吧
144 | 吧哒
145 | 把
146 | 罢了
147 | 被
148 | 本
149 | 本着
150 | 比
151 | 比方
152 | 比如
153 | 鄙人
154 | 彼
155 | 彼此
156 | 边
157 | 别
158 | 别的
159 | 别说
160 | 并
161 | 并且
162 | 不比
163 | 不成
164 | 不单
165 | 不但
166 | 不独
167 | 不管
168 | 不光
169 | 不过
170 | 不仅
171 | 不拘
172 | 不论
173 | 不怕
174 | 不然
175 | 不如
176 | 不特
177 | 不惟
178 | 不问
179 | 不只
180 | 朝
181 | 朝着
182 | 趁
183 | 趁着
184 | 乘
185 | 冲
186 | 除
187 | 除此之外
188 | 除非
189 | 除了
190 | 此
191 | 此间
192 | 此外
193 | 从
194 | 从而
195 | 打
196 | 待
197 | 但
198 | 但是
199 | 当
200 | 当着
201 | 到
202 | 得
203 | 的
204 | 的话
205 | 等
206 | 等等
207 | 地
208 | 第
209 | 叮咚
210 | 对
211 | 对于
212 | 多
213 | 多少
214 | 而
215 | 而况
216 | 而且
217 | 而是
218 | 而外
219 | 而言
220 | 而已
221 | 尔后
222 | 反过来
223 | 反过来说
224 | 反之
225 | 非但
226 | 非徒
227 | 否则
228 | 嘎
229 | 嘎登
230 | 该
231 | 赶
232 | 个
233 | 各
234 | 各个
235 | 各位
236 | 各种
237 | 各自
238 | 给
239 | 根据
240 | 跟
241 | 故
242 | 故此
243 | 固然
244 | 关于
245 | 管
246 | 归
247 | 果然
248 | 果真
249 | 过
250 | 和
251 | 何
252 | 何处
253 | 何况
254 | 何时
255 | 嘿
256 | 哼
257 | 哼唷
258 | 呼哧
259 | 乎
260 | 哗
261 | 还是
262 | 还有
263 | 换句话说
264 | 换言之
265 | 或
266 | 或是
267 | 或者
268 | 极了
269 | 及
270 | 及其
271 | 及至
272 | 即
273 | 即便
274 | 即或
275 | 即令
276 | 即若
277 | 即使
278 | 几
279 | 几时
280 | 己
281 | 既
282 | 既然
283 | 既是
284 | 继而
285 | 加之
286 | 假如
287 | 假若
288 | 假使
289 | 鉴于
290 | 将
291 | 较
292 | 较之
293 | 叫
294 | 接着
295 | 结果
296 | 借
297 | 紧接着
298 | 进而
299 | 尽
300 | 尽管
301 | 经
302 | 经过
303 | 就
304 | 就是
305 | 就是说
306 | 据
307 | 具体地说
308 | 具体说来
309 | 开始
310 | 开外
311 | 靠
312 | 咳
313 | 可
314 | 可见
315 | 可是
316 | 可以
317 | 况且
318 | 啦
319 | 来
320 | 来着
321 | 离
322 | 例如
323 | 哩
324 | 连
325 | 连同
326 | 两者
327 | 了
328 | 临
329 | 另
330 | 另外
331 | 另一方面
332 | 论
333 | 嘛
334 | 吗
335 | 慢说
336 | 漫说
337 | 冒
338 | 么
339 | 每
340 | 每当
341 | 们
342 | 莫若
343 | 某
344 | 某个
345 | 某些
346 | 拿
347 | 哪
348 | 哪边
349 | 哪儿
350 | 哪个
351 | 哪里
352 | 哪年
353 | 哪怕
354 | 哪天
355 | 哪些
356 | 哪样
357 | 那
358 | 那边
359 | 那儿
360 | 那个
361 | 那会儿
362 | 那里
363 | 那么
364 | 那么些
365 | 那么样
366 | 那时
367 | 那些
368 | 那样
369 | 乃
370 | 乃至
371 | 呢
372 | 能
373 | 你
374 | 你们
375 | 您
376 | 宁
377 | 宁可
378 | 宁肯
379 | 宁愿
380 | 哦
381 | 啪达
382 | 旁人
383 | 凭
384 | 凭借
385 | 其
386 | 其次
387 | 其二
388 | 其他
389 | 其它
390 | 其一
391 | 其余
392 | 其中
393 | 起
394 | 起见
395 | 起见
396 | 岂但
397 | 恰恰相反
398 | 前后
399 | 前者
400 | 且
401 | 然而
402 | 然后
403 | 然则
404 | 让
405 | 人家
406 | 任
407 | 任何
408 | 任凭
409 | 如
410 | 如此
411 | 如果
412 | 如何
413 | 如其
414 | 如若
415 | 如上所述
416 | 若
417 | 若非
418 | 若是
419 | 啥
420 | 上下
421 | 尚且
422 | 设若
423 | 设使
424 | 甚而
425 | 甚么
426 | 甚至
427 | 省得
428 | 时候
429 | 什么
430 | 什么样
431 | 使得
432 | 是
433 | 是的
434 | 首先
435 | 谁
436 | 顺
437 | 顺着
438 | 似的
439 | 虽
440 | 虽然
441 | 虽说
442 | 虽则
443 | 随
444 | 随着
445 | 所
446 | 所以
447 | 他
448 | 他们
449 | 他人
450 | 它
451 | 它们
452 | 她
453 | 她们
454 | 倘
455 | 倘或
456 | 倘然
457 | 倘若
458 | 倘使
459 | 腾
460 | 替
461 | 通过
462 | 同
463 | 同时
464 | 哇
465 | 万一
466 | 往
467 | 望
468 | 为
469 | 为何
470 | 为了
471 | 为什么
472 | 为着
473 | 喂
474 | 嗡嗡
475 | 我
476 | 我们
477 | 呜
478 | 呜呼
479 | 乌乎
480 | 无论
481 | 无宁
482 | 毋宁
483 | 嘻
484 | 吓
485 | 相对而言
486 | 像
487 | 向
488 | 向着
489 | 嘘
490 | 焉
491 | 沿
492 | 沿着
493 | 要
494 | 要不
495 | 要不然
496 | 要不是
497 | 要么
498 | 要是
499 | 也
500 | 也罢
501 | 也好
502 | 一
503 | 一旦
504 | 一方面
505 | 一来
506 | 一切
507 | 一样
508 | 一则
509 | 依
510 | 依照
511 | 矣
512 | 以
513 | 以便
514 | 以及
515 | 以免
516 | 以至
517 | 以至于
518 | 以致
519 | 抑或
520 | 因
521 | 因此
522 | 因而
523 | 因为
524 | 用
525 | 由
526 | 由此可见
527 | 由于
528 | 有
529 | 有的
530 | 有关
531 | 有些
532 | 又
533 | 于
534 | 于是
535 | 于是乎
536 | 与
537 | 与此同时
538 | 与否
539 | 与其
540 | 越是
541 | 云云
542 | 哉
543 | 再说
544 | 再者
545 | 在
546 | 在下
547 | 咱
548 | 咱们
549 | 则
550 | 怎
551 | 怎么办
552 | 怎么样
553 | 咋
554 | 照
555 | 照着
556 | 者
557 | 这
558 | 这边
559 | 这儿
560 | 这个
561 | 这会儿
562 | 这就是说
563 | 这里
564 | 这么
565 | 这么点儿
566 | 这么些
567 | 这么样
568 | 这时
569 | 这些
570 | 这样
571 | 正如
572 | 吱
573 | 之
574 | 之类
575 | 之所以
576 | 之一
577 | 只是
578 | 只限
579 | 只要
580 | 只有
581 | 至
582 | 至于
583 | 诸位
584 | 着
585 | 着呢
586 | 自
587 | 自从
588 | 自个儿
589 | 自各儿
590 | 自己
591 | 自家
592 | 自身
593 | 综上所述
594 | 总的来看
595 | 总的来说
596 | 总的说来
597 | 总而言之
598 | 总之
599 | 纵
600 | 纵令
601 | 纵然
602 | 纵使
603 | 遵照
604 | 作为
605 | 兮
606 | 呗
607 | 咚
608 | 咦
609 | 喏
610 | 啐
611 | 喔唷
612 | 嗬
613 | 嗯
614 | 嗳
615 | 。
616 | ,
617 | :
618 | ;
619 | 、
620 | “
621 | ”
622 | 【
623 | 】
624 | 《
625 | 》
626 | (
627 | )
628 | —
629 | …
630 | .
631 | ,
632 | :
633 | ;
634 | "
635 | "
636 | [
637 | ]
638 | <
639 | >
640 | (
641 | )
642 | @
643 | #
644 | *
645 | &
646 | %
647 | ¥
648 | $
649 | -
650 | +
651 | =
652 | |
653 | \
654 |
--------------------------------------------------------------------------------
/Analyser/TextRankExtractor.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 | using JiebaNet.Segmenter;
4 | using JiebaNet.Segmenter.Common;
5 | using JiebaNet.Segmenter.PosSeg;
6 |
7 | namespace JiebaNet.Analyser
8 | {
9 | public class TextRankExtractor : KeywordExtractor
10 | {
11 | private static readonly IEnumerable<string> DefaultPosFilter = new List<string>()
12 | {
13 | "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "v", "vd", "vg", "vi", "vn", "vq"
14 | };
15 |
16 | private JiebaSegmenter Segmenter { get; set; }
17 | private PosSegmenter PosSegmenter { get; set; }
18 |
19 | public int Span { get; set; }
20 |
21 | public bool PairFilter(Pair wp)
22 | {
23 | return DefaultPosFilter.Contains(wp.Flag)
24 | && wp.Word.Trim().Length >= 2
25 | && !StopWords.Contains(wp.Word.ToLower());
26 | }
27 |
28 | public TextRankExtractor()
29 | {
30 | Span = 5;
31 |
32 | Segmenter = new JiebaSegmenter();
33 | PosSegmenter = new PosSegmenter(Segmenter);
34 | SetStopWords(ConfigManager.StopWordsFile);
35 | if (StopWords.IsEmpty())
36 | {
37 | StopWords.UnionWith(DefaultStopWords);
38 | }
39 | }
40 |
41 | public override IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null)
42 | {
43 | var rank = ExtractTagRank(text, allowPos);
44 | if (count <= 0) { count = 20; }
45 | return rank.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
46 | }
47 |
48 | public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null)
49 | {
50 | var rank = ExtractTagRank(text, allowPos);
51 | if (count <= 0) { count = 20; }
52 | return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
53 | {
54 | Word = p.Key, Weight = p.Value
55 | }).Take(count);
56 | }
57 |
58 | #region Private Helpers
59 |
60 | private IDictionary<string, double> ExtractTagRank(string text, IEnumerable<string> allowPos)
61 | {
62 | if (allowPos.IsEmpty())
63 | {
64 | allowPos = DefaultPosFilter;
65 | }
66 |
67 | var g = new UndirectWeightedGraph();
68 | var cm = new Dictionary<string, int>();
69 | var words = PosSegmenter.Cut(text).ToList();
70 |
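    | // Slide a window of Span words over the tagged stream and count co-occurrences of word pairs that pass PairFilter.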
71 | for (var i = 0; i < words.Count(); i++)
72 | {
73 | var wp = words[i];
74 | if (PairFilter(wp))
75 | {
76 | for (var j = i + 1; j < i + Span; j++)
77 | {
78 | if (j >= words.Count)
79 | {
80 | break;
81 | }
82 | if (!PairFilter(words[j]))
83 | {
84 | continue;
85 | }
86 |
87 | // TODO: better separator.
88 | var key = wp.Word + "$" + words[j].Word;
89 | if (!cm.ContainsKey(key))
90 | {
91 | cm[key] = 0;
92 | }
93 | cm[key] += 1;
94 | }
95 | }
96 | }
97 |
98 | foreach (var p in cm)
99 | {
100 | var terms = p.Key.Split('$');
101 | g.AddEdge(terms[0], terms[1], p.Value);
102 | }
103 |
104 | return g.Rank();
105 | }
106 |
107 | #endregion
108 | }
109 | }
--------------------------------------------------------------------------------
/Analyser/TfidfExtractor.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using JiebaNet.Segmenter;
5 | using JiebaNet.Segmenter.Common;
6 | using JiebaNet.Segmenter.PosSeg;
7 |
8 | namespace JiebaNet.Analyser
9 | {
10 | public class TfidfExtractor : KeywordExtractor
11 | {
12 | private static readonly string DefaultIdfFile = ConfigManager.IdfFile;
13 | private static readonly int DefaultWordCount = 20;
14 |
15 | private JiebaSegmenter Segmenter { get; set; }
16 | private PosSegmenter PosSegmenter { get; set; }
17 | private IdfLoader Loader { get; set; }
18 |
19 | private IDictionary<string, double> IdfFreq { get; set; }
20 | private double MedianIdf { get; set; }
21 |
22 | public TfidfExtractor(JiebaSegmenter segmenter = null)
23 | {
24 | if (segmenter.IsNull())
25 | {
26 | Segmenter = new JiebaSegmenter();
27 | }
28 | else
29 | {
30 | Segmenter = segmenter;
31 | }
32 | PosSegmenter = new PosSegmenter(Segmenter);
33 | SetStopWords(ConfigManager.StopWordsFile);
34 | if (StopWords.IsEmpty())
35 | {
36 | StopWords.UnionWith(DefaultStopWords);
37 | }
38 |
39 | Loader = new IdfLoader(DefaultIdfFile);
40 |
41 | IdfFreq = Loader.IdfFreq;
42 | MedianIdf = Loader.MedianIdf;
43 | }
44 |
45 | public void SetIdfPath(string idfPath)
46 | {
47 | Loader.SetNewPath(idfPath);
48 | IdfFreq = Loader.IdfFreq;
49 | MedianIdf = Loader.MedianIdf;
50 | }
51 |
52 | private IEnumerable<string> FilterCutByPos(string text, IEnumerable<string> allowPos)
53 | {
54 | var posTags = PosSegmenter.Cut(text).Where(p => allowPos.Contains(p.Flag));
55 | return posTags.Select(p => p.Word);
56 | }
57 |
58 | private IDictionary<string, double> GetWordTfidf(string text, IEnumerable<string> allowPos)
59 | {
60 | IEnumerable<string> words = null;
61 | if (allowPos.IsNotEmpty())
62 | {
63 | words = FilterCutByPos(text, allowPos);
64 | }
65 | else
66 | {
67 | words = Segmenter.Cut(text);
68 | }
69 |
70 | // Calculate TF
71 | var freq = new Dictionary<string, double>();
72 | foreach (var word in words)
73 | {
74 | var w = word;
75 | if (string.IsNullOrEmpty(w) || w.Trim().Length < 2 || StopWords.Contains(w.ToLower()))
76 | {
77 | continue;
78 | }
79 | freq[w] = freq.GetDefault(w, 0.0) + 1.0;
80 | }
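    | // Weight each term's frequency by its IDF (median IDF for unseen words) and normalize by the total token count.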
81 | var total = freq.Values.Sum();
82 | foreach (var k in freq.Keys.ToList())
83 | {
84 | freq[k] *= IdfFreq.GetDefault(k, MedianIdf) / total;
85 | }
86 |
87 | return freq;
88 | }
89 |
90 | public override IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null)
91 | {
92 | if (count <= 0) { count = DefaultWordCount; }
93 |
94 | var freq = GetWordTfidf(text, allowPos);
95 | return freq.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
96 | }
97 |
98 | public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null)
99 | {
100 | if (count <= 0) { count = DefaultWordCount; }
101 |
102 | var freq = GetWordTfidf(text, allowPos);
103 | return freq.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
104 | {
105 | Word = p.Key, Weight = p.Value
106 | }).Take(count);
107 | }
108 | }
109 |
110 | public class WordWeightPair
111 | {
112 | public string Word { get; set; }
113 | public double Weight { get; set; }
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/Analyser/UndirectWeightedGraph.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 |
5 | namespace JiebaNet.Analyser
6 | {
7 | public class Edge
8 | {
9 | public string Start { get; set; }
10 | public string End { get; set; }
11 | public double Weight { get; set; }
12 | }
13 |
14 | public class UndirectWeightedGraph
15 | {
16 | private static readonly double d = 0.85;
17 |
18 | public IDictionary<string, List<Edge>> Graph { get; set; }
19 | public UndirectWeightedGraph()
20 | {
21 | Graph = new Dictionary<string, List<Edge>>();
22 | }
23 |
24 | public void AddEdge(string start, string end, double weight)
25 | {
26 | if (!Graph.ContainsKey(start))
27 | {
28 | Graph[start] = new List<Edge>();
29 | }
30 |
31 | if (!Graph.ContainsKey(end))
32 | {
33 | Graph[end] = new List<Edge>();
34 | }
35 |
36 | Graph[start].Add(new Edge(){ Start = start, End = end, Weight = weight });
37 | Graph[end].Add(new Edge(){ Start = end, End = start, Weight = weight });
38 | }
39 |
40 | public IDictionary<string, double> Rank()
41 | {
42 | var ws = new Dictionary<string, double>();
43 | var outSum = new Dictionary<string, double>();
44 |
45 | // init scores
46 | var count = Graph.Count > 0 ? Graph.Count : 1;
47 | var wsdef = 1.0/count;
48 |
49 | foreach (var pair in Graph)
50 | {
51 | ws[pair.Key] = wsdef;
52 | outSum[pair.Key] = pair.Value.Sum(e => e.Weight);
53 | }
54 |
55 | // TODO: 10 iterations?
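    | // PageRank-style iteration with damping factor d = 0.85: a node's score is (1 - d) plus d times the weighted, normalized scores of its neighbours.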
56 | var sortedKeys = Graph.Keys.OrderBy(k => k);
57 | for (var i = 0; i < 10; i++)
58 | {
59 | foreach (var n in sortedKeys)
60 | {
61 | var s = 0d;
62 | foreach (var edge in Graph[n])
63 | {
64 | s += edge.Weight/outSum[edge.End]*ws[edge.End];
65 | }
66 | ws[n] = (1 - d) + d*s;
67 | }
68 | }
69 |
70 | var minRank = double.MaxValue;
71 | var maxRank = double.MinValue;
72 |
73 | foreach (var w in ws.Values)
74 | {
75 | if (w < minRank)
76 | {
77 | minRank = w;
78 | }
79 | if(w > maxRank)
80 | {
81 | maxRank = w;
82 | }
83 | }
84 |
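    | // Min-max style rescaling (the same formula jieba's TextRank uses) to squash scores into a comparable range.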
85 | foreach (var pair in ws.ToList())
86 | {
87 | ws[pair.Key] = (pair.Value - minRank/10.0)/(maxRank - minRank/10.0);
88 | }
89 |
90 | return ws;
91 | }
92 | }
93 | }
--------------------------------------------------------------------------------
/ConsoleApp1/Article.cs:
--------------------------------------------------------------------------------
1 | using Lucene.Net.Documents;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace ConsoleApp1
9 | {
10 | public class Article
11 | {
12 | [Lucene(FieldStore = Field.Store.YES, IsUnique = true, type = LuceneFieldType.Int32)]
13 | public int Id { get; set; }
14 | [Lucene(FieldStore = Field.Store.YES, IsUnique = false, type = LuceneFieldType.Text)]
15 | public string Title { get; set; }
16 |
17 |
18 | [Lucene(FieldStore = Field.Store.YES, IsUnique = false, type = LuceneFieldType.Text)]
19 | public string Content { get; set; }
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/ConsoleApp1/ConsoleApp1.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 |   <PropertyGroup>
4 |     <OutputType>Exe</OutputType>
5 |     <TargetFramework>net6.0</TargetFramework>
6 |     <ImplicitUsings>enable</ImplicitUsings>
7 |     <Nullable>enable</Nullable>
8 |   </PropertyGroup>
9 |
10 |   <!-- Element names reconstructed from the stripped values above; the original item
11 |        group (project references) did not survive the dump. -->
12 |
13 | </Project>
--------------------------------------------------------------------------------
/ConsoleApp1/Program.cs:
--------------------------------------------------------------------------------
1 | using ConsoleApp1;
2 | using EasyLuceneNET;
3 | using Microsoft.Extensions.DependencyInjection;
4 |
5 | var service = new ServiceCollection();
6 | service.AddLogging();
7 | service.AddEasyLuceneNet();
8 | var serviceProvider = service.BuildServiceProvider();
9 |
10 | var easy = serviceProvider.GetService<IEasyLuceneNet>();
11 |
12 | // Delete from the index
13 |
14 |
15 | // Pass the model that corresponds to the document; only the primary key needs to be set.
16 | easy!.Delete(new Article { Id = 1 });
17 |
18 | // Create the index
19 |
20 | //var list = new List<Article>();
21 | //for (int i = 0; i < 100; i++)
22 | //{
23 | // list.Add(new Article()
24 | // {
25 | // Id = i,
26 | // Title = i + "使用Xamarin开发移动应用示例——数独游戏(八)使用MVVM实现完成游戏列表页面",
27 | // Content = @"前面我们已经完成了游戏的大部分功能,玩家可以玩预制的数独游戏,也可以自己添加新的游戏。现在我们实现展示已完成游戏列表页面,显示用户已经完成的游戏列表,从这个列表可以进入详细的复盘页面。
28 |
29 | //前面的页面我们采用的是传统的事件驱动模型,在XAML文件中定义页面,在后台的cs文件中编写事件响应代码。采用这种模型是因为很多页面需要动态生成控件,然后动态改变这些控件的属性,事件驱动模型在这种场景下比较好理解。现在我们采用MVVM方式编写完成游戏列表页面。
30 |
31 | //MVVM是将页面绑定到视图模型,所有的操作和事件响应通过视图模型完成。视图模型中没有页面控件的定义,因此和页面是解耦的,可以独立进行测试。在视图模型中我们只关心数据,而不关心展示数据的控件。
32 |
33 | //首先,我们定义一个视图模型的基类,下一步在改造其它页面时,会用到这个基类:"
34 | // });
35 | //}
36 | //easy!.AddIndex(list);
37 |
38 | // Full-text search
39 |
40 | var result = easy!.Search<Article>(new SearchRequest()
41 | {
42 | keyword = "事件模型",
43 | index = 1,
44 | size = 20,
45 | fields = new string[] { "Title", "Content" },
46 | OrderByField = "Id",
47 | });
48 | Console.WriteLine("一共:" + result.Total);
49 | foreach (var item in result.list)
50 | {
51 | Console.WriteLine($"id:{item.Id} title:{item.Title}");
52 | }
53 | Console.WriteLine($"分词:{string.Join(" ", result.cutKeys)}");
54 | Console.WriteLine("完成");
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/EasyLuceneNET/EasyLuceneNET.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 |   <PropertyGroup>
4 |     <TargetFramework>netstandard2.1</TargetFramework>
5 |     <Nullable>enable</Nullable>
6 |     <PackageId>EasyLuceneNET</PackageId>
7 |     <Version>1.5</Version>
8 |     <Description>A simple wrapper around jieba.NET and Lucene.Net for Chinese full-text search, suitable for basic document and in-site search.</Description>
9 |     <PackageProjectUrl>https://github.com/coolqingcheng/EasyLuceneNET</PackageProjectUrl>
10 |     <Authors>青城</Authors>
11 |   </PropertyGroup>
12 |
13 |   <!-- Element names reconstructed from the stripped values above; the original item
14 |        groups (package and project references) did not survive the dump. -->
15 |
16 | </Project>
--------------------------------------------------------------------------------
/EasyLuceneNET/EasyLuceneNetDefaultProvider.cs:
--------------------------------------------------------------------------------
1 | using jieba.NET;
2 | using JiebaNet.Segmenter;
3 | using Lucene.Net.Analysis;
4 | using Lucene.Net.Documents;
5 | using Lucene.Net.Index;
6 | using Lucene.Net.Search;
7 | using Lucene.Net.Search.Highlight;
8 | using Lucene.Net.Store;
9 | using Lucene.Net.Util;
10 | using Microsoft.Extensions.Logging;
11 | using System;
12 | using System.Collections.Generic;
13 | using System.IO;
14 | using System.Linq;
15 | using System.Reflection;
16 |
17 | namespace EasyLuceneNET
18 | {
19 | public class EasyLuceneNetDefaultProvider : IEasyLuceneNet, IDisposable
20 | {
21 | const LuceneVersion AppLuceneVersion = LuceneVersion.LUCENE_48;
22 | readonly IndexWriter writer;
23 |
24 | private ILogger _logger;
25 |
26 | private FSDirectory dir;
27 |
28 | //private readonly JieBaAnalyzer analyzer;
29 |
30 | public EasyLuceneNetDefaultProvider(ILogger logger)
31 | {
32 | _logger = logger;
33 | var indexPath = Path.Combine(AppContext.BaseDirectory, "indexs");
34 |
35 | dir = FSDirectory.Open(indexPath);
36 |
37 | // Create an analyzer to process the text
38 | Analyzer analyzer = new JieBaAnalyzer(TokenizerMode.Search);
39 | // Create an index writer
40 | var indexConfig = new IndexWriterConfig(AppLuceneVersion, analyzer);
41 | writer = new IndexWriter(dir, indexConfig);
42 | }
43 |
44 | public void AddIndex<T>(List<T> list)
45 | {
46 | if (list != null)
47 | {
48 | list.ForEach(item =>
49 | {
50 | var doc = new Document();
51 | var properties = item.GetType().GetProperties(BindingFlags.Instance | BindingFlags.Public);
52 | _logger.LogDebug("添加到文档:" + DateTime.Now);
53 | Term term = null;
54 | foreach (var property in properties)
55 | {
56 | string name = property.Name;
57 | var value = property.GetValue(item);
58 | var att = property.GetCustomAttribute<LuceneAttribute>();
59 | if (att == null)
60 | {
61 | _logger.LogWarning($"文档字段为:{name} 没有贴上Lucene标签,不索引");
62 | continue;
63 | }
64 | if (att.type == LuceneFieldType.String)
65 | {
66 | // Use StringField by default
67 | doc.Add(new StringField(name, value.ToString(), Field.Store.YES));
68 | }
69 | else
70 | {
71 |
72 | if (att.type == LuceneFieldType.Text)
73 | {
74 | doc.Add(new TextField(name, value.ToString(), att.FieldStore));
75 | }
76 | if (att.type == LuceneFieldType.Int32)
77 | {
78 | doc.Add(new Int32Field(name, Convert.ToInt32(value), att.FieldStore));
79 | }
80 |
81 | }
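    | // A unique field becomes the update Term; numeric keys must be prefix-coded the same way Int32Field indexes them, so UpdateDocument can find the old copy.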
82 | if (att.IsUnique)
83 | {
84 | if (new Type[] { typeof(int), typeof(long), typeof(short), typeof(uint), typeof(ulong), typeof(ushort) }.Contains(value.GetType()))
85 | {
86 | var bytes = new BytesRef(NumericUtils.BUF_SIZE_INT32);
87 | NumericUtils.Int32ToPrefixCoded(Convert.ToInt32(value), 0, bytes);
88 | term = new Term(name, bytes);
89 | }
90 | else
91 | {
92 | term = new Term(name, value.ToString());
93 | }
94 | }
95 | }
96 | if (term == null)
97 | {
98 | writer.AddDocument(doc);
99 | }
100 | else
101 | {
102 | writer.UpdateDocument(term, doc);
103 | }
104 |
105 | });
106 | var begin = DateTime.Now;
107 | _logger.LogDebug("正在提交索引:" + begin);
108 | writer.Flush(triggerMerge: false, applyAllDeletes: false);
109 | writer.Commit();
110 | var end = DateTime.Now;
111 | _logger.LogDebug("索引提交完成:" + end);
112 | writer.Flush(false, false);
113 | writer.Commit();
114 | }
115 | }
116 |
117 | public void Dispose()
118 | {
119 | writer.Dispose();
120 | dir.Dispose();
121 | }
122 |
123 | public SearchResult<T> Search<T>(SearchRequest request) where T : class, new()
124 | {
125 |
126 | if (request.keyword.Length > 75)
127 | {
128 | request.keyword = request.keyword.Substring(0, 75);
129 | }
130 | if (request.index <= 1)
131 | {
132 | request.index = 1;
133 | }
134 | if (request.size < 15)
135 | {
136 | request.size = 15;
137 | }
138 | var result = new SearchResult<T>();
139 | var segmenter = new JiebaSegmenter();
140 | var keywords = segmenter.Cut(request.keyword);
141 | result.cutKeys.AddRange(keywords);
142 | var biaodian = "[’!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+()【】,。: ".ToCharArray();
143 | keywords = keywords.Where(a => !biaodian.Where(b => b.ToString() == a).Any()).ToList();
144 | BooleanQuery query = new BooleanQuery();
145 | foreach (var item in keywords)
146 | {
147 | foreach (var field in request.fields)
148 | {
149 | if (biaodian.Any(a => a.ToString() == item) == false)
150 | {
151 | query.Add(new TermQuery(new Term(field, item)), Occur.SHOULD);
152 | }
153 | }
154 | }
155 |
156 | var i = request.index * request.size;
157 |
158 | using var reader = writer.GetReader(applyAllDeletes: true);
159 | var searcher = new IndexSearcher(reader);
160 | var sort = new Sort();
161 | if (!string.IsNullOrWhiteSpace(request.OrderByDescField))
162 | {
163 | sort.SetSort(new SortField(request.OrderByDescField, SortFieldType.INT32, true));
164 | }
165 | if (!string.IsNullOrWhiteSpace(request.OrderByField))
166 | {
167 | sort.SetSort(new SortField(request.OrderByField, SortFieldType.INT32, false));
168 | }
169 | TopFieldDocs? doc = searcher.Search(query, request.size * 10, sort);
170 | var scorer = new QueryScorer(query, "Content");
171 | Highlighter highlighter = new Highlighter(scorer);
172 | Search(request.index,
173 | request.size,
174 | result,
175 | searcher,
176 | doc);
177 | return result;
178 | }
179 |
180 | private static void Search<T>(int index, int size, SearchResult<T> result, IndexSearcher searcher, TopDocs doc) where T : class, new()
181 | {
182 | result.Total = doc.TotalHits;
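    | // Page through the returned ScoreDocs: the window starts at (index - 1) * size and is clamped to what the search returned.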
183 | var maxIndex = doc.ScoreDocs.Length - 2;
184 | var endIndex = ((index - 1) * size) + size;
185 | if (endIndex < maxIndex)
186 | {
187 | maxIndex = endIndex;
188 | }
189 | for (int j = ((index - 1) * size); j < maxIndex; j++)
190 | {
191 | var foundDoc = searcher.Doc(doc.ScoreDocs[j].Doc);
192 | var t = new T();
193 | var type = t.GetType();
194 | var properties = type.GetProperties(BindingFlags.Instance | BindingFlags.Public);
195 |
196 | foreach (var item in properties)
197 | {
198 | var sValue = foundDoc.Get(item.Name);
199 | if (sValue != null)
200 | {
201 |
202 | try
203 | {
204 | var v = Convert.ChangeType(sValue, item.PropertyType);
205 |
206 | item.SetValue(t, v, null);
207 | }
208 | catch (Exception)
209 | {
210 |
211 | }
212 | }
213 | }
214 | result.list.Add(t);
215 | }
216 | }
217 |
218 | private String highlightField(Query query, String fieldName, String text)
219 | {
220 | TokenStream tokenStream = new JieBaAnalyzer(TokenizerMode.Search)
221 | .GetTokenStream(fieldName, text);
222 | // Assuming "<B>", "</B>" used to highlight
223 | SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
224 | QueryScorer scorer = new QueryScorer(query);
225 | Highlighter highlighter = new Highlighter(formatter, scorer)
226 | {
227 | TextFragmenter = (new SimpleFragmenter(int.MaxValue))
228 | };
229 |
230 | String rv = highlighter.GetBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)");
231 | return rv.Length == 0 ? text : rv;
232 | }
233 |
234 | public void Delete(T entity)
235 | {
236 | if (entity != null)
237 | {
238 | var properties = entity.GetType().GetProperties(BindingFlags.Instance | BindingFlags.Public);
239 | var item = properties.Where(p => p.GetCustomAttribute<LuceneAttribute>()?.IsUnique == true).FirstOrDefault();
240 | if (item != null)
241 | {
242 | var value = item.GetValue(entity, null);
243 | Term term;
244 | if (new Type[] { typeof(int), typeof(long), typeof(short), typeof(uint), typeof(ulong), typeof(ushort) }.Contains(value.GetType()))
245 | {
246 | var bytes = new BytesRef(NumericUtils.BUF_SIZE_INT32);
247 | NumericUtils.Int32ToPrefixCoded(Convert.ToInt32(value), 0, bytes);
248 | term = new Term(item.Name, bytes);
249 | }
250 | else
251 | {
252 | term = new Term(item.Name, value.ToString());
253 | }
254 | writer.DeleteDocuments(term);
255 | writer.Flush(true, true);
256 | writer.Commit();
257 | }
258 |
259 | }
260 | }
261 | }
262 | }
263 |
264 | public class LuceneAttribute : System.Attribute
265 | {
266 | public LuceneFieldType type { get; set; } = LuceneFieldType.Text;
267 |
268 | public Field.Store FieldStore { get; set; }
269 |
270 | public bool IsUnique { get; set; } = false;
271 |
272 | }
273 |
274 | public enum LuceneFieldType
275 | {
276 | Text,
277 | /// <summary>
278 | /// For values that don't need full-text search; choose Text if search is needed.
279 | /// </summary>
280 | String,
281 | Int32
282 | }
--------------------------------------------------------------------------------
/EasyLuceneNET/EasyLuceneNetExtensions.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.Extensions.DependencyInjection;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Threading.Tasks;
7 |
8 | namespace EasyLuceneNET
9 | {
10 | public static class EasyLuceneNetExtensions
11 | {
12 | public static IServiceCollection AddEasyLuceneNet(this IServiceCollection service)
13 | {
14 | service.AddSingleton<IEasyLuceneNet, EasyLuceneNetDefaultProvider>();
15 | return service;
16 | }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/EasyLuceneNET/IEasyLuceneNet.cs:
--------------------------------------------------------------------------------
1 | using Lucene.Net.Search;
2 | using System.Collections.Generic;
3 |
4 | namespace EasyLuceneNET
5 | {
6 | public interface IEasyLuceneNet
7 | {
8 | /// <summary>
9 | /// Search the index
10 | /// </summary>
11 | /// <typeparam name="T"></typeparam>
12 | /// <param name="request"></param>
13 | /// <returns></returns>
14 | SearchResult<T> Search<T>(SearchRequest request) where T : class, new();
15 | /// <summary>
16 | /// Create index entries
17 | /// </summary>
18 | /// <typeparam name="T"></typeparam>
19 | /// <param name="list"></param>
20 | void AddIndex<T>(List<T> list);
21 |
22 | /// <summary>
23 | /// Delete a document
24 | /// </summary>
25 | /// <param name="entity"></param>
26 | void Delete<T>(T entity);
27 | }
28 |
29 | public class SearchResult<T> where T : class, new()
30 | {
31 | public int Total { get; set; }
32 |
33 | public List<string> cutKeys { get; set; } = new List<string>();
34 |
35 | public List<T> list { get; set; } = new List<T>();
36 | }
37 |
38 | public class SearchRequest
39 | {
40 | public string keyword { get; set; }
41 | public int index { get; set; } = 1;
42 | public int size { get; set; } = 15;
43 | public string[] fields { get; set; }
44 |
45 | /// <summary>
46 | /// Field to sort by, descending
47 | /// </summary>
48 | public string OrderByDescField { get; set; }
49 |
50 |
51 | /// <summary>
52 | /// Field to sort by, ascending
53 | /// </summary>
54 | public string OrderByField { get; set; }
55 | }
56 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 qingcheng
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | A Lucene.Net full-text search tool built on https://github.com/SilentCC/JIEba-netcore for Chinese word segmentation.
2 |
3 | # Usage
4 |
5 | ## Install the NuGet package
6 |
7 | ```
8 | Install-Package EasyLuceneNET
9 | ```
10 |
11 | ## Create a model
12 |
13 | ``` csharp
14 | public class Article
15 | {
16 | [Lucene(FieldStore = Field.Store.YES, IsUnique = true, type = LuceneFieldType.Int32)]
17 | public int Id { get; set; }
18 | [Lucene(FieldStore = Field.Store.YES, IsUnique = false, type = LuceneFieldType.Text)]
19 | public string Title { get; set; }
20 |
21 |
22 | [Lucene(FieldStore = Field.Store.YES, IsUnique = false, type = LuceneFieldType.Text)]
23 | public string Content { get; set; }
24 | }
25 | ```
26 |
27 | ## Dependency injection
28 |
29 | ``` csharp
30 | var service = new ServiceCollection();
31 | service.AddLogging();
32 | service.AddEasyLuceneNet();
33 | var serviceProvider = service.BuildServiceProvider();
34 |
35 | var easy = serviceProvider.GetService<IEasyLuceneNet>();
36 | ```
37 |
38 | ## Create the index
39 |
40 | ``` csharp
41 |
42 |
43 | var list = new List<Article>();
44 | for (int i = 0; i < 100; i++)
45 | {
46 | list.Add(new Article()
47 | {
48 | Id = i,
49 | Title = i + "使用Xamarin开发移动应用示例——数独游戏(八)使用MVVM实现完成游戏列表页面",
50 | Content = @"前面我们已经完成了游戏的大部分功能,玩家可以玩预制的数独游戏,也可以自己添加新的游戏。现在我们实现展示已完成游戏列表页面,显示用户已经完成的游戏列表,从这个列表可以进入详细的复盘页面。
51 |
52 | 前面的页面我们采用的是传统的事件驱动模型,在XAML文件中定义页面,在后台的cs文件中编写事件响应代码。采用这种模型是因为很多页面需要动态生成控件,然后动态改变这些控件的属性,事件驱动模型在这种场景下比较好理解。现在我们采用MVVM方式编写完成游戏列表页面。
53 |
54 | MVVM是将页面绑定到视图模型,所有的操作和事件响应通过视图模型完成。视图模型中没有页面控件的定义,因此和页面是解耦的,可以独立进行测试。在视图模型中我们只关心数据,而不关心展示数据的控件。
55 |
56 | 首先,我们定义一个视图模型的基类,下一步在改造其它页面时,会用到这个基类:"
57 | });
58 | }
59 | easy!.AddIndex(list);
60 |
61 | ```
62 |
63 | ## Search
64 |
65 | ``` csharp
66 | var result = easy!.Search<Article>(new SearchRequest()
67 | {
68 | keyword = "事件模型",
69 | index = 1,
70 | size = 20,
71 | fields = new string[] { "Title", "Content" },
72 | OrderByField = "Id",
73 | });
74 | Console.WriteLine("一共:" + result.Total);
75 | foreach (var item in result.list)
76 | {
77 | Console.WriteLine($"id:{item.Id} title:{item.Title}");
78 | }
79 | Console.WriteLine($"分词:{string.Join(" ", result.cutKeys)}");
80 | Console.WriteLine("完成");
81 | ```
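    |
    | Paging and a descending sort go through the same request object. A minimal sketch based on the `SearchRequest` fields shown above (a hypothetical follow-up query; `OrderByDescField` sorts descending and assumes the field is indexed as INT32, as `Id` is here):
    |
    | ``` csharp
    | // Page 2, 20 hits per page, highest Id first
    | var page2 = easy!.Search<Article>(new SearchRequest()
    | {
    |     keyword = "事件模型",
    |     index = 2,
    |     size = 20,
    |     fields = new string[] { "Title", "Content" },
    |     OrderByDescField = "Id",
    | });
    | ```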
82 |
83 | ## Delete from the index
84 |
85 | Pass the model that corresponds to the document; only the primary key needs to be set.
86 |
87 | ``` csharp
88 | easy.Delete(new Article { Id = 1 });
89 | ```
90 |
91 | ## Contact me
92 |
--------------------------------------------------------------------------------
/Segmenter/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolqingcheng/EasyLuceneNET/60d445d1e91e1864b31c7c4013fe105e70544f8f/Segmenter/.DS_Store
--------------------------------------------------------------------------------
/Segmenter/Common/Extensions.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text.RegularExpressions;
5 |
6 | namespace JiebaNet.Segmenter.Common
7 | {
8 | public static class Extensions
9 | {
10 | private static readonly Regex RegexDigits = new Regex(@"\d+", RegexOptions.Compiled);
11 | private static readonly Regex RegexNewline = new Regex("(\r\n|\n|\r)", RegexOptions.Compiled);
12 |
13 | #region Objects
14 |
15 | public static bool IsNull(this object obj)
16 | {
17 | return obj == null;
18 | }
19 |
20 | public static bool IsNotNull(this object obj)
21 | {
22 | return obj != null;
23 | }
24 |
25 | #endregion
26 |
27 |
28 | #region Enumerable
29 |
30 | public static bool IsEmpty<T>(this IEnumerable<T> enumerable)
31 | {
32 | return (enumerable == null) || !enumerable.Any();
33 | }
34 |
35 | public static bool IsNotEmpty<T>(this IEnumerable<T> enumerable)
36 | {
37 | return (enumerable != null) && enumerable.Any();
38 | }
39 |
40 | public static TValue GetValueOrDefault<TKey, TValue>(this IDictionary<TKey, TValue> d, TKey key)
41 | {
42 | return d.ContainsKey(key) ? d[key] : default(TValue);
43 | }
44 |
45 | public static TValue GetDefault<TKey, TValue>(this IDictionary<TKey, TValue> dict, TKey key, TValue defaultValue)
46 | {
47 | if (dict.ContainsKey(key))
48 | {
49 | return dict[key];
50 | }
51 | return defaultValue;
52 | }
53 |
54 | public static void Update<TKey, TValue>(this IDictionary<TKey, TValue> dict, IDictionary<TKey, TValue> other)
55 | {
56 | foreach (var key in other.Keys)
57 | {
58 | dict[key] = other[key];
59 | }
60 | }
61 |
62 | #endregion
63 |
64 | #region String & Text
65 |
66 | public static string Left(this string s, int endIndex)
67 | {
68 | if (string.IsNullOrEmpty(s))
69 | {
70 | return s;
71 | }
72 |
73 | return s.Substring(0, endIndex);
74 | }
75 |
76 | public static string Right(this string s, int startIndex)
77 | {
78 | if (string.IsNullOrEmpty(s))
79 | {
80 | return s;
81 | }
82 |
83 |
84 | return s.Substring(startIndex);
85 | }
86 |
87 | public static string Sub(this string s, int startIndex, int endIndex)
88 | {
89 | return s.Substring(startIndex, endIndex - startIndex);
90 | }
91 |
92 | public static bool IsInt32(this string s)
93 | {
94 | return RegexDigits.IsMatch(s);
95 | }
96 |
97 | public static string[] SplitLines(this string s)
98 | {
99 | return RegexNewline.Split(s);
100 | }
101 |
102 | public static string Join(this IEnumerable<string> inputs, string separator = ", ")
103 | {
104 | return string.Join(separator, inputs);
105 | }
106 |
107 | public static IEnumerable<string> SubGroupValues(this GroupCollection groups)
108 | {
109 | var result = from Group g in groups
110 | select g.Value;
111 | return result.Skip(1);
112 | }
113 |
114 | #endregion
115 |
116 | #region Conversion
117 |
118 | public static int ToInt32(this char ch)
119 | {
120 | return ch;
121 | }
122 |
123 | public static char ToChar(this int i)
124 | {
125 | return (char)i;
126 | }
127 |
128 | #endregion
129 | }
130 | }
--------------------------------------------------------------------------------
/Segmenter/Common/FileExtension.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.Extensions.FileProviders;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Reflection;
6 | using System.Text;
7 |
8 | namespace JiebaNet.Segmenter.Common
9 | {
10 | public static class FileExtension
11 | {
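    | // Reads files compiled into this assembly as embedded resources (the Resources/*.txt and *.json data files).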
12 | public static string ReadEmbeddedAllLine(string path)
13 | {
14 | return ReadEmbeddedAllLine(path, Encoding.UTF8);
15 | }
16 |
17 | public static string ReadEmbeddedAllLine(string path, Encoding encoding)
18 | {
19 | var provider = new EmbeddedFileProvider(typeof(FileExtension).GetTypeInfo().Assembly);
20 | var fileInfo = provider.GetFileInfo(path);
21 | using (var sr = new StreamReader(fileInfo.CreateReadStream(), encoding))
22 | {
23 | return sr.ReadToEnd();
24 | }
25 | }
26 |
27 | public static List<string> ReadEmbeddedAllLines(string path, Encoding encoding)
28 | {
29 | var provider = new EmbeddedFileProvider(typeof(FileExtension).GetTypeInfo().Assembly);
30 | var fileInfo = provider.GetFileInfo(path);
31 | List<string> list = new List<string>();
32 | using (StreamReader streamReader = new StreamReader(fileInfo.CreateReadStream(), encoding))
33 | {
34 | string item;
35 | while ((item = streamReader.ReadLine()) != null)
36 | {
37 | list.Add(item);
38 | }
39 | }
40 | return list;
41 | }
42 |
43 | public static List<string> ReadEmbeddedAllLines(string path)
44 | {
45 | return ReadEmbeddedAllLines(path, Encoding.UTF8);
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/Segmenter/Common/Trie.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 |
5 | namespace JiebaNet.Segmenter.Common
6 | {
7 | // Refer to: https://github.com/brianfromoregon/trie
8 | public class TrieNode
9 | {
10 | public char Char { get; set; }
11 | public int Frequency { get; set; }
12 | public Dictionary<char, TrieNode> Children { get; set; }
13 |
14 | public TrieNode(char ch)
15 | {
16 | Char = ch;
17 | Frequency = 0;
18 |
19 | // TODO: or an empty dict?
20 | //Children = null;
21 | }
22 |
23 | public int Insert(string s, int pos, int freq = 1)
24 | {
25 | if (string.IsNullOrEmpty(s) || pos >= s.Length)
26 | {
27 | return 0;
28 | }
29 |
30 | if (Children == null)
31 | {
32 | Children = new Dictionary<char, TrieNode>();
33 | }
34 |
35 | var c = s[pos];
36 | if (!Children.ContainsKey(c))
37 | {
38 | Children[c] = new TrieNode(c);
39 | }
40 |
41 | var curNode = Children[c];
42 | if (pos == s.Length - 1)
43 | {
44 | curNode.Frequency += freq;
45 | return curNode.Frequency;
46 | }
47 |
48 | return curNode.Insert(s, pos + 1, freq);
49 | }
50 |
51 | public TrieNode Search(string s, int pos)
52 | {
53 | if (string.IsNullOrEmpty(s))
54 | {
55 | return null;
56 | }
57 |
58 | // if out of range or without any child nodes
59 | if (pos >= s.Length || Children == null)
60 | {
61 | return null;
62 | }
63 | // if reaches the last char of s, it's time to make the decision.
64 | if (pos == s.Length - 1)
65 | {
66 | return Children.ContainsKey(s[pos]) ? Children[s[pos]] : null;
67 | }
68 | // continue if necessary.
69 | return Children.ContainsKey(s[pos]) ? Children[s[pos]].Search(s, pos + 1) : null;
70 | }
71 | }
72 |
73 | public interface ITrie
74 | {
75 | //string BestMatch(string word, long maxTime);
76 | bool Contains(string word);
77 | int Frequency(string word);
78 | int Insert(string word, int freq = 1);
79 | //bool Remove(string word);
80 | int Count { get; }
81 | int TotalFrequency { get; }
82 | }
83 |
84 | public class Trie : ITrie
85 | {
86 | private static readonly char RootChar = '\0';
87 |
88 | internal TrieNode Root;
89 |
90 | public int Count { get; private set; }
91 | public int TotalFrequency { get; private set; }
92 |
93 | public Trie()
94 | {
95 | Root = new TrieNode(RootChar);
96 | Count = 0;
97 | }
98 |
99 | public bool Contains(string word)
100 | {
101 | CheckWord(word);
102 |
103 | var node = Root.Search(word.Trim(), 0);
104 | return node.IsNotNull() && node.Frequency > 0;
105 | }
106 |
107 | public bool ContainsPrefix(string word)
108 | {
109 | CheckWord(word);
110 |
111 | var node = Root.Search(word.Trim(), 0);
112 | return node.IsNotNull();
113 | }
114 |
115 | public int Frequency(string word)
116 | {
117 | CheckWord(word);
118 |
119 | var node = Root.Search(word.Trim(), 0);
120 | return node.IsNull() ? 0 : node.Frequency;
121 | }
122 |
123 | public int Insert(string word, int freq = 1)
124 | {
125 | CheckWord(word);
126 |
127 | var i = Root.Insert(word.Trim(), 0, freq);
128 | if (i > 0)
129 | {
130 | TotalFrequency += freq;
131 | Count++;
132 | }
133 |
134 | return i;
135 | }
136 |
137 | public IEnumerable<char> ChildChars(string prefix)
138 | {
139 | var node = Root.Search(prefix.Trim(), 0);
140 | return node.IsNull() || node.Children.IsNull() ? null : node.Children.Select(p => p.Key);
141 | }
142 |
143 | private void CheckWord(string word)
144 | {
145 | if (string.IsNullOrWhiteSpace(word))
146 | {
147 | throw new ArgumentException("word must not be null or whitespace");
148 | }
149 | }
150 | }
151 | }
152 |
--------------------------------------------------------------------------------
/Segmenter/ConfigManager.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 |
4 | namespace JiebaNet.Segmenter
5 | {
6 | public class ConfigManager
7 | {
8 | public static string ConfigFileBaseDir
9 | {
10 | get
11 | {
12 | var configFileDir = "Resources";
13 | return configFileDir;
14 | }
15 | }
16 |
17 | public static string MainDictFile
18 | {
19 | get { return Path.Combine(ConfigFileBaseDir, "dict.txt"); }
20 | }
21 |
22 | public static string ProbTransFile
23 | {
24 | get { return Path.Combine(ConfigFileBaseDir, "prob_trans.json"); }
25 | }
26 |
27 | public static string ProbEmitFile
28 | {
29 | get { return Path.Combine(ConfigFileBaseDir, "prob_emit.json"); }
30 | }
31 |
32 | public static string PosProbStartFile
33 | {
34 | get { return Path.Combine(ConfigFileBaseDir, "pos_prob_start.json"); }
35 | }
36 |
37 | public static string PosProbTransFile
38 | {
39 | get { return Path.Combine(ConfigFileBaseDir, "pos_prob_trans.json"); }
40 | }
41 |
42 | public static string PosProbEmitFile
43 | {
44 | get { return Path.Combine(ConfigFileBaseDir, "pos_prob_emit.json"); }
45 | }
46 |
47 | public static string CharStateTabFile
48 | {
49 | get { return Path.Combine(ConfigFileBaseDir, "char_state_tab.json"); }
50 | }
51 | }
52 | }
--------------------------------------------------------------------------------
/Segmenter/Constants.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 |
4 | namespace JiebaNet.Segmenter
5 | {
6 | public class Constants
7 | {
8 | public static readonly double MinProb = -3.14e100;
9 |
10 | public static readonly List<string> NounPos = new List<string>() { "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz" };
11 | public static readonly List<string> VerbPos = new List<string>() { "v", "vd", "vg", "vi", "vn", "vq" };
12 | public static readonly List<string> NounAndVerbPos = NounPos.Union(VerbPos).ToList();
13 | public static readonly List<string> IdiomPos = new List<string>() { "i" };
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/Segmenter/DefaultDictionary.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 |
7 | namespace JiebaNet.Segmenter
8 | {
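    | // Behaves like Python's defaultdict: indexing a missing key adds it with default(TValue) instead of throwing.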
9 | public class DefaultDictionary<TKey, TValue> : Dictionary<TKey, TValue>
10 | {
11 | public new TValue this[TKey key]
12 | {
13 | get
14 | {
15 | if (!ContainsKey(key))
16 | {
17 | Add(key, default(TValue));
18 | }
19 | return base[key];
20 | }
21 | set { base[key] = value; }
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/Segmenter/FinalSeg/IFinalSeg.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 |
4 | namespace JiebaNet.Segmenter.FinalSeg
5 | {
6 | public interface IFinalSeg
7 | {
8 | IEnumerable<string> Cut(string sentence);
9 | }
10 | }
--------------------------------------------------------------------------------
/Segmenter/FinalSeg/Viterbi.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Text.RegularExpressions;
7 | using JiebaNet.Segmenter.Common;
8 | using Newtonsoft.Json;
9 |
10 | namespace JiebaNet.Segmenter.FinalSeg
11 | {
12 | public class Viterbi : IFinalSeg
13 | {
14 | private static readonly Lazy<Viterbi> Lazy = new Lazy<Viterbi>(() => new Viterbi());
15 | private static readonly char[] States = { 'B', 'M', 'E', 'S' };
16 |
17 | private static readonly Regex RegexChinese = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled);
18 | private static readonly Regex RegexSkip = new Regex(@"(\d+\.\d+|[a-zA-Z0-9]+)", RegexOptions.Compiled);
19 |
20 | private static IDictionary<char, IDictionary<char, double>> _emitProbs;
21 | private static IDictionary<char, double> _startProbs;
22 | private static IDictionary<char, IDictionary<char, double>> _transProbs;
23 | private static IDictionary<char, char[]> _prevStatus;
24 |
25 | private Viterbi()
26 | {
27 | LoadModel();
28 | }
29 |
30 | // TODO: synchronized
31 | public static Viterbi Instance
32 | {
33 | get { return Lazy.Value; }
34 | }
35 |
36 | public IEnumerable<string> Cut(string sentence)
37 | {
38 | var tokens = new List<string>();
39 | foreach (var blk in RegexChinese.Split(sentence))
40 | {
41 | if (RegexChinese.IsMatch(blk))
42 | {
43 | tokens.AddRange(ViterbiCut(blk));
44 | }
45 | else
46 | {
47 | var segments = RegexSkip.Split(blk).Where(seg => !string.IsNullOrEmpty(seg));
48 | tokens.AddRange(segments);
49 | }
50 | }
51 | return tokens;
52 | }
53 |
54 | #region Private Helpers
55 |
56 | private void LoadModel()
57 | {
58 | var stopWatch = new Stopwatch();
59 | stopWatch.Start();
60 |
61 | _prevStatus = new Dictionary<char, char[]>()
62 | {
63 | {'B', new []{'E', 'S'}},
64 | {'M', new []{'M', 'B'}},
65 | {'S', new []{'S', 'E'}},
66 | {'E', new []{'B', 'M'}}
67 | };
68 |
69 | _startProbs = new Dictionary<char, double>()
70 | {
71 | {'B', -0.26268660809250016},
72 | {'E', -3.14e+100},
73 | {'M', -3.14e+100},
74 | {'S', -1.4652633398537678}
75 | };
76 |
77 | var transJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.ProbTransFile);
78 | _transProbs = JsonConvert.DeserializeObject<IDictionary<char, IDictionary<char, double>>>(transJson);
79 |
80 | var emitJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.ProbEmitFile);
81 | _emitProbs = JsonConvert.DeserializeObject<IDictionary<char, IDictionary<char, double>>>(emitJson);
82 |
83 | stopWatch.Stop();
84 | Debug.WriteLine("model loading finished, time elapsed {0} ms.", stopWatch.ElapsedMilliseconds);
85 | }
86 |
87 | private IEnumerable<string> ViterbiCut(string sentence)
88 | {
89 | var v = new List<IDictionary<char, double>>();
90 | IDictionary<char, Node> path = new Dictionary<char, Node>();
91 |
92 | // Init weights and paths.
93 | v.Add(new Dictionary<char, double>());
94 | foreach (var state in States)
95 | {
96 | var emP = _emitProbs[state].GetDefault(sentence[0], Constants.MinProb);
97 | v[0][state] = _startProbs[state] + emP;
98 | path[state] = new Node(state, null);
99 | }
100 |
101 | // For each remaining char
102 | for (var i = 1; i < sentence.Length; ++i)
103 | {
104 | IDictionary<char, double> vv = new Dictionary<char, double>();
105 | v.Add(vv);
106 | IDictionary<char, Node> newPath = new Dictionary<char, Node>();
107 | foreach (var y in States)
108 | {
109 | var emp = _emitProbs[y].GetDefault(sentence[i], Constants.MinProb);
110 |
111 | Pair<char> candidate = new Pair<char>('\0', double.MinValue);
112 | foreach (var y0 in _prevStatus[y])
113 | {
114 | var tranp = _transProbs[y0].GetDefault(y, Constants.MinProb);
115 | tranp = v[i - 1][y0] + tranp + emp;
116 | if (candidate.Freq <= tranp)
117 | {
118 | candidate.Freq = tranp;
119 | candidate.Key = y0;
120 | }
121 | }
122 | vv[y] = candidate.Freq;
123 | newPath[y] = new Node(y, path[candidate.Key]);
124 | }
125 | path = newPath;
126 | }
127 |
128 | var probE = v[sentence.Length - 1]['E'];
129 | var probS = v[sentence.Length - 1]['S'];
130 | var finalPath = probE < probS ? path['S'] : path['E'];
131 |
132 | var posList = new List<char>(sentence.Length);
133 | while (finalPath != null)
134 | {
135 | posList.Add(finalPath.Value);
136 | finalPath = finalPath.Parent;
137 | }
138 | posList.Reverse();
139 |
140 | var tokens = new List<string>();
141 | int begin = 0, next = 0;
142 | for (var i = 0; i < sentence.Length; i++)
143 | {
144 | var pos = posList[i];
145 | if (pos == 'B')
146 | begin = i;
147 | else if (pos == 'E')
148 | {
149 | tokens.Add(sentence.Sub(begin, i + 1));
150 | next = i + 1;
151 | }
152 | else if (pos == 'S')
153 | {
154 | tokens.Add(sentence.Sub(i, i + 1));
155 | next = i + 1;
156 | }
157 | }
158 | if (next < sentence.Length)
159 | {
160 | tokens.Add(sentence.Substring(next));
161 | }
162 |
163 | return tokens;
164 | }
165 |
166 | #endregion
167 | }
168 | }
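The decoder above assigns every Chinese character a hidden state B (begin of word), M (middle), E (end) or S (single-character word), finds the most probable state sequence under the loaded HMM parameters, and slices a word out wherever an E or S occurs. A minimal usage sketch through the singleton (the sentence is illustrative):

using System;
using JiebaNet.Segmenter.FinalSeg;

public class ViterbiDemo
{
    public static void Main()
    {
        // Lazy singleton: the JSON probability tables load once per process.
        IFinalSeg hmm = Viterbi.Instance;

        // A decode of B E B E, for example, comes back as two two-character words.
        foreach (var word in hmm.Cut("今天天气很好"))
        {
            Console.WriteLine(word);
        }
    }
}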
--------------------------------------------------------------------------------
/Segmenter/JiebaSegmenter.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Text.RegularExpressions;
7 | using JiebaNet.Segmenter.Common;
8 | using JiebaNet.Segmenter.FinalSeg;
9 | using System.IO;
10 |
11 | namespace JiebaNet.Segmenter
12 | {
13 | public class JiebaSegmenter
14 | {
15 | private static readonly WordDictionary WordDict = WordDictionary.Instance;
16 | private static readonly IFinalSeg FinalSeg = Viterbi.Instance;
17 | private static readonly ISet<string> LoadedPath = new HashSet<string>();
18 |
19 | private static readonly object locker = new object();
20 |
21 | internal IDictionary<string, string> UserWordTagTab { get; set; }
22 |
23 | #region Regular Expressions
24 |
25 | internal static readonly Regex RegexChineseDefault = new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled);
26 |
27 | internal static readonly Regex RegexSkipDefault = new Regex(@"(\r\n|\s)", RegexOptions.Compiled);
28 |
29 | internal static readonly Regex RegexChineseCutAll = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled);
30 | internal static readonly Regex RegexSkipCutAll = new Regex(@"[^a-zA-Z0-9+#\n]", RegexOptions.Compiled);
31 |
32 | internal static readonly Regex RegexEnglishChars = new Regex(@"[a-zA-Z0-9]", RegexOptions.Compiled);
33 |
34 | internal static readonly Regex RegexUserDict = new Regex("^(?<word>.+?)(?<freq> [0-9]+)?(?<tag> [a-z]+)?$", RegexOptions.Compiled);
35 |
36 | #endregion
37 |
38 | public JiebaSegmenter()
39 | {
40 | UserWordTagTab = new Dictionary<string, string>();
41 | }
42 |
43 | /// <summary>
44 | /// The main function that segments an entire sentence that contains
45 | /// Chinese characters into separated words.
46 | /// </summary>
47 | /// <param name="text">The string to be segmented.</param>
48 | /// <param name="cutAll">Specify segmentation pattern: true for full pattern, false for accurate pattern.</param>
49 | /// <param name="hmm">Whether to use the Hidden Markov Model.</param>
50 | /// <returns>The segmented words.</returns>
51 | public IEnumerable<string> Cut(string text, bool cutAll = false, bool hmm = true)
52 | {
53 | var reHan = RegexChineseDefault;
54 | var reSkip = RegexSkipDefault;
55 | Func<string, IEnumerable<string>> cutMethod = null;
56 |
57 | if (cutAll)
58 | {
59 | reHan = RegexChineseCutAll;
60 | reSkip = RegexSkipCutAll;
61 | }
62 |
63 | if (cutAll)
64 | {
65 | cutMethod = CutAll;
66 | }
67 | else if (hmm)
68 | {
69 | cutMethod = CutDag;
70 | }
71 | else
72 | {
73 | cutMethod = CutDagWithoutHmm;
74 | }
75 |
76 | return CutIt(text, cutMethod, reHan, reSkip, cutAll);
77 | }
78 |
79 | public IEnumerable<WordInfo> Cut2(string text, bool cutAll = false, bool hmm = true)
80 | {
81 | var reHan = RegexChineseDefault;
82 | var reSkip = RegexSkipDefault;
83 | Func<string, IEnumerable<string>> cutMethod = null;
84 |
85 | if (cutAll)
86 | {
87 | reHan = RegexChineseCutAll;
88 | reSkip = RegexSkipCutAll;
89 | }
90 |
91 | if (cutAll)
92 | {
93 | cutMethod = CutAll;
94 | }
95 | else if (hmm)
96 | {
97 | cutMethod = CutDag;
98 | }
99 | else
100 | {
101 | cutMethod = CutDagWithoutHmm;
102 | }
103 |
104 | return CutIt2(text, cutMethod, reHan, reSkip, cutAll);
105 | }
106 |
107 | public IEnumerable<string> CutForSearch(string text, bool hmm = true)
108 | {
109 | var result = new List<string>();
110 |
111 | var words = Cut(text, hmm: hmm);
112 | foreach (var w in words)
113 | {
114 | if (w.Length > 2)
115 | {
116 | foreach (var i in Enumerable.Range(0, w.Length - 1))
117 | {
118 | var gram2 = w.Substring(i, 2);
119 | if (WordDict.ContainsWord(gram2))
120 | {
121 | result.Add(gram2);
122 | }
123 | }
124 | }
125 |
126 | if (w.Length > 3)
127 | {
128 | foreach (var i in Enumerable.Range(0, w.Length - 2))
129 | {
130 | var gram3 = w.Substring(i, 3);
131 | if (WordDict.ContainsWord(gram3))
132 | {
133 | result.Add(gram3);
134 | }
135 | }
136 | }
137 |
138 | result.Add(w);
139 | }
140 |
141 | return result;
142 | }
143 |
144 | public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true)
145 | {
146 | var result = new List<Token>();
147 |
148 | if (mode == TokenizerMode.Default)
149 | {
150 | foreach (var w in Cut2(text, hmm: hmm))
151 | {
152 | var width = w.value.Length;
153 | result.Add(new Token(w.value, w.position, w.position + width));
154 |
155 | }
156 | }
157 | else
158 | {
159 | //var xx = Cut2(text, hmm: hmm);
160 | foreach (var w in Cut2(text, hmm: hmm))
161 | {
162 | var width = w.value.Length;
163 | if (width > 2)
164 | {
165 | for (var i = 0; i < width - 1; i++)
166 | {
167 | var gram2 = w.value.Substring(i, 2);
168 | if (WordDict.ContainsWord(gram2))
169 | {
170 | result.Add(new Token(gram2, w.position + i, w.position + i + 2));
171 | }
172 | }
173 | }
174 | if (width > 3)
175 | {
176 | for (var i = 0; i < width - 2; i++)
177 | {
178 | var gram3 = w.value.Substring(i, 3);
179 | if (WordDict.ContainsWord(gram3))
180 | {
181 | result.Add(new Token(gram3, w.position + i, w.position + i + 3));
182 | }
183 | }
184 | }
185 |
186 | result.Add(new Token(w.value, w.position, w.position + width));
187 |
188 | }
189 | }
190 |
191 | return result;
192 | }
193 |
194 | #region Internal Cut Methods
195 |
196 | internal IDictionary<int, List<int>> GetDag(string sentence)
197 | {
198 | var dag = new Dictionary<int, List<int>>();
199 | var trie = WordDict.Trie;
200 |
201 | var N = sentence.Length;
202 | for (var k = 0; k < sentence.Length; k++)
203 | {
204 | var templist = new List<int>();
205 | var i = k;
206 | var frag = sentence.Substring(k, 1);
207 | while (i < N && trie.ContainsKey(frag))
208 | {
209 | if (trie[frag] > 0)
210 | {
211 | templist.Add(i);
212 | }
213 |
214 | i++;
215 | // TODO:
216 | if (i < N)
217 | {
218 | frag = sentence.Sub(k, i + 1);
219 | }
220 | }
221 | if (templist.Count == 0)
222 | {
223 | templist.Add(k);
224 | }
225 | dag[k] = templist;
226 | }
227 |
228 | return dag;
229 | }
230 |
231 | internal IDictionary<int, Pair<int>> Calc(string sentence, IDictionary<int, List<int>> dag)
232 | {
233 | var n = sentence.Length;
234 | var route = new Dictionary<int, Pair<int>>();
235 | route[n] = new Pair<int>(0, 0.0);
236 |
237 | var logtotal = Math.Log(WordDict.Total);
238 | for (var i = n - 1; i > -1; i--)
239 | {
240 | var candidate = new Pair<int>(-1, double.MinValue);
241 | foreach (int x in dag[i])
242 | {
243 | var freq = Math.Log(WordDict.GetFreqOrDefault(sentence.Sub(i, x + 1))) - logtotal + route[x + 1].Freq;
244 | if (candidate.Freq < freq)
245 | {
246 | candidate.Freq = freq;
247 | candidate.Key = x;
248 | }
249 | }
250 | route[i] = candidate;
251 | }
252 | return route;
253 | }
254 |
255 | internal IEnumerable<string> CutAll(string sentence)
256 | {
257 | var dag = GetDag(sentence);
258 |
259 | var words = new List<string>();
260 | var lastPos = -1;
261 |
262 | foreach (var pair in dag)
263 | {
264 | var k = pair.Key;
265 | var nexts = pair.Value;
266 | if (nexts.Count == 1 && k > lastPos)
267 | {
268 | words.Add(sentence.Substring(k, nexts[0] + 1 - k));
269 | lastPos = nexts[0];
270 | }
271 | else
272 | {
273 | foreach (var j in nexts)
274 | {
275 | if (j > k)
276 | {
277 | words.Add(sentence.Substring(k, j + 1 - k));
278 | lastPos = j;
279 | }
280 | }
281 | }
282 | }
283 |
284 | return words;
285 | }
286 |
287 | internal IEnumerable<string> CutDag(string sentence)
288 | {
289 | var dag = GetDag(sentence);
290 | var route = Calc(sentence, dag);
291 |
292 | var tokens = new List<string>();
293 |
294 | var x = 0;
295 | var n = sentence.Length;
296 | var buf = string.Empty;
297 | while (x < n)
298 | {
299 | var y = route[x].Key + 1;
300 | var w = sentence.Substring(x, y - x);
301 | if (y - x == 1)
302 | {
303 | buf += w;
304 | }
305 | else
306 | {
307 | if (buf.Length > 0)
308 | {
309 | AddBufferToWordList(tokens, buf);
310 | buf = string.Empty;
311 | }
312 | tokens.Add(w);
313 | }
314 | x = y;
315 | }
316 |
317 | if (buf.Length > 0)
318 | {
319 | AddBufferToWordList(tokens, buf);
320 | }
321 |
322 | return tokens;
323 | }
324 |
325 | internal IEnumerable<string> CutDagWithoutHmm(string sentence)
326 | {
327 | var dag = GetDag(sentence);
328 | var route = Calc(sentence, dag);
329 |
330 | var words = new List<string>();
331 |
332 | var x = 0;
333 | string buf = string.Empty;
334 | var N = sentence.Length;
335 |
336 | var y = -1;
337 | while (x < N)
338 | {
339 | y = route[x].Key + 1;
340 | var l_word = sentence.Substring(x, y - x);
341 | if (RegexEnglishChars.IsMatch(l_word) && l_word.Length == 1)
342 | {
343 | buf += l_word;
344 | x = y;
345 | }
346 | else
347 | {
348 | if (buf.Length > 0)
349 | {
350 | words.Add(buf);
351 | buf = string.Empty;
352 | }
353 | words.Add(l_word);
354 | x = y;
355 | }
356 | }
357 |
358 | if (buf.Length > 0)
359 | {
360 | words.Add(buf);
361 | }
362 |
363 | return words;
364 | }
365 |
366 | internal IEnumerable<WordInfo> CutIt2(string text, Func<string, IEnumerable<string>> cutMethod,
367 | Regex reHan, Regex reSkip, bool cutAll)
368 | {
369 | var result = new List<WordInfo>();
370 | var blocks = reHan.Split(text);
371 | var start = 0;
372 | foreach(var blk in blocks)
373 | {
374 | if(string.IsNullOrWhiteSpace(blk))
375 | {
376 | start += blk.Length;
377 | continue;
378 | }
379 | if(reHan.IsMatch(blk))
380 | {
381 | foreach(var word in cutMethod(blk))
382 | {
383 | result.Add(new WordInfo(word,start));
384 | start += word.Length;
385 | }
386 | }
387 | else
388 | {
389 | var tmp = reSkip.Split(blk);
390 | foreach(var x in tmp)
391 | {
392 | if(reSkip.IsMatch(x))
393 | {
394 | result.Add(new WordInfo(x,start));
395 | start += x.Length;
396 | }
397 | else if(!cutAll)
398 | {
399 | foreach(var ch in x)
400 | {
401 | result.Add(new WordInfo(ch.ToString(),start));
402 | start += ch.ToString().Length;
403 | }
404 | }
405 | else
406 | {
407 | result.Add(new WordInfo(x, start));
408 | start += x.Length;
409 |
410 | }
411 | }
412 | }
413 | }
414 |
415 | return result;
416 | }
417 |
418 | internal IEnumerable<string> CutIt(string text, Func<string, IEnumerable<string>> cutMethod,
419 | Regex reHan, Regex reSkip, bool cutAll)
420 | {
421 | var result = new List<string>();
422 | var blocks = reHan.Split(text);
423 | foreach (var blk in blocks)
424 | {
425 | if (string.IsNullOrWhiteSpace(blk))
426 | {
427 | continue;
428 | }
429 |
430 | if (reHan.IsMatch(blk))
431 | {
432 | foreach (var word in cutMethod(blk))
433 | {
434 | result.Add(word);
435 | }
436 | }
437 | else
438 | {
439 | var tmp = reSkip.Split(blk);
440 | foreach (var x in tmp)
441 | {
442 | if (reSkip.IsMatch(x))
443 | {
444 | result.Add(x);
445 | }
446 | else if (!cutAll)
447 | {
448 | foreach (var ch in x)
449 | {
450 | result.Add(ch.ToString());
451 | }
452 | }
453 | else
454 | {
455 | result.Add(x);
456 | }
457 | }
458 | }
459 | }
460 |
461 | return result;
462 | }
463 |
464 | #endregion
465 |
466 | #region Extend Main Dict
467 |
468 | /// <summary>
469 | /// Loads user dictionaries.
470 | /// </summary>
471 | /// <param name="userDictFile">Path of the user dictionary file.</param>
472 | public void LoadUserDict(string userDictFile)
473 | {
474 | var dictFullPath = Path.GetFullPath(userDictFile);
475 | Debug.WriteLine("Initializing user dictionary: " + userDictFile);
476 |
477 | lock (locker)
478 | {
479 | if (LoadedPath.Contains(dictFullPath))
480 | return;
481 |
482 | try
483 | {
484 | var startTime = DateTime.Now.Millisecond;
485 |
486 | var lines = FileExtension.ReadEmbeddedAllLines(dictFullPath);
487 | foreach (var line in lines)
488 | {
489 | if (string.IsNullOrWhiteSpace(line))
490 | {
491 | continue;
492 | }
493 |
494 | var tokens = RegexUserDict.Match(line.Trim()).Groups;
495 | var word = tokens["word"].Value.Trim();
496 | var freq = tokens["freq"].Value.Trim();
497 | var tag = tokens["tag"].Value.Trim();
498 |
499 | var actualFreq = freq.Length > 0 ? int.Parse(freq) : 0;
500 | AddWord(word, actualFreq, tag);
501 | }
502 |
503 | Debug.WriteLine("user dict '{0}' load finished, time elapsed {1} ms",
504 | dictFullPath, DateTime.Now.Millisecond - startTime);
505 | }
506 | catch (IOException e)
507 | {
508 | Debug.Fail(string.Format("'{0}' load failure, reason: {1}", dictFullPath, e.Message));
509 | }
510 | catch (FormatException fe)
511 | {
512 | Debug.Fail(fe.Message);
513 | }
514 | }
515 | }
516 |
517 | public void AddWord(string word, int freq = 0, string tag = null)
518 | {
519 | if (freq <= 0)
520 | {
521 | freq = WordDict.SuggestFreq(word, Cut(word, hmm: false));
522 | }
523 | WordDict.AddWord(word, freq);
524 |
525 | // Add user word tag of POS
526 | if (!string.IsNullOrEmpty(tag))
527 | {
528 | UserWordTagTab[word] = tag;
529 | }
530 | }
531 |
532 | public void DeleteWord(string word)
533 | {
534 | WordDict.DeleteWord(word);
535 | }
536 |
537 | #endregion
538 |
539 | #region Private Helpers
540 |
541 | private void AddBufferToWordList(List<string> words, string buf)
542 | {
543 | if (buf.Length == 1)
544 | {
545 | words.Add(buf);
546 | }
547 | else
548 | {
549 | if (!WordDict.ContainsWord(buf))
550 | {
551 | var tokens = FinalSeg.Cut(buf);
552 | words.AddRange(tokens);
553 | }
554 | else
555 | {
556 | words.AddRange(buf.Select(ch => ch.ToString()));
557 | }
558 | }
559 | }
560 |
561 | #endregion
562 | }
563 |
564 | public enum TokenizerMode
565 | {
566 | Default,
567 | Search
568 | }
569 |
570 |
571 | }
572 |
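Pulling the entry points together, a usage sketch (the sentences are illustrative; actual output depends on the embedded dict.txt):

using System;
using JiebaNet.Segmenter;

public class SegmenterDemo
{
    public static void Main()
    {
        var seg = new JiebaSegmenter();

        // Accurate mode: best path through the word DAG, HMM for the leftovers.
        Console.WriteLine(string.Join("/", seg.Cut("我来到北京清华大学")));

        // Full mode: every dictionary word the text contains, overlaps included.
        Console.WriteLine(string.Join("/", seg.Cut("我来到北京清华大学", cutAll: true)));

        // Search-engine mode: accurate cut plus in-dictionary 2- and 3-grams.
        Console.WriteLine(string.Join("/", seg.CutForSearch("中国科学院计算所")));

        // Tokenize adds offsets, which the Lucene tokenizer further down relies on.
        foreach (var token in seg.Tokenize("推荐系统", TokenizerMode.Search))
        {
            Console.WriteLine(token); // prints [word, (start, end)]
        }
    }
}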
--------------------------------------------------------------------------------
/Segmenter/Node.cs:
--------------------------------------------------------------------------------
1 | namespace JiebaNet.Segmenter
2 | {
3 | public class Node
4 | {
5 | public char Value { get; private set; }
6 | public Node Parent { get; private set; }
7 |
8 | public Node(char value, Node parent)
9 | {
10 | Value = value;
11 | Parent = parent;
12 | }
13 | }
14 | }
--------------------------------------------------------------------------------
/Segmenter/Pair.cs:
--------------------------------------------------------------------------------
1 | namespace JiebaNet.Segmenter
2 | {
3 | public class Pair<TKey>
4 | {
5 | public TKey Key { get; set; }
6 | public double Freq { get; set; }
7 |
8 | public Pair(TKey key, double freq)
9 | {
10 | Key = key;
11 | Freq = freq;
12 | }
13 |
14 | public override string ToString()
15 | {
16 | return "Candidate [Key=" + Key + ", Freq=" + Freq + "]";
17 | }
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/Segmenter/PosSeg/Pair.cs:
--------------------------------------------------------------------------------
1 | namespace JiebaNet.Segmenter.PosSeg
2 | {
3 | public class Pair
4 | {
5 | public string Word { get; set; }
6 | public string Flag { get; set; }
7 | public Pair(string word, string flag)
8 | {
9 | Word = word;
10 | Flag = flag;
11 | }
12 |
13 | public override string ToString()
14 | {
15 | return string.Format("{0}/{1}", Word, Flag);
16 | }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/Segmenter/PosSeg/PosSegmenter.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Text.RegularExpressions;
7 | using JiebaNet.Segmenter.Common;
8 |
9 | namespace JiebaNet.Segmenter.PosSeg
10 | {
11 | public class PosSegmenter
12 | {
13 | private static readonly WordDictionary WordDict = WordDictionary.Instance;
14 | private static readonly Viterbi PosSeg = Viterbi.Instance;
15 |
16 | // TODO:
17 | private static readonly object locker = new object();
18 |
19 | #region Regular Expressions
20 |
21 | internal static readonly Regex RegexChineseInternal = new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled);
22 | internal static readonly Regex RegexSkipInternal = new Regex(@"(\r\n|\s)", RegexOptions.Compiled);
23 |
24 | internal static readonly Regex RegexChineseDetail = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled);
25 | internal static readonly Regex RegexSkipDetail = new Regex(@"([\.0-9]+|[a-zA-Z0-9]+)", RegexOptions.Compiled);
26 |
27 | internal static readonly Regex RegexEnglishWords = new Regex(@"[a-zA-Z0-9]+", RegexOptions.Compiled);
28 | internal static readonly Regex RegexNumbers = new Regex(@"[\.0-9]+", RegexOptions.Compiled);
29 |
30 | internal static readonly Regex RegexEnglishChar = new Regex(@"^[a-zA-Z0-9]$", RegexOptions.Compiled);
31 |
32 | #endregion
33 |
34 | private static IDictionary<string, string> _wordTagTab;
35 |
36 | static PosSegmenter()
37 | {
38 | LoadWordTagTab();
39 | }
40 |
41 | private static void LoadWordTagTab()
42 | {
43 | try
44 | {
45 | _wordTagTab = new Dictionary<string, string>();
46 | var lines = FileExtension.ReadEmbeddedAllLines(ConfigManager.MainDictFile);
47 | foreach (var line in lines)
48 | {
49 | var tokens = line.Split(' ');
50 | if (tokens.Length < 3)
51 | {
52 | Debug.Fail(string.Format("Invalid line: {0}", line));
53 | continue;
54 | }
55 |
56 | var word = tokens[0];
57 | var tag = tokens[2];
58 |
59 | _wordTagTab[word] = tag;
60 | }
61 | }
62 | catch (System.IO.IOException e)
63 | {
64 | Debug.Fail(string.Format("Word tag table load failure, reason: {0}", e.Message));
65 | }
66 | catch (FormatException fe)
67 | {
68 | Debug.Fail(fe.Message);
69 | }
70 | }
71 |
72 | private JiebaSegmenter _segmenter;
73 |
74 | public PosSegmenter()
75 | {
76 | _segmenter = new JiebaSegmenter();
77 | }
78 |
79 | public PosSegmenter(JiebaSegmenter segmenter)
80 | {
81 | _segmenter = segmenter;
82 | }
83 |
84 | private void CheckNewUserWordTags()
85 | {
86 | if (_segmenter.UserWordTagTab.IsNotEmpty())
87 | {
88 | _wordTagTab.Update(_segmenter.UserWordTagTab);
89 | _segmenter.UserWordTagTab = new Dictionary<string, string>();
90 | }
91 | }
92 |
93 | public IEnumerable<Pair> Cut(string text, bool hmm = true)
94 | {
95 | return CutInternal(text, hmm);
96 | }
97 |
98 | #region Internal Cut Methods
99 |
100 | internal IEnumerable<Pair> CutInternal(string text, bool hmm = true)
101 | {
102 | CheckNewUserWordTags();
103 |
104 | var blocks = RegexChineseInternal.Split(text);
105 | Func<string, IEnumerable<Pair>> cutMethod = null;
106 | if (hmm)
107 | {
108 | cutMethod = CutDag;
109 | }
110 | else
111 | {
112 | cutMethod = CutDagWithoutHmm;
113 | }
114 |
115 | var tokens = new List<Pair>();
116 | foreach (var blk in blocks)
117 | {
118 | if (RegexChineseInternal.IsMatch(blk))
119 | {
120 | tokens.AddRange(cutMethod(blk));
121 | }
122 | else
123 | {
124 | var tmp = RegexSkipInternal.Split(blk);
125 | foreach (var x in tmp)
126 | {
127 | if (RegexSkipInternal.IsMatch(x))
128 | {
129 | tokens.Add(new Pair(x, "x"));
130 | }
131 | else
132 | {
133 | foreach (var xx in x)
134 | {
135 | // TODO: each char?
136 | var xxs = xx.ToString();
137 | if (RegexNumbers.IsMatch(xxs))
138 | {
139 | tokens.Add(new Pair(xxs, "m"));
140 | }
141 | else if (RegexEnglishWords.IsMatch(x))
142 | {
143 | tokens.Add(new Pair(xxs, "eng"));
144 | }
145 | else
146 | {
147 | tokens.Add(new Pair(xxs, "x"));
148 | }
149 | }
150 | }
151 | }
152 | }
153 | }
154 |
155 | return tokens;
156 | }
157 |
158 | internal IEnumerable<Pair> CutDag(string sentence)
159 | {
160 | var dag = _segmenter.GetDag(sentence);
161 | var route = _segmenter.Calc(sentence, dag);
162 |
163 | var tokens = new List<Pair>();
164 |
165 | var x = 0;
166 | var n = sentence.Length;
167 | var buf = string.Empty;
168 | while (x < n)
169 | {
170 | var y = route[x].Key + 1;
171 | var w = sentence.Substring(x, y - x);
172 | if (y - x == 1)
173 | {
174 | buf += w;
175 | }
176 | else
177 | {
178 | if (buf.Length > 0)
179 | {
180 | AddBufferToWordList(tokens, buf);
181 | buf = string.Empty;
182 | }
183 | tokens.Add(new Pair(w, _wordTagTab.GetDefault(w, "x")));
184 | }
185 | x = y;
186 | }
187 |
188 | if (buf.Length > 0)
189 | {
190 | AddBufferToWordList(tokens, buf);
191 | }
192 |
193 | return tokens;
194 | }
195 |
196 | internal IEnumerable<Pair> CutDagWithoutHmm(string sentence)
197 | {
198 | var dag = _segmenter.GetDag(sentence);
199 | var route = _segmenter.Calc(sentence, dag);
200 |
201 | var tokens = new List<Pair>();
202 |
203 | var x = 0;
204 | var buf = string.Empty;
205 | var n = sentence.Length;
206 |
207 | var y = -1;
208 | while (x < n)
209 | {
210 | y = route[x].Key + 1;
211 | var w = sentence.Substring(x, y - x);
212 | // TODO: char or word?
213 | if (RegexEnglishChar.IsMatch(w))
214 | {
215 | buf += w;
216 | x = y;
217 | }
218 | else
219 | {
220 | if (buf.Length > 0)
221 | {
222 | tokens.Add(new Pair(buf, "eng"));
223 | buf = string.Empty;
224 | }
225 | tokens.Add(new Pair(w, _wordTagTab.GetDefault(w, "x")));
226 | x = y;
227 | }
228 | }
229 |
230 | if (buf.Length > 0)
231 | {
232 | tokens.Add(new Pair(buf, "eng"));
233 | }
234 |
235 | return tokens;
236 | }
237 |
238 | internal IEnumerable<Pair> CutDetail(string text)
239 | {
240 | var tokens = new List<Pair>();
241 | var blocks = RegexChineseDetail.Split(text);
242 | foreach (var blk in blocks)
243 | {
244 | if (RegexChineseDetail.IsMatch(blk))
245 | {
246 | tokens.AddRange(PosSeg.Cut(blk));
247 | }
248 | else
249 | {
250 | var tmp = RegexSkipDetail.Split(blk);
251 | foreach (var x in tmp)
252 | {
253 | if (!string.IsNullOrWhiteSpace(x))
254 | {
255 | if (RegexNumbers.IsMatch(x))
256 | {
257 | tokens.Add(new Pair(x, "m"));
258 | }
259 | else if(RegexEnglishWords.IsMatch(x))
260 | {
261 | tokens.Add(new Pair(x, "eng"));
262 | }
263 | else
264 | {
265 | tokens.Add(new Pair(x, "x"));
266 | }
267 | }
268 | }
269 | }
270 | }
271 |
272 | return tokens;
273 | }
274 |
275 | #endregion
276 |
277 | #region Private Helpers
278 |
279 | private void AddBufferToWordList(List<Pair> words, string buf)
280 | {
281 | if (buf.Length == 1)
282 | {
283 | words.Add(new Pair(buf, _wordTagTab.GetDefault(buf, "x")));
284 | }
285 | else
286 | {
287 | if (!WordDict.ContainsWord(buf))
288 | {
289 | var tokens = CutDetail(buf);
290 | words.AddRange(tokens);
291 | }
292 | else
293 | {
294 | words.AddRange(buf.Select(ch => new Pair(ch.ToString(), "x")));
295 | }
296 | }
297 | }
298 |
299 | #endregion
300 | }
301 | }
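A usage sketch for the POS segmenter (the sentence is illustrative; the flags come from the tag table loaded out of dict.txt, e.g. ns for place names, v for verbs):

using System;
using JiebaNet.Segmenter.PosSeg;

public class PosDemo
{
    public static void Main()
    {
        var posSeg = new PosSegmenter();

        // Each element is a word/flag pair; Pair.ToString() prints "word/flag".
        foreach (var pair in posSeg.Cut("我来到北京清华大学"))
        {
            Console.WriteLine(pair);
        }
    }
}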
--------------------------------------------------------------------------------
/Segmenter/PosSeg/Viterbi.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using JiebaNet.Segmenter.Common;
5 | using Newtonsoft.Json;
6 |
7 | namespace JiebaNet.Segmenter.PosSeg
8 | {
9 | public class Viterbi
10 | {
11 | private static readonly Lazy<Viterbi> Lazy = new Lazy<Viterbi>(() => new Viterbi());
12 |
13 | private static IDictionary<string, double> _startProbs;
14 | private static IDictionary<string, IDictionary<string, double>> _transProbs;
15 | private static IDictionary<string, IDictionary<char, double>> _emitProbs;
16 | private static IDictionary<char, List<string>> _stateTab;
17 |
18 | private Viterbi()
19 | {
20 | LoadModel();
21 | }
22 |
23 | // TODO: synchronized
24 | public static Viterbi Instance
25 | {
26 | get { return Lazy.Value; }
27 | }
28 |
29 | public IEnumerable<Pair> Cut(string sentence)
30 | {
31 | var probPosList = ViterbiCut(sentence);
32 | var posList = probPosList.Item2;
33 |
34 | var tokens = new List<Pair>();
35 | int begin = 0, next = 0;
36 | for (var i = 0; i < sentence.Length; i++)
37 | {
38 | var parts = posList[i].Split('-');
39 | var charState = parts[0][0];
40 | var pos = parts[1];
41 | if (charState == 'B')
42 | begin = i;
43 | else if (charState == 'E')
44 | {
45 | tokens.Add(new Pair(sentence.Sub(begin, i + 1), pos));
46 | next = i + 1;
47 | }
48 | else if (charState == 'S')
49 | {
50 | tokens.Add(new Pair(sentence.Sub(i, i + 1), pos));
51 | next = i + 1;
52 | }
53 | }
54 | if (next < sentence.Length)
55 | {
56 | tokens.Add(new Pair(sentence.Substring(next), posList[next].Split('-')[1]));
57 | }
58 |
59 | return tokens;
60 | }
61 |
62 | #region Private Helpers
63 |
64 | private static void LoadModel()
65 | {
66 | var startJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.PosProbStartFile);
67 | _startProbs = JsonConvert.DeserializeObject<IDictionary<string, double>>(startJson);
68 |
69 | var transJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.PosProbTransFile);
70 | _transProbs = JsonConvert.DeserializeObject<IDictionary<string, IDictionary<string, double>>>(transJson);
71 |
72 | var emitJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.PosProbEmitFile);
73 | _emitProbs = JsonConvert.DeserializeObject<IDictionary<string, IDictionary<char, double>>>(emitJson);
74 |
75 | var tabJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.CharStateTabFile);
76 | _stateTab = JsonConvert.DeserializeObject<IDictionary<char, List<string>>>(tabJson);
77 | }
78 |
79 | // TODO: change sentence to obs?
80 | private Tuple<double, List<string>> ViterbiCut(string sentence)
81 | {
82 | var v = new List<IDictionary<string, double>>();
83 | var memPath = new List<IDictionary<string, string>>();
84 |
85 | var allStates = _transProbs.Keys.ToList();
86 |
87 | // Init weights and paths.
88 | v.Add(new Dictionary<string, double>());
89 | memPath.Add(new Dictionary<string, string>());
90 | foreach (var state in _stateTab.GetDefault(sentence[0], allStates))
91 | {
92 | var emP = _emitProbs[state].GetDefault(sentence[0], Constants.MinProb);
93 | v[0][state] = _startProbs[state] + emP;
94 | memPath[0][state] = string.Empty;
95 | }
96 |
97 | // For each remaining char
98 | for (var i = 1; i < sentence.Length; ++i)
99 | {
100 | v.Add(new Dictionary<string, double>());
101 | memPath.Add(new Dictionary<string, string>());
102 |
103 | var prevStates = memPath[i - 1].Keys.Where(k => _transProbs[k].Count > 0);
104 | var curPossibleStates = new HashSet<string>(prevStates.SelectMany(s => _transProbs[s].Keys));
105 |
106 | IEnumerable<string> obsStates = _stateTab.GetDefault(sentence[i], allStates);
107 | obsStates = curPossibleStates.Intersect(obsStates);
108 |
109 | if (!obsStates.Any())
110 | {
111 | if (curPossibleStates.Count > 0)
112 | {
113 | obsStates = curPossibleStates;
114 | }
115 | else
116 | {
117 | obsStates = allStates;
118 | }
119 | }
120 |
121 | foreach (var y in obsStates)
122 | {
123 | var emp = _emitProbs[y].GetDefault(sentence[i], Constants.MinProb);
124 |
125 | var prob = double.MinValue;
126 | var state = string.Empty;
127 |
128 | foreach (var y0 in prevStates)
129 | {
130 | var tranp = _transProbs[y0].GetDefault(y, double.MinValue);
131 | tranp = v[i - 1][y0] + tranp + emp;
132 | // TODO: compare two very small values;
133 | // TODO: how to deal with negative infinity
134 | if (prob < tranp ||
135 | (prob == tranp && string.Compare(state, y0, StringComparison.CurrentCultureIgnoreCase) < 0))
136 | {
137 | prob = tranp;
138 | state = y0;
139 | }
140 | }
141 | v[i][y] = prob;
142 | memPath[i][y] = state;
143 | }
144 | }
145 |
146 | var vLast = v.Last();
147 | var last = memPath.Last().Keys.Select(y => new {State = y, Prob = vLast[y]});
148 | var endProb = double.MinValue;
149 | var endState = string.Empty;
150 | foreach (var endPoint in last)
151 | {
152 | // TODO: compare two very small values;
153 | if (endProb < endPoint.Prob ||
154 | (endProb == endPoint.Prob && String.Compare(endState, endPoint.State, StringComparison.CurrentCultureIgnoreCase) < 0))
155 | {
156 | endProb = endPoint.Prob;
157 | endState = endPoint.State;
158 | }
159 | }
160 |
161 | var route = new string[sentence.Length];
162 | var n = sentence.Length - 1;
163 | var curState = endState;
164 | while(n >= 0)
165 | {
166 | route[n] = curState;
167 | curState = memPath[n][curState];
168 | n--;
169 | }
170 |
171 | return new Tuple<double, List<string>>(endProb, route.ToList());
172 | }
173 |
174 | #endregion
175 | }
176 | }
--------------------------------------------------------------------------------
/Segmenter/Resources/pos_prob_start.json:
--------------------------------------------------------------------------------
1 | {
2 | "E-e": -3.14e+100,
3 | "E-d": -3.14e+100,
4 | "E-g": -3.14e+100,
5 | "E-f": -3.14e+100,
6 | "E-a": -3.14e+100,
7 | "E-c": -3.14e+100,
8 | "E-b": -3.14e+100,
9 | "E-m": -3.14e+100,
10 | "S-rg": -10.275268591948773,
11 | "E-o": -3.14e+100,
12 | "E-n": -3.14e+100,
13 | "E-i": -3.14e+100,
14 | "E-h": -3.14e+100,
15 | "E-k": -3.14e+100,
16 | "E-j": -3.14e+100,
17 | "E-u": -3.14e+100,
18 | "E-t": -3.14e+100,
19 | "E-w": -3.14e+100,
20 | "E-v": -3.14e+100,
21 | "E-q": -3.14e+100,
22 | "E-p": -3.14e+100,
23 | "E-s": -3.14e+100,
24 | "M-bg": -3.14e+100,
25 | "M-uj": -3.14e+100,
26 | "E-y": -3.14e+100,
27 | "E-x": -3.14e+100,
28 | "E-z": -3.14e+100,
29 | "B-uz": -3.14e+100,
30 | "S-d": -3.903919764181873,
31 | "M-rg": -3.14e+100,
32 | "E-nt": -3.14e+100,
33 | "B-d": -3.9750475297585357,
34 | "B-uv": -3.14e+100,
35 | "E-vi": -3.14e+100,
36 | "B-mq": -6.78695300139688,
37 | "M-rr": -3.14e+100,
38 | "S-ag": -6.954113917960154,
39 | "M-jn": -3.14e+100,
40 | "E-l": -3.14e+100,
41 | "M-rz": -3.14e+100,
42 | "B-ud": -3.14e+100,
43 | "S-an": -12.84021794941031,
44 | "B-qg": -3.14e+100,
45 | "B-ug": -3.14e+100,
46 | "M-y": -3.14e+100,
47 | "S-qg": -3.14e+100,
48 | "S-z": -3.14e+100,
49 | "S-y": -6.1970794699489575,
50 | "S-x": -8.427419656069674,
51 | "S-w": -3.14e+100,
52 | "S-v": -3.053292303412302,
53 | "S-u": -6.940320595827818,
54 | "S-t": -3.14e+100,
55 | "B-nrt": -4.985642733519195,
56 | "S-r": -2.7635336784127853,
57 | "S-q": -4.888658618255058,
58 | "M-zg": -3.14e+100,
59 | "S-o": -8.464460927750023,
60 | "S-n": -3.8551483897645107,
61 | "B-zg": -3.14e+100,
62 | "S-l": -3.14e+100,
63 | "S-k": -6.940320595827818,
64 | "S-in": -3.14e+100,
65 | "S-i": -3.14e+100,
66 | "S-h": -8.650563207383884,
67 | "S-g": -6.507826815331734,
68 | "B-f": -5.491630418482717,
69 | "S-e": -5.942513006281674,
70 | "M-en": -3.14e+100,
71 | "S-c": -4.786966795861212,
72 | "S-b": -6.472888763970454,
73 | "S-a": -3.9025396831295227,
74 | "B-g": -3.14e+100,
75 | "B-b": -5.018374362109218,
76 | "B-c": -3.423880184954888,
77 | "M-ug": -3.14e+100,
78 | "B-a": -4.762305214596967,
79 | "E-qe": -3.14e+100,
80 | "M-x": -3.14e+100,
81 | "E-nz": -3.14e+100,
82 | "M-z": -3.14e+100,
83 | "M-u": -3.14e+100,
84 | "B-k": -3.14e+100,
85 | "M-w": -3.14e+100,
86 | "B-jn": -3.14e+100,
87 | "S-yg": -13.533365129970255,
88 | "B-o": -8.433498702146057,
89 | "B-l": -4.905883584659895,
90 | "B-m": -3.6524299819046386,
91 | "M-m": -3.14e+100,
92 | "M-l": -3.14e+100,
93 | "M-o": -3.14e+100,
94 | "M-n": -3.14e+100,
95 | "M-i": -3.14e+100,
96 | "M-h": -3.14e+100,
97 | "B-t": -3.3647479094528574,
98 | "M-ul": -3.14e+100,
99 | "B-z": -7.045681111485645,
100 | "M-d": -3.14e+100,
101 | "M-mg": -3.14e+100,
102 | "B-y": -9.844485675856319,
103 | "M-a": -3.14e+100,
104 | "S-nrt": -3.14e+100,
105 | "M-c": -3.14e+100,
106 | "M-uz": -3.14e+100,
107 | "E-mg": -3.14e+100,
108 | "B-i": -6.1157847275557105,
109 | "M-b": -3.14e+100,
110 | "E-uz": -3.14e+100,
111 | "B-n": -1.6966257797548328,
112 | "E-uv": -3.14e+100,
113 | "M-ud": -3.14e+100,
114 | "M-p": -3.14e+100,
115 | "E-ul": -3.14e+100,
116 | "E-mq": -3.14e+100,
117 | "M-s": -3.14e+100,
118 | "M-yg": -3.14e+100,
119 | "E-uj": -3.14e+100,
120 | "E-ud": -3.14e+100,
121 | "S-ln": -3.14e+100,
122 | "M-r": -3.14e+100,
123 | "E-ng": -3.14e+100,
124 | "B-r": -3.4098187790818413,
125 | "E-en": -3.14e+100,
126 | "M-qg": -3.14e+100,
127 | "B-s": -5.522673590839954,
128 | "S-rr": -3.14e+100,
129 | "B-p": -4.200984132085048,
130 | "B-dg": -3.14e+100,
131 | "M-uv": -3.14e+100,
132 | "S-zg": -3.14e+100,
133 | "B-v": -2.6740584874265685,
134 | "S-tg": -6.272842531880403,
135 | "B-w": -3.14e+100,
136 | "B-e": -8.563551830394255,
137 | "M-k": -3.14e+100,
138 | "M-j": -3.14e+100,
139 | "B-df": -8.888974230828882,
140 | "M-e": -3.14e+100,
141 | "E-tg": -3.14e+100,
142 | "M-t": -3.14e+100,
143 | "E-nr": -3.14e+100,
144 | "M-nrfg": -3.14e+100,
145 | "B-nr": -2.2310495913769506,
146 | "E-df": -3.14e+100,
147 | "E-dg": -3.14e+100,
148 | "S-jn": -3.14e+100,
149 | "M-q": -3.14e+100,
150 | "B-mg": -3.14e+100,
151 | "B-ln": -3.14e+100,
152 | "M-f": -3.14e+100,
153 | "E-ln": -3.14e+100,
154 | "E-yg": -3.14e+100,
155 | "S-bg": -3.14e+100,
156 | "E-ns": -3.14e+100,
157 | "B-tg": -3.14e+100,
158 | "E-qg": -3.14e+100,
159 | "S-nr": -4.483663103956885,
160 | "S-ns": -3.14e+100,
161 | "M-vn": -3.14e+100,
162 | "S-nt": -12.147070768850364,
163 | "S-nz": -3.14e+100,
164 | "S-ad": -11.048458480182255,
165 | "B-yg": -3.14e+100,
166 | "M-v": -3.14e+100,
167 | "E-vn": -3.14e+100,
168 | "S-ng": -4.913434861102905,
169 | "M-g": -3.14e+100,
170 | "M-nt": -3.14e+100,
171 | "S-en": -3.14e+100,
172 | "M-nr": -3.14e+100,
173 | "M-ns": -3.14e+100,
174 | "S-vq": -3.14e+100,
175 | "B-uj": -3.14e+100,
176 | "M-nz": -3.14e+100,
177 | "B-qe": -3.14e+100,
178 | "M-in": -3.14e+100,
179 | "M-ng": -3.14e+100,
180 | "S-vn": -11.453923588290419,
181 | "E-zg": -3.14e+100,
182 | "S-vi": -3.14e+100,
183 | "S-vg": -5.9430181843676895,
184 | "S-vd": -3.14e+100,
185 | "B-ad": -6.680066036784177,
186 | "E-rz": -3.14e+100,
187 | "B-ag": -3.14e+100,
188 | "B-vd": -9.044728760238115,
189 | "S-mq": -3.14e+100,
190 | "B-vi": -12.434752841302146,
191 | "E-rr": -3.14e+100,
192 | "B-rr": -12.434752841302146,
193 | "M-vq": -3.14e+100,
194 | "E-jn": -3.14e+100,
195 | "B-vn": -4.3315610890163585,
196 | "S-mg": -10.825314928868044,
197 | "B-in": -3.14e+100,
198 | "M-vi": -3.14e+100,
199 | "M-an": -3.14e+100,
200 | "M-vd": -3.14e+100,
201 | "B-rg": -3.14e+100,
202 | "M-vg": -3.14e+100,
203 | "M-ad": -3.14e+100,
204 | "M-ag": -3.14e+100,
205 | "E-rg": -3.14e+100,
206 | "S-uz": -9.299258625372996,
207 | "B-en": -3.14e+100,
208 | "S-uv": -8.15808672228609,
209 | "S-df": -3.14e+100,
210 | "S-dg": -8.948397651299683,
211 | "M-qe": -3.14e+100,
212 | "B-ng": -3.14e+100,
213 | "E-bg": -3.14e+100,
214 | "S-ul": -8.4153713175535,
215 | "S-uj": -6.85251045118004,
216 | "S-ug": -7.5394037026636855,
217 | "B-ns": -2.8228438314969213,
218 | "S-ud": -7.728230161053767,
219 | "B-nt": -4.846091668182416,
220 | "B-ul": -3.14e+100,
221 | "E-in": -3.14e+100,
222 | "B-bg": -3.14e+100,
223 | "M-df": -3.14e+100,
224 | "M-dg": -3.14e+100,
225 | "M-nrt": -3.14e+100,
226 | "B-j": -5.0576191284681915,
227 | "E-ug": -3.14e+100,
228 | "E-vq": -3.14e+100,
229 | "B-vg": -3.14e+100,
230 | "B-nz": -3.94698846057672,
231 | "S-qe": -3.14e+100,
232 | "B-rz": -7.946116471570005,
233 | "B-nrfg": -5.873722175405573,
234 | "E-ad": -3.14e+100,
235 | "E-ag": -3.14e+100,
236 | "B-u": -9.163917277503234,
237 | "M-ln": -3.14e+100,
238 | "B-an": -8.697083223018778,
239 | "M-mq": -3.14e+100,
240 | "E-an": -3.14e+100,
241 | "S-s": -3.14e+100,
242 | "B-q": -6.998123858956596,
243 | "E-nrt": -3.14e+100,
244 | "B-h": -13.533365129970255,
245 | "E-r": -3.14e+100,
246 | "S-p": -2.9868401813596317,
247 | "M-tg": -3.14e+100,
248 | "S-rz": -3.14e+100,
249 | "S-nrfg": -3.14e+100,
250 | "B-vq": -12.147070768850364,
251 | "B-x": -3.14e+100,
252 | "E-vd": -3.14e+100,
253 | "E-nrfg": -3.14e+100,
254 | "S-m": -3.269200652116097,
255 | "E-vg": -3.14e+100,
256 | "S-f": -5.194820249981676,
257 | "S-j": -4.911992119644354
258 | }
--------------------------------------------------------------------------------
/Segmenter/Resources/prob_trans.json:
--------------------------------------------------------------------------------
1 | {
2 | "M": {
3 | "M": -1.2603623820268226,
4 | "E": -0.33344856811948514
5 | },
6 | "S": {
7 | "S": -0.6658631448798212,
8 | "B": -0.7211965654669841
9 | },
10 | "B": {
11 | "M": -0.916290731874155,
12 | "E": -0.51082562376599
13 | },
14 | "E": {
15 | "S": -0.8085250474669937,
16 | "B": -0.5897149736854513
17 | }
18 | }
--------------------------------------------------------------------------------
/Segmenter/Segmenter.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 | <PropertyGroup>
4 | <TargetFramework>netstandard2.0</TargetFramework>
5 | <PackageId>Lucene.JIEba.Segment</PackageId>
6 | <Version>1.0.0</Version>
7 | <Authors>SilentCC</Authors>
8 | <Description>JIEba.Lucene.Net is an analyzer tools for lucene.net which is kind to chinese</Description>
9 | <PackageRequireLicenseAcceptance>false</PackageRequireLicenseAcceptance>
10 | <PackageProjectUrl>https://github.com/SilentCC/JIEba-netcore2.0/</PackageProjectUrl>
11 | <Copyright>Copyright 2019 (c) AgileLabs. All rights reserved.</Copyright>
12 | <PackageTags>Analyzer Segment JIEba.net core2.0</PackageTags>
13 | <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
14 | </PropertyGroup>
15 |
16 | <!-- The element names above are reconstructed: the XML tags of this file were
17 | stripped during extraction and only the values survived. The ItemGroup entries
18 | (embedded Resources/* files plus package references such as Newtonsoft.Json and
19 | Microsoft.Extensions.FileProviders.Embedded, which the code requires) are not
20 | recoverable and are omitted. -->
21 |
22 | </Project>
--------------------------------------------------------------------------------
/Segmenter/Spelling/SpellChecker.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 | using JiebaNet.Segmenter.Common;
4 |
5 | namespace JiebaNet.Segmenter.Spelling
6 | {
7 | public interface ISpellChecker
8 | {
9 | IEnumerable<string> Suggests(string word);
10 | }
11 |
12 | public class SpellChecker : ISpellChecker
13 | {
14 | internal static readonly WordDictionary WordDict = WordDictionary.Instance;
15 |
16 | internal readonly Trie WordTrie;
17 | internal readonly Dictionary<char, HashSet<char>> FirstChars;
18 |
19 | public SpellChecker()
20 | {
21 | var wordDict = WordDictionary.Instance;
22 | WordTrie = new Trie();
23 | FirstChars = new Dictionary<char, HashSet<char>>();
24 |
25 | foreach (var wd in wordDict.Trie)
26 | {
27 | if (wd.Value > 0)
28 | {
29 | WordTrie.Insert(wd.Key, wd.Value);
30 |
31 | if (wd.Key.Length >= 2)
32 | {
33 | var second = wd.Key[1];
34 | var first = wd.Key[0];
35 | if (!FirstChars.ContainsKey(second))
36 | {
37 | FirstChars[second] = new HashSet<char>();
38 | }
39 | FirstChars[second].Add(first);
40 | }
41 | }
42 | }
43 | }
44 |
45 | internal ISet<string> GetEdits1(string word)
46 | {
47 | var splits = new List<WordSplit>();
48 | for (var i = 0; i <= word.Length; i++)
49 | {
50 | splits.Add(new WordSplit() { Left = word.Substring(0, i), Right = word.Substring(i) });
51 | }
52 |
53 | var deletes = splits
54 | .Where(s => !string.IsNullOrEmpty(s.Right))
55 | .Select(s => s.Left + s.Right.Substring(1));
56 |
57 | var transposes = splits
58 | .Where(s => s.Right.Length > 1)
59 | .Select(s => s.Left + s.Right[1] + s.Right[0] + s.Right.Substring(2));
60 |
61 | var replaces = new HashSet<string>();
62 | if (word.Length > 1)
63 | {
64 | var firsts = FirstChars.ContainsKey(word[1]) ? FirstChars[word[1]] : new HashSet<char>();
65 | foreach (var first in firsts)
66 | {
67 | if (first != word[0])
68 | {
69 | replaces.Add(first + word.Substring(1));
70 | }
71 | }
72 |
73 | var node = WordTrie.Root.Children.GetValueOrDefault(word[0]);
74 | for (int i = 1; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++)
75 | {
76 | foreach (var c in node.Children.Keys)
77 | {
78 | replaces.Add(word.Substring(0, i) + c + word.Substring(i + 1));
79 | }
80 | node = node.Children.GetValueOrDefault(word[i]);
81 | }
82 | }
83 |
84 | var inserts = new HashSet<string>();
85 | if (word.Length > 1)
86 | {
87 | if (FirstChars.ContainsKey(word[0]))
88 | {
89 | var firsts = FirstChars[word[0]];
90 | foreach (var first in firsts)
91 | {
92 | inserts.Add(first + word);
93 | }
94 | }
95 |
96 | var node = WordTrie.Root.Children.GetValueOrDefault(word[0]);
97 | for (int i = 0; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++)
98 | {
99 | foreach (var c in node.Children.Keys)
100 | {
101 | inserts.Add(word.Substring(0, i+1) + c + word.Substring(i+1));
102 | }
103 |
104 | if (i < word.Length - 1)
105 | {
106 | node = node.Children.GetValueOrDefault(word[i + 1]);
107 | }
108 | }
109 | }
110 |
111 | var result = new HashSet<string>();
112 | result.UnionWith(deletes);
113 | result.UnionWith(transposes);
114 | result.UnionWith(replaces);
115 | result.UnionWith(inserts);
116 |
117 | return result;
118 | }
119 |
120 | internal ISet<string> GetKnownEdits2(string word)
121 | {
122 | var result = new HashSet<string>();
123 | foreach (var e1 in GetEdits1(word))
124 | {
125 | result.UnionWith(GetEdits1(e1).Where(e => WordDictionary.Instance.ContainsWord(e)));
126 | }
127 | return result;
128 | }
129 |
130 | internal ISet<string> GetKnownWords(IEnumerable<string> words)
131 | {
132 | return new HashSet<string>(words.Where(w => WordDictionary.Instance.ContainsWord(w)));
133 | }
134 |
135 | public IEnumerable<string> Suggests(string word)
136 | {
137 | if (WordDict.ContainsWord(word))
138 | {
139 | return new[] {word};
140 | }
141 |
142 | var candidates = GetKnownWords(GetEdits1(word));
143 | if (candidates.IsNotEmpty())
144 | {
145 | return candidates.OrderByDescending(c => WordDict.GetFreqOrDefault(c));
146 | }
147 |
148 | candidates.UnionWith(GetKnownEdits2(word));
149 | return candidates.OrderByDescending(c => WordDict.GetFreqOrDefault(c));
150 | }
151 | }
152 |
153 | internal class WordSplit
154 | {
155 | public string Left { get; set; }
156 | public string Right { get; set; }
157 | }
158 | }
159 |
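A usage sketch, assuming the embedded main dictionary has loaded as above (the query word is illustrative). For an in-dictionary word the word itself comes back; otherwise edit-distance-1 candidates, then known distance-2 candidates, are ranked by dictionary frequency:

using System;
using JiebaNet.Segmenter.Spelling;

public class SpellDemo
{
    public static void Main()
    {
        // Building the checker walks the whole dictionary trie once.
        ISpellChecker checker = new SpellChecker();

        foreach (var suggestion in checker.Suggests("机器学习"))
        {
            Console.WriteLine(suggestion);
        }
    }
}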
--------------------------------------------------------------------------------
/Segmenter/Token.cs:
--------------------------------------------------------------------------------
1 | namespace JiebaNet.Segmenter
2 | {
3 | public class Token
4 | {
5 | public string Word { get; set; }
6 | public int StartIndex { get; set; }
7 | public int EndIndex { get; set; }
8 |
9 | public Token(string word, int startIndex, int endIndex)
10 | {
11 | Word = word;
12 | StartIndex = startIndex;
13 | EndIndex = endIndex;
14 | }
15 |
16 | public override string ToString()
17 | {
18 | return string.Format("[{0}, ({1}, {2})]", Word, StartIndex, EndIndex);
19 | }
20 | }
21 | }
--------------------------------------------------------------------------------
/Segmenter/WordDictionary.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.IO;
5 | using System.Linq;
6 | using System.Text;
7 | using JiebaNet.Segmenter.Common;
8 | using Microsoft.Extensions.FileProviders;
9 | using System.Reflection;
10 |
11 | namespace JiebaNet.Segmenter
12 | {
13 | public class WordDictionary
14 | {
15 | private static readonly Lazy<WordDictionary> lazy = new Lazy<WordDictionary>(() => new WordDictionary());
16 | private static readonly string MainDict = ConfigManager.MainDictFile;
17 |
18 | internal IDictionary<string, int> Trie = new Dictionary<string, int>();
19 |
20 | /// <summary>
21 | /// Total occurrence of all words.
22 | /// </summary>
23 | public double Total { get; set; }
24 |
25 | private WordDictionary()
26 | {
27 | LoadDict();
28 |
29 | Debug.WriteLine("{0} words (and their prefixes)", Trie.Count);
30 | Debug.WriteLine("total freq: {0}", Total);
31 | }
32 |
33 | public static WordDictionary Instance
34 | {
35 | get { return lazy.Value; }
36 | }
37 |
38 | private void LoadDict()
39 | {
40 | try
41 | {
42 | var stopWatch = new Stopwatch();
43 | stopWatch.Start();
44 | var filePath = ConfigManager.MainDictFile;
45 | var provider = new EmbeddedFileProvider(GetType().GetTypeInfo().Assembly);
46 | var fileInfo = provider.GetFileInfo(filePath);
47 | using (var sr = new StreamReader(fileInfo.CreateReadStream(), Encoding.UTF8))
48 | {
49 | string line = null;
50 | while ((line = sr.ReadLine()) != null)
51 | {
52 | var tokens = line.Split(' ');
53 | if (tokens.Length < 2)
54 | {
55 | Debug.Fail(string.Format("Invalid line: {0}", line));
56 | continue;
57 | }
58 |
59 | var word = tokens[0];
60 | var freq = int.Parse(tokens[1]);
61 |
62 | Trie[word] = freq;
63 | Total += freq;
64 |
65 | foreach (var ch in Enumerable.Range(0, word.Length))
66 | {
67 | var wfrag = word.Sub(0, ch + 1);
68 | if (!Trie.ContainsKey(wfrag))
69 | {
70 | Trie[wfrag] = 0;
71 | }
72 | }
73 | }
74 | }
75 |
76 | stopWatch.Stop();
77 | Debug.WriteLine("main dict load finished, time elapsed {0} ms", stopWatch.ElapsedMilliseconds);
78 | }
79 | catch (IOException e)
80 | {
81 | Debug.Fail(string.Format("{0} load failure, reason: {1}", MainDict, e.Message));
82 | }
83 | catch (FormatException fe)
84 | {
85 | Debug.Fail(fe.Message);
86 | }
87 | }
88 |
89 | public bool ContainsWord(string word)
90 | {
91 | return Trie.ContainsKey(word) && Trie[word] > 0;
92 | }
93 |
94 | public int GetFreqOrDefault(string key)
95 | {
96 | if (ContainsWord(key))
97 | return Trie[key];
98 | else
99 | return 1;
100 | }
101 |
102 | public void AddWord(string word, int freq, string tag = null)
103 | {
104 | if (ContainsWord(word))
105 | {
106 | Total -= Trie[word];
107 | }
108 |
109 | Trie[word] = freq;
110 | Total += freq;
111 | for (var i = 0; i < word.Length; i++)
112 | {
113 | var wfrag = word.Substring(0, i + 1);
114 | if (!Trie.ContainsKey(wfrag))
115 | {
116 | Trie[wfrag] = 0;
117 | }
118 | }
119 | }
120 |
121 | public void DeleteWord(string word)
122 | {
123 | AddWord(word, 0);
124 | }
125 |
126 | internal int SuggestFreq(string word, IEnumerable<string> segments)
127 | {
128 | double freq = 1;
129 | foreach (var seg in segments)
130 | {
131 | freq *= GetFreqOrDefault(seg) / Total;
132 | }
133 |
134 | return Math.Max((int)(freq * Total) + 1, GetFreqOrDefault(word));
135 | }
136 | }
137 | }
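A sketch of extending the dictionary at runtime through the JiebaSegmenter wrappers shown earlier (the word is illustrative). With freq omitted, AddWord asks SuggestFreq for the smallest frequency that lets the word beat its own sub-segments in the DAG, roughly Total * Π(freq(seg) / Total) + 1:

using System;
using JiebaNet.Segmenter;

public class DictDemo
{
    public static void Main()
    {
        var segmenter = new JiebaSegmenter();

        // Frequency omitted: SuggestFreq picks one that wins over the sub-words.
        segmenter.AddWord("杭研");
        Console.WriteLine(string.Join("/", segmenter.Cut("他来到了网易杭研大厦")));

        // DeleteWord just sets the frequency back to 0, so the word stops matching.
        segmenter.DeleteWord("杭研");
    }
}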
--------------------------------------------------------------------------------
/Segmenter/WordInfo.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | namespace JiebaNet.Segmenter
3 | {
4 | public class WordInfo
5 | {
6 | public WordInfo(string value,int position)
7 | {
8 | this.value = value;
9 | this.position = position;
10 | }
11 | // The text of the segmented word
12 | public string value { get; set; }
13 | // The start position of the word in the original text
14 | public int position { get; set; }
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Test/SegmentTest.cs:
--------------------------------------------------------------------------------
1 | using JiebaNet.Segmenter;
2 | using System;
3 | using System.Text;
4 | using System.Collections.Generic;
5 | using System.IO;
6 | using System.Linq;
7 | using jieba.NET;
8 | using Xunit;
9 |
10 | namespace Test
11 | {
12 | public class SegmenterTest
13 | {
14 | [Fact]
15 | public void TestCut()
16 | {
17 | var segmenter = new JiebaSegmenter();
18 | var segments = segmenter.Cut("我来到北京清华大学", cutAll: true);
19 |
20 | var resultWords = new List<string> {"我", "来到", "北京", "清华", "清华大学", "华大", "大学"};
21 | Compared(segments, resultWords);
22 |
23 | segments = segmenter.Cut("我来到北京清华大学");
24 | resultWords = new List<string> { "我", "来到", "北京", "清华大学" };
25 | Compared(segments, resultWords);
26 |
27 | segments = segmenter.Cut("他来到了网易杭研大厦"); // Accurate mode by default, which also uses the HMM model
28 | resultWords = new List<string> {"他", "来到", "了", "网易", "杭研", "大厦"};
29 | Compared(segments, resultWords);
30 |
31 | segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // Search-engine mode
32 | resultWords = new List<string> {"小明", "硕士", "毕业", "于", "中国", "科学", "学院", "科学院", "中国科学院", "计算", "计算所", ",", "后"
33 | , "在", "日本", "京都", "大学", "日本京都大学", "深造"};
34 | Compared(segments, resultWords);
35 |
36 | segments = segmenter.Cut("结过婚的和尚未结过婚的");
37 | resultWords = new List<string> {"结过婚", "的", "和", "尚未", "结过婚", "的"};
38 |
39 | Compared(segments, resultWords);
40 |
41 | segments = segmenter.Cut("快奔三", false, false);
42 | resultWords = new List<string> {"快", "奔三"};
43 |
44 | Compared(segments, resultWords);
45 | }
46 |
47 | private void Compared(IEnumerable<string> segments, List<string> resultWords)
48 | {
49 | Assert.Equal(segments.Count(), resultWords.Count());
50 | for (int i = 0; i < segments.Count(); i++)
51 | {
52 | Assert.Equal(segments.ElementAt(i),resultWords[i]);
53 | }
54 | }
55 |
56 | [Fact]
57 | public void TestNewCut()
58 | {
59 | var segmenter = new JiebaSegmenter();
60 |
61 | var wordInfos = segmenter.Cut2("推荐系统终于发布了最终的版本,点击率蹭蹭上涨");
62 |
63 | Assert.Equal(wordInfos.ElementAt(0).position, 0);
64 | for (int i = 1; i < wordInfos.Count(); i++)
65 | {
66 | Assert.Equal(wordInfos.ElementAt(i).position,
67 | wordInfos.ElementAt(i - 1).position + wordInfos.ElementAt(i - 1).value.Length);
68 | }
69 | }
70 |
71 | [Fact]
72 | public void TestJIEbaTokenizer()
73 | {
74 | var tokenizer = new JieBaTokenizer(TextReader.Null, TokenizerMode.Default);
75 |
76 | Assert.NotEmpty(tokenizer.StopWords);
77 |
78 | Assert.True(tokenizer.StopWords.ContainsKey("是"));
79 | Assert.True(tokenizer.StopWords.ContainsKey("什么"));
80 |
81 | }
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/Test/Test.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 | <PropertyGroup>
4 | <TargetFramework>netcoreapp2.2</TargetFramework>
5 | <IsPackable>false</IsPackable>
6 | </PropertyGroup>
7 |
8 | <!-- Element names above are reconstructed; the XML tags of this file were
9 | stripped during extraction. The ItemGroup entries (xunit packages and the
10 | project references) are not recoverable; one surviving PackageReference
11 | carried the standard analyzer asset metadata: -->
12 | <!--
13 | <PrivateAssets>all</PrivateAssets>
14 | <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
15 | -->
16 |
17 | </Project>
--------------------------------------------------------------------------------
/jieba.NET.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.0.32126.317
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "jieba.NET", "jieba.NET\jieba.NET.csproj", "{89EFA758-206C-4681-ACF6-6F2AB2415279}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Analyser", "Analyser\Analyser.csproj", "{4F0DEF27-C5FE-448F-9B08-F8C2254A1075}"
9 | EndProject
10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Segmenter", "Segmenter\Segmenter.csproj", "{C564CDCB-B52B-455E-86E9-FC0DAE37EF08}"
11 | EndProject
12 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ConsoleApp1", "ConsoleApp1\ConsoleApp1.csproj", "{C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}"
13 | EndProject
14 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "EasyLuceneNET", "EasyLuceneNET\EasyLuceneNET.csproj", "{5458D618-C3FA-4B19-B1AF-7950F789AA14}"
15 | EndProject
16 | Global
17 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
18 | Debug|Any CPU = Debug|Any CPU
19 | Release|Any CPU = Release|Any CPU
20 | EndGlobalSection
21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
22 | {89EFA758-206C-4681-ACF6-6F2AB2415279}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
23 | {89EFA758-206C-4681-ACF6-6F2AB2415279}.Debug|Any CPU.Build.0 = Debug|Any CPU
24 | {89EFA758-206C-4681-ACF6-6F2AB2415279}.Release|Any CPU.ActiveCfg = Release|Any CPU
25 | {89EFA758-206C-4681-ACF6-6F2AB2415279}.Release|Any CPU.Build.0 = Release|Any CPU
26 | {4F0DEF27-C5FE-448F-9B08-F8C2254A1075}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
27 | {4F0DEF27-C5FE-448F-9B08-F8C2254A1075}.Debug|Any CPU.Build.0 = Debug|Any CPU
28 | {4F0DEF27-C5FE-448F-9B08-F8C2254A1075}.Release|Any CPU.ActiveCfg = Release|Any CPU
29 | {4F0DEF27-C5FE-448F-9B08-F8C2254A1075}.Release|Any CPU.Build.0 = Release|Any CPU
30 | {C564CDCB-B52B-455E-86E9-FC0DAE37EF08}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
31 | {C564CDCB-B52B-455E-86E9-FC0DAE37EF08}.Debug|Any CPU.Build.0 = Debug|Any CPU
32 | {C564CDCB-B52B-455E-86E9-FC0DAE37EF08}.Release|Any CPU.ActiveCfg = Release|Any CPU
33 | {C564CDCB-B52B-455E-86E9-FC0DAE37EF08}.Release|Any CPU.Build.0 = Release|Any CPU
34 | {C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
35 | {C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}.Debug|Any CPU.Build.0 = Debug|Any CPU
36 | {C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}.Release|Any CPU.ActiveCfg = Release|Any CPU
37 | {C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}.Release|Any CPU.Build.0 = Release|Any CPU
38 | {5458D618-C3FA-4B19-B1AF-7950F789AA14}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
39 | {5458D618-C3FA-4B19-B1AF-7950F789AA14}.Debug|Any CPU.Build.0 = Debug|Any CPU
40 | {5458D618-C3FA-4B19-B1AF-7950F789AA14}.Release|Any CPU.ActiveCfg = Release|Any CPU
41 | {5458D618-C3FA-4B19-B1AF-7950F789AA14}.Release|Any CPU.Build.0 = Release|Any CPU
42 | EndGlobalSection
43 | GlobalSection(SolutionProperties) = preSolution
44 | HideSolutionNode = FALSE
45 | EndGlobalSection
46 | GlobalSection(ExtensibilityGlobals) = postSolution
47 | SolutionGuid = {4A38C532-715A-4F73-8690-CF9424A2EABE}
48 | EndGlobalSection
49 | EndGlobal
50 |
--------------------------------------------------------------------------------
/jieba.NET/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coolqingcheng/EasyLuceneNET/60d445d1e91e1864b31c7c4013fe105e70544f8f/jieba.NET/.DS_Store
--------------------------------------------------------------------------------
/jieba.NET/JieBaAnalyzer.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using Lucene.Net.Analysis;
3 | using Lucene.Net.Analysis.Core;
4 | using Lucene.Net.Analysis.TokenAttributes;
5 | using Lucene.Net.Analysis.Util;
6 | using System.IO;
7 | using JiebaNet.Segmenter;
8 |
9 |
10 | namespace jieba.NET
11 | {
12 | public class JieBaAnalyzer
13 | :Analyzer
14 | {
15 | public TokenizerMode mode;
16 | public JieBaAnalyzer(TokenizerMode Mode)
17 | :base()
18 | {
19 | this.mode = Mode;
20 | }
21 |
22 | protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
23 | {
24 | var tokenizer = new JieBaTokenizer(reader,mode);
25 |
26 | var tokenstream = (TokenStream)new LowerCaseFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, tokenizer);
27 |
28 | tokenstream.AddAttribute<ICharTermAttribute>();
29 | tokenstream.AddAttribute<IOffsetAttribute>();
30 |
31 | return new TokenStreamComponents(tokenizer, tokenstream);
32 | }
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
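A minimal indexing sketch, not a file in this repository, showing how JieBaAnalyzer plugs into a standard Lucene.Net 4.8 pipeline. The index path, field name, and sample text are illustrative, and TokenizerMode.Search is one of the segmenter's tokenizer modes.

    using JiebaNet.Segmenter;
    using Lucene.Net.Documents;
    using Lucene.Net.Index;
    using Lucene.Net.Store;
    using Lucene.Net.Util;
    using jieba.NET;

    var analyzer = new JieBaAnalyzer(TokenizerMode.Search);
    using (var dir = FSDirectory.Open("lucene-index"))
    using (var writer = new IndexWriter(dir, new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)))
    {
        var doc = new Document();
        // TextField values are run through the analyzer at index time.
        doc.Add(new TextField("content", "今天天气真好", Field.Store.YES));
        writer.AddDocument(doc);
        writer.Commit();
    }
--------------------------------------------------------------------------------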
/jieba.NET/JieBaTokenizer.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using Lucene.Net.Analysis.TokenAttributes;
3 | using Lucene.Net.Analysis;
4 | using JiebaNet.Segmenter;
5 | using System.IO;
6 | using System.Collections.Generic;
7 | using System.Reflection;
8 | using Microsoft.Extensions.FileProviders;
9 |
10 | namespace jieba.NET
11 | {
12 | public class JieBaTokenizer
13 | : Tokenizer
14 | {
15 |         private string _inputText;
16 |         private int _start = 0;
17 | 
18 |         private readonly string _stopWordsPath = "Resources/stopwords.txt";
19 | 
20 |         private readonly JiebaSegmenter _segmenter;
21 |         private readonly TokenizerMode _mode;
22 |         private ICharTermAttribute _termAtt;
23 |         private IOffsetAttribute _offsetAtt;
24 |         private IPositionIncrementAttribute _posIncrAtt;
25 |         private ITypeAttribute _typeAtt;
26 | 
27 |         // JiebaNet.Segmenter.Token is fully qualified here because Lucene.Net.Analysis also defines a Token type.
28 |         private readonly Dictionary<string, int> _stopWords = new Dictionary<string, int>();
29 |         private readonly List<JiebaNet.Segmenter.Token> _wordList = new List<JiebaNet.Segmenter.Token>();
30 | 
31 |         private IEnumerator<JiebaNet.Segmenter.Token> _iter;
32 | 
33 |
34 |         public JieBaTokenizer(TextReader input, TokenizerMode mode)
35 |             : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
36 |         {
37 |             _segmenter = new JiebaSegmenter();
38 |             _mode = mode;
39 |             LoadStopWords();
40 |             Init();
41 |         }
42 |
43 |         public Dictionary<string, int> StopWords
44 | {
45 | get => _stopWords;
46 | }
47 |
48 | private void LoadStopWords()
49 | {
50 | var fileProvider = new EmbeddedFileProvider(GetType().GetTypeInfo().Assembly);
51 |             var fileInfo = fileProvider.GetFileInfo(_stopWordsPath);
52 |
53 | using (var reader = new StreamReader(fileInfo.CreateReadStream()))
54 | {
55 | var s = "";
56 | while ((s = reader.ReadLine()) != null)
57 | {
58 | if (String.IsNullOrEmpty(s))
59 | continue;
60 | if (_stopWords.ContainsKey(s))
61 | continue;
62 | _stopWords.Add(s, 1);
63 | }
64 | }
65 | }
66 |
67 | private void Init()
68 | {
69 |             _termAtt = AddAttribute<ICharTermAttribute>();
70 |             _offsetAtt = AddAttribute<IOffsetAttribute>();
71 |             _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
72 |             _typeAtt = AddAttribute<ITypeAttribute>();
73 | }
74 |
75 | private string ReadToEnd(TextReader input)
76 | {
77 | return input.ReadToEnd();
78 | }
79 |
80 |         public sealed override bool IncrementToken()
81 | {
82 | ClearAttributes();
83 |
84 | var word = Next();
85 | if (word != null)
86 | {
87 | var buffer = word.ToString();
88 | _termAtt.SetEmpty().Append(buffer);
89 | _offsetAtt.SetOffset(CorrectOffset(word.StartOffset), CorrectOffset(word.EndOffset));
90 | _typeAtt.Type = word.Type;
91 | return true;
92 | }
93 |
94 |             End();
95 |             // Disposal is left to the caller, since Lucene reuses tokenizer instances after Reset().
96 |             return false;
97 | }
98 |
99 | private Lucene.Net.Analysis.Token Next()
100 | {
101 |             var res = _iter.MoveNext();
102 |             if (res)
103 |             {
104 |                 var word = _iter.Current;
105 |                 // Wrap the segmenter token as a Lucene token, preserving its original offsets.
106 |                 var token = new Lucene.Net.Analysis.Token(word.Word, word.StartIndex, word.EndIndex);
107 |                 return token;
108 |             }
109 | }
110 | return null;
111 | }
112 |
113 | public override void Reset()
114 | {
115 | base.Reset();
116 |             // Segment the entire input up front; IncrementToken() then replays _wordList.
117 | _inputText = ReadToEnd(base.m_input);
118 | RemoveStopWords(_segmenter.Tokenize(_inputText, _mode));
119 |
120 | _start = 0;
121 | _iter = _wordList.GetEnumerator();
122 | }
123 |
124 |         private void RemoveStopWords(IEnumerable<JiebaNet.Segmenter.Token> words)
125 | {
126 | _wordList.Clear();
127 |
128 | foreach (var x in words)
129 | {
130 | if (!_stopWords.ContainsKey(x.Word))
131 | {
132 | _wordList.Add(x);
133 | }
134 | }
135 | }
136 | }
137 | }
--------------------------------------------------------------------------------
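A small consumption sketch, likewise not part of the repository, that drives JieBaTokenizer by hand and reads back the attributes it registers in Init(). The sample sentence is arbitrary.

    using System;
    using System.IO;
    using JiebaNet.Segmenter;
    using Lucene.Net.Analysis.TokenAttributes;
    using jieba.NET;

    var tokenizer = new JieBaTokenizer(new StringReader("小明硕士毕业于中国科学院计算所"), TokenizerMode.Default);
    var termAtt = tokenizer.GetAttribute<ICharTermAttribute>();
    var offsetAtt = tokenizer.GetAttribute<IOffsetAttribute>();

    tokenizer.Reset();                      // Reset() segments the whole input up front
    while (tokenizer.IncrementToken())      // each call replays one segmented word
    {
        Console.WriteLine($"{termAtt} [{offsetAtt.StartOffset}, {offsetAtt.EndOffset})");
    }
    tokenizer.End();
    tokenizer.Dispose();
--------------------------------------------------------------------------------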
/jieba.NET/Resources/stopwords.txt:
--------------------------------------------------------------------------------
1 | i
2 | me
3 | my
4 | myself
5 | we
6 | our
7 | ours
8 | ourselves
9 | you
10 | your
11 | yours
12 | yourself
13 | yourselves
14 | he
15 | him
16 | his
17 | himself
18 | she
19 | her
20 | hers
21 | herself
22 | it
23 | its
24 | itself
25 | they
26 | them
27 | their
28 | theirs
29 | themselves
30 | what
31 | which
32 | who
33 | whom
34 | this
35 | that
36 | these
37 | those
38 | am
39 | is
40 | are
41 | was
42 | were
43 | be
44 | been
45 | being
46 | have
47 | has
48 | had
49 | having
50 | do
51 | does
52 | did
53 | doing
54 | a
55 | an
56 | the
57 | and
58 | but
59 | if
60 | or
61 | because
62 | as
63 | until
64 | while
65 | of
66 | at
67 | by
68 | for
69 | with
70 | about
71 | against
72 | between
73 | into
74 | through
75 | during
76 | before
77 | after
78 | above
79 | below
80 | to
81 | from
82 | up
83 | down
84 | in
85 | out
86 | on
87 | off
88 | over
89 | under
90 | again
91 | further
92 | then
93 | once
94 | here
95 | there
96 | when
97 | where
98 | why
99 | how
100 | all
101 | any
102 | both
103 | each
104 | few
105 | more
106 | most
107 | other
108 | some
109 | such
110 | no
111 | nor
112 | not
113 | only
114 | own
115 | same
116 | so
117 | than
118 | too
119 | very
120 | s
121 | t
122 | can
123 | will
124 | just
125 | don
126 | should
127 | now
128 | 一番
129 | 一直
130 | 一个
131 | 一些
132 | 许多
133 | 种
134 | 有的是
135 | 也就是说
136 | 阿
137 | 哎呀
138 | 哎哟
139 | 俺
140 | 俺们
141 | 按
142 | 按照
143 | 吧
144 | 吧哒
145 | 把
146 | 罢了
147 | 被
148 | 本
149 | 本着
150 | 比
151 | 比方
152 | 比如
153 | 鄙人
154 | 彼
155 | 彼此
156 | 边
157 | 别
158 | 别的
159 | 别说
160 | 并
161 | 并且
162 | 不比
163 | 不成
164 | 不单
165 | 不但
166 | 不独
167 | 不管
168 | 不光
169 | 不过
170 | 不仅
171 | 不拘
172 | 不论
173 | 不怕
174 | 不然
175 | 不如
176 | 不特
177 | 不惟
178 | 不问
179 | 不只
180 | 朝
181 | 朝着
182 | 趁
183 | 趁着
184 | 乘
185 | 冲
186 | 除
187 | 除此之外
188 | 除非
189 | 除了
190 | 此
191 | 此间
192 | 此外
193 | 从
194 | 从而
195 | 打
196 | 待
197 | 但
198 | 但是
199 | 当
200 | 当着
201 | 到
202 | 得
203 | 的
204 | 的话
205 | 等
206 | 等等
207 | 地
208 | 第
209 | 叮咚
210 | 对
211 | 对于
212 | 多
213 | 多少
214 | 而
215 | 而况
216 | 而且
217 | 而是
218 | 而外
219 | 而言
220 | 而已
221 | 尔后
222 | 反过来
223 | 反过来说
224 | 反之
225 | 非但
226 | 非徒
227 | 否则
228 | 嘎
229 | 嘎登
230 | 该
231 | 赶
232 | 个
233 | 各
234 | 各个
235 | 各位
236 | 各种
237 | 各自
238 | 给
239 | 根据
240 | 跟
241 | 故
242 | 故此
243 | 固然
244 | 关于
245 | 管
246 | 归
247 | 果然
248 | 果真
249 | 过
250 | 和
251 | 何
252 | 何处
253 | 何况
254 | 何时
255 | 嘿
256 | 哼
257 | 哼唷
258 | 呼哧
259 | 乎
260 | 哗
261 | 还是
262 | 还有
263 | 换句话说
264 | 换言之
265 | 或
266 | 或是
267 | 或者
268 | 极了
269 | 及
270 | 及其
271 | 及至
272 | 即
273 | 即便
274 | 即或
275 | 即令
276 | 即若
277 | 即使
278 | 几
279 | 几时
280 | 己
281 | 既
282 | 既然
283 | 既是
284 | 继而
285 | 加之
286 | 假如
287 | 假若
288 | 假使
289 | 鉴于
290 | 将
291 | 较
292 | 较之
293 | 叫
294 | 接着
295 | 结果
296 | 借
297 | 紧接着
298 | 进而
299 | 尽
300 | 尽管
301 | 经
302 | 经过
303 | 就
304 | 就是
305 | 就是说
306 | 据
307 | 具体地说
308 | 具体说来
309 | 开始
310 | 开外
311 | 靠
312 | 咳
313 | 可
314 | 可见
315 | 可是
316 | 可以
317 | 况且
318 | 啦
319 | 来
320 | 来着
321 | 离
322 | 例如
323 | 哩
324 | 连
325 | 连同
326 | 两者
327 | 了
328 | 临
329 | 另
330 | 另外
331 | 另一方面
332 | 论
333 | 嘛
334 | 吗
335 | 慢说
336 | 漫说
337 | 冒
338 | 么
339 | 每
340 | 每当
341 | 们
342 | 莫若
343 | 某
344 | 某个
345 | 某些
346 | 拿
347 | 哪
348 | 哪边
349 | 哪儿
350 | 哪个
351 | 哪里
352 | 哪年
353 | 哪怕
354 | 哪天
355 | 哪些
356 | 哪样
357 | 那
358 | 那边
359 | 那儿
360 | 那个
361 | 那会儿
362 | 那里
363 | 那么
364 | 那么些
365 | 那么样
366 | 那时
367 | 那些
368 | 那样
369 | 乃
370 | 乃至
371 | 呢
372 | 能
373 | 你
374 | 你们
375 | 您
376 | 宁
377 | 宁可
378 | 宁肯
379 | 宁愿
380 | 哦
381 | 啪达
382 | 旁人
383 | 凭
384 | 凭借
385 | 其
386 | 其次
387 | 其二
388 | 其他
389 | 其它
390 | 其一
391 | 其余
392 | 其中
393 | 起
394 | 起见
395 | 起见
396 | 岂但
397 | 恰恰相反
398 | 前后
399 | 前者
400 | 且
401 | 然而
402 | 然后
403 | 然则
404 | 让
405 | 人家
406 | 任
407 | 任何
408 | 任凭
409 | 如
410 | 如此
411 | 如果
412 | 如何
413 | 如其
414 | 如若
415 | 如上所述
416 | 若
417 | 若非
418 | 若是
419 | 啥
420 | 上下
421 | 尚且
422 | 设若
423 | 设使
424 | 甚而
425 | 甚么
426 | 甚至
427 | 省得
428 | 时候
429 | 什么
430 | 什么样
431 | 使得
432 | 是
433 | 是的
434 | 首先
435 | 谁
436 | 顺
437 | 顺着
438 | 似的
439 | 虽
440 | 虽然
441 | 虽说
442 | 虽则
443 | 随
444 | 随着
445 | 所
446 | 所以
447 | 他
448 | 他们
449 | 他人
450 | 它
451 | 它们
452 | 她
453 | 她们
454 | 倘
455 | 倘或
456 | 倘然
457 | 倘若
458 | 倘使
459 | 腾
460 | 替
461 | 通过
462 | 同
463 | 同时
464 | 哇
465 | 万一
466 | 往
467 | 望
468 | 为
469 | 为何
470 | 为了
471 | 为什么
472 | 为着
473 | 喂
474 | 嗡嗡
475 | 我
476 | 我们
477 | 呜
478 | 呜呼
479 | 乌乎
480 | 无论
481 | 无宁
482 | 毋宁
483 | 嘻
484 | 吓
485 | 相对而言
486 | 像
487 | 向
488 | 向着
489 | 嘘
490 | 焉
491 | 沿
492 | 沿着
493 | 要
494 | 要不
495 | 要不然
496 | 要不是
497 | 要么
498 | 要是
499 | 也
500 | 也罢
501 | 也好
502 | 一
503 | 一旦
504 | 一方面
505 | 一来
506 | 一切
507 | 一样
508 | 一则
509 | 依
510 | 依照
511 | 矣
512 | 以
513 | 以便
514 | 以及
515 | 以免
516 | 以至
517 | 以至于
518 | 以致
519 | 抑或
520 | 因
521 | 因此
522 | 因而
523 | 因为
524 | 用
525 | 由
526 | 由此可见
527 | 由于
528 | 有
529 | 有的
530 | 有关
531 | 有些
532 | 又
533 | 于
534 | 于是
535 | 于是乎
536 | 与
537 | 与此同时
538 | 与否
539 | 与其
540 | 越是
541 | 云云
542 | 哉
543 | 再说
544 | 再者
545 | 在
546 | 在下
547 | 咱
548 | 咱们
549 | 则
550 | 怎
551 | 怎么办
552 | 怎么样
553 | 咋
554 | 照
555 | 照着
556 | 者
557 | 这
558 | 这边
559 | 这儿
560 | 这个
561 | 这会儿
562 | 这就是说
563 | 这里
564 | 这么
565 | 这么点儿
566 | 这么些
567 | 这么样
568 | 这时
569 | 这些
570 | 这样
571 | 正如
572 | 吱
573 | 之
574 | 之类
575 | 之所以
576 | 之一
577 | 只是
578 | 只限
579 | 只要
580 | 只有
581 | 至
582 | 至于
583 | 诸位
584 | 着
585 | 着呢
586 | 自
587 | 自从
588 | 自个儿
589 | 自各儿
590 | 自己
591 | 自家
592 | 自身
593 | 综上所述
594 | 总的来看
595 | 总的来说
596 | 总的说来
597 | 总而言之
598 | 总之
599 | 纵
600 | 纵令
601 | 纵然
602 | 纵使
603 | 遵照
604 | 作为
605 | 兮
606 | 呗
607 | 咚
608 | 咦
609 | 喏
610 | 啐
611 | 喔唷
612 | 嗬
613 | 嗯
614 | 嗳
615 | 。
616 | ,
617 | :
618 | ;
619 | 、
620 | “
621 | ”
622 | 【
623 | 】
624 | 《
625 | 》
626 | (
627 | )
628 | —
629 | …
630 | .
631 | ,
632 | :
633 | ;
634 | "
635 | "
636 | [
637 | ]
638 | <
639 | >
640 | (
641 | )
642 | @
643 | #
644 | *
645 | &
646 | %
647 | ¥
648 | $
649 | -
650 | +
651 | =
652 | |
653 | \
654 |
--------------------------------------------------------------------------------
/jieba.NET/jieba.NET.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 | 
3 |   <!-- The element markup of this file did not survive capture. Tag names below are
4 |        reconstructed around the surviving values, and the item groups are inferred
5 |        from what the code uses: the embedded stop-word list, the sibling Segmenter
6 |        project, and package references to Lucene.Net 4.8 (beta) and
7 |        Microsoft.Extensions.FileProviders.Embedded, whose exact versions are unknown. -->
8 |   <PropertyGroup>
9 |     <TargetFramework>netstandard2.0</TargetFramework>
10 |     <PackageId>Lucene.JIEba.net</PackageId>
11 |     <Version>1.1.1</Version>
12 |     <Authors>SilentCC</Authors>
13 |     <Description>JIEba.Lucene.Net is an analyzer for Lucene.Net that is friendly to Chinese text.</Description>
14 |     <PackageRequireLicenseAcceptance>false</PackageRequireLicenseAcceptance>
15 |     <PackageProjectUrl>https://github.com/SilentCC/JIEba-netcore2.0/</PackageProjectUrl>
16 |     <Copyright>Copyright 2019 (c) AgileLabs. All rights reserved.</Copyright>
17 |     <PackageTags>Analyzer Segment JIEba.net core2.0</PackageTags>
18 |     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
19 |   </PropertyGroup>
20 | 
21 |   <ItemGroup>
22 |     <EmbeddedResource Include="Resources\stopwords.txt" />
23 |   </ItemGroup>
24 | 
25 |   <ItemGroup>
26 |     <ProjectReference Include="..\Segmenter\Segmenter.csproj" />
27 |   </ItemGroup>
28 | 
29 | </Project>
30 | 
--------------------------------------------------------------------------------