├── .gitattributes ├── .gitignore ├── Analyser ├── .DS_Store ├── Analyser.csproj ├── ConfigManager.cs ├── IdfLoader.cs ├── KeywordExtractor.cs ├── Resources │ ├── idf.txt │ └── stopwords.txt ├── TextRankExtractor.cs ├── TfidfExtractor.cs └── UndirectWeightedGraph.cs ├── ConsoleApp1 ├── Article.cs ├── ConsoleApp1.csproj └── Program.cs ├── EasyLuceneNET ├── EasyLuceneNET.csproj ├── EasyLuceneNetDefaultProvider.cs ├── EasyLuceneNetExtensions.cs └── IEasyLuceneNet.cs ├── LICENSE ├── README.md ├── Segmenter ├── .DS_Store ├── Common │ ├── Extensions.cs │ ├── FileExtension.cs │ └── Trie.cs ├── ConfigManager.cs ├── Constants.cs ├── DefaultDictionary.cs ├── FinalSeg │ ├── IFinalSeg.cs │ └── Viterbi.cs ├── JiebaSegmenter.cs ├── Node.cs ├── Pair.cs ├── PosSeg │ ├── Pair.cs │ ├── PosSegmenter.cs │ └── Viterbi.cs ├── Resources │ ├── char_state_tab.json │ ├── dict.txt │ ├── pos_prob_emit.json │ ├── pos_prob_start.json │ ├── pos_prob_trans.json │ ├── prob_emit.json │ └── prob_trans.json ├── Segmenter.csproj ├── Spelling │ └── SpellChecker.cs ├── Token.cs ├── WordDictionary.cs └── WordInfo.cs ├── Test ├── SegmentTest.cs └── Test.csproj ├── jieba.NET.sln └── jieba.NET ├── .DS_Store ├── JieBaAnalyzer.cs ├── JieBaTokenizer.cs ├── Resources └── stopwords.txt └── jieba.NET.csproj /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Build results 17 | [Dd]ebug/ 18 | [Dd]ebugPublic/ 19 | [Rr]elease/ 20 | [Rr]eleases/ 21 | x64/ 22 | x86/ 23 | bld/ 24 | [Bb]in/ 25 | [Oo]bj/ 26 | [Ll]og/ 27 | 28 | # Visual Studio 2015/2017 cache/options directory 29 | .vs/ 30 | # Uncomment if you have tasks that create the project's static files in wwwroot 31 | #wwwroot/ 32 | 33 | # Visual Studio 2017 auto generated files 34 | Generated\ Files/ 35 | 36 | # MSTest test Results 37 | [Tt]est[Rr]esult*/ 38 | [Bb]uild[Ll]og.* 39 | 40 | # NUNIT 41 | *.VisualState.xml 42 | TestResult.xml 43 | 44 | # Build Results of an ATL Project 45 | [Dd]ebugPS/ 46 | [Rr]eleasePS/ 47 | dlldata.c 48 | 49 | # Benchmark Results 50 | BenchmarkDotNet.Artifacts/ 51 | 52 | # .NET Core 53 | project.lock.json 54 | project.fragment.lock.json 55 | artifacts/ 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_h.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *_wpftmp.csproj 81 | *.log 82 | *.vspscc 83 | *.vssscc 84 | .builds 85 | *.pidb 86 | *.svclog 87 | *.scc 88 | 89 | # Chutzpah Test files 90 | _Chutzpah* 91 | 92 | # Visual C++ cache files 93 | ipch/ 94 | *.aps 95 | *.ncb 96 | *.opendb 97 | *.opensdf 98 | *.sdf 99 | *.cachefile 100 | *.VC.db 101 | *.VC.VC.opendb 102 | 103 | # Visual Studio profiler 104 | *.psess 105 | *.vsp 106 | *.vspx 107 | *.sap 108 | 109 
| # Visual Studio Trace Files 110 | *.e2e 111 | 112 | # TFS 2012 Local Workspace 113 | $tf/ 114 | 115 | # Guidance Automation Toolkit 116 | *.gpState 117 | 118 | # ReSharper is a .NET coding add-in 119 | _ReSharper*/ 120 | *.[Rr]e[Ss]harper 121 | *.DotSettings.user 122 | 123 | # JustCode is a .NET coding add-in 124 | .JustCode 125 | 126 | # TeamCity is a build add-in 127 | _TeamCity* 128 | 129 | # DotCover is a Code Coverage Tool 130 | *.dotCover 131 | 132 | # AxoCover is a Code Coverage Tool 133 | .axoCover/* 134 | !.axoCover/settings.json 135 | 136 | # Visual Studio code coverage results 137 | *.coverage 138 | *.coveragexml 139 | 140 | # NCrunch 141 | _NCrunch_* 142 | .*crunch*.local.xml 143 | nCrunchTemp_* 144 | 145 | # MightyMoose 146 | *.mm.* 147 | AutoTest.Net/ 148 | 149 | # Web workbench (sass) 150 | .sass-cache/ 151 | 152 | # Installshield output folder 153 | [Ee]xpress/ 154 | 155 | # DocProject is a documentation generator add-in 156 | DocProject/buildhelp/ 157 | DocProject/Help/*.HxT 158 | DocProject/Help/*.HxC 159 | DocProject/Help/*.hhc 160 | DocProject/Help/*.hhk 161 | DocProject/Help/*.hhp 162 | DocProject/Help/Html2 163 | DocProject/Help/html 164 | 165 | # Click-Once directory 166 | publish/ 167 | 168 | # Publish Web Output 169 | *.[Pp]ublish.xml 170 | *.azurePubxml 171 | # Note: Comment the next line if you want to checkin your web deploy settings, 172 | # but database connection strings (with potential passwords) will be unencrypted 173 | *.pubxml 174 | *.publishproj 175 | 176 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 177 | # checkin your Azure Web App publish settings, but sensitive information contained 178 | # in these scripts will be unencrypted 179 | PublishScripts/ 180 | 181 | # NuGet Packages 182 | *.nupkg 183 | # The packages folder can be ignored because of Package Restore 184 | **/[Pp]ackages/* 185 | # except build/, which is used as an MSBuild target. 186 | !**/[Pp]ackages/build/ 187 | # Uncomment if necessary however generally it will be regenerated when needed 188 | #!**/[Pp]ackages/repositories.config 189 | # NuGet v3's project.json files produces more ignorable files 190 | *.nuget.props 191 | *.nuget.targets 192 | 193 | # Microsoft Azure Build Output 194 | csx/ 195 | *.build.csdef 196 | 197 | # Microsoft Azure Emulator 198 | ecf/ 199 | rcf/ 200 | 201 | # Windows Store app package directories and files 202 | AppPackages/ 203 | BundleArtifacts/ 204 | Package.StoreAssociation.xml 205 | _pkginfo.txt 206 | *.appx 207 | 208 | # Visual Studio cache files 209 | # files ending in .cache can be ignored 210 | *.[Cc]ache 211 | # but keep track of directories ending in .cache 212 | !*.[Cc]ache/ 213 | 214 | # Others 215 | ClientBin/ 216 | ~$* 217 | *~ 218 | *.dbmdl 219 | *.dbproj.schemaview 220 | *.jfm 221 | *.pfx 222 | *.publishsettings 223 | orleans.codegen.cs 224 | 225 | # Including strong name files can present a security risk 226 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 227 | #*.snk 228 | 229 | # Since there are multiple workflows, uncomment next line to ignore bower_components 230 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 231 | #bower_components/ 232 | 233 | # RIA/Silverlight projects 234 | Generated_Code/ 235 | 236 | # Backup & report files from converting an old project file 237 | # to a newer Visual Studio version. 
Backup files are not needed, 238 | # because we have git ;-) 239 | _UpgradeReport_Files/ 240 | Backup*/ 241 | UpgradeLog*.XML 242 | UpgradeLog*.htm 243 | ServiceFabricBackup/ 244 | *.rptproj.bak 245 | 246 | # SQL Server files 247 | *.mdf 248 | *.ldf 249 | *.ndf 250 | 251 | # Business Intelligence projects 252 | *.rdl.data 253 | *.bim.layout 254 | *.bim_*.settings 255 | *.rptproj.rsuser 256 | 257 | # Microsoft Fakes 258 | FakesAssemblies/ 259 | 260 | # GhostDoc plugin setting file 261 | *.GhostDoc.xml 262 | 263 | # Node.js Tools for Visual Studio 264 | .ntvs_analysis.dat 265 | node_modules/ 266 | 267 | # Visual Studio 6 build log 268 | *.plg 269 | 270 | # Visual Studio 6 workspace options file 271 | *.opt 272 | 273 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 274 | *.vbw 275 | 276 | # Visual Studio LightSwitch build output 277 | **/*.HTMLClient/GeneratedArtifacts 278 | **/*.DesktopClient/GeneratedArtifacts 279 | **/*.DesktopClient/ModelManifest.xml 280 | **/*.Server/GeneratedArtifacts 281 | **/*.Server/ModelManifest.xml 282 | _Pvt_Extensions 283 | 284 | # Paket dependency manager 285 | .paket/paket.exe 286 | paket-files/ 287 | 288 | # FAKE - F# Make 289 | .fake/ 290 | 291 | # JetBrains Rider 292 | .idea/ 293 | *.sln.iml 294 | 295 | # CodeRush personal settings 296 | .cr/personal 297 | 298 | # Python Tools for Visual Studio (PTVS) 299 | __pycache__/ 300 | *.pyc 301 | 302 | # Cake - Uncomment if you are using it 303 | # tools/** 304 | # !tools/packages.config 305 | 306 | # Tabs Studio 307 | *.tss 308 | 309 | # Telerik's JustMock configuration file 310 | *.jmconfig 311 | 312 | # BizTalk build output 313 | *.btp.cs 314 | *.btm.cs 315 | *.odx.cs 316 | *.xsd.cs 317 | 318 | # OpenCover UI analysis results 319 | OpenCover/ 320 | 321 | # Azure Stream Analytics local run output 322 | ASALocalRun/ 323 | 324 | # MSBuild Binary and Structured Log 325 | *.binlog 326 | 327 | # NVidia Nsight GPU debugger configuration file 328 | *.nvuser 329 | 330 | # MFractors (Xamarin productivity tool) working folder 331 | .mfractor/ 332 | 333 | # Local History for Visual Studio 334 | .localhistory/ 335 | -------------------------------------------------------------------------------- /Analyser/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolqingcheng/EasyLuceneNET/60d445d1e91e1864b31c7c4013fe105e70544f8f/Analyser/.DS_Store -------------------------------------------------------------------------------- /Analyser/Analyser.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | netstandard2.0 4 | Lucene.JIEba.Analyzer 5 | 1.0.0 6 | SilentCC 7 | JIEba.Lucene.Net is an analyzer tools for lucene.net which is kind to chinese 8 | false 9 | https://github.com/SilentCC/JIEba-netcore2.0/ 10 | Copyright 2019 (c) AgileLabs. All rights reserved. 11 | Analyzer Segment JIEba.net core2.0 12 | true 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /Analyser/ConfigManager.cs: -------------------------------------------------------------------------------- 1 | using System.IO; 2 | using System; 3 | 4 | namespace JiebaNet.Analyser 5 | { 6 | public class ConfigManager 7 | { 8 | // TODO: duplicate codes. 
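// NOTE: "Resources" is not a directory on disk. These relative paths are handed to
// FileExtension.ReadEmbeddedAllLines, which resolves them with an EmbeddedFileProvider
// against this assembly, so idf.txt and stopwords.txt must be compiled in as embedded
// resources rather than copied to the output folder.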
9 | public static string ConfigFileBaseDir 10 | { 11 | get 12 | { 13 | return "Resources"; 14 | } 15 | } 16 | 17 | public static string IdfFile 18 | { 19 | get { return Path.Combine(ConfigFileBaseDir, "idf.txt"); } 20 | } 21 | 22 | public static string StopWordsFile 23 | { 24 | get { return Path.Combine(ConfigFileBaseDir, "stopwords.txt"); } 25 | } 26 | } 27 | } -------------------------------------------------------------------------------- /Analyser/IdfLoader.cs: -------------------------------------------------------------------------------- 1 | using JiebaNet.Segmenter.Common; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Reflection; 6 | using System.Text; 7 | 8 | namespace JiebaNet.Analyser 9 | { 10 | public class IdfLoader 11 | { 12 | internal string IdfFilePath { get; set; } 13 | internal IDictionary IdfFreq { get; set; } 14 | internal double MedianIdf { get; set; } 15 | 16 | public IdfLoader(string idfPath = null) 17 | { 18 | IdfFilePath = string.Empty; 19 | IdfFreq = new Dictionary(); 20 | MedianIdf = 0.0; 21 | if (!string.IsNullOrWhiteSpace(idfPath)) 22 | { 23 | SetNewPath(idfPath); 24 | } 25 | } 26 | 27 | public void SetNewPath(string newIdfPath) 28 | { 29 | var idfPath = newIdfPath; 30 | if (IdfFilePath != idfPath) 31 | { 32 | IdfFilePath = idfPath; 33 | var lines = FileExtension.ReadEmbeddedAllLines(idfPath, Encoding.UTF8); 34 | IdfFreq = new Dictionary(); 35 | foreach (var line in lines) 36 | { 37 | var parts = line.Trim().Split(' '); 38 | var word = parts[0]; 39 | var freq = double.Parse(parts[1]); 40 | IdfFreq[word] = freq; 41 | } 42 | 43 | MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2]; 44 | } 45 | } 46 | } 47 | } -------------------------------------------------------------------------------- /Analyser/KeywordExtractor.cs: -------------------------------------------------------------------------------- 1 | using JiebaNet.Segmenter.Common; 2 | using Microsoft.Extensions.FileProviders; 3 | using System.Collections.Generic; 4 | using System.IO; 5 | using System.Reflection; 6 | 7 | namespace JiebaNet.Analyser 8 | { 9 | public abstract class KeywordExtractor 10 | { 11 | protected static readonly List DefaultStopWords = new List() 12 | { 13 | "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are", 14 | "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it", 15 | "this", "then", "at", "have", "all", "not", "one", "has", "or", "that" 16 | }; 17 | 18 | protected virtual ISet StopWords { get; set; } 19 | 20 | public void SetStopWords(string stopWordsFile) 21 | { 22 | StopWords = new HashSet(); 23 | var lines = FileExtension.ReadEmbeddedAllLines(stopWordsFile); 24 | foreach (var line in lines) 25 | { 26 | StopWords.Add(line.Trim()); 27 | } 28 | } 29 | 30 | public abstract IEnumerable ExtractTags(string text, int count = 20, IEnumerable allowPos = null); 31 | public abstract IEnumerable ExtractTagsWithWeight(string text, int count = 20, IEnumerable allowPos = null); 32 | } 33 | } -------------------------------------------------------------------------------- /Analyser/Resources/stopwords.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 
31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | a 55 | an 56 | the 57 | and 58 | but 59 | if 60 | or 61 | because 62 | as 63 | until 64 | while 65 | of 66 | at 67 | by 68 | for 69 | with 70 | about 71 | against 72 | between 73 | into 74 | through 75 | during 76 | before 77 | after 78 | above 79 | below 80 | to 81 | from 82 | up 83 | down 84 | in 85 | out 86 | on 87 | off 88 | over 89 | under 90 | again 91 | further 92 | then 93 | once 94 | here 95 | there 96 | when 97 | where 98 | why 99 | how 100 | all 101 | any 102 | both 103 | each 104 | few 105 | more 106 | most 107 | other 108 | some 109 | such 110 | no 111 | nor 112 | not 113 | only 114 | own 115 | same 116 | so 117 | than 118 | too 119 | very 120 | s 121 | t 122 | can 123 | will 124 | just 125 | don 126 | should 127 | now 128 | 一番 129 | 一直 130 | 一个 131 | 一些 132 | 许多 133 | 种 134 | 有的是 135 | 也就是说 136 | 阿 137 | 哎呀 138 | 哎哟 139 | 俺 140 | 俺们 141 | 按 142 | 按照 143 | 吧 144 | 吧哒 145 | 把 146 | 罢了 147 | 被 148 | 本 149 | 本着 150 | 比 151 | 比方 152 | 比如 153 | 鄙人 154 | 彼 155 | 彼此 156 | 边 157 | 别 158 | 别的 159 | 别说 160 | 并 161 | 并且 162 | 不比 163 | 不成 164 | 不单 165 | 不但 166 | 不独 167 | 不管 168 | 不光 169 | 不过 170 | 不仅 171 | 不拘 172 | 不论 173 | 不怕 174 | 不然 175 | 不如 176 | 不特 177 | 不惟 178 | 不问 179 | 不只 180 | 朝 181 | 朝着 182 | 趁 183 | 趁着 184 | 乘 185 | 冲 186 | 除 187 | 除此之外 188 | 除非 189 | 除了 190 | 此 191 | 此间 192 | 此外 193 | 从 194 | 从而 195 | 打 196 | 待 197 | 但 198 | 但是 199 | 当 200 | 当着 201 | 到 202 | 得 203 | 的 204 | 的话 205 | 等 206 | 等等 207 | 地 208 | 第 209 | 叮咚 210 | 对 211 | 对于 212 | 多 213 | 多少 214 | 而 215 | 而况 216 | 而且 217 | 而是 218 | 而外 219 | 而言 220 | 而已 221 | 尔后 222 | 反过来 223 | 反过来说 224 | 反之 225 | 非但 226 | 非徒 227 | 否则 228 | 嘎 229 | 嘎登 230 | 该 231 | 赶 232 | 个 233 | 各 234 | 各个 235 | 各位 236 | 各种 237 | 各自 238 | 给 239 | 根据 240 | 跟 241 | 故 242 | 故此 243 | 固然 244 | 关于 245 | 管 246 | 归 247 | 果然 248 | 果真 249 | 过 250 | 和 251 | 何 252 | 何处 253 | 何况 254 | 何时 255 | 嘿 256 | 哼 257 | 哼唷 258 | 呼哧 259 | 乎 260 | 哗 261 | 还是 262 | 还有 263 | 换句话说 264 | 换言之 265 | 或 266 | 或是 267 | 或者 268 | 极了 269 | 及 270 | 及其 271 | 及至 272 | 即 273 | 即便 274 | 即或 275 | 即令 276 | 即若 277 | 即使 278 | 几 279 | 几时 280 | 己 281 | 既 282 | 既然 283 | 既是 284 | 继而 285 | 加之 286 | 假如 287 | 假若 288 | 假使 289 | 鉴于 290 | 将 291 | 较 292 | 较之 293 | 叫 294 | 接着 295 | 结果 296 | 借 297 | 紧接着 298 | 进而 299 | 尽 300 | 尽管 301 | 经 302 | 经过 303 | 就 304 | 就是 305 | 就是说 306 | 据 307 | 具体地说 308 | 具体说来 309 | 开始 310 | 开外 311 | 靠 312 | 咳 313 | 可 314 | 可见 315 | 可是 316 | 可以 317 | 况且 318 | 啦 319 | 来 320 | 来着 321 | 离 322 | 例如 323 | 哩 324 | 连 325 | 连同 326 | 两者 327 | 了 328 | 临 329 | 另 330 | 另外 331 | 另一方面 332 | 论 333 | 嘛 334 | 吗 335 | 慢说 336 | 漫说 337 | 冒 338 | 么 339 | 每 340 | 每当 341 | 们 342 | 莫若 343 | 某 344 | 某个 345 | 某些 346 | 拿 347 | 哪 348 | 哪边 349 | 哪儿 350 | 哪个 351 | 哪里 352 | 哪年 353 | 哪怕 354 | 哪天 355 | 哪些 356 | 哪样 357 | 那 358 | 那边 359 | 那儿 360 | 那个 361 | 那会儿 362 | 那里 363 | 那么 364 | 那么些 365 | 那么样 366 | 那时 367 | 那些 368 | 那样 369 | 乃 370 | 乃至 371 | 呢 372 | 能 373 | 你 374 | 你们 375 | 您 376 | 宁 377 | 宁可 378 | 宁肯 379 | 宁愿 380 | 哦 381 | 啪达 382 | 旁人 383 | 凭 384 | 凭借 385 | 其 386 | 其次 387 | 其二 388 | 其他 389 | 其它 390 | 其一 391 | 其余 392 | 其中 393 | 起 394 | 起见 395 | 起见 396 | 岂但 397 | 恰恰相反 398 | 前后 399 | 前者 400 | 且 401 | 然而 402 | 然后 403 | 然则 404 | 让 405 | 人家 406 | 任 407 | 任何 408 | 任凭 409 | 如 410 | 如此 411 | 如果 412 | 如何 413 | 如其 414 | 如若 415 | 如上所述 416 | 若 417 | 若非 418 | 若是 419 | 啥 420 | 上下 421 | 尚且 422 | 设若 423 | 
设使 424 | 甚而 425 | 甚么 426 | 甚至 427 | 省得 428 | 时候 429 | 什么 430 | 什么样 431 | 使得 432 | 是 433 | 是的 434 | 首先 435 | 谁 436 | 顺 437 | 顺着 438 | 似的 439 | 虽 440 | 虽然 441 | 虽说 442 | 虽则 443 | 随 444 | 随着 445 | 所 446 | 所以 447 | 他 448 | 他们 449 | 他人 450 | 它 451 | 它们 452 | 她 453 | 她们 454 | 倘 455 | 倘或 456 | 倘然 457 | 倘若 458 | 倘使 459 | 腾 460 | 替 461 | 通过 462 | 同 463 | 同时 464 | 哇 465 | 万一 466 | 往 467 | 望 468 | 为 469 | 为何 470 | 为了 471 | 为什么 472 | 为着 473 | 喂 474 | 嗡嗡 475 | 我 476 | 我们 477 | 呜 478 | 呜呼 479 | 乌乎 480 | 无论 481 | 无宁 482 | 毋宁 483 | 嘻 484 | 吓 485 | 相对而言 486 | 像 487 | 向 488 | 向着 489 | 嘘 490 | 焉 491 | 沿 492 | 沿着 493 | 要 494 | 要不 495 | 要不然 496 | 要不是 497 | 要么 498 | 要是 499 | 也 500 | 也罢 501 | 也好 502 | 一 503 | 一旦 504 | 一方面 505 | 一来 506 | 一切 507 | 一样 508 | 一则 509 | 依 510 | 依照 511 | 矣 512 | 以 513 | 以便 514 | 以及 515 | 以免 516 | 以至 517 | 以至于 518 | 以致 519 | 抑或 520 | 因 521 | 因此 522 | 因而 523 | 因为 524 | 用 525 | 由 526 | 由此可见 527 | 由于 528 | 有 529 | 有的 530 | 有关 531 | 有些 532 | 又 533 | 于 534 | 于是 535 | 于是乎 536 | 与 537 | 与此同时 538 | 与否 539 | 与其 540 | 越是 541 | 云云 542 | 哉 543 | 再说 544 | 再者 545 | 在 546 | 在下 547 | 咱 548 | 咱们 549 | 则 550 | 怎 551 | 怎么办 552 | 怎么样 553 | 咋 554 | 照 555 | 照着 556 | 者 557 | 这 558 | 这边 559 | 这儿 560 | 这个 561 | 这会儿 562 | 这就是说 563 | 这里 564 | 这么 565 | 这么点儿 566 | 这么些 567 | 这么样 568 | 这时 569 | 这些 570 | 这样 571 | 正如 572 | 吱 573 | 之 574 | 之类 575 | 之所以 576 | 之一 577 | 只是 578 | 只限 579 | 只要 580 | 只有 581 | 至 582 | 至于 583 | 诸位 584 | 着 585 | 着呢 586 | 自 587 | 自从 588 | 自个儿 589 | 自各儿 590 | 自己 591 | 自家 592 | 自身 593 | 综上所述 594 | 总的来看 595 | 总的来说 596 | 总的说来 597 | 总而言之 598 | 总之 599 | 纵 600 | 纵令 601 | 纵然 602 | 纵使 603 | 遵照 604 | 作为 605 | 兮 606 | 呗 607 | 咚 608 | 咦 609 | 喏 610 | 啐 611 | 喔唷 612 | 嗬 613 | 嗯 614 | 嗳 615 | 。 616 | , 617 | : 618 | ; 619 | 、 620 | “ 621 | ” 622 | 【 623 | 】 624 | 《 625 | 》 626 | ( 627 | ) 628 | — 629 | … 630 | . 
631 | , 632 | : 633 | ; 634 | " 635 | " 636 | [ 637 | ] 638 | < 639 | > 640 | ( 641 | ) 642 | @ 643 | # 644 | * 645 | & 646 | % 647 | ¥ 648 | $ 649 | - 650 | + 651 | = 652 | | 653 | \ 654 | -------------------------------------------------------------------------------- /Analyser/TextRankExtractor.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using JiebaNet.Segmenter; 4 | using JiebaNet.Segmenter.Common; 5 | using JiebaNet.Segmenter.PosSeg; 6 | 7 | namespace JiebaNet.Analyser 8 | { 9 | public class TextRankExtractor : KeywordExtractor 10 | { 11 | private static readonly IEnumerable DefaultPosFilter = new List() 12 | { 13 | "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "v", "vd", "vg", "vi", "vn", "vq" 14 | }; 15 | 16 | private JiebaSegmenter Segmenter { get; set; } 17 | private PosSegmenter PosSegmenter { get; set; } 18 | 19 | public int Span { get; set; } 20 | 21 | public bool PairFilter(Pair wp) 22 | { 23 | return DefaultPosFilter.Contains(wp.Flag) 24 | && wp.Word.Trim().Length >= 2 25 | && !StopWords.Contains(wp.Word.ToLower()); 26 | } 27 | 28 | public TextRankExtractor() 29 | { 30 | Span = 5; 31 | 32 | Segmenter = new JiebaSegmenter(); 33 | PosSegmenter = new PosSegmenter(Segmenter); 34 | SetStopWords(ConfigManager.StopWordsFile); 35 | if (StopWords.IsEmpty()) 36 | { 37 | StopWords.UnionWith(DefaultStopWords); 38 | } 39 | } 40 | 41 | public override IEnumerable ExtractTags(string text, int count = 20, IEnumerable allowPos = null) 42 | { 43 | var rank = ExtractTagRank(text, allowPos); 44 | if (count <= 0) { count = 20; } 45 | return rank.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count); 46 | } 47 | 48 | public override IEnumerable ExtractTagsWithWeight(string text, int count = 20, IEnumerable allowPos = null) 49 | { 50 | var rank = ExtractTagRank(text, allowPos); 51 | if (count <= 0) { count = 20; } 52 | return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair() 53 | { 54 | Word = p.Key, Weight = p.Value 55 | }).Take(count); 56 | } 57 | 58 | #region Private Helpers 59 | 60 | private IDictionary ExtractTagRank(string text, IEnumerable allowPos) 61 | { 62 | if (allowPos.IsEmpty()) 63 | { 64 | allowPos = DefaultPosFilter; 65 | } 66 | 67 | var g = new UndirectWeightedGraph(); 68 | var cm = new Dictionary(); 69 | var words = PosSegmenter.Cut(text).ToList(); 70 | 71 | for (var i = 0; i < words.Count(); i++) 72 | { 73 | var wp = words[i]; 74 | if (PairFilter(wp)) 75 | { 76 | for (var j = i + 1; j < i + Span; j++) 77 | { 78 | if (j >= words.Count) 79 | { 80 | break; 81 | } 82 | if (!PairFilter(words[j])) 83 | { 84 | continue; 85 | } 86 | 87 | // TODO: better separator. 
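// For each retained word, the inner loop looks at the next (Span - 1) tokens and
// counts how often each ordered pair co-occurs inside that sliding window; the counts
// become the edge weights of the word graph built below. The two words are joined
// with '$' only to form a dictionary key, which is why the TODO above asks for a
// better separator: a word that itself contains '$' would corrupt the key.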
88 | var key = wp.Word + "$" + words[j].Word; 89 | if (!cm.ContainsKey(key)) 90 | { 91 | cm[key] = 0; 92 | } 93 | cm[key] += 1; 94 | } 95 | } 96 | } 97 | 98 | foreach (var p in cm) 99 | { 100 | var terms = p.Key.Split('$'); 101 | g.AddEdge(terms[0], terms[1], p.Value); 102 | } 103 | 104 | return g.Rank(); 105 | } 106 | 107 | #endregion 108 | } 109 | } -------------------------------------------------------------------------------- /Analyser/TfidfExtractor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using JiebaNet.Segmenter; 5 | using JiebaNet.Segmenter.Common; 6 | using JiebaNet.Segmenter.PosSeg; 7 | 8 | namespace JiebaNet.Analyser 9 | { 10 | public class TfidfExtractor : KeywordExtractor 11 | { 12 | private static readonly string DefaultIdfFile = ConfigManager.IdfFile; 13 | private static readonly int DefaultWordCount = 20; 14 | 15 | private JiebaSegmenter Segmenter { get; set; } 16 | private PosSegmenter PosSegmenter { get; set; } 17 | private IdfLoader Loader { get; set; } 18 | 19 | private IDictionary IdfFreq { get; set; } 20 | private double MedianIdf { get; set; } 21 | 22 | public TfidfExtractor(JiebaSegmenter segmenter = null) 23 | { 24 | if (segmenter.IsNull()) 25 | { 26 | Segmenter = new JiebaSegmenter(); 27 | } 28 | else 29 | { 30 | Segmenter = segmenter; 31 | } 32 | PosSegmenter = new PosSegmenter(Segmenter); 33 | SetStopWords(ConfigManager.StopWordsFile); 34 | if (StopWords.IsEmpty()) 35 | { 36 | StopWords.UnionWith(DefaultStopWords); 37 | } 38 | 39 | Loader = new IdfLoader(DefaultIdfFile); 40 | 41 | IdfFreq = Loader.IdfFreq; 42 | MedianIdf = Loader.MedianIdf; 43 | } 44 | 45 | public void SetIdfPath(string idfPath) 46 | { 47 | Loader.SetNewPath(idfPath); 48 | IdfFreq = Loader.IdfFreq; 49 | MedianIdf = Loader.MedianIdf; 50 | } 51 | 52 | private IEnumerable FilterCutByPos(string text, IEnumerable allowPos) 53 | { 54 | var posTags = PosSegmenter.Cut(text).Where(p => allowPos.Contains(p.Flag)); 55 | return posTags.Select(p => p.Word); 56 | } 57 | 58 | private IDictionary GetWordIfidf(string text, IEnumerable allowPos) 59 | { 60 | IEnumerable words = null; 61 | if (allowPos.IsNotEmpty()) 62 | { 63 | words = FilterCutByPos(text, allowPos); 64 | } 65 | else 66 | { 67 | words = Segmenter.Cut(text); 68 | } 69 | 70 | // Calculate TF 71 | var freq = new Dictionary(); 72 | foreach (var word in words) 73 | { 74 | var w = word; 75 | if (string.IsNullOrEmpty(w) || w.Trim().Length < 2 || StopWords.Contains(w.ToLower())) 76 | { 77 | continue; 78 | } 79 | freq[w] = freq.GetDefault(w, 0.0) + 1.0; 80 | } 81 | var total = freq.Values.Sum(); 82 | foreach (var k in freq.Keys.ToList()) 83 | { 84 | freq[k] *= IdfFreq.GetDefault(k, MedianIdf) / total; 85 | } 86 | 87 | return freq; 88 | } 89 | 90 | public override IEnumerable ExtractTags(string text, int count = 20, IEnumerable allowPos = null) 91 | { 92 | if (count <= 0) { count = DefaultWordCount; } 93 | 94 | var freq = GetWordIfidf(text, allowPos); 95 | return freq.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count); 96 | } 97 | 98 | public override IEnumerable ExtractTagsWithWeight(string text, int count = 20, IEnumerable allowPos = null) 99 | { 100 | if (count <= 0) { count = DefaultWordCount; } 101 | 102 | var freq = GetWordIfidf(text, allowPos); 103 | return freq.OrderByDescending(p => p.Value).Select(p => new WordWeightPair() 104 | { 105 | Word = p.Key, Weight = p.Value 106 | }).Take(count); 107 | } 108 | } 109 | 
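A minimal usage sketch of the TF-IDF extractor above (the sample sentence, tag count, and console host are illustrative, not part of this file):

``` csharp
using System;
using JiebaNet.Analyser;

var extractor = new TfidfExtractor();
// Each weight is (term count / total terms) * IDF from idf.txt,
// falling back to MedianIdf for words missing from the IDF table.
foreach (var pair in extractor.ExtractTagsWithWeight("我们采用MVVM方式编写完成游戏列表页面", count: 5))
{
    Console.WriteLine($"{pair.Word}: {pair.Weight}");
}
```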
110 | public class WordWeightPair 111 | { 112 | public string Word { get; set; } 113 | public double Weight { get; set; } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /Analyser/UndirectWeightedGraph.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | 5 | namespace JiebaNet.Analyser 6 | { 7 | public class Edge 8 | { 9 | public string Start { get; set; } 10 | public string End { get; set; } 11 | public double Weight { get; set; } 12 | } 13 | 14 | public class UndirectWeightedGraph 15 | { 16 | private static readonly double d = 0.85; 17 | 18 | public IDictionary> Graph { get; set; } 19 | public UndirectWeightedGraph() 20 | { 21 | Graph = new Dictionary>(); 22 | } 23 | 24 | public void AddEdge(string start, string end, double weight) 25 | { 26 | if (!Graph.ContainsKey(start)) 27 | { 28 | Graph[start] = new List(); 29 | } 30 | 31 | if (!Graph.ContainsKey(end)) 32 | { 33 | Graph[end] = new List(); 34 | } 35 | 36 | Graph[start].Add(new Edge(){ Start = start, End = end, Weight = weight }); 37 | Graph[end].Add(new Edge(){ Start = end, End = start, Weight = weight }); 38 | } 39 | 40 | public IDictionary Rank() 41 | { 42 | var ws = new Dictionary(); 43 | var outSum = new Dictionary(); 44 | 45 | // init scores 46 | var count = Graph.Count > 0 ? Graph.Count : 1; 47 | var wsdef = 1.0/count; 48 | 49 | foreach (var pair in Graph) 50 | { 51 | ws[pair.Key] = wsdef; 52 | outSum[pair.Key] = pair.Value.Sum(e => e.Weight); 53 | } 54 | 55 | // TODO: 10 iterations? 56 | var sortedKeys = Graph.Keys.OrderBy(k => k); 57 | for (var i = 0; i < 10; i++) 58 | { 59 | foreach (var n in sortedKeys) 60 | { 61 | var s = 0d; 62 | foreach (var edge in Graph[n]) 63 | { 64 | s += edge.Weight/outSum[edge.End]*ws[edge.End]; 65 | } 66 | ws[n] = (1 - d) + d*s; 67 | } 68 | } 69 | 70 | var minRank = double.MaxValue; 71 | var maxRank = double.MinValue; 72 | 73 | foreach (var w in ws.Values) 74 | { 75 | if (w < minRank) 76 | { 77 | minRank = w; 78 | } 79 | if(w > maxRank) 80 | { 81 | maxRank = w; 82 | } 83 | } 84 | 85 | foreach (var pair in ws.ToList()) 86 | { 87 | ws[pair.Key] = (pair.Value - minRank/10.0)/(maxRank - minRank/10.0); 88 | } 89 | 90 | return ws; 91 | } 92 | } 93 | } -------------------------------------------------------------------------------- /ConsoleApp1/Article.cs: -------------------------------------------------------------------------------- 1 | using Lucene.Net.Documents; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace ConsoleApp1 9 | { 10 | public class Article 11 | { 12 | [Lucene(FieldStore = Field.Store.YES, IsUnique = true, type = LuceneFieldType.Int32)] 13 | public int Id { get; set; } 14 | [Lucene(FieldStore = Field.Store.YES, IsUnique = false, type = LuceneFieldType.Text)] 15 | public string Title { get; set; } 16 | 17 | 18 | [Lucene(FieldStore = Field.Store.YES, IsUnique = false, type = LuceneFieldType.Text)] 19 | public string Content { get; set; } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /ConsoleApp1/ConsoleApp1.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net6.0 6 | enable 7 | enable 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 
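For reference, the ranking core used by TextRankExtractor can also be driven directly; a small sketch of UndirectWeightedGraph (the words and edge weights here are invented for illustration):

``` csharp
using System;
using JiebaNet.Analyser;

var g = new UndirectWeightedGraph();
g.AddEdge("事件", "模型", 3); // weight = co-occurrence count within the window
g.AddEdge("模型", "页面", 1);
// Rank() runs 10 PageRank-style iterations with damping factor d = 0.85,
// then rescales each score against the observed minimum and maximum.
foreach (var p in g.Rank())
{
    Console.WriteLine($"{p.Key}: {p.Value}");
}
```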
-------------------------------------------------------------------------------- /ConsoleApp1/Program.cs: -------------------------------------------------------------------------------- 1 | using ConsoleApp1; 2 | using EasyLuceneNET; 3 | using Microsoft.Extensions.DependencyInjection; 4 | 5 | var service = new ServiceCollection(); 6 | service.AddLogging(); 7 | service.AddEasyLuceneNet(); 8 | var serviceProvider = service.BuildServiceProvider(); 9 | 10 | var easy = serviceProvider.GetService<IEasyLuceneNet>(); 11 | 12 | //删除索引 13 | 14 | 15 | //传递一个文档对应的模型,只需要给主键赋值即可 16 | easy!.Delete(new Article { Id = 1 }); 17 | 18 | //创建索引 19 | 20 | //var list = new List
<Article>(); 21 | //for (int i = 0; i < 100; i++) 22 | //{ 23 | //    list.Add(new Article() 24 | //    { 25 | //        Id = i, 26 | //        Title = i + "使用Xamarin开发移动应用示例——数独游戏(八)使用MVVM实现完成游戏列表页面", 27 | //        Content = @"前面我们已经完成了游戏的大部分功能,玩家可以玩预制的数独游戏,也可以自己添加新的游戏。现在我们实现展示已完成游戏列表页面,显示用户已经完成的游戏列表,从这个列表可以进入详细的复盘页面。 28 | 29 | //前面的页面我们采用的是传统的事件驱动模型,在XAML文件中定义页面,在后台的cs文件中编写事件响应代码。采用这种模型是因为很多页面需要动态生成控件,然后动态改变这些控件的属性,事件驱动模型在这种场景下比较好理解。现在我们采用MVVM方式编写完成游戏列表页面。 30 | 31 | //MVVM是将页面绑定到视图模型,所有的操作和事件响应通过视图模型完成。视图模型中没有页面控件的定义,因此和页面是解耦的,可以独立进行测试。在视图模型中我们只关心数据,而不关心展示数据的控件。 32 | 33 | //首先,我们定义一个视图模型的基类,下一步在改造其它页面时,会用到这个基类:" 34 | //    }); 35 | //} 36 | //easy!.AddIndex(list); 37 | 38 | //全文检索 39 | 40 | var result = easy!.Search<Article>
(new SearchRequest() 41 | { 42 | keyword = "事件模型", 43 | index = 1, 44 | size = 20, 45 | fields = new string[] { "Title", "Content" }, 46 | OrderByField = "Id", 47 | }); 48 | Console.WriteLine("一共:" + result.Total); 49 | foreach (var item in result.list) 50 | { 51 | Console.WriteLine($"id:{item.Id} title:{item.Title}"); 52 | } 53 | Console.WriteLine($"分词:{string.Join(" ", result.cutKeys)}"); 54 | Console.WriteLine("完成"); 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /EasyLuceneNET/EasyLuceneNET.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | netstandard2.1 5 | 6 | enable 7 | EasyLuceneNET 8 | 1.5 9 | 简单的封装了jieba.NET和Lucene.net进行中文检索,适用于基本的文档和网站站内检索 10 | https://github.com/coolqingcheng/EasyLuceneNET 11 | 青城 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /EasyLuceneNET/EasyLuceneNetDefaultProvider.cs: -------------------------------------------------------------------------------- 1 | using jieba.NET; 2 | using JiebaNet.Segmenter; 3 | using Lucene.Net.Analysis; 4 | using Lucene.Net.Documents; 5 | using Lucene.Net.Index; 6 | using Lucene.Net.Search; 7 | using Lucene.Net.Search.Highlight; 8 | using Lucene.Net.Store; 9 | using Lucene.Net.Util; 10 | using Microsoft.Extensions.Logging; 11 | using System; 12 | using System.Collections.Generic; 13 | using System.IO; 14 | using System.Linq; 15 | using System.Reflection; 16 | 17 | namespace EasyLuceneNET 18 | { 19 | public class EasyLuceneNetDefaultProvider : IEasyLuceneNet, IDisposable 20 | { 21 | const LuceneVersion AppLuceneVersion = LuceneVersion.LUCENE_48; 22 | readonly IndexWriter writer; 23 | 24 | private ILogger<EasyLuceneNetDefaultProvider> _logger; 25 | 26 | private FSDirectory dir; 27 | 28 | //private readonly JieBaAnalyzer analyzer; 29 | 30 | public EasyLuceneNetDefaultProvider(ILogger<EasyLuceneNetDefaultProvider> logger) 31 | { 32 | _logger = logger; 33 | var indexPath = Path.Combine(AppContext.BaseDirectory, "indexs"); 34 | 35 | dir = FSDirectory.Open(indexPath); 36 | 37 | // Create an analyzer to process the text 38 | Analyzer analyzer = new JieBaAnalyzer(TokenizerMode.Search); 39 | // Create an index writer 40 | var indexConfig = new IndexWriterConfig(AppLuceneVersion, analyzer); 41 | writer = new IndexWriter(dir, indexConfig); 42 | } 43 | 44 | public void AddIndex<T>(List<T> list) 45 | { 46 | if (list != null) 47 | { 48 | list.ForEach(item => 49 | { 50 | var doc = new Document(); 51 | var properties = item.GetType().GetProperties(BindingFlags.Instance | BindingFlags.Public); 52 | _logger.LogDebug("添加到文档:" + DateTime.Now); 53 | Term term = null; 54 | foreach (var property in properties) 55 | { 56 | string name = property.Name; 57 | var value = property.GetValue(item); 58 | var att = property.GetCustomAttribute<LuceneAttribute>(); 59 | if (att == null) 60 | { 61 | _logger.LogWarning($"文档字段为:{name} 没有贴上Lucene标签,不索引"); 62 | continue; 63 | } 64 | if (att.type == LuceneFieldType.String) 65 | { 66 | //默认用StringField 67 | doc.Add(new StringField(name, value.ToString(), Field.Store.YES)); 68 | } 69 | else 70 | { 71 | 72 | if (att.type == LuceneFieldType.Text) 73 | { 74 | doc.Add(new TextField(name, value.ToString(), att.FieldStore)); 75 | } 76 | if (att.type == LuceneFieldType.Int32) 77 | { 78 | doc.Add(new Int32Field(name, Convert.ToInt32(value), att.FieldStore)); 79 | } 80 | 81 | } 82 | if (att.IsUnique) 83 | { 84 | if (new Type[] { typeof(int), typeof(long), typeof(short), typeof(uint),
typeof(ulong), typeof(ushort) }.Contains(value.GetType())) 85 | { 86 | var bytes = new BytesRef(NumericUtils.BUF_SIZE_INT32); 87 | NumericUtils.Int32ToPrefixCoded(Convert.ToInt32(value), 0, bytes); 88 | term = new Term(name, bytes); 89 | } 90 | else 91 | { 92 | term = new Term(name, value.ToString()); 93 | } 94 | } 95 | } 96 | if (term == null) 97 | { 98 | writer.AddDocument(doc); 99 | } 100 | else 101 | { 102 | writer.UpdateDocument(term, doc); 103 | } 104 | 105 | }); 106 | var begin = DateTime.Now; 107 | _logger.LogDebug("正在提交索引:" + begin); 108 | writer.Flush(triggerMerge: false, applyAllDeletes: false); 109 | writer.Commit(); 110 | var end = DateTime.Now; 111 | _logger.LogDebug("索引提交完成:" + end); 112 | writer.Flush(false, false); 113 | writer.Commit(); 114 | } 115 | } 116 | 117 | public void Dispose() 118 | { 119 | writer.Dispose(); 120 | dir.Dispose(); 121 | } 122 | 123 | public SearchResult<T> Search<T>(SearchRequest request) where T : class, new() 124 | { 125 | 126 | if (request.keyword.Length > 75) 127 | { 128 | request.keyword = request.keyword.Substring(0, 75); 129 | } 130 | if (request.index <= 1) 131 | { 132 | request.index = 1; 133 | } 134 | if (request.size < 15) 135 | { 136 | request.size = 15; 137 | } 138 | var result = new SearchResult<T>(); 139 | var segmenter = new JiebaSegmenter(); 140 | var keywords = segmenter.Cut(request.keyword); 141 | result.cutKeys.AddRange(keywords); 142 | var biaodian = "[’!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+()【】,。: ".ToCharArray(); 143 | keywords = keywords.Where(a => !biaodian.Where(b => b.ToString() == a).Any()).ToList(); 144 | BooleanQuery query = new BooleanQuery(); 145 | foreach (var item in keywords) 146 | { 147 | foreach (var field in request.fields) 148 | { 149 | if (biaodian.Any(a => a.ToString() == item) == false) 150 | { 151 | query.Add(new TermQuery(new Term(field, item)), Occur.SHOULD); 152 | } 153 | } 154 | } 155 | 156 | var i = request.index * request.size; 157 | 158 | using var reader = writer.GetReader(applyAllDeletes: true); 159 | var searcher = new IndexSearcher(reader); 160 | var sort = new Sort(); 161 | if (!string.IsNullOrWhiteSpace(request.OrderByDescField)) 162 | { 163 | sort.SetSort(new SortField(request.OrderByDescField, SortFieldType.INT32, true)); 164 | } 165 | if (!string.IsNullOrWhiteSpace(request.OrderByField)) 166 | { 167 | sort.SetSort(new SortField(request.OrderByField, SortFieldType.INT32, false)); 168 | } 169 | TopFieldDocs?
doc = searcher.Search(query, request.size * 10, sort); 170 | var scorer = new QueryScorer(query, "Content"); 171 | Highlighter highlighter = new Highlighter(scorer); 172 | Search(request.index, 173 | request.size, 174 | result, 175 | searcher, 176 | doc); 177 | return result; 178 | } 179 | 180 | private static void Search<T>(int index, int size, SearchResult<T> result, IndexSearcher searcher, TopDocs doc) where T : class, new() 181 | { 182 | result.Total = doc.TotalHits; 183 | var maxIndex = doc.ScoreDocs.Length - 2; 184 | var endIndex = ((index - 1) * size) + size; 185 | if (endIndex < maxIndex) 186 | { 187 | maxIndex = endIndex; 188 | } 189 | for (int j = ((index - 1) * size); j < maxIndex; j++) 190 | { 191 | var foundDoc = searcher.Doc(doc.ScoreDocs[j].Doc); 192 | var t = new T(); 193 | var type = t.GetType(); 194 | var properties = type.GetProperties(BindingFlags.Instance | BindingFlags.Public); 195 | 196 | foreach (var item in properties) 197 | { 198 | var sValue = foundDoc.Get(item.Name); 199 | if (sValue != null) 200 | { 201 | 202 | try 203 | { 204 | var v = Convert.ChangeType(sValue, item.PropertyType); 205 | 206 | item.SetValue(t, v, null); 207 | } 208 | catch (Exception) 209 | { 210 | // swallow conversion failures and leave the property at its default 211 | } 212 | } 213 | } 214 | result.list.Add(t); 215 | } 216 | } 217 | 218 | private String highlightField(Query query, String fieldName, String text) 219 | { 220 | TokenStream tokenStream = new JieBaAnalyzer(TokenizerMode.Search) 221 | .GetTokenStream(fieldName, text); 222 | // Assuming "<B>", "</B>" used to highlight 223 | SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); 224 | QueryScorer scorer = new QueryScorer(query); 225 | Highlighter highlighter = new Highlighter(formatter, scorer) 226 | { 227 | TextFragmenter = (new SimpleFragmenter(int.MaxValue)) 228 | }; 229 | 230 | String rv = highlighter.GetBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)"); 231 | return rv.Length == 0 ?
text : rv; 232 | } 233 | 234 | public void Delete<T>(T entity) 235 | { 236 | if (entity != null) 237 | { 238 | var properties = entity.GetType().GetProperties(BindingFlags.Instance | BindingFlags.Public); 239 | var item = properties.Where(p => p.GetCustomAttribute<LuceneAttribute>()?.IsUnique == true).FirstOrDefault(); 240 | if (item != null) 241 | { 242 | var value = item.GetValue(entity, null); 243 | Term term; 244 | if (new Type[] { typeof(int), typeof(long), typeof(short), typeof(uint), typeof(ulong), typeof(ushort) }.Contains(value.GetType())) 245 | { 246 | var bytes = new BytesRef(NumericUtils.BUF_SIZE_INT32); 247 | NumericUtils.Int32ToPrefixCoded(Convert.ToInt32(value), 0, bytes); 248 | term = new Term(item.Name, bytes); 249 | } 250 | else 251 | { 252 | term = new Term(item.Name, value.ToString()); 253 | } 254 | writer.DeleteDocuments(term); 255 | writer.Flush(true, true); 256 | writer.Commit(); 257 | } 258 | 259 | } 260 | } 261 | } 262 | } 263 | 264 | public class LuceneAttribute : System.Attribute 265 | { 266 | public LuceneFieldType type { get; set; } = LuceneFieldType.Text; 267 | 268 | public Field.Store FieldStore { get; set; } 269 | 270 | public bool IsUnique { get; set; } = false; 271 | 272 | } 273 | 274 | public enum LuceneFieldType 275 | { 276 | Text, 277 | /// <summary> 278 | /// For fields that do not need full-text search; if the field must be searchable, use Text 279 | /// </summary> 280 | String, 281 | Int32 282 | } -------------------------------------------------------------------------------- /EasyLuceneNET/EasyLuceneNetExtensions.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.Extensions.DependencyInjection; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Threading.Tasks; 7 | 8 | namespace EasyLuceneNET 9 | { 10 | public static class EasyLuceneNetExtensions 11 | { 12 | public static IServiceCollection AddEasyLuceneNet(this IServiceCollection service) 13 | { 14 | service.AddSingleton<IEasyLuceneNet, EasyLuceneNetDefaultProvider>(); 15 | return service; 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /EasyLuceneNET/IEasyLuceneNet.cs: -------------------------------------------------------------------------------- 1 | using Lucene.Net.Search; 2 | using System.Collections.Generic; 3 | 4 | namespace EasyLuceneNET 5 | { 6 | public interface IEasyLuceneNet 7 | { 8 | /// <summary> 9 | /// Search the index 10 | /// </summary> 11 | /// <typeparam name="T"></typeparam> 12 | /// <param name="request"></param> 13 | /// <returns></returns> 14 | SearchResult<T> Search<T>(SearchRequest request) where T : class, new(); 15 | /// <summary> 16 | /// Build the index 17 | /// </summary> 18 | /// <typeparam name="T"></typeparam> 19 | /// <param name="list"></param> 20 | void AddIndex<T>(List<T> list); 21 | 22 | /// <summary> 23 | /// Delete a document 24 | /// </summary> 25 | /// <param name="entity"></param> 26 | void Delete<T>(T entity); 27 | } 28 | 29 | public class SearchResult<T> where T : class, new() 30 | { 31 | public int Total { get; set; } 32 | 33 | public List<string> cutKeys { get; set; } = new List<string>(); 34 | 35 | public List<T> list { get; set; } = new List<T>(); 36 | } 37 | 38 | public class SearchRequest 39 | { 40 | public string keyword { get; set; } 41 | public int index { get; set; } = 1; 42 | public int size { get; set; } = 15; 43 | public string[] fields { get; set; } 44 | 45 | /// <summary> 46 | /// Field to sort by in descending order 47 | /// </summary> 48 | public string OrderByDescField { get; set; } 49 | 50 | 51 | /// <summary> 52 | /// Field to sort by in ascending order 53 | /// </summary> 54 | public string OrderByField { get; set; } 55 | } 56 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 qingcheng 4 | 5 | Permission is hereby granted, free of charge, to any
person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 基于https://github.com/SilentCC/JIEba-netcore 封装了一个lucene.net的全文检索工具 2 | 3 | # 使用 4 | 5 | ## 安装nuget包 6 | 7 | ``` 8 | Install-Package EasyLuceneNET 9 | ``` 10 | 11 | ## 创建模型 12 | 13 | ``` csharp 14 | public class Article 15 | { 16 | [Lucene(FieldStore = Field.Store.YES, IsUnique = true, type = LuceneFieldType.Int32)] 17 | public int Id { get; set; } 18 | [Lucene(FieldStore = Field.Store.YES, IsUnique = false, type = LuceneFieldType.Text)] 19 | public string Title { get; set; } 20 | 21 | 22 | [Lucene(FieldStore = Field.Store.YES, IsUnique = false, type = LuceneFieldType.Text)] 23 | public string Content { get; set; } 24 | } 25 | ``` 26 | 27 | ## 依赖注入 28 | 29 | ``` csharp 30 | var service = new ServiceCollection(); 31 | service.AddLogging(); 32 | service.AddEasyLuceneNet(); 33 | var serviceProvider = service.BuildServiceProvider(); 34 | 35 | var easy = serviceProvider.GetService<IEasyLuceneNet>(); 36 | ``` 37 | 38 | ## 创建索引 39 | 40 | ``` csharp 41 | 42 | 43 | var list = new List
<Article>(); 44 | for (int i = 0; i < 100; i++) 45 | { 46 | list.Add(new Article() 47 | { 48 | Id = i, 49 | Title = i + "使用Xamarin开发移动应用示例——数独游戏(八)使用MVVM实现完成游戏列表页面", 50 | Content = @"前面我们已经完成了游戏的大部分功能,玩家可以玩预制的数独游戏,也可以自己添加新的游戏。现在我们实现展示已完成游戏列表页面,显示用户已经完成的游戏列表,从这个列表可以进入详细的复盘页面。 51 | 52 | 前面的页面我们采用的是传统的事件驱动模型,在XAML文件中定义页面,在后台的cs文件中编写事件响应代码。采用这种模型是因为很多页面需要动态生成控件,然后动态改变这些控件的属性,事件驱动模型在这种场景下比较好理解。现在我们采用MVVM方式编写完成游戏列表页面。 53 | 54 | MVVM是将页面绑定到视图模型,所有的操作和事件响应通过视图模型完成。视图模型中没有页面控件的定义,因此和页面是解耦的,可以独立进行测试。在视图模型中我们只关心数据,而不关心展示数据的控件。 55 | 56 | 首先,我们定义一个视图模型的基类,下一步在改造其它页面时,会用到这个基类:" 57 | }); 58 | } 59 | easy!.AddIndex(list); 60 | 61 | ``` 62 | 63 | ## 检索 64 | 65 | ``` csharp 66 | var result = easy!.Search<Article>
(new SearchRequest() 67 | { 68 | keyword = "事件模型", 69 | index = 1, 70 | size = 20, 71 | fields = new string[] { "Title", "Content" }, 72 | OrderByField = "Id", 73 | }); 74 | Console.WriteLine("一共:" + result.Total); 75 | foreach (var item in result.list) 76 | { 77 | Console.WriteLine($"id:{item.Id} title:{item.Title}"); 78 | } 79 | Console.WriteLine($"分词:{string.Join(" ", result.cutKeys)}"); 80 | Console.WriteLine("完成"); 81 | ``` 82 | 83 | ## 删除索引 84 | 85 | 传递一个文档对应的模型,只需要给主键赋值即可 86 | 87 | ``` csharp 88 | easy.Delete(new Article { Id = 1 }); 89 | ``` 90 | 91 | ## 联系我 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /Segmenter/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolqingcheng/EasyLuceneNET/60d445d1e91e1864b31c7c4013fe105e70544f8f/Segmenter/.DS_Store -------------------------------------------------------------------------------- /Segmenter/Common/Extensions.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text.RegularExpressions; 5 | 6 | namespace JiebaNet.Segmenter.Common 7 | { 8 | public static class Extensions 9 | { 10 | private static readonly Regex RegexDigits = new Regex(@"\d+", RegexOptions.Compiled); 11 | private static readonly Regex RegexNewline = new Regex("(\r\n|\n|\r)", RegexOptions.Compiled); 12 | 13 | #region Objects 14 | 15 | public static bool IsNull(this object obj) 16 | { 17 | return obj == null; 18 | } 19 | 20 | public static bool IsNotNull(this object obj) 21 | { 22 | return obj != null; 23 | } 24 | 25 | #endregion 26 | 27 | 28 | #region Enumerable 29 | 30 | public static bool IsEmpty(this IEnumerable enumerable) 31 | { 32 | return (enumerable == null) || !enumerable.Any(); 33 | } 34 | 35 | public static bool IsNotEmpty(this IEnumerable enumerable) 36 | { 37 | return (enumerable != null) && enumerable.Any(); 38 | } 39 | 40 | public static TValue GetValueOrDefault(this IDictionary d, TKey key) 41 | { 42 | return d.ContainsKey(key) ? 
d[key] : default(TValue); 43 | } 44 | 45 | public static TValue GetDefault(this IDictionary dict, TKey key, TValue defaultValue) 46 | { 47 | if (dict.ContainsKey(key)) 48 | { 49 | return dict[key]; 50 | } 51 | return defaultValue; 52 | } 53 | 54 | public static void Update(this IDictionary dict, IDictionary other) 55 | { 56 | foreach (var key in other.Keys) 57 | { 58 | dict[key] = other[key]; 59 | } 60 | } 61 | 62 | #endregion 63 | 64 | #region String & Text 65 | 66 | public static string Left(this string s, int endIndex) 67 | { 68 | if (string.IsNullOrEmpty(s)) 69 | { 70 | return s; 71 | } 72 | 73 | return s.Substring(0, endIndex); 74 | } 75 | 76 | public static string Right(this string s, int startIndex) 77 | { 78 | if (string.IsNullOrEmpty(s)) 79 | { 80 | return s; 81 | } 82 | 83 | 84 | return s.Substring(startIndex); 85 | } 86 | 87 | public static string Sub(this string s, int startIndex, int endIndex) 88 | { 89 | return s.Substring(startIndex, endIndex - startIndex); 90 | } 91 | 92 | public static bool IsInt32(this string s) 93 | { 94 | return RegexDigits.IsMatch(s); 95 | } 96 | 97 | public static string[] SplitLines(this string s) 98 | { 99 | return RegexNewline.Split(s); 100 | } 101 | 102 | public static string Join(this IEnumerable inputs, string separator = ", ") 103 | { 104 | return string.Join(separator, inputs); 105 | } 106 | 107 | public static IEnumerable SubGroupValues(this GroupCollection groups) 108 | { 109 | var result = from Group g in groups 110 | select g.Value; 111 | return result.Skip(1); 112 | } 113 | 114 | #endregion 115 | 116 | #region Conversion 117 | 118 | public static int ToInt32(this char ch) 119 | { 120 | return ch; 121 | } 122 | 123 | public static char ToChar(this int i) 124 | { 125 | return (char)i; 126 | } 127 | 128 | #endregion 129 | } 130 | } -------------------------------------------------------------------------------- /Segmenter/Common/FileExtension.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.Extensions.FileProviders; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.IO; 5 | using System.Reflection; 6 | using System.Text; 7 | 8 | namespace JiebaNet.Segmenter.Common 9 | { 10 | public static class FileExtension 11 | { 12 | public static string ReadEmbeddedAllLine(string path) 13 | { 14 | return ReadEmbeddedAllLine(path, Encoding.UTF8); 15 | } 16 | 17 | public static string ReadEmbeddedAllLine(string path,Encoding encoding) 18 | { 19 | var provider = new EmbeddedFileProvider(typeof(FileExtension).GetTypeInfo().Assembly); 20 | var fileInfo = provider.GetFileInfo(path); 21 | using (var sr = new StreamReader(fileInfo.CreateReadStream(), encoding)) 22 | { 23 | return sr.ReadToEnd(); 24 | } 25 | } 26 | 27 | public static List ReadEmbeddedAllLines(string path, Encoding encoding) 28 | { 29 | var provider = new EmbeddedFileProvider(typeof(FileExtension).GetTypeInfo().Assembly); 30 | var fileInfo = provider.GetFileInfo(path); 31 | List list = new List(); 32 | using (StreamReader streamReader = new StreamReader(fileInfo.CreateReadStream(), encoding)) 33 | { 34 | string item; 35 | while ((item = streamReader.ReadLine()) != null) 36 | { 37 | list.Add(item); 38 | } 39 | } 40 | return list; 41 | } 42 | 43 | public static List ReadEmbeddedAllLines(string path) 44 | { 45 | return ReadEmbeddedAllLines(path, Encoding.UTF8); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /Segmenter/Common/Trie.cs: 
-------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | 5 | namespace JiebaNet.Segmenter.Common 6 | { 7 | // Refer to: https://github.com/brianfromoregon/trie 8 | public class TrieNode 9 | { 10 | public char Char { get; set; } 11 | public int Frequency { get; set; } 12 | public Dictionary Children { get; set; } 13 | 14 | public TrieNode(char ch) 15 | { 16 | Char = ch; 17 | Frequency = 0; 18 | 19 | // TODO: or an empty dict? 20 | //Children = null; 21 | } 22 | 23 | public int Insert(string s, int pos, int freq = 1) 24 | { 25 | if (string.IsNullOrEmpty(s) || pos >= s.Length) 26 | { 27 | return 0; 28 | } 29 | 30 | if (Children == null) 31 | { 32 | Children = new Dictionary(); 33 | } 34 | 35 | var c = s[pos]; 36 | if (!Children.ContainsKey(c)) 37 | { 38 | Children[c] = new TrieNode(c); 39 | } 40 | 41 | var curNode = Children[c]; 42 | if (pos == s.Length - 1) 43 | { 44 | curNode.Frequency += freq; 45 | return curNode.Frequency; 46 | } 47 | 48 | return curNode.Insert(s, pos + 1, freq); 49 | } 50 | 51 | public TrieNode Search(string s, int pos) 52 | { 53 | if (string.IsNullOrEmpty(s)) 54 | { 55 | return null; 56 | } 57 | 58 | // if out of range or without any child nodes 59 | if (pos >= s.Length || Children == null) 60 | { 61 | return null; 62 | } 63 | // if reaches the last char of s, it's time to make the decision. 64 | if (pos == s.Length - 1) 65 | { 66 | return Children.ContainsKey(s[pos]) ? Children[s[pos]] : null; 67 | } 68 | // continue if necessary. 69 | return Children.ContainsKey(s[pos]) ? Children[s[pos]].Search(s, pos + 1) : null; 70 | } 71 | } 72 | 73 | public interface ITrie 74 | { 75 | //string BestMatch(string word, long maxTime); 76 | bool Contains(string word); 77 | int Frequency(string word); 78 | int Insert(string word, int freq = 1); 79 | //bool Remove(string word); 80 | int Count { get; } 81 | int TotalFrequency { get; } 82 | } 83 | 84 | public class Trie : ITrie 85 | { 86 | private static readonly char RootChar = '\0'; 87 | 88 | internal TrieNode Root; 89 | 90 | public int Count { get; private set; } 91 | public int TotalFrequency { get; private set; } 92 | 93 | public Trie() 94 | { 95 | Root = new TrieNode(RootChar); 96 | Count = 0; 97 | } 98 | 99 | public bool Contains(string word) 100 | { 101 | CheckWord(word); 102 | 103 | var node = Root.Search(word.Trim(), 0); 104 | return node.IsNotNull() && node.Frequency > 0; 105 | } 106 | 107 | public bool ContainsPrefix(string word) 108 | { 109 | CheckWord(word); 110 | 111 | var node = Root.Search(word.Trim(), 0); 112 | return node.IsNotNull(); 113 | } 114 | 115 | public int Frequency(string word) 116 | { 117 | CheckWord(word); 118 | 119 | var node = Root.Search(word.Trim(), 0); 120 | return node.IsNull() ? 0 : node.Frequency; 121 | } 122 | 123 | public int Insert(string word, int freq = 1) 124 | { 125 | CheckWord(word); 126 | 127 | var i = Root.Insert(word.Trim(), 0, freq); 128 | if (i > 0) 129 | { 130 | TotalFrequency += freq; 131 | Count++; 132 | } 133 | 134 | return i; 135 | } 136 | 137 | public IEnumerable ChildChars(string prefix) 138 | { 139 | var node = Root.Search(prefix.Trim(), 0); 140 | return node.IsNull() || node.Children.IsNull() ? 
null : node.Children.Select(p => p.Key); 141 | } 142 | 143 | private void CheckWord(string word) 144 | { 145 | if (string.IsNullOrWhiteSpace(word)) 146 | { 147 | throw new ArgumentException("word must not be null or whitespace"); 148 | } 149 | } 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /Segmenter/ConfigManager.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | 4 | namespace JiebaNet.Segmenter 5 | { 6 | public class ConfigManager 7 | { 8 | public static string ConfigFileBaseDir 9 | { 10 | get 11 | { 12 | var configFileDir = "Resources"; 13 | return configFileDir; 14 | } 15 | } 16 | 17 | public static string MainDictFile 18 | { 19 | get { return Path.Combine(ConfigFileBaseDir, "dict.txt"); } 20 | } 21 | 22 | public static string ProbTransFile 23 | { 24 | get { return Path.Combine(ConfigFileBaseDir, "prob_trans.json"); } 25 | } 26 | 27 | public static string ProbEmitFile 28 | { 29 | get { return Path.Combine(ConfigFileBaseDir, "prob_emit.json"); } 30 | } 31 | 32 | public static string PosProbStartFile 33 | { 34 | get { return Path.Combine(ConfigFileBaseDir, "pos_prob_start.json"); } 35 | } 36 | 37 | public static string PosProbTransFile 38 | { 39 | get { return Path.Combine(ConfigFileBaseDir, "pos_prob_trans.json"); } 40 | } 41 | 42 | public static string PosProbEmitFile 43 | { 44 | get { return Path.Combine(ConfigFileBaseDir, "pos_prob_emit.json"); } 45 | } 46 | 47 | public static string CharStateTabFile 48 | { 49 | get { return Path.Combine(ConfigFileBaseDir, "char_state_tab.json"); } 50 | } 51 | } 52 | } -------------------------------------------------------------------------------- /Segmenter/Constants.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | 4 | namespace JiebaNet.Segmenter 5 | { 6 | public class Constants 7 | { 8 | public static readonly double MinProb = -3.14e100; 9 | 10 | public static readonly List NounPos = new List() { "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz" }; 11 | public static readonly List VerbPos = new List() { "v", "vd", "vg", "vi", "vn", "vq" }; 12 | public static readonly List NounAndVerbPos = NounPos.Union(VerbPos).ToList(); 13 | public static readonly List IdiomPos = new List() { "i" }; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Segmenter/DefaultDictionary.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace JiebaNet.Segmenter 8 | { 9 | public class DefaultDictionary : Dictionary 10 | { 11 | public new TValue this[TKey key] 12 | { 13 | get 14 | { 15 | if (!ContainsKey(key)) 16 | { 17 | Add(key, default(TValue)); 18 | } 19 | return base[key]; 20 | } 21 | set { base[key] = value; } 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /Segmenter/FinalSeg/IFinalSeg.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | 4 | namespace JiebaNet.Segmenter.FinalSeg 5 | { 6 | public interface IFinalSeg 7 | { 8 | IEnumerable Cut(string sentence); 9 | } 10 | } -------------------------------------------------------------------------------- 
/Segmenter/FinalSeg/Viterbi.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Text.RegularExpressions; 7 | using JiebaNet.Segmenter.Common; 8 | using Newtonsoft.Json; 9 | 10 | namespace JiebaNet.Segmenter.FinalSeg 11 | { 12 | public class Viterbi : IFinalSeg 13 | { 14 | private static readonly Lazy Lazy = new Lazy(() => new Viterbi()); 15 | private static readonly char[] States = { 'B', 'M', 'E', 'S' }; 16 | 17 | private static readonly Regex RegexChinese = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled); 18 | private static readonly Regex RegexSkip = new Regex(@"(\d+\.\d+|[a-zA-Z0-9]+)", RegexOptions.Compiled); 19 | 20 | private static IDictionary> _emitProbs; 21 | private static IDictionary _startProbs; 22 | private static IDictionary> _transProbs; 23 | private static IDictionary _prevStatus; 24 | 25 | private Viterbi() 26 | { 27 | LoadModel(); 28 | } 29 | 30 | // TODO: synchronized 31 | public static Viterbi Instance 32 | { 33 | get { return Lazy.Value; } 34 | } 35 | 36 | public IEnumerable Cut(string sentence) 37 | { 38 | var tokens = new List(); 39 | foreach (var blk in RegexChinese.Split(sentence)) 40 | { 41 | if (RegexChinese.IsMatch(blk)) 42 | { 43 | tokens.AddRange(ViterbiCut(blk)); 44 | } 45 | else 46 | { 47 | var segments = RegexSkip.Split(blk).Where(seg => !string.IsNullOrEmpty(seg)); 48 | tokens.AddRange(segments); 49 | } 50 | } 51 | return tokens; 52 | } 53 | 54 | #region Private Helpers 55 | 56 | private void LoadModel() 57 | { 58 | var stopWatch = new Stopwatch(); 59 | stopWatch.Start(); 60 | 61 | _prevStatus = new Dictionary() 62 | { 63 | {'B', new []{'E', 'S'}}, 64 | {'M', new []{'M', 'B'}}, 65 | {'S', new []{'S', 'E'}}, 66 | {'E', new []{'B', 'M'}} 67 | }; 68 | 69 | _startProbs = new Dictionary() 70 | { 71 | {'B', -0.26268660809250016}, 72 | {'E', -3.14e+100}, 73 | {'M', -3.14e+100}, 74 | {'S', -1.4652633398537678} 75 | }; 76 | 77 | var transJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.ProbTransFile); 78 | _transProbs = JsonConvert.DeserializeObject>>(transJson); 79 | 80 | var emitJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.ProbEmitFile); 81 | _emitProbs = JsonConvert.DeserializeObject>>(emitJson); 82 | 83 | stopWatch.Stop(); 84 | Debug.WriteLine("model loading finished, time elapsed {0} ms.", stopWatch.ElapsedMilliseconds); 85 | } 86 | 87 | private IEnumerable ViterbiCut(string sentence) 88 | { 89 | var v = new List>(); 90 | IDictionary path = new Dictionary(); 91 | 92 | // Init weights and paths. 
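// v[i][state] is the best log-probability of any B/M/E/S tag sequence ending in 'state' at character i, and path[state] chains the states of that best sequence; position 0 below is seeded with start + emission log-probs, and each later position adds transition + emission while keeping the best predecessor.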
93 | v.Add(new Dictionary()); 94 | foreach (var state in States) 95 | { 96 | var emP = _emitProbs[state].GetDefault(sentence[0], Constants.MinProb); 97 | v[0][state] = _startProbs[state] + emP; 98 | path[state] = new Node(state, null); 99 | } 100 | 101 | // For each remaining char 102 | for (var i = 1; i < sentence.Length; ++i) 103 | { 104 | IDictionary vv = new Dictionary(); 105 | v.Add(vv); 106 | IDictionary newPath = new Dictionary(); 107 | foreach (var y in States) 108 | { 109 | var emp = _emitProbs[y].GetDefault(sentence[i], Constants.MinProb); 110 | 111 | Pair candidate = new Pair('\0', double.MinValue); 112 | foreach (var y0 in _prevStatus[y]) 113 | { 114 | var tranp = _transProbs[y0].GetDefault(y, Constants.MinProb); 115 | tranp = v[i - 1][y0] + tranp + emp; 116 | if (candidate.Freq <= tranp) 117 | { 118 | candidate.Freq = tranp; 119 | candidate.Key = y0; 120 | } 121 | } 122 | vv[y] = candidate.Freq; 123 | newPath[y] = new Node(y, path[candidate.Key]); 124 | } 125 | path = newPath; 126 | } 127 | 128 | var probE = v[sentence.Length - 1]['E']; 129 | var probS = v[sentence.Length - 1]['S']; 130 | var finalPath = probE < probS ? path['S'] : path['E']; 131 | 132 | var posList = new List(sentence.Length); 133 | while (finalPath != null) 134 | { 135 | posList.Add(finalPath.Value); 136 | finalPath = finalPath.Parent; 137 | } 138 | posList.Reverse(); 139 | 140 | var tokens = new List(); 141 | int begin = 0, next = 0; 142 | for (var i = 0; i < sentence.Length; i++) 143 | { 144 | var pos = posList[i]; 145 | if (pos == 'B') 146 | begin = i; 147 | else if (pos == 'E') 148 | { 149 | tokens.Add(sentence.Sub(begin, i + 1)); 150 | next = i + 1; 151 | } 152 | else if (pos == 'S') 153 | { 154 | tokens.Add(sentence.Sub(i, i + 1)); 155 | next = i + 1; 156 | } 157 | } 158 | if (next < sentence.Length) 159 | { 160 | tokens.Add(sentence.Substring(next)); 161 | } 162 | 163 | return tokens; 164 | } 165 | 166 | #endregion 167 | } 168 | } -------------------------------------------------------------------------------- /Segmenter/JiebaSegmenter.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Text.RegularExpressions; 7 | using JiebaNet.Segmenter.Common; 8 | using JiebaNet.Segmenter.FinalSeg; 9 | using System.IO; 10 | 11 | namespace JiebaNet.Segmenter 12 | { 13 | public class JiebaSegmenter 14 | { 15 | private static readonly WordDictionary WordDict = WordDictionary.Instance; 16 | private static readonly IFinalSeg FinalSeg = Viterbi.Instance; 17 | private static readonly ISet LoadedPath = new HashSet(); 18 | 19 | private static readonly object locker = new object(); 20 | 21 | internal IDictionary UserWordTagTab { get; set; } 22 | 23 | #region Regular Expressions 24 | 25 | internal static readonly Regex RegexChineseDefault = new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled); 26 | 27 | internal static readonly Regex RegexSkipDefault = new Regex(@"(\r\n|\s)", RegexOptions.Compiled); 28 | 29 | internal static readonly Regex RegexChineseCutAll = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled); 30 | internal static readonly Regex RegexSkipCutAll = new Regex(@"[^a-zA-Z0-9+#\n]", RegexOptions.Compiled); 31 | 32 | internal static readonly Regex RegexEnglishChars = new Regex(@"[a-zA-Z0-9]", RegexOptions.Compiled); 33 | 34 | internal static readonly Regex RegexUserDict = new Regex("^(?.+?)(? [0-9]+)?(? 
[a-z]+)?$", RegexOptions.Compiled); 35 | 36 | #endregion 37 | 38 | public JiebaSegmenter() 39 | { 40 | UserWordTagTab = new Dictionary(); 41 | } 42 | 43 | /// 44 | /// The main function that segments an entire sentence that contains 45 | /// Chinese characters into seperated words. 46 | /// 47 | /// The string to be segmented. 48 | /// Specify segmentation pattern. True for full pattern, False for accurate pattern. 49 | /// Whether to use the Hidden Markov Model. 50 | /// 51 | public IEnumerable Cut(string text, bool cutAll = false, bool hmm = true) 52 | { 53 | var reHan = RegexChineseDefault; 54 | var reSkip = RegexSkipDefault; 55 | Func> cutMethod = null; 56 | 57 | if (cutAll) 58 | { 59 | reHan = RegexChineseCutAll; 60 | reSkip = RegexSkipCutAll; 61 | } 62 | 63 | if (cutAll) 64 | { 65 | cutMethod = CutAll; 66 | } 67 | else if (hmm) 68 | { 69 | cutMethod = CutDag; 70 | } 71 | else 72 | { 73 | cutMethod = CutDagWithoutHmm; 74 | } 75 | 76 | return CutIt(text, cutMethod, reHan, reSkip, cutAll); 77 | } 78 | 79 | public IEnumerable Cut2(string text,bool cutAll=false,bool hmm=true) 80 | { 81 | var reHan = RegexChineseDefault; 82 | var reSkip = RegexSkipDefault; 83 | Func> cutMethod = null; 84 | 85 | if (cutAll) 86 | { 87 | reHan = RegexChineseCutAll; 88 | reSkip = RegexSkipCutAll; 89 | } 90 | 91 | if (cutAll) 92 | { 93 | cutMethod = CutAll; 94 | } 95 | else if (hmm) 96 | { 97 | cutMethod = CutDag; 98 | } 99 | else 100 | { 101 | cutMethod = CutDagWithoutHmm; 102 | } 103 | 104 | return CutIt2(text, cutMethod, reHan, reSkip, cutAll); 105 | } 106 | 107 | public IEnumerable CutForSearch(string text, bool hmm = true) 108 | { 109 | var result = new List(); 110 | 111 | var words = Cut(text, hmm: hmm); 112 | foreach (var w in words) 113 | { 114 | if (w.Length > 2) 115 | { 116 | foreach (var i in Enumerable.Range(0, w.Length - 1)) 117 | { 118 | var gram2 = w.Substring(i, 2); 119 | if (WordDict.ContainsWord(gram2)) 120 | { 121 | result.Add(gram2); 122 | } 123 | } 124 | } 125 | 126 | if (w.Length > 3) 127 | { 128 | foreach (var i in Enumerable.Range(0, w.Length - 2)) 129 | { 130 | var gram3 = w.Substring(i, 3); 131 | if (WordDict.ContainsWord(gram3)) 132 | { 133 | result.Add(gram3); 134 | } 135 | } 136 | } 137 | 138 | result.Add(w); 139 | } 140 | 141 | return result; 142 | } 143 | 144 | public IEnumerable Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true) 145 | { 146 | var result = new List(); 147 | 148 | if (mode == TokenizerMode.Default) 149 | { 150 | foreach (var w in Cut2(text, hmm: hmm)) 151 | { 152 | var width = w.value.Length; 153 | result.Add(new Token(w.value, w.position, w.position + width)); 154 | 155 | } 156 | } 157 | else 158 | { 159 | //var xx = Cut2(text, hmm: hmm); 160 | foreach (var w in Cut2(text, hmm: hmm)) 161 | { 162 | var width = w.value.Length; 163 | if (width > 2) 164 | { 165 | for (var i = 0; i < width - 1; i++) 166 | { 167 | var gram2 = w.value.Substring(i, 2); 168 | if (WordDict.ContainsWord(gram2)) 169 | { 170 | result.Add(new Token(gram2, w.position + i, w.position + i + 2)); 171 | } 172 | } 173 | } 174 | if (width > 3) 175 | { 176 | for (var i = 0; i < width - 2; i++) 177 | { 178 | var gram3 = w.value.Substring(i, 3); 179 | if (WordDict.ContainsWord(gram3)) 180 | { 181 | result.Add(new Token(gram3, w.position + i, w.position + i + 3)); 182 | } 183 | } 184 | } 185 | 186 | result.Add(new Token(w.value, w.position, w.position + width)); 187 | 188 | } 189 | } 190 | 191 | return result; 192 | } 193 | 194 | #region Internal Cut Methods 195 | 196 | 
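// GetDag maps every start index k to the end indexes i for which sentence[k..i] is a dictionary word (frequency > 0); Calc then walks that DAG from right to left, keeping for each position the split that maximises log(freq) - log(total) + the best score of the remainder, i.e. the maximum-probability segmentation path.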
internal IDictionary> GetDag(string sentence) 197 | { 198 | var dag = new Dictionary>(); 199 | var trie = WordDict.Trie; 200 | 201 | var N = sentence.Length; 202 | for (var k = 0; k < sentence.Length; k++) 203 | { 204 | var templist = new List(); 205 | var i = k; 206 | var frag = sentence.Substring(k, 1); 207 | while (i < N && trie.ContainsKey(frag)) 208 | { 209 | if (trie[frag] > 0) 210 | { 211 | templist.Add(i); 212 | } 213 | 214 | i++; 215 | // TODO: 216 | if (i < N) 217 | { 218 | frag = sentence.Sub(k, i + 1); 219 | } 220 | } 221 | if (templist.Count == 0) 222 | { 223 | templist.Add(k); 224 | } 225 | dag[k] = templist; 226 | } 227 | 228 | return dag; 229 | } 230 | 231 | internal IDictionary> Calc(string sentence, IDictionary> dag) 232 | { 233 | var n = sentence.Length; 234 | var route = new Dictionary>(); 235 | route[n] = new Pair(0, 0.0); 236 | 237 | var logtotal = Math.Log(WordDict.Total); 238 | for (var i = n - 1; i > -1; i--) 239 | { 240 | var candidate = new Pair(-1, double.MinValue); 241 | foreach (int x in dag[i]) 242 | { 243 | var freq = Math.Log(WordDict.GetFreqOrDefault(sentence.Sub(i, x + 1))) - logtotal + route[x + 1].Freq; 244 | if (candidate.Freq < freq) 245 | { 246 | candidate.Freq = freq; 247 | candidate.Key = x; 248 | } 249 | } 250 | route[i] = candidate; 251 | } 252 | return route; 253 | } 254 | 255 | internal IEnumerable CutAll(string sentence) 256 | { 257 | var dag = GetDag(sentence); 258 | 259 | var words = new List(); 260 | var lastPos = -1; 261 | 262 | foreach (var pair in dag) 263 | { 264 | var k = pair.Key; 265 | var nexts = pair.Value; 266 | if (nexts.Count == 1 && k > lastPos) 267 | { 268 | words.Add(sentence.Substring(k, nexts[0] + 1 - k)); 269 | lastPos = nexts[0]; 270 | } 271 | else 272 | { 273 | foreach (var j in nexts) 274 | { 275 | if (j > k) 276 | { 277 | words.Add(sentence.Substring(k, j + 1 - k)); 278 | lastPos = j; 279 | } 280 | } 281 | } 282 | } 283 | 284 | return words; 285 | } 286 | 287 | internal IEnumerable CutDag(string sentence) 288 | { 289 | var dag = GetDag(sentence); 290 | var route = Calc(sentence, dag); 291 | 292 | var tokens = new List(); 293 | 294 | var x = 0; 295 | var n = sentence.Length; 296 | var buf = string.Empty; 297 | while (x < n) 298 | { 299 | var y = route[x].Key + 1; 300 | var w = sentence.Substring(x, y - x); 301 | if (y - x == 1) 302 | { 303 | buf += w; 304 | } 305 | else 306 | { 307 | if (buf.Length > 0) 308 | { 309 | AddBufferToWordList(tokens, buf); 310 | buf = string.Empty; 311 | } 312 | tokens.Add(w); 313 | } 314 | x = y; 315 | } 316 | 317 | if (buf.Length > 0) 318 | { 319 | AddBufferToWordList(tokens, buf); 320 | } 321 | 322 | return tokens; 323 | } 324 | 325 | internal IEnumerable CutDagWithoutHmm(string sentence) 326 | { 327 | var dag = GetDag(sentence); 328 | var route = Calc(sentence, dag); 329 | 330 | var words = new List(); 331 | 332 | var x = 0; 333 | string buf = string.Empty; 334 | var N = sentence.Length; 335 | 336 | var y = -1; 337 | while (x < N) 338 | { 339 | y = route[x].Key + 1; 340 | var l_word = sentence.Substring(x, y - x); 341 | if (RegexEnglishChars.IsMatch(l_word) && l_word.Length == 1) 342 | { 343 | buf += l_word; 344 | x = y; 345 | } 346 | else 347 | { 348 | if (buf.Length > 0) 349 | { 350 | words.Add(buf); 351 | buf = string.Empty; 352 | } 353 | words.Add(l_word); 354 | x = y; 355 | } 356 | } 357 | 358 | if (buf.Length > 0) 359 | { 360 | words.Add(buf); 361 | } 362 | 363 | return words; 364 | } 365 | 366 | internal IEnumerable CutIt2(string text, Func> cutMethod, 367 | Regex reHan, Regex 
reSkip, bool cutAll) 368 | { 369 | var result = new List(); 370 | var blocks = reHan.Split(text); 371 | var start = 0; 372 | foreach(var blk in blocks) 373 | { 374 | if(string.IsNullOrWhiteSpace(blk)) 375 | { 376 | start += blk.Length; 377 | continue; 378 | } 379 | if(reHan.IsMatch(blk)) 380 | { 381 | foreach(var word in cutMethod(blk)) 382 | { 383 | result.Add(new WordInfo(word,start)); 384 | start += word.Length; 385 | } 386 | } 387 | else 388 | { 389 | var tmp = reSkip.Split(blk); 390 | foreach(var x in tmp) 391 | { 392 | if(reSkip.IsMatch(x)) 393 | { 394 | result.Add(new WordInfo(x,start)); 395 | start += x.Length; 396 | } 397 | else if(!cutAll) 398 | { 399 | foreach(var ch in x) 400 | { 401 | result.Add(new WordInfo(ch.ToString(),start)); 402 | start += ch.ToString().Length; 403 | } 404 | } 405 | else{ 406 | 407 | result.Add(new WordInfo(x,start)); 408 | start += x.Length; 409 | 410 | } 411 | } 412 | } 413 | } 414 | 415 | return result; 416 | } 417 | 418 | internal IEnumerable CutIt(string text, Func> cutMethod, 419 | Regex reHan, Regex reSkip, bool cutAll) 420 | { 421 | var result = new List(); 422 | var blocks = reHan.Split(text); 423 | foreach (var blk in blocks) 424 | { 425 | if (string.IsNullOrWhiteSpace(blk)) 426 | { 427 | continue; 428 | } 429 | 430 | if (reHan.IsMatch(blk)) 431 | { 432 | foreach (var word in cutMethod(blk)) 433 | { 434 | result.Add(word); 435 | } 436 | } 437 | else 438 | { 439 | var tmp = reSkip.Split(blk); 440 | foreach (var x in tmp) 441 | { 442 | if (reSkip.IsMatch(x)) 443 | { 444 | result.Add(x); 445 | } 446 | else if (!cutAll) 447 | { 448 | foreach (var ch in x) 449 | { 450 | result.Add(ch.ToString()); 451 | } 452 | } 453 | else 454 | { 455 | result.Add(x); 456 | } 457 | } 458 | } 459 | } 460 | 461 | return result; 462 | } 463 | 464 | #endregion 465 | 466 | #region Extend Main Dict 467 | 468 | /// 469 | /// Loads user dictionaries. 470 | /// 471 | /// 472 | public void LoadUserDict(string userDictFile) 473 | { 474 | var dictFullPath = Path.GetFullPath(userDictFile); 475 | Debug.WriteLine("Initializing user dictionary: " + userDictFile); 476 | 477 | lock (locker) 478 | { 479 | if (LoadedPath.Contains(dictFullPath)) 480 | return; 481 | 482 | try 483 | { 484 | var startTime = DateTime.Now.Millisecond; 485 | 486 | var lines = FileExtension.ReadEmbeddedAllLines(dictFullPath); 487 | foreach (var line in lines) 488 | { 489 | if (string.IsNullOrWhiteSpace(line)) 490 | { 491 | continue; 492 | } 493 | 494 | var tokens = RegexUserDict.Match(line.Trim()).Groups; 495 | var word = tokens["word"].Value.Trim(); 496 | var freq = tokens["freq"].Value.Trim(); 497 | var tag = tokens["tag"].Value.Trim(); 498 | 499 | var actualFreq = freq.Length > 0 ? 
int.Parse(freq) : 0; 500 | AddWord(word, actualFreq, tag); 501 | } 502 | 503 | Debug.WriteLine("user dict '{0}' load finished, time elapsed {1} ms", 504 | dictFullPath, DateTime.Now.Millisecond - startTime); 505 | } 506 | catch (IOException e) 507 | { 508 | Debug.Fail(string.Format("'{0}' load failure, reason: {1}", dictFullPath, e.Message)); 509 | } 510 | catch (FormatException fe) 511 | { 512 | Debug.Fail(fe.Message); 513 | } 514 | } 515 | } 516 | 517 | public void AddWord(string word, int freq = 0, string tag = null) 518 | { 519 | if (freq <= 0) 520 | { 521 | freq = WordDict.SuggestFreq(word, Cut(word, hmm: false)); 522 | } 523 | WordDict.AddWord(word, freq); 524 | 525 | // Add user word tag of POS 526 | if (!string.IsNullOrEmpty(tag)) 527 | { 528 | UserWordTagTab[word] = tag; 529 | } 530 | } 531 | 532 | public void DeleteWord(string word) 533 | { 534 | WordDict.DeleteWord(word); 535 | } 536 | 537 | #endregion 538 | 539 | #region Private Helpers 540 | 541 | private void AddBufferToWordList(List words, string buf) 542 | { 543 | if (buf.Length == 1) 544 | { 545 | words.Add(buf); 546 | } 547 | else 548 | { 549 | if (!WordDict.ContainsWord(buf)) 550 | { 551 | var tokens = FinalSeg.Cut(buf); 552 | words.AddRange(tokens); 553 | } 554 | else 555 | { 556 | words.AddRange(buf.Select(ch => ch.ToString())); 557 | } 558 | } 559 | } 560 | 561 | #endregion 562 | } 563 | 564 | public enum TokenizerMode 565 | { 566 | Default, 567 | Search 568 | } 569 | 570 | 571 | } 572 | -------------------------------------------------------------------------------- /Segmenter/Node.cs: -------------------------------------------------------------------------------- 1 | namespace JiebaNet.Segmenter 2 | { 3 | public class Node 4 | { 5 | public char Value { get; private set; } 6 | public Node Parent { get; private set; } 7 | 8 | public Node(char value, Node parent) 9 | { 10 | Value = value; 11 | Parent = parent; 12 | } 13 | } 14 | } -------------------------------------------------------------------------------- /Segmenter/Pair.cs: -------------------------------------------------------------------------------- 1 | namespace JiebaNet.Segmenter 2 | { 3 | public class Pair 4 | { 5 | public TKey Key { get;set; } 6 | public double Freq { get; set; } 7 | 8 | public Pair(TKey key, double freq) 9 | { 10 | Key = key; 11 | Freq = freq; 12 | } 13 | 14 | public override string ToString() 15 | { 16 | return "Candidate [Key=" + Key + ", Freq=" + Freq + "]"; 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Segmenter/PosSeg/Pair.cs: -------------------------------------------------------------------------------- 1 | namespace JiebaNet.Segmenter.PosSeg 2 | { 3 | public class Pair 4 | { 5 | public string Word { get; set; } 6 | public string Flag { get; set; } 7 | public Pair(string word, string flag) 8 | { 9 | Word = word; 10 | Flag = flag; 11 | } 12 | 13 | public override string ToString() 14 | { 15 | return string.Format("{0}/{1}", Word, Flag); 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /Segmenter/PosSeg/PosSegmenter.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Text.RegularExpressions; 7 | using JiebaNet.Segmenter.Common; 8 | 9 | namespace JiebaNet.Segmenter.PosSeg 10 | { 11 | public class PosSegmenter 12 | { 13 | private static 
readonly WordDictionary WordDict = WordDictionary.Instance; 14 | private static readonly Viterbi PosSeg = Viterbi.Instance; 15 | 16 | // TODO: 17 | private static readonly object locker = new object(); 18 | 19 | #region Regular Expressions 20 | 21 | internal static readonly Regex RegexChineseInternal = new Regex(@"([\u4E00-\u9FD5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled); 22 | internal static readonly Regex RegexSkipInternal = new Regex(@"(\r\n|\s)", RegexOptions.Compiled); 23 | 24 | internal static readonly Regex RegexChineseDetail = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled); 25 | internal static readonly Regex RegexSkipDetail = new Regex(@"([\.0-9]+|[a-zA-Z0-9]+)", RegexOptions.Compiled); 26 | 27 | internal static readonly Regex RegexEnglishWords = new Regex(@"[a-zA-Z0-9]+", RegexOptions.Compiled); 28 | internal static readonly Regex RegexNumbers = new Regex(@"[\.0-9]+", RegexOptions.Compiled); 29 | 30 | internal static readonly Regex RegexEnglishChar = new Regex(@"^[a-zA-Z0-9]$", RegexOptions.Compiled); 31 | 32 | #endregion 33 | 34 | private static IDictionary _wordTagTab; 35 | 36 | static PosSegmenter() 37 | { 38 | LoadWordTagTab(); 39 | } 40 | 41 | private static void LoadWordTagTab() 42 | { 43 | try 44 | { 45 | _wordTagTab = new Dictionary(); 46 | var lines = FileExtension.ReadEmbeddedAllLines(ConfigManager.MainDictFile); 47 | foreach (var line in lines) 48 | { 49 | var tokens = line.Split(' '); 50 | if (tokens.Length < 2) 51 | { 52 | Debug.Fail(string.Format("Invalid line: {0}", line)); 53 | continue; 54 | } 55 | 56 | var word = tokens[0]; 57 | var tag = tokens[2]; 58 | 59 | _wordTagTab[word] = tag; 60 | } 61 | } 62 | catch (System.IO.IOException e) 63 | { 64 | Debug.Fail(string.Format("Word tag table load failure, reason: {0}", e.Message)); 65 | } 66 | catch (FormatException fe) 67 | { 68 | Debug.Fail(fe.Message); 69 | } 70 | } 71 | 72 | private JiebaSegmenter _segmenter; 73 | 74 | public PosSegmenter() 75 | { 76 | _segmenter = new JiebaSegmenter(); 77 | } 78 | 79 | public PosSegmenter(JiebaSegmenter segmenter) 80 | { 81 | _segmenter = segmenter; 82 | } 83 | 84 | private void CheckNewUserWordTags() 85 | { 86 | if (_segmenter.UserWordTagTab.IsNotEmpty()) 87 | { 88 | _wordTagTab.Update(_segmenter.UserWordTagTab); 89 | _segmenter.UserWordTagTab = new Dictionary(); 90 | } 91 | } 92 | 93 | public IEnumerable Cut(string text, bool hmm = true) 94 | { 95 | return CutInternal(text, hmm); 96 | } 97 | 98 | #region Internal Cut Methods 99 | 100 | internal IEnumerable CutInternal(string text, bool hmm = true) 101 | { 102 | CheckNewUserWordTags(); 103 | 104 | var blocks = RegexChineseInternal.Split(text); 105 | Func> cutMethod = null; 106 | if (hmm) 107 | { 108 | cutMethod = CutDag; 109 | } 110 | else 111 | { 112 | cutMethod = CutDagWithoutHmm; 113 | } 114 | 115 | var tokens = new List(); 116 | foreach (var blk in blocks) 117 | { 118 | if (RegexChineseInternal.IsMatch(blk)) 119 | { 120 | tokens.AddRange(cutMethod(blk)); 121 | } 122 | else 123 | { 124 | var tmp = RegexSkipInternal.Split(blk); 125 | foreach (var x in tmp) 126 | { 127 | if (RegexSkipInternal.IsMatch(x)) 128 | { 129 | tokens.Add(new Pair(x, "x")); 130 | } 131 | else 132 | { 133 | foreach (var xx in x) 134 | { 135 | // TODO: each char? 
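// Every char of a non-Chinese run is emitted as its own pair: digits are tagged 'm' (numeral), chars of an English-looking run 'eng' (note the match below tests the whole run x, not the single char xxs), and anything else 'x'.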
136 | var xxs = xx.ToString(); 137 | if (RegexNumbers.IsMatch(xxs)) 138 | { 139 | tokens.Add(new Pair(xxs, "m")); 140 | } 141 | else if (RegexEnglishWords.IsMatch(x)) 142 | { 143 | tokens.Add(new Pair(xxs, "eng")); 144 | } 145 | else 146 | { 147 | tokens.Add(new Pair(xxs, "x")); 148 | } 149 | } 150 | } 151 | } 152 | } 153 | } 154 | 155 | return tokens; 156 | } 157 | 158 | internal IEnumerable CutDag(string sentence) 159 | { 160 | var dag = _segmenter.GetDag(sentence); 161 | var route = _segmenter.Calc(sentence, dag); 162 | 163 | var tokens = new List(); 164 | 165 | var x = 0; 166 | var n = sentence.Length; 167 | var buf = string.Empty; 168 | while (x < n) 169 | { 170 | var y = route[x].Key + 1; 171 | var w = sentence.Substring(x, y - x); 172 | if (y - x == 1) 173 | { 174 | buf += w; 175 | } 176 | else 177 | { 178 | if (buf.Length > 0) 179 | { 180 | AddBufferToWordList(tokens, buf); 181 | buf = string.Empty; 182 | } 183 | tokens.Add(new Pair(w, _wordTagTab.GetDefault(w, "x"))); 184 | } 185 | x = y; 186 | } 187 | 188 | if (buf.Length > 0) 189 | { 190 | AddBufferToWordList(tokens, buf); 191 | } 192 | 193 | return tokens; 194 | } 195 | 196 | internal IEnumerable CutDagWithoutHmm(string sentence) 197 | { 198 | var dag = _segmenter.GetDag(sentence); 199 | var route = _segmenter.Calc(sentence, dag); 200 | 201 | var tokens = new List(); 202 | 203 | var x = 0; 204 | var buf = string.Empty; 205 | var n = sentence.Length; 206 | 207 | var y = -1; 208 | while (x < n) 209 | { 210 | y = route[x].Key + 1; 211 | var w = sentence.Substring(x, y - x); 212 | // TODO: char or word? 213 | if (RegexEnglishChar.IsMatch(w)) 214 | { 215 | buf += w; 216 | x = y; 217 | } 218 | else 219 | { 220 | if (buf.Length > 0) 221 | { 222 | tokens.Add(new Pair(buf, "eng")); 223 | buf = string.Empty; 224 | } 225 | tokens.Add(new Pair(w, _wordTagTab.GetDefault(w, "x"))); 226 | x = y; 227 | } 228 | } 229 | 230 | if (buf.Length > 0) 231 | { 232 | tokens.Add(new Pair(buf, "eng")); 233 | } 234 | 235 | return tokens; 236 | } 237 | 238 | internal IEnumerable CutDetail(string text) 239 | { 240 | var tokens = new List(); 241 | var blocks = RegexChineseDetail.Split(text); 242 | foreach (var blk in blocks) 243 | { 244 | if (RegexChineseDetail.IsMatch(blk)) 245 | { 246 | tokens.AddRange(PosSeg.Cut(blk)); 247 | } 248 | else 249 | { 250 | var tmp = RegexSkipDetail.Split(blk); 251 | foreach (var x in tmp) 252 | { 253 | if (!string.IsNullOrWhiteSpace(x)) 254 | { 255 | if (RegexNumbers.IsMatch(x)) 256 | { 257 | tokens.Add(new Pair(x, "m")); 258 | } 259 | else if(RegexEnglishWords.IsMatch(x)) 260 | { 261 | tokens.Add(new Pair(x, "eng")); 262 | } 263 | else 264 | { 265 | tokens.Add(new Pair(x, "x")); 266 | } 267 | } 268 | } 269 | } 270 | } 271 | 272 | return tokens; 273 | } 274 | 275 | #endregion 276 | 277 | #region Private Helpers 278 | 279 | private void AddBufferToWordList(List words, string buf) 280 | { 281 | if (buf.Length == 1) 282 | { 283 | words.Add(new Pair(buf, _wordTagTab.GetDefault(buf, "x"))); 284 | } 285 | else 286 | { 287 | if (!WordDict.ContainsWord(buf)) 288 | { 289 | var tokens = CutDetail(buf); 290 | words.AddRange(tokens); 291 | } 292 | else 293 | { 294 | words.AddRange(buf.Select(ch => new Pair(ch.ToString(), "x"))); 295 | } 296 | } 297 | } 298 | 299 | #endregion 300 | } 301 | } -------------------------------------------------------------------------------- /Segmenter/PosSeg/Viterbi.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | 
using System.Linq; 4 | using JiebaNet.Segmenter.Common; 5 | using Newtonsoft.Json; 6 | 7 | namespace JiebaNet.Segmenter.PosSeg 8 | { 9 | public class Viterbi 10 | { 11 | private static readonly Lazy Lazy = new Lazy(() => new Viterbi()); 12 | 13 | private static IDictionary _startProbs; 14 | private static IDictionary> _transProbs; 15 | private static IDictionary> _emitProbs; 16 | private static IDictionary> _stateTab; 17 | 18 | private Viterbi() 19 | { 20 | LoadModel(); 21 | } 22 | 23 | // TODO: synchronized 24 | public static Viterbi Instance 25 | { 26 | get { return Lazy.Value; } 27 | } 28 | 29 | public IEnumerable Cut(string sentence) 30 | { 31 | var probPosList = ViterbiCut(sentence); 32 | var posList = probPosList.Item2; 33 | 34 | var tokens = new List(); 35 | int begin = 0, next = 0; 36 | for (var i = 0; i < sentence.Length; i++) 37 | { 38 | var parts = posList[i].Split('-'); 39 | var charState = parts[0][0]; 40 | var pos = parts[1]; 41 | if (charState == 'B') 42 | begin = i; 43 | else if (charState == 'E') 44 | { 45 | tokens.Add(new Pair(sentence.Sub(begin, i + 1), pos)); 46 | next = i + 1; 47 | } 48 | else if (charState == 'S') 49 | { 50 | tokens.Add(new Pair(sentence.Sub(i, i + 1), pos)); 51 | next = i + 1; 52 | } 53 | } 54 | if (next < sentence.Length) 55 | { 56 | tokens.Add(new Pair(sentence.Substring(next), posList[next].Split('-')[1])); 57 | } 58 | 59 | return tokens; 60 | } 61 | 62 | #region Private Helpers 63 | 64 | private static void LoadModel() 65 | { 66 | var startJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.PosProbStartFile); 67 | _startProbs = JsonConvert.DeserializeObject>(startJson); 68 | 69 | var transJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.PosProbTransFile); 70 | _transProbs = JsonConvert.DeserializeObject>>(transJson); 71 | 72 | var emitJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.PosProbEmitFile); 73 | _emitProbs = JsonConvert.DeserializeObject>>(emitJson); 74 | 75 | var tabJson = FileExtension.ReadEmbeddedAllLine(ConfigManager.CharStateTabFile); 76 | _stateTab = JsonConvert.DeserializeObject>>(tabJson); 77 | } 78 | 79 | // TODO: change sentence to obs? 80 | private Tuple> ViterbiCut(string sentence) 81 | { 82 | var v = new List>(); 83 | var memPath = new List>(); 84 | 85 | var allStates = _transProbs.Keys.ToList(); 86 | 87 | // Init weights and paths. 
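// Same dynamic programming as FinalSeg.Viterbi, but the states are combined 'position-POS' labels such as 'B-n' or 'S-v', and only the states the char state table allows for the first character are seeded, which prunes the search space considerably.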
88 | v.Add(new Dictionary()); 89 | memPath.Add(new Dictionary()); 90 | foreach (var state in _stateTab.GetDefault(sentence[0], allStates)) 91 | { 92 | var emP = _emitProbs[state].GetDefault(sentence[0], Constants.MinProb); 93 | v[0][state] = _startProbs[state] + emP; 94 | memPath[0][state] = string.Empty; 95 | } 96 | 97 | // For each remaining char 98 | for (var i = 1; i < sentence.Length; ++i) 99 | { 100 | v.Add(new Dictionary()); 101 | memPath.Add(new Dictionary()); 102 | 103 | var prevStates = memPath[i - 1].Keys.Where(k => _transProbs[k].Count > 0); 104 | var curPossibleStates = new HashSet(prevStates.SelectMany(s => _transProbs[s].Keys)); 105 | 106 | IEnumerable obsStates = _stateTab.GetDefault(sentence[i], allStates); 107 | obsStates = curPossibleStates.Intersect(obsStates); 108 | 109 | if (!obsStates.Any()) 110 | { 111 | if (curPossibleStates.Count > 0) 112 | { 113 | obsStates = curPossibleStates; 114 | } 115 | else 116 | { 117 | obsStates = allStates; 118 | } 119 | } 120 | 121 | foreach (var y in obsStates) 122 | { 123 | var emp = _emitProbs[y].GetDefault(sentence[i], Constants.MinProb); 124 | 125 | var prob = double.MinValue; 126 | var state = string.Empty; 127 | 128 | foreach (var y0 in prevStates) 129 | { 130 | var tranp = _transProbs[y0].GetDefault(y, double.MinValue); 131 | tranp = v[i - 1][y0] + tranp + emp; 132 | // TODO: compare two very small values; 133 | // TODO: how to deal with negative infinity 134 | if (prob < tranp || 135 | (prob == tranp && string.Compare(state, y0, StringComparison.CurrentCultureIgnoreCase) < 0)) 136 | { 137 | prob = tranp; 138 | state = y0; 139 | } 140 | } 141 | v[i][y] = prob; 142 | memPath[i][y] = state; 143 | } 144 | } 145 | 146 | var vLast = v.Last(); 147 | var last = memPath.Last().Keys.Select(y => new {State = y, Prob = vLast[y]}); 148 | var endProb = double.MinValue; 149 | var endState = string.Empty; 150 | foreach (var endPoint in last) 151 | { 152 | // TODO: compare two very small values; 153 | if (endProb < endPoint.Prob || 154 | (endProb == endPoint.Prob && String.Compare(endState, endPoint.State, StringComparison.CurrentCultureIgnoreCase) < 0)) 155 | { 156 | endProb = endPoint.Prob; 157 | endState = endPoint.State; 158 | } 159 | } 160 | 161 | var route = new string[sentence.Length]; 162 | var n = sentence.Length - 1; 163 | var curState = endState; 164 | while(n >= 0) 165 | { 166 | route[n] = curState; 167 | curState = memPath[n][curState]; 168 | n--; 169 | } 170 | 171 | return new Tuple>(endProb, route.ToList()); 172 | } 173 | 174 | #endregion 175 | } 176 | } -------------------------------------------------------------------------------- /Segmenter/Resources/pos_prob_start.json: -------------------------------------------------------------------------------- 1 | { 2 | "E-e": -3.14e+100, 3 | "E-d": -3.14e+100, 4 | "E-g": -3.14e+100, 5 | "E-f": -3.14e+100, 6 | "E-a": -3.14e+100, 7 | "E-c": -3.14e+100, 8 | "E-b": -3.14e+100, 9 | "E-m": -3.14e+100, 10 | "S-rg": -10.275268591948773, 11 | "E-o": -3.14e+100, 12 | "E-n": -3.14e+100, 13 | "E-i": -3.14e+100, 14 | "E-h": -3.14e+100, 15 | "E-k": -3.14e+100, 16 | "E-j": -3.14e+100, 17 | "E-u": -3.14e+100, 18 | "E-t": -3.14e+100, 19 | "E-w": -3.14e+100, 20 | "E-v": -3.14e+100, 21 | "E-q": -3.14e+100, 22 | "E-p": -3.14e+100, 23 | "E-s": -3.14e+100, 24 | "M-bg": -3.14e+100, 25 | "M-uj": -3.14e+100, 26 | "E-y": -3.14e+100, 27 | "E-x": -3.14e+100, 28 | "E-z": -3.14e+100, 29 | "B-uz": -3.14e+100, 30 | "S-d": -3.903919764181873, 31 | "M-rg": -3.14e+100, 32 | "E-nt": -3.14e+100, 33 | "B-d": 
-3.9750475297585357, 34 | "B-uv": -3.14e+100, 35 | "E-vi": -3.14e+100, 36 | "B-mq": -6.78695300139688, 37 | "M-rr": -3.14e+100, 38 | "S-ag": -6.954113917960154, 39 | "M-jn": -3.14e+100, 40 | "E-l": -3.14e+100, 41 | "M-rz": -3.14e+100, 42 | "B-ud": -3.14e+100, 43 | "S-an": -12.84021794941031, 44 | "B-qg": -3.14e+100, 45 | "B-ug": -3.14e+100, 46 | "M-y": -3.14e+100, 47 | "S-qg": -3.14e+100, 48 | "S-z": -3.14e+100, 49 | "S-y": -6.1970794699489575, 50 | "S-x": -8.427419656069674, 51 | "S-w": -3.14e+100, 52 | "S-v": -3.053292303412302, 53 | "S-u": -6.940320595827818, 54 | "S-t": -3.14e+100, 55 | "B-nrt": -4.985642733519195, 56 | "S-r": -2.7635336784127853, 57 | "S-q": -4.888658618255058, 58 | "M-zg": -3.14e+100, 59 | "S-o": -8.464460927750023, 60 | "S-n": -3.8551483897645107, 61 | "B-zg": -3.14e+100, 62 | "S-l": -3.14e+100, 63 | "S-k": -6.940320595827818, 64 | "S-in": -3.14e+100, 65 | "S-i": -3.14e+100, 66 | "S-h": -8.650563207383884, 67 | "S-g": -6.507826815331734, 68 | "B-f": -5.491630418482717, 69 | "S-e": -5.942513006281674, 70 | "M-en": -3.14e+100, 71 | "S-c": -4.786966795861212, 72 | "S-b": -6.472888763970454, 73 | "S-a": -3.9025396831295227, 74 | "B-g": -3.14e+100, 75 | "B-b": -5.018374362109218, 76 | "B-c": -3.423880184954888, 77 | "M-ug": -3.14e+100, 78 | "B-a": -4.762305214596967, 79 | "E-qe": -3.14e+100, 80 | "M-x": -3.14e+100, 81 | "E-nz": -3.14e+100, 82 | "M-z": -3.14e+100, 83 | "M-u": -3.14e+100, 84 | "B-k": -3.14e+100, 85 | "M-w": -3.14e+100, 86 | "B-jn": -3.14e+100, 87 | "S-yg": -13.533365129970255, 88 | "B-o": -8.433498702146057, 89 | "B-l": -4.905883584659895, 90 | "B-m": -3.6524299819046386, 91 | "M-m": -3.14e+100, 92 | "M-l": -3.14e+100, 93 | "M-o": -3.14e+100, 94 | "M-n": -3.14e+100, 95 | "M-i": -3.14e+100, 96 | "M-h": -3.14e+100, 97 | "B-t": -3.3647479094528574, 98 | "M-ul": -3.14e+100, 99 | "B-z": -7.045681111485645, 100 | "M-d": -3.14e+100, 101 | "M-mg": -3.14e+100, 102 | "B-y": -9.844485675856319, 103 | "M-a": -3.14e+100, 104 | "S-nrt": -3.14e+100, 105 | "M-c": -3.14e+100, 106 | "M-uz": -3.14e+100, 107 | "E-mg": -3.14e+100, 108 | "B-i": -6.1157847275557105, 109 | "M-b": -3.14e+100, 110 | "E-uz": -3.14e+100, 111 | "B-n": -1.6966257797548328, 112 | "E-uv": -3.14e+100, 113 | "M-ud": -3.14e+100, 114 | "M-p": -3.14e+100, 115 | "E-ul": -3.14e+100, 116 | "E-mq": -3.14e+100, 117 | "M-s": -3.14e+100, 118 | "M-yg": -3.14e+100, 119 | "E-uj": -3.14e+100, 120 | "E-ud": -3.14e+100, 121 | "S-ln": -3.14e+100, 122 | "M-r": -3.14e+100, 123 | "E-ng": -3.14e+100, 124 | "B-r": -3.4098187790818413, 125 | "E-en": -3.14e+100, 126 | "M-qg": -3.14e+100, 127 | "B-s": -5.522673590839954, 128 | "S-rr": -3.14e+100, 129 | "B-p": -4.200984132085048, 130 | "B-dg": -3.14e+100, 131 | "M-uv": -3.14e+100, 132 | "S-zg": -3.14e+100, 133 | "B-v": -2.6740584874265685, 134 | "S-tg": -6.272842531880403, 135 | "B-w": -3.14e+100, 136 | "B-e": -8.563551830394255, 137 | "M-k": -3.14e+100, 138 | "M-j": -3.14e+100, 139 | "B-df": -8.888974230828882, 140 | "M-e": -3.14e+100, 141 | "E-tg": -3.14e+100, 142 | "M-t": -3.14e+100, 143 | "E-nr": -3.14e+100, 144 | "M-nrfg": -3.14e+100, 145 | "B-nr": -2.2310495913769506, 146 | "E-df": -3.14e+100, 147 | "E-dg": -3.14e+100, 148 | "S-jn": -3.14e+100, 149 | "M-q": -3.14e+100, 150 | "B-mg": -3.14e+100, 151 | "B-ln": -3.14e+100, 152 | "M-f": -3.14e+100, 153 | "E-ln": -3.14e+100, 154 | "E-yg": -3.14e+100, 155 | "S-bg": -3.14e+100, 156 | "E-ns": -3.14e+100, 157 | "B-tg": -3.14e+100, 158 | "E-qg": -3.14e+100, 159 | "S-nr": -4.483663103956885, 160 | "S-ns": -3.14e+100, 161 | "M-vn": 
-3.14e+100, 162 | "S-nt": -12.147070768850364, 163 | "S-nz": -3.14e+100, 164 | "S-ad": -11.048458480182255, 165 | "B-yg": -3.14e+100, 166 | "M-v": -3.14e+100, 167 | "E-vn": -3.14e+100, 168 | "S-ng": -4.913434861102905, 169 | "M-g": -3.14e+100, 170 | "M-nt": -3.14e+100, 171 | "S-en": -3.14e+100, 172 | "M-nr": -3.14e+100, 173 | "M-ns": -3.14e+100, 174 | "S-vq": -3.14e+100, 175 | "B-uj": -3.14e+100, 176 | "M-nz": -3.14e+100, 177 | "B-qe": -3.14e+100, 178 | "M-in": -3.14e+100, 179 | "M-ng": -3.14e+100, 180 | "S-vn": -11.453923588290419, 181 | "E-zg": -3.14e+100, 182 | "S-vi": -3.14e+100, 183 | "S-vg": -5.9430181843676895, 184 | "S-vd": -3.14e+100, 185 | "B-ad": -6.680066036784177, 186 | "E-rz": -3.14e+100, 187 | "B-ag": -3.14e+100, 188 | "B-vd": -9.044728760238115, 189 | "S-mq": -3.14e+100, 190 | "B-vi": -12.434752841302146, 191 | "E-rr": -3.14e+100, 192 | "B-rr": -12.434752841302146, 193 | "M-vq": -3.14e+100, 194 | "E-jn": -3.14e+100, 195 | "B-vn": -4.3315610890163585, 196 | "S-mg": -10.825314928868044, 197 | "B-in": -3.14e+100, 198 | "M-vi": -3.14e+100, 199 | "M-an": -3.14e+100, 200 | "M-vd": -3.14e+100, 201 | "B-rg": -3.14e+100, 202 | "M-vg": -3.14e+100, 203 | "M-ad": -3.14e+100, 204 | "M-ag": -3.14e+100, 205 | "E-rg": -3.14e+100, 206 | "S-uz": -9.299258625372996, 207 | "B-en": -3.14e+100, 208 | "S-uv": -8.15808672228609, 209 | "S-df": -3.14e+100, 210 | "S-dg": -8.948397651299683, 211 | "M-qe": -3.14e+100, 212 | "B-ng": -3.14e+100, 213 | "E-bg": -3.14e+100, 214 | "S-ul": -8.4153713175535, 215 | "S-uj": -6.85251045118004, 216 | "S-ug": -7.5394037026636855, 217 | "B-ns": -2.8228438314969213, 218 | "S-ud": -7.728230161053767, 219 | "B-nt": -4.846091668182416, 220 | "B-ul": -3.14e+100, 221 | "E-in": -3.14e+100, 222 | "B-bg": -3.14e+100, 223 | "M-df": -3.14e+100, 224 | "M-dg": -3.14e+100, 225 | "M-nrt": -3.14e+100, 226 | "B-j": -5.0576191284681915, 227 | "E-ug": -3.14e+100, 228 | "E-vq": -3.14e+100, 229 | "B-vg": -3.14e+100, 230 | "B-nz": -3.94698846057672, 231 | "S-qe": -3.14e+100, 232 | "B-rz": -7.946116471570005, 233 | "B-nrfg": -5.873722175405573, 234 | "E-ad": -3.14e+100, 235 | "E-ag": -3.14e+100, 236 | "B-u": -9.163917277503234, 237 | "M-ln": -3.14e+100, 238 | "B-an": -8.697083223018778, 239 | "M-mq": -3.14e+100, 240 | "E-an": -3.14e+100, 241 | "S-s": -3.14e+100, 242 | "B-q": -6.998123858956596, 243 | "E-nrt": -3.14e+100, 244 | "B-h": -13.533365129970255, 245 | "E-r": -3.14e+100, 246 | "S-p": -2.9868401813596317, 247 | "M-tg": -3.14e+100, 248 | "S-rz": -3.14e+100, 249 | "S-nrfg": -3.14e+100, 250 | "B-vq": -12.147070768850364, 251 | "B-x": -3.14e+100, 252 | "E-vd": -3.14e+100, 253 | "E-nrfg": -3.14e+100, 254 | "S-m": -3.269200652116097, 255 | "E-vg": -3.14e+100, 256 | "S-f": -5.194820249981676, 257 | "S-j": -4.911992119644354 258 | } -------------------------------------------------------------------------------- /Segmenter/Resources/prob_trans.json: -------------------------------------------------------------------------------- 1 | { 2 | "M": { 3 | "M": -1.2603623820268226, 4 | "E": -0.33344856811948514 5 | }, 6 | "S": { 7 | "S": -0.6658631448798212, 8 | "B": -0.7211965654669841 9 | }, 10 | "B": { 11 | "M": -0.916290731874155, 12 | "E": -0.51082562376599 13 | }, 14 | "E": { 15 | "S": -0.8085250474669937, 16 | "B": -0.5897149736854513 17 | } 18 | } -------------------------------------------------------------------------------- /Segmenter/Segmenter.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | netstandard2.0 5 | 
Lucene.JIEba.Segment 6 | 1.0.0 7 | SilentCC 8 | JIEba.Lucene.Net is an analyzer tools for lucene.net which is kind to chinese 9 | false 10 | https://github.com/SilentCC/JIEba-netcore2.0/ 11 | Copyright 2019 (c) AgileLabs. All rights reserved. 12 | Analyzer Segment JIEba.net core2.0 13 | true 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /Segmenter/Spelling/SpellChecker.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using JiebaNet.Segmenter.Common; 4 | 5 | namespace JiebaNet.Segmenter.Spelling 6 | { 7 | public interface ISpellChecker 8 | { 9 | IEnumerable Suggests(string word); 10 | } 11 | 12 | public class SpellChecker : ISpellChecker 13 | { 14 | internal static readonly WordDictionary WordDict = WordDictionary.Instance; 15 | 16 | internal readonly Trie WordTrie; 17 | internal readonly Dictionary> FirstChars; 18 | 19 | public SpellChecker() 20 | { 21 | var wordDict = WordDictionary.Instance; 22 | WordTrie = new Trie(); 23 | FirstChars = new Dictionary>(); 24 | 25 | foreach (var wd in wordDict.Trie) 26 | { 27 | if (wd.Value > 0) 28 | { 29 | WordTrie.Insert(wd.Key, wd.Value); 30 | 31 | if (wd.Key.Length >= 2) 32 | { 33 | var second = wd.Key[1]; 34 | var first = wd.Key[0]; 35 | if (!FirstChars.ContainsKey(second)) 36 | { 37 | FirstChars[second] = new HashSet(); 38 | } 39 | FirstChars[second].Add(first); 40 | } 41 | } 42 | } 43 | } 44 | 45 | internal ISet GetEdits1(string word) 46 | { 47 | var splits = new List(); 48 | for (var i = 0; i <= word.Length; i++) 49 | { 50 | splits.Add(new WordSplit() { Left = word.Substring(0, i), Right = word.Substring(i) }); 51 | } 52 | 53 | var deletes = splits 54 | .Where(s => !string.IsNullOrEmpty(s.Right)) 55 | .Select(s => s.Left + s.Right.Substring(1)); 56 | 57 | var transposes = splits 58 | .Where(s => s.Right.Length > 1) 59 | .Select(s => s.Left + s.Right[1] + s.Right[0] + s.Right.Substring(2)); 60 | 61 | var replaces = new HashSet(); 62 | if (word.Length > 1) 63 | { 64 | var firsts = FirstChars[word[1]]; 65 | foreach (var first in firsts) 66 | { 67 | if (first != word[0]) 68 | { 69 | replaces.Add(first + word.Substring(1)); 70 | } 71 | } 72 | 73 | var node = WordTrie.Root.Children[word[0]]; 74 | for (int i = 1; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++) 75 | { 76 | foreach (var c in node.Children.Keys) 77 | { 78 | replaces.Add(word.Substring(0, i) + c + word.Substring(i + 1)); 79 | } 80 | node = node.Children.GetValueOrDefault(word[i]); 81 | } 82 | } 83 | 84 | var inserts = new HashSet(); 85 | if (word.Length > 1) 86 | { 87 | if (FirstChars.ContainsKey(word[0])) 88 | { 89 | var firsts = FirstChars[word[0]]; 90 | foreach (var first in firsts) 91 | { 92 | inserts.Add(first + word); 93 | } 94 | } 95 | 96 | var node = WordTrie.Root.Children.GetValueOrDefault(word[0]); 97 | for (int i = 0; node.IsNotNull() && node.Children.IsNotEmpty() && i < word.Length; i++) 98 | { 99 | foreach (var c in node.Children.Keys) 100 | { 101 | inserts.Add(word.Substring(0, i+1) + c + word.Substring(i+1)); 102 | } 103 | 104 | if (i < word.Length - 1) 105 | { 106 | node = node.Children.GetValueOrDefault(word[i + 1]); 107 | } 108 | } 109 | } 110 | 111 | var result = new HashSet(); 112 | result.UnionWith(deletes); 113 | 
result.UnionWith(transposes); 114 | result.UnionWith(replaces); 115 | result.UnionWith(inserts); 116 | 117 | return result; 118 | } 119 | 120 | internal ISet GetKnownEdits2(string word) 121 | { 122 | var result = new HashSet(); 123 | foreach (var e1 in GetEdits1(word)) 124 | { 125 | result.UnionWith(GetEdits1(e1).Where(e => WordDictionary.Instance.ContainsWord(e))); 126 | } 127 | return result; 128 | } 129 | 130 | internal ISet GetKnownWords(IEnumerable words) 131 | { 132 | return new HashSet(words.Where(w => WordDictionary.Instance.ContainsWord(w))); 133 | } 134 | 135 | public IEnumerable Suggests(string word) 136 | { 137 | if (WordDict.ContainsWord(word)) 138 | { 139 | return new[] {word}; 140 | } 141 | 142 | var candicates = GetKnownWords(GetEdits1(word)); 143 | if (candicates.IsNotEmpty()) 144 | { 145 | return candicates.OrderByDescending(c => WordDict.GetFreqOrDefault(c)); 146 | } 147 | 148 | candicates.UnionWith(GetKnownEdits2(word)); 149 | return candicates.OrderByDescending(c => WordDict.GetFreqOrDefault(c)); 150 | } 151 | } 152 | 153 | internal class WordSplit 154 | { 155 | public string Left { get; set; } 156 | public string Right { get; set; } 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /Segmenter/Token.cs: -------------------------------------------------------------------------------- 1 | namespace JiebaNet.Segmenter 2 | { 3 | public class Token 4 | { 5 | public string Word { get; set; } 6 | public int StartIndex { get; set; } 7 | public int EndIndex { get; set; } 8 | 9 | public Token(string word, int startIndex, int endIndex) 10 | { 11 | Word = word; 12 | StartIndex = startIndex; 13 | EndIndex = endIndex; 14 | } 15 | 16 | public override string ToString() 17 | { 18 | return string.Format("[{0}, ({1}, {2})]", Word, StartIndex, EndIndex); 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /Segmenter/WordDictionary.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.IO; 5 | using System.Linq; 6 | using System.Text; 7 | using JiebaNet.Segmenter.Common; 8 | using Microsoft.Extensions.FileProviders; 9 | using System.Reflection; 10 | 11 | namespace JiebaNet.Segmenter 12 | { 13 | public class WordDictionary 14 | { 15 | private static readonly Lazy lazy = new Lazy(() => new WordDictionary()); 16 | private static readonly string MainDict = ConfigManager.MainDictFile; 17 | 18 | internal IDictionary Trie = new Dictionary(); 19 | 20 | /// 21 | /// total occurrence of all words. 
22 | /// 23 | public double Total { get; set; } 24 | 25 | private WordDictionary() 26 | { 27 | LoadDict(); 28 | 29 | Debug.WriteLine("{0} words (and their prefixes)", Trie.Count); 30 | Debug.WriteLine("total freq: {0}", Total); 31 | } 32 | 33 | public static WordDictionary Instance 34 | { 35 | get { return lazy.Value; } 36 | } 37 | 38 | private void LoadDict() 39 | { 40 | try 41 | { 42 | var stopWatch = new Stopwatch(); 43 | stopWatch.Start(); 44 | var filePath = ConfigManager.MainDictFile; 45 | var provider = new EmbeddedFileProvider(GetType().GetTypeInfo().Assembly); 46 | var fileInfo = provider.GetFileInfo(filePath); 47 | using (var sr = new StreamReader(fileInfo.CreateReadStream(), Encoding.UTF8)) 48 | { 49 | string line = null; 50 | while ((line = sr.ReadLine()) != null) 51 | { 52 | var tokens = line.Split(' '); 53 | if (tokens.Length < 2) 54 | { 55 | Debug.Fail(string.Format("Invalid line: {0}", line)); 56 | continue; 57 | } 58 | 59 | var word = tokens[0]; 60 | var freq = int.Parse(tokens[1]); 61 | 62 | Trie[word] = freq; 63 | Total += freq; 64 | 65 | foreach (var ch in Enumerable.Range(0, word.Length)) 66 | { 67 | var wfrag = word.Sub(0, ch + 1); 68 | if (!Trie.ContainsKey(wfrag)) 69 | { 70 | Trie[wfrag] = 0; 71 | } 72 | } 73 | } 74 | } 75 | 76 | stopWatch.Stop(); 77 | Debug.WriteLine("main dict load finished, time elapsed {0} ms", stopWatch.ElapsedMilliseconds); 78 | } 79 | catch (IOException e) 80 | { 81 | Debug.Fail(string.Format("{0} load failure, reason: {1}", MainDict, e.Message)); 82 | } 83 | catch (FormatException fe) 84 | { 85 | Debug.Fail(fe.Message); 86 | } 87 | } 88 | 89 | public bool ContainsWord(string word) 90 | { 91 | return Trie.ContainsKey(word) && Trie[word] > 0; 92 | } 93 | 94 | public int GetFreqOrDefault(string key) 95 | { 96 | if (ContainsWord(key)) 97 | return Trie[key]; 98 | else 99 | return 1; 100 | } 101 | 102 | public void AddWord(string word, int freq, string tag = null) 103 | { 104 | if (ContainsWord(word)) 105 | { 106 | Total -= Trie[word]; 107 | } 108 | 109 | Trie[word] = freq; 110 | Total += freq; 111 | for (var i = 0; i < word.Length; i++) 112 | { 113 | var wfrag = word.Substring(0, i + 1); 114 | if (!Trie.ContainsKey(wfrag)) 115 | { 116 | Trie[wfrag] = 0; 117 | } 118 | } 119 | } 120 | 121 | public void DeleteWord(string word) 122 | { 123 | AddWord(word, 0); 124 | } 125 | 126 | internal int SuggestFreq(string word, IEnumerable segments) 127 | { 128 | double freq = 1; 129 | foreach (var seg in segments) 130 | { 131 | freq *= GetFreqOrDefault(seg) / Total; 132 | } 133 | 134 | return Math.Max((int)(freq * Total) + 1, GetFreqOrDefault(word)); 135 | } 136 | } 137 | } -------------------------------------------------------------------------------- /Segmenter/WordInfo.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | namespace JiebaNet.Segmenter 3 | { 4 | public class WordInfo 5 | { 6 | public WordInfo(string value,int position) 7 | { 8 | this.value = value; 9 | this.position = position; 10 | } 11 | //分词的内容 12 | public string value { get; set; } 13 | //分词的初始位置 14 | public int position { get; set; } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Test/SegmentTest.cs: -------------------------------------------------------------------------------- 1 | using JiebaNet.Segmenter; 2 | using System; 3 | using System.Text; 4 | using System.Collections.Generic; 5 | using System.IO; 6 | using System.Linq; 7 | using jieba.NET; 8 | using Xunit; 9 | 10 | 
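// The tests below pin the cut modes down against known-good segmentations: full mode, the default precise mode (with and without HMM), and search mode; TestNewCut additionally checks that Cut2 returns contiguous positions, and TestJIEbaTokenizer that the embedded stop-word list is loaded.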
namespace Test 11 | { 12 | public class SegmenterTest 13 | { 14 | [Fact] 15 | public void TestCut() 16 | { 17 | var segmenter = new JiebaSegmenter(); 18 | var segments = segmenter.Cut("我来到北京清华大学", cutAll: true); 19 | 20 | var resultWords = new List<string> {"我", "来到", "北京", "清华", "清华大学", "华大", "大学"}; 21 | Compared(segments, resultWords); 22 | 23 | segments = segmenter.Cut("我来到北京清华大学"); 24 | resultWords = new List<string> { "我","来到", "北京", "清华大学"}; 25 | Compared(segments, resultWords); 26 | 27 | segments = segmenter.Cut("他来到了网易杭研大厦"); // defaults to precise mode, with the HMM model enabled 28 | resultWords = new List<string> {"他", "来到", "了", "网易", "杭研", "大厦"}; 29 | Compared(segments, resultWords); 30 | 31 | segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // search-engine mode 32 | resultWords = new List<string> {"小明","硕士" ,"毕业","于","中国" ,"科学","学院", "科学院" ,"中国科学院","计算", "计算所","," , "后" 33 | ,"在" ,"日本","京都" ,"大学", "日本京都大学" ,"深造"}; 34 | Compared(segments, resultWords); 35 | 36 | segments = segmenter.Cut("结过婚的和尚未结过婚的"); 37 | resultWords = new List<string> {"结过婚","的" ,"和" ,"尚未" ,"结过婚","的"}; 38 | 39 | Compared(segments, resultWords); 40 | 41 | segments = segmenter.Cut("快奔三", false, false); 42 | resultWords = new List<string> {"快","奔三"}; 43 | 44 | Compared(segments, resultWords); 45 | } 46 | 47 | private void Compared(IEnumerable<string> segments, List<string> resultWords) 48 | { 49 | Assert.Equal(segments.Count(),resultWords.Count()); 50 | for (int i = 0; i < segments.Count(); i++) 51 | { 52 | Assert.Equal(segments.ElementAt(i),resultWords[i]); 53 | } 54 | } 55 | 56 | [Fact] 57 | public void TestNewCut() 58 | { 59 | var segmenter = new JiebaSegmenter(); 60 | 61 | var wordInfos = segmenter.Cut2("推荐系统终于发布了最终的版本,点击率蹭蹭上涨"); 62 | 63 | Assert.Equal(wordInfos.ElementAt(0).position, 0); 64 | for (int i = 1; i < wordInfos.Count(); i++) 65 | { 66 | Assert.Equal(wordInfos.ElementAt(i).position, 67 | wordInfos.ElementAt(i - 1).position + wordInfos.ElementAt(i - 1).value.Length); 68 | } 69 | } 70 | 71 | [Fact] 72 | public void TestJIEbaTokenizer() 73 | { 74 | var tokenizer = new JieBaTokenizer(TextReader.Null, TokenizerMode.Default); 75 | 76 | Assert.NotEmpty(tokenizer.StopWords); 77 | 78 | Assert.True(tokenizer.StopWords.ContainsKey("是")); 79 | Assert.True(tokenizer.StopWords.ContainsKey("什么")); 80 | 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /Test/Test.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | netcoreapp2.2 5 | false 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | all 17 | runtime; build; native; contentfiles; analyzers; buildtransitive 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /jieba.NET.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.0.32126.317 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "jieba.NET", "jieba.NET\jieba.NET.csproj", "{89EFA758-206C-4681-ACF6-6F2AB2415279}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Analyser", "Analyser\Analyser.csproj", "{4F0DEF27-C5FE-448F-9B08-F8C2254A1075}" 9 | EndProject 10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Segmenter", "Segmenter\Segmenter.csproj", "{C564CDCB-B52B-455E-86E9-FC0DAE37EF08}" 11 | EndProject 12 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ConsoleApp1",
"ConsoleApp1\ConsoleApp1.csproj", "{C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}" 13 | EndProject 14 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "EasyLuceneNET", "EasyLuceneNET\EasyLuceneNET.csproj", "{5458D618-C3FA-4B19-B1AF-7950F789AA14}" 15 | EndProject 16 | Global 17 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 18 | Debug|Any CPU = Debug|Any CPU 19 | Release|Any CPU = Release|Any CPU 20 | EndGlobalSection 21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 22 | {89EFA758-206C-4681-ACF6-6F2AB2415279}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 23 | {89EFA758-206C-4681-ACF6-6F2AB2415279}.Debug|Any CPU.Build.0 = Debug|Any CPU 24 | {89EFA758-206C-4681-ACF6-6F2AB2415279}.Release|Any CPU.ActiveCfg = Release|Any CPU 25 | {89EFA758-206C-4681-ACF6-6F2AB2415279}.Release|Any CPU.Build.0 = Release|Any CPU 26 | {4F0DEF27-C5FE-448F-9B08-F8C2254A1075}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 27 | {4F0DEF27-C5FE-448F-9B08-F8C2254A1075}.Debug|Any CPU.Build.0 = Debug|Any CPU 28 | {4F0DEF27-C5FE-448F-9B08-F8C2254A1075}.Release|Any CPU.ActiveCfg = Release|Any CPU 29 | {4F0DEF27-C5FE-448F-9B08-F8C2254A1075}.Release|Any CPU.Build.0 = Release|Any CPU 30 | {C564CDCB-B52B-455E-86E9-FC0DAE37EF08}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 31 | {C564CDCB-B52B-455E-86E9-FC0DAE37EF08}.Debug|Any CPU.Build.0 = Debug|Any CPU 32 | {C564CDCB-B52B-455E-86E9-FC0DAE37EF08}.Release|Any CPU.ActiveCfg = Release|Any CPU 33 | {C564CDCB-B52B-455E-86E9-FC0DAE37EF08}.Release|Any CPU.Build.0 = Release|Any CPU 34 | {C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 35 | {C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}.Debug|Any CPU.Build.0 = Debug|Any CPU 36 | {C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}.Release|Any CPU.ActiveCfg = Release|Any CPU 37 | {C90214B5-CE37-46C4-9CC7-C9C6A2FBD452}.Release|Any CPU.Build.0 = Release|Any CPU 38 | {5458D618-C3FA-4B19-B1AF-7950F789AA14}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 39 | {5458D618-C3FA-4B19-B1AF-7950F789AA14}.Debug|Any CPU.Build.0 = Debug|Any CPU 40 | {5458D618-C3FA-4B19-B1AF-7950F789AA14}.Release|Any CPU.ActiveCfg = Release|Any CPU 41 | {5458D618-C3FA-4B19-B1AF-7950F789AA14}.Release|Any CPU.Build.0 = Release|Any CPU 42 | EndGlobalSection 43 | GlobalSection(SolutionProperties) = preSolution 44 | HideSolutionNode = FALSE 45 | EndGlobalSection 46 | GlobalSection(ExtensibilityGlobals) = postSolution 47 | SolutionGuid = {4A38C532-715A-4F73-8690-CF9424A2EABE} 48 | EndGlobalSection 49 | EndGlobal 50 | -------------------------------------------------------------------------------- /jieba.NET/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolqingcheng/EasyLuceneNET/60d445d1e91e1864b31c7c4013fe105e70544f8f/jieba.NET/.DS_Store -------------------------------------------------------------------------------- /jieba.NET/JieBaAnalyzer.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Lucene.Net.Analysis; 3 | using Lucene.Net.Analysis.Core; 4 | using Lucene.Net.Analysis.TokenAttributes; 5 | using Lucene.Net.Analysis.Util; 6 | using System.IO; 7 | using JiebaNet.Segmenter; 8 | 9 | 10 | namespace jieba.NET 11 | { 12 | public class JieBaAnalyzer 13 | :Analyzer 14 | { 15 | public TokenizerMode mode; 16 | public JieBaAnalyzer(TokenizerMode Mode) 17 | :base() 18 | { 19 | this.mode = Mode; 20 | } 21 | 22 | protected override TokenStreamComponents CreateComponents(string filedName,TextReader reader) 23 | { 24 | var tokenizer = 
new JieBaTokenizer(reader,mode); 25 | 26 | var tokenstream = (TokenStream)new LowerCaseFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, tokenizer); 27 | 28 | tokenstream.AddAttribute(); 29 | tokenstream.AddAttribute(); 30 | 31 | return new TokenStreamComponents(tokenizer, tokenstream); 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /jieba.NET/JieBaTokenizer.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Lucene.Net.Analysis.TokenAttributes; 3 | using Lucene.Net.Analysis; 4 | using JiebaNet.Segmenter; 5 | using System.IO; 6 | using System.Collections.Generic; 7 | using System.Reflection; 8 | using Microsoft.Extensions.FileProviders; 9 | 10 | namespace jieba.NET 11 | { 12 | public class JieBaTokenizer 13 | : Tokenizer 14 | { 15 | private static bool _initial = false; 16 | private string _inputText; 17 | private bool _originalResult = false; 18 | private int _start = 0; 19 | 20 | private readonly string _stropWordsPath = "Resources/stopwords.txt"; 21 | 22 | private readonly JiebaSegmenter _segmenter; 23 | private TokenizerMode _mode; 24 | private ICharTermAttribute _termAtt; 25 | private IOffsetAttribute _offsetAtt; 26 | private IPositionIncrementAttribute _posIncrAtt; 27 | private ITypeAttribute _typeAtt; 28 | 29 | private Dictionary _stopWords = new Dictionary(); 30 | private List _wordList = new List(); 31 | 32 | private IEnumerator _iter; 33 | 34 | public JieBaTokenizer(TextReader input, TokenizerMode Mode) 35 | : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input) 36 | { 37 | _segmenter = new JiebaSegmenter(); 38 | _mode = Mode; 39 | LoadStopWords(); 40 | Init(); 41 | } 42 | 43 | public Dictionary StopWords 44 | { 45 | get => _stopWords; 46 | } 47 | 48 | private void LoadStopWords() 49 | { 50 | var fileProvider = new EmbeddedFileProvider(GetType().GetTypeInfo().Assembly); 51 | var fileInfo = fileProvider.GetFileInfo(_stropWordsPath); 52 | 53 | using (var reader = new StreamReader(fileInfo.CreateReadStream())) 54 | { 55 | var s = ""; 56 | while ((s = reader.ReadLine()) != null) 57 | { 58 | if (String.IsNullOrEmpty(s)) 59 | continue; 60 | if (_stopWords.ContainsKey(s)) 61 | continue; 62 | _stopWords.Add(s, 1); 63 | } 64 | } 65 | } 66 | 67 | private void Init() 68 | { 69 | _termAtt = AddAttribute(); 70 | _offsetAtt = AddAttribute(); 71 | _posIncrAtt = AddAttribute(); 72 | _typeAtt = AddAttribute(); 73 | } 74 | 75 | private string ReadToEnd(TextReader input) 76 | { 77 | return input.ReadToEnd(); 78 | } 79 | 80 | public sealed override Boolean IncrementToken() 81 | { 82 | ClearAttributes(); 83 | 84 | var word = Next(); 85 | if (word != null) 86 | { 87 | var buffer = word.ToString(); 88 | _termAtt.SetEmpty().Append(buffer); 89 | _offsetAtt.SetOffset(CorrectOffset(word.StartOffset), CorrectOffset(word.EndOffset)); 90 | _typeAtt.Type = word.Type; 91 | return true; 92 | } 93 | 94 | End(); 95 | Dispose(); 96 | return false; 97 | } 98 | 99 | private Lucene.Net.Analysis.Token Next() 100 | { 101 | var length = 0; 102 | var res = _iter.MoveNext(); 103 | if (res) 104 | { 105 | var word = _iter.Current; 106 | var token = new Lucene.Net.Analysis.Token(word.Word, word.StartIndex, word.EndIndex); 107 | _start += length; 108 | return token; 109 | } 110 | return null; 111 | } 112 | 113 | public override void Reset() 114 | { 115 | base.Reset(); 116 | 117 | _inputText = ReadToEnd(base.m_input); 118 | RemoveStopWords(_segmenter.Tokenize(_inputText, _mode)); 119 | 120 | _start = 0; 121 
            _iter = _wordList.GetEnumerator();
        }

        // Keeps only the tokens whose text is absent from the stop-word table.
        private void RemoveStopWords(IEnumerable<JiebaNet.Segmenter.Token> words)
        {
            _wordList.Clear();

            foreach (var x in words)
            {
                if (!_stopWords.ContainsKey(x.Word))
                {
                    _wordList.Add(x);
                }
            }
        }
    }
}
--------------------------------------------------------------------------------
/jieba.NET/Resources/stopwords.txt:
--------------------------------------------------------------------------------
1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | a 55 | an 56 | the 57 | and 58 | but 59 | if 60 | or 61 | because 62 | as 63 | until 64 | while 65 | of 66 | at 67 | by 68 | for 69 | with 70 | about 71 | against 72 | between 73 | into 74 | through 75 | during 76 | before 77 | after 78 | above 79 | below 80 | to 81 | from 82 | up 83 | down 84 | in 85 | out 86 | on 87 | off 88 | over 89 | under 90 | again 91 | further 92 | then 93 | once 94 | here 95 | there 96 | when 97 | where 98 | why 99 | how 100 | all 101 | any 102 | both 103 | each 104 | few 105 | more 106 | most 107 | other 108 | some 109 | such 110 | no 111 | nor 112 | not 113 | only 114 | own 115 | same 116 | so 117 | than 118 | too 119 | very 120 | s 121 | t 122 | can 123 | will 124 | just 125 | don 126 | should 127 | now 128 | 一番 129 | 一直 130 | 一个 131 | 一些 132 | 许多 133 | 种 134 | 有的是 135 | 也就是说 136 | 阿 137 | 哎呀 138 | 哎哟 139 | 俺 140 | 俺们 141 | 按 142 | 按照 143 | 吧 144 | 吧哒 145 | 把 146 | 罢了 147 | 被 148 | 本 149 | 本着 150 | 比 151 | 比方 152 | 比如 153 | 鄙人 154 | 彼 155 | 彼此 156 | 边 157 | 别 158 | 别的 159 | 别说 160 | 并 161 | 并且 162 | 不比 163 | 不成 164 | 不单 165 | 不但 166 | 不独 167 | 不管 168 | 不光 169 | 不过 170 | 不仅 171 | 不拘 172 | 不论 173 | 不怕 174 | 不然 175 | 不如 176 | 不特 177 | 不惟 178 | 不问 179 | 不只 180 | 朝 181 | 朝着 182 | 趁 183 | 趁着 184 | 乘 185 | 冲 186 | 除 187 | 除此之外 188 | 除非 189 | 除了 190 | 此 191 | 此间 192 | 此外 193 | 从 194 | 从而 195 | 打 196 | 待 197 | 但 198 | 但是 199 | 当 200 | 当着 201 | 到 202 | 得 203 | 的 204 | 的话 205 | 等 206 | 等等 207 | 地 208 | 第 209 | 叮咚 210 | 对 211 | 对于 212 | 多 213 | 多少 214 | 而 215 | 而况 216 | 而且 217 | 而是 218 | 而外 219 | 而言 220 | 而已 221 | 尔后 222 | 反过来 223 | 反过来说 224 | 反之 225 | 非但 226 | 非徒 227 | 否则 228 | 嘎 229 | 嘎登 230 | 该 231 | 赶 232 | 个 233 | 各 234 | 各个 235 | 各位 236 | 各种 237 | 各自 238 | 给 239 | 根据 240 | 跟 241 | 故 242 | 故此 243 | 固然 244 | 关于 245 | 管 246 | 归 247 | 果然 248 | 果真 249 | 过 250 | 和 251 | 何 252 | 何处 253 | 何况 254 | 何时 255 | 嘿 256 | 哼 257 | 哼唷 258 | 呼哧 259 | 乎 260 | 哗 261 | 还是 262 | 还有 263 | 换句话说 264 | 换言之 265 | 或 266 | 或是 267 | 或者 268 | 极了 269 | 及 270 | 及其 271 | 及至 272 | 即 273 | 即便 274 | 即或 275 | 即令 276 | 即若 277 | 即使 278 | 几 279 | 几时 280 | 己 281 | 既 282 | 既然 283 | 既是 284 | 继而 285 | 加之 286 | 假如 287 | 假若 288 | 假使 289 | 鉴于 290 | 将 291 | 较 292 | 较之 293 | 叫 294 | 接着 295 | 结果 296 | 借 297 | 紧接着 298 | 进而 299 | 尽 300 | 尽管 301 | 经 302 | 经过 303 | 就 304 | 就是 305 | 就是说 306 | 据 307 | 具体地说 308 | 具体说来 309 | 开始 310 | 开外 311 | 靠 312 | 咳 313 | 可 314 | 可见 315 | 可是 316 | 可以 317 | 况且 318 | 啦 319 | 来 320 | 来着 321 | 离 322 | 例如 323 | 哩 324 | 连 325 | 连同 326 | 两者 327 | 了 328 | 临 329 | 另 330 | 另外 331 | 另一方面
332 | 论 333 | 嘛 334 | 吗 335 | 慢说 336 | 漫说 337 | 冒 338 | 么 339 | 每 340 | 每当 341 | 们 342 | 莫若 343 | 某 344 | 某个 345 | 某些 346 | 拿 347 | 哪 348 | 哪边 349 | 哪儿 350 | 哪个 351 | 哪里 352 | 哪年 353 | 哪怕 354 | 哪天 355 | 哪些 356 | 哪样 357 | 那 358 | 那边 359 | 那儿 360 | 那个 361 | 那会儿 362 | 那里 363 | 那么 364 | 那么些 365 | 那么样 366 | 那时 367 | 那些 368 | 那样 369 | 乃 370 | 乃至 371 | 呢 372 | 能 373 | 你 374 | 你们 375 | 您 376 | 宁 377 | 宁可 378 | 宁肯 379 | 宁愿 380 | 哦 381 | 啪达 382 | 旁人 383 | 凭 384 | 凭借 385 | 其 386 | 其次 387 | 其二 388 | 其他 389 | 其它 390 | 其一 391 | 其余 392 | 其中 393 | 起 394 | 起见 395 | 起见 396 | 岂但 397 | 恰恰相反 398 | 前后 399 | 前者 400 | 且 401 | 然而 402 | 然后 403 | 然则 404 | 让 405 | 人家 406 | 任 407 | 任何 408 | 任凭 409 | 如 410 | 如此 411 | 如果 412 | 如何 413 | 如其 414 | 如若 415 | 如上所述 416 | 若 417 | 若非 418 | 若是 419 | 啥 420 | 上下 421 | 尚且 422 | 设若 423 | 设使 424 | 甚而 425 | 甚么 426 | 甚至 427 | 省得 428 | 时候 429 | 什么 430 | 什么样 431 | 使得 432 | 是 433 | 是的 434 | 首先 435 | 谁 436 | 顺 437 | 顺着 438 | 似的 439 | 虽 440 | 虽然 441 | 虽说 442 | 虽则 443 | 随 444 | 随着 445 | 所 446 | 所以 447 | 他 448 | 他们 449 | 他人 450 | 它 451 | 它们 452 | 她 453 | 她们 454 | 倘 455 | 倘或 456 | 倘然 457 | 倘若 458 | 倘使 459 | 腾 460 | 替 461 | 通过 462 | 同 463 | 同时 464 | 哇 465 | 万一 466 | 往 467 | 望 468 | 为 469 | 为何 470 | 为了 471 | 为什么 472 | 为着 473 | 喂 474 | 嗡嗡 475 | 我 476 | 我们 477 | 呜 478 | 呜呼 479 | 乌乎 480 | 无论 481 | 无宁 482 | 毋宁 483 | 嘻 484 | 吓 485 | 相对而言 486 | 像 487 | 向 488 | 向着 489 | 嘘 490 | 焉 491 | 沿 492 | 沿着 493 | 要 494 | 要不 495 | 要不然 496 | 要不是 497 | 要么 498 | 要是 499 | 也 500 | 也罢 501 | 也好 502 | 一 503 | 一旦 504 | 一方面 505 | 一来 506 | 一切 507 | 一样 508 | 一则 509 | 依 510 | 依照 511 | 矣 512 | 以 513 | 以便 514 | 以及 515 | 以免 516 | 以至 517 | 以至于 518 | 以致 519 | 抑或 520 | 因 521 | 因此 522 | 因而 523 | 因为 524 | 用 525 | 由 526 | 由此可见 527 | 由于 528 | 有 529 | 有的 530 | 有关 531 | 有些 532 | 又 533 | 于 534 | 于是 535 | 于是乎 536 | 与 537 | 与此同时 538 | 与否 539 | 与其 540 | 越是 541 | 云云 542 | 哉 543 | 再说 544 | 再者 545 | 在 546 | 在下 547 | 咱 548 | 咱们 549 | 则 550 | 怎 551 | 怎么办 552 | 怎么样 553 | 咋 554 | 照 555 | 照着 556 | 者 557 | 这 558 | 这边 559 | 这儿 560 | 这个 561 | 这会儿 562 | 这就是说 563 | 这里 564 | 这么 565 | 这么点儿 566 | 这么些 567 | 这么样 568 | 这时 569 | 这些 570 | 这样 571 | 正如 572 | 吱 573 | 之 574 | 之类 575 | 之所以 576 | 之一 577 | 只是 578 | 只限 579 | 只要 580 | 只有 581 | 至 582 | 至于 583 | 诸位 584 | 着 585 | 着呢 586 | 自 587 | 自从 588 | 自个儿 589 | 自各儿 590 | 自己 591 | 自家 592 | 自身 593 | 综上所述 594 | 总的来看 595 | 总的来说 596 | 总的说来 597 | 总而言之 598 | 总之 599 | 纵 600 | 纵令 601 | 纵然 602 | 纵使 603 | 遵照 604 | 作为 605 | 兮 606 | 呗 607 | 咚 608 | 咦 609 | 喏 610 | 啐 611 | 喔唷 612 | 嗬 613 | 嗯 614 | 嗳 615 | 。 616 | , 617 | : 618 | ; 619 | 、 620 | “ 621 | ” 622 | 【 623 | 】 624 | 《 625 | 》 626 | ( 627 | ) 628 | — 629 | … 630 | . 631 | , 632 | : 633 | ; 634 | " 635 | " 636 | [ 637 | ] 638 | < 639 | > 640 | ( 641 | ) 642 | @ 643 | # 644 | * 645 | & 646 | % 647 | ¥ 648 | $ 649 | - 650 | + 651 | = 652 | | 653 | \ 654 |
--------------------------------------------------------------------------------
/jieba.NET/jieba.NET.csproj:
--------------------------------------------------------------------------------
<Project Sdk="Microsoft.NET.Sdk">

  <!-- NOTE: the XML element names in this file were stripped during extraction.
       They are restored below using the standard SDK-style property names that
       best fit the surviving values; the element names are inferred, the values
       are from the dump. The ItemGroup entries (package/project references and
       the embedded Resources/stopwords.txt) did not survive extraction and are
       not reconstructed here. -->

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <PackageId>Lucene.JIEba.net</PackageId>
    <Version>1.1.1</Version>
    <Authors>SilentCC</Authors>
    <Description>JIEba.Lucene.Net is an analyzer for Lucene.Net that is friendly to Chinese text.</Description>
    <PackageRequireLicenseAcceptance>false</PackageRequireLicenseAcceptance>
    <PackageProjectUrl>https://github.com/SilentCC/JIEba-netcore2.0/</PackageProjectUrl>
    <Copyright>Copyright 2019 (c) AgileLabs. All rights reserved.</Copyright>
    <PackageTags>Analyzer Segment JIEba.net core2.0</PackageTags>
    <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
  </PropertyGroup>

</Project>
--------------------------------------------------------------------------------
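Usage sketch (not part of the repository): the analyzer above can be handed to a Lucene.Net 4.8 IndexWriter like any other Analyzer. This assumes JieBaAnalyzer's constructor takes the TokenizerMode that CreateComponents forwards to JieBaTokenizer, as the fragment above suggests; the index path, field name, sample sentence, and class name are illustrative only.

using JiebaNet.Segmenter;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;
using jieba.NET;

public static class JieBaIndexingSketch
{
    public static void Main()
    {
        // The analyzer wires JieBaTokenizer (plus LowerCaseFilter) into Lucene's analysis chain.
        var analyzer = new JieBaAnalyzer(TokenizerMode.Search);

        using (var dir = FSDirectory.Open("demo-index")) // illustrative index path
        using (var writer = new IndexWriter(dir,
                   new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)))
        {
            var doc = new Document();
            doc.Add(new TextField("body", "小明硕士毕业于中国科学院计算所", Field.Store.YES));
            writer.AddDocument(doc); // the field text is segmented and stop-word-filtered here
            writer.Commit();
        }
    }
}

TokenizerMode.Search emits additional finer-grained tokens for long words (the same trade-off as jieba's search-oriented cut mode), which tends to improve recall at query time at the cost of a somewhat larger index.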