├── readme.md ├── WordSegmentation ├── WordInfo.cs ├── Properties │ └── AssemblyInfo.cs ├── WordSegmentation.csproj ├── stopword.txt ├── ArticleUtils.cs ├── Dict.cs └── WordTool.cs ├── Example ├── Properties │ └── AssemblyInfo.cs ├── Example.csproj └── Program.cs ├── WordSegmentation.sln └── .gitignore /readme.md: -------------------------------------------------------------------------------- 1 | ## 中文分词组件 2 | 3 | ### 主要功能 4 | 1. 中文分词 5 | 2. 计算文章相关性 6 | 7 | 8 | ### 使用算法 9 | 10 | 1. 使用 Trie 树结构进行词语查找 11 | 2. 生成句子中所有成词情况的有向无环图 12 | 3. 使用动态规划查找最大概率路径 13 | 14 | ### 不足 15 | 16 | 1. 未处理未登陆词 17 | -------------------------------------------------------------------------------- /WordSegmentation/WordInfo.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace WordSegmentation 7 | { 8 | public class WordInfo 9 | { 10 | /// 11 | /// 文档频率 12 | /// 13 | public float Freq { get; set; } 14 | 15 | /// 16 | /// 逆文档频率 17 | /// 18 | public float IDF { get; set; } 19 | 20 | public int RowNumber { get; set; } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Example/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // 有关程序集的常规信息通过以下 6 | // 特性集控制。更改这些特性值可修改 7 | // 与程序集关联的信息。 8 | [assembly: AssemblyTitle("Example")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Microsoft")] 12 | [assembly: AssemblyProduct("Example")] 13 | [assembly: AssemblyCopyright("Copyright © Microsoft 2013")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // 将 ComVisible 设置为 false 使此程序集中的类型 18 | // 对 COM 组件不可见。如果需要从 COM 访问此程序集中的类型, 19 | // 则将该类型上的 ComVisible 特性设置为 true。 20 | [assembly: ComVisible(false)] 21 | 22 | // 如果此项目向 COM 公开,则下列 GUID 用于类型库的 ID 23 | [assembly: Guid("445998de-268b-4c33-8094-ff80a7d7393f")] 24 | 25 | // 程序集的版本信息由下面四个值组成: 26 | // 27 | // 主版本 28 | // 次版本 29 | // 生成号 30 | // 修订号 31 | // 32 | // 可以指定所有这些值,也可以使用“生成号”和“修订号”的默认值, 33 | // 方法是按如下所示使用“*”: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /WordSegmentation/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // 有关程序集的常规信息通过以下 6 | // 特性集控制。更改这些特性值可修改 7 | // 与程序集关联的信息。 8 | [assembly: AssemblyTitle("WordSegmentation")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("Microsoft")] 12 | [assembly: AssemblyProduct("WordSegmentation")] 13 | [assembly: AssemblyCopyright("Copyright © Microsoft 2013")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // 将 ComVisible 设置为 false 使此程序集中的类型 18 | // 对 COM 组件不可见。如果需要从 COM 访问此程序集中的类型, 19 | // 则将该类型上的 ComVisible 特性设置为 true。 20 | [assembly: ComVisible(false)] 21 | 22 | // 如果此项目向 COM 公开,则下列 GUID 用于类型库的 ID 23 | [assembly: Guid("33d57035-9050-4051-ba4e-0bfd374f9a04")] 24 | 25 | // 程序集的版本信息由下面四个值组成: 26 | // 27 | // 主版本 28 | // 次版本 29 | // 生成号 30 | // 修订号 31 | // 32 | // 可以指定所有这些值,也可以使用“生成号”和“修订号”的默认值, 33 | // 方法是按如下所示使用“*”: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.0.0")] 36 | [assembly: AssemblyFileVersion("1.0.0.0")] 37 | -------------------------------------------------------------------------------- /WordSegmentation.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2012 4 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WordSegmentation", "WordSegmentation\WordSegmentation.csproj", "{12A2A010-81A9-4466-BDAB-D55EF338A770}" 5 | EndProject 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Example", "Example\Example.csproj", "{1CF5A71B-9A9A-4446-A758-435595BF90A6}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {12A2A010-81A9-4466-BDAB-D55EF338A770}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {12A2A010-81A9-4466-BDAB-D55EF338A770}.Debug|Any CPU.Build.0 = Debug|Any CPU 16 | {12A2A010-81A9-4466-BDAB-D55EF338A770}.Release|Any CPU.ActiveCfg = Release|Any CPU 17 | {12A2A010-81A9-4466-BDAB-D55EF338A770}.Release|Any CPU.Build.0 = Release|Any CPU 18 | {1CF5A71B-9A9A-4446-A758-435595BF90A6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 19 | {1CF5A71B-9A9A-4446-A758-435595BF90A6}.Debug|Any CPU.Build.0 = Debug|Any CPU 20 | {1CF5A71B-9A9A-4446-A758-435595BF90A6}.Release|Any CPU.ActiveCfg = Release|Any CPU 21 | {1CF5A71B-9A9A-4446-A758-435595BF90A6}.Release|Any CPU.Build.0 = Release|Any CPU 22 | EndGlobalSection 23 | GlobalSection(SolutionProperties) = preSolution 24 | HideSolutionNode = FALSE 25 | EndGlobalSection 26 | EndGlobal 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.sln.docstates 8 | 9 | # Build results 10 | 11 | [Dd]ebug/ 12 | [Rr]elease/ 13 | x64/ 14 | build/ 15 | [Bb]in/ 16 | [Oo]bj/ 17 | 18 | # MSTest test Results 19 | [Tt]est[Rr]esult*/ 20 | [Bb]uild[Ll]og.* 21 | 22 | *_i.c 23 | *_p.c 24 | *.ilk 25 | *.meta 26 | *.obj 27 | *.pch 28 | *.pdb 29 | *.pgc 30 | *.pgd 31 | *.rsp 32 | *.sbr 33 | *.tlb 34 | *.tli 35 | *.tlh 36 | *.tmp 37 | *.tmp_proj 38 | *.log 39 | *.vspscc 40 | *.vssscc 41 | .builds 42 | *.pidb 43 | *.log 44 | *.scc 45 | 46 | # Visual C++ cache files 47 | ipch/ 48 | *.aps 49 | *.ncb 50 | *.opensdf 51 | *.sdf 52 | *.cachefile 53 | 54 | # Visual Studio profiler 55 | *.psess 56 | *.vsp 57 | *.vspx 58 | 59 | # Guidance Automation Toolkit 60 | *.gpState 61 | 62 | # ReSharper is a .NET coding add-in 63 | _ReSharper*/ 64 | *.[Rr]e[Ss]harper 65 | 66 | # TeamCity is a build add-in 67 | _TeamCity* 68 | 69 | # DotCover is a Code Coverage Tool 70 | *.dotCover 71 | 72 | # NCrunch 73 | *.ncrunch* 74 | .*crunch*.local.xml 75 | 76 | # Installshield output folder 77 | [Ee]xpress/ 78 | 79 | # DocProject is a documentation generator add-in 80 | DocProject/buildhelp/ 81 | DocProject/Help/*.HxT 82 | DocProject/Help/*.HxC 83 | DocProject/Help/*.hhc 84 | DocProject/Help/*.hhk 85 | DocProject/Help/*.hhp 86 | DocProject/Help/Html2 87 | DocProject/Help/html 88 | 89 | # Click-Once directory 90 | publish/ 91 | 92 | # Publish Web Output 93 | *.Publish.xml 94 | *.pubxml 95 | 96 | # NuGet Packages Directory 97 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 98 | #packages/ 99 | 100 | # Windows Azure Build Output 101 | csx 102 | *.build.csdef 103 | 104 | # Windows Store app package directory 105 | AppPackages/ 106 | 107 | # Others 108 | sql/ 109 | ClientBin/ 110 | [Ss]tyle[Cc]op.* 111 | ~$* 112 | *~ 113 | *.dbmdl 114 | *.[Pp]ublish.xml 115 | *.pfx 116 | *.publishsettings 117 | 118 | # RIA/Silverlight projects 119 | Generated_Code/ 120 | 121 | # Backup & report files from converting an old project file to a newer 122 | # Visual Studio version. Backup files are not needed, because we have git ;-) 123 | _UpgradeReport_Files/ 124 | Backup*/ 125 | UpgradeLog*.XML 126 | UpgradeLog*.htm 127 | 128 | # SQL Server files 129 | App_Data/*.mdf 130 | App_Data/*.ldf 131 | 132 | # ========================= 133 | # Windows detritus 134 | # ========================= 135 | 136 | # Windows image file caches 137 | Thumbs.db 138 | ehthumbs.db 139 | 140 | # Folder config file 141 | Desktop.ini 142 | 143 | # Recycle Bin used on file shares 144 | $RECYCLE.BIN/ 145 | 146 | # Mac crap 147 | .DS_Store 148 | 149 | -------------------------------------------------------------------------------- /Example/Example.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {1CF5A71B-9A9A-4446-A758-435595BF90A6} 8 | Exe 9 | Properties 10 | Example 11 | Example 12 | v4.0 13 | 512 14 | 15 | 16 | AnyCPU 17 | true 18 | full 19 | false 20 | bin\Debug\ 21 | DEBUG;TRACE 22 | prompt 23 | 4 24 | 25 | 26 | AnyCPU 27 | pdbonly 28 | true 29 | bin\Release\ 30 | TRACE 31 | prompt 32 | 4 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | {12a2a010-81a9-4466-bdab-d55ef338a770} 50 | WordSegmentation 51 | 52 | 53 | 54 | 61 | -------------------------------------------------------------------------------- /WordSegmentation/WordSegmentation.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {12A2A010-81A9-4466-BDAB-D55EF338A770} 8 | Library 9 | Properties 10 | WordSegmentation 11 | WordSegmentation 12 | v4.0 13 | 512 14 | 15 | 16 | true 17 | full 18 | false 19 | bin\Debug\ 20 | DEBUG;TRACE 21 | prompt 22 | 4 23 | 24 | 25 | pdbonly 26 | true 27 | bin\Release\ 28 | TRACE 29 | prompt 30 | 4 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | PreserveNewest 50 | 51 | 52 | PreserveNewest 53 | 54 | 55 | 56 | 63 | -------------------------------------------------------------------------------- /WordSegmentation/stopword.txt: -------------------------------------------------------------------------------- 1 | about 2 | a 3 | after 4 | all 5 | also 6 | an 7 | and 8 | another 9 | any 10 | are 11 | as 12 | at 13 | be 14 | because 15 | been 16 | before 17 | being 18 | between 19 | both 20 | but 21 | by 22 | came 23 | can 24 | come 25 | could 26 | did 27 | do 28 | each 29 | for 30 | from 31 | get 32 | got 33 | had 34 | has 35 | have 36 | he 37 | her 38 | here 39 | him 40 | himself 41 | his 42 | how 43 | i 44 | if 45 | in 46 | into 47 | is 48 | it 49 | like 50 | make 51 | many 52 | me 53 | might 54 | more 55 | most 56 | much 57 | must 58 | my 59 | never 60 | now 61 | of 62 | on 63 | only 64 | or 65 | other 66 | our 67 | out 68 | over 69 | said 70 | same 71 | see 72 | should 73 | since 74 | some 75 | still 76 | such 77 | take 78 | than 79 | that 80 | the 81 | their 82 | them 83 | then 84 | there 85 | these 86 | they 87 | this 88 | those 89 | through 90 | to 91 | too 92 | under 93 | up 94 | very 95 | was 96 | way 97 | we 98 | well 99 | were 100 | what 101 | where 102 | which 103 | while 104 | who 105 | with 106 | would 107 | you 108 | your 109 | 的 110 | 了 111 | 在 112 | 是 113 | 我 114 | 有 115 | 和 116 | 就 117 | 不 118 | 入 119 | 都 120 | 一 121 | 一个 122 | 上 123 | 也 124 | 很 125 | 到 126 | 说 127 | 要 128 | 去 129 | 你 130 | 会 131 | 着 132 | 没有 133 | 看 134 | 好 135 | 自己 136 | 这 137 | 来 138 | 想 139 | 中 140 | 多 141 | 还 142 | 对 143 | 让 144 | 又 145 | 而 146 | 里 147 | 我的 148 | 可以 149 | 就是 150 | 能 151 | 把 152 | 他 153 | 个 154 | 给 155 | 这个 156 | 我们 157 | 过 158 | 得 159 | 但 160 | 被 161 | 时候 162 | 还是 163 | 那 164 | 做 165 | 什么 166 | 为 167 | 地 168 | 因为 169 | 大 170 | 才不 171 | 吧 172 | 最 173 | 没 174 | 从 175 | 小 176 | 与 177 | 不是 178 | 年 179 | 用 180 | 已经 181 | 再 182 | 下 183 | 月 184 | 时 185 | 如果 186 | 却 187 | 开始 188 | 后 189 | 呢 190 | 啊 191 | 但是 192 | 所以 193 | 天 194 | 之 195 | 都是 196 | 才 197 | 她 198 | 等 199 | 很多 200 | 这样 201 | 觉得 202 | 只 203 | 出 204 | 也不 205 | 像 206 | 点 207 | 日 208 | 那么 209 | 更 210 | 看到 211 | 只是 212 | 还有 213 | 将 214 | 当 215 | 以 216 | 前 217 | 他们 218 | 可 219 | 一直 220 | 不过 221 | 事 222 | 不能 223 | 其实 224 | 你的 225 | 也是 226 | 所 227 | 一些 228 | 一样 229 | 它 230 | 怎么 231 | 两 232 | 虽然 233 | 一下 234 | 写 235 | 走 236 | 比 237 | 吃 238 | 可能 239 | 于 240 | 可是 241 | 应该 242 | 真 243 | 只有 244 | 无 245 | 跟 246 | 最后 247 | 吗 248 | 然后 249 | 不会 250 | 听 251 | 这么 252 | 这是 253 | 那个 254 | 这些 255 | 们 256 | 他的 257 | 所有 258 | 起 259 | 一种 260 | 家 261 | 三 262 | 起来 263 | 一次 264 | 到了 265 | 不要 266 | 叫 267 | 谁 268 | 为了 269 | 而且 270 | 这种 271 | 中的 272 | 长 273 | 一起 274 | 一天 275 | 打 276 | 或 277 | 两个 278 | 第一 279 | 为什么 280 | 成 281 | 找 282 | 一个人 283 | 地方 284 | 我也 285 | 一切 286 | 终于 287 | 总是 288 | 向 289 | 如 290 | 我是 291 | 以后 292 | 也许 293 | 一点 294 | 该 295 | 于是 296 | 之后 297 | 或者 298 | 那些 299 | 的话 300 | 其 301 | 一定 302 | 有些 303 | 些 304 | 真是 305 | 成为 306 | 行 307 | 当然 308 | 只能 309 | 我在 310 | 甚至 311 | 对于 312 | 如此 313 | 这里 314 | 我们的 315 | 只要 316 | 她的 317 | 后来 318 | 都有 319 | 以为 320 | 好象 321 | 我说 -------------------------------------------------------------------------------- /WordSegmentation/ArticleUtils.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | 7 | namespace WordSegmentation 8 | { 9 | public static class ArticleUtils 10 | { 11 | /// 12 | /// 计算相似度 13 | /// 14 | /// 15 | /// 16 | /// 17 | /// 18 | /// 19 | public static double CaclSimilar(Dictionary vector1, Dictionary vector2, double length1, double length2) 20 | { 21 | if (vector1.Count == 0 && vector2.Count == 0) 22 | return 1; 23 | if (vector1.Count == 0 || vector2.Count == 0) 24 | return 0; 25 | 26 | float numerator = 0; 27 | foreach (KeyValuePair idAndTFIDF in vector1) 28 | { 29 | if (vector2.ContainsKey(idAndTFIDF.Key)) 30 | numerator += idAndTFIDF.Value * vector2[idAndTFIDF.Key]; 31 | } 32 | 33 | double scale = length1 > length2 ? length2 / length1 : length1 / length2; 34 | return numerator * scale / (length1 * length2); 35 | } 36 | 37 | /// 38 | /// 计算向量长度 39 | /// 40 | /// 41 | /// 42 | public static double CaclVectorLength(Dictionary vector1) 43 | { 44 | double result = 0; 45 | foreach (float tfidf in vector1.Values) 46 | { 47 | result += tfidf * tfidf; 48 | } 49 | return Math.Sqrt(result); 50 | } 51 | 52 | /// 53 | /// 获取特征向量 54 | /// 55 | /// 56 | /// 57 | public static Dictionary GetFeatureVector(string sentence) 58 | { 59 | Dictionary> wordId_countIDF = new Dictionary>(); 60 | int totalWordCount = 0; 61 | foreach (string word in WordTool.Cut(sentence)) 62 | { 63 | if (Dict.StopWords.Contains(word)) 64 | continue; 65 | 66 | WordInfo info; 67 | if (Dict.WordExtraInfos.TryGetValue(word, out info)) 68 | { 69 | Tuple countAndIDF; 70 | if (!wordId_countIDF.TryGetValue(info.RowNumber, out countAndIDF)) 71 | countAndIDF = Tuple.Create(0, info.IDF); 72 | wordId_countIDF[info.RowNumber] = Tuple.Create(countAndIDF.Item1 + 1, countAndIDF.Item2); 73 | } 74 | totalWordCount++; 75 | } 76 | 77 | Dictionary idAndTFIDF = new Dictionary(); 78 | foreach (KeyValuePair> pair in wordId_countIDF) 79 | { 80 | //计算TF-IDF值 81 | idAndTFIDF[pair.Key] = ((float)pair.Value.Item1 / totalWordCount) * pair.Value.Item2; 82 | } 83 | 84 | return idAndTFIDF; 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /WordSegmentation/Dict.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using System.Collections.Generic; 4 | using System.IO; 5 | using System.Linq; 6 | using System.Text; 7 | 8 | namespace WordSegmentation 9 | { 10 | 11 | public static class Dict 12 | { 13 | private static Hashtable _trie; 14 | 15 | private static Dictionary _wordExtraInfos; 16 | 17 | private static HashSet _stopwords; 18 | 19 | private static float _minFreq; 20 | 21 | private static int _minCount; 22 | 23 | private static long _totalCount; 24 | 25 | private static readonly object syncTrieRoot = new object(); 26 | 27 | private static readonly object syncStopwordRoot = new object(); 28 | 29 | /// 30 | /// 单词查找树 31 | /// 32 | public static Hashtable Trie 33 | { 34 | get 35 | { 36 | Init(); 37 | return _trie; 38 | } 39 | } 40 | 41 | /// 42 | /// 词语附加信息 43 | /// 44 | public static Dictionary WordExtraInfos 45 | { 46 | get 47 | { 48 | Init(); 49 | return _wordExtraInfos; 50 | } 51 | } 52 | 53 | /// 54 | /// 停止词 55 | /// 56 | public static HashSet StopWords 57 | { 58 | get 59 | { 60 | if(_stopwords==null) 61 | { 62 | lock (syncStopwordRoot) 63 | { 64 | if(_stopwords==null) 65 | { 66 | using (StreamReader reader=new StreamReader("stopword.txt")) 67 | { 68 | _stopwords = new HashSet(); 69 | while (!reader.EndOfStream) 70 | { 71 | string word = reader.ReadLine(); 72 | if(!string.IsNullOrEmpty(word)) 73 | _stopwords.Add(word); 74 | } 75 | } 76 | } 77 | } 78 | } 79 | 80 | return _stopwords; 81 | } 82 | } 83 | 84 | /// 85 | /// 最小文档频率 86 | /// 87 | public static float MinFreq 88 | { 89 | get 90 | { 91 | Init(); 92 | return _minFreq; 93 | } 94 | } 95 | 96 | public static void Init(string dictFile = "dict.txt") 97 | { 98 | if (_trie == null) 99 | { 100 | lock (syncTrieRoot) 101 | { 102 | if (_trie == null) 103 | { 104 | LoadDict(dictFile); 105 | } 106 | } 107 | } 108 | } 109 | 110 | private static void LoadDict(string dictFile) 111 | { 112 | using (StreamReader reader = new StreamReader(dictFile)) 113 | { 114 | _trie = new Hashtable(); 115 | _wordExtraInfos = new Dictionary(); 116 | 117 | string line; 118 | int rn = 0; 119 | while ((line = reader.ReadLine()) != null) 120 | { 121 | Hashtable root = _trie; 122 | 123 | string[] arrOfLine = line.Split(' '); 124 | string word = arrOfLine[0]; 125 | int count = int.Parse(arrOfLine[1]); 126 | 127 | //构造单词查找表 128 | for (int i = 0; i < word.Length; i++) 129 | { 130 | string key = word.Substring(i, 1); 131 | if (!root.ContainsKey(key)) 132 | { 133 | root.Add(key, new Hashtable()); 134 | } 135 | root = (Hashtable)root[key]; 136 | } 137 | root[""] = "";//结束标记 138 | 139 | //计算词最小出现次数 140 | if (_minCount == 0 || count < _minCount) 141 | _minCount = count; 142 | 143 | _totalCount += count; 144 | 145 | //填充单词额外信息 146 | WordInfo info = new WordInfo() { Freq = count, RowNumber = rn }; //freq先设置为次数 后面要重新计算 147 | _wordExtraInfos[word] = info; 148 | rn++; 149 | } 150 | } 151 | 152 | foreach (KeyValuePair wordExtraInfo in _wordExtraInfos) 153 | { 154 | //计算 逆文档频率(一个词出现次数越高 则越不重要) 155 | wordExtraInfo.Value.IDF = (float)Math.Log(_totalCount / (wordExtraInfo.Value.Freq + 1)); 156 | 157 | //计算 文档频率 158 | wordExtraInfo.Value.Freq = (float)Math.Log(wordExtraInfo.Value.Freq / _totalCount); 159 | } 160 | 161 | _minFreq = (float)Math.Log((float)_minCount / _totalCount); 162 | } 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /WordSegmentation/WordTool.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Text.RegularExpressions; 7 | 8 | namespace WordSegmentation 9 | { 10 | public static class WordTool 11 | { 12 | private static Regex re_chinese = new Regex(@"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled); 13 | private static Regex re_alphabet_digit = new Regex(@"(\d+\.\d+|[a-zA-Z0-9]+)", RegexOptions.Compiled); 14 | 15 | /// 16 | /// 分词 17 | /// 18 | /// 19 | /// 20 | public static IEnumerable Cut(string sentence) 21 | { 22 | string[] blocks = re_chinese.Split(sentence); 23 | foreach (string block in blocks) 24 | { 25 | if (re_chinese.IsMatch(block)) 26 | { 27 | foreach (string word in CutBlock(block)) 28 | { 29 | yield return word; 30 | } 31 | } 32 | } 33 | } 34 | 35 | /// 36 | /// 对一个词块分词 37 | /// 38 | /// 39 | /// 40 | private static IEnumerable CutBlock(string block) 41 | { 42 | Dictionary> dag = GetDAG(block); 43 | int[] route = CalcRoute(block, dag); 44 | int length = block.Length; 45 | int i = 0; 46 | string buffer = string.Empty; 47 | while (i < length) 48 | { 49 | int end = route[i]; 50 | string word = block.Substring(i, end - i + 1); 51 | //不存在的单个词放入缓冲区 52 | if (end - i == 0 && !Dict.WordExtraInfos.ContainsKey(word)) 53 | buffer += word; 54 | else 55 | { 56 | if (buffer.Length > 0) 57 | { 58 | foreach (string s in CutBuffer(buffer)) 59 | { 60 | yield return s; 61 | } 62 | 63 | buffer = string.Empty; 64 | } 65 | 66 | yield return word; 67 | } 68 | 69 | i = end + 1; 70 | } 71 | 72 | if (buffer.Length > 0) 73 | { 74 | foreach (string s in CutBuffer(buffer)) 75 | { 76 | yield return s; 77 | } 78 | } 79 | } 80 | 81 | /// 82 | /// 对缓冲区的字符进行分词 83 | /// 84 | /// 85 | /// 86 | private static IEnumerable CutBuffer(string buffer) 87 | { 88 | if (buffer.Length == 1) 89 | yield return buffer; 90 | else 91 | { 92 | //不存在的词 使用字母和数字 拆分后返回 93 | string[] tmp = re_alphabet_digit.Split(buffer); 94 | foreach (string s in tmp) 95 | { 96 | if (!string.IsNullOrEmpty(s)) 97 | yield return s; 98 | } 99 | } 100 | } 101 | 102 | /// 103 | /// 计算最大概率路径(使用动态规划) 104 | /// 105 | /// 106 | /// 107 | /// 108 | private static int[] CalcRoute(string block, Dictionary> dag) 109 | { 110 | int length = block.Length; 111 | int[] route = new int[length]; 112 | float[] freq = new float[length + 1]; 113 | 114 | //汉语重心经常落在后面 采用逆向最大匹配 115 | for (int i = length - 1; i >= 0; i--) 116 | { 117 | var candidates = (from end in dag[i] 118 | select Tuple.Create(GetFreq(block.Substring(i, end - i + 1)) + freq[end + 1], end)).ToList(); 119 | Tuple freqAndend = candidates.OrderByDescending(t => t.Item1).FirstOrDefault(); 120 | freq[i] = freqAndend.Item1; 121 | route[i] = freqAndend.Item2; 122 | } 123 | 124 | return route; 125 | } 126 | 127 | /// 128 | /// 获取词语的文档频率 129 | /// 130 | /// 131 | /// 132 | private static float GetFreq(string word) 133 | { 134 | WordInfo info; 135 | if (Dict.WordExtraInfos.TryGetValue(word, out info)) 136 | return info.Freq; 137 | 138 | return Dict.MinFreq; 139 | } 140 | 141 | /// 142 | /// 获取有向无环图 143 | /// 144 | /// 145 | /// 146 | private static Dictionary> GetDAG(string block) 147 | { 148 | int length = block.Length; 149 | Hashtable trie = Dict.Trie; 150 | int i = 0, end = 0; 151 | 152 | Dictionary> dag = new Dictionary>(); 153 | while (i < length) 154 | { 155 | string key = block.Substring(end, 1); 156 | bool goNextWord; 157 | if (trie.ContainsKey(key)) 158 | { 159 | trie = (Hashtable)trie[key]; 160 | if (trie.ContainsKey("")) 161 | { 162 | if (!dag.ContainsKey(i)) 163 | { 164 | dag.Add(i, new List()); 165 | } 166 | dag[i].Add(end); 167 | } 168 | 169 | end++; 170 | goNextWord = end >= length; 171 | } 172 | else 173 | goNextWord = true; 174 | 175 | if (goNextWord) 176 | { 177 | end = ++i; 178 | trie = Dict.Trie; 179 | } 180 | } 181 | 182 | for (int k = 0; k < length; k++) 183 | { 184 | if (!dag.ContainsKey(k)) 185 | dag[k] = new List() { k }; 186 | } 187 | 188 | return dag; 189 | } 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /Example/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections; 3 | using System.Collections.Generic; 4 | using System.IO; 5 | using System.Linq; 6 | using System.Text; 7 | using System.Text.RegularExpressions; 8 | using WordSegmentation; 9 | 10 | namespace Example 11 | { 12 | class Program 13 | { 14 | static void Main(string[] args) 15 | { 16 | SimilarTest(); 17 | 18 | Console.ReadKey(); 19 | } 20 | 21 | private static void SimilarTest() 22 | { 23 | Console.WriteLine("usage : input filename1 filename2 to compare\n\texample: 1 2\n\tq : exit"); 24 | 25 | while (true) 26 | { 27 | Console.Write("input filename : "); 28 | string input = Console.ReadLine(); 29 | if (input == "q") 30 | break; 31 | 32 | string[] split = input.Split(' '); 33 | if (split.Length != 2) 34 | { 35 | Console.WriteLine("wrong input, please try again"); 36 | continue; 37 | } 38 | 39 | string sentence1 = GetArticle(split[0]); 40 | string sentence2 = GetArticle(split[1]); 41 | 42 | Dictionary vector1 = ArticleUtils.GetFeatureVector(sentence1); 43 | Console.WriteLine("top 3 word rownumber of sentence1 :" + string.Join(" , ", vector1.OrderByDescending(t => t.Value).Select(t => t.Key).Take(3))); 44 | 45 | Dictionary vector2 = ArticleUtils.GetFeatureVector(sentence2); 46 | Console.WriteLine("top 3 word rownumber of sentence2 :" + string.Join(" , ", vector2.OrderByDescending(t => t.Value).Select(t => t.Key).Take(3))); 47 | 48 | double length1 = ArticleUtils.CaclVectorLength(vector1); 49 | double length2 = ArticleUtils.CaclVectorLength(vector2); 50 | 51 | Console.WriteLine(length1 + " : " + length2); 52 | 53 | double similar = ArticleUtils.CaclSimilar(vector1, vector2, length1, length2); 54 | 55 | Console.WriteLine(similar); 56 | } 57 | } 58 | 59 | private static string GetArticle(string filename) 60 | { 61 | using (StreamReader reader = new StreamReader(filename + ".txt")) 62 | { 63 | return reader.ReadToEnd(); 64 | } 65 | } 66 | 67 | private static void cuttest() 68 | { 69 | cuttest("我不喜欢日本和服。"); 70 | cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。"); 71 | cuttest("我不喜欢日本和服。"); 72 | cuttest("雷猴回归人间。"); 73 | cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"); 74 | cuttest("我需要廉租房"); 75 | cuttest("永和服装饰品有限公司"); 76 | cuttest("我爱北京天安门"); 77 | cuttest("abc"); 78 | cuttest("隐马尔可夫"); 79 | cuttest("雷猴是个好网站"); 80 | cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成"); 81 | cuttest("草泥马和欺实马是今年的流行词汇"); 82 | cuttest("伊藤洋华堂总府店"); 83 | cuttest("中国科学院计算技术研究所"); 84 | cuttest("罗密欧与朱丽叶"); 85 | cuttest("我购买了道具和服装"); 86 | cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍"); 87 | cuttest("湖北省石首市"); 88 | cuttest("湖北省十堰市"); 89 | cuttest("总经理完成了这件事情"); 90 | cuttest("电脑修好了"); 91 | cuttest("做好了这件事情就一了百了了"); 92 | cuttest("人们审美的观点是不同的"); 93 | cuttest("我们买了一个美的空调"); 94 | cuttest("线程初始化时我们要注意"); 95 | cuttest("一个分子是由好多原子组织成的"); 96 | cuttest("祝你马到功成"); 97 | cuttest("他掉进了无底洞里"); 98 | cuttest("中国的首都是北京"); 99 | cuttest("孙君意"); 100 | cuttest("外交部发言人马朝旭"); 101 | cuttest("领导人会议和第四届东亚峰会"); 102 | cuttest("在过去的这五年"); 103 | cuttest("还需要很长的路要走"); 104 | cuttest("60周年首都阅兵"); 105 | cuttest("你好人们审美的观点是不同的"); 106 | cuttest("买水果然后来世博园"); 107 | cuttest("买水果然后去世博园"); 108 | cuttest("但是后来我才知道你是对的"); 109 | cuttest("存在即合理"); 110 | cuttest("的的的的的在的的的的就以和和和"); 111 | cuttest("I love你,不以为耻,反以为rong"); 112 | cuttest("因"); 113 | cuttest(""); 114 | cuttest("hello你好人们审美的观点是不同的"); 115 | cuttest("很好但主要是基于网页形式"); 116 | cuttest("hello你好人们审美的观点是不同的"); 117 | cuttest("为什么我不能拥有想要的生活"); 118 | cuttest("后来我才"); 119 | cuttest("此次来中国是为了"); 120 | cuttest("使用了它就可以解决一些问题"); 121 | cuttest(",使用了它就可以解决一些问题"); 122 | cuttest("其实使用了它就可以解决一些问题"); 123 | cuttest("好人使用了它就可以解决一些问题"); 124 | cuttest("是因为和国家"); 125 | cuttest("老年搜索还支持"); 126 | cuttest( 127 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 "); 128 | cuttest("大"); 129 | cuttest(""); 130 | cuttest("他说的确实在理"); 131 | cuttest("长春市长春节讲话"); 132 | cuttest("结婚的和尚未结婚的"); 133 | cuttest("结合成分子时"); 134 | cuttest("旅游和服务是最好的"); 135 | cuttest("这件事情的确是我的错"); 136 | cuttest("供大家参考指正"); 137 | cuttest("哈尔滨政府公布塌桥原因"); 138 | cuttest("我在机场入口处"); 139 | cuttest("邢永臣摄影报道"); 140 | cuttest("BP神经网络如何训练才能在分类时增加区分度?"); 141 | cuttest("南京市长江大桥"); 142 | cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究"); 143 | cuttest("长春市长春药店"); 144 | cuttest("邓颖超生前最喜欢的衣服"); 145 | cuttest("胡锦涛是热爱世界和平的政治局常委"); 146 | cuttest("程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪"); 147 | cuttest("一次性交多少钱"); 148 | cuttest("两块五一套,三块八一斤,四块七一本,五块六一条"); 149 | cuttest("小和尚留了一个像大和尚一样的和尚头"); 150 | cuttest("我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站"); 151 | cuttest("张晓梅去人民医院做了个B超然后去买了件T恤"); 152 | cuttest("AT&T是一件不错的公司,给你发offer了吗?"); 153 | cuttest("C++和c#是什么关系?11+122=133,是吗?PI=3.14159"); 154 | cuttest("你认识那个和主席握手的的哥吗?他开一辆黑色的士。"); 155 | cuttest("枪杆子中出政权"); 156 | cuttest("张三风同学走上了不归路"); 157 | cuttest("阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。"); 158 | cuttest("在1号店能买到小S和大S八卦的书,还有3D电视。"); 159 | 160 | //Regex re_chinese = new Regex(@"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled); 161 | //string[] blocks= re_chinese.Split("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成"); 162 | //Console.WriteLine(string.Join(" / ", blocks)); 163 | 164 | //Hashtable trie=Dict.Trie; 165 | //foreach (string key in trie.Keys) 166 | //{ 167 | // Console.WriteLine(key); 168 | //} 169 | //Console.WriteLine(trie.Keys.Count); 170 | } 171 | 172 | private static void cuttest(string sentence) 173 | { 174 | string[] result = WordTool.Cut(sentence).ToArray(); 175 | Console.WriteLine(string.Join(" / ", result)); 176 | } 177 | } 178 | } 179 | --------------------------------------------------------------------------------