├── readme.md
├── WordSegmentation
├── WordInfo.cs
├── Properties
│ └── AssemblyInfo.cs
├── WordSegmentation.csproj
├── stopword.txt
├── ArticleUtils.cs
├── Dict.cs
└── WordTool.cs
├── Example
├── Properties
│ └── AssemblyInfo.cs
├── Example.csproj
└── Program.cs
├── WordSegmentation.sln
└── .gitignore
/readme.md:
--------------------------------------------------------------------------------
1 | ## 中文分词组件
2 |
3 | ### 主要功能
4 | 1. 中文分词
5 | 2. 计算文章相关性
6 |
7 |
8 | ### 使用算法
9 |
10 | 1. 使用 Trie 树结构进行词语查找
11 | 2. 生成句子中所有成词情况的有向无环图
12 | 3. 使用动态规划查找最大概率路径
13 |
14 | ### 不足
15 |
1. 未处理未登录词(OOV)
17 |
--------------------------------------------------------------------------------
/WordSegmentation/WordInfo.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 |
6 | namespace WordSegmentation
7 | {
public class WordInfo
{
    /// <summary>
    /// Document frequency of the word, stored as a log-probability
    /// (set to the raw count while the dictionary loads, then converted).
    /// </summary>
    public float Freq { get; set; }

    /// <summary>
    /// Inverse document frequency — the more often a word occurs
    /// across the dictionary, the lower this value.
    /// </summary>
    public float IDF { get; set; }

    /// <summary>
    /// Row number of the word in the dictionary file; used as the word's id
    /// when building feature vectors.
    /// </summary>
    public int RowNumber { get; set; }
}
22 | }
23 |
--------------------------------------------------------------------------------
/Example/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// General information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("Example")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("Microsoft")]
[assembly: AssemblyProduct("Example")]
[assembly: AssemblyCopyright("Copyright © Microsoft 2013")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from COM,
// set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("445998de-268b-4c33-8094-ff80a7d7393f")]

// Version information for an assembly consists of the following four values:
//
//      Major Version
//      Minor Version
//      Build Number
//      Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/WordSegmentation/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// General information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("WordSegmentation")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("Microsoft")]
[assembly: AssemblyProduct("WordSegmentation")]
[assembly: AssemblyCopyright("Copyright © Microsoft 2013")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from COM,
// set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("33d57035-9050-4051-ba4e-0bfd374f9a04")]

// Version information for an assembly consists of the following four values:
//
//      Major Version
//      Minor Version
//      Build Number
//      Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/WordSegmentation.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 2012
4 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WordSegmentation", "WordSegmentation\WordSegmentation.csproj", "{12A2A010-81A9-4466-BDAB-D55EF338A770}"
5 | EndProject
6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Example", "Example\Example.csproj", "{1CF5A71B-9A9A-4446-A758-435595BF90A6}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {12A2A010-81A9-4466-BDAB-D55EF338A770}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {12A2A010-81A9-4466-BDAB-D55EF338A770}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | {12A2A010-81A9-4466-BDAB-D55EF338A770}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | {12A2A010-81A9-4466-BDAB-D55EF338A770}.Release|Any CPU.Build.0 = Release|Any CPU
18 | {1CF5A71B-9A9A-4446-A758-435595BF90A6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
19 | {1CF5A71B-9A9A-4446-A758-435595BF90A6}.Debug|Any CPU.Build.0 = Debug|Any CPU
20 | {1CF5A71B-9A9A-4446-A758-435595BF90A6}.Release|Any CPU.ActiveCfg = Release|Any CPU
21 | {1CF5A71B-9A9A-4446-A758-435595BF90A6}.Release|Any CPU.Build.0 = Release|Any CPU
22 | EndGlobalSection
23 | GlobalSection(SolutionProperties) = preSolution
24 | HideSolutionNode = FALSE
25 | EndGlobalSection
26 | EndGlobal
27 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.sln.docstates
8 |
9 | # Build results
10 |
11 | [Dd]ebug/
12 | [Rr]elease/
13 | x64/
14 | build/
15 | [Bb]in/
16 | [Oo]bj/
17 |
18 | # MSTest test Results
19 | [Tt]est[Rr]esult*/
20 | [Bb]uild[Ll]og.*
21 |
22 | *_i.c
23 | *_p.c
24 | *.ilk
25 | *.meta
26 | *.obj
27 | *.pch
28 | *.pdb
29 | *.pgc
30 | *.pgd
31 | *.rsp
32 | *.sbr
33 | *.tlb
34 | *.tli
35 | *.tlh
36 | *.tmp
37 | *.tmp_proj
38 | *.log
39 | *.vspscc
40 | *.vssscc
41 | .builds
42 | *.pidb
43 | *.log
44 | *.scc
45 |
46 | # Visual C++ cache files
47 | ipch/
48 | *.aps
49 | *.ncb
50 | *.opensdf
51 | *.sdf
52 | *.cachefile
53 |
54 | # Visual Studio profiler
55 | *.psess
56 | *.vsp
57 | *.vspx
58 |
59 | # Guidance Automation Toolkit
60 | *.gpState
61 |
62 | # ReSharper is a .NET coding add-in
63 | _ReSharper*/
64 | *.[Rr]e[Ss]harper
65 |
66 | # TeamCity is a build add-in
67 | _TeamCity*
68 |
69 | # DotCover is a Code Coverage Tool
70 | *.dotCover
71 |
72 | # NCrunch
73 | *.ncrunch*
74 | .*crunch*.local.xml
75 |
76 | # Installshield output folder
77 | [Ee]xpress/
78 |
79 | # DocProject is a documentation generator add-in
80 | DocProject/buildhelp/
81 | DocProject/Help/*.HxT
82 | DocProject/Help/*.HxC
83 | DocProject/Help/*.hhc
84 | DocProject/Help/*.hhk
85 | DocProject/Help/*.hhp
86 | DocProject/Help/Html2
87 | DocProject/Help/html
88 |
89 | # Click-Once directory
90 | publish/
91 |
92 | # Publish Web Output
93 | *.Publish.xml
94 | *.pubxml
95 |
96 | # NuGet Packages Directory
97 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line
98 | #packages/
99 |
100 | # Windows Azure Build Output
101 | csx
102 | *.build.csdef
103 |
104 | # Windows Store app package directory
105 | AppPackages/
106 |
107 | # Others
108 | sql/
109 | ClientBin/
110 | [Ss]tyle[Cc]op.*
111 | ~$*
112 | *~
113 | *.dbmdl
114 | *.[Pp]ublish.xml
115 | *.pfx
116 | *.publishsettings
117 |
118 | # RIA/Silverlight projects
119 | Generated_Code/
120 |
121 | # Backup & report files from converting an old project file to a newer
122 | # Visual Studio version. Backup files are not needed, because we have git ;-)
123 | _UpgradeReport_Files/
124 | Backup*/
125 | UpgradeLog*.XML
126 | UpgradeLog*.htm
127 |
128 | # SQL Server files
129 | App_Data/*.mdf
130 | App_Data/*.ldf
131 |
132 | # =========================
133 | # Windows detritus
134 | # =========================
135 |
136 | # Windows image file caches
137 | Thumbs.db
138 | ehthumbs.db
139 |
140 | # Folder config file
141 | Desktop.ini
142 |
143 | # Recycle Bin used on file shares
144 | $RECYCLE.BIN/
145 |
146 | # Mac crap
147 | .DS_Store
148 |
149 |
--------------------------------------------------------------------------------
/Example/Example.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | {1CF5A71B-9A9A-4446-A758-435595BF90A6}
8 | Exe
9 | Properties
10 | Example
11 | Example
12 | v4.0
13 | 512
14 |
15 |
16 | AnyCPU
17 | true
18 | full
19 | false
20 | bin\Debug\
21 | DEBUG;TRACE
22 | prompt
23 | 4
24 |
25 |
26 | AnyCPU
27 | pdbonly
28 | true
29 | bin\Release\
30 | TRACE
31 | prompt
32 | 4
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 | {12a2a010-81a9-4466-bdab-d55ef338a770}
50 | WordSegmentation
51 |
52 |
53 |
54 |
61 |
--------------------------------------------------------------------------------
/WordSegmentation/WordSegmentation.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | {12A2A010-81A9-4466-BDAB-D55EF338A770}
8 | Library
9 | Properties
10 | WordSegmentation
11 | WordSegmentation
12 | v4.0
13 | 512
14 |
15 |
16 | true
17 | full
18 | false
19 | bin\Debug\
20 | DEBUG;TRACE
21 | prompt
22 | 4
23 |
24 |
25 | pdbonly
26 | true
27 | bin\Release\
28 | TRACE
29 | prompt
30 | 4
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 | PreserveNewest
50 |
51 |
52 | PreserveNewest
53 |
54 |
55 |
56 |
63 |
--------------------------------------------------------------------------------
/WordSegmentation/stopword.txt:
--------------------------------------------------------------------------------
1 | about
2 | a
3 | after
4 | all
5 | also
6 | an
7 | and
8 | another
9 | any
10 | are
11 | as
12 | at
13 | be
14 | because
15 | been
16 | before
17 | being
18 | between
19 | both
20 | but
21 | by
22 | came
23 | can
24 | come
25 | could
26 | did
27 | do
28 | each
29 | for
30 | from
31 | get
32 | got
33 | had
34 | has
35 | have
36 | he
37 | her
38 | here
39 | him
40 | himself
41 | his
42 | how
43 | i
44 | if
45 | in
46 | into
47 | is
48 | it
49 | like
50 | make
51 | many
52 | me
53 | might
54 | more
55 | most
56 | much
57 | must
58 | my
59 | never
60 | now
61 | of
62 | on
63 | only
64 | or
65 | other
66 | our
67 | out
68 | over
69 | said
70 | same
71 | see
72 | should
73 | since
74 | some
75 | still
76 | such
77 | take
78 | than
79 | that
80 | the
81 | their
82 | them
83 | then
84 | there
85 | these
86 | they
87 | this
88 | those
89 | through
90 | to
91 | too
92 | under
93 | up
94 | very
95 | was
96 | way
97 | we
98 | well
99 | were
100 | what
101 | where
102 | which
103 | while
104 | who
105 | with
106 | would
107 | you
108 | your
109 | 的
110 | 了
111 | 在
112 | 是
113 | 我
114 | 有
115 | 和
116 | 就
117 | 不
118 | 入
119 | 都
120 | 一
121 | 一个
122 | 上
123 | 也
124 | 很
125 | 到
126 | 说
127 | 要
128 | 去
129 | 你
130 | 会
131 | 着
132 | 没有
133 | 看
134 | 好
135 | 自己
136 | 这
137 | 来
138 | 想
139 | 中
140 | 多
141 | 还
142 | 对
143 | 让
144 | 又
145 | 而
146 | 里
147 | 我的
148 | 可以
149 | 就是
150 | 能
151 | 把
152 | 他
153 | 个
154 | 给
155 | 这个
156 | 我们
157 | 过
158 | 得
159 | 但
160 | 被
161 | 时候
162 | 还是
163 | 那
164 | 做
165 | 什么
166 | 为
167 | 地
168 | 因为
169 | 大
170 | 才不
171 | 吧
172 | 最
173 | 没
174 | 从
175 | 小
176 | 与
177 | 不是
178 | 年
179 | 用
180 | 已经
181 | 再
182 | 下
183 | 月
184 | 时
185 | 如果
186 | 却
187 | 开始
188 | 后
189 | 呢
190 | 啊
191 | 但是
192 | 所以
193 | 天
194 | 之
195 | 都是
196 | 才
197 | 她
198 | 等
199 | 很多
200 | 这样
201 | 觉得
202 | 只
203 | 出
204 | 也不
205 | 像
206 | 点
207 | 日
208 | 那么
209 | 更
210 | 看到
211 | 只是
212 | 还有
213 | 将
214 | 当
215 | 以
216 | 前
217 | 他们
218 | 可
219 | 一直
220 | 不过
221 | 事
222 | 不能
223 | 其实
224 | 你的
225 | 也是
226 | 所
227 | 一些
228 | 一样
229 | 它
230 | 怎么
231 | 两
232 | 虽然
233 | 一下
234 | 写
235 | 走
236 | 比
237 | 吃
238 | 可能
239 | 于
240 | 可是
241 | 应该
242 | 真
243 | 只有
244 | 无
245 | 跟
246 | 最后
247 | 吗
248 | 然后
249 | 不会
250 | 听
251 | 这么
252 | 这是
253 | 那个
254 | 这些
255 | 们
256 | 他的
257 | 所有
258 | 起
259 | 一种
260 | 家
261 | 三
262 | 起来
263 | 一次
264 | 到了
265 | 不要
266 | 叫
267 | 谁
268 | 为了
269 | 而且
270 | 这种
271 | 中的
272 | 长
273 | 一起
274 | 一天
275 | 打
276 | 或
277 | 两个
278 | 第一
279 | 为什么
280 | 成
281 | 找
282 | 一个人
283 | 地方
284 | 我也
285 | 一切
286 | 终于
287 | 总是
288 | 向
289 | 如
290 | 我是
291 | 以后
292 | 也许
293 | 一点
294 | 该
295 | 于是
296 | 之后
297 | 或者
298 | 那些
299 | 的话
300 | 其
301 | 一定
302 | 有些
303 | 些
304 | 真是
305 | 成为
306 | 行
307 | 当然
308 | 只能
309 | 我在
310 | 甚至
311 | 对于
312 | 如此
313 | 这里
314 | 我们的
315 | 只要
316 | 她的
317 | 后来
318 | 都有
319 | 以为
320 | 好象
321 | 我说
--------------------------------------------------------------------------------
/WordSegmentation/ArticleUtils.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 |
7 | namespace WordSegmentation
8 | {
public static class ArticleUtils
{
    /// <summary>
    /// Computes a cosine-style similarity of two TF-IDF feature vectors,
    /// additionally scaled by the ratio of the smaller vector length to the
    /// larger one so that texts of very different sizes score lower.
    /// </summary>
    /// <param name="vector1">Feature vector of article 1 (word row number -> TF-IDF weight).</param>
    /// <param name="vector2">Feature vector of article 2 (word row number -> TF-IDF weight).</param>
    /// <param name="length1">Euclidean length of <paramref name="vector1"/> (see <see cref="CaclVectorLength"/>).</param>
    /// <param name="length2">Euclidean length of <paramref name="vector2"/>.</param>
    /// <returns>1 when both vectors are empty; 0 when exactly one is empty; otherwise the scaled similarity.</returns>
    public static double CaclSimilar(Dictionary<int, float> vector1, Dictionary<int, float> vector2, double length1, double length2)
    {
        if (vector1.Count == 0 && vector2.Count == 0)
            return 1;
        if (vector1.Count == 0 || vector2.Count == 0)
            return 0;

        // Dot product over the keys both vectors share.
        float numerator = 0;
        foreach (KeyValuePair<int, float> idAndTFIDF in vector1)
        {
            float tfidf2;
            // Single lookup instead of ContainsKey + indexer.
            if (vector2.TryGetValue(idAndTFIDF.Key, out tfidf2))
                numerator += idAndTFIDF.Value * tfidf2;
        }

        // Penalize comparing a short text against a much longer one.
        double scale = length1 > length2 ? length2 / length1 : length1 / length2;
        return numerator * scale / (length1 * length2);
    }

    /// <summary>
    /// Computes the Euclidean (L2) length of a feature vector.
    /// </summary>
    /// <param name="vector1">Feature vector (word row number -> TF-IDF weight).</param>
    /// <returns>sqrt of the sum of squared weights; 0 for an empty vector.</returns>
    public static double CaclVectorLength(Dictionary<int, float> vector1)
    {
        double result = 0;
        foreach (float tfidf in vector1.Values)
        {
            result += tfidf * tfidf;
        }
        return Math.Sqrt(result);
    }

    /// <summary>
    /// Builds the TF-IDF feature vector of a text: segments it with
    /// <see cref="WordTool.Cut"/>, drops stop words, and weighs each known
    /// dictionary word by term-frequency * IDF.
    /// </summary>
    /// <param name="sentence">Raw text to vectorize.</param>
    /// <returns>Map from the word's dictionary row number to its TF-IDF weight.</returns>
    public static Dictionary<int, float> GetFeatureVector(string sentence)
    {
        // word row number -> (occurrence count, IDF)
        Dictionary<int, Tuple<int, float>> wordId_countIDF = new Dictionary<int, Tuple<int, float>>();
        int totalWordCount = 0;
        foreach (string word in WordTool.Cut(sentence))
        {
            if (Dict.StopWords.Contains(word))
                continue;

            WordInfo info;
            if (Dict.WordExtraInfos.TryGetValue(word, out info))
            {
                Tuple<int, float> countAndIDF;
                if (!wordId_countIDF.TryGetValue(info.RowNumber, out countAndIDF))
                    countAndIDF = Tuple.Create(0, info.IDF);
                wordId_countIDF[info.RowNumber] = Tuple.Create(countAndIDF.Item1 + 1, countAndIDF.Item2);
            }
            // Counts every non-stop word (also unknown ones), so TF is relative to the whole text.
            totalWordCount++;
        }

        Dictionary<int, float> idAndTFIDF = new Dictionary<int, float>();
        foreach (KeyValuePair<int, Tuple<int, float>> pair in wordId_countIDF)
        {
            // TF-IDF = (count / totalWordCount) * IDF. totalWordCount > 0 whenever the map is non-empty.
            idAndTFIDF[pair.Key] = ((float)pair.Value.Item1 / totalWordCount) * pair.Value.Item2;
        }

        return idAndTFIDF;
    }
}
87 | }
88 |
--------------------------------------------------------------------------------
/WordSegmentation/Dict.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Linq;
6 | using System.Text;
7 |
8 | namespace WordSegmentation
9 | {
10 |
public static class Dict
{
    private static Hashtable _trie;

    private static Dictionary<string, WordInfo> _wordExtraInfos;

    private static HashSet<string> _stopwords;

    private static float _minFreq;

    private static int _minCount;

    private static long _totalCount;

    private static readonly object syncTrieRoot = new object();

    private static readonly object syncStopwordRoot = new object();

    /// <summary>
    /// Word lookup trie. Every level is a Hashtable keyed by a single
    /// character; the empty-string key marks the end of a complete word.
    /// </summary>
    public static Hashtable Trie
    {
        get
        {
            Init();
            return _trie;
        }
    }

    /// <summary>
    /// Per-word extra information (frequency, IDF, row number), keyed by the word.
    /// </summary>
    public static Dictionary<string, WordInfo> WordExtraInfos
    {
        get
        {
            Init();
            return _wordExtraInfos;
        }
    }

    /// <summary>
    /// Stop words, loaded lazily from "stopword.txt" (one word per line)
    /// with double-checked locking.
    /// </summary>
    public static HashSet<string> StopWords
    {
        get
        {
            if (_stopwords == null)
            {
                lock (syncStopwordRoot)
                {
                    if (_stopwords == null)
                    {
                        using (StreamReader reader = new StreamReader("stopword.txt"))
                        {
                            HashSet<string> stopwords = new HashSet<string>();
                            while (!reader.EndOfStream)
                            {
                                string word = reader.ReadLine();
                                if (!string.IsNullOrEmpty(word))
                                    stopwords.Add(word);
                            }
                            // Publish only the fully built set so a racing reader
                            // outside the lock never sees a half-filled collection.
                            _stopwords = stopwords;
                        }
                    }
                }
            }

            return _stopwords;
        }
    }

    /// <summary>
    /// Minimum document frequency (log scale); used as the fallback weight
    /// for words not found in the dictionary.
    /// </summary>
    public static float MinFreq
    {
        get
        {
            Init();
            return _minFreq;
        }
    }

    /// <summary>
    /// Loads the dictionary exactly once (double-checked locking); safe to call repeatedly.
    /// </summary>
    /// <param name="dictFile">Dictionary file: one space-separated "word count" entry per line.</param>
    public static void Init(string dictFile = "dict.txt")
    {
        if (_trie == null)
        {
            lock (syncTrieRoot)
            {
                if (_trie == null)
                {
                    LoadDict(dictFile);
                }
            }
        }
    }

    /// <summary>
    /// Parses the dictionary file, building the character trie and the
    /// per-word info table, then converts raw counts into log frequencies.
    /// Called only under <see cref="syncTrieRoot"/> from <see cref="Init"/>.
    /// </summary>
    private static void LoadDict(string dictFile)
    {
        // Build into locals and publish at the end so the static fields never
        // expose partially loaded state (Init's null check is on _trie).
        Hashtable trie = new Hashtable();
        Dictionary<string, WordInfo> wordExtraInfos = new Dictionary<string, WordInfo>();

        using (StreamReader reader = new StreamReader(dictFile))
        {
            string line;
            int rn = 0;
            while ((line = reader.ReadLine()) != null)
            {
                Hashtable node = trie;

                string[] arrOfLine = line.Split(' ');
                string word = arrOfLine[0];
                int count = int.Parse(arrOfLine[1]);

                // Insert the word into the character trie.
                for (int i = 0; i < word.Length; i++)
                {
                    string key = word.Substring(i, 1);
                    if (!node.ContainsKey(key))
                    {
                        node.Add(key, new Hashtable());
                    }
                    node = (Hashtable)node[key];
                }
                node[""] = ""; // end-of-word marker

                // Track the smallest raw count seen.
                if (_minCount == 0 || count < _minCount)
                    _minCount = count;

                _totalCount += count;

                // Freq temporarily holds the raw count; converted to a log frequency below.
                wordExtraInfos[word] = new WordInfo() { Freq = count, RowNumber = rn };
                rn++;
            }
        }

        foreach (KeyValuePair<string, WordInfo> wordExtraInfo in wordExtraInfos)
        {
            // IDF: the more often a word occurs, the less important it is.
            wordExtraInfo.Value.IDF = (float)Math.Log(_totalCount / (wordExtraInfo.Value.Freq + 1));

            // Document frequency as a log-probability.
            wordExtraInfo.Value.Freq = (float)Math.Log(wordExtraInfo.Value.Freq / _totalCount);
        }

        _minFreq = (float)Math.Log((float)_minCount / _totalCount);

        _wordExtraInfos = wordExtraInfos;
        _trie = trie;
    }
}
164 | }
165 |
--------------------------------------------------------------------------------
/WordSegmentation/WordTool.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using System.Text.RegularExpressions;
7 |
8 | namespace WordSegmentation
9 | {
public static class WordTool
{
    // Runs of CJK/alphanumeric characters; everything else is a separator.
    private static readonly Regex re_chinese = new Regex(@"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled);
    // Splits leftover buffers into decimal numbers or alphanumeric runs.
    private static readonly Regex re_alphabet_digit = new Regex(@"(\d+\.\d+|[a-zA-Z0-9]+)", RegexOptions.Compiled);

    /// <summary>
    /// Segments a sentence into words; separator characters are dropped.
    /// </summary>
    /// <param name="sentence">Raw text to segment.</param>
    /// <returns>Lazy sequence of segmented words, in order.</returns>
    public static IEnumerable<string> Cut(string sentence)
    {
        string[] blocks = re_chinese.Split(sentence);
        foreach (string block in blocks)
        {
            if (re_chinese.IsMatch(block))
            {
                foreach (string word in CutBlock(block))
                {
                    yield return word;
                }
            }
        }
    }

    /// <summary>
    /// Segments a single contiguous block using the DAG + max-probability route.
    /// Consecutive single characters unknown to the dictionary are buffered
    /// and re-split by <see cref="CutBuffer"/>.
    /// </summary>
    private static IEnumerable<string> CutBlock(string block)
    {
        Dictionary<int, List<int>> dag = GetDAG(block);
        int[] route = CalcRoute(block, dag);
        int length = block.Length;
        int i = 0;
        string buffer = string.Empty;
        while (i < length)
        {
            int end = route[i];
            string word = block.Substring(i, end - i + 1);
            // Unknown single characters accumulate in the buffer.
            if (end - i == 0 && !Dict.WordExtraInfos.ContainsKey(word))
                buffer += word;
            else
            {
                if (buffer.Length > 0)
                {
                    foreach (string s in CutBuffer(buffer))
                    {
                        yield return s;
                    }

                    buffer = string.Empty;
                }

                yield return word;
            }

            i = end + 1;
        }

        // Flush whatever is still buffered at the end of the block.
        if (buffer.Length > 0)
        {
            foreach (string s in CutBuffer(buffer))
            {
                yield return s;
            }
        }
    }

    /// <summary>
    /// Re-splits buffered unknown characters: a single character is returned
    /// as-is; longer runs are split into numbers / alphanumeric chunks.
    /// </summary>
    private static IEnumerable<string> CutBuffer(string buffer)
    {
        if (buffer.Length == 1)
            yield return buffer;
        else
        {
            string[] tmp = re_alphabet_digit.Split(buffer);
            foreach (string s in tmp)
            {
                if (!string.IsNullOrEmpty(s))
                    yield return s;
            }
        }
    }

    /// <summary>
    /// Computes the maximum-probability route through the DAG via dynamic
    /// programming, scanning right-to-left (Chinese tends to carry meaning at
    /// the end, so a reverse pass approximates reverse maximum matching).
    /// </summary>
    /// <param name="block">The text block being segmented.</param>
    /// <param name="dag">DAG from <see cref="GetDAG"/>; every index has at least one candidate end.</param>
    /// <returns>route[i] = end index (inclusive) of the best word starting at i.</returns>
    private static int[] CalcRoute(string block, Dictionary<int, List<int>> dag)
    {
        int length = block.Length;
        int[] route = new int[length];
        // freq[i] = best cumulative log-frequency of block[i..]; freq[length] = 0 sentinel.
        float[] freq = new float[length + 1];

        for (int i = length - 1; i >= 0; i--)
        {
            var candidates = (from end in dag[i]
                              select Tuple.Create(GetFreq(block.Substring(i, end - i + 1)) + freq[end + 1], end)).ToList();
            // GetDAG guarantees dag[i] is non-empty, so there is always a best candidate.
            Tuple<float, int> freqAndend = candidates.OrderByDescending(t => t.Item1).FirstOrDefault();
            freq[i] = freqAndend.Item1;
            route[i] = freqAndend.Item2;
        }

        return route;
    }

    /// <summary>
    /// Log document frequency of a word; unknown words fall back to <see cref="Dict.MinFreq"/>.
    /// </summary>
    private static float GetFreq(string word)
    {
        WordInfo info;
        if (Dict.WordExtraInfos.TryGetValue(word, out info))
            return info.Freq;

        return Dict.MinFreq;
    }

    /// <summary>
    /// Builds the word DAG for a block: dag[i] lists every end index e such
    /// that block[i..e] is a dictionary word. Indices with no dictionary match
    /// get the single-character fallback { i }.
    /// </summary>
    private static Dictionary<int, List<int>> GetDAG(string block)
    {
        int length = block.Length;
        Hashtable trie = Dict.Trie;
        int i = 0, end = 0;

        Dictionary<int, List<int>> dag = new Dictionary<int, List<int>>();
        while (i < length)
        {
            string key = block.Substring(end, 1);
            bool goNextWord;
            if (trie.ContainsKey(key))
            {
                trie = (Hashtable)trie[key];
                // The empty-string key marks a complete dictionary word.
                if (trie.ContainsKey(""))
                {
                    if (!dag.ContainsKey(i))
                    {
                        dag.Add(i, new List<int>());
                    }
                    dag[i].Add(end);
                }

                end++;
                goNextWord = end >= length;
            }
            else
                goNextWord = true;

            if (goNextWord)
            {
                // Restart matching from the next start position.
                end = ++i;
                trie = Dict.Trie;
            }
        }

        // Every position needs at least the single-character candidate.
        for (int k = 0; k < length; k++)
        {
            if (!dag.ContainsKey(k))
                dag[k] = new List<int>() { k };
        }

        return dag;
    }
}
191 | }
192 |
--------------------------------------------------------------------------------
/Example/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Linq;
6 | using System.Text;
7 | using System.Text.RegularExpressions;
8 | using WordSegmentation;
9 |
10 | namespace Example
11 | {
class Program
{
    static void Main(string[] args)
    {
        SimilarTest();

        Console.ReadKey();
    }

    /// <summary>
    /// Interactive loop: reads two file-name stems, loads "stem.txt" for each,
    /// builds TF-IDF feature vectors and prints their similarity. 'q' exits.
    /// </summary>
    private static void SimilarTest()
    {
        Console.WriteLine("usage : input filename1 filename2 to compare\n\texample: 1 2\n\tq : exit");

        while (true)
        {
            Console.Write("input filename : ");
            string input = Console.ReadLine();
            if (input == "q")
                break;

            string[] split = input.Split(' ');
            if (split.Length != 2)
            {
                Console.WriteLine("wrong input, please try again");
                continue;
            }

            string sentence1 = GetArticle(split[0]);
            string sentence2 = GetArticle(split[1]);

            Dictionary<int, float> vector1 = ArticleUtils.GetFeatureVector(sentence1);
            Console.WriteLine("top 3 word rownumber of sentence1 :" + string.Join(" , ", vector1.OrderByDescending(t => t.Value).Select(t => t.Key).Take(3)));

            Dictionary<int, float> vector2 = ArticleUtils.GetFeatureVector(sentence2);
            Console.WriteLine("top 3 word rownumber of sentence2 :" + string.Join(" , ", vector2.OrderByDescending(t => t.Value).Select(t => t.Key).Take(3)));

            double length1 = ArticleUtils.CaclVectorLength(vector1);
            double length2 = ArticleUtils.CaclVectorLength(vector2);

            Console.WriteLine(length1 + " : " + length2);

            double similar = ArticleUtils.CaclSimilar(vector1, vector2, length1, length2);

            Console.WriteLine(similar);
        }
    }

    /// <summary>
    /// Reads the whole content of "filename.txt" from the working directory.
    /// </summary>
    private static string GetArticle(string filename)
    {
        using (StreamReader reader = new StreamReader(filename + ".txt"))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Manual smoke test for the segmenter (not wired into Main; invoke by hand).
    /// </summary>
    private static void cuttest()
    {
        cuttest("我不喜欢日本和服。");
        cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。");
        cuttest("我不喜欢日本和服。");
        cuttest("雷猴回归人间。");
        cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作");
        cuttest("我需要廉租房");
        cuttest("永和服装饰品有限公司");
        cuttest("我爱北京天安门");
        cuttest("abc");
        cuttest("隐马尔可夫");
        cuttest("雷猴是个好网站");
        cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成");
        cuttest("草泥马和欺实马是今年的流行词汇");
        cuttest("伊藤洋华堂总府店");
        cuttest("中国科学院计算技术研究所");
        cuttest("罗密欧与朱丽叶");
        cuttest("我购买了道具和服装");
        cuttest("PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍");
        cuttest("湖北省石首市");
        cuttest("湖北省十堰市");
        cuttest("总经理完成了这件事情");
        cuttest("电脑修好了");
        cuttest("做好了这件事情就一了百了了");
        cuttest("人们审美的观点是不同的");
        cuttest("我们买了一个美的空调");
        cuttest("线程初始化时我们要注意");
        cuttest("一个分子是由好多原子组织成的");
        cuttest("祝你马到功成");
        cuttest("他掉进了无底洞里");
        cuttest("中国的首都是北京");
        cuttest("孙君意");
        cuttest("外交部发言人马朝旭");
        cuttest("领导人会议和第四届东亚峰会");
        cuttest("在过去的这五年");
        cuttest("还需要很长的路要走");
        cuttest("60周年首都阅兵");
        cuttest("你好人们审美的观点是不同的");
        cuttest("买水果然后来世博园");
        cuttest("买水果然后去世博园");
        cuttest("但是后来我才知道你是对的");
        cuttest("存在即合理");
        cuttest("的的的的的在的的的的就以和和和");
        cuttest("I love你,不以为耻,反以为rong");
        cuttest("因");
        cuttest("");
        cuttest("hello你好人们审美的观点是不同的");
        cuttest("很好但主要是基于网页形式");
        cuttest("hello你好人们审美的观点是不同的");
        cuttest("为什么我不能拥有想要的生活");
        cuttest("后来我才");
        cuttest("此次来中国是为了");
        cuttest("使用了它就可以解决一些问题");
        cuttest(",使用了它就可以解决一些问题");
        cuttest("其实使用了它就可以解决一些问题");
        cuttest("好人使用了它就可以解决一些问题");
        cuttest("是因为和国家");
        cuttest("老年搜索还支持");
        cuttest(
            "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于弱势地位的消费者由此将陷入万劫不复的境地。 ");
        cuttest("大");
        cuttest("");
        cuttest("他说的确实在理");
        cuttest("长春市长春节讲话");
        cuttest("结婚的和尚未结婚的");
        cuttest("结合成分子时");
        cuttest("旅游和服务是最好的");
        cuttest("这件事情的确是我的错");
        cuttest("供大家参考指正");
        cuttest("哈尔滨政府公布塌桥原因");
        cuttest("我在机场入口处");
        cuttest("邢永臣摄影报道");
        cuttest("BP神经网络如何训练才能在分类时增加区分度?");
        cuttest("南京市长江大桥");
        cuttest("应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究");
        cuttest("长春市长春药店");
        cuttest("邓颖超生前最喜欢的衣服");
        cuttest("胡锦涛是热爱世界和平的政治局常委");
        cuttest("程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪");
        cuttest("一次性交多少钱");
        cuttest("两块五一套,三块八一斤,四块七一本,五块六一条");
        cuttest("小和尚留了一个像大和尚一样的和尚头");
        cuttest("我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站");
        cuttest("张晓梅去人民医院做了个B超然后去买了件T恤");
        cuttest("AT&T是一件不错的公司,给你发offer了吗?");
        cuttest("C++和c#是什么关系?11+122=133,是吗?PI=3.14159");
        cuttest("你认识那个和主席握手的的哥吗?他开一辆黑色的士。");
        cuttest("枪杆子中出政权");
        cuttest("张三风同学走上了不归路");
        cuttest("阿Q腰间挂着BB机手里拿着大哥大,说:我一般吃饭不AA制的。");
        cuttest("在1号店能买到小S和大S八卦的书,还有3D电视。");

        //Regex re_chinese = new Regex(@"([\u4E00-\u9FA5a-zA-Z0-9+#&\._]+)", RegexOptions.Compiled);
        //string[] blocks= re_chinese.Split("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成");
        //Console.WriteLine(string.Join(" / ", blocks));

        //Hashtable trie=Dict.Trie;
        //foreach (string key in trie.Keys)
        //{
        //    Console.WriteLine(key);
        //}
        //Console.WriteLine(trie.Keys.Count);
    }

    /// <summary>
    /// Segments one sentence and prints the words joined with " / ".
    /// </summary>
    private static void cuttest(string sentence)
    {
        string[] result = WordTool.Cut(sentence).ToArray();
        Console.WriteLine(string.Join(" / ", result));
    }
}
178 | }
179 |
--------------------------------------------------------------------------------