├── stop_words
├── README.md
├── words_mining.py
└── words.hongloumeng
/stop_words:
--------------------------------------------------------------------------------
1 | 的
2 | 了
3 | 着
4 | 这
5 | 在
6 | 是
7 | 去
8 | 却
9 | 让
10 | 你
11 | 我
12 | 他
13 | 她
14 | 它
15 | 们
16 | 令人
17 | 已被
18 | 得很
19 | 结果
20 | 已毕
21 | 并没
22 | 性情
23 | 家务
24 | 原故
25 | 正值
26 | 现场
27 | 功劳
28 | 实在
29 | 为何
30 | 全部
31 | 全都
32 | 只有
33 | 所有
34 | 事情
35 | 出现
36 | 身体
37 | 身躯
38 | 身材
39 | 还请
40 | 身份
41 | 体积
42 | 体型
43 | 身形
44 | 听完
45 | 之后
46 | 之物
47 | 就是
48 | 还是
49 | 还有
50 | 有点
51 | 其他
52 | 都没
53 | 没有
54 | 还请
55 | 承受
56 | 担心
57 | 达到
58 | 模样
59 | 照片
60 | 背景
61 | 关系
62 | 并不
63 | 无法
64 | 情节
65 | 接著
66 | 什麽
67 | 一眼
68 | 一个
69 | 一丝
70 | 一些
71 | 一定
72 | 一旁
73 | 一句
74 | 一声
75 | 一阵
76 | 一径
77 | 一伸
78 | 一愣
79 | 一笑
80 | 一边
81 | 效果
82 | 这个
83 | 已经
84 | 之后
85 | 之意
86 | 之恩
87 | 之罪
88 | 大胆
89 | 自己
90 | 我们
91 | 你们
92 | 多久
93 | 他们
94 | 算是
95 | 数量
96 | 等人
97 | 像是
98 | 都是
99 | 什么
100 | 也就
101 | 也是
102 | 也可
103 | 也会
104 | 随之
105 | 根本
106 | 从来
107 | 从此
108 | 最后
109 | 却是
110 | 极为
111 | 差点
112 | 感觉
113 | 名字
114 | 唤做
115 | 方才
116 | 双手
117 | 谢恩
118 | 即忙
119 | 即唤
120 | 即命
121 | 今已
122 | 证据
123 | 脑袋
124 | 远超
125 | 吓得
126 | 情况
127 | 威觉
128 | 交代
129 | 交流
130 | 问题
131 | 现场
132 | 好便似
133 | 好道也
134 | 也不敢
135 | 不弱于
136 | 不可能
137 | 这么多
138 | 能不能
139 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 基于自由度及凝固度的新词发现/新词挖掘
2 |
3 | ## 流程图
4 |
5 |
6 |

7 |
8 |
9 | ## 四大名著样例
10 |
11 | TOP50(按词频降序排序)
12 |
13 | 《红楼梦》
14 | 宝玉, 凤姐, 贾母, 黛玉, 袭人, 姑娘, 王夫人, 宝钗, 丫头, 老太太, 贾政, 奶奶, 众人, 贾琏, 老爷, 东西, 姐姐, 大家, 二爷, 薛姨妈, 凤姐儿, 探春, 鸳鸯, 紫鹃, 湘云, 婆子, 妹妹, 贾珍, 银子, 李纨, 晴雯, 尤氏, 媳妇, 外头, 刘姥姥, 薛蟠, 邢夫人, 小丫头, 孩子, 林黛玉, 姊妹, 香菱, 麝月, 哥哥, 丫鬟, 贾蓉, 小厮, 意思, 二奶奶, 主意
15 |
16 | 《西游记》
17 | 行者, 八戒, 师父, 三藏, 大圣, 唐僧, 沙僧, 和尚, 菩萨, 长老, 妖精, 老孙, 悟空, 国王, 那怪, 徒弟, 闻言, 大王, 小妖, 兄弟, 宝贝, 孙行者, 铁棒, 龙王, 妖怪, 师徒, 太子, 东土, 性命, 孙大圣, 老爷, 那呆子, 神通, 公主, 妖魔, 人家, 玉帝, 猴王, 哥哥, 土地, 道士, 师兄, 贫僧, 行李, 云头, 陛下, 太宗, 那妖精, 闻得, 爷爷
18 |
19 | 《水浒传》
20 | 宋江, 李逵, 武松, 林冲, 军马, 哥哥, 吴用, 头领, 太尉, 众人, 戴宗, 兄弟, 卢俊义, 梁山泊, 先锋, 燕青, 好汉, 花荣, 王庆, 晁盖, 石秀, 那妇人, 杨志, 鲁智深, 柴进, 呼延灼, 太公, 山寨, 秦明, 和尚, 史进, 天子, 张顺, 公孙胜, 兄长, 弟兄, 关胜, 军士, 朱仝, 知府, 张清, 庄客, 杨雄, 李俊, 性命, 那厮, 小弟, 东京, 小喽罗, 大官人
21 |
22 | 《三国演义》
23 | 玄德, 孔明, 将军, 曹操, 司马, 二人, 丞相, 关公, 引兵, 云长, 荆州, 蜀兵, 夏侯, 张飞, 吕布, 诸葛, 主公, 魏延, 孙权, 赵云, 军士, 魏兵, 刘备, 司马懿, 夫人, 姜维, 袁绍, 东吴, 诸将, 周瑜, 汉中, 都督, 马超, 陛下, 天子, 后主, 黄忠, 张郃, 先主, 太守, 先生, 邓艾, 孟获, 先锋, 诸葛亮, 汝等, 江东, 曹仁, 张辽, 领兵
24 |
25 | ## 计算速度
26 | 单进程 40K/s
27 |
28 |
29 |
30 | 参考文章: [互联网时代的社会语言学:基于SNS的文本数据挖掘][1]
31 |
32 | [1]: http://www.matrix67.com/blog/archives/5044
33 |
--------------------------------------------------------------------------------
/words_mining.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import time
4 | import math
5 |
6 |
def loda_word_dict():
    """Load the optional jieba part-of-speech dictionary and the stop words.

    Both files are looked up relative to the current working directory and
    both are optional; a missing file yields an empty container.

    * ``./dict.txt.big`` -- jieba dictionary, one space-separated
      ``word frequency tag`` entry per line:
      https://github.com/elephantnose/words_mining/blob/master/dict.txt.big
      (the part-of-speech tags shipped with jieba are only moderately
      accurate; edit the file as needed).
    * ``./stop_words`` -- one stop word per line:
      https://github.com/elephantnose/words_mining/blob/master/stop_words

    Returns:
        tuple: ``(word_dict, stop_words)`` where ``word_dict`` maps each word
        to its part-of-speech tag (``dict``) and ``stop_words`` is a ``set``
        of stop words.
    """
    word_dict = {}
    # Always a set, so the return type does not depend on the file existing.
    stop_words = set()

    if os.path.exists("./dict.txt.big"):
        with open("./dict.txt.big", "r", encoding="utf8") as fr:
            for line in fr:
                fields = line.strip().split(" ")
                # Skip blank or truncated lines instead of raising IndexError.
                if len(fields) < 3:
                    continue
                word_dict[fields[0]] = fields[2]

    if os.path.exists("./stop_words"):
        with open("./stop_words", encoding="utf8") as fr:
            # Blank lines would otherwise register "" as a stop word.
            stop_words = {word.strip() for word in fr} - {""}

    return word_dict, stop_words
26 |
27 |
class Config(object):
    """Thresholds and shared lookup tables used by the word miner.

    Attributes:
        words_length: Maximum length, in characters, of a candidate word.
        pmi_limit: Cohesion (PMI) threshold; candidates scoring below it are
            rejected.
        left_entropy_limit: Left-entropy threshold; candidates scoring below
            it are rejected.
        right_entropy_limit: Right-entropy threshold; candidates scoring
            below it are rejected.
        word_frequency_limit: Frequency threshold; candidates occurring fewer
            times are rejected.
        word_dict: Mapping of word to part-of-speech tag, as returned by
            ``loda_word_dict``.
        stop_words: Collection of stop words, as returned by
            ``loda_word_dict``.
    """

    # Author / project metadata.
    __author__ = "elephantnose"
    __github__ = "https://github.com/elephantnose"
    __jianshu__ = "https://www.jianshu.com/u/39efcd8e2587"
    __segmentfault__ = "https://segmentfault.com/u/elephantnose"

    words_length = 5
    pmi_limit = 1.5
    left_entropy_limit = 1
    right_entropy_limit = 1
    word_frequency_limit = 5
    # Loaded once, when the class body is executed (i.e. at import time).
    word_dict, stop_words = loda_word_dict()
48 |
49 |
class ContentHandler(object):
    """Score the n-grams of a text and keep those that look like words.

    The text is split into segments at every character that is not a CJK
    ideograph (U+4E00-U+9FFF), an ASCII letter or an ASCII digit. One trie is
    built over the segments and a second one over the reversed text; the
    second trie gives access to the characters that precede an n-gram.
    """

    # Compiled once; matches every character that separates two segments.
    _SEPARATOR = re.compile("[^\u4E00-\u9FFFa-zA-Z0-9]")

    def __init__(self, content):
        """
        Args:
            content: The text to analyse.

        Attributes:
            length: Number of characters in the text, separators excluded.
            book_content: Segments of the text in reading order.
            r_book_content: Segments of the reversed text.
            tire_tree: Trie built from ``book_content``.
            r_tire_tree: Trie built from ``r_book_content``.
        """
        self.length = 0
        self.book_content, self.r_book_content = self._preprocess(content=content)

        self.tire_tree = TireTree(self.book_content)
        self.r_tire_tree = TireTree(self.r_book_content)

    def pmi(self, char_node):
        """Return the cohesion (pointwise mutual information, log base 10).

        The n-gram is cut at every inner position; the cut whose two halves
        have the largest product of probabilities is used as the denominator.

        Raises:
            ValueError: If the node name has a single character, because
                there is no position at which to cut it.
        """
        total = self.length
        name = char_node.name
        search = self.tire_tree.search_node
        p_x_y = char_node.count / total

        # Enumerate every way to cut the n-gram in two, keep the likeliest.
        px_py = max(
            (search(name[:i]).count / total) * (search(name[i:]).count / total)
            for i in range(1, len(name))
        )
        return math.log10(p_x_y / px_py)

    @staticmethod
    def _entropy(node):
        """Return the base-10 entropy of the child distribution of ``node``."""
        total = node.child_counts
        entropy = 0
        for child in node.child.values():
            p_child = child.count / total
            entropy += p_child * math.log10(p_child)
        return -entropy

    def left_entropy(self, char_node):
        """Return the entropy of the characters that precede the n-gram."""
        # In the reversed trie the preceding characters are the children of
        # the reversed n-gram.
        return self._entropy(self.r_tire_tree.search_node(char_node.name[::-1]))

    def right_entropy(self, char_node):
        """Return the entropy of the characters that follow the n-gram."""
        # Children and their total are read from the same looked-up node.
        return self._entropy(self.tire_tree.search_node(char_node.name))

    def word_frequency(self, char_node):
        """Return the number of occurrences of the n-gram."""
        return char_node.count

    @staticmethod
    def _is_plain_ascii_token(chars):
        """Return True for all-lowercase ASCII letters or all-ASCII digits."""
        # bytes.isalpha()/isdigit() only accept ASCII, unlike the str methods.
        raw = chars.encode("utf8")
        return (raw.isalpha() and raw.islower()) or raw.isdigit()

    def get_words(self, node, layer, res_data):
        """Collect every accepted n-gram found below ``node``.

        Args:
            node: Trie node whose descendants are examined.
            layer: Layer of ``node`` in the trie.
            res_data: List that receives one
                ``[name, frequency, pmi, left entropy, right entropy]`` row
                per accepted n-gram; it is modified in place.
        """
        # The deepest trie layer is never a candidate itself; it only
        # provides the right-hand neighbours of the longest candidates.
        if layer >= self.tire_tree.layer - 1:
            return

        for c_name, c_node in node.child.items():
            # Descend first, so longer n-grams are reported before this one.
            self.get_words(c_node, layer + 1, res_data)

            # Drop pure lowercase-English and pure-digit strings.
            if self._is_plain_ascii_token(c_name):
                continue
            # Dictionary and stop-word filter.
            if not self.word_dict_filter(c_name):
                continue
            # Threshold filter.
            plrf = self.limit_filter(c_node)
            if not plrf:
                continue

            res_data.append(plrf)

    def limit_filter(self, node):
        """Apply the ``Config`` thresholds to ``node``.

        The checks run in order (frequency, cohesion, left entropy, right
        entropy) and stop at the first one that fails.

        Returns:
            ``False`` if a threshold is not met, otherwise the list
            ``[name, frequency, pmi, left entropy, right entropy]``.
        """
        frequency = node.count
        if frequency < Config.word_frequency_limit:
            return False

        cohesion = self.pmi(node)
        if cohesion < Config.pmi_limit:
            return False

        left = self.left_entropy(node)
        if left < Config.left_entropy_limit:
            return False

        right = self.right_entropy(node)
        if right < Config.right_entropy_limit:
            return False

        return [node.name, frequency, cohesion, left, right]

    def word_dict_filter(self, chars):
        """Return False if any contiguous substring of ``chars`` is rejected.

        A substring is rejected when the dictionary lists it with a
        part-of-speech tag that does not start with ``"n"``, or when it is a
        stop word. Substrings missing from the dictionary pass the tag check.
        """
        for char in self.permutation(1, chars):
            # Reject non-nouns that the jieba dictionary already knows.
            if not Config.word_dict.get(char, "n").startswith("n"):
                return False
            # Reject stop words.
            if char in Config.stop_words:
                return False
        return True

    def permutation(self, start_size, char):
        """Yield every contiguous substring of at least ``start_size`` chars.

        Substrings are produced shortest first and, within one length, from
        left to right.
        """
        for size in range(start_size, len(char) + 1):
            for index in range(len(char) + 1 - size):
                yield char[index: index + size]

    def _preprocess(self, content):
        """Split the text and the reversed text into segments.

        Also stores the number of non-separator characters in ``self.length``
        when it has not been set yet.

        Returns:
            tuple: ``(content_list, r_content_list)``; both lists may contain
            empty strings where separators are adjacent or at either end.
        """
        content_list = self._SEPARATOR.split(content)
        r_content_list = self._SEPARATOR.split(content[::-1])

        if not self.length:
            self.length = sum(len(segment) for segment in content_list)

        return content_list, r_content_list
172 |
173 |
class Node(object):
    """One trie node, i.e. one distinct n-gram of the text.

    Attributes:
        name: The n-gram this node stands for.
        count: Number of occurrences of the n-gram.
        father: Parent node.
        child: Child nodes, keyed by their ``name``.
        child_counts: Total number of child occurrences (not de-duplicated).
    """

    # One node exists per distinct n-gram, so skip the per-instance __dict__.
    __slots__ = ("name", "count", "father", "child", "child_counts")

    def __init__(self, name, father):
        self.name = name
        self.count = 0
        self.father = father
        self.child = {}
        self.child_counts = 0

    def __repr__(self):
        return "Node(name={!r}, count={!r}, children={!r})".format(
            self.name, self.count, len(self.child)
        )
188 |
189 |
class TireTree(object):
    """Trie of the n-grams of a list of text segments.

    Layer ``k`` of the trie holds the ``k``-character n-grams; layer 0 is the
    root. Every node is keyed in its parent's ``child`` dict by its full
    n-gram, not by its last character.
    """

    def __init__(self,
                 content,
                 layer_num=Config.words_length+1,
                 step=1):
        """Build the trie.

        Args:
            content: Iterable of text segments to index.
            layer_num: Number of layers, i.e. the longest n-gram stored.
            step: Stride of the sliding window that cuts the n-grams.
        """
        self.content = content
        self.root = Node("ROOT", None)
        self.layer = layer_num
        self.step = step
        self.word_counts = 0

        self.build_tree()

    def build_tree(self):
        """Insert or count every n-gram, one layer at a time."""
        # Layers are built in increasing order, so the parent of every
        # n-gram (its prefix) already exists when the n-gram is inserted.
        for layer in range(1, self.layer+1):
            # Sliding window that cuts the segments into n-grams.
            char_session = CharSession(size=layer, step=self.step)
            for char in char_session.split_char(self.content):
                # One lookup decides between creating and counting the node.
                node = self.search_node(char, layer)
                if node is None:
                    self.add_node(char, layer)
                else:
                    self._increment(node)

    @staticmethod
    def _increment(node):
        """Count one more occurrence of ``node`` on it and on its parent."""
        node.count += 1
        node.father.child_counts += 1

    def add_node(self, char, layer=None):
        """Create the node for ``char`` with a count of 1.

        Args:
            char: The n-gram to add.
            layer: Layer of the new node; defaults to ``len(char)``.

        Returns:
            ``True``.

        Raises:
            AttributeError: If the parent n-gram is not in the trie.
        """
        if not layer:
            layer = len(char)

        if layer == 1:
            father = self.root
        else:
            # The explicit layer keeps a parent spelled "ROOT" from being
            # mistaken for the root sentinel.
            father = self.search_node(char[:-1], len(char) - 1)

        node = Node(name=char, father=father)
        node.count = 1

        # Hook the node into the trie.
        father.child[char] = node
        father.child_counts += 1
        return True

    def del_node(self, char, layer=None):
        """Not implemented; does nothing."""
        pass

    def update_node(self, char, layer=None):
        """Count one more occurrence of the existing node for ``char``.

        Args:
            char: The n-gram to count.
            layer: Layer of the node; defaults to ``len(char)``.

        Returns:
            ``True``.

        Raises:
            AttributeError: If ``char`` is not in the trie.
        """
        if not layer:
            layer = len(char)

        self._increment(self.search_node(char, layer))
        return True

    def search_node(self, char, layer=None):
        """Return the node for ``char``, or ``None`` if it is not stored.

        Args:
            char: The n-gram to look up. The string ``"ROOT"`` without a
                ``layer`` returns the root node.
            layer: Layer to search; defaults to ``len(char)``.
        """
        if not layer:
            # "ROOT" is a sentinel only when no layer is given, so an n-gram
            # spelled "ROOT" in the text can still be stored and found by
            # passing its layer.
            if char == "ROOT":
                return self.root
            layer = len(char)

        node = self.root
        # Walk down through the successive prefixes of the n-gram.
        for layer_index in range(1, layer+1):
            node = node.child.get(char[:layer_index], None)
            if node is None:
                return None

        return node
272 |
273 |
class CharSession(object):
    """Sliding window that cuts text segments into fixed-size pieces."""

    def __init__(self, size, step=1):
        """
        Args:
            size: Window size, in characters; must be at least 1.
            step: Number of characters the window advances each time; must
                be at least 1.

        Raises:
            ValueError: If ``size`` or ``step`` is smaller than 1.
        """
        if size < 1:
            raise ValueError("size must be >= 1, got {!r}".format(size))
        # A step of 0 would never advance the window.
        if step < 1:
            raise ValueError("step must be >= 1, got {!r}".format(step))
        self.size = size
        self.step = step

    def split_char(self, content):
        """Yield every full window of every segment, left to right.

        Args:
            content: Iterable of text segments.

        Yields:
            Substrings of exactly ``size`` characters; segments shorter than
            ``size`` yield nothing.
        """
        size, step = self.size, self.step
        for seq in content:
            # Index the segment instead of re-slicing its remainder, which
            # copied the rest of the segment on every step.
            for start in range(0, len(seq) - size + 1, step):
                yield seq[start:start + size]
295 |
296 |
def find_word(file_like):
    """Mine candidate words from a text.

    Args:
        file_like: Either the path of a UTF-8 text file or the text itself.
            The argument is read as a file only when it names an existing
            regular file; anything else is analysed as literal text.

    Returns:
        list: One ``[name, frequency, pmi, left entropy, right entropy]`` row
        per accepted word, in trie traversal order (not sorted).
    """
    # isfile() rather than exists(): a directory name must not be opened.
    if os.path.isfile(file_like):
        with open(file_like, encoding="utf8") as fr:
            content = fr.read()
    else:
        content = file_like

    content_handler = ContentHandler(content)
    words = []

    # Start from each single-character node directly below the root.
    for child_node in content_handler.tire_tree.root.child.values():
        content_handler.get_words(child_node, layer=1, res_data=words)

    return words
314 |
315 |
if __name__ == '__main__':
    import sys

    # Optional first argument: the text file to mine.
    source = sys.argv[1] if len(sys.argv) > 1 else "./hongloumeng.txt"
    if not os.path.isfile(source):
        # find_word() analyses a non-file argument as literal text, so say
        # so instead of silently producing an empty result.
        print("warning: {!r} is not a file; treating it as text".format(source),
              file=sys.stderr)

    # perf_counter() is monotonic, unlike the wall clock.
    stime = time.perf_counter()

    res_data = find_word(source)
    # Most frequent words first; each row is [name, frequency, ...].
    res_data.sort(key=lambda row: row[1], reverse=True)

    for each_ele in res_data:
        print(*each_ele, sep="\t")

    etime = time.perf_counter()
    print("ALL DONE! 耗时 {} s".format(etime-stime))
326 |
--------------------------------------------------------------------------------
/words.hongloumeng:
--------------------------------------------------------------------------------
1 | 宝玉 3861 1.920249695660435 1.8683176015164489 2.008151912411824
2 | 凤姐 1696 2.275124596786582 1.7668487030119284 1.6380616776396568
3 | 贾母 1623 2.031328880152934 1.6402792484677713 1.8720408641173383
4 | 黛玉 1317 2.082141604276614 1.3953585334549905 1.91362302465702
5 | 袭人 1129 1.8210406779736985 1.7331325814552954 1.825454318223042
6 | 姑娘 1097 2.496967933291215 1.7031773172478153 1.9506689103882342
7 | 王夫人 1065 1.8402287433548559 1.5802223696419404 1.8642857385020823
8 | 宝钗 1048 2.0871233075074116 1.7386438236061736 1.8692063744580105
9 | 丫头 969 2.23731541908231 1.3194072121984664 1.844804920230693
10 | 老太太 946 2.1492575135373446 1.7774458681510545 1.9395640257048075
11 | 贾政 912 2.132523384311854 1.6999817333454295 1.843800906243123
12 | 奶奶 846 2.2839769647542743 1.3761953348430698 2.047871938223925
13 | 众人 834 1.703348563597313 1.5659121110474876 1.5729242846707672
14 | 贾琏 764 2.085585127936571 1.7552048027535454 1.8841105963529399
15 | 老爷 638 2.006942917138157 1.7615066285152836 2.000439762118316
16 | 东西 584 2.843978170464574 1.4587457769845338 1.8539709720758677
17 | 姐姐 579 1.5600462902044658 1.5996945301764403 1.9537517096113668
18 | 二爷 496 1.897262354148805 1.478486490822326 2.019948791132446
19 | 薛姨妈 453 2.8027356654731035 1.5713701725230598 1.7643545136821832
20 | 凤姐儿 451 1.5029221997147726 1.604483628877994 1.7051544369753253
21 | 探春 428 2.8027749677486096 1.6245619336969461 1.657029991585864
22 | 鸳鸯 426 3.209579987836052 1.6268124409154276 1.7681143376631647
23 | 紫鹃 416 3.147255291244207 1.5307701614640774 1.8131400985222335
24 | 湘云 394 2.8394749131649806 1.045556330403844 1.6507707586826375
25 | 婆子 392 1.9469532701585468 1.081621517364775 1.7302313100945823
26 | 妹妹 390 2.3215042503787076 1.287503631044515 1.9107197787171628
27 | 贾珍 384 2.008835906825046 1.6716602143105526 1.8137807198175355
28 | 银子 362 1.9153151722484163 1.3218141118396858 1.6883526875269577
29 | 李纨 359 3.019340005686991 1.5298303229786305 1.597757693637426
30 | 晴雯 338 3.2984114846165857 1.646512294315143 1.7571766995585902
31 | 尤氏 338 2.978673419736106 1.6235918474430888 1.7023080352427158
32 | 媳妇 317 3.192794387643852 1.498207203046196 1.8527035621945067
33 | 外头 314 1.7533887708167075 1.3222011233131437 1.882111856100834
34 | 刘姥姥 293 3.0455788425703787 1.4059942841206015 1.6304486651592753
35 | 薛蟠 284 2.837619363552421 1.685418814980996 1.8505170187878108
36 | 邢夫人 283 1.8402287433548559 1.667419299459424 1.7291522687552563
37 | 小丫头 275 2.0147226659882445 1.2464278857974624 1.3452231538009827
38 | 孩子 250 2.0101141525141037 1.0219335790144668 1.6251820373882222
39 | 林黛玉 249 2.084443814418511 1.367402241373067 1.7558237514585642
40 | 姊妹 247 2.752092861141172 1.5301659534875365 1.474738042407255
41 | 香菱 241 2.8887437740650905 1.6512189824986296 1.8790629552762699
42 | 麝月 233 2.932531915435284 1.4324506205217118 1.512570954939784
43 | 哥哥 230 2.5250966915000497 1.3421007156264415 1.8502219499810209
44 | 丫鬟 229 2.7666080150789183 1.4214167951894632 1.5941761442487457
45 | 贾蓉 224 2.0153347892181097 1.475562172457836 1.8060627943594254
46 | 小厮 212 2.4971145030341195 1.2617759440290894 1.4631055762142229
47 | 二奶奶 209 1.838338516824433 1.4969280832987364 1.8207428710118478
48 | 主意 201 2.35357723398317 1.1231725941765343 1.4167625700981725
49 | 惜春 195 2.700535797254927 1.5027015629896956 1.6377491552823815
50 | 贾芸 191 2.0560817187703218 1.2809803285833437 1.685368108137415
51 | 贾赦 188 2.0937386003176433 1.6996524707924239 1.5666510102884166
52 | 雨村 183 3.2469244950313993 1.2682676993439737 1.7969732300400767
53 | 嫂子 180 2.059775106871739 1.320929711722148 1.7250800110874873
54 | 兄弟 180 3.0540585069713693 1.2787164734532022 1.8760567372531505
55 | 母亲 174 1.7406039675379534 1.0613764177689347 1.8324175714368378
56 | 素日 173 2.1121415853864356 1.4646164340126455 1.8863904967574552
57 | 芳官 158 2.856166987360585 1.540783425612464 1.723180213673603
58 | 金桂 153 2.891757665760179 1.5369577011967788 1.5796429029927048
59 | 妙玉 152 1.849646895025003 1.3888967513586365 1.6530102617813554
60 | 贾环 151 1.9467260600742642 1.5250966645371395 1.556825062712983
61 | 雪雁 147 3.353379210064482 1.3124134562881018 1.6721377095428114
62 | 迎春 142 2.545180585217271 1.5874329334451118 1.7021202724264213
63 | 奴才 136 2.4465478220558246 1.3293526877206518 1.6157211773376368
64 | 赵姨娘 132 2.5576267401990345 1.4225518242127881 1.7094382677092332
65 | 衣服 132 2.8501725784733782 1.3420341604416457 1.6541386143739198
66 | 林姑娘 132 2.0483536457842275 1.5302460913640068 1.7332787874185303
67 | 莺儿 130 2.036778820599282 1.5045717658027198 1.6218718165925505
68 | 府里 130 1.5202859688851877 1.268319759623021 1.5206564009430266
69 | 和尚 119 2.2688787855663004 1.1629743299053878 1.561667906450683
70 | 宝蟾 117 2.085586271776365 1.4858028950409359 1.6228263060831636
71 | 宝琴 117 1.891909837919571 1.49128394831757 1.6324843146737815
72 | 妈妈 116 1.980441737517973 1.3271901757696414 1.6741004472224266
73 | 亲戚 110 2.7578076882803284 1.3675028229825108 1.3533020389655708
74 | 地方 108 2.013091659803695 1.1847732720361819 1.372555727613529
75 | 秦钟 107 3.26586151431019 1.4643613601434027 1.708315093074342
76 | 姨太太 107 1.6361861101509727 1.493844288873282 1.6424906261420298
77 | 光景 105 3.139127931698933 1.003122699259533 1.2570217505692511
78 | 老婆子 104 1.8696514267891124 1.2067355014299854 1.5611997189892064
79 | 薛蝌 102 2.8476022729596813 1.3710046675713963 1.6029214798522224
80 | 父母 100 1.973901353507686 1.6133192428399632 1.6861954824038432
81 | 秋纹 99 3.2413020971732776 1.0628765744289488 1.500343103494676
82 | 衣裳 96 3.200650747571753 1.3910367184470929 1.4022682063551253
83 | 林妹妹 94 2.3500495887262884 1.3437158107723286 1.551174469853685
84 | 岫烟 94 3.4644669064647613 1.1721462863970828 1.6179077531491255
85 | 贾兰 91 1.9083446598593719 1.2074451710530374 1.5846531980656087
86 | 尤二姐 91 2.314782157370063 1.256882502758549 1.6075655580689003
87 | 小丫头子 91 1.6464305949931832 1.1671278327377754 1.455560157892341
88 | 婶子 89 1.8857802049769736 1.4263217150512204 1.6442323657891802
89 | 赖大 88 1.9720535841114635 1.3481817823021764 1.3570497166214297
90 | 茗烟 88 3.2094253476481542 1.2936676187805067 1.462697289790773
91 | 工夫 87 2.3355545161254727 1.1700450977613046 1.472683057527292
92 | 先生 86 1.782602162880173 1.2490782999849188 1.6725413374666984
93 | 史湘云 83 2.936615468150483 1.1167266692561717 1.4818584627411038
94 | 书房 81 1.9447220420871507 1.1980029450630048 1.1122918056832731
95 | 侄儿 79 1.8552246658420986 1.4724532966944543 1.6062397302877012
96 | 老祖宗 78 2.277300916847658 1.4434654963717144 1.5071576787489693
97 | 荣府 74 2.543049146462673 1.4175367779887604 1.326531569472486
98 | 士隐 74 3.2989707821593113 1.0974329396418676 1.5351919473584572
99 | 贾瑞 73 1.5298405861108517 1.2202277529836547 1.4791824959295714
100 | 秦氏 73 2.6793532470755475 1.3496486086534079 1.5503521948873955
101 | 旺儿 72 1.9487322679225307 1.3360715182633534 1.4139028374643523
102 | 叔叔 72 2.9811882373757586 1.3368973539377464 1.5939565763911745
103 | 眼睛 70 3.001888507252217 1.2518272343083605 1.447296889754225
104 | 吃茶 70 1.8560314821799047 1.5123749522725076 1.1836914970589403
105 | 女孩儿 68 1.7835756200989357 1.1345292598453884 1.244650590369866
106 | 司棋 68 3.4821914891145633 1.3376474758949746 1.5279235635487676
107 | 读书 67 2.8118313631458163 1.6056695829482026 1.2338558116407277
108 | 规矩 67 3.9007439146222627 1.2841283941227133 1.4106706871416514
109 | 闻得 66 1.7453538415426273 1.3481817823021767 1.648778844169048
110 | 太监 66 1.9744595331288537 1.1442856792286127 1.469948286599035
111 | 贾蔷 63 1.965029670906714 1.209981208498597 1.40668475109739
112 | 眼泪 59 2.2484165327398427 1.1969913744053495 1.288687099355286
113 | 宝兄弟 59 1.6157652868854915 1.2744462746869218 1.568213117869548
114 | 琥珀 58 4.087651707165842 1.0845840056546714 1.3612742599196843
115 | 焙茗 58 3.646959718072854 1.2019459763759048 1.216485355672154
116 | 彩云 57 2.4163427585876653 1.2183584300502075 1.4258106636882955
117 | 包勇 56 3.451838759036323 1.1293529119916217 1.400320382450168
118 | 晚饭 54 2.3438612018060754 1.0205192698167445 1.1236701124308353
119 | 宁府 54 2.5767293876986415 1.3423372781277623 1.3357335131551762
120 | 倒象 53 1.8851311291385087 1.001773499021995 1.2508068690320109
121 | 并无 51 1.6331655459032257 1.2570217505692516 1.5281985296598823
122 | 大观园 51 2.3264239883710016 1.3059165694524901 1.1339577373391931
123 | 鲍二 50 2.420256728487632 1.2879122273284427 1.2252133255197561
124 | 风流 50 2.581543407972812 1.5792881853887293 1.3892526180087998
125 | 舅舅 50 2.8456846688420727 1.2390210148604943 1.4691843482137972
126 | 巧姐儿 50 1.7728201363108833 1.2602934735623719 1.4862307299936792
127 | 代儒 50 3.7104459755939607 1.056942025200422 1.3119144044890476
128 | 文章 49 3.282877976661784 1.3166251430300524 1.1680505452985441
129 | 北静王 47 2.594454060611855 1.2209650647939894 1.2943079330492295
130 | 元妃 46 3.2726149012144616 1.2672962814527096 1.4908927082394159
131 | 亲友 46 2.421576683356765 1.4457130351072818 1.3290657973597024
132 | 海棠 45 3.523380276727279 1.3230654384702145 1.2537106030277168
133 | 扇子 45 1.8454359590341283 1.274604822671226 1.29548525189965
134 | 师父 44 2.944304024159872 1.1659601640196429 1.5031606305853589
135 | 弟兄 43 2.43225445744765 1.1600794788105424 1.3729390241768458
136 | 邢王二夫人 42 1.8402287433548559 1.184420824864272 1.3028034946859763
137 | 玉钏儿 42 1.799208085937762 1.1424355007077336 1.3432307444612757
138 | 混帐 42 2.8384433058454066 1.1275442038508285 1.1051638171517637
139 | 打谅 42 2.5188914049100717 1.2028877460153864 1.1657608993919972
140 | 丰儿 42 1.8689118834491711 1.1162337185377762 1.3286609289269573
141 | 角门 41 2.3174068232255873 1.2926055547460202 1.1659315219598279
142 | 厅上 41 1.917000226687207 1.0811621533994946 1.3170731724059996
143 | 作诗 40 1.9144629260642272 1.4130175530116131 1.048843810057643
144 | 针线 39 3.5891766167264594 1.2281339676616938 1.1692558083232352
145 | 金钏儿 39 1.9616659366860698 1.1781710659629392 1.4257431431580816
146 | 张华 39 2.831963410281706 1.2125707966370325 1.3851623617739424
147 | 倪二 39 2.364529342843184 1.0244373361363204 1.352145250701002
148 | 颜色 38 3.05392154019883 1.175714509968042 1.2991925804077487
149 | 菩萨 38 4.238295844009043 1.3343515271072905 1.3205710848716485
150 | 板儿 38 1.703712592934992 1.0516015891793142 1.329185343066291
151 | 大嫂子 38 1.6509350798845057 1.0977958056755082 1.3405575876193938
152 | 金陵 37 2.9376238522247884 1.1062373268101808 1.2310779570619612
153 | 神仙 37 2.503428007265168 1.1217611155679588 1.2287224654717086
154 | 李嬷嬷 37 2.4357453431927816 1.0132944582093153 1.234789286917152
155 | 银钱 36 1.8906327077713856 1.1547757200762878 1.2122438289850557
156 | 金荣 36 2.1648531959863924 1.195261601815712 1.296312483049863
157 | 精神 36 2.669807929169317 1.3221618768308905 1.3922799373649233
158 | 甄宝玉 36 1.741501153824392 1.1083941944520206 1.3851623617739424
159 | 王仁 36 2.2915720600862803 1.2122438289850554 1.3701266324953756
160 | 月钱 36 1.7675412494986393 1.0266378908108658 1.1561994984127493
161 | 族中 36 2.316250089389043 1.37256000258626 1.1351571219224417
162 | 坠儿 36 1.886285979518594 1.2107592620876937 1.288928464604505
163 | 二哥哥 36 1.6401372585020968 1.091137509132795 1.258556136010284
164 | 二叔 36 1.6382531146275454 1.0493381639457804 1.2848996986361956
165 | 秋桐 35 3.2795372334047546 1.15231566512091 1.3592466188729728
166 | 新鲜 35 3.074067380381816 1.2812285866959998 1.213572880038171
167 | 尤三姐 35 2.3119892571858927 1.0857961488469916 1.2768882211571222
168 | 荣国府 34 2.985053120873014 1.1593349918922735 1.167630178225665
169 | 照管 34 1.941901373626415 1.34075177440518 1.2226279650578922
170 | 梅花 34 2.5370265002774586 1.3715192480087866 1.2593485078862006
171 | 园门 34 1.5394236759916733 1.0494004530231424 1.100982673971512
172 | 藕官 33 2.7780838316120122 1.0886293931650515 1.3114547368203622
173 | 甄家 33 1.7106287979422312 1.2222606640310345 1.1609416736221438
174 | 琏二爷 33 1.7286433789846491 1.2609309280316907 1.2815181461465335
175 | 春燕 33 2.4371706142299274 1.1031237505833833 1.2194636332345474
176 | 女婿 33 2.817948497952052 1.2935651950091311 1.3100293267182603
177 | 饮食 32 3.3301051623032345 1.1681337652831403 1.0886293931650515
178 | 婆婆 32 1.8101202923201596 1.1819257557628253 1.3014011874747542
179 | 箱子 31 1.7855746776303887 1.25270812939223 1.0425513192597389
180 | 桂花 31 2.048946251091027 1.296312483049863 1.2055908646113058
181 | 庙里 31 1.6376759197526574 1.22114767275494 1.2885271841708332
182 | 山石 31 2.5973978483122715 1.1016302863790104 1.0142697749229166
183 | 蒋玉菡 29 4.307011656378503 1.1214813959193146 1.3497752093658009
184 | 荷包 29 3.1810258858987317 1.1196192149346 1.15231566512091
185 | 琏二奶奶 29 2.0478628273848614 1.0527487960856206 1.322348733920363
186 | 珍大爷 29 2.270355233080358 1.1405288753313518 1.2374070497365839
187 | 比先 29 1.6552182045734707 1.2471415861560746 1.085958301447765
188 | 桃花 29 2.515587215569386 1.2603235910956663 1.1104159301996999
189 | 张罗 29 2.8297620577239355 1.4001159298305461 1.060119465488529
190 | 帖儿 29 1.5960868315062946 1.0046303971855928 1.0627843095674208
191 | 夫妻 29 2.0160411147674786 1.1686120220698188 1.185898936242587
192 | 题目 28 2.84156810264724 1.0634105123887099 1.026637890810866
193 | 男女 28 2.11942187750539 1.3100293267182606 1.2935497713373494
194 | 宁国府 28 2.9976422481810343 1.1242030405556924 1.012450248147257
195 | 众姊妹 28 1.8474945484160505 1.0791812460476247 1.2306771003829997
196 | 芙蓉 27 3.352411849936793 1.2703994189276515 1.1204119982655922
197 | 舅母 27 1.6033109526104048 1.0441456077951246 1.2453135859556532
198 | 益发 27 2.3925817436295778 1.0405296753565512 1.2891562186230305
199 | 玻璃 27 4.373958446009117 1.0703221291572096 1.3300395757676093
200 | 手帕子 27 1.9898839365943446 1.1647752718815907 1.044981826173945
201 | 女尼 27 2.605859585624852 1.0746090020803172 1.0232249084065224
202 | 李氏 26 1.7492797703980105 1.0958165935452864 1.2110849003808126
203 | 房屋 26 1.560163689861027 1.0757790619213747 1.1359539263004839
204 | 冯紫英 26 4.063292361306397 1.0601194654885289 1.1063788149324716
205 | 雀儿 25 1.7126835208666074 1.02599452062265 1.2603235910956663
206 | 铁槛寺 25 4.065749865718011 1.0703221291572096 1.0880336186233166
207 | 贵妃 25 2.726784196062831 1.1951419200116562 1.0556720202883954
208 | 胭脂 25 4.045986822386106 1.0961844950135782 1.1788716777510708
209 | 盒子 24 1.6436102780935022 1.0293335830649735 1.044981826173945
210 | 珍珠 24 2.3375735550102417 1.0405296753565512 1.1596183341632196
211 | 王子腾 24 2.6383595463109364 1.108356562103204 1.1050717496266387
212 | 形容 24 2.861592803490504 1.2876899543378464 1.2499943283000725
213 | 含泪 24 2.665348722183645 1.1596183341632194 1.1260332722757374
214 | 分派 24 2.082672865518848 1.2055908646113058 1.1757522959833626
215 | 金银 23 1.6331325273240023 1.0962320868634188 1.1680505452985441
216 | 胡乱 23 2.4923174682963203 1.0438877668097386 1.158731857656642
217 | 省亲 23 1.874865329069792 1.21537886502357 1.0179571597331107
218 | 彩霞 23 3.1714148515881466 1.0003923426013537 1.1596183341632196
219 | 家伙 23 1.7579626767378616 1.1947885434319039 1.0676310452816091
220 | 阴阳 22 3.1813825457464717 1.06608149544718 1.1549291732153124
221 | 贾赦等 22 1.644724913196067 1.1031237505833835 1.2603235910956663
222 | 素习 22 2.8581447914018097 1.098458818336966 1.3150563175800265
223 | 柳湘莲 22 2.987162323770918 1.0912337342819318 1.1288624837399295
224 | 俱已 22 1.9877865878244654 1.2041199826559248 1.2876899543378464
225 | 风月 21 1.6317322314219185 1.1664912331979271 1.1088567413913666
226 | 梨香院 21 3.3145212581572485 1.1107114985054047 1.048843810057643
227 | 祖父 20 2.245855486144543 1.1950336277707467 1.1692558083232354
228 | 情愿 20 2.0175947230999833 1.0394022484799927 1.0958165935452864
229 | 舌头 19 1.7335031626915223 1.0013342591828385 1.0438877668097386
230 | 笔砚 19 3.4907454305978702 1.1203167611296814 1.0003923426013537
231 | 碟子 19 1.814410890428682 1.0210614281209909 1.1950336277707467
232 | 册子 18 1.6577185320048693 1.0754089640953688 1.021318738256381
233 | 世兄 17 2.0120085212718264 1.041392685158225 1.1664912331979271
234 | 骨肉 16 3.2501068050420305 1.0601194654885289 1.041392685158225
235 | 顺路 16 2.555485114643579 1.021318738256381 1.0770309979379904
236 | 梯己 15 2.6857364352063198 1.0405296753565512 1.041392685158225
237 | 官员 13 2.577633226696886 1.021318738256381 1.041392685158225
238 |
--------------------------------------------------------------------------------