├── stop_words ├── README.md ├── words_mining.py └── words.hongloumeng /stop_words: -------------------------------------------------------------------------------- 1 | 的 2 | 了 3 | 着 4 | 这 5 | 在 6 | 是 7 | 去 8 | 却 9 | 让 10 | 你 11 | 我 12 | 他 13 | 她 14 | 它 15 | 们 16 | 令人 17 | 已被 18 | 得很 19 | 结果 20 | 已毕 21 | 并没 22 | 性情 23 | 家务 24 | 原故 25 | 正值 26 | 现场 27 | 功劳 28 | 实在 29 | 为何 30 | 全部 31 | 全都 32 | 只有 33 | 所有 34 | 事情 35 | 出现 36 | 身体 37 | 身躯 38 | 身材 39 | 还请 40 | 身份 41 | 体积 42 | 体型 43 | 身形 44 | 听完 45 | 之后 46 | 之物 47 | 就是 48 | 还是 49 | 还有 50 | 有点 51 | 其他 52 | 都没 53 | 没有 54 | 还请 55 | 承受 56 | 担心 57 | 达到 58 | 模样 59 | 照片 60 | 背景 61 | 关系 62 | 并不 63 | 无法 64 | 情节 65 | 接著 66 | 什麽 67 | 一眼 68 | 一个 69 | 一丝 70 | 一些 71 | 一定 72 | 一旁 73 | 一句 74 | 一声 75 | 一阵 76 | 一径 77 | 一伸 78 | 一愣 79 | 一笑 80 | 一边 81 | 效果 82 | 这个 83 | 已经 84 | 之后 85 | 之意 86 | 之恩 87 | 之罪 88 | 大胆 89 | 自己 90 | 我们 91 | 你们 92 | 多久 93 | 他们 94 | 算是 95 | 数量 96 | 等人 97 | 像是 98 | 都是 99 | 什么 100 | 也就 101 | 也是 102 | 也可 103 | 也会 104 | 随之 105 | 根本 106 | 从来 107 | 从此 108 | 最后 109 | 却是 110 | 极为 111 | 差点 112 | 感觉 113 | 名字 114 | 唤做 115 | 方才 116 | 双手 117 | 谢恩 118 | 即忙 119 | 即唤 120 | 即命 121 | 今已 122 | 证据 123 | 脑袋 124 | 远超 125 | 吓得 126 | 情况 127 | 威觉 128 | 交代 129 | 交流 130 | 问题 131 | 现场 132 | 好便似 133 | 好道也 134 | 也不敢 135 | 不弱于 136 | 不可能 137 | 这么多 138 | 能不能 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 基于自由度及凝固度的新词发现/新词挖掘 2 | 3 | ## 流程图 4 | 5 |
6 | 7 |
8 | 9 | ## 四大名著样例 10 | 11 | TOP50(按词频降序排序) 12 | 13 | 《红楼梦》 14 | 宝玉, 凤姐, 贾母, 黛玉, 袭人, 姑娘, 王夫人, 宝钗, 丫头, 老太太, 贾政, 奶奶, 众人, 贾琏, 老爷, 东西, 姐姐, 大家, 二爷, 薛姨妈, 凤姐儿, 探春, 鸳鸯, 紫鹃, 湘云, 婆子, 妹妹, 贾珍, 银子, 李纨, 晴雯, 尤氏, 媳妇, 外头, 刘姥姥, 薛蟠, 邢夫人, 小丫头, 孩子, 林黛玉, 姊妹, 香菱, 麝月, 哥哥, 丫鬟, 贾蓉, 小厮, 意思, 二奶奶, 主意 15 | 16 | 《西游记》 17 | 行者, 八戒, 师父, 三藏, 大圣, 唐僧, 沙僧, 和尚, 菩萨, 长老, 妖精, 老孙, 悟空, 国王, 那怪, 徒弟, 闻言, 大王, 小妖, 兄弟, 宝贝, 孙行者, 铁棒, 龙王, 妖怪, 师徒, 太子, 东土, 性命, 孙大圣, 老爷, 那呆子, 神通, 公主, 妖魔, 人家, 玉帝, 猴王, 哥哥, 土地, 道士, 师兄, 贫僧, 行李, 云头, 陛下, 太宗, 那妖精, 闻得, 爷爷 18 | 19 | 《水浒传》 20 | 宋江, 李逵, 武松, 林冲, 军马, 哥哥, 吴用, 头领, 太尉, 众人, 戴宗, 兄弟, 卢俊义, 梁山泊, 先锋, 燕青, 好汉, 花荣, 王庆, 晁盖, 石秀, 那妇人, 杨志, 鲁智深, 柴进, 呼延灼, 太公, 山寨, 秦明, 和尚, 史进, 天子, 张顺, 公孙胜, 兄长, 弟兄, 关胜, 军士, 朱仝, 知府, 张清, 庄客, 杨雄, 李俊, 性命, 那厮, 小弟, 东京, 小喽罗, 大官人 21 | 22 | 《三国演义》 23 | 玄德, 孔明, 将军, 曹操, 司马, 二人, 丞相, 关公, 引兵, 云长, 荆州, 蜀兵, 夏侯, 张飞, 吕布, 诸葛, 主公, 魏延, 孙权, 赵云, 军士, 魏兵, 刘备, 司马懿, 夫人, 姜维, 袁绍, 东吴, 诸将, 周瑜, 汉中, 都督, 马超, 陛下, 天子, 后主, 黄忠, 张郃, 先主, 太守, 先生, 邓艾, 孟获, 先锋, 诸葛亮, 汝等, 江东, 曹仁, 张辽, 领兵 24 | 25 | ## 计算速度 26 | 单进程 40K/s 27 | 28 | 29 | 30 | 参考文章: [互联网时代的社会语言学:基于SNS的文本数据挖掘][1] 31 | 32 | [1]: http://www.matrix67.com/blog/archives/5044 33 | -------------------------------------------------------------------------------- /words_mining.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import time 4 | import math 5 | 6 | 7 | def loda_word_dict(): 8 | """ 9 | 加载 10 | 结巴词典词性: https://github.com/elephantnose/words_mining/blob/master/dict.txt.big 11 | 停用词: https://github.com/elephantnose/words_mining/blob/master/stop_words 12 | *结巴自带词典词性标注准确度一般, 可根据需要修改 13 | """ 14 | word_dict = {} 15 | stop_words = {} 16 | 17 | if os.path.exists("./dict.txt.big"): 18 | with open("./dict.txt.big", "r", encoding="utf8") as fr: 19 | word_dict = {word.strip().split(" ")[0]: word.strip().split(" ")[2] for word in fr} 20 | 21 | if os.path.exists("./stop_words"): 22 | with open("./stop_words", encoding="utf8") as fr: 23 | stop_words = set([word.strip() for word in fr]) 24 | 25 | return word_dict, stop_words 26 | 27 | 28 | class Config(object): 29 | __author__ = "elephantnose" 30 | __github__ = "https://github.com/elephantnose" 31 | __jianshu__ = "https://www.jianshu.com/u/39efcd8e2587" 32 | __segmentfault__ = "https://segmentfault.com/u/elephantnose" 33 | 34 | """ 35 | 各阈值设定 36 | words_length: 新词字数默认个字 37 | pmi_limit: 凝固度阈值 38 | left_entropy_limit: 左熵阈值 39 | right_entropy_limit: 右熵阈值 40 | word_frequency: 词频阈值 41 | """ 42 | words_length = 5 43 | pmi_limit = 1.5 44 | left_entropy_limit = 1 45 | right_entropy_limit = 1 46 | word_frequency_limit = 5 47 | word_dict, stop_words = loda_word_dict() 48 | 49 | 50 | class ContentHandler(object): 51 | def __init__(self, content): 52 | """ 53 | length: 文本字数 54 | book_content: 正序文本 55 | r_book_content: 倒序文本 56 | tire_tree: 正序文本 tire树 57 | r_tire_tree: 倒序文本 tire树 58 | """ 59 | self.length = 0 60 | self.book_content, self.r_book_content = self._preprocess(content=content) 61 | 62 | self.tire_tree = TireTree(self.book_content) 63 | self.r_tire_tree = TireTree(self.r_book_content) 64 | 65 | def pmi(self, char_node): 66 | """计算凝固度""" 67 | p_x_y = char_node.count / self.length 68 | px_py_list = [] 69 | 70 | # 枚举所有组成词的情况, 并取最大概率值 71 | for i in range(1, len(char_node.name)): 72 | px = self.tire_tree.search_node(char_node.name[:i]).count / self.length 73 | py = self.tire_tree.search_node(char_node.name[i:]).count / self.length 74 | px_py_list.append(px*py) 75 | 76 | px_py = max(px_py_list) 77 | p = math.log10(p_x_y / px_py) 78 | return p 79 | 80 | def left_entropy(self, char_node): 81 | """计算左熵""" 82 | r_char_node = self.r_tire_tree.search_node(char_node.name[::-1]) 83 | father_set = r_char_node.child 84 | le = 0 85 | for father_name, father_node in father_set.items(): 86 | p_father = father_node.count / r_char_node.child_counts 87 | p = p_father * math.log10(p_father) 88 | le += p 89 | return -le 90 | 91 | def right_entropy(self, char_node): 92 | """计算右熵""" 93 | child_set = self.tire_tree.search_node(char_node.name).child 94 | re = 0 95 | for child_name, child_node in child_set.items(): 96 | p_child = child_node.count / char_node.child_counts 97 | p = p_child * math.log10(p_child) 98 | re += p 99 | return -re 100 | 101 | def word_frequency(self, char_node): 102 | """计算词频""" 103 | return char_node.count 104 | 105 | def get_words(self, node, layer, res_data): 106 | if layer >= self.tire_tree.layer-1: 107 | return 108 | 109 | for c_name, c_node in node.child.items(): 110 | # 递归 111 | self.get_words(c_node, layer+1, res_data) 112 | 113 | # 纯小写英文及纯数字过滤 114 | if (c_name.encode("utf8").isalpha() and c_name.encode("utf8").islower()) or c_name.encode("utf8").isdigit(): 115 | continue 116 | # 词典过滤 117 | is_continue = self.word_dict_filter(c_name) 118 | if not is_continue: 119 | continue 120 | # 阈值过滤 121 | plrf = self.limit_filter(c_node) 122 | if not plrf: 123 | continue 124 | 125 | res_data.append(plrf) 126 | 127 | def limit_filter(self, node): 128 | wf = node.count 129 | if wf < Config.word_frequency_limit: 130 | return False 131 | 132 | pmi = self.pmi(node) 133 | if pmi < Config.pmi_limit: 134 | return False 135 | 136 | le = self.left_entropy(node) 137 | if le < Config.left_entropy_limit: 138 | return False 139 | 140 | re = self.right_entropy(node) 141 | if re < Config.right_entropy_limit: 142 | return False 143 | 144 | return [node.name, wf, pmi, le, re] 145 | 146 | def word_dict_filter(self, chars): 147 | """词典过滤""" 148 | for char in self.permutation(1, chars): 149 | # 过滤掉已经收录于结巴词典的非名词 150 | if not Config.word_dict.get(char, "n").startswith("n"): 151 | return False 152 | # 过滤停用词 153 | if char in Config.stop_words: 154 | return False 155 | return True 156 | 157 | def permutation(self, start_size, char): 158 | """字符排列组合""" 159 | for size in range(start_size, len(char)+1): 160 | for index in range(len(char)+1-size): 161 | yield char[index: index+size] 162 | 163 | def _preprocess(self, content): 164 | """返回正序文本及倒序文本列表, 按符号拆分""" 165 | content_list = re.split("[^\u4E00-\u9FFFa-zA-Z0-9]", content) 166 | r_content_list = re.split("[^\u4E00-\u9FFFa-zA-Z0-9]", content[::-1]) 167 | 168 | if not self.length: 169 | self.length = sum([len(i) for i in content_list if i]) 170 | 171 | return content_list, r_content_list 172 | 173 | 174 | class Node(object): 175 | def __init__(self, name, father): 176 | """ 177 | 节点名称 178 | 节点出现次数 179 | 父节点 180 | 子节点列表 181 | 未去重子集量 182 | """ 183 | self.name = name 184 | self.count = 0 185 | self.father = father 186 | self.child = {} 187 | self.child_counts = 0 188 | 189 | 190 | class TireTree(object): 191 | def __init__(self, 192 | content, 193 | layer_num=Config.words_length+1, 194 | step=1): 195 | """ 196 | 字典树对象 197 | layer_num: 字典树层数 198 | content: 构建字典树的字符串 199 | """ 200 | self.content = content 201 | self.root = Node("ROOT", None) 202 | self.layer = layer_num 203 | self.step = step 204 | self.word_counts = 0 205 | 206 | self.build_tree() 207 | 208 | def build_tree(self): 209 | # 按层构建字典树, layer 从1开始, 表示第一层, 第0层为根节点 210 | for layer in range(1, self.layer+1): 211 | # 创建切割窗口对象 212 | char_session = CharSession(size=layer, step=self.step) 213 | for char in char_session.split_char(self.content): 214 | # 判断 char 是否在该层 没有节点则添加, 有则更新 215 | if not self.search_node(char, layer): 216 | self.add_node(char, layer) 217 | else: 218 | self.update_node(char, layer) 219 | 220 | def add_node(self, char, layer=None): 221 | """ 222 | 在指定层添加指定字符串节点 223 | """ 224 | if not layer: 225 | layer = len(char) 226 | 227 | # 创建节点对象 228 | if layer == 1: 229 | father = self.root 230 | else: 231 | father = self.search_node(char[:-1]) 232 | 233 | node = Node(name=char, father=father) 234 | node.count = 1 235 | 236 | # 将此节点挂入tire树 237 | father.child[char] = node 238 | father.child_counts += 1 239 | return True 240 | 241 | def del_node(self, char, layer=None): 242 | pass 243 | 244 | def update_node(self, char, layer=None): 245 | """ 246 | 更新指定节点信息 247 | """ 248 | if not layer: 249 | layer = len(char) 250 | 251 | node = self.search_node(char, layer) 252 | node.count += 1 253 | node.father.child_counts += 1 254 | return True 255 | 256 | def search_node(self, char, layer=None): 257 | """ 258 | 指定字符串, 指定层 查找节点是否存在 259 | """ 260 | if char == "ROOT": 261 | return self.root 262 | elif not layer: 263 | layer = len(char) 264 | 265 | node = self.root 266 | for layer_index in range(1, layer+1): 267 | node = node.child.get(char[:layer_index], None) 268 | if not node: 269 | return None 270 | 271 | return node 272 | 273 | 274 | class CharSession(object): 275 | def __init__(self, size, step=1): 276 | """ 277 | 窗口对象 278 | size: 窗口大小 279 | step: 移动步长 280 | """ 281 | self.size = size 282 | self.step = step 283 | 284 | def split_char(self, content): 285 | """ 286 | 按指定窗口大小及步长切割文本 287 | """ 288 | for seq in content: 289 | while seq: 290 | if len(seq) >= self.size: 291 | yield seq[:self.size] 292 | seq = seq[self.step:] 293 | else: 294 | break 295 | 296 | 297 | def find_word(file_like): 298 | """ 299 | file_like: 文本内容或文本路径 300 | """ 301 | if os.path.exists(file_like): 302 | with open(file_like, encoding="utf8") as fr: 303 | content = fr.read() 304 | else: 305 | content = file_like 306 | 307 | content_handler = ContentHandler(content) 308 | words = [] 309 | 310 | for child_node in content_handler.tire_tree.search_node("ROOT").child.values(): 311 | content_handler.get_words(child_node, layer=1, res_data=words) 312 | 313 | return words 314 | 315 | 316 | if __name__ == '__main__': 317 | stime = time.time() 318 | 319 | res_data = find_word("./hongloumeng.txt") 320 | 321 | for each_ele in res_data: 322 | print(*each_ele, sep="\t") 323 | 324 | etime = time.time() 325 | print("ALL DONE! 耗时 {} s".format(etime-stime)) 326 | -------------------------------------------------------------------------------- /words.hongloumeng: -------------------------------------------------------------------------------- 1 | 宝玉 3861 1.920249695660435 1.8683176015164489 2.008151912411824 2 | 凤姐 1696 2.275124596786582 1.7668487030119284 1.6380616776396568 3 | 贾母 1623 2.031328880152934 1.6402792484677713 1.8720408641173383 4 | 黛玉 1317 2.082141604276614 1.3953585334549905 1.91362302465702 5 | 袭人 1129 1.8210406779736985 1.7331325814552954 1.825454318223042 6 | 姑娘 1097 2.496967933291215 1.7031773172478153 1.9506689103882342 7 | 王夫人 1065 1.8402287433548559 1.5802223696419404 1.8642857385020823 8 | 宝钗 1048 2.0871233075074116 1.7386438236061736 1.8692063744580105 9 | 丫头 969 2.23731541908231 1.3194072121984664 1.844804920230693 10 | 老太太 946 2.1492575135373446 1.7774458681510545 1.9395640257048075 11 | 贾政 912 2.132523384311854 1.6999817333454295 1.843800906243123 12 | 奶奶 846 2.2839769647542743 1.3761953348430698 2.047871938223925 13 | 众人 834 1.703348563597313 1.5659121110474876 1.5729242846707672 14 | 贾琏 764 2.085585127936571 1.7552048027535454 1.8841105963529399 15 | 老爷 638 2.006942917138157 1.7615066285152836 2.000439762118316 16 | 东西 584 2.843978170464574 1.4587457769845338 1.8539709720758677 17 | 姐姐 579 1.5600462902044658 1.5996945301764403 1.9537517096113668 18 | 二爷 496 1.897262354148805 1.478486490822326 2.019948791132446 19 | 薛姨妈 453 2.8027356654731035 1.5713701725230598 1.7643545136821832 20 | 凤姐儿 451 1.5029221997147726 1.604483628877994 1.7051544369753253 21 | 探春 428 2.8027749677486096 1.6245619336969461 1.657029991585864 22 | 鸳鸯 426 3.209579987836052 1.6268124409154276 1.7681143376631647 23 | 紫鹃 416 3.147255291244207 1.5307701614640774 1.8131400985222335 24 | 湘云 394 2.8394749131649806 1.045556330403844 1.6507707586826375 25 | 婆子 392 1.9469532701585468 1.081621517364775 1.7302313100945823 26 | 妹妹 390 2.3215042503787076 1.287503631044515 1.9107197787171628 27 | 贾珍 384 2.008835906825046 1.6716602143105526 1.8137807198175355 28 | 银子 362 1.9153151722484163 1.3218141118396858 1.6883526875269577 29 | 李纨 359 3.019340005686991 1.5298303229786305 1.597757693637426 30 | 晴雯 338 3.2984114846165857 1.646512294315143 1.7571766995585902 31 | 尤氏 338 2.978673419736106 1.6235918474430888 1.7023080352427158 32 | 媳妇 317 3.192794387643852 1.498207203046196 1.8527035621945067 33 | 外头 314 1.7533887708167075 1.3222011233131437 1.882111856100834 34 | 刘姥姥 293 3.0455788425703787 1.4059942841206015 1.6304486651592753 35 | 薛蟠 284 2.837619363552421 1.685418814980996 1.8505170187878108 36 | 邢夫人 283 1.8402287433548559 1.667419299459424 1.7291522687552563 37 | 小丫头 275 2.0147226659882445 1.2464278857974624 1.3452231538009827 38 | 孩子 250 2.0101141525141037 1.0219335790144668 1.6251820373882222 39 | 林黛玉 249 2.084443814418511 1.367402241373067 1.7558237514585642 40 | 姊妹 247 2.752092861141172 1.5301659534875365 1.474738042407255 41 | 香菱 241 2.8887437740650905 1.6512189824986296 1.8790629552762699 42 | 麝月 233 2.932531915435284 1.4324506205217118 1.512570954939784 43 | 哥哥 230 2.5250966915000497 1.3421007156264415 1.8502219499810209 44 | 丫鬟 229 2.7666080150789183 1.4214167951894632 1.5941761442487457 45 | 贾蓉 224 2.0153347892181097 1.475562172457836 1.8060627943594254 46 | 小厮 212 2.4971145030341195 1.2617759440290894 1.4631055762142229 47 | 二奶奶 209 1.838338516824433 1.4969280832987364 1.8207428710118478 48 | 主意 201 2.35357723398317 1.1231725941765343 1.4167625700981725 49 | 惜春 195 2.700535797254927 1.5027015629896956 1.6377491552823815 50 | 贾芸 191 2.0560817187703218 1.2809803285833437 1.685368108137415 51 | 贾赦 188 2.0937386003176433 1.6996524707924239 1.5666510102884166 52 | 雨村 183 3.2469244950313993 1.2682676993439737 1.7969732300400767 53 | 嫂子 180 2.059775106871739 1.320929711722148 1.7250800110874873 54 | 兄弟 180 3.0540585069713693 1.2787164734532022 1.8760567372531505 55 | 母亲 174 1.7406039675379534 1.0613764177689347 1.8324175714368378 56 | 素日 173 2.1121415853864356 1.4646164340126455 1.8863904967574552 57 | 芳官 158 2.856166987360585 1.540783425612464 1.723180213673603 58 | 金桂 153 2.891757665760179 1.5369577011967788 1.5796429029927048 59 | 妙玉 152 1.849646895025003 1.3888967513586365 1.6530102617813554 60 | 贾环 151 1.9467260600742642 1.5250966645371395 1.556825062712983 61 | 雪雁 147 3.353379210064482 1.3124134562881018 1.6721377095428114 62 | 迎春 142 2.545180585217271 1.5874329334451118 1.7021202724264213 63 | 奴才 136 2.4465478220558246 1.3293526877206518 1.6157211773376368 64 | 赵姨娘 132 2.5576267401990345 1.4225518242127881 1.7094382677092332 65 | 衣服 132 2.8501725784733782 1.3420341604416457 1.6541386143739198 66 | 林姑娘 132 2.0483536457842275 1.5302460913640068 1.7332787874185303 67 | 莺儿 130 2.036778820599282 1.5045717658027198 1.6218718165925505 68 | 府里 130 1.5202859688851877 1.268319759623021 1.5206564009430266 69 | 和尚 119 2.2688787855663004 1.1629743299053878 1.561667906450683 70 | 宝蟾 117 2.085586271776365 1.4858028950409359 1.6228263060831636 71 | 宝琴 117 1.891909837919571 1.49128394831757 1.6324843146737815 72 | 妈妈 116 1.980441737517973 1.3271901757696414 1.6741004472224266 73 | 亲戚 110 2.7578076882803284 1.3675028229825108 1.3533020389655708 74 | 地方 108 2.013091659803695 1.1847732720361819 1.372555727613529 75 | 秦钟 107 3.26586151431019 1.4643613601434027 1.708315093074342 76 | 姨太太 107 1.6361861101509727 1.493844288873282 1.6424906261420298 77 | 光景 105 3.139127931698933 1.003122699259533 1.2570217505692511 78 | 老婆子 104 1.8696514267891124 1.2067355014299854 1.5611997189892064 79 | 薛蝌 102 2.8476022729596813 1.3710046675713963 1.6029214798522224 80 | 父母 100 1.973901353507686 1.6133192428399632 1.6861954824038432 81 | 秋纹 99 3.2413020971732776 1.0628765744289488 1.500343103494676 82 | 衣裳 96 3.200650747571753 1.3910367184470929 1.4022682063551253 83 | 林妹妹 94 2.3500495887262884 1.3437158107723286 1.551174469853685 84 | 岫烟 94 3.4644669064647613 1.1721462863970828 1.6179077531491255 85 | 贾兰 91 1.9083446598593719 1.2074451710530374 1.5846531980656087 86 | 尤二姐 91 2.314782157370063 1.256882502758549 1.6075655580689003 87 | 小丫头子 91 1.6464305949931832 1.1671278327377754 1.455560157892341 88 | 婶子 89 1.8857802049769736 1.4263217150512204 1.6442323657891802 89 | 赖大 88 1.9720535841114635 1.3481817823021764 1.3570497166214297 90 | 茗烟 88 3.2094253476481542 1.2936676187805067 1.462697289790773 91 | 工夫 87 2.3355545161254727 1.1700450977613046 1.472683057527292 92 | 先生 86 1.782602162880173 1.2490782999849188 1.6725413374666984 93 | 史湘云 83 2.936615468150483 1.1167266692561717 1.4818584627411038 94 | 书房 81 1.9447220420871507 1.1980029450630048 1.1122918056832731 95 | 侄儿 79 1.8552246658420986 1.4724532966944543 1.6062397302877012 96 | 老祖宗 78 2.277300916847658 1.4434654963717144 1.5071576787489693 97 | 荣府 74 2.543049146462673 1.4175367779887604 1.326531569472486 98 | 士隐 74 3.2989707821593113 1.0974329396418676 1.5351919473584572 99 | 贾瑞 73 1.5298405861108517 1.2202277529836547 1.4791824959295714 100 | 秦氏 73 2.6793532470755475 1.3496486086534079 1.5503521948873955 101 | 旺儿 72 1.9487322679225307 1.3360715182633534 1.4139028374643523 102 | 叔叔 72 2.9811882373757586 1.3368973539377464 1.5939565763911745 103 | 眼睛 70 3.001888507252217 1.2518272343083605 1.447296889754225 104 | 吃茶 70 1.8560314821799047 1.5123749522725076 1.1836914970589403 105 | 女孩儿 68 1.7835756200989357 1.1345292598453884 1.244650590369866 106 | 司棋 68 3.4821914891145633 1.3376474758949746 1.5279235635487676 107 | 读书 67 2.8118313631458163 1.6056695829482026 1.2338558116407277 108 | 规矩 67 3.9007439146222627 1.2841283941227133 1.4106706871416514 109 | 闻得 66 1.7453538415426273 1.3481817823021767 1.648778844169048 110 | 太监 66 1.9744595331288537 1.1442856792286127 1.469948286599035 111 | 贾蔷 63 1.965029670906714 1.209981208498597 1.40668475109739 112 | 眼泪 59 2.2484165327398427 1.1969913744053495 1.288687099355286 113 | 宝兄弟 59 1.6157652868854915 1.2744462746869218 1.568213117869548 114 | 琥珀 58 4.087651707165842 1.0845840056546714 1.3612742599196843 115 | 焙茗 58 3.646959718072854 1.2019459763759048 1.216485355672154 116 | 彩云 57 2.4163427585876653 1.2183584300502075 1.4258106636882955 117 | 包勇 56 3.451838759036323 1.1293529119916217 1.400320382450168 118 | 晚饭 54 2.3438612018060754 1.0205192698167445 1.1236701124308353 119 | 宁府 54 2.5767293876986415 1.3423372781277623 1.3357335131551762 120 | 倒象 53 1.8851311291385087 1.001773499021995 1.2508068690320109 121 | 并无 51 1.6331655459032257 1.2570217505692516 1.5281985296598823 122 | 大观园 51 2.3264239883710016 1.3059165694524901 1.1339577373391931 123 | 鲍二 50 2.420256728487632 1.2879122273284427 1.2252133255197561 124 | 风流 50 2.581543407972812 1.5792881853887293 1.3892526180087998 125 | 舅舅 50 2.8456846688420727 1.2390210148604943 1.4691843482137972 126 | 巧姐儿 50 1.7728201363108833 1.2602934735623719 1.4862307299936792 127 | 代儒 50 3.7104459755939607 1.056942025200422 1.3119144044890476 128 | 文章 49 3.282877976661784 1.3166251430300524 1.1680505452985441 129 | 北静王 47 2.594454060611855 1.2209650647939894 1.2943079330492295 130 | 元妃 46 3.2726149012144616 1.2672962814527096 1.4908927082394159 131 | 亲友 46 2.421576683356765 1.4457130351072818 1.3290657973597024 132 | 海棠 45 3.523380276727279 1.3230654384702145 1.2537106030277168 133 | 扇子 45 1.8454359590341283 1.274604822671226 1.29548525189965 134 | 师父 44 2.944304024159872 1.1659601640196429 1.5031606305853589 135 | 弟兄 43 2.43225445744765 1.1600794788105424 1.3729390241768458 136 | 邢王二夫人 42 1.8402287433548559 1.184420824864272 1.3028034946859763 137 | 玉钏儿 42 1.799208085937762 1.1424355007077336 1.3432307444612757 138 | 混帐 42 2.8384433058454066 1.1275442038508285 1.1051638171517637 139 | 打谅 42 2.5188914049100717 1.2028877460153864 1.1657608993919972 140 | 丰儿 42 1.8689118834491711 1.1162337185377762 1.3286609289269573 141 | 角门 41 2.3174068232255873 1.2926055547460202 1.1659315219598279 142 | 厅上 41 1.917000226687207 1.0811621533994946 1.3170731724059996 143 | 作诗 40 1.9144629260642272 1.4130175530116131 1.048843810057643 144 | 针线 39 3.5891766167264594 1.2281339676616938 1.1692558083232352 145 | 金钏儿 39 1.9616659366860698 1.1781710659629392 1.4257431431580816 146 | 张华 39 2.831963410281706 1.2125707966370325 1.3851623617739424 147 | 倪二 39 2.364529342843184 1.0244373361363204 1.352145250701002 148 | 颜色 38 3.05392154019883 1.175714509968042 1.2991925804077487 149 | 菩萨 38 4.238295844009043 1.3343515271072905 1.3205710848716485 150 | 板儿 38 1.703712592934992 1.0516015891793142 1.329185343066291 151 | 大嫂子 38 1.6509350798845057 1.0977958056755082 1.3405575876193938 152 | 金陵 37 2.9376238522247884 1.1062373268101808 1.2310779570619612 153 | 神仙 37 2.503428007265168 1.1217611155679588 1.2287224654717086 154 | 李嬷嬷 37 2.4357453431927816 1.0132944582093153 1.234789286917152 155 | 银钱 36 1.8906327077713856 1.1547757200762878 1.2122438289850557 156 | 金荣 36 2.1648531959863924 1.195261601815712 1.296312483049863 157 | 精神 36 2.669807929169317 1.3221618768308905 1.3922799373649233 158 | 甄宝玉 36 1.741501153824392 1.1083941944520206 1.3851623617739424 159 | 王仁 36 2.2915720600862803 1.2122438289850554 1.3701266324953756 160 | 月钱 36 1.7675412494986393 1.0266378908108658 1.1561994984127493 161 | 族中 36 2.316250089389043 1.37256000258626 1.1351571219224417 162 | 坠儿 36 1.886285979518594 1.2107592620876937 1.288928464604505 163 | 二哥哥 36 1.6401372585020968 1.091137509132795 1.258556136010284 164 | 二叔 36 1.6382531146275454 1.0493381639457804 1.2848996986361956 165 | 秋桐 35 3.2795372334047546 1.15231566512091 1.3592466188729728 166 | 新鲜 35 3.074067380381816 1.2812285866959998 1.213572880038171 167 | 尤三姐 35 2.3119892571858927 1.0857961488469916 1.2768882211571222 168 | 荣国府 34 2.985053120873014 1.1593349918922735 1.167630178225665 169 | 照管 34 1.941901373626415 1.34075177440518 1.2226279650578922 170 | 梅花 34 2.5370265002774586 1.3715192480087866 1.2593485078862006 171 | 园门 34 1.5394236759916733 1.0494004530231424 1.100982673971512 172 | 藕官 33 2.7780838316120122 1.0886293931650515 1.3114547368203622 173 | 甄家 33 1.7106287979422312 1.2222606640310345 1.1609416736221438 174 | 琏二爷 33 1.7286433789846491 1.2609309280316907 1.2815181461465335 175 | 春燕 33 2.4371706142299274 1.1031237505833833 1.2194636332345474 176 | 女婿 33 2.817948497952052 1.2935651950091311 1.3100293267182603 177 | 饮食 32 3.3301051623032345 1.1681337652831403 1.0886293931650515 178 | 婆婆 32 1.8101202923201596 1.1819257557628253 1.3014011874747542 179 | 箱子 31 1.7855746776303887 1.25270812939223 1.0425513192597389 180 | 桂花 31 2.048946251091027 1.296312483049863 1.2055908646113058 181 | 庙里 31 1.6376759197526574 1.22114767275494 1.2885271841708332 182 | 山石 31 2.5973978483122715 1.1016302863790104 1.0142697749229166 183 | 蒋玉菡 29 4.307011656378503 1.1214813959193146 1.3497752093658009 184 | 荷包 29 3.1810258858987317 1.1196192149346 1.15231566512091 185 | 琏二奶奶 29 2.0478628273848614 1.0527487960856206 1.322348733920363 186 | 珍大爷 29 2.270355233080358 1.1405288753313518 1.2374070497365839 187 | 比先 29 1.6552182045734707 1.2471415861560746 1.085958301447765 188 | 桃花 29 2.515587215569386 1.2603235910956663 1.1104159301996999 189 | 张罗 29 2.8297620577239355 1.4001159298305461 1.060119465488529 190 | 帖儿 29 1.5960868315062946 1.0046303971855928 1.0627843095674208 191 | 夫妻 29 2.0160411147674786 1.1686120220698188 1.185898936242587 192 | 题目 28 2.84156810264724 1.0634105123887099 1.026637890810866 193 | 男女 28 2.11942187750539 1.3100293267182606 1.2935497713373494 194 | 宁国府 28 2.9976422481810343 1.1242030405556924 1.012450248147257 195 | 众姊妹 28 1.8474945484160505 1.0791812460476247 1.2306771003829997 196 | 芙蓉 27 3.352411849936793 1.2703994189276515 1.1204119982655922 197 | 舅母 27 1.6033109526104048 1.0441456077951246 1.2453135859556532 198 | 益发 27 2.3925817436295778 1.0405296753565512 1.2891562186230305 199 | 玻璃 27 4.373958446009117 1.0703221291572096 1.3300395757676093 200 | 手帕子 27 1.9898839365943446 1.1647752718815907 1.044981826173945 201 | 女尼 27 2.605859585624852 1.0746090020803172 1.0232249084065224 202 | 李氏 26 1.7492797703980105 1.0958165935452864 1.2110849003808126 203 | 房屋 26 1.560163689861027 1.0757790619213747 1.1359539263004839 204 | 冯紫英 26 4.063292361306397 1.0601194654885289 1.1063788149324716 205 | 雀儿 25 1.7126835208666074 1.02599452062265 1.2603235910956663 206 | 铁槛寺 25 4.065749865718011 1.0703221291572096 1.0880336186233166 207 | 贵妃 25 2.726784196062831 1.1951419200116562 1.0556720202883954 208 | 胭脂 25 4.045986822386106 1.0961844950135782 1.1788716777510708 209 | 盒子 24 1.6436102780935022 1.0293335830649735 1.044981826173945 210 | 珍珠 24 2.3375735550102417 1.0405296753565512 1.1596183341632196 211 | 王子腾 24 2.6383595463109364 1.108356562103204 1.1050717496266387 212 | 形容 24 2.861592803490504 1.2876899543378464 1.2499943283000725 213 | 含泪 24 2.665348722183645 1.1596183341632194 1.1260332722757374 214 | 分派 24 2.082672865518848 1.2055908646113058 1.1757522959833626 215 | 金银 23 1.6331325273240023 1.0962320868634188 1.1680505452985441 216 | 胡乱 23 2.4923174682963203 1.0438877668097386 1.158731857656642 217 | 省亲 23 1.874865329069792 1.21537886502357 1.0179571597331107 218 | 彩霞 23 3.1714148515881466 1.0003923426013537 1.1596183341632196 219 | 家伙 23 1.7579626767378616 1.1947885434319039 1.0676310452816091 220 | 阴阳 22 3.1813825457464717 1.06608149544718 1.1549291732153124 221 | 贾赦等 22 1.644724913196067 1.1031237505833835 1.2603235910956663 222 | 素习 22 2.8581447914018097 1.098458818336966 1.3150563175800265 223 | 柳湘莲 22 2.987162323770918 1.0912337342819318 1.1288624837399295 224 | 俱已 22 1.9877865878244654 1.2041199826559248 1.2876899543378464 225 | 风月 21 1.6317322314219185 1.1664912331979271 1.1088567413913666 226 | 梨香院 21 3.3145212581572485 1.1107114985054047 1.048843810057643 227 | 祖父 20 2.245855486144543 1.1950336277707467 1.1692558083232354 228 | 情愿 20 2.0175947230999833 1.0394022484799927 1.0958165935452864 229 | 舌头 19 1.7335031626915223 1.0013342591828385 1.0438877668097386 230 | 笔砚 19 3.4907454305978702 1.1203167611296814 1.0003923426013537 231 | 碟子 19 1.814410890428682 1.0210614281209909 1.1950336277707467 232 | 册子 18 1.6577185320048693 1.0754089640953688 1.021318738256381 233 | 世兄 17 2.0120085212718264 1.041392685158225 1.1664912331979271 234 | 骨肉 16 3.2501068050420305 1.0601194654885289 1.041392685158225 235 | 顺路 16 2.555485114643579 1.021318738256381 1.0770309979379904 236 | 梯己 15 2.6857364352063198 1.0405296753565512 1.041392685158225 237 | 官员 13 2.577633226696886 1.021318738256381 1.041392685158225 238 | --------------------------------------------------------------------------------