├── NovelsAna.py ├── README.md ├── TSocial.py ├── image ├── nullFile └── quanzhigaoshou.gif ├── quanzhigaoshou.json ├── removeWords.json ├── 全职高手.rar └── 全职高手关系分析.gif /NovelsAna.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | # 自动提取小说的角色,情节。 4 | #分析步骤: 5 | #1、扫描提取主要角色名称,自动提取,手动删除,手动归类。图形分析(角色出现次数、时间线)。 6 | #2、扫描角色关系,建立关系线,gephi图,GIF图 7 | #3、角色关系自动判断?角色情节自动提取?情节情绪判断?人物关系程度判断?(利用情节库进行训练,进行情节归类) 8 | #4、建立数据结构,存储分析中间结果,方便进行复现。 9 | #5、建立Django网站提供文本分析服务:txt上载,角色列表编辑,多线程管理,关系图展示与下载,各类结果数据存储管理。 10 | #用户表,原始文本表,角色表,关系表, 11 | #网页:人员列表调整界面:生成人员名称列表json。从初始姓名检测结果列表中,挑选人名,填入人员名称列表json中,其他的写入delete列表中。 12 | import jieba 13 | import collections 14 | import re 15 | import TSocial 16 | import json 17 | import networkx as nx 18 | import random 19 | # import matplotlib.pyplot as plt 20 | import matplotlib 21 | class textAna: 22 | def __init__(self): 23 | self.G = nx.DiGraph() 24 | 25 | TEXT_PATH = 'c:/temp/死人经.txt' # 文本路径 26 | # TEXT_PATH = 'c:/temp/射雕英雄传.txt' # 文本路径 27 | person_counter = collections.defaultdict(int) # 人物出场次数计数器 28 | person_per_paragraph = [] 29 | relationships = {} 30 | synonymous_dict = {} 31 | same_name_list_file='sirenjingname.json' 32 | remove_words_file='removewords.json' 33 | graph = nx.Graph() 34 | pattern1 = r'[赵|钱|孙|李|周|吴|郑|王|冯|陈|褚|卫|蒋|沈|韩|杨|朱|秦|尤|许|何|吕|施|张|孔|曹|严|华|金|魏|陶|姜|戚|谢|邹|喻|柏|水|窦|章|云|苏|潘|葛|奚|范|彭|郎|鲁|韦|昌|马|苗|凤|花|方|俞|任|袁|柳|酆|鲍|史|唐|费|廉|岑|薛|雷|贺|倪|汤|滕|殷|罗|毕|郝|邬|安|常|乐|于|时|傅|皮|卞|齐|康|伍|余|元|卜|顾|孟|平|黄|和|穆|萧|尹|姚|邵|湛|汪|祁|毛|禹|狄|米|贝|明|臧|计|伏|成|戴|谈|宋|茅|庞|熊|纪|舒|屈|项|祝|董|梁|杜|阮|蓝|闵|席|季|麻|强|贾|路|娄|危|江|童|颜|郭|梅|盛|林|刁|锺|徐|邱|骆|高|夏|蔡|田|樊|胡|凌|霍|虞|万|支|柯|昝|管|卢|莫|经|房|裘|缪|干|解|应|宗|丁|宣|贲|邓|郁|单|杭|洪|包|诸|左|石|崔|吉|钮|龚|程|嵇|邢|滑|裴|陆|荣|翁|荀|羊|於|惠|甄|麴|家|封|芮|羿|储|靳|汲|邴|糜|松|井|段|富|巫|乌|焦|巴|弓|牧|隗|山|谷|车|侯|宓|蓬|全|郗|班|仰|秋|仲|伊|宫|宁|仇|栾|暴|甘|钭|历|戎|祖|武|符|刘|景|詹|束|龙|叶|幸|司|韶|郜|黎|溥|印|宿|白|怀|蒲|邰|从|鄂|索|咸|籍|卓|蔺|屠|蒙|池|乔|阳|郁|胥|能|苍|双|闻|莘|党|翟|谭|贡|劳|逄|姬|申|扶|堵|冉|宰|郦|雍|却|桑|桂|濮|牛|寿|通|边|扈|燕|冀|浦|尚|农|温|别|庄|晏|柴|瞿|充|慕|连|茹|习|宦|艾|鱼|容|向|古|易|慎|戈|廖|庾|终|暨|居|衡|步|都|耿|满|弘|匡|国|文|寇|广|禄|阙|东|欧|沃|利|蔚|越|夔|隆|师|巩|厍|聂|晁|勾|敖|融|冷|訾|辛|阚|那|简|饶|空|曾|毋|沙|乜|养|鞠|须|丰|巢|关|蒯|相|荆|红|游|竺|权|司马|上官|欧阳|夏侯|诸葛|闻人|东方|赫连|皇甫|尉迟|公羊|澹台|公冶宗政|濮阳|淳于|单于|太叔|申屠|公孙|仲孙|轩辕|令狐|钟离|宇文|长孙|慕容|司徒|司空|召|有|舜|岳|黄辰|寸|贰|皇|侨|彤|竭|端|赫|实|甫|集|象|翠|狂|辟|典|良|函|芒|苦|其|京|中|夕|乌孙|完颜|富察|费莫|蹇|称|诺|来|多|繁|戊|朴|回|毓|鉏|税|荤|靖|绪|愈|硕|牢|买|但|巧|枚|撒|泰|秘|亥|绍|以|壬|森|斋|释|奕|姒|朋|求|羽|用|占|真|穰|翦|闾|漆|贵|代|贯|旁|崇|栋|告|休|褒|谏|锐|皋|闳|在|歧|禾|示|是|委|钊|频|嬴|呼|大|威|昂|律|冒|保|系|抄|定|化|莱|校|么|抗|祢|綦|悟|宏|功|庚|务|敏|捷|拱|兆|丑|丙|畅|苟|随|类|卯|俟|友|答|乙|允|甲|留|尾|佼|玄|乘|裔|延|植|环|矫|赛|昔|侍|度|旷|遇|偶|前|由|咎|塞|敛|受|泷|袭|衅|叔|圣|御|夫|仆|镇|藩|邸|府|掌|首|员|焉|戏|可|智|尔|凭|悉|进|笃|厚|仁|业|肇|资|合|仍|九|衷|哀|刑|俎|仵|圭|夷|徭|蛮|汗|孛|乾|帖|罕|洛|淦|洋|邶|郸|郯|邗|邛|剑|虢|隋|蒿|茆|菅|苌|树|桐|锁|钟|机|盘|铎|斛|玉|线|针|箕|庹|绳|磨|蒉|瓮|弭|刀|疏|牵|浑|恽|势|世|仝|同|蚁|止|戢|睢|冼|种|涂|肖|己|泣|潜|卷|脱|谬|蹉|赧|浮|顿|说|次|错|念|夙|斯|完|丹|表|聊|源|姓|吾|寻|展|出|不|户|闭|才|无|书|学|愚|本|性|雪|霜|烟|寒|少|字|桥|板|斐|独|千|诗|嘉|扬|善|揭|祈|析|赤|紫|青|柔|刚|奇|拜|佛|陀|弥|阿|素|长|僧|隐|仙|隽|宇|祭|酒|淡|塔|琦|闪|始|星|南|天|接|波|碧|速|禚|腾|潮|镜|似|澄|潭|謇|纵|渠|奈|风|春|濯|沐|茂|英|兰|檀|藤|枝|检|生|折|登|驹|骑|貊|虎|肥|鹿|雀|野|禽|飞|节|宜|鲜|粟|栗|豆|帛|官|布|衣|藏|宝|钞|银|门|盈|庆|喜|及|普|建|营|巨|望|希|道|载|声|漫|犁|力|贸|勤|革|改|兴|亓|睦|修|信|闽|北|守|坚|勇|汉|练|尉|士|旅|五|令|将|旗|军|行|奉|敬|恭|仪|母|堂|丘|义|礼|慈|孝|理|伦|卿|问|永|辉|位|让|尧|依|犹|介|承|市|所|苑|杞|剧|第|零|谌|招|续|达|忻|六|鄞|战|迟|候|宛|励|粘|萨|邝|覃|辜|初|楼|城|区|局|台|原|考|妫|纳|泉|老|清|德|卑|过|麦|曲|竹|百|福|言|第五|佟|爱|年|笪|谯|哈|墨|连|南宫|赏|伯|佴|佘|牟|商|西门|东门|左丘|梁丘|琴|后|况|亢|缑|帅|微生|羊舌|海|归|呼延|南门|东郭|百里|钦|鄢|汝|法|闫|楚|晋|谷梁|宰父|夹谷|拓跋|壤驷|乐正|漆雕|公西|巫马|端木|颛孙|子车|督|仉|司寇|亓官|三小|鲜于|锺离|盖|逯|库|郏|逢|阴|薄|厉|稽|闾丘|公良|段干|开|光|操|瑞|眭|泥|运|摩|郄|伟|铁|迮|木|荷|虚|君][\u4e00-\u9fa5]{1,3}$' 35 | 36 | ''' 37 | person_counter是一个计数器,用来统计人物出现的次数。{'a':1,'b':2} 38 | person_per_paragraph每段文字中出现的人物[['a','b'],[]] 39 | 
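For example (illustrative values only): after scanning a single paragraph that mentions both a and b, person_counter == {'a': 1, 'b': 1} and person_per_paragraph == [['a', 'b']].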
relationships保存的是人物间的关系。key为人物A,value为字典,包含人物B和权值。 40 | ''' 41 | 42 | def get_clean_paragraphs(self): 43 | # 读取文件 44 | # fn = open('QTSK2.txt') # 打开文件 45 | fn = open(self.TEXT_PATH, encoding='UTF-8') # 打开文件 46 | paragraphs = fn.readlines() # 读出整个文件 47 | fn.close() # 关闭文件 48 | para=[] 49 | # 文本预处理 50 | pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"') # 定义正则表达式匹配模式 51 | for p in paragraphs: 52 | p1=[] 53 | p= re.sub(pattern, '', p) 54 | p1=p.split('。') 55 | # if len(p)>0: 56 | # para.append(p) # 将符合模式的字符去除 57 | for pp1 in p1: 58 | para.append(pp1) 59 | return para 60 | 61 | 62 | #寻找角色关系 dead_line 表示截至统计点 63 | def find_relation(self,start_line,dead_line): 64 | ''' 65 | 1、人物名称添加jieba自定义字典 66 | 2、分割段落 67 | 3、段落分析,关系入库。 68 | ''' 69 | Social_list_str=[] 70 | paragraphs = self.get_clean_paragraphs() 71 | # same_name_list=[['顾慎为','龙王','小顾','杨欢','欢奴'],['荷女','御众师'],['上官如','九公子'],['上官怒','八少主'],['大头神'],['罗宁茶','八少奶'],['上官伐','堡主','独步王'],['屠狗'],['许小益','小益','许益'],['许烟微'],['遥奴'],['木老头'],['杨元帅'],['顾仑'],['杨峥'],['顾翠兰','翠兰']] 72 | same_name_list=[] 73 | newfile= open(self.same_name_list_file, 'r',encoding='gb18030') 74 | s=newfile.read() 75 | same_name_list=json.loads(s) 76 | 77 | # jieba.load_userdict('namedict.txt') 78 | # for p in paragraphs: 79 | # pp=jieba.cut(p) 80 | # for x in pp: 81 | # if x in same_name_list[0]: 82 | # print(p) 83 | # continue 84 | #角色名称信息包含角色名称、角色使用名称,角色出现计数,角色出现语句列表,角色关系。 name_info={'name':'顾慎为','same_name':['顾慎为','龙王','小顾','杨欢','欢奴'],'name_count':133,'paragraphs':[],'tss':[]} 85 | #初始化角色名称信息列表 name_info_list 86 | name_info_list=[] 87 | nameID=1 88 | for n in same_name_list: 89 | name_info={} 90 | tss=TSocial.TSocial() 91 | tss.Bubble=n[0] 92 | name_info['nameID']=nameID 93 | nameID+=1 94 | name_info['name']=tss.Bubble=n[0] 95 | name_info['tss']=tss 96 | name_info['name_count']=0 97 | name_info['same_name']=n 98 | name_info_list.append(name_info) 99 | 100 | p_info={'wordcounts':0,'pcount':0,'p_start':'','p_end':'','start_line':start_line,'dead_line':dead_line} 101 | #开始逐句搜索关系 102 | p_count=0 103 | for p in paragraphs: 104 | p_count+=1 105 | if p_count/len(paragraphs) < start_line/100: # skip paragraphs before the start_line percentage 106 | continue 107 | 108 | if p_count/len(paragraphs) > dead_line/100: # stop once the dead_line percentage is reached 109 | p_info['p_end']=str(p) 110 | break 111 | # 统计基本信息 112 | p_info['wordcounts']+=len(p) 113 | p_info['pcount']+=1 114 | if p_info['p_start']=='': 115 | p_info['p_start']=str(p) 116 | #统计段落包含的角色名称 117 | name_in_p=[] 118 | for name_info in name_info_list: 119 | for sn in name_info['same_name']: 120 | if sn in p: 121 | name_info['name_count']+=1 122 | name_in_p.append(name_info) 123 | if len(name_in_p)<=1: 124 | continue 125 | # 开始注册关系线 126 | p1=p.replace(' ','') 127 | Scene={'eventname':p1[0:8],'roundcount':p_count} #paragraphs.index(p)} 128 | for name_info in name_info_list: 129 | if name_info not in name_in_p: 130 | continue 131 | for name_info_other in name_in_p: 132 | if name_info_other['name'] in name_info['tss'].OtherIDs: 133 | name_info['tss'].RelationsModify(Scene,name_info_other['name'])#维护旧关系 134 | else: 135 | name_info['tss'].RelationsRegist1(Scene,name_info_other['name'])#注册新关系 136 | 137 | 138 | # print(name_info_list) 139 | print(paragraphs[p_count-2]) 140 | #关系结果输出到字符串Social_list_str 141 | for n in name_info_list: 142 | x=n['tss'].ToDic() 143 | x['主人ID']=n['nameID'] 144 | x['主人姓名']=n['name'] 145 | Social_list_str.append(x) 146 | #作图 147 | self.G.clear() 148 | self.MakeMatplot(Social_list_str) 149 | self.Drawplt() 150 | return p_info 151 | 152 | def MakeMatplot(self,j): 153 | ''' 154 | 组织绘制关系图的数据。 155 | j:Social_list_str 关系字符串 156 | ''' 157 | # print (matplotlib.matplotlib_fname()) # 
将会获得matplotlib包所在文件夹 158 | matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS'] 159 | matplotlib.rcParams['font.serif'] = ['Arial Unicode MS'] 160 | # 显示matplatlib全部字体 161 | # plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] 162 | # a=sorted([f.name for f in matplotlib.font_manager.fontManager.ttflist]) 163 | # for i in a: 164 | # print (i) 165 | 166 | import seaborn as sns 167 | sns.set_style("ticks",{"font.sans-serif":['Arial Unicode MS', 'Arial']}) 168 | 169 | 170 | ''' 171 | seaborn提供5中主题风格: 172 | 173 | darkgrid 174 | whitegrid 175 | dark 176 | white 177 | ticks 178 | ''' 179 | 180 | # print(j) 181 | # import json 182 | # j = json.dumps(BubbleList.Bubbles[0].MySocial.ToDic(), ensure_ascii=False) 183 | 184 | for SocialText in j: 185 | if SocialText['关系数量']!=0: 186 | self.G.add_node(SocialText['主人姓名'],node_color='b',weight=SocialText['关系数量'],Position=(random.randrange(0, 100), random.randrange(0, 100))) 187 | for SocialText in j: 188 | for gx in SocialText['关系']: 189 | self.G.add_edge(SocialText['主人姓名'],gx['对端姓名'],weight=gx['交往密切程度']) 190 | 191 | def Drawplt(self): 192 | # print("输出全部节点:{}".format(self.G.nodes())) 193 | # print("输出全部边:{}".format(self.G.edges())) 194 | print("输出全部点的数量:{}".format(self.G.number_of_nodes())) 195 | print("输出全部边的数量:{}".format(self.G.number_of_edges())) 196 | 197 | # pos= nx.spring_layout(self.G, dim=2, k=None, pos=None, fixed=None, iterations=50, weight='weight', scale=1.0) 198 | pos= nx.shell_layout(self.G) 199 | 200 | nx.draw_networkx(self.G,pos, edge_color='b',node_shape = 'o', node_size=800,cmap=matplotlib.pyplot.cm.gray,dpi = 4000) 201 | # # global graph 202 | # nx.draw(self.G, pos=nx.get_node_attributes(self.G,'Position'),edge_color='b',node_shape = 'o', node_size=800,cmap=matplotlib.pyplot.cm.gray,dpi = 4000) 203 | # matplotlib.pyplot.show() 204 | matplotlib.pyplot.pause(1) 205 | matplotlib.pyplot.clf() 206 | 207 | def DrawAni(self,mode): 208 | ''' 209 | 连续统计人物关系,按百分比分一百份,并批量绘制关系图, 210 | mode:0:每次从头统计,1:每次只统计一份数据 211 | ''' 212 | a=0 213 | for i in range(1,100): 214 | if mode==0: 215 | a=0 216 | else: 217 | a=i-1 218 | p_info=self.find_relation(0,i) 219 | print(p_info) 220 | 221 | 222 | 223 | 224 | # def find_name(self): 225 | # # 读取文件 226 | # # fn = open('QTSK2.txt') # 打开文件 227 | # fn = open(self.TEXT_PATH, encoding='UTF-8') # 打开文件 228 | # string_data = fn.read() # 读出整个文件 229 | # fn.close() # 关闭文件 230 | 231 | # # 文本预处理 232 | # pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"|的') # 定义正则表达式匹配模式 233 | # string_data = re.sub(pattern, '', string_data) # 将符合模式的字符去除 234 | 235 | # # 文本分词 236 | # seg_list_exact = jieba.cut(string_data, cut_all = False) # 精确模式分词 237 | # object_list = [] 238 | # remove_words = [u'的', u',',u'和', u'是', u'随着', u'对于', u'对',u'等',u'能',u'都',u'。',u' ',u'、',u'中',u'在',u'了', 239 | # u'通常',u'如果',u'我们',u'需要',u'卷',u'_',u'【',u'】',u'·',u'\u3000',u':',u'“',u'”',u'我',u'道',u'他',u'说', 240 | # u'你',u'也',u'人',u'但是',u'就',u'一个',u'自己',u'这',u'但',u'不',u'没有',u'有',u'被',u'要',u'就是', 241 | # u'这个',u'他们',u'到',u'并',u'!',u'(',u')',u'而',u'已经',u'还',u'却',u'着',u'不是',u'很',u'会',u'?',u'上', 242 | # u'[',u']',u'们',u'去',u'这些',u'这样',u'先',u'因为',u'把',u'后',u'什么',u'让',u'可以',u'地',u'来',u'—',u'不过', 243 | # u'这种',u'将',u'那',u'从',u'虽然',u'还是',u'这位',u'给',u'个',u'十分',u'又',u'与',u'下',u'做',u'过',u'好', 244 | # u'很多',u'得',u'实在',u'用',u'终于',u'此时',u'呢',u'可能',u'不能',u'时',u'开始',u'似乎',u'之',u'可', 245 | # u'时候',u'才',u'应该',u'可是',u'于是',u'吧',u'向',u'现在',u'当',u'能够',u'想',u'之后',u'多',u'她', 246 | # u'还有',u'所以',u'这里',u'便',u'所谓',u'说道',u'你们',u'只',u'…',u'见',u'跟',u'一声',u'请',u'谁', 247 | # 
u'有点',u'以为',u'公子',u'有人',u'士兵',u'出来',u'计划',u'许多',u'仍然',u'武功',u'说话',u'不想',u'军队',u'请',u'谁', 248 | # u'问道',u'咱们',u'已',u'再',u'罢',u'怎么',u'不知',u'只见',u'里',u'啊',u'本书',u'小说网',u'用户',u'明白',u'希望',u'那个'] # 自定义去除词库 249 | 250 | # name_list=[] 251 | # # for word in seg_list_exact: # 循环读出每个分词 252 | # # if word not in remove_words: # 如果不在去除词库中 253 | # # object_list.append(word) # 分词追加到列表 254 | # # match = re.match(self.pattern1,word) 255 | # # if match != None: 256 | # # name_list.append(word) 257 | # name_found=0 258 | # name_1='' 259 | # name_2='' 260 | # name_3='' 261 | # for word in seg_list_exact: # 循环读出每个分词 262 | # if word not in remove_words: # 如果不在去除词库中 263 | # object_list.append(word) # 分词追加到列表 264 | # match = re.match(self.pattern1,word) 265 | # if match != None and len(word)>=2: 266 | # name_list.append(word) 267 | 268 | 269 | # if match != None and name_found==0: 270 | # name_found=1 271 | # name_1=word 272 | # # if word =='木': 273 | # # print(word) 274 | # continue 275 | 276 | 277 | # if name_found==1 and len(name_2)==1 and len(word)==1: 278 | # name_3=word 279 | # name_found=0 280 | # name_list.append(name_1+name_2+name_3) 281 | # name_2='' 282 | # name_3='' 283 | # continue 284 | # if name_found==1 and len(word)==1: 285 | # name_2=word 286 | # if len(name_1+name_2)>=3: 287 | # name_found=0 288 | # name_list.append(name_1+name_2) 289 | # name_2='' 290 | # name_3='' 291 | # continue 292 | 293 | # if name_found==1 and len(word)==2: 294 | # name_2=word 295 | # name_found=0 296 | # name_list.append(name_1+name_2) 297 | # name_2='' 298 | # # 词频统计 299 | # # word_counts = collections.Counter(object_list) # 对分词做词频统计 300 | # # word_counts_top10 = word_counts.most_common(40) # 获取前10最高频的词 301 | # word_counts = collections.Counter(name_list) # 对分词做词频统计 302 | # word_counts_top10 = word_counts.most_common(40) # 获取前10最高频的词 303 | # print (word_counts_top10) # 输出检查 304 | 305 | 306 | 307 | #寻找作品中的名字并统计,非jieba分词。 308 | def find_name(self,CheckPatter,foreign_novel): 309 | ''' 310 | 寻找作品中的名字并统计 311 | CheckPatter=1,进行百家姓模板验证 312 | foreign_novel=1 是外国小说 313 | 314 | ''' 315 | 316 | # 读取文件 317 | # fn = open('QTSK2.txt') # 打开文件 318 | fn = open(self.TEXT_PATH, encoding='UTF-8') # 打开文件 319 | string_data = fn.read() # 读出整个文件 320 | fn.close() # 关闭文件 321 | 322 | # 文本预处理 323 | pattern = re.compile(u'\t|\n|\.|-|:|;|\)|\(|\?|"| |!|。|,|“|;|”|?|:|…|—|=|_') # 定义正则表达式匹配模式 324 | string_data = re.sub(pattern, '', string_data) # 将符合模式的字符去除 325 | 326 | # 文本分词 327 | # seg_list_exact = jieba.cut(string_data, cut_all = False) # 精确模式分词 328 | 329 | object_list = [] 330 | remove_words = [] # 自定义去除词库 331 | # with open('removewords1s.json', 'w') as f_obj: 332 | # json.dump(remove_words, f_obj,ensure_ascii=False) 333 | with open(self.remove_words_file, 'r', encoding='utf-8') as f_obj: 334 | remove_words = json.load(f_obj) 335 | name_list=[] 336 | 337 | name_found=0 338 | name_1='' 339 | 340 | tnames=[] 341 | # 比较名称 CheckPatter=1,进行百家姓模板验证 342 | def comp_name(thename,CheckPatter): 343 | ''' 344 | www 345 | 346 | ''' 347 | if thename not in remove_words and '的' not in thename : 348 | if CheckPatter==1: 349 | if re.match(self.pattern1,thename)!=None: 350 | name_list.append(thename) 351 | else: 352 | name_list.append(thename) 353 | # CheckPatter=0 354 | if foreign_novel==0: 355 | name_length=4 356 | elif foreign_novel==1: 357 | name_length=12 358 | # 取若干字数作为名称进行比对判断。外国作品人名较长。 359 | for i in range(name_length+10,len(string_data)): 360 | name_1='' 361 | b=0 362 | for a in range(name_length,0,-1) : 363 | name_1=name_1+string_data[i-a] 364 | b+=1 365 | 
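# name_1 accumulates the window string_data[i-name_length:i] one character per pass;
# the check below hands every prefix of at least two characters to comp_name(), which keeps a
# candidate only if it is not in remove_words, contains no '的', and (when CheckPatter==1)
# also matches the surname pattern pattern1. Illustrative example (hypothetical window, not
# taken from the source text): a 4-character window '张小明说' yields the candidates '张小', '张小明' and '张小明说'.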
if b>1: 366 | comp_name(name_1,CheckPatter) 367 | 368 | 369 | # 词频统计 370 | word_counts = collections.Counter(name_list) # 对分词做词频统计 371 | word_counts_top10 = word_counts.most_common(500) # 获取前10最高频的词 372 | word_counts_top=[] 373 | def quchong():#去除名字列表中的重复。 374 | Flag=0 375 | for name01 in word_counts_top10: 376 | Flag=0 377 | if name01 in word_counts_top: 378 | continue 379 | for name02 in word_counts_top10: 380 | if (name01[0] in name02[0]) and (name01[0] != name02[0]) and abs((name01[1]/name02[1])-1)<0.09: 381 | Flag=1 382 | if name02 not in word_counts_top: 383 | word_counts_top.append(name02) 384 | if Flag==0: 385 | if name01 not in word_counts_top: 386 | word_counts_top.append(name01) 387 | 388 | quchong() 389 | word_counts_top10=word_counts_top 390 | word_counts_top=[] 391 | quchong() 392 | 393 | print (word_counts_top) # 输出检查 394 | print('\n') 395 | name_single=[] 396 | 397 | for w in word_counts_top : 398 | x=[] 399 | x.append(w[0]) 400 | name_single.append(x) 401 | print(name_single) 402 | 403 | 404 | 405 | 406 | 407 | 408 | textA=textAna() 409 | textA.remove_words_file='removewords.json' 410 | textA.same_name_list_file='sirenjingname.json' 411 | textA.TEXT_PATH = 'c:/temp/死人经.txt' 412 | textA.same_name_list_file='sanguo.json' 413 | textA.TEXT_PATH = 'c:/temp/三国演义.txt' 414 | textA.same_name_list_file='quanzhigaoshou.json' 415 | textA.TEXT_PATH = 'c:/temp/全职高手.txt' 416 | 417 | # textA.same_name_list_file='sanguo.json' 418 | # textA.TEXT_PATH = 'c:/temp/哈利波特.txt' 419 | 420 | # print(match) 421 | # textA.find_name(0,1) 422 | # textA.count_person() 423 | 424 | 425 | i=-1 426 | deal_line=0 427 | 428 | # for i in range(1,100): 429 | # textA.find_relation(i) 430 | textA.DrawAni(1) 431 | while 1: 432 | i=(input('输入时间点:')) 433 | if int(i) ==0 or len(i)==0: 434 | break 435 | deal_line+=int(i) 436 | print('deal_line=',deal_line) 437 | textA.find_relation(0,deal_line) 438 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python- 2 | 中文小说情节分析/角色分析/角色关系分析, 中文小说情节分析/角色分析/角色关系分析 Plot analysis / role analysis / role relationship analysis of Chinese Novels 3 | 主要函数: 4 | textAna: 5 | find_name:统计小说人物名称; 6 | find_relation:利用同时出现原则建立人物间的关系; 7 | DrawAni:按百分比分一百份,连续统计人物关系,并批量绘制关系图; 8 | TSocial:社会关系类 9 | 10 | 11 | 主要配置文件(json格式): 12 | remove_words_file='removewords.json':去除关键字列表; 13 | same_name_list_file='sirenjingname.json'相同名字列表; 14 | TEXT_PATH = 'c:/temp/死人经.txt' 小说文件。(注意,有些小说文本文件需要转换格式,在记事本选择另存--编码格式选择UTF-8) 15 | 16 | 17 | 18 | ![image](https://github.com/yuanren88/python-/blob/master/image/quanzhigaoshou.gif) 19 | -------------------------------------------------------------------------------- /TSocial.py: -------------------------------------------------------------------------------- 1 | #-- coding:UTF-8 -- 2 | ''' 3 | 社会关系及社交能力 TSocial 4 | 个人关系 TRelationship 5 | 共同经历: TSocialLine 6 | 当时—建立关系线 7 | 过后—回忆,维护关系线''' 8 | 9 | #类:社交线 描述两者之间每一次的接触 10 | class TSocialLine: 11 | def __init__(self): 12 | self.OnActive=0 #是否有效,true:有效,false:失效。。 13 | self.LineState='' #关系线状态 14 | self.OtherID=0 #对端ID 15 | self.OtherIDName='' #对端姓名 16 | self.POther=0 #对端地址 17 | self.SpcialSocialType='' #特殊的交往类型:如夫妻,奴役,师徒,上下级等等。 18 | self.CreateTime=0 #创建时间-回合 19 | #解析场景 20 | def ExplanScene(self,Scene): 21 | # self.SpcialSocialType=Scene[0] 22 | # self.CreateTime=Scene[1] 23 | self.SpcialSocialType=Scene['eventname'] 24 | self.CreateTime=Scene['roundcount'] 25 | 26 | 27 | #输出字典 28 | def 
ToDic(self): 29 | return {'对端ID':self.OtherID,'状态': self.LineState,'类型':self.SpcialSocialType,'创建时间':self.CreateTime} 30 | 31 | 32 | 33 | #类:关系 描述两者之间的关系 34 | class TRelationship: 35 | def __init__(self): 36 | #社交线数组 37 | self.SocialLines=[] 38 | #社交线数量 39 | self.SocialLineCount=0 40 | 41 | self.OtherID=0 #对端ID 42 | self.OtherIDName='' #对端姓名 43 | self.Other=None #对端对象 44 | self.Famely=0 #是否家庭成员 45 | self.CreateTime=0 #创建时间-回合 46 | self.LeftInfluence,self.RightInfluence=0,0 #影响力与被影响力:程度数值 47 | self.SocialCloseClassName=0 #交往密切程度等级, 48 | self.SocialCloseCount=0 #交往程度打分0至1000,影响密切程度。 49 | self.impression=0 #对对方对印象 -1000 至 1000 50 | self.SpcialSocialTypes=[] #特殊的交往类型 集合:如夫妻,奴役,师徒,上下级等等。 51 | #关系线注册 52 | def SocialLineRegist(self,Scene,OtherID): 53 | sl=TSocialLine() 54 | sl.OnActive=1 55 | sl.OtherID=OtherID 56 | sl.ExplanScene(Scene) 57 | self.SocialLines.append(sl) 58 | #关系线注销--未完成---删除指定I 59 | def SocialLineCancel(): 60 | del self.SocialLines[0] 61 | 62 | #计算亲密等级--未完成 63 | def getSocialCloseClassName(self): 64 | self.SocialCloseCount=len(self.SocialLines) 65 | if self.SocialCloseCount<2: 66 | self.SocialCloseClassName='认识' 67 | elif self.SocialCloseCount<10: 68 | self.SocialCloseClassName='朋友' 69 | elif self.SocialCloseCount<20: 70 | self.SocialCloseClassName='亲密' 71 | 72 | return self.SocialCloseCount 73 | #输出字典 74 | def ToDic(self): 75 | self.getSocialCloseClassName() 76 | a=TSocialLine 77 | b=[] 78 | for a in range(len(self.SocialLines)): 79 | b.append(self.SocialLines[a].ToDic()) 80 | if self.SocialLines[a].SpcialSocialType not in self.SpcialSocialTypes: 81 | self.SpcialSocialTypes.append(self.SocialLines[a].SpcialSocialType) 82 | 83 | 84 | # return {'对端ID':self.OtherID,'对端姓名':self.Other.surname+self.Other.firstname,'交往密切程度':self.SocialCloseCount,'交往密切程度等 级':self.SocialCloseClassName,'类型':self.SpcialSocialTypes,'创建时间':self.CreateTime,'联系线':b} 85 | return {'对端ID':self.OtherID,'对端姓名':self.OtherID,'交往密切程度':self.SocialCloseCount,'交往密切程度等级':self.SocialCloseClassName,'类型':self.SpcialSocialTypes,'创建时间':self.CreateTime,'联系线':b} 86 | 87 | 88 | 89 | 90 | #类:社会关系。描述一个个体的社会关系及社交能力 91 | class TSocial: 92 | def __init__(self): 93 | #关系人数组 94 | self.Relations=[] #关系数组 95 | self.OtherIDs=[] #有关系的人们的ID 96 | self.Others=[] #有关系的人们 97 | #主人 98 | self.BubbleID=0 99 | self.Bubble=None 100 | #社交能力 101 | self.SocialAbility=0 102 | #社交趋向,是偏向上层还是偏向下层 103 | self.SocialDes=0 104 | 105 | #最佳社交数量 106 | self.BestSocialLineCount=0 107 | #社交满意状态 108 | self.SocialSatisfy=0#饥渴的,一般的,满足的 109 | #社交地位 110 | self.WorldSocialClass=0 111 | #正在被绘制 112 | 113 | self.OnDrawing=0 114 | self.DrawedCount=0 115 | # 关系支援力量 116 | self.SocailPower=0 117 | #关系注册-------- 118 | def RelationsRegist1(self,Scene,OtherID): 119 | #判断第一次发生关系---源自一次事件 120 | if OtherID not in self.OtherIDs: 121 | self.OtherIDs.append(OtherID) 122 | r=TRelationship() 123 | r.OtherID=OtherID 124 | r.SocialLineRegist(Scene,OtherID) 125 | self.Relations.append(r) 126 | 127 | 128 | #关系注册-------- 129 | def RelationsRegist(self,Scene,Other): 130 | #判断第一次发生关系---源自一次事件 131 | if Other.BubbleID not in self.OtherIDs: 132 | self.OtherIDs.append(Other.BubbleID) 133 | self.Others.append(Other) 134 | r=TRelationship() 135 | r.OtherID=Other.BubbleID 136 | r.Other=Other 137 | r.SocialLineRegist(Scene,Other.BubbleID) 138 | self.Relations.append(r) 139 | 140 | #关系维护 141 | def RelationsModify(self,Scene,OtherID): 142 | #找到对应关系 143 | for r in (self.Relations): 144 | if r.OtherID ==OtherID: 145 | #添加事件 146 | r.SocialLineRegist(Scene,OtherID) 147 | break 148 | 149 | #关系注销 150 | def 
RelationsCancel(self,Scene,OtherID): 151 | for r in self.Relations: 152 | if r.OtherID==OtherID: 153 | del self.Relations[r] 154 | break 155 | #关系字典描述 156 | def ToDic(self): 157 | 158 | x=[] 159 | for a in range(len(self.Relations)): 160 | x.append(self.Relations[a].ToDic()) 161 | 162 | sorted(x,key=lambda y:y['交往密切程度']) 163 | #for a in self.Relations: 164 | #x.append( a().ToDic() ) 165 | # return {'主人ID':self.Bubble.BubbleID,'主人姓名':self.Bubble.surname+self.Bubble.firstname,'主人性别':self.Bubble.gender,'社交能力': self.SocialAbility,'关系数量':len(x),'关系':x} 166 | return {'社交能力': self.SocialAbility,'关系数量':len(x),'关系':x} 167 | ''' 168 | s=TSocial() 169 | s.RelationsRegist('xx',2) 170 | s.RelationsRegist('xx1',3) 171 | s.RelationsRegist('xx2',4) 172 | s.RelationsRegist('xx3',5) 173 | s.RelationsRegist('xx4',6) 174 | print(s.ToDic()) 175 | 176 | import json 177 | json = json.dumps(s.ToDic()) 178 | print(json) 179 | ''' 180 | 181 | -------------------------------------------------------------------------------- /image/nullFile: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /image/quanzhigaoshou.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanren88/python-/accc377741693d3e8d0d6d42202ce2d9055214a1/image/quanzhigaoshou.gif -------------------------------------------------------------------------------- /quanzhigaoshou.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanren88/python-/accc377741693d3e8d0d6d42202ce2d9055214a1/quanzhigaoshou.json -------------------------------------------------------------------------------- /removeWords.json: -------------------------------------------------------------------------------- 1 | 2 | ["中原", 3 | "有一", 4 | "西域", 5 | "玉城", 6 | "是他", 7 | "人都", 8 | "声音", 9 | "不知道", 10 | "在这", 11 | "都不", 12 | "不到", 13 | "任何", 14 | "是我", 15 | "大雪", 16 | "是在", 17 | "大雪山", 18 | "所有", 19 | "不可", 20 | "是这", 21 | "剑客", 22 | "让他", 23 | "喜欢", 24 | "不少", 25 | "是龙", 26 | "多敦", 27 | "过来", 28 | "离开", 29 | "在一", 30 | "开口", 31 | "问题", 32 | "马上", 33 | "但他", 34 | "机会", 35 | "其实", 36 | "不了", 37 | "来了", 38 | "曾经", 39 | "是龙王", 40 | "在他", 41 | "从前", 42 | "不要", 43 | "不敢", 44 | "是你", 45 | "江湖", 46 | "大家", 47 | "东西", 48 | "不得", 49 | "同时", 50 | "剑法", 51 | "可不", 52 | "小益", 53 | "小姐", 54 | "成为", 55 | "其中", 56 | "子里", 57 | "真人", 58 | "高手", 59 | "不用", 60 | "和尚", 61 | "青觉", 62 | "夫人", 63 | "少主", 64 | "从来", 65 | "营地", 66 | "保护", 67 | "卫兵", 68 | "不出", 69 | "不住", 70 | "都没", 71 | "接受", 72 | "说出", 73 | "第二", 74 | "不在", 75 | "王后", 76 | "越来", 77 | "独孤", 78 | "龙军", 79 | "越来越", 80 | "告诉", 81 | "大概", 82 | "其他", 83 | "受到", 84 | "是有", 85 | "有些", 86 | "过去", 87 | "无仙", 88 | "人不", 89 | "留下", 90 | "王子", 91 | "真正", 92 | "回来", 93 | "关系", 94 | "多少", 95 | "石堡", 96 | "进入", 97 | "别人", 98 | "信任", 99 | "说过", 100 | "出去", 101 | "是一个", 102 | "那些", 103 | "门口", 104 | "留在", 105 | "那么", 106 | "简单", 107 | "不多", 108 | "清楚", 109 | "回答", 110 | "让我", 111 | "父亲", 112 | "在地", 113 | "完全", 114 | "却没", 115 | "不太", 116 | "是没", 117 | "利图", 118 | "真是", 119 | "全都", 120 | "正在", 121 | "人是", 122 | "是个", 123 | "有一个", 124 | "第一次", 125 | "声说", 126 | "那是", 127 | "不管", 128 | "骑兵", 129 | "不同", 130 | "疏勒", 131 | "是谁", 132 | "在龙", 133 | "人经", 134 | "普通", 135 | "来没", 136 | "南城", 137 | "那里", 138 | "军师", 139 | "后一", 140 | "有什么", 141 | "怀疑", 142 | "进来", 143 | "王不", 144 | "有多", 145 | "师父", 146 | "都有", 147 
| "军营", 148 | "解释", 149 | "人一", 150 | "完待续", 151 | "上去", 152 | "却不", 153 | "年轻", 154 | "任务", 155 | "从来没", 156 | "是什么", 157 | "多了", 158 | "兴趣", 159 | "容易", 160 | "向龙", 161 | "原人", 162 | "刚刚", 163 | "求推荐", 164 | "却没有", 165 | "多人", 166 | "中原人", 167 | "是为", 168 | "向龙王", 169 | "正是", 170 | "铁寒", 171 | "所有人", 172 | "孟明", 173 | "秘密", 174 | "里面", 175 | "大人", 176 | "寒锋", 177 | "军官", 178 | "求收", 179 | "行动", 180 | "求收藏", 181 | "才是", 182 | "来说", 183 | "藏求", 184 | "骆家", 185 | "犹豫", 186 | "小声", 187 | "后面", 188 | "是要", 189 | "尤其", 190 | "安全", 191 | "出现", 192 | "却是", 193 | "天下", 194 | "是大", 195 | "王一", 196 | "王与", 197 | "无法", 198 | "中一", 199 | "慎为不", 200 | "不着", 201 | "是真", 202 | "原来", 203 | "在地上", 204 | "同意", 205 | "是不是", 206 | "是金", 207 | "不明", 208 | "说了", 209 | "掌门", 210 | "是想", 211 | "成了", 212 | "国王", 213 | "不一", 214 | "可他", 215 | "都在", 216 | "不仅", 217 | "回到", 218 | "王爷", 219 | "有这", 220 | "都会", 221 | "在这里", 222 | "真实", 223 | "原因", 224 | "学徒", 225 | "高兴", 226 | "过一", 227 | "那一", 228 | "是最", 229 | "不像", 230 | "不再", 231 | "哈哈", 232 | "大都", 233 | "才能", 234 | "北城", 235 | "石国", 236 | "在龙王", 237 | "不如", 238 | "小心", 239 | "无道", 240 | "危险", 241 | "永远", 242 | "出了", 243 | "支持", 244 | "是上", 245 | "多年", 246 | "是金鹏", 247 | "三个", 248 | "兴奋", 249 | "皇帝", 250 | "以后", 251 | "来一", 252 | "同样", 253 | "是杀", 254 | "是她", 255 | "大声", 256 | "双手", 257 | "随后", 258 | "侍军", 259 | "全是", 260 | "实话", 261 | "凤钗", 262 | "过了", 263 | "银子", 264 | "正常", 265 | "王是", 266 | "骆启", 267 | "官怒", 268 | "不肯", 269 | "求订阅", 270 | "仙人", 271 | "表示", 272 | "解决", 273 | "龙王不", 274 | "改变", 275 | "是对", 276 | "太多", 277 | "大头", 278 | "道自", 279 | "功力", 280 | "是自", 281 | "是那", 282 | "朋友", 283 | "道自己", 284 | "不可能", 285 | "太后", 286 | "保持", 287 | "人群", 288 | "不好", 289 | "出手", 290 | "母亲", 291 | "莫林", 292 | "孟家", 293 | "来不", 294 | "刀法", 295 | "能让", 296 | "利用", 297 | "子上", 298 | "上一", 299 | "不相", 300 | "说不", 301 | "中间", 302 | "是一名", 303 | "习惯", 304 | "公开", 305 | "慎为一", 306 | "人也", 307 | "在意", 308 | "力量", 309 | "是上官", 310 | "成功", 311 | "第三", 312 | "奇怪", 313 | "路上", 314 | "都要", 315 | "其他人", 316 | "明显", 317 | "微微", 318 | "不动", 319 | "三十", 320 | "不见", 321 | "经过", 322 | "么也", 323 | "人在", 324 | "龙王与", 325 | "是怎", 326 | "白了", 327 | "其是", 328 | "理由", 329 | "承认", 330 | "不起", 331 | "三名", 332 | "尤其是", 333 | "是怎么", 334 | "说得", 335 | "印象", 336 | "回事", 337 | "是因", 338 | "有几", 339 | "不清", 340 | "金鹏杀", 341 | "隐藏", 342 | "将他", 343 | "保证", 344 | "仇恨", 345 | "都尉", 346 | "前面", 347 | "进去", 348 | "接着", 349 | "是顾", 350 | "道理", 351 | "堂弟", 352 | "王身", 353 | "龙王一", 354 | "是被", 355 | "何人", 356 | "无所", 357 | "堂弟子", 358 | "干嘛", 359 | "是从", 360 | "让你", 361 | "进行", 362 | "不上", 363 | "战争", 364 | "本来", 365 | "是因为", 366 | "强盗", 367 | "在一起", 368 | "势力", 369 | "小小", 370 | "小山", 371 | "无论", 372 | "不安", 373 | "任何人", 374 | "本不", 375 | "慎为说", 376 | "房间", 377 | "是为了", 378 | "来到", 379 | "刀剑", 380 | "不对", 381 | "寻找", 382 | "都没有", 383 | "许多人", 384 | "不一样", 385 | "是这样", 386 | "明白了", 387 | "年前", 388 | "不由", 389 | "时辰", 390 | "人就", 391 | "过这", 392 | "威胁", 393 | "用大", 394 | "无关", 395 | "慎为与", 396 | "是中", 397 | "表现","的", ",", "和", "是", "随着", "对于", "对", "等", "能", "都", "。", " ", "、", "中", "在", "了", "通常", "如果", "我们", "需要", "卷", "_", "【", "】", "·", " ", ":", "“", "”", "我", "道", "他", "说", "你", "也", "人", "但是", "就", "一个", "自己", "这", "但", "不", "没有", "有", "被", "要", "就是", "这个", "他们", "到", "并", "!", "(", ")", "而", "已经", "还", "却", "着", "不是", "很", "会", "?", "上", "[", "]", "们", "去", "这些", "这样", "先", "因为", "把", "后", "什么", "让", "可以", "地", "来", "—", "不过", "这种", "将", "那", "从", "虽然", "还是", "这位", "给", "个", "十分", "又", "与", "下", "做", 
"过", "好", "很多", "得", "实在", "用", "终于", "此时", "呢", "可能", "不能", "时", "开始", "似乎", "之", "可", "时候", "才", "应该", "可是", "于是", "吧", "向", "现在", "当", "能够", "想", "之后", "多", "她", "还有", "所以", "这里", "便", "所谓", "说道", "你们", "只", "…", "见", "跟", "一声", "请", "谁", "都是", "第一", "是一", "人的", "不会", "相信", "时间", "少年", "公主", "王的", "来的", "是不", "出一", "谁", "有点", "以为", "公子", "有人", "士兵", "出来", "计划", "许多", "仍然", "武功", "说话", "不想", "军队", "请", "问道", "咱们", "已", "再", "罢", "怎么", "不知", "只见", "里", "啊", "本书", "小说网", "用户", "明白", "希望", "那个"] 398 | 399 | -------------------------------------------------------------------------------- /全职高手.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanren88/python-/accc377741693d3e8d0d6d42202ce2d9055214a1/全职高手.rar -------------------------------------------------------------------------------- /全职高手关系分析.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanren88/python-/accc377741693d3e8d0d6d42202ce2d9055214a1/全职高手关系分析.gif --------------------------------------------------------------------------------