├── __init__.py ├── 黑色裂变.txt ├── hibiscusTools.pyc ├── README.md ├── .gitignore ├── hibiscusTools.py └── hibiscusMain.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /黑色裂变.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwsy/FictionAnalysis/HEAD/黑色裂变.txt -------------------------------------------------------------------------------- /hibiscusTools.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwsy/FictionAnalysis/HEAD/hibiscusTools.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FictionAnalysis 2 | 对小说文本进行分析,提炼小说剧情内容和人物关系 3 | 4 | 如果一部从未度过的小说,经过程序的分析解读便可列出小说中的主要人物,并阐明人物关系,是不是很有意思呢! 5 | #程序的主要思路如下: 6 | 7 | 1、在没有词库的情况下,解析文本,通过计算一个词的内部凝固程度和自由运用度来判断一个词可否为单词 8 | 9 | 2、提取人名和地名 10 | 11 | 3、通过人物名称在文本中的位置和发生关系的动作建立人物关系图谱 12 | 13 | 听起来是不是很高大上呢,哈哈,现在勉强完成了第一步,在此感谢这篇博客的作者 http://www.csdn.net/article/2013-05-08/2815186 ,程序的第一步完全是在实现这篇博客的算法 14 | 15 | #如何使用 16 | 17 | 18 | python hibiscusMain.py "path/file" 19 | 20 | -- > file.xls 21 | 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | 5 | # Runtime data 6 | pids 7 | *.pid 8 | *.seed 9 | 10 | # Directory for instrumented libs generated by jscoverage/JSCover 11 | lib-cov 12 | 13 | # Coverage directory used by tools like istanbul 14 | coverage 15 | 16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 17 | .grunt 18 | 19 | # node-waf configuration 20 | .lock-wscript 21 | 22 | # Compiled binary addons (http://nodejs.org/api/addons.html) 23 | build/Release 24 | 25 | # Dependency directory 26 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git 27 | node_modules 28 | -------------------------------------------------------------------------------- /hibiscusTools.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | ''' 3 | Created on 2016-1-23 4 | 5 | @author: kwsy 6 | ''' 7 | import re 8 | import math 9 | 10 | minLen = 1 11 | maxLen = 4 12 | 13 | 14 | def getAllChineseCharacters(content): 15 | regex = u'[\u4e00-\u9fa5]+' 16 | res=re.findall(regex, content) 17 | return res 18 | 19 | 20 | def getLatentword2(txt,length,width,index): 21 | lst = [] 22 | for i in range(length): 23 | if i+width<=length: 24 | word = txt[i:i+width] 25 | left = None 26 | right = None 27 | if i>0: 28 | left = txt[i-1:i] 29 | if i30 and len(k)>1 and v['solidification']>50 and v['freedom']>3: 53 | lst.append(v) 54 | 55 | lst = sorted(lst,key=lambda x:x['count'],reverse=True) 56 | 57 | line = 1 58 | for index ,item in enumerate(lst): 59 | table.write(line,0,item['word']) 60 | table.write(line,1,item['count']) 61 | table.write(line,2,item['solidification']) 62 | table.write(line,3,item['freedom']) 63 | line +=1 64 | wb.save('./'+os.path.splitext(os.path.basename(filename))[0] +'.xls') 65 | 66 | def calculte(self): 67 | for word,info in self.novelInfo.items(): 68 | self.novelInfo[word]['solidification']= self.getSolidification(word) 69 | self.novelInfo[word]['freedom'] = self.getFreedom(self.novelInfo[word]) 70 | 71 | def getFreedom(self,wordinfo): 72 | leftfreedom = hibiscusTools.calculateFreedom(wordinfo['leftLst']) 73 | rightfreedom = hibiscusTools.calculateFreedom(wordinfo['rightLst']) 74 | if leftfreedom