├── __init__.py
├── 黑色裂变.txt
├── hibiscusTools.pyc
├── README.md
├── .gitignore
├── hibiscusTools.py
└── hibiscusMain.py


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/黑色裂变.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwsy/FictionAnalysis/HEAD/黑色裂变.txt


--------------------------------------------------------------------------------
/hibiscusTools.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwsy/FictionAnalysis/HEAD/hibiscusTools.pyc


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # FictionAnalysis
 2 | 对小说文本进行分析，提炼小说剧情内容和人物关系
 3 | 
 4 | 如果一部从未读过的小说，经过程序的分析解读便可列出小说中的主要人物，并阐明人物关系，是不是很有意思呢！
 5 | # 程序的主要思路如下:
 6 | 
 7 | 1、在没有词库的情况下，解析文本，通过计算一个词的内部凝固程度和自由运用度来判断一个词可否为单词
 8 | 
 9 | 2、提取人名和地名
10 | 
11 | 3、通过人物名称在文本中的位置和发生关系的动作建立人物关系图谱
12 | 
13 | 听起来是不是很高大上呢，哈哈，现在勉强完成了第一步，在此感谢这篇博客的作者 http://www.csdn.net/article/2013-05-08/2815186 ，程序的第一步完全是在实现这篇博客的算法
14 | 
15 | # 如何使用
16 | <!--hibiscusMain.py 为主程序，直接运行即可，程序将读取《黑色裂变.txt》并最终输出一个excel-->
17 |    
18 |     python hibiscusMain.py "path/file"
19 | 
20 |     -- > file.xls
21 | 
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Logs
 2 | logs
 3 | *.log
 4 | 
 5 | # Runtime data
 6 | pids
 7 | *.pid
 8 | *.seed
 9 | 
10 | # Directory for instrumented libs generated by jscoverage/JSCover
11 | lib-cov
12 | 
13 | # Coverage directory used by tools like istanbul
14 | coverage
15 | 
16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
17 | .grunt
18 | 
19 | # node-waf configuration
20 | .lock-wscript
21 | 
22 | # Compiled binary addons (http://nodejs.org/api/addons.html)
23 | build/Release
24 | 
25 | # Dependency directory
26 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git
27 | node_modules
28 | 


--------------------------------------------------------------------------------
/hibiscusTools.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | '''
 3 | Created on 2016-1-23
 4 | 
 5 | @author: kwsy
 6 | '''
 7 | import re
 8 | import math
 9 | 
# Candidate words are the substrings whose length lies in [minLen, maxLen]
# (both bounds inclusive); see getLatentword.
minLen, maxLen = 1, 4
12 | 
13 | 
def getAllChineseCharacters(content):
    """Return every maximal run of Chinese characters found in ``content``.

    Only code points in the range U+4E00..U+9FA5 are kept; any other
    character (punctuation, Latin letters, digits, whitespace) separates
    two runs and is discarded. The runs are returned in order of
    appearance as a list of strings.
    """
    pattern = re.compile(u'[\u4e00-\u9fa5]+')
    return pattern.findall(content)
18 | 
19 | 
def getLatentword2(txt, length, width, index):
    """List every substring of ``txt`` that is exactly ``width`` characters long.

    Parameters:
        txt: the text run to slice.
        length: number of characters of ``txt`` to consider (callers pass
            ``len(txt)``).
        width: length of each candidate word.
        index: offset of ``txt[0]`` in the whole document; added to the
            local position to produce ``wordindex``.

    Returns:
        A list of dicts with the keys ``word``, ``left``, ``right`` and
        ``wordindex``. ``left`` / ``right`` hold the single character
        immediately before / after the word, or ``None`` when the word
        touches the corresponding edge of ``txt``.
    """
    lst = []
    for i in range(length):
        end = i + width
        if end > length:
            # Every later start position would overrun the text as well.
            break
        left = txt[i - 1:i] if i > 0 else None
        # A right neighbour exists only when the word stops before the end
        # of the text. Testing ``i < length - 1`` instead would yield an
        # empty string (not None) for multi-character words that end the
        # text, and that empty string would be counted as a neighbour.
        right = txt[end:end + 1] if end < length else None
        lst.append({
            'word': txt[i:end],
            'left': left,
            'right': right,
            'wordindex': index + i,
        })
    return lst
35 | 
36 | 
def getLatentword(txt, index):
    """Collect the candidate words of every allowed width found in ``txt``.

    Widths run from ``minLen`` to ``maxLen`` inclusive; for each width the
    candidates come from ``getLatentword2``. The result is one flat list,
    grouped by width in increasing order. ``index`` is the offset of
    ``txt`` in the whole document and is forwarded unchanged.
    """
    size = len(txt)
    return [
        candidate
        for width in range(minLen, maxLen + 1)
        for candidate in getLatentword2(txt, size, width, index)
    ]
48 | 
49 | 
def splitWord(word):
    """Return every way of cutting ``word`` into two non-empty pieces.

    The result is a list of ``(head, tail)`` tuples ordered by increasing
    head length; a word shorter than two characters yields an empty list.
    """
    return [(word[:cut], word[cut:]) for cut in range(1, len(word))]
56 | 
57 | 
def calculateFreedom(wordLst):
    """Return the entropy (natural log) of the items in ``wordLst``.

    Each distinct item contributes ``-p * ln(p)``, where ``p`` is its share
    of the list. An empty list gives ``0``.
    """
    tally = {}
    for token in wordLst:
        tally[token] = tally.get(token, 0) + 1

    total = float(len(wordLst))
    entropy = 0
    for occurrences in tally.values():
        share = float(occurrences) / total
        entropy = entropy - share * math.log(share)
    return entropy
70 | 
71 | 
if __name__ == '__main__':
    # Manual smoke check: print the entropy of a small sample list.
    sample = ['不', '皮', '倒', '皮']
    print(calculateFreedom(sample))


--------------------------------------------------------------------------------
/hibiscusMain.py:
--------------------------------------------------------------------------------
  1 | #coding=utf-8
  2 | '''
  3 | Created on 2016-1-23
  4 | 
  5 | @author: kwsy
  6 | '''
  7 | import os
  8 | import hibiscusTools
  9 | import codecs
 10 | from xlwt import Workbook
 11 | import sys  
 12 | 
 13 | 
 14 | class Hibiscus():
 15 |     def analyseNovel(self,filename):
 16 |         if not os.path.exists(filename):
 17 |             pass
 18 |         with codecs.open(filename, encoding='GBK') as file:
 19 |             content = file.read()
 20 | 
 21 |         txtlist = hibiscusTools.getAllChineseCharacters(content)
 22 |         
 23 |         
 24 |         self.novelInfo = {}
 25 |         index = 0
 26 |         for txt in txtlist:
 27 |             itemlst = hibiscusTools.getLatentword(txt, index)
 28 |             index = index+len(txt)
 29 |             for item in itemlst:
 30 |                 word = item['word']
 31 |                 if not word in self.novelInfo:
 32 |                     self.novelInfo[word] = {'leftLst':[],'rightLst':[],'wordindexLst':[],'count':0,'word':word}
 33 |                 if not item['left']==None:
 34 |                     self.novelInfo[word]['leftLst'].append(item['left'])
 35 |                 if not item['right']==None:
 36 |                     self.novelInfo[word]['rightLst'].append(item['right'])
 37 |                 self.novelInfo[word]['wordindexLst'].append(item['wordindex'])
 38 |                 self.novelInfo[word]['count'] = self.novelInfo[word]['count']+1
 39 |                 
 40 |         self.charCount = index
 41 |         self.calculte()
 42 |         
 43 |     def outExcel(self,filename):
 44 |         wb = Workbook()
 45 |         table = wb.add_sheet('新词')
 46 |         table.write(0,0,'单词')
 47 |         table.write(0,1,'出现次数')
 48 |         table.write(0,2,'凝结度')
 49 |         table.write(0,3,'自由度')
 50 |         lst = []
 51 |         for k,v in self.novelInfo.items():
 52 |             if v['count']>30 and len(k)>1 and v['solidification']>50 and v['freedom']>3:
 53 |                 lst.append(v)
 54 |         
 55 |         lst = sorted(lst,key=lambda x:x['count'],reverse=True)
 56 |         
 57 |         line = 1
 58 |         for index ,item in enumerate(lst):
 59 |             table.write(line,0,item['word'])
 60 |             table.write(line,1,item['count'])
 61 |             table.write(line,2,item['solidification'])
 62 |             table.write(line,3,item['freedom'])
 63 |             line +=1
 64 |         wb.save('./'+os.path.splitext(os.path.basename(filename))[0] +'.xls')
 65 |         
 66 |     def calculte(self):
 67 |         for word,info in self.novelInfo.items():
 68 |             self.novelInfo[word]['solidification']= self.getSolidification(word)       
 69 |             self.novelInfo[word]['freedom'] = self.getFreedom(self.novelInfo[word])
 70 | 
 71 |     def getFreedom(self,wordinfo):
 72 |         leftfreedom = hibiscusTools.calculateFreedom(wordinfo['leftLst'])
 73 |         rightfreedom = hibiscusTools.calculateFreedom(wordinfo['rightLst'])
 74 |         if leftfreedom<rightfreedom:
 75 |             return leftfreedom
 76 |         return rightfreedom
 77 | 
 78 |     def getSolidification(self,word):
 79 |         splitLst = hibiscusTools.splitWord(word)
 80 |         wordcount = self.novelInfo[word]['count']
 81 |         probability = float(wordcount)/float(self.charCount)
 82 |         min = 10000000
 83 |         for item in splitLst:
 84 |             left,right = item[0],item[1]
 85 |             leftcount,rightcount = self.novelInfo[left]['count'],self.novelInfo[right]['count']
 86 |             
 87 |             Togetherprobability = probability/((float(rightcount)/float(self.charCount))*(float(leftcount)/float(self.charCount)))
 88 |             if Togetherprobability<min:
 89 |                 min = Togetherprobability
 90 |         return min
 91 | 
 92 | 
 93 | def excute(name):
 94 |     filename = sys.argv[1]
 95 |     hibi = Hibiscus()
 96 |     hibi.analyseNovel(filename) 
 97 |     hibi.outExcel(filename)
 98 | 
 99 | 
if __name__ == '__main__':
    # Hand the command-line arguments (program name excluded) to the runner.
    excute(sys.argv[1:])
102 | 


--------------------------------------------------------------------------------