├── .gitignore ├── .gitattributes ├── serveice.log ├── 白鹿原201708101054.docx ├── 疯狂的石头201708101529.docx ├── 让子弹飞201708101126.docx ├── .idea ├── dictionaries │ └── sunkai.xml ├── vcs.xml ├── misc.xml ├── modules.xml ├── fb.iml └── workspace.xml ├── README.md ├── punctuation_mark.txt ├── hibiscusTools.py ├── Global_Variables.py ├── hibiscusMain.py ├── line.py ├── stop_words.txt ├── session.py ├── handle_script.py └── sensitive_words.txt /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 万人膜拜.txt 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /serveice.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunkaiiii/analyze_movie_script_with_python/HEAD/serveice.log -------------------------------------------------------------------------------- /白鹿原201708101054.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunkaiiii/analyze_movie_script_with_python/HEAD/白鹿原201708101054.docx -------------------------------------------------------------------------------- /疯狂的石头201708101529.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunkaiiii/analyze_movie_script_with_python/HEAD/疯狂的石头201708101529.docx -------------------------------------------------------------------------------- /让子弹飞201708101126.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunkaiiii/analyze_movie_script_with_python/HEAD/让子弹飞201708101126.docx 
-------------------------------------------------------------------------------- /.idea/dictionaries/sunkai.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | charactor 5 | 6 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # analyze_movie_script_with_python 2 | ## 主要功能
3 | 读取剧本
4 | 判断主角
5 | 处理场次信息
6 | 统计台词数
7 | 统计角色出场次数
8 | 统计情感词信息
9 | 统计广告词信息
10 | 统计敏感词信息
11 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /punctuation_mark.txt: -------------------------------------------------------------------------------- 1 | , 2 | , 3 | ? 4 | 、 5 | 。 6 | “ 7 | ” 8 | 《 9 | 》 10 | ! 11 | , 12 | : 13 | ; 14 | ? 15 | ‘ 16 | ’ 17 | " 18 | " 19 | ' 20 | < 21 | > 22 | ! 23 | . 24 | @ 25 | ~ 26 | # 27 | $ 28 | ^ 29 | & 30 | * 31 | - 32 | = 33 | + 34 | _ 35 | ~ 36 | ¥ 37 | …… 38 | ( 39 | ) 40 | ( 41 | ) 42 | ) 43 | —— 44 | — 45 | 。 46 | 47 | -------------------------------------------------------------------------------- /.idea/fb.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /hibiscusTools.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | """ 3 | 引用自GitHub@kwsy的开源大规模预料新词发现算法 4 | 具体请参阅https://github.com/kwsy 5 | """ 6 | ''' 7 | Created on 2016-1-23 8 | 9 | @author: kwsy 10 | ''' 11 | import re 12 | import math 13 | 14 | minLen = 1 15 | maxLen = 4 16 | 17 | def getAllChineseCharacters(content): 18 | regex = u'[\u4e00-\u9fa5]+' 19 | res=re.findall(regex, content) 20 | return res 21 | 22 | 23 | def getLatentword2(txt,length,width,index): 24 | lst = [] 25 | for i in range(length): 26 | if 
def read_sensitive_word(path=None):
    """Load the sensitive-word list, grouped by category.

    Every useful line has the form ``<category> <word>``: the text before the
    first space is the category, the rest is the word.  Lines that contain no
    space (including blank lines) are skipped.

    :param path: file to read.  Defaults to the module-level
        ``sensitive_word_file`` so existing zero-argument calls keep working.
    :return: dict mapping category -> list of words, in file order.
    """
    if path is None:
        path = sensitive_word_file
    # ``with`` closes the handle; the previous bare open() never did.
    with open(path, encoding="utf8") as handle:
        entries = handle.read().split("\n")
    sensitive_dic = {}
    for entry in entries:
        # Split on the first space only.  Unpacking the result of a full
        # split raised ValueError as soon as a word itself contained a space.
        word_type, separator, word = entry.partition(" ")
        if not separator:
            continue
        sensitive_dic.setdefault(word_type, []).append(word)
    return sensitive_dic


def convert_userdic(user_dic):
    """Group user-dictionary entries by their cleaned-up second field.

    :param user_dic: iterable of indexable entries.  Element 1 of each entry
        becomes the key (spaces and U+3000 removed); element 0 becomes the
        stored value (U+3000 removed).
    :return: dict mapping key -> list of values, in input order.
    """
    user_dic_convert = {}
    for section in user_dic:
        key = section[1].replace(' ', '').replace('\u3000', '')
        value = section[0].replace('\u3000', '')
        user_dic_convert.setdefault(key, []).append(value)
    return user_dic_convert
class Hibiscus():
    """New-word discovery over a text (algorithm credited to GitHub user kwsy).

    Only the methods that are intact in this view are reproduced here.
    calculte() relies on getSolidification() and getFreedom(), which are
    expected to be defined further down in the class.
    """

    def analyseNovel(self, content):
        """Collect candidate words from *content* and return the top results.

        Fills ``self.novelInfo`` (word -> statistics dict) and
        ``self.charCount`` (total length of the extracted text runs), scores
        every candidate via calculte() and returns outResult().
        """
        txtlist = hibiscusTools.getAllChineseCharacters(content)
        self.novelInfo = {}
        index = 0
        for txt in txtlist:
            itemlst = hibiscusTools.getLatentword(txt, index)
            index += len(txt)
            for item in itemlst:
                word = item['word']
                info = self.novelInfo.get(word)
                if info is None:
                    info = {'leftLst': [], 'rightLst': [], 'wordindexLst': [],
                            'count': 0, 'word': word}
                    self.novelInfo[word] = info
                if item['left'] is not None:
                    info['leftLst'].append(item['left'])
                if item['right'] is not None:
                    info['rightLst'].append(item['right'])
                info['wordindexLst'].append(item['wordindex'])
                info['count'] += 1

        self.charCount = index
        self.calculte()
        return self.outResult()

    def outResult(self):
        """Return up to five qualifying words, most frequent first.

        A word qualifies when it occurs more than 30 times, is longer than one
        character, has solidification above 50 and freedom above 1 (the
        original note says the thresholds used to be 30, 50 and 3).

        The spreadsheet that used to be built here was never saved, so it has
        been dropped; the returned list is unchanged.
        """
        qualified = [
            info for word, info in self.novelInfo.items()
            if info['count'] > 30 and len(word) > 1
            and info['solidification'] > 50 and info['freedom'] > 1
        ]
        qualified.sort(key=lambda info: info['count'], reverse=True)
        return [info['word'] for info in qualified[:5]]

    def calculte(self):
        """Attach 'solidification' and 'freedom' scores to every candidate."""
        for word, info in self.novelInfo.items():
            info['solidification'] = self.getSolidification(word)
            info['freedom'] = self.getFreedom(info)
# Methods of the scene class in session.py (presumably ``Session``; its header
# and constructor are not readable in this view, so the attributes used below
# are assumed to be initialised there).

def cal_sensitive_words(self):
    """Aggregate the sensitive words found on every line of this scene.

    Updates ``session_sensitive_word`` (category -> words, extended in line
    order), ``session_sensitive_word_set`` (distinct words collected in this
    call) and ``session_sensitive_word_count_dic`` (word -> occurrences).
    """
    collected = []
    for line in self.line_list:
        for key, words in line.sensitive_word.items():
            self.session_sensitive_word.setdefault(key, []).extend(words)
            collected.extend(words)
    self.session_sensitive_word_set = set(collected)
    counts = self.session_sensitive_word_count_dic
    for words in self.session_sensitive_word.values():
        for word in words:
            counts[word] = counts.get(word, 0) + 1


def cal_ad_words(self):
    """Aggregate the advertising words found on every line of this scene.

    Updates ``session_ad_word`` (all hits), ``session_ad_word_set`` (distinct
    hits) and ``session_ad_word_count_dic`` (word -> occurrences).
    """
    for line in self.line_list:
        self.session_ad_word.extend(line.ad_word)
    self.session_ad_word_set = set(self.session_ad_word)
    counts = self.session_ad_word_count_dic
    for word in self.session_ad_word:
        counts[word] = counts.get(word, 0) + 1


def cal_words_amount(self):
    """Count dialogue length and emotion words per main character.

    A character is flagged as appearing in the scene either when another
    line mentions it (``other_character``) or when it speaks.  Dialogue
    length is measured after removing every entry of
    ``Global_Variables.punctuation_mark``.
    """
    roles = self.session_charactor_dic
    # Build the membership sets once; scanning each emotion-word list for
    # every token made this loop needlessly slow.
    emotion_sets = [(name, set(words))
                    for name, words in Global_Variables.word_list_dic.items()]
    for line in self.line_list:
        for charactor in line.other_character:
            # Mentioned without speaking still counts as appearing here.
            roles[charactor].appearance = True
        if line.type != 'talk':
            continue
        self.session_all_charactor.append(line.who_said_no_cut)
        if line.who_said not in Global_Variables.name_list:
            continue
        speaker = roles[line.who_said]
        said_word = line.content
        speaker.appearance = True
        speaker.charactor_worlds.append(said_word)
        for word in jieba.cut(said_word):
            for name, words in emotion_sets:
                if word in words:
                    speaker.charactor_emotion_dic[name].append(word)
        # Strip punctuation before measuring the amount of dialogue.
        for mark in Global_Variables.punctuation_mark:
            said_word = said_word.replace(mark, '')
        speaker.charactor_world_amount += len(said_word)
    for role in roles.values():
        self.session_words_amount += role.charactor_world_amount
    self.session_all_charactor_set = set(self.session_all_charactor)


def cal_main_content(self):
    """Summarise the scene from its 'event' lines.

    Uses the third-party TextRank4ZH library
    (https://github.com/letiantian/TextRank4ZH); the two key sentences it
    returns are appended to ``session_main_content``.
    """
    content = "".join(line.content for line in self.line_list
                      if line.type == 'event')
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=content, lower=True, source='all_filters')
    for item in tr4s.get_key_sentences(2):
        self.session_main_content += item.sentence
class shunjingbiao:
    """Record for one row of the scene-order table."""

    def __init__(self, script_id=-1, script_num=-1, script_content='', main_content="", time='', role=None):
        self.script_id = script_id
        self.script_num = script_num
        self.script_content = script_content
        self.main_content = main_content
        self.time = time
        # A fresh list per instance: the former ``role=[]`` default was a
        # single list shared by every instance created without ``role``.
        self.role = [] if role is None else role
        # Number of text lines divided by 50 (presumably lines per page).
        self.pagenum = float(len(self.script_content.split('\n'))) / 50.0


class Script:
    """Statistics for one whole screenplay.

    Holds every scene (``session.Session`` instances) of the script plus the
    figures aggregated across scenes, and writes them as tab-separated text
    files under ``save_path``.
    """

    def __init__(self, filename):
        """Prepare empty result containers for the script at *filename*.

        Nothing is read here; call cal_all_info() and then write_info().
        (Original author's note: this part is still to be reworked.)
        """
        self.script_name = ''
        # The trailing separator is kept because the writers build paths by
        # plain concatenation; os.path.join avoids hard-coding a backslash.
        self.save_path = os.path.join("out", "")
        self.session_list = []  # every scene of the script
        self.charactor_overrall_word_count_dic = {}  # character -> dialogue length
        self.charactor_overral_apear_in_session = {}  # character -> scenes appeared in
        self.charactor_emetion_word_in_session = {}  # scene number -> character records
        self.filename = filename
        self.shunjingbiao = {}
        self.shunchangbiao = {}
        self.all_ad_count = []
        self.session_ad_count = []
        self.all_sensitive_word_count_dic = {}
        self.charactor = Global_Variables.name_list
        self.all_charactor_count = {}

    def cal_all_info(self):
        """Read the script and compute every statistic, printing progress."""
        print('读取剧本')
        self.file_text = self.read_script_file(self.filename)
        Global_Variables.name_list = []
        print('程序推测主角')
        self.find_main_charactor(self.file_text)
        main_role = ",".join(Global_Variables.name_list)
        print('推测主角为' + main_role)
        print('处理场次信息')
        self.handle_session(self.file_text)
        print('统计角色台词数')
        self.cal_overrall_count()
        print('计算非主角出场次数')
        self.cal_all_character()
        print('计算主要角色出场次数')
        self.cal_character_apear_count()
        print("计算敏感词信息")
        self.cal_all_senstive_word_count()
        print("计算广告信息")
        self.session_ad_count = self.cal_ad_words_count()

    def write_info(self):
        """Write every report file into ``save_path``."""
        self.write_script_detail()
        self.write_script_role()
        self.write_session_role_word()
        self.write_participle()
        self.write_session_ad_args()
        self.wrtie_script_sensitive_args()

    def test_muiltiprocess(self):
        """Run the full pipeline; used as the target of a worker process."""
        self.cal_all_info()
        self.write_info()

    def find_main_charactor(self, file_text, mode=1):
        """Guess the main characters and register them with jieba.

        mode 1 (simple script format): count how often each name appears in
        front of a colon and take the five most frequent speakers.
        mode 0 (standard format): new-word discovery was planned but is
        disabled, so ``Global_Variables.name_list`` is left untouched.

        In every mode the names in ``Global_Variables.name_list`` are added
        to jieba's dictionary afterwards.
        """
        if mode == 1:
            ranked = self._rank_speakers(file_text)
            # Slicing copes with scripts that have fewer than five speakers;
            # indexing positions 0..4 raised IndexError in that case.
            Global_Variables.name_list = [name for name, _ in ranked[:5]]

        for word in Global_Variables.name_list:
            jieba.add_word(word, 10000)

    @staticmethod
    def _rank_speakers(file_text):
        """Return (speaker, line count) pairs, most frequent speaker first."""
        tally = {}
        for block in file_text.split('\n\n'):
            for line in block.split('\n'):
                # NOTE(review): the visible source replaces ':' with ':',
                # which looks like a full-width colon lost in extraction;
                # U+FF1A is normalised explicitly here - confirm.
                line = (line.replace('\uff1a', ':').replace(' ', '')
                        .replace('\n', '').replace('\ufeff', ''))
                if ':' in line:
                    speaker = line.split(':')[0]
                    tally[speaker] = tally.get(speaker, 0) + 1
        return sorted(tally.items(), key=lambda x: x[1], reverse=True)

    def read_script_file(self, filename):
        """Read a .docx script and derive the script name and output folder.

        The file name (script title plus timestamp, without extension)
        becomes ``script_name``; a folder of that name is created below
        ``save_path`` and ``save_path`` is pointed at it.

        :param filename: path of the uploaded script
        :return: the text of all paragraphs, one per line
        """
        name = os.path.splitext(filename)[0]
        # Accept both '\\' and '/' so the base name is found on any platform.
        self.script_name = name.replace('\\', '/').rsplit('/', 1)[-1]
        self.save_path = os.path.join(self.save_path, self.script_name, "")
        # makedirs also creates a missing parent, which mkdir did not.
        os.makedirs(self.save_path, exist_ok=True)
        document = Document(filename)
        return "".join(para.text + '\n' for para in document.paragraphs)

    def handle_session(self, script):
        """Split the text on blank lines and build one Session per scene.

        Chunks of seven characters or fewer are ignored.
        """
        count = 0
        for chunk in script.split('\n\n'):
            if len(chunk) <= 7:
                continue
            self.session_list.append(session.Session(chunk))
            count += 1
            if count % 20 == 0:
                print('已处理' + str(count) + '场')

    def cal_overrall_count(self):
        """Sum each character's dialogue length over all scenes."""
        totals = self.charactor_overrall_word_count_dic
        for scene in self.session_list:
            for name, info in scene.session_charactor_dic.items():
                totals[name] = totals.get(name, 0) + info.charactor_world_amount

    def cal_all_character(self):
        """Count, for every speaker (main or not), the scenes they speak in."""
        totals = self.all_charactor_count
        for scene in self.session_list:
            for name in scene.session_all_charactor_set:
                totals[name] = totals.get(name, 0) + 1

    def cal_character_apear_count(self):
        """Count the scenes in which each main character appears."""
        totals = self.charactor_overral_apear_in_session
        for scene in self.session_list:
            for name, info in scene.session_charactor_dic.items():
                totals.setdefault(name, 0)
                if info.appearance:
                    totals[name] += 1

    def cal_ad_words_count(self):
        """Collect advertising-word hits per scene.

        Also stores in ``all_ad_count`` a list of (word, number of scenes
        containing it), sorted by that number in descending order.

        :return: list of (scene number, word, count in that scene)
        """
        args = []
        scenes_per_word = {}
        for scene in self.session_list:
            for word, count in scene.session_ad_word_count_dic.items():
                args.append((scene.session_number, word, count))
                scenes_per_word[word] = scenes_per_word.get(word, 0) + 1
        self.all_ad_count = sorted(scenes_per_word.items(), key=lambda x: x[1], reverse=True)
        return args

    def cal_all_senstive_word_count(self):
        """Sum the per-scene sensitive-word counts over the whole script."""
        totals = self.all_sensitive_word_count_dic
        for scene in self.session_list:
            for word, word_count in scene.session_sensitive_word_count_dic.items():
                totals[word] = totals.get(word, 0) + word_count

    def _write_text(self, file_name, text):
        """Write *text* to ``save_path + file_name`` as UTF-8, closing the file."""
        with open(self.save_path + file_name, 'w', encoding="utf8") as out:
            out.write(text)

    def write_script_detail(self):
        """Write one row per scene: number, time, location, place, roles.

        Columns: scene number, time of day (0 when unknown), location,
        indoor/outdoor (0 when unknown), '|'-joined appearing characters,
        their count, main content.
        """
        rows = []
        for scene in self.session_list:
            present = [character.name
                       for character in scene.session_charactor_dic.values()
                       if character.appearance]
            if len(scene.session_time) > 0:
                if scene.session_time not in Global_Variables.time:
                    Global_Variables.time.append(scene.session_time)
                period = scene.session_time
            else:
                period = 0
            if len(scene.session_place) > 0:
                if scene.session_place not in Global_Variables.place:
                    Global_Variables.place.append(scene.session_place)
                surroundings = scene.session_place
            else:
                surroundings = 0
            rows.append('\t'.join([
                str(scene.session_number),
                str(period),
                str(scene.session_location),
                str(surroundings),
                '|'.join(present),
                str(len(present)),
                scene.session_main_content,
            ]) + '\n')
        self._write_text('剧本场景信息.txt', "".join(rows))

    def write_script_role(self):
        """Write one row per character: name, dialogue length, scene count."""
        appearances = self.charactor_overral_apear_in_session
        rows = [
            # .get avoids a KeyError when appearances were not computed.
            role_name + '\t' + str(word_count) + '\t' + str(appearances.get(role_name, 0)) + '\n'
            for role_name, word_count in self.charactor_overrall_word_count_dic.items()
        ]
        self._write_text('角色信息.txt', "".join(rows))

    def write_session_role_word(self):
        """Write the emotion words spoken by each character in each scene.

        Columns: emotion category, word, character name, scene number.

        :return: the text that was written
        """
        rows = []
        for scene in self.session_list:
            records = self.charactor_emetion_word_in_session.setdefault(scene.session_number, [])
            for charactor in scene.session_charactor_dic.values():
                records.append(charactor)
                for key, value in charactor.charactor_emotion_dic.items():
                    for word in value:
                        rows.append(key + '\t' + word + '\t' + charactor.name + '\t'
                                    + str(scene.session_number) + '\n')
        args = "".join(rows)
        self._write_text('角色情感词.txt', args)
        return args

    def write_participle(self):
        """Write emotion-word frequencies per scene.

        Columns: word, scene number, emotion category, occurrences.
        """
        word_dic = {}
        for scene in self.session_list:
            for category, word_list in scene.session_emotion_words_dic.items():
                for word in word_list:
                    key = (word, scene.session_number, category)
                    word_dic[key] = word_dic.get(key, 0) + 1
        rows = [
            str(word) + '\t' + str(number) + '\t' + str(category) + '\t' + str(count) + '\n'
            for (word, number, category), count in word_dic.items()
        ]
        self._write_text('分词信息.txt', "".join(rows))

    def write_session_ad_args(self):
        """Write the advertising-word hits: scene number, word, count."""
        rows = [
            str(info[0]) + '\t' + str(info[1]) + '\t' + str(info[2]) + '\n'
            for info in self.session_ad_count
        ]
        self._write_text('场景广告信息.txt', "".join(rows))

    def wrtie_script_sensitive_args(self):
        """Write sensitive words and their totals, most frequent first."""
        ranked = sorted(self.all_sensitive_word_count_dic.items(), key=lambda x: x[1], reverse=True)
        rows = [word + '\t' + str(count) + '\n' for word, count in ranked]
        self._write_text('剧本敏感词信息.txt', "".join(rows))

    def showinfo(self, show_session_detail=0, show_line_detail=0):
        """Print each character's dialogue length and, optionally, every scene.

        :param show_session_detail: 1 to print every scene as well
        :param show_line_detail: passed through to each scene's show_info()
        """
        for name, amount in self.charactor_overrall_word_count_dic.items():
            print(name + str(amount))
        if show_session_detail == 1:
            for scene in self.session_list:
                scene.show_info(show_line_detail=show_line_detail)


if __name__ == "__main__":
    # Analyse the three bundled sample scripts, one worker process each.
    sample_files = (
        '白鹿原201708101054.docx',
        '让子弹飞201708101126.docx',
        '疯狂的石头201708101529.docx',
    )
    workers = [multiprocessing.Process(target=Script(sample).test_muiltiprocess)
               for sample in sample_files]
    for worker in workers:
        worker.start()
| 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 66 | 67 | 68 | 69 | session_list 70 | handle 71 | sorted 72 | 计算敏感词 73 | script_id 74 | print 75 | shunjingbiao_args 76 | Charactor 77 | other_character 78 | replace 79 | name_list 80 | character_biographies_dic 81 | print( 82 | script_ad_args 83 | sensitive 84 | implan 85 | mysql 86 | character_biographies 87 | ad 88 | Global_Variables.ad_word 89 | self.script_name 90 | script_name 91 | cal_script_role 92 | write_sc 93 | cal_session_role_word 94 | cal_participle 95 | self.script_id 96 | hibiscusMain 97 | cal_ad_words_count 98 | charactor_overrall_word_count_dic 99 | 100 | 101 | from Global_Variables import Global_Variables 102 | self.shunjingbiao 103 | lib_ad_key_words 104 | 105 | 106 | 107 | 109 | 110 | 135 | 136 | 137 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 |