#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# ---------------------------------------------------------------------------
# Context preserved from the repository dump this script was extracted from.
#
# Layout:
#   ├── README
#   ├── conf
#   │   ├── emotion_dict.txt
#   │   └── emotion_match.txt
#   └── src
#       └── deal_text.py
#
# /README:
#   https://raw.githubusercontent.com/wendy1990/HotTopic_emotion_classification/645e4075d3ba7fe602dc9612cd7039bbbdd6f6c2/README
#
# /conf/emotion_match.txt (rows reproduced verbatim; not read by this script):
#   PA 1 1 快乐
#   PE 2 1 安心
#   PD 3 2 尊敬
#   PH 4 2 赞扬
#   PG 5 2 相信
#   PB 6 2 喜爱
#   PK 7 2 祝愿
#   NA 8 3 愤怒
#   NB 9 4 悲伤
#   NJ 10 4 失望
#   NH 11 4 疚
#   PF 12 4 思
#   NI 13 5 慌
#   NC 14 5 恐惧
#   NG 15 5 羞
#   NE 16 6 烦闷
#   ND 17 6 憎恶
#   NN 18 6 贬责
#   NK 19 6 妒忌
#   NL 20 6 怀疑
#   PC 21 7 惊奇
#
# /src/deal_text.py follows.
# ---------------------------------------------------------------------------
"""Flag each distinct input text with whether it contains an emotion word.

Reads a text file line by line, removes ``#...#`` spans, de-duplicates the
results, and writes one ``<flag>\\t<text>`` row per distinct text, where the
flag is 1 if any word from the emotion dictionary occurs in the text as a
substring and 0 otherwise.
"""

import string
import sys
import os
import argparse
import re

try:
    # Only referenced by ad-hoc encoding-debugging prints in the past; the
    # script must not fail to start just because it is not installed.
    import chardet
except ImportError:
    chardet = None

# Matches a '#'-delimited span such as "#topic#"; compiled once because it is
# applied to every input line.
_HASHTAG_RE = re.compile('#[^#]*#')

# Resolved against the current working directory, exactly as before.
DEFAULT_DICT_PATH = '../conf/emotion_dict.txt'


def _load_unique_texts(path):
    """Return the distinct cleaned lines of *path* in first-seen order."""
    texts = []
    seen = set()
    # NOTE(review): files are opened with the platform default encoding, as
    # in the original; confirm the data encoding before running on Python 3.
    with open(path, 'r') as f_in:
        for raw_line in f_in:
            # Strip trailing whitespace first, then drop '#...#' spans; the
            # order matters because removing a trailing span can leave
            # whitespace behind, which is kept as before.
            text = _HASHTAG_RE.sub('', raw_line.rstrip())
            if text not in seen:
                seen.add(text)
                texts.append(text)
    return texts


def _load_emotion_words(path):
    """Return the set of non-empty words from the first column of *path*."""
    words = set()
    with open(path, 'r') as dict_file:
        for line in dict_file:
            # A line without a tab used to keep its trailing newline and so
            # could never match; strip it.
            word = line.split('\t')[0].strip()
            # An empty word is a substring of every text and would flag
            # everything as 1, so skip it.
            if word:
                words.add(word)
    return words


def process(inp, outp, dict_path=DEFAULT_DICT_PATH):
    """Write an emotion flag for every distinct text line of *inp* to *outp*.

    Args:
        inp: Path of the input text file, one text per line.
        outp: Path of the output file; it is overwritten. Each row is
            ``"<flag>\\t<text>"`` with flag 1 when the text contains at
            least one dictionary word, else 0. Rows follow the order in
            which each distinct text first appears in the input.
        dict_path: Path of the tab-separated emotion dictionary; only the
            first column is used. Defaults to the location the script has
            always used, relative to the current working directory.

    Side effects:
        Prints the number of distinct texts and then the number of
        dictionary words to stdout.

    Raises:
        IOError/OSError: If any of the three files cannot be opened.
    """
    texts = _load_unique_texts(inp)
    print(len(texts))

    words = _load_emotion_words(dict_path)
    print(len(words))

    # The context manager guarantees the output is flushed and closed even
    # if a write fails part-way through.
    with open(outp, 'w') as f_save:
        for text in texts:
            flag = 1 if any(word in text for word in words) else 0
            f_save.write("%s\t%s\n" % (flag, text))


if __name__ == "__main__":
    input_file = "train_2014.07.14.txt"
    output_file = "flag_train_2014.07.14.txt"
    process(input_file, output_file)