├── README.md
├── config.py
├── data
│   └── cnews.test.txt
├── data_helper.py
├── fast_text.py
├── predict.py
└── train.py

/README.md:
--------------------------------------------------------------------------------
# FastText Text Classification Based on Character Embeddings

# Directory Layout
```
-data            data files
-vocabs          vocabulary files
-runs            saved models and TensorBoard logs
-data_helper.py  data processing
-config.py       configuration
-fast_text.py    FastText model class
-train.py        model training
-predict.py      prediction
```

# Training
```
python train.py
```

# Prediction
```
python predict.py
```
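
The data file that `config.py` points to (`./data/cnews.test.txt`) is expected to hold one sample per line in `label<TAB>text` form. A minimal, illustrative sketch of that layout (the two samples and the file name `cnews.sample.txt` are made up for this example):

```python
# Illustrative only: write two fake samples in the label\ttext layout
# that data_helper.process_data expects to read.
samples = [
    ('体育', '球队在客场赢下了关键比赛'),
    ('财经', '股市今日出现明显上涨'),
]
with open('data/cnews.sample.txt', 'w', encoding='utf-8') as f:
    for label, text in samples:
        f.write('{}\t{}\n'.format(label, text))
```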
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

config = {
    'sequence_length': 300,                     # text length; longer texts are truncated
    'num_classes': 10,                          # number of classes
    'vocab_size': 5000,                         # vocabulary size
    'embedding_size': 300,                      # embedding dimension
    'device': '/cpu:0',                         # device placement
    'batch_size': 50,                           # batch size
    'num_epochs': 10,                           # number of epochs
    'evaluate_every': 100,                      # evaluate on the dev set every N steps
    'checkpoint_every': 100,                    # save a checkpoint every N steps
    'num_checkpoints': 5,                       # maximum number of checkpoints to keep
    'allow_soft_placement': True,               # allow TensorFlow to fall back to another device
    'log_device_placement': False,              # whether to log device placement to the terminal
    'train_test_dev_rate': [0.97, 0.02, 0.01],  # train / test / dev split ratios
    'data_path': './data/cnews.test.txt',       # data path; one sample per line: label\ttext
    'learning_rate': 0.003,                     # learning rate
    'vocab_path': './vocabs',                   # directory where vocabulary files are saved
}
--------------------------------------------------------------------------------
/data_helper.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from config import config
from collections import Counter
import os
import json
import numpy as np
import re


def process_data(config):
    """
    Read the data file; one sample per line, label and text separated by a tab.
    :param config:
    :return: texts (truncated to sequence_length), labels
    """
    data_path = config['data_path']
    sequence_length = config['sequence_length']
    X = []
    y = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            lis = line.strip().split('\t')
            X.append(re.sub(r'\s+', '', lis[1])[:sequence_length])
            y.append(lis[0])
    return X, y


def generate_vocab(X, y, config):
    words = []
    for sent in X:
        words.extend(list(sent))
    words = Counter(words).most_common(config['vocab_size'] - 1)

    word_to_index = {}
    index_to_word = {}
    for i in range(len(words)):
        word_to_index[words[i][0]] = i + 1
        index_to_word[int(i + 1)] = words[i][0]

    # index 0 is shared by unknown characters and padding
    word_to_index['UNK'] = 0
    index_to_word[int(0)] = 'UNK'

    label_to_index = {}
    index_to_label = {}
    labels = set(y)

    for i, label in enumerate(labels):
        label_to_index[label] = i
        index_to_label[int(i)] = label

    vocab_path = config['vocab_path']

    if not os.path.exists(vocab_path):
        os.mkdir(vocab_path)

    with open(os.path.join(vocab_path, 'word_to_index.json'), 'w', encoding='utf-8') as f:
        json.dump(word_to_index, f, ensure_ascii=False)

    with open(os.path.join(vocab_path, 'index_to_word.json'), 'w', encoding='utf-8') as f:
        json.dump(index_to_word, f, ensure_ascii=False)

    with open(os.path.join(vocab_path, 'label_to_index.json'), 'w', encoding='utf-8') as f:
        json.dump(label_to_index, f, ensure_ascii=False)

    with open(os.path.join(vocab_path, 'index_to_label.json'), 'w', encoding='utf-8') as f:
        json.dump(index_to_label, f, ensure_ascii=False)

    return word_to_index, label_to_index


def padding(X, y, config, word_to_index, label_to_index):
    sequence_length = config['sequence_length']
    num_classes = config['num_classes']
    input_x = []
    for line in X:
        temp = []
        for item in list(line):
            temp.append(word_to_index.get(item, 0))
        input_x.append(temp[:sequence_length] + [0] * (sequence_length - len(temp)))
    if not y:
        return input_x

    input_y = []
    for item in y:
        temp = [0] * num_classes
        temp[label_to_index[item]] = 1
        input_y.append(temp)
    return input_x, input_y


def split_data(input_x, input_y, config):
    rate = config['train_test_dev_rate']
    shuffle_indices = np.random.permutation(np.arange(len(input_y)))
    # print(shuffle_indices)
    # print(input_y)
    x_shuffled = np.array(input_x)[shuffle_indices]
    y_shuffled = np.array(input_y)[shuffle_indices]
    x_train, y_train = x_shuffled[: int(rate[0] * len(input_y))], y_shuffled[: int(rate[0] * len(input_y))]
    x_test, y_test = x_shuffled[int(rate[0] * len(input_y)): int(sum(rate[:2]) * len(input_y))], \
        y_shuffled[int(rate[0] * len(input_y)): int(sum(rate[:2]) * len(input_y))]
    x_dev, y_dev = x_shuffled[int(sum(rate[:2]) * len(input_y)):], y_shuffled[int(sum(rate[:2]) * len(input_y)):]
    return x_train, y_train, x_test, y_test, x_dev, y_dev


def generate_batchs(x_train, y_train, config, shuffle=True):
    data = np.array(list(zip(x_train, y_train)))
    data_size = len(data)
    num_batches_per_epoch = int((len(data) - 1) / config['batch_size']) + 1
    for epoch in range(config['num_epochs']):
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffle_data = data[shuffle_indices]
        else:
            shuffle_data = data

        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * config['batch_size']
            end_index = min((batch_num + 1) * config['batch_size'], data_size)
            yield shuffle_data[start_index: end_index]


def load_json(json_file_path):
    with open(json_file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


if __name__ == '__main__':
    generate_vocab(['abcd', 'dbgj'], [1, 0], config)
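
For orientation, a small end-to-end sketch of how these helpers chain together, mirroring what `train.py` does but on toy inputs (the texts and labels below are invented; running it writes vocabulary JSON files under `./vocabs`, exactly as training would):

```python
from config import config
from data_helper import generate_vocab, padding, split_data, generate_batchs

# toy corpus: four short texts with invented labels
X = ['今天股市大幅上涨', '主队在比赛中获胜', '新款手机正式发布', '央行宣布下调利率']
y = ['财经', '体育', '科技', '财经']

word_to_index, label_to_index = generate_vocab(X, y, config)   # also dumps ./vocabs/*.json
input_x, input_y = padding(X, y, config, word_to_index, label_to_index)
x_train, y_train, x_test, y_test, x_dev, y_dev = split_data(input_x, input_y, config)

for batch in generate_batchs(x_train, y_train, config):
    x_batch, y_batch = zip(*batch)
    # each row of x_batch holds `sequence_length` character ids (0 = UNK/padding),
    # each row of y_batch is a one-hot label of length `num_classes`
    break
```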
--------------------------------------------------------------------------------
/fast_text.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import tensorflow as tf


class FastText():
    def __init__(self, config):
        self.sequence_length = config['sequence_length']
        self.num_classes = config['num_classes']
        self.vocab_size = config['vocab_size']
        self.embedding_size = config['embedding_size']
        self.device = config['device']

        # placeholders
        self.input_x = tf.placeholder(tf.int32, [None, self.sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name='input_y')
        # kept for interface compatibility; this model applies no dropout
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        # embedding layer
        with tf.device(self.device), tf.name_scope('embedding'):
            self.W = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                name='W'
            )
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)

        # average the character vectors to get the sentence representation
        self.embedded_chars_mean = tf.reduce_mean(self.embedded_chars, axis=1)

        # final scores and predictions
        with tf.name_scope('output'):
            W = tf.get_variable(
                "W",
                shape=[self.embedding_size, self.num_classes],
                initializer=tf.contrib.layers.xavier_initializer()
            )
            b = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name='b')
            self.scores = tf.nn.xw_plus_b(self.embedded_chars_mean, W, b, name='scores')
            self.predictions = tf.argmax(self.scores, 1, name='predictions')

        # loss
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')
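
The model is deliberately simple: look up a vector for every character, average them into one sentence vector, and apply a single linear layer. A quick shape sanity-check sketch under the config above (assuming TF 1.x, as used throughout this repo):

```python
import tensorflow as tf
from config import config
from fast_text import FastText

with tf.Graph().as_default():
    model = FastText(config)
    print(model.embedded_chars.shape)       # (?, 300, 300): one embedding per character
    print(model.embedded_chars_mean.shape)  # (?, 300): averaged sentence representation
    print(model.scores.shape)               # (?, 10): one logit per class
```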
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import tensorflow as tf
from config import config
from data_helper import load_json, padding


class Predict():
    def __init__(self, config, model_path='./runs/1539678339/checkpoints/model-200',
                 word_to_index='./vocabs/word_to_index.json',
                 index_to_label='./vocabs/index_to_label.json'):
        self.word_to_index = load_json(word_to_index)
        self.index_to_label = load_json(index_to_label)

        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=config['allow_soft_placement'],
                log_device_placement=config['log_device_placement'])
            self.sess = tf.Session(config=session_conf)
            with self.sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph("{}.meta".format(model_path))
                saver.restore(self.sess, model_path)

                # Get the placeholders from the graph by name
                self.input_x = graph.get_operation_by_name("input_x").outputs[0]

                self.dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

                # Tensors we want to evaluate
                self.predictions = graph.get_operation_by_name("output/predictions").outputs[0]

    def predict(self, list_str):
        input_x = padding(list_str, None, config, self.word_to_index, None)
        feed_dict = {
            self.input_x: input_x,
            self.dropout_keep_prob: 1.0
        }
        predictions = self.sess.run(self.predictions, feed_dict=feed_dict)
        return [self.index_to_label[str(idx)] for idx in predictions]


if __name__ == '__main__':
    prediction = Predict(config)
    result = prediction.predict(
        ["""黄蜂vs湖人首发:科比带伤战保罗 加索尔救赎之战 新浪体育讯北京时间4月27日,NBA季后赛首轮洛杉矶湖人主场迎战新奥尔良黄蜂,此前的比赛中,双方战成2-2平,因此本场比赛对于两支球队来说都非常重要,赛前双方也公布了首发阵容:湖人队:费舍尔、科比、阿泰斯特、加索尔、拜纳姆黄蜂队:保罗、贝里内利、阿里扎、兰德里、奥卡福[新浪NBA官方微博][新浪NBA湖人新闻动态微博][新浪NBA专题][黄蜂vs湖人图文直播室](新浪体育)""",
"""上投摩根投资总监孙延群病休基金经理“过劳症”公开第一例:曹元“3月20日还见到他(孙延群)在一个颁奖活动上发言,没想到这么快就申请病休了。”上海一基金公司的人士看到上投摩根基金管理公司(下称上投摩根)的公告感叹着。3月25日,上投摩根基金管理有限公司公告称,投资总监孙延群先生由于健康原因需要治疗,上投摩根根据有关法规和公司制度批准孙延群先生5个月假期的申请,孙延群将因病休养至2009年8月,在此期间孙延群先生暂停履行基金经理职务。实际上,这也是基金业基金经理暂时离职而发信息披露的第一案。而此前,有许多基金经理或因出国留学、或因其他事务暂离工作岗位数月均未见向公众披露。按照现行规定,只要不是离职这些信息不需对外披露。上投摩根此番做法客观上是积极响应即将实施的信息披露新规中的相关条例。“过劳症”公开第一例孙延群2005年加入上投摩根并随后担任阿尔法股票基金经理,在吕俊辞职上投摩根后,他接任该公司投资总监。在上文提及的颁奖活动上,孙虽然双鬓发白,但看起来较有神采,因此外界对孙的突然发病感到吃惊。上投摩根的一纸公告并未披露孙延群患何疾病。该公司市场部人士称因为是孙个人隐私,不便于对外披露。但有知情人透露一个细节,孙去年就因胃病去过医院,他推测孙此次病休可能是因为旧病重犯。实际上,胃病在基金公司投资管理人员并不鲜见。深圳某合资基金公司投资总监介绍,基金投研人员的工作非常不规律,尤其是近期上市公司年报高发期,投研人员没有不加班的。而他本人也是常常半夜收到研究员的上市公司报告。他称,吃饭不规律、休息不充分导致胃病成为这个行业的“职业病”。“这也是这个行业的压力所致。”他补充到。参加3月20日理柏中国基金奖的人士还记得孙延群在会上说过这么一句话:“投资者的信任这也使我们倍感压力。”现在看来,孙的这句话不是套话。事实上基金的投研面临的压力还有多种。2008年,股市遭遇系统性风险,基金公司和其投研人员感受到了从天堂到地狱的极端变化。深圳一位基金经理描述道,2008年春节回家,当亲戚得知他是基金经理赶紧凑过来问东问西;而2009年回家,这种待遇消失了反而开起了近乎讽刺的玩笑。在网络上,投资者对基金经理的失望乃至谩骂也处处可见。这种近乎一边倒的社会舆论直接对基金经理产生压力。另一方面,每年的排名之争直接决定着基金经理在基金持有人心目中的地位;而内部的业绩考核亦决定基金投研人员的职业前景。在舞动数十亿乃至数百亿元资金的同时,伴随的是各方给予的压力。据另一基金公司人士介绍,除了胃病,基金从业人员还常伴有颈椎方面的疾病以及“三高”疾病。对于基金持有者而言,好的消息是大多基金公司建立了比较系统性的投研体系,个别人的暂时离职不会对基金的投资策略带来重大影响。现在基金公司多实行投资决策委员会作为投资策略的决策机构,投资决策由集体智慧产生,而非个人。而基金经理在选择个股和仓位的权利被削弱。不好的一方面是同一基金公司相同类型的基金策略相近,缺乏特色;好的一方面则是投资风险降低,亦不会因人员变动给基金业绩带来太大冲击。还有些公司实行双基金经理制度,这进一步避免了个别投研人员流动带来的风险。基金经理动态更透明行业内,关于基金经理的信息披露中,常以基金经理变动居多,此次孙延群暂时离开工作岗位也进行信息披露尚属行业内首次。这表明基金信息披露更加“阳光”透明。关于基金投研人员的信息披露规定最早的文件是1999年颁布的《证券投资基金信息披露指引》(下称《指引》),该《指引》于当年3月10日披露之日起实施。这纸《指引》共计16条,3000言,附带4条附件,形成了基金公司信息披露的基础框架。但该指引对基金经理的信息披露规定较为简单。其中第十二条指出基金发生重大事件,有关信息披露义务人应当于第一时间报告中国证监会,并编制临时报告书,经上市的证券交易所核准后予以公告,同时报中国证监会。该条例认为基金管理人的董事长、总经理、基金托管部的总经理变动属于重大事项。不足的是,《指引》中并没有详细规定投资管理人(其中包括基金经理)发生变动需要公告。直至2004年7月1日,《证券投资基金信息披露管理办法》(下称《办法》)正式出台,首次将基金经理的变动规定要做信息披露。《办法》全文共计8章、38条、4900字,较《指引》详实许多。该《办法》也首次规范基金经理的信息披露。其中第二十三条规定,基金发生重大事件,有关信息披露义务人应当在两日内编制临时报告书,予以公告,并在公开披露日分别报中国证监会和基金管理人主要办公场所所在地中国证监会派出机构备案。其中基金管理人的董事长、总经理及其他高级管理人员、基金经理和基金托管人基金托管部门负责人发生变动属于重大事项。但是,《办法》并未对基金高管暂时离开的情况进行规范。事实上,这种情况在基金业中一直存在,甚至发生在一些明星基金经理的身上。比如北京某大型基金公司明星基金经理曾在2002-2003年出国充电,公司未予披露;又有中邮基金投资总监彭旭去年出国半年,亦未见正式披露。2007年10月,彭在接受本报采访时曾说,“等我哪一天去学习了,我一定对外披露。但现在一切要以公告为准。”事实证明,没有制度约束,口头承诺也成一纸空文。3月25日上投摩根披露投资总监孙延群休假5个月在业内算是首创。业内认为,上投摩根首开先例也是为了主动迎合了即将于4月1日实施的基金业信息披露新规。今年3月17日,证监会发布修订后的《基金管理公司投资管理人员管理指导意见》(下称意见),该意见自2009年4月1日起施行。《意见》第三十六条明确规定:投资管理人员“拟离开工作岗位1个月以上”,督察长应当在知悉该信息之日起3个工作日内,向中国证监会相关派出机构报告。《意见》中也明确说明“投资管理人员”是指公司负责基金投资、研究、交易的人员以及实际履行相应职责的人员,涵盖了基金经理和投资总监。 48 | """]) 49 | print(result) 50 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from fast_text import FastText 5 | from config import config 6 | import time 7 | import os 8 | import datetime 9 | import data_helper 10 | import json 11 | 12 | 13 | def train(config): 14 | print('parameters: ') 15 | print(json.dumps(config, indent=4, ensure_ascii=False)) 16 | 17 | # load data 18 | print('load data .....') 19 | X, y = data_helper.process_data(config) 20 | 21 | # make vocab 22 | print('make vocab .....') 23 | word_to_index, label_to_index = data_helper.generate_vocab(X, y, config) 24 | 25 | # padding data 26 | print('padding data .....') 27 | input_x, input_y = data_helper.padding(X, y, config, word_to_index, label_to_index) 28 | 29 | # split data 30 | print('split data .....') 31 | x_train, y_train, x_test, y_test, x_dev, y_dev = data_helper.split_data(input_x, input_y, config) 32 | 33 | print('length train: {}'.format(len(x_train))) 34 | 
    print('length test: {}'.format(len(x_test)))
    print('length dev: {}'.format(len(x_dev)))

    print('training .....')
    with tf.Graph().as_default():
        sess_config = tf.ConfigProto(
            allow_soft_placement=config['allow_soft_placement'],
            log_device_placement=config['log_device_placement']
        )
        with tf.Session(config=sess_config) as sess:
            fast_text = FastText(config)

            # training procedure
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.AdamOptimizer(config['learning_rate'])
            grads_and_vars = optimizer.compute_gradients(fast_text.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # keep track of gradient values and sparsity
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram('{}/grad/hist'.format(v.name), g)
                    sparsity_summary = tf.summary.scalar('{}/grad/sparsity'.format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # output dir for models and summaries
            timestamp = str(int(time.time()))
            outdir = os.path.abspath(os.path.join(os.path.curdir, 'runs', timestamp))
            print('writing to {}'.format(outdir))

            # summary for loss and accuracy
            loss_summary = tf.summary.scalar('loss', fast_text.loss)
            acc_summary = tf.summary.scalar('accuracy', fast_text.accuracy)

            # train summary
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(outdir, 'summaries', 'train')
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # dev summary
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(outdir, 'summaries', 'dev')
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # checkpoint directory
            checkpoint_dir = os.path.abspath(os.path.join(outdir, 'checkpoints'))
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            if not os.path.exists(checkpoint_dir):
                os.mkdir(checkpoint_dir)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=config['num_checkpoints'])

            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                # dropout_keep_prob is not fed here because FastText does not use it
                feed_dict = {
                    fast_text.input_x: x_batch,
                    fast_text.input_y: y_batch,
                }

                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, fast_text.loss, fast_text.accuracy],
                    feed_dict=feed_dict
                )

                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                feed_dic = {
                    fast_text.input_x: x_batch,
                    fast_text.input_y: y_batch,
                    fast_text.dropout_keep_prob: 1.0
                }

                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, fast_text.loss, fast_text.accuracy],
                    feed_dict=feed_dic
                )

                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # generate batches
            batches = data_helper.generate_batchs(x_train, y_train, config)
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % config['evaluate_every'] == 0:
                    print('Evaluation:')
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)

                if current_step % config['checkpoint_every'] == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print('save model checkpoint to {}'.format(path))

            # test accuracy
            test_accuracy = sess.run([fast_text.accuracy], feed_dict={
                fast_text.input_x: x_test, fast_text.input_y: y_test, fast_text.dropout_keep_prob: 1.0})
            print('Test dataset accuracy: {}'.format(test_accuracy))


if __name__ == '__main__':
    train(config)
--------------------------------------------------------------------------------
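
The checkpoint path baked into `predict.py` (`./runs/1539678339/checkpoints/model-200`) belongs to one particular training run. A minimal sketch of pointing `Predict` at the newest checkpoint produced by `train.py` instead (assumes at least one run directory exists under `./runs`; the example input text is made up):

```python
import os
import tensorflow as tf
from config import config
from predict import Predict

latest_run = sorted(os.listdir('./runs'))[-1]                # run directories are named by timestamp
checkpoint_dir = os.path.join('./runs', latest_run, 'checkpoints')
model_path = tf.train.latest_checkpoint(checkpoint_dir)      # e.g. .../checkpoints/model-200
predictor = Predict(config, model_path=model_path)
print(predictor.predict(['一段待分类的中文新闻文本']))
```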