├── BERT
│   ├── BERT+CNN.py
│   ├── BERT+CNNLSTM.py
│   ├── BERT+RCNN.py
│   ├── Bert_LR.py
│   ├── base_model.py
│   └── models.py
├── BiLSTM
│   ├── BiLSTM+glove+Sentiment.py
│   └── BiLSTM+random+Sentiment.py
├── CNN
│   ├── CNN+glove+Senntiment.py
│   └── CNN+random+Sentiment.py
├── CNN_LSTM
│   ├── CNNLSTM+Random+Sentiment.py
│   └── CNNLSTM+glove+Sentiment.py
├── LSTM
│   ├── LSTM+glove+Sentiment.py
│   └── LSTM+random+Sentiment.py
└── Transformer_ATT
    ├── Transformer_ATT_sentiment.py
    └── Transformer_Attention.py
/BERT/BERT+CNN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: BERT+CNN
5 | Description :
6 | Author : Stephen.Lau
7 | date: 2019/3/25
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/3/25:
11 | -------------------------------------------------
12 | """
13 | 
14 | from kashgari.embeddings import BERTEmbedding
15 | from models import CNNModel
16 | import jieba
17 | from tqdm import tqdm
18 | import keras
19 | #每1000次更新一次
20 | # tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
21 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs/cnn_bert', update_freq=1000)
22 | 
23 | 
24 | def read_neg_data(dataset_path):
25 |     x_list = []
26 |     y_list = []
27 |     lines = open(dataset_path, 'r', encoding='utf-8').readlines()
28 |     for line in tqdm(lines):
29 |         line = line.strip()
30 |         if len(line) > 1:
31 |             label = '0'
32 |             y_list.append(label)
33 |             seg_text = list(jieba.cut(line))
34 |             # print(seg_text)
35 |             x_list.append(seg_text)
36 |         else:
37 |             continue
38 |     return x_list, y_list
39 | 
40 | def read_pos_data(dataset_path):
41 |     x_list = []
42 |     y_list = []
43 |     lines = open(dataset_path, 'r', encoding='utf-8').readlines()
44 |     for line in tqdm(lines):
45 |         line = line.strip()
46 |         if len(line) > 1:
47 |             label = '1'
48 |             y_list.append(label)
49 |             seg_text = list(jieba.cut(line))
50 |             # print(seg_text)
51 |             x_list.append(seg_text)
52 |         else:
53 |             continue
54 |     return x_list, y_list
55 | 
56 | def concate_data(pos_x, pos_y, neg_x, neg_y):
57 |     data_x = []
58 |     data_y = []
59 |     for i in range(len(pos_x)):
60 |         data_x.append(pos_x[i])
61 |         data_y.append(pos_y[i])
62 |     for j in range(len(neg_x)):
63 |         data_x.append(neg_x[j])
64 |         data_y.append(neg_y[j])
65 |     return data_x, data_y
66 | 
67 | 
68 | def train():
69 |     pos_data_path = '../dataset/weibo60000/pos60000_utf8.txt_updated'
70 |     pos_x, pos_y = read_pos_data(pos_data_path)
71 |     print(len(pos_x))
72 |     print(len(pos_y))
73 |     # print(pos_y)
74 | 
75 |     neg_data_path = '../dataset/weibo60000/neg60000_utf8.txt_updated'
76 |     neg_x, neg_y = read_neg_data(neg_data_path)
77 |     print(len(neg_x))
78 |     print(len(neg_y))
79 |     # print(neg_y)
80 | 
81 |     train_pos_x = pos_x[:41025]
82 |     train_pos_y = pos_y[:41025]
83 |     val_pos_x = pos_x[41025:52746]
84 |     val_pos_y = pos_y[41025:52746]
85 |     test_pos_x = pos_x[52746:]
86 |     test_pos_y = pos_y[52746:]
87 | 
88 |     train_neg_x = neg_x[:41165]
89 |     train_neg_y = neg_y[:41165]
90 |     val_neg_x = neg_x[41165:52926]
91 |     val_neg_y = neg_y[41165:52926]
92 |     test_neg_x = neg_x[52926:]
93 |     test_neg_y = neg_y[52926:]
94 | 
95 |     train_x, train_y = concate_data(train_pos_x, train_pos_y, train_neg_x, train_neg_y)
96 |     val_x, val_y = concate_data(val_pos_x, val_pos_y, val_neg_x, val_neg_y)
97 |     test_x, test_y = concate_data(test_pos_x, test_pos_y, test_neg_x, 
test_neg_y) 98 | 99 | print('The number of train-set:', len(train_x)) 100 | # print(len(train_y)) 101 | print('The number of val-set:', len(val_x)) 102 | # print(len(val_y)) 103 | print('The number of test-set:', len(test_x)) 104 | # print(len(test_y)) 105 | 106 | embedding = BERTEmbedding('../dataset/chinese_L-12_H-768_A-12', sequence_length=100) 107 | print('embedding_size', embedding.embedding_size) 108 | # print(embedding.model.output 109 | 110 | model = CNNModel(embedding) 111 | model.fit(train_x, train_y, val_x, val_y, batch_size=128, epochs=20, fit_kwargs={'callbacks': [tf_board_callback]}) 112 | model.evaluate(test_x, test_y) 113 | model.save('./model/cnn_bert_model') 114 | 115 | if __name__ == '__main__': 116 | train() 117 | -------------------------------------------------------------------------------- /BERT/BERT+CNNLSTM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: BERT+CNNLSTM 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | 14 | from kashgari.embeddings import BERTEmbedding 15 | from models import CNNLSTMModel 16 | import jieba 17 | from tqdm import tqdm 18 | import keras 19 | #每1000次更新一次 20 | # tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 21 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs/cnnlstm_bert', update_freq=1000) 22 | 23 | 24 | def read_neg_data(dataset_path): 25 | x_list = [] 26 | y_list = [] 27 | lines = open(dataset_path, 'r', encoding='utf-8').readlines() 28 | for line in tqdm(lines): 29 | line = line.strip() 30 | if len(line) > 1: 31 | label = '0' 32 | y_list.append(label) 33 | seg_text = list(jieba.cut(line)) 34 | # print(seg_text) 35 | x_list.append(seg_text) 36 | else: 37 | continue 38 | return x_list, y_list 39 | 40 | def read_pos_data(dataset_path): 41 | x_list = [] 42 | y_list = [] 43 | lines = open(dataset_path, 'r', encoding='utf-8').readlines() 44 | for line in tqdm(lines): 45 | line = line.strip() 46 | if len(line) > 1: 47 | label = '1' 48 | y_list.append(label) 49 | seg_text = list(jieba.cut(line)) 50 | # print(seg_text) 51 | x_list.append(seg_text) 52 | else: 53 | continue 54 | return x_list, y_list 55 | 56 | def concate_data(pos_x, pos_y, neg_x, neg_y): 57 | data_x = [] 58 | data_y = [] 59 | for i in range(len(pos_x)): 60 | data_x.append(pos_x[i]) 61 | data_y.append(pos_y[i]) 62 | for j in range(len(neg_x)): 63 | data_x.append(neg_x[j]) 64 | data_y.append(neg_y[j]) 65 | return data_x, data_y 66 | 67 | 68 | def train(): 69 | pos_data_path = '../dataset/weibo60000/pos60000_utf8.txt_updated' 70 | pos_x, pos_y = read_pos_data(pos_data_path) 71 | print(len(pos_x)) 72 | print(len(pos_y)) 73 | # print(pos_y) 74 | 75 | neg_data_path = '../dataset/weibo60000/neg60000_utf8.txt_updated' 76 | neg_x, neg_y = read_neg_data(neg_data_path) 77 | print(len(neg_x)) 78 | print(len(neg_y)) 79 | # print(neg_y) 80 | 81 | train_pos_x = pos_x[:41025] 82 | train_pos_y = pos_y[:41025] 83 | val_pos_x = pos_x[41025:52746] 84 | val_pos_y = pos_y[41025:52746] 85 | test_pos_x = pos_x[52746:] 86 | test_pos_y = pos_y[52746:] 87 | 88 | train_neg_x = neg_x[:41165] 89 | train_neg_y = neg_y[:41165] 90 | 
val_neg_x = neg_x[41165:52926] 91 | val_neg_y = neg_y[41165:52926] 92 | test_neg_x = neg_x[52926:] 93 | test_neg_y = neg_y[52926:] 94 | 95 | train_x, train_y = concate_data(train_pos_x, train_pos_y, train_neg_x, train_neg_y) 96 | val_x, val_y = concate_data(val_pos_x, val_pos_y, val_neg_x, val_neg_y) 97 | test_x, test_y = concate_data(test_pos_x, test_pos_y, test_neg_x, test_neg_y) 98 | 99 | print('The number of train-set:', len(train_x)) 100 | # print(len(train_y)) 101 | print('The number of val-set:', len(val_x)) 102 | # print(len(val_y)) 103 | print('The number of test-set:', len(test_x)) 104 | # print(len(test_y)) 105 | 106 | embedding = BERTEmbedding('../dataset/chinese_L-12_H-768_A-12', sequence_length=100) 107 | print('embedding_size', embedding.embedding_size) 108 | # print(embedding.model.output 109 | 110 | model = CNNLSTMModel(embedding) 111 | model.fit(train_x, train_y, val_x, val_y, batch_size=128, epochs=20, fit_kwargs={'callbacks': [tf_board_callback]}) 112 | model.evaluate(test_x, test_y) 113 | model.save('./model/cnnlstm_bert_model') 114 | 115 | if __name__ == '__main__': 116 | train() 117 | -------------------------------------------------------------------------------- /BERT/BERT+RCNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: BERT+AV 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | from kashgari.embeddings import BERTEmbedding 14 | from models import RCNNModel 15 | import jieba 16 | from tqdm import tqdm 17 | import keras 18 | #每1000次更新一次 19 | # tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 20 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs/rcnn_bert', update_freq=1000) 21 | 22 | def read_neg_data(dataset_path): 23 | x_list = [] 24 | y_list = [] 25 | lines = open(dataset_path, 'r', encoding='utf-8').readlines() 26 | for line in tqdm(lines): 27 | line = line.strip() 28 | if len(line) > 1: 29 | label = '0' 30 | y_list.append(label) 31 | seg_text = list(jieba.cut(line)) 32 | # print(seg_text) 33 | x_list.append(seg_text) 34 | else: 35 | continue 36 | return x_list, y_list 37 | 38 | def read_pos_data(dataset_path): 39 | x_list = [] 40 | y_list = [] 41 | lines = open(dataset_path, 'r', encoding='utf-8').readlines() 42 | for line in tqdm(lines): 43 | line = line.strip() 44 | if len(line) > 1: 45 | label = '1' 46 | y_list.append(label) 47 | seg_text = list(jieba.cut(line)) 48 | # print(seg_text) 49 | x_list.append(seg_text) 50 | else: 51 | continue 52 | return x_list, y_list 53 | 54 | def concate_data(pos_x, pos_y, neg_x, neg_y): 55 | data_x = [] 56 | data_y = [] 57 | for i in range(len(pos_x)): 58 | data_x.append(pos_x[i]) 59 | data_y.append(pos_y[i]) 60 | for j in range(len(neg_x)): 61 | data_x.append(neg_x[j]) 62 | data_y.append(neg_y[j]) 63 | return data_x, data_y 64 | 65 | 66 | def train(): 67 | pos_data_path = '../dataset/weibo60000/pos60000_utf8.txt_updated' 68 | pos_x, pos_y = read_pos_data(pos_data_path) 69 | print(len(pos_x)) 70 | print(len(pos_y)) 71 | # print(pos_y) 72 | 73 | neg_data_path = '../dataset/weibo60000/neg60000_utf8.txt_updated' 74 | neg_x, neg_y = 
read_neg_data(neg_data_path) 75 | print(len(neg_x)) 76 | print(len(neg_y)) 77 | # print(neg_y) 78 | 79 | train_pos_x = pos_x[:41025] 80 | train_pos_y = pos_y[:41025] 81 | val_pos_x = pos_x[41025:52746] 82 | val_pos_y = pos_y[41025:52746] 83 | test_pos_x = pos_x[52746:] 84 | test_pos_y = pos_y[52746:] 85 | 86 | train_neg_x = neg_x[:41165] 87 | train_neg_y = neg_y[:41165] 88 | val_neg_x = neg_x[41165:52926] 89 | val_neg_y = neg_y[41165:52926] 90 | test_neg_x = neg_x[52926:] 91 | test_neg_y = neg_y[52926:] 92 | 93 | train_x, train_y = concate_data(train_pos_x, train_pos_y, train_neg_x, train_neg_y) 94 | val_x, val_y = concate_data(val_pos_x, val_pos_y, val_neg_x, val_neg_y) 95 | test_x, test_y = concate_data(test_pos_x, test_pos_y, test_neg_x, test_neg_y) 96 | 97 | print('The number of train-set:', len(train_x)) 98 | # print(len(train_y)) 99 | print('The number of val-set:', len(val_x)) 100 | # print(len(val_y)) 101 | print('The number of test-set:', len(test_x)) 102 | # print(len(test_y)) 103 | 104 | embedding = BERTEmbedding('../dataset/chinese_L-12_H-768_A-12', sequence_length=100) 105 | print('embedding_size', embedding.embedding_size) 106 | # print(embedding.model.output 107 | 108 | model = RCNNModel(embedding) 109 | model.fit(train_x, train_y, val_x, val_y, batch_size=128, epochs=20, fit_kwargs={'callbacks': [tf_board_callback]}) 110 | model.evaluate(test_x, test_y) 111 | model.save('./model/rcnn_bert_model') 112 | 113 | if __name__ == '__main__': 114 | train() 115 | 116 | -------------------------------------------------------------------------------- /BERT/Bert_LR.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: test 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/22 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/22: 11 | ------------------------------------------------- 12 | """ 13 | 14 | """使用Bert-encode+LogisticRegression进行分类""" 15 | import gensim 16 | from sklearn.linear_model import LogisticRegression 17 | import pandas as pd 18 | from sklearn.model_selection import train_test_split 19 | import numpy as np 20 | import jieba 21 | from gensim.test.utils import common_texts 22 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 23 | import tensorflow as tf 24 | from bert_serving.client import BertClient 25 | 26 | tf.flags.DEFINE_string('positive_data_file', './weibo60000/pos60000_utf8.txt', 'Data source for the positive data') 27 | tf.flags.DEFINE_string('negative_data_file', './weibo60000/neg60000_utf8.txt', 'Data source for the negative data') 28 | FLAGS = tf.flags.FLAGS 29 | 30 | """从文件中读取数据和标签""" 31 | def load_data_and_label(pos_filename, neg_filename): 32 | """读取积极类别的数据""" 33 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 34 | # print(positive_texts) 35 | # positive_texts = open(positive_filename, 'rb').readlines() 36 | positive_texts = [line for line in positive_texts] 37 | print('积极句子数目:', len(positive_texts)) 38 | # print(len(positive_texts)) 39 | """读取消极类别的数据""" 40 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 41 | # negative_texts = open(positive_filename, 'rb').readlines() 42 | negative_texts = [line for line in negative_texts] 43 | print('消极句子数目:', len(negative_texts)) 44 | 45 | """拼接""" 46 | x_text = positive_texts + negative_texts 47 | # print(x_text) 48 | print('全部句子数目:', len(x_text)) 49 | 50 | """生成标签""" 51 | 
    positive_labels = [1 for _ in positive_texts]
52 |     negative_labels = [0 for _ in negative_texts]
53 |     y = np.concatenate([positive_labels, negative_labels], 0)
54 |     print('标签数目:', len(y))
55 |     # print(y)
56 |     # for mat in y:
57 |     # print(mat)
58 |     return [x_text, y]
59 | 
60 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file)
61 | 
62 | # documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(x_text)]
63 | # model = Doc2Vec(documents, size=100, window=8, min_count=100, workers=8)
64 | 
65 | bc = BertClient(ip='192.168.2.17')
66 | model = bc.encode(x_text)
67 | 
68 | #生成文本向量
69 | print(model[1])
70 | # print(type(model.docvecs[1]))
71 | # print(type(model.docvecs))
72 | 
73 | 
74 | #使用逻辑回归进行预测
75 | def LR():
76 |     clf = LogisticRegression()
77 |     return clf
78 | def getRecognitionRate(testPre, testClass):
79 |     testNum = len(testPre)
80 |     rightNum = 0
81 |     for i in range(0, testNum):
82 |         if testClass[i] == testPre[i]:
83 |             rightNum += 1
84 |     return float(rightNum) / float(testNum)
85 | 
86 | def getData():
87 |     #生成pandas
88 |     # tigs = []
89 |     data_dict = {}
90 | 
91 |     for i in range(len(model)):
92 |         data_dict['p' + str(i)] = model[i]
93 |     # print(tigs)
94 |     print(data_dict)
95 |     data = pd.DataFrame(data_dict)
96 |     data = data.T
97 |     # data['class0'] = tigs
98 |     X_train1, X_test1, y_train1, y_test1 = train_test_split(data, y, test_size=0.4, random_state=0)
99 |     return X_train1, y_train1, X_test1, y_test1
100 | 
101 | T = getData()
102 | trainMatrix, trainClass, testMatrix, testClass = T[0], T[1], T[2], T[3]
103 | clf_LR=LR()
104 | clf_LR.fit(trainMatrix, trainClass)
105 | print('Logistic Regression recognition rate: ', getRecognitionRate(clf_LR.predict(testMatrix), testClass))
--------------------------------------------------------------------------------
/BERT/base_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: base_model
5 | Description :
6 | Author : Stephen.Lau
7 | date: 2019/3/25
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/3/25:
11 | -------------------------------------------------
12 | """
13 | 
14 | # encoding: utf-8
15 | """
16 | @author: BrikerMan
17 | @contact: eliyar917@gmail.com
18 | @blog: https://eliyar.biz
19 | 
20 | @version: 1.0
21 | @license: Apache Licence
22 | @file: base_model.py
23 | @time: 2019-01-19 11:50
24 | 
25 | """
26 | import logging
27 | import random
28 | from typing import Tuple, Dict
29 | 
30 | import numpy as np
31 | from keras.preprocessing import sequence
32 | from keras.utils import to_categorical
33 | from sklearn import metrics
34 | from sklearn.utils import class_weight as class_weight_calculte
35 | from sklearn.preprocessing import MultiLabelBinarizer
36 | 
37 | from kashgari import macros as k
38 | from kashgari.tasks.base import BaseModel
39 | from kashgari.embeddings import BaseEmbedding
40 | from kashgari.type_hints import *
41 | 
42 | 
43 | class ClassificationModel(BaseModel):
44 | 
45 |     def __init__(self,
46 |                  embedding: BaseEmbedding = None,
47 |                  hyper_parameters: Dict = None,
48 |                  multi_label: bool = False,
49 |                  **kwargs):
50 |         """
51 | 
52 |         :param embedding:
53 |         :param hyper_parameters:
54 |         :param multi_label:
55 |         :param kwargs:
56 |         """
57 |         super(ClassificationModel, self).__init__(embedding, hyper_parameters, **kwargs)
58 |         self.multi_label = multi_label
59 |         self.multi_label_binarizer: MultiLabelBinarizer = None
60 | 
61 | 
if self.multi_label: 62 | if not hyper_parameters or \ 63 | hyper_parameters.get('compile_params', {}).get('loss') is None: 64 | self.hyper_parameters['compile_params']['loss'] = 'binary_crossentropy' 65 | else: 66 | logging.warning('recommend to use binary_crossentropy loss for multi_label task') 67 | 68 | if not hyper_parameters or \ 69 | hyper_parameters.get('compile_params', {}).get('metrics') is None: 70 | self.hyper_parameters['compile_params']['metrics'] = ['categorical_accuracy'] 71 | else: 72 | logging.warning('recommend to use categorical_accuracy metrivs for multi_label task') 73 | 74 | if not hyper_parameters or \ 75 | hyper_parameters.get('activation_layer', {}).get('sigmoid') is None: 76 | self.hyper_parameters['activation_layer']['activation'] = 'sigmoid' 77 | else: 78 | logging.warning('recommend to use sigmoid activation for multi_label task') 79 | 80 | def info(self): 81 | info = super(ClassificationModel, self).info() 82 | info['model_info']['multi_label'] = self.multi_label 83 | return info 84 | 85 | @property 86 | def label2idx(self) -> Dict[str, int]: 87 | return self._label2idx 88 | 89 | @property 90 | def token2idx(self) -> Dict[str, int]: 91 | return self.embedding.token2idx 92 | 93 | @label2idx.setter 94 | def label2idx(self, value): 95 | self._label2idx = value 96 | self._idx2label = dict([(val, key) for (key, val) in value.items()]) 97 | 98 | def build_model(self): 99 | """ 100 | build model function 101 | :return: 102 | """ 103 | raise NotImplementedError() 104 | 105 | @classmethod 106 | def load_model(cls, model_path: str): 107 | agent: ClassificationModel = super(ClassificationModel, cls).load_model(model_path) 108 | agent.multi_label = agent.model_info.get('multi_label', False) 109 | if agent.multi_label: 110 | keys = list(agent.label2idx.keys()) 111 | agent.multi_label_binarizer = MultiLabelBinarizer(classes=keys) 112 | agent.multi_label_binarizer.fit(keys[0]) 113 | return agent 114 | 115 | def build_token2id_label2id_dict(self, 116 | x_train: List[List[str]], 117 | y_train: List[str], 118 | x_validate: List[List[str]] = None, 119 | y_validate: List[str] = None): 120 | if x_validate: 121 | x_data = x_train + x_validate 122 | y_data = y_train + y_validate 123 | else: 124 | x_data = x_train 125 | y_data = y_train 126 | self.embedding.build_token2idx_dict(x_data, 3) 127 | 128 | if self.multi_label: 129 | label_set = set() 130 | for i in y_data: 131 | label_set = label_set.union(list(i)) 132 | else: 133 | label_set = set(y_data) 134 | 135 | label2idx = {} 136 | for idx, label in enumerate(label_set): 137 | label2idx[label] = idx 138 | self._label2idx = label2idx 139 | self._idx2label = dict([(val, key) for (key, val) in label2idx.items()]) 140 | self.multi_label_binarizer = MultiLabelBinarizer(classes=list(self.label2idx.keys())) 141 | 142 | def convert_label_to_idx(self, label: Union[List[str], str]) -> Union[List[int], int]: 143 | if isinstance(label, str): 144 | return self.label2idx[label] 145 | else: 146 | return [self.label2idx[l] for l in label] 147 | 148 | def convert_idx_to_label(self, token: Union[List[int], int]) -> Union[List[str], str]: 149 | if isinstance(token, int): 150 | return self._idx2label[token] 151 | else: 152 | return [self._idx2label[l] for l in token] 153 | 154 | def get_data_generator(self, 155 | x_data: List[List[str]], 156 | y_data: List[str], 157 | batch_size: int = 64, 158 | is_bert: bool = False): 159 | while True: 160 | page_list = list(range((len(x_data) // batch_size) + 1)) 161 | random.shuffle(page_list) 162 | for page in 
page_list: 163 | start_index = page * batch_size 164 | end_index = start_index + batch_size 165 | target_x = x_data[start_index: end_index] 166 | target_y = y_data[start_index: end_index] 167 | if len(target_x) == 0: 168 | target_x = x_data[0: batch_size] 169 | target_y = y_data[0: batch_size] 170 | 171 | tokenized_x = self.embedding.tokenize(target_x) 172 | 173 | padded_x = sequence.pad_sequences(tokenized_x, 174 | maxlen=self.embedding.sequence_length, 175 | padding='post') 176 | 177 | if self.multi_label: 178 | padded_y = self.multi_label_binarizer.fit_transform(target_y) 179 | else: 180 | tokenized_y = self.convert_label_to_idx(target_y) 181 | padded_y = to_categorical(tokenized_y, 182 | num_classes=len(self.label2idx), 183 | dtype=np.int) 184 | if is_bert: 185 | padded_x_seg = np.zeros(shape=(len(padded_x), self.embedding.sequence_length)) 186 | x_input_data = [padded_x, padded_x_seg] 187 | else: 188 | x_input_data = padded_x 189 | yield (x_input_data, padded_y) 190 | 191 | def fit(self, 192 | x_train: List[List[str]], 193 | y_train: Union[List[str], List[List[str]], List[Tuple[str]]], 194 | x_validate: List[List[str]] = None, 195 | y_validate: Union[List[str], List[List[str]], List[Tuple[str]]] = None, 196 | batch_size: int = 64, 197 | epochs: int = 5, 198 | class_weight: bool = False, 199 | fit_kwargs: Dict = None, 200 | **kwargs): 201 | """ 202 | 203 | :param x_train: list of training data. 204 | :param y_train: list of training target label data. 205 | :param x_validate: list of validation data. 206 | :param y_validate: list of validation target label data. 207 | :param batch_size: batch size for trainer model 208 | :param epochs: Number of epochs to train the model. 209 | :param class_weight: set class weights for imbalanced classes 210 | :param fit_kwargs: additional kwargs to be passed to 211 | :func:`~keras.models.Model.fit` 212 | :param kwargs: 213 | :return: 214 | """ 215 | assert len(x_train) == len(y_train) 216 | self.build_token2id_label2id_dict(x_train, y_train, x_validate, y_validate) 217 | 218 | if len(x_train) < batch_size: 219 | batch_size = len(x_train) // 2 220 | 221 | if not self.model: 222 | if self.embedding.sequence_length == 0: 223 | self.embedding.sequence_length = sorted([len(x) for x in x_train])[int(0.95 * len(x_train))] 224 | logging.info('sequence length set to {}'.format(self.embedding.sequence_length)) 225 | self.build_model() 226 | 227 | train_generator = self.get_data_generator(x_train, 228 | y_train, 229 | batch_size, 230 | is_bert=self.embedding.is_bert) 231 | 232 | if fit_kwargs is None: 233 | fit_kwargs = {} 234 | 235 | if x_validate: 236 | validation_generator = self.get_data_generator(x_validate, 237 | y_validate, 238 | batch_size, 239 | is_bert=self.embedding.is_bert) 240 | fit_kwargs['validation_data'] = validation_generator 241 | fit_kwargs['validation_steps'] = max(len(x_validate) // batch_size, 1) 242 | 243 | if class_weight: 244 | y_list = self.convert_label_to_idx(y_train) 245 | class_weights = class_weight_calculte.compute_class_weight('balanced', 246 | np.unique(y_list), 247 | y_list) 248 | else: 249 | class_weights = None 250 | 251 | self.model.fit_generator(train_generator, 252 | steps_per_epoch=len(x_train) // batch_size, 253 | epochs=epochs, 254 | class_weight=class_weights, 255 | **fit_kwargs) 256 | 257 | def _format_output_dic(self, words: List[str], res: np.ndarray): 258 | results = sorted(list(enumerate(res)), key=lambda x: -x[1]) 259 | candidates = [] 260 | for result in results: 261 | candidates.append({ 262 | 'name': 
self.convert_idx_to_label([result[0]])[0], 263 | 'confidence': float(result[1]), 264 | }) 265 | 266 | data = { 267 | 'words': words, 268 | 'class': candidates[0], 269 | 'class_candidates': candidates 270 | } 271 | return data 272 | 273 | def predict(self, 274 | sentence: Union[List[str], List[List[str]]], 275 | batch_size=None, 276 | output_dict=False, 277 | multi_label_threshold=0.6, 278 | debug_info=False) -> Union[List[str], str, List[Dict], Dict]: 279 | """ 280 | predict with model 281 | :param sentence: single sentence as List[str] or list of sentence as List[List[str]] 282 | :param batch_size: predict batch_size 283 | :param output_dict: return dict with result with confidence 284 | :param multi_label_threshold: 285 | :param debug_info: print debug info using logging.debug when True 286 | :return: 287 | """ 288 | tokens = self.embedding.tokenize(sentence) 289 | is_list = not isinstance(sentence[0], str) 290 | if is_list: 291 | padded_tokens = sequence.pad_sequences(tokens, 292 | maxlen=self.embedding.sequence_length, 293 | padding='post') 294 | else: 295 | padded_tokens = sequence.pad_sequences([tokens], 296 | maxlen=self.embedding.sequence_length, 297 | padding='post') 298 | if self.embedding.is_bert: 299 | x = [padded_tokens, np.zeros(shape=(len(padded_tokens), self.embedding.sequence_length))] 300 | else: 301 | x = padded_tokens 302 | res = self.model.predict(x, batch_size=batch_size) 303 | 304 | if self.multi_label: 305 | if debug_info: 306 | logging.info('raw output: {}'.format(res)) 307 | res[res >= multi_label_threshold] = 1 308 | res[res < multi_label_threshold] = 0 309 | predict_result = res 310 | else: 311 | predict_result = res.argmax(-1) 312 | 313 | if debug_info: 314 | logging.info('input: {}'.format(x)) 315 | logging.info('output: {}'.format(res)) 316 | logging.info('output argmax: {}'.format(predict_result)) 317 | 318 | if output_dict: 319 | if is_list: 320 | words_list: List[List[str]] = sentence 321 | else: 322 | words_list: List[List[str]] = [sentence] 323 | results = [] 324 | for index in range(len(words_list)): 325 | results.append(self._format_output_dic(words_list[index], res[index])) 326 | if is_list: 327 | return results 328 | else: 329 | return results[0] 330 | else: 331 | if self.multi_label: 332 | results = self.multi_label_binarizer.inverse_transform(predict_result) 333 | else: 334 | results = self.convert_idx_to_label(predict_result) 335 | if is_list: 336 | return results 337 | else: 338 | return results[0] 339 | 340 | def evaluate(self, x_data, y_data, batch_size=None, digits=4, debug_info=False) -> Tuple[float, float, Dict]: 341 | y_pred = self.predict(x_data, batch_size=batch_size) 342 | #report = metrics.classification_report(y_data, y_pred, output_dict=True, digits=digits) 343 | #report = metrics.classification_report(y_data, y_pred, digits=digits) 344 | print(metrics.classification_report(y_data, y_pred, digits=digits)) 345 | if debug_info: 346 | for index in random.sample(list(range(len(x_data))), 5): 347 | logging.debug('------ sample {} ------'.format(index)) 348 | logging.debug('x : {}'.format(x_data[index])) 349 | logging.debug('y : {}'.format(y_data[index])) 350 | logging.debug('y_pred : {}'.format(y_pred[index])) 351 | #return report -------------------------------------------------------------------------------- /BERT/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: models 5 | Description : 6 | 
Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | # encoding: utf-8 14 | """ 15 | @author: Alex 16 | @contact: ialexwwang@gmail.com 17 | 18 | @version: 0.1 19 | @license: Apache Licence 20 | @file: deep_models.py 21 | @time: 2019-02-21 17:54 22 | 23 | @Reference: https://github.com/zake7749/DeepToxic/blob/master/sotoxic/models/keras/model_zoo.py 24 | """ 25 | # from __future__ import absolute_import, division 26 | 27 | import keras 28 | from keras.layers import Bidirectional, Conv1D 29 | from keras.layers import Dense, Lambda, Flatten 30 | from keras.layers import Dropout, SpatialDropout1D 31 | from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, MaxPooling1D 32 | from keras.layers import concatenate 33 | from keras.models import Model 34 | 35 | from kashgari.layers import AttentionWeightedAverage, KMaxPooling, LSTMLayer, GRULayer 36 | from kashgari.tasks.classification.base_model import ClassificationModel 37 | 38 | 39 | 40 | class CNNModel(ClassificationModel): 41 | __architect_name__ = 'CNNModel' 42 | __base_hyper_parameters__ = { 43 | 'conv1d_layer': { 44 | 'filters': 128, 45 | 'kernel_size': 5, 46 | 'activation': 'relu' 47 | }, 48 | 'max_pool_layer': {}, 49 | 'dense_1_layer': { 50 | 'units': 64, 51 | 'activation': 'relu' 52 | }, 53 | 'activation_layer': { 54 | 'activation': 'softmax' 55 | }, 56 | 'optimizer': { 57 | 'module': 'keras.optimizers', 58 | 'name': 'Adam', 59 | 'params': { 60 | 'lr': 1e-3, 61 | 'decay': 0.0 62 | } 63 | }, 64 | 'compile_params': { 65 | 'loss': 'categorical_crossentropy', 66 | # 'optimizer': 'adam', 67 | 'metrics': ['accuracy'] 68 | } 69 | } 70 | 71 | def build_model(self): 72 | base_model = self.embedding.model 73 | conv1d_layer = Conv1D(**self.hyper_parameters['conv1d_layer'])(base_model.output) 74 | max_pool_layer = GlobalMaxPooling1D(**self.hyper_parameters['max_pool_layer'])(conv1d_layer) 75 | dense_1_layer = Dense(**self.hyper_parameters['dense_1_layer'])(max_pool_layer) 76 | dense_2_layer = Dense(len(self.label2idx), **self.hyper_parameters['activation_layer'])(dense_1_layer) 77 | 78 | model = Model(base_model.inputs, dense_2_layer) 79 | optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), 80 | self.hyper_parameters['optimizer']['name'])( 81 | **self.hyper_parameters['optimizer']['params']) 82 | model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) 83 | self.model = model 84 | self.model.summary() 85 | 86 | 87 | class BLSTMModel(ClassificationModel): 88 | __architect_name__ = 'BLSTMModel' 89 | __base_hyper_parameters__ = { 90 | 'lstm_layer': { 91 | 'units': 256, 92 | 'return_sequences': False 93 | }, 94 | 'activation_layer': { 95 | 'activation': 'softmax' 96 | }, 97 | 'optimizer': { 98 | 'module': 'keras.optimizers', 99 | 'name': 'Adam', 100 | 'params': { 101 | 'lr': 1e-3, 102 | 'decay': 0.0 103 | } 104 | }, 105 | 'compile_params': { 106 | 'loss': 'categorical_crossentropy', 107 | # 'optimizer': 'adam', 108 | 'metrics': ['accuracy'] 109 | } 110 | } 111 | 112 | def build_model(self): 113 | base_model = self.embedding.model 114 | blstm_layer = Bidirectional(LSTMLayer(**self.hyper_parameters['lstm_layer']))(base_model.output) 115 | dense_layer = Dense(len(self.label2idx), **self.hyper_parameters['activation_layer'])(blstm_layer) 116 | output_layers = [dense_layer] 117 | 118 | model = Model(base_model.inputs, output_layers) 119 | optimizer = 
getattr(eval(self.hyper_parameters['optimizer']['module']), 120 | self.hyper_parameters['optimizer']['name'])( 121 | **self.hyper_parameters['optimizer']['params']) 122 | model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) 123 | self.model = model 124 | self.model.summary() 125 | 126 | 127 | class CNNLSTMModel(ClassificationModel): 128 | __architect_name__ = 'CNNLSTMModel' 129 | __base_hyper_parameters__ = { 130 | 'conv_layer': { 131 | 'filters': 32, 132 | 'kernel_size': 3, 133 | 'padding': 'same', 134 | 'activation': 'relu' 135 | }, 136 | 'max_pool_layer': { 137 | 'pool_size': 2 138 | }, 139 | 'lstm_layer': { 140 | 'units': 100 141 | }, 142 | 'activation_layer': { 143 | 'activation': 'softmax' 144 | }, 145 | 'optimizer': { 146 | 'module': 'keras.optimizers', 147 | 'name': 'Adam', 148 | 'params': { 149 | 'lr': 1e-3, 150 | 'decay': 0.0 151 | } 152 | }, 153 | 'compile_params': { 154 | 'loss': 'categorical_crossentropy', 155 | # 'optimizer': 'adam', 156 | 'metrics': ['accuracy'] 157 | } 158 | } 159 | 160 | def build_model(self): 161 | base_model = self.embedding.model 162 | conv_layer = Conv1D(**self.hyper_parameters['conv_layer'])(base_model.output) 163 | max_pool_layer = MaxPooling1D(**self.hyper_parameters['max_pool_layer'])(conv_layer) 164 | lstm_layer = LSTMLayer(**self.hyper_parameters['lstm_layer'])(max_pool_layer) 165 | dense_layer = Dense(len(self.label2idx), 166 | **self.hyper_parameters['activation_layer'])(lstm_layer) 167 | output_layers = [dense_layer] 168 | 169 | model = Model(base_model.inputs, output_layers) 170 | optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), 171 | self.hyper_parameters['optimizer']['name'])( 172 | **self.hyper_parameters['optimizer']['params']) 173 | model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) 174 | self.model = model 175 | self.model.summary() 176 | 177 | 178 | class AVCNNModel(ClassificationModel): 179 | __architect_name__ = 'AVCNNModel' 180 | __base_hyper_parameters__ = { 181 | 'spatial_dropout': { 182 | 'rate': 0.25 183 | }, 184 | 'conv_0': { 185 | 'filters': 300, 186 | 'kernel_size': 1, 187 | 'kernel_initializer': 'normal', 188 | 'padding': 'valid', 189 | 'activation': 'relu' 190 | }, 191 | 'conv_1': { 192 | 'filters': 300, 193 | 'kernel_size': 2, 194 | 'kernel_initializer': 'normal', 195 | 'padding': 'valid', 196 | 'activation': 'relu' 197 | }, 198 | 'conv_2': { 199 | 'filters': 300, 200 | 'kernel_size': 3, 201 | 'kernel_initializer': 'normal', 202 | 'padding': 'valid', 203 | 'activation': 'relu' 204 | }, 205 | 'conv_3': { 206 | 'filters': 300, 207 | 'kernel_size': 4, 208 | 'kernel_initializer': 'normal', 209 | 'padding': 'valid', 210 | 'activation': 'relu' 211 | }, 212 | # --- 213 | 'attn_0': {}, 214 | 'avg_0': {}, 215 | 'maxpool_0': {}, 216 | # --- 217 | 'maxpool_1': {}, 218 | 'attn_1': {}, 219 | 'avg_1': {}, 220 | # --- 221 | 'maxpool_2': {}, 222 | 'attn_2': {}, 223 | 'avg_2': {}, 224 | # --- 225 | 'maxpool_3': {}, 226 | 'attn_3': {}, 227 | 'avg_3': {}, 228 | # --- 229 | 'v0_col': { 230 | # 'mode': 'concat', 231 | 'axis': 1 232 | }, 233 | 'v1_col': { 234 | # 'mode': 'concat', 235 | 'axis': 1 236 | }, 237 | 'v2_col': { 238 | # 'mode': 'concat', 239 | 'axis': 1 240 | }, 241 | 'merged_tensor': { 242 | # 'mode': 'concat', 243 | 'axis': 1 244 | }, 245 | 'dropout': { 246 | 'rate': 0.7 247 | }, 248 | 'dense': { 249 | 'units': 144, 250 | 'activation': 'relu' 251 | }, 252 | 'activation_layer': { 253 | 'activation': 'softmax' 254 | }, 255 | 'optimizer': { 256 | 'module': 
'keras.optimizers', 257 | 'name': 'Adam', 258 | 'params': { 259 | 'lr': 1e-3, 260 | 'decay': 1e-7 261 | } 262 | }, 263 | 'compile_params': { 264 | 'loss': 'categorical_crossentropy', 265 | # 'optimizer': 'adam', 266 | 'metrics': ['accuracy'] 267 | } 268 | } 269 | 270 | def build_model(self): 271 | base_model = self.embedding.model 272 | embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) 273 | conv_0 = Conv1D(**self.hyper_parameters['conv_0'])(embedded_seq) 274 | conv_1 = Conv1D(**self.hyper_parameters['conv_1'])(embedded_seq) 275 | conv_2 = Conv1D(**self.hyper_parameters['conv_2'])(embedded_seq) 276 | conv_3 = Conv1D(**self.hyper_parameters['conv_3'])(embedded_seq) 277 | 278 | maxpool_0 = GlobalMaxPooling1D()(conv_0) 279 | attn_0 = AttentionWeightedAverage()(conv_0) 280 | avg_0 = GlobalAveragePooling1D()(conv_0) 281 | 282 | maxpool_1 = GlobalMaxPooling1D()(conv_1) 283 | attn_1 = AttentionWeightedAverage()(conv_1) 284 | avg_1 = GlobalAveragePooling1D()(conv_1) 285 | 286 | maxpool_2 = GlobalMaxPooling1D()(conv_2) 287 | attn_2 = AttentionWeightedAverage()(conv_2) 288 | avg_2 = GlobalAveragePooling1D()(conv_2) 289 | 290 | maxpool_3 = GlobalMaxPooling1D()(conv_3) 291 | attn_3 = AttentionWeightedAverage()(conv_3) 292 | avg_3 = GlobalAveragePooling1D()(conv_3) 293 | 294 | v0_col = concatenate([maxpool_0, maxpool_1, maxpool_2, maxpool_3], 295 | **self.hyper_parameters['v0_col']) 296 | v1_col = concatenate([attn_0, attn_1, attn_2, attn_3], 297 | **self.hyper_parameters['v1_col']) 298 | v2_col = concatenate([avg_1, avg_2, avg_0, avg_3], 299 | **self.hyper_parameters['v2_col']) 300 | merged_tensor = concatenate([v0_col, v1_col, v2_col], 301 | **self.hyper_parameters['merged_tensor']) 302 | output = Dropout(**self.hyper_parameters['dropout'])(merged_tensor) 303 | output = Dense(**self.hyper_parameters['dense'])(output) 304 | output = Dense(len(self.label2idx), 305 | **self.hyper_parameters['activation_layer'])(output) 306 | 307 | model = Model(base_model.inputs, output) 308 | optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), 309 | self.hyper_parameters['optimizer']['name'])( 310 | **self.hyper_parameters['optimizer']['params']) 311 | model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) 312 | self.model = model 313 | self.model.summary() 314 | 315 | 316 | class KMaxCNNModel(ClassificationModel): 317 | __architect_name__ = 'KMaxCNNModel' 318 | __base_hyper_parameters__ = { 319 | 'spatial_dropout': { 320 | 'rate': 0.2 321 | }, 322 | 'conv_0': { 323 | 'filters': 180, 324 | 'kernel_size': 1, 325 | 'kernel_initializer': 'normal', 326 | 'padding': 'valid', 327 | 'activation': 'relu' 328 | }, 329 | 'conv_1': { 330 | 'filters': 180, 331 | 'kernel_size': 2, 332 | 'kernel_initializer': 'normal', 333 | 'padding': 'valid', 334 | 'activation': 'relu' 335 | }, 336 | 'conv_2': { 337 | 'filters': 180, 338 | 'kernel_size': 3, 339 | 'kernel_initializer': 'normal', 340 | 'padding': 'valid', 341 | 'activation': 'relu' 342 | }, 343 | 'conv_3': { 344 | 'filters': 180, 345 | 'kernel_size': 4, 346 | 'kernel_initializer': 'normal', 347 | 'padding': 'valid', 348 | 'activation': 'relu' 349 | }, 350 | 'maxpool_0': { 351 | 'k': 3 352 | }, 353 | 'maxpool_1': { 354 | 'k': 3 355 | }, 356 | 'maxpool_2': { 357 | 'k': 3 358 | }, 359 | 'maxpool_3': { 360 | 'k': 3 361 | }, 362 | 'merged_tensor': { 363 | # 'mode': 'concat', 364 | 'axis': 1 365 | }, 366 | 'dropout': { 367 | 'rate': 0.6 368 | }, 369 | 'dense': { 370 | 'units': 144, 371 | 'activation': 
'relu' 372 | }, 373 | 'activation_layer': { 374 | 'activation': 'softmax' 375 | }, 376 | 'optimizer': { 377 | 'module': 'keras.optimizers', 378 | 'name': 'Adam', 379 | 'params': { 380 | 'lr': 1e-3, 381 | 'decay': 1e-7 382 | } 383 | }, 384 | 'compile_params': { 385 | 'loss': 'categorical_crossentropy', 386 | # 'optimizer': 'adam', 387 | 'metrics': ['accuracy'] 388 | } 389 | } 390 | 391 | def build_model(self): 392 | base_model = self.embedding.model 393 | embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) 394 | conv_0 = Conv1D(**self.hyper_parameters['conv_0'])(embedded_seq) 395 | conv_1 = Conv1D(**self.hyper_parameters['conv_1'])(embedded_seq) 396 | conv_2 = Conv1D(**self.hyper_parameters['conv_2'])(embedded_seq) 397 | conv_3 = Conv1D(**self.hyper_parameters['conv_3'])(embedded_seq) 398 | 399 | maxpool_0 = KMaxPooling(**self.hyper_parameters['maxpool_0'])(conv_0) 400 | # maxpool_0f = Reshape((-1,))(maxpool_0) 401 | maxpool_0f = Flatten()(maxpool_0) 402 | maxpool_1 = KMaxPooling(**self.hyper_parameters['maxpool_1'])(conv_1) 403 | # maxpool_1f = Reshape((-1,))(maxpool_1) 404 | maxpool_1f = Flatten()(maxpool_1) 405 | maxpool_2 = KMaxPooling(**self.hyper_parameters['maxpool_2'])(conv_2) 406 | # maxpool_2f = Reshape((-1,))(maxpool_2) 407 | maxpool_2f = Flatten()(maxpool_2) 408 | maxpool_3 = KMaxPooling(**self.hyper_parameters['maxpool_3'])(conv_3) 409 | # maxpool_3f = Reshape((-1,))(maxpool_3) 410 | maxpool_3f = Flatten()(maxpool_3) 411 | # maxpool_0 = GlobalMaxPooling1D()(conv_0) 412 | # maxpool_1 = GlobalMaxPooling1D()(conv_1) 413 | # maxpool_2 = GlobalMaxPooling1D()(conv_2) 414 | # maxpool_3 = GlobalMaxPooling1D()(conv_3) 415 | 416 | # merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2, maxpool_3], 417 | # **self.hyper_parameters['merged_tensor']) 418 | merged_tensor = concatenate([maxpool_0f, maxpool_1f, maxpool_2f, maxpool_3f], 419 | **self.hyper_parameters['merged_tensor']) 420 | # flatten = Reshape((-1,))(merged_tensor) 421 | # output = Dropout(**self.hyper_parameters['dropout'])(flatten) 422 | output = Dropout(**self.hyper_parameters['dropout'])(merged_tensor) 423 | output = Dense(**self.hyper_parameters['dense'])(output) 424 | output = Dense(len(self.label2idx), 425 | **self.hyper_parameters['activation_layer'])(output) 426 | 427 | model = Model(base_model.inputs, output) 428 | optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), 429 | self.hyper_parameters['optimizer']['name'])( 430 | **self.hyper_parameters['optimizer']['params']) 431 | model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) 432 | self.model = model 433 | self.model.summary() 434 | 435 | 436 | class RCNNModel(ClassificationModel): 437 | __architect_name__ = 'RCNNModel' 438 | __base_hyper_parameters__ = { 439 | 'spatial_dropout': { 440 | 'rate': 0.2 441 | }, 442 | 'rnn_0': { 443 | 'units': 64, 444 | 'return_sequences': True 445 | }, 446 | 'conv_0': { 447 | 'filters': 128, 448 | 'kernel_size': 2, 449 | 'kernel_initializer': 'normal', 450 | 'padding': 'valid', 451 | 'activation': 'relu', 452 | 'strides': 1 453 | }, 454 | 'maxpool': {}, 455 | 'attn': {}, 456 | 'average': {}, 457 | 'concat': { 458 | 'axis': 1 459 | }, 460 | 'dropout': { 461 | 'rate': 0.5 462 | }, 463 | 'dense': { 464 | 'units': 120, 465 | 'activation': 'relu' 466 | }, 467 | 'activation_layer': { 468 | 'activation': 'softmax' 469 | }, 470 | 'optimizer': { 471 | 'module': 'keras.optimizers', 472 | 'name': 'Adam', 473 | 'params': { 474 | 'lr': 1e-3, 475 | 
'clipvalue': 5, 476 | 'decay': 1e-5 477 | } 478 | }, 479 | 'compile_params': { 480 | 'loss': 'categorical_crossentropy', 481 | # 'optimizer': 'adam', 482 | 'metrics': ['accuracy'] 483 | } 484 | } 485 | 486 | def build_model(self): 487 | base_model = self.embedding.model 488 | embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) 489 | rnn_0 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_0']))(embedded_seq) 490 | conv_0 = Conv1D(**self.hyper_parameters['conv_0'])(rnn_0) 491 | maxpool = GlobalMaxPooling1D()(conv_0) 492 | attn = AttentionWeightedAverage()(conv_0) 493 | average = GlobalAveragePooling1D()(conv_0) 494 | 495 | concatenated = concatenate([maxpool, attn, average], 496 | **self.hyper_parameters['concat']) 497 | output = Dropout(**self.hyper_parameters['dropout'])(concatenated) 498 | output = Dense(**self.hyper_parameters['dense'])(output) 499 | output = Dense(len(self.label2idx), 500 | **self.hyper_parameters['activation_layer'])(output) 501 | 502 | model = Model(base_model.inputs, output) 503 | optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), 504 | self.hyper_parameters['optimizer']['name'])( 505 | **self.hyper_parameters['optimizer']['params']) 506 | model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) 507 | self.model = model 508 | self.model.summary() 509 | 510 | 511 | class AVRNNModel(ClassificationModel): 512 | __architect_name__ = 'AVRNNModel' 513 | __base_hyper_parameters__ = { 514 | 'spatial_dropout': { 515 | 'rate': 0.25 516 | }, 517 | 'rnn_0': { 518 | 'units': 60, 519 | 'return_sequences': True 520 | }, 521 | 'rnn_1': { 522 | 'units': 60, 523 | 'return_sequences': True 524 | }, 525 | 'concat_rnn': { 526 | 'axis': 2 527 | }, 528 | 'last': {}, 529 | 'maxpool': {}, 530 | 'attn': {}, 531 | 'average': {}, 532 | 'all_views': { 533 | 'axis': 1 534 | }, 535 | 'dropout': { 536 | 'rate': 0.5 537 | }, 538 | 'dense': { 539 | 'units': 144, 540 | 'activation': 'relu' 541 | }, 542 | 'activation_layer': { 543 | 'activation': 'softmax' 544 | }, 545 | 'optimizer': { 546 | 'module': 'keras.optimizers', 547 | 'name': 'Adam', 548 | 'params': { 549 | 'lr': 1e-3, 550 | 'clipvalue': 5, 551 | 'decay': 1e-6 552 | } 553 | }, 554 | 'compile_params': { 555 | 'loss': 'categorical_crossentropy', 556 | # 'optimizer': 'adam', 557 | 'metrics': ['accuracy'] 558 | } 559 | } 560 | 561 | def build_model(self): 562 | base_model = self.embedding.model 563 | embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) 564 | rnn_0 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_0']))(embedded_seq) 565 | rnn_1 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_1']))(rnn_0) 566 | concat_rnn = concatenate([rnn_0, rnn_1], 567 | **self.hyper_parameters['concat_rnn']) 568 | 569 | last = Lambda(lambda t: t[:, -1], name='last')(concat_rnn) 570 | maxpool = GlobalMaxPooling1D()(concat_rnn) 571 | attn = AttentionWeightedAverage()(concat_rnn) 572 | average = GlobalAveragePooling1D()(concat_rnn) 573 | 574 | all_views = concatenate([last, maxpool, attn, average], 575 | **self.hyper_parameters['all_views']) 576 | output = Dropout(**self.hyper_parameters['dropout'])(all_views) 577 | output = Dense(**self.hyper_parameters['dense'])(output) 578 | output = Dense(len(self.label2idx), 579 | **self.hyper_parameters['activation_layer'])(output) 580 | 581 | model = Model(base_model.inputs, output) 582 | optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), 583 | 
self.hyper_parameters['optimizer']['name'])( 584 | **self.hyper_parameters['optimizer']['params']) 585 | model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) 586 | self.model = model 587 | self.model.summary() 588 | 589 | 590 | class DropoutBGRUModel(ClassificationModel): 591 | __architect_name__ = 'DropoutBGRUModel' 592 | __base_hyper_parameters__ = { 593 | 'spatial_dropout': { 594 | 'rate': 0.15 595 | }, 596 | 'rnn_0': { 597 | 'units': 64, 598 | 'return_sequences': True 599 | }, 600 | 'dropout_rnn': { 601 | 'rate': 0.35 602 | }, 603 | 'rnn_1': { 604 | 'units': 64, 605 | 'return_sequences': True 606 | }, 607 | 'last': {}, 608 | 'maxpool': {}, 609 | 'average': {}, 610 | 'all_views': { 611 | 'axis': 1 612 | }, 613 | 'dropout': { 614 | 'rate': 0.5 615 | }, 616 | 'dense': { 617 | 'units': 72, 618 | 'activation': 'relu' 619 | }, 620 | 'activation_layer': { 621 | 'activation': 'softmax' 622 | }, 623 | 'optimizer': { 624 | 'module': 'keras.optimizers', 625 | 'name': 'Adam', 626 | 'params': { 627 | 'lr': 1e-3, 628 | 'decay': 0.0 629 | } 630 | }, 631 | 'compile_params': { 632 | 'loss': 'categorical_crossentropy', 633 | # 'optimizer': 'adam', 634 | 'metrics': ['accuracy'] 635 | } 636 | } 637 | 638 | def build_model(self): 639 | base_model = self.embedding.model 640 | embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) 641 | rnn_0 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_0']))(embedded_seq) 642 | dropout_rnn = Dropout(**self.hyper_parameters['dropout_rnn'])(rnn_0) 643 | rnn_1 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_1']))(dropout_rnn) 644 | last = Lambda(lambda t: t[:, -1], name='last')(rnn_1) 645 | maxpool = GlobalMaxPooling1D()(rnn_1) 646 | # attn = AttentionWeightedAverage()(rnn_1) 647 | average = GlobalAveragePooling1D()(rnn_1) 648 | 649 | all_views = concatenate([last, maxpool, average], 650 | **self.hyper_parameters['all_views']) 651 | output = Dropout(**self.hyper_parameters['dropout'])(all_views) 652 | output = Dense(**self.hyper_parameters['dense'])(output) 653 | output = Dense(len(self.label2idx), 654 | **self.hyper_parameters['activation_layer'])(output) 655 | 656 | model = Model(base_model.inputs, output) 657 | optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), 658 | self.hyper_parameters['optimizer']['name'])( 659 | **self.hyper_parameters['optimizer']['params']) 660 | model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) 661 | self.model = model 662 | self.model.summary() 663 | 664 | 665 | class DropoutAVRNNModel(ClassificationModel): 666 | __architect_name__ = 'DropoutAVRNNModel' 667 | __base_hyper_parameters__ = { 668 | 'spatial_dropout': { 669 | 'rate': 0.25 670 | }, 671 | 'rnn_0': { 672 | 'units': 56, 673 | 'return_sequences': True 674 | }, 675 | 'rnn_dropout': { 676 | 'rate': 0.3 677 | }, 678 | 'rnn_1': { 679 | 'units': 56, 680 | 'return_sequences': True 681 | }, 682 | 'last': {}, 683 | 'maxpool': {}, 684 | 'attn': {}, 685 | 'average': {}, 686 | 'all_views': { 687 | 'axis': 1 688 | }, 689 | 'dropout_0': { 690 | 'rate': 0.5 691 | }, 692 | 'dense': { 693 | 'units': 128, 694 | 'activation': 'relu' 695 | }, 696 | 'dropout_1': { 697 | 'rate': 0.25 698 | }, 699 | 'activation_layer': { 700 | 'activation': 'softmax' 701 | }, 702 | 'optimizer': { 703 | 'module': 'keras.optimizers', 704 | 'name': 'Adam', 705 | 'params': { 706 | 'lr': 1e-3, 707 | 'clipvalue': 5, 708 | 'decay': 1e-7 709 | } 710 | }, 711 | 'compile_params': { 712 | 'loss': 
'categorical_crossentropy', 713 | # 'optimizer': 'adam', 714 | 'metrics': ['accuracy'] 715 | } 716 | } 717 | 718 | def build_model(self): 719 | base_model = self.embedding.model 720 | embedded_seq = SpatialDropout1D(**self.hyper_parameters['spatial_dropout'])(base_model.output) 721 | rnn_0 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_0']))(embedded_seq) 722 | rnn_dropout = SpatialDropout1D(**self.hyper_parameters['rnn_dropout'])(rnn_0) 723 | rnn_1 = Bidirectional(GRULayer(**self.hyper_parameters['rnn_1']))(rnn_dropout) 724 | 725 | last = Lambda(lambda t: t[:, -1], name='last')(rnn_1) 726 | maxpool = GlobalMaxPooling1D()(rnn_1) 727 | attn = AttentionWeightedAverage()(rnn_1) 728 | average = GlobalAveragePooling1D()(rnn_1) 729 | 730 | all_views = concatenate([last, maxpool, attn, average], 731 | **self.hyper_parameters['all_views']) 732 | output = Dropout(**self.hyper_parameters['dropout_0'])(all_views) 733 | output = Dense(**self.hyper_parameters['dense'])(output) 734 | output = Dropout(**self.hyper_parameters['dropout_1'])(output) 735 | output = Dense(len(self.label2idx), 736 | **self.hyper_parameters['activation_layer'])(output) 737 | 738 | model = Model(base_model.inputs, output) 739 | optimizer = getattr(eval(self.hyper_parameters['optimizer']['module']), 740 | self.hyper_parameters['optimizer']['name'])( 741 | **self.hyper_parameters['optimizer']['params']) 742 | model.compile(optimizer=optimizer, **self.hyper_parameters['compile_params']) 743 | self.model = model 744 | self.model.summary() 745 | 746 | 747 | if __name__ == '__main__': 748 | from kashgari.corpus import TencentDingdangSLUCorpus 749 | from kashgari.embeddings import WordEmbeddings, BERTEmbedding 750 | 751 | train_x, train_y = TencentDingdangSLUCorpus.get_classification_data() 752 | 753 | w2v = WordEmbeddings('sgns.weibo.bigram', 754 | sequence_length=15, 755 | limit=5000) 756 | bert = BERTEmbedding('bert-base-chinese', sequence_length=15) 757 | t_model = CNNModel(bert) 758 | t_model.fit(train_x, train_y, epochs=1) 759 | -------------------------------------------------------------------------------- /BiLSTM/BiLSTM+glove+Sentiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: test 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | 14 | import numpy as np 15 | import jieba 16 | import re 17 | from tensorflow.contrib import learn 18 | import tensorflow as tf 19 | from sklearn.model_selection import train_test_split 20 | import os 21 | import keras 22 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1000, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 23 | 24 | from keras.initializers import Constant 25 | from keras.preprocessing.text import Tokenizer 26 | from keras.preprocessing.sequence import pad_sequences 27 | from keras.layers import Dense, Input, GlobalMaxPooling1D 28 | from keras.layers import Conv1D, MaxPooling1D, Embedding 29 | from keras.models import Model 30 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional 31 | 32 | 33 | #读取数据参数设置 34 | # tf.flags.DEFINE_float('dev_sample_percentage', .1, 'Percentage of the training data to use for validation') 35 | 
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 36 | tf.flags.DEFINE_string('positive_data_file', '../dataset/weibo60000/pos60000_utf8.txt_updated', 'Data source for the positive data') 37 | tf.flags.DEFINE_string('negative_data_file', '../dataset/weibo60000/neg60000_utf8.txt_updated', 'Data source for the negative data') 38 | tf.flags.DEFINE_string('glove_dir', '../dataset/glove.6B.100d.txt', 'Data source for the pretrained glove word vector') 39 | tf.flags.DEFINE_integer('max_num_words', '40000', '出现频率最高的40000个词语保留在词表中') 40 | 41 | # FLAGS = tf.flags.FLAGS 42 | FLAGS = tf.flags.FLAGS 43 | 44 | """从文件中读取数据和标签""" 45 | def load_data_and_label(pos_filename, neg_filename): 46 | """读取积极类别的数据""" 47 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 48 | # print(positive_texts) 49 | # positive_texts = open(positive_filename, 'rb').readlines() 50 | positive_texts = [' '.join(list(jieba.cut(line.strip()))) for line in positive_texts] 51 | print('积极句子数目:', len(positive_texts)) 52 | # print(len(positive_texts)) 53 | """读取消极类别的数据""" 54 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 55 | # negative_texts = open(positive_filename, 'rb').readlines() 56 | negative_texts = [' '.join(list(jieba.cut(line.strip()))) for line in negative_texts] 57 | print('消极句子数目:', len(negative_texts)) 58 | 59 | """拼接""" 60 | x_text = positive_texts + negative_texts 61 | # print(x_text) 62 | print('全部句子数目:', len(x_text)) 63 | 64 | """生成标签""" 65 | positive_labels = [1 for _ in negative_texts] 66 | negative_labels = [0 for _ in negative_texts] 67 | y = np.concatenate([positive_labels, negative_labels], 0) 68 | print('标签数目:', len(y)) 69 | # print(y) 70 | # for mat in y: 71 | # print(mat) 72 | return [x_text, y] 73 | 74 | def construct_dataset(): 75 | print('加载数据......') 76 | # positive_filename = './data/rt-polaritydata/rt-polarity.pos' 77 | # negative_filename = './data/rt-polaritydata/rt-polarity.neg' 78 | # positive_filename = './data/rt-polarity.pos' 79 | # negative_filename = './data/rt-polarity.neg' 80 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file) 81 | 82 | """建立词汇表""" 83 | max_sentence_length = max([len(text.split(' ')) for text in x_text]) 84 | print('最长句子长度:', max_sentence_length) 85 | 86 | 87 | #tf.contrib.learn.preprocessing.VocabularyProcessor:生成词汇表,每一个文档/句子的长度<=max_sentnce_length,记录的是单词的位置信息 88 | # vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length) 89 | # #x:每一个句子中的单词对应词汇表的位置,word2id 90 | # x = np.array(list(vocab_processor.fit_transform(x_text))) 91 | 92 | tokenizer = Tokenizer() 93 | tokenizer.fit_on_texts(x_text) 94 | sequences = tokenizer.texts_to_sequences(x_text) 95 | 96 | word_index = tokenizer.word_index 97 | print('词表大小:', len(word_index)) 98 | 99 | x = pad_sequences(sequences, maxlen=max_sentence_length) 100 | 101 | print('词汇表建立完毕!') 102 | print('len(x):',len(x)) 103 | print('x:',x) 104 | print('x.shape:', x.shape) 105 | print('type(x):', type(x)) 106 | 107 | """随机模糊数据,即打乱各个元素的顺序,重新洗牌""" 108 | np.random.seed(10) 109 | #np.range()返回的是range object,而np.nrange()返回的是numpy.ndarray() 110 | shuffle_indices = np.random.permutation(np.arange(len(y))) 111 | x_shuffled = x[shuffle_indices] 112 | y_shuffled = y[shuffle_indices] 113 | 114 | """划分训练集/测试集,此处直接切分""" 115 | #此处加负号表示是从列表的后面开始查找对应位置 116 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 117 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * 
float(len(y))) 118 | # print('划分索引:', dev_sample_index) 119 | # x_train, x_dev = x_shuffled[:dev_sample_index], x[dev_sample_index:] 120 | # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 121 | 122 | """使用sklearn中的cross-validation划分数据集""" 123 | x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=10) 124 | 125 | print('数据集构造完毕,信息如下:') 126 | print('训练集样本数目:', len(x_train)) 127 | print('训练集标签数目:', len(y_train)) 128 | print('开发集样本数目:', len(x_dev)) 129 | print('开发集标签数目:', len(y_dev)) 130 | # print(type(y_dev)) 131 | 132 | del x, y, x_shuffled, y_shuffled 133 | 134 | # print('词汇表 Size:', len(vocab_processor.vocabulary_)) 135 | # print(vocab_processor.vocabulary_) 136 | 137 | print('x的数据类型:', type(x_train[1][1])) 138 | print('y的数据类型:', type(y_train[1])) 139 | 140 | # return x_train, x_dev, y_train, y_dev, vocab_processor 141 | return x_train, y_train, x_dev, y_dev, word_index 142 | 143 | def load_glove_model(): 144 | print('Indexing word vectors.') 145 | 146 | embeddings_index = {} 147 | with open(FLAGS.glove_dir, 'r', encoding='utf-8') as f: 148 | for line in f: 149 | values = line.split() 150 | word = values[0] 151 | coefs = np.asarray(values[1:], dtype='float32') 152 | embeddings_index[word] = coefs 153 | 154 | print('Found %s word vectors.' % len(embeddings_index)) 155 | return embeddings_index 156 | 157 | def word_embedding(Max_Sequence_Length, embedding_dim, word_index, embeddings_index): 158 | # prepare embedding matrix 159 | # num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 160 | num_words = len(word_index) + 1 161 | word_index = word_index 162 | embedding_matrix = np.zeros((num_words, embedding_dim)) 163 | for word, i in word_index.items(): 164 | # if i > MAX_NUM_WORDS: 165 | # continue 166 | embedding_vector = embeddings_index.get(word) 167 | if embedding_vector is not None: 168 | # words not found in embedding index will be all-zeros. 
169 | embedding_matrix[i] = embedding_vector 170 | 171 | # load pre-trained word embeddings into an Embedding layer 172 | # note that we set trainable = False so as to keep the embeddings fixed 173 | embedding_layer = Embedding(num_words, 174 | embedding_dim, 175 | embeddings_initializer=Constant(embedding_matrix), 176 | input_length=Max_Sequence_Length, 177 | trainable=False) 178 | return embedding_layer 179 | 180 | 181 | if __name__ == '__main__': 182 | print('Load dataset...') 183 | x_train, y_train, x_dev, y_dev, word_index = construct_dataset() 184 | Max_Sequence_Length = x_train.shape[1] 185 | print('Max_Sequence_Length: ', Max_Sequence_Length) #202 186 | print('x_train.shape: ', np.shape(x_train)) 187 | print('y_dev.shape: ', np.shape(y_dev)) 188 | print('Load glove word vector...') 189 | embeddings_index = load_glove_model() 190 | embedding_dim = 100 191 | # print(word_index) 192 | 193 | embedding_layer = word_embedding(Max_Sequence_Length, embedding_dim, word_index, embeddings_index) 194 | 195 | sequence_input = Input(shape=(Max_Sequence_Length,), dtype=tf.int32) 196 | embeddings = embedding_layer(sequence_input) 197 | 198 | x = Bidirectional(LSTM(64))(embeddings) 199 | x = Dropout(0.5)(x) 200 | preds = Dense(1, activation='sigmoid')(x) 201 | model = Model(sequence_input, preds) 202 | 203 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 204 | 205 | print('Train...') 206 | model.fit(x_train, y_train, batch_size=64, validation_data=[x_dev, y_dev], epochs=5, callbacks=[tf_board_callback]) 207 | -------------------------------------------------------------------------------- /BiLSTM/BiLSTM+random+Sentiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: BiLSTM+random+Sentiment 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | import numpy as np 14 | import jieba 15 | import tensorflow as tf 16 | from sklearn.model_selection import train_test_split 17 | import os 18 | import keras 19 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1000, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 20 | 21 | from keras.initializers import Constant 22 | from keras.preprocessing.text import Tokenizer 23 | from keras.preprocessing.sequence import pad_sequences 24 | import numpy as np 25 | 26 | from keras.preprocessing import sequence 27 | from keras.models import Sequential 28 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional 29 | from keras.datasets import imdb 30 | 31 | 32 | #读取数据参数设置 33 | # tf.flags.DEFINE_float('dev_sample_percentage', .1, 'Percentage of the training data to use for validation') 34 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 35 | tf.flags.DEFINE_string('positive_data_file', '../dataset//weibo60000/pos60000_utf8.txt_updated', 'Data source for the positive data') 36 | tf.flags.DEFINE_string('negative_data_file', '../dataset/weibo60000/neg60000_utf8.txt_updated', 'Data source for the negative data') 37 | tf.flags.DEFINE_string('glove_dir', '../dataset/glove.6B.100d.txt', 'Data source for the pretrained glove word vector') 38 | 
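# Note on the flag defined below: FLAGS.max_num_words is never passed to the Tokenizer
# (Tokenizer() is constructed without num_words), and max_features in __main__ is assigned
# from it but not used afterwards, so the 40000-word cap has no effect as written.
# Tokenizer(num_words=FLAGS.max_num_words) would actually enforce it.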
tf.flags.DEFINE_integer('max_num_words', '40000', '出现频率最高的40000个词语保留在词表中') 39 | 40 | # FLAGS = tf.flags.FLAGS 41 | FLAGS = tf.flags.FLAGS 42 | 43 | """从文件中读取数据和标签""" 44 | def load_data_and_label(pos_filename, neg_filename): 45 | """读取积极类别的数据""" 46 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 47 | # print(positive_texts) 48 | # positive_texts = open(positive_filename, 'rb').readlines() 49 | positive_texts = [' '.join(list(jieba.cut(line.strip()))) for line in positive_texts] 50 | print('积极句子数目:', len(positive_texts)) 51 | # print(len(positive_texts)) 52 | """读取消极类别的数据""" 53 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 54 | # negative_texts = open(positive_filename, 'rb').readlines() 55 | negative_texts = [' '.join(list(jieba.cut(line.strip()))) for line in negative_texts] 56 | print('消极句子数目:', len(negative_texts)) 57 | 58 | """拼接""" 59 | x_text = positive_texts + negative_texts 60 | # print(x_text) 61 | print('全部句子数目:', len(x_text)) 62 | 63 | """生成标签""" 64 | positive_labels = [1 for _ in negative_texts] 65 | negative_labels = [0 for _ in negative_texts] 66 | y = np.concatenate([positive_labels, negative_labels], 0) 67 | print('标签数目:', len(y)) 68 | # print(y) 69 | # for mat in y: 70 | # print(mat) 71 | return [x_text, y] 72 | 73 | def construct_dataset(): 74 | print('加载数据......') 75 | # positive_filename = './data/rt-polaritydata/rt-polarity.pos' 76 | # negative_filename = './data/rt-polaritydata/rt-polarity.neg' 77 | # positive_filename = './data/rt-polarity.pos' 78 | # negative_filename = './data/rt-polarity.neg' 79 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file) 80 | 81 | """建立词汇表""" 82 | max_sentence_length = max([len(text.split(' ')) for text in x_text]) 83 | print('最长句子长度:', max_sentence_length) 84 | 85 | 86 | #tf.contrib.learn.preprocessing.VocabularyProcessor:生成词汇表,每一个文档/句子的长度<=max_sentnce_length,记录的是单词的位置信息 87 | # vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length) 88 | # #x:每一个句子中的单词对应词汇表的位置,word2id 89 | # x = np.array(list(vocab_processor.fit_transform(x_text))) 90 | 91 | tokenizer = Tokenizer() 92 | tokenizer.fit_on_texts(x_text) 93 | sequences = tokenizer.texts_to_sequences(x_text) 94 | 95 | word_index = tokenizer.word_index 96 | print('词表大小:', len(word_index)) 97 | 98 | x = pad_sequences(sequences, maxlen=max_sentence_length) 99 | 100 | print('词汇表建立完毕!') 101 | print('len(x):',len(x)) 102 | print('x:',x) 103 | print('x.shape:', x.shape) 104 | print('type(x):', type(x)) 105 | 106 | """随机模糊数据,即打乱各个元素的顺序,重新洗牌""" 107 | np.random.seed(10) 108 | #np.range()返回的是range object,而np.nrange()返回的是numpy.ndarray() 109 | shuffle_indices = np.random.permutation(np.arange(len(y))) 110 | x_shuffled = x[shuffle_indices] 111 | y_shuffled = y[shuffle_indices] 112 | 113 | """划分训练集/测试集,此处直接切分""" 114 | #此处加负号表示是从列表的后面开始查找对应位置 115 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 116 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 117 | # print('划分索引:', dev_sample_index) 118 | # x_train, x_dev = x_shuffled[:dev_sample_index], x[dev_sample_index:] 119 | # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 120 | 121 | """使用sklearn中的cross-validation划分数据集""" 122 | x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=10) 123 | 124 | print('数据集构造完毕,信息如下:') 125 | print('训练集样本数目:', len(x_train)) 126 | print('训练集标签数目:', len(y_train)) 127 | print('开发集样本数目:', len(x_dev)) 128 | 
print('开发集标签数目:', len(y_dev)) 129 | # print(type(y_dev)) 130 | 131 | del x, y, x_shuffled, y_shuffled 132 | 133 | # print('词汇表 Size:', len(vocab_processor.vocabulary_)) 134 | # print(vocab_processor.vocabulary_) 135 | 136 | print('x的数据类型:', type(x_train[1][1])) 137 | print('y的数据类型:', type(y_train[1])) 138 | 139 | # return x_train, x_dev, y_train, y_dev, vocab_processor 140 | return x_train, y_train, x_dev, y_dev, word_index 141 | 142 | if __name__ == '__main__': 143 | print('Load dataset...') 144 | x_train, y_train, x_dev, y_dev, word_index = construct_dataset() 145 | Max_Sequence_Length = x_train.shape[1] 146 | print('Max_Sequence_Length: ', Max_Sequence_Length) #202 147 | print('x_train.shape: ', np.shape(x_train)) 148 | print('y_dev.shape: ', np.shape(y_dev)) 149 | 150 | max_features = FLAGS.max_num_words 151 | # cut texts after this number of words 152 | # (among top max_features most common words) 153 | maxlen = Max_Sequence_Length 154 | batch_size = 32 155 | 156 | model = Sequential() 157 | model.add(Embedding(len(word_index)+1, 128, input_length=maxlen)) 158 | model.add(Bidirectional(LSTM(64))) 159 | model.add(Dropout(0.5)) 160 | model.add(Dense(1, activation='sigmoid')) 161 | 162 | # try using different optimizers and different optimizer configs 163 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 164 | 165 | print('Train...') 166 | model.fit(x_train, y_train, 167 | batch_size=batch_size, 168 | epochs=4, 169 | validation_data=[x_dev, y_dev], 170 | callbacks=[tf_board_callback]) -------------------------------------------------------------------------------- /CNN/CNN+glove+Senntiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: CNN+glove+Senntiment 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | 14 | from keras.preprocessing import sequence 15 | import numpy as np 16 | import jieba 17 | import tensorflow as tf 18 | from sklearn.model_selection import train_test_split 19 | from keras.initializers import Constant 20 | from keras.preprocessing.text import Tokenizer 21 | from keras.preprocessing.sequence import pad_sequences 22 | from keras.models import Sequential, Model 23 | from keras.layers import Dense, Dropout, Activation 24 | from keras.layers import Embedding, Input 25 | from keras.layers import Conv1D, GlobalMaxPooling1D 26 | import keras 27 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1000, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 28 | 29 | #读取数据参数设置 30 | # tf.flags.DEFINE_float('dev_sample_percentage', .1, 'Percentage of the training data to use for validation') 31 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 32 | tf.flags.DEFINE_string('positive_data_file', '../dataset/weibo60000/pos60000_utf8.txt_updated', 'Data source for the positive data') 33 | tf.flags.DEFINE_string('negative_data_file', '../dataset/weibo60000/neg60000_utf8.txt_updated', 'Data source for the negative data') 34 | tf.flags.DEFINE_string('glove_dir', '../dataset/glove.6B.100d.txt', 'Data source for the pretrained glove word vector') 35 | tf.flags.DEFINE_integer('embedding_dims', '100', 
'随机初始化的词嵌入矩阵的维度') 36 | tf.flags.DEFINE_integer('filters', '250', 'CNN的卷积核的数目') 37 | tf.flags.DEFINE_integer('kernel_size', '3', 'CNN卷积核的大小') 38 | tf.flags.DEFINE_integer('hidden_dims', '250', '全连阶层的单元数目') 39 | tf.flags.DEFINE_integer('epochs', '5', 'The number of epoch') 40 | tf.flags.DEFINE_integer('batch_size', '64', '批量大小') 41 | 42 | 43 | # FLAGS = tf.flags.FLAGS 44 | FLAGS = tf.flags.FLAGS 45 | 46 | """从文件中读取数据和标签""" 47 | def load_data_and_label(pos_filename, neg_filename): 48 | """读取积极类别的数据""" 49 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 50 | # print(positive_texts) 51 | # positive_texts = open(positive_filename, 'rb').readlines() 52 | positive_texts = [' '.join(list(jieba.cut(line.strip()))) for line in positive_texts] 53 | print('积极句子数目:', len(positive_texts)) 54 | # print(len(positive_texts)) 55 | """读取消极类别的数据""" 56 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 57 | # negative_texts = open(positive_filename, 'rb').readlines() 58 | negative_texts = [' '.join(list(jieba.cut(line.strip()))) for line in negative_texts] 59 | print('消极句子数目:', len(negative_texts)) 60 | 61 | """拼接""" 62 | x_text = positive_texts + negative_texts 63 | # print(x_text) 64 | print('全部句子数目:', len(x_text)) 65 | 66 | """生成标签""" 67 | positive_labels = [1 for _ in negative_texts] 68 | negative_labels = [0 for _ in negative_texts] 69 | y = np.concatenate([positive_labels, negative_labels], 0) 70 | print('标签数目:', len(y)) 71 | # print(y) 72 | # for mat in y: 73 | # print(mat) 74 | return [x_text, y] 75 | 76 | def construct_dataset(): 77 | print('加载数据......') 78 | # positive_filename = './data/rt-polaritydata/rt-polarity.pos' 79 | # negative_filename = './data/rt-polaritydata/rt-polarity.neg' 80 | # positive_filename = './data/rt-polarity.pos' 81 | # negative_filename = './data/rt-polarity.neg' 82 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file) 83 | 84 | """建立词汇表""" 85 | max_sentence_length = max([len(text.split(' ')) for text in x_text]) 86 | print('最长句子长度:', max_sentence_length) 87 | 88 | 89 | #tf.contrib.learn.preprocessing.VocabularyProcessor:生成词汇表,每一个文档/句子的长度<=max_sentnce_length,记录的是单词的位置信息 90 | # vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length) 91 | # #x:每一个句子中的单词对应词汇表的位置,word2id 92 | # x = np.array(list(vocab_processor.fit_transform(x_text))) 93 | 94 | tokenizer = Tokenizer() 95 | tokenizer.fit_on_texts(x_text) 96 | sequences = tokenizer.texts_to_sequences(x_text) 97 | 98 | word_index = tokenizer.word_index 99 | print('词表大小:', len(word_index)) 100 | 101 | x = pad_sequences(sequences, maxlen=max_sentence_length) 102 | 103 | print('词汇表建立完毕!') 104 | print('len(x):',len(x)) 105 | print('x:',x) 106 | print('x.shape:', x.shape) 107 | print('type(x):', type(x)) 108 | 109 | """随机模糊数据,即打乱各个元素的顺序,重新洗牌""" 110 | np.random.seed(10) 111 | #np.range()返回的是range object,而np.nrange()返回的是numpy.ndarray() 112 | shuffle_indices = np.random.permutation(np.arange(len(y))) 113 | x_shuffled = x[shuffle_indices] 114 | y_shuffled = y[shuffle_indices] 115 | 116 | """划分训练集/测试集,此处直接切分""" 117 | #此处加负号表示是从列表的后面开始查找对应位置 118 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 119 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 120 | # print('划分索引:', dev_sample_index) 121 | # x_train, x_dev = x_shuffled[:dev_sample_index], x[dev_sample_index:] 122 | # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 123 | 124 | 
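    # train_test_split below is applied to the original x and y and shuffles them itself
    # (shuffle=True by default, seeded by random_state=10), so the x_shuffled / y_shuffled
    # arrays built above are never used before being deleted further down; the manual
    # permutation could be removed, or passed to train_test_split in their place, with the same effect.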
"""使用sklearn中的cross-validation划分数据集""" 125 | x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=10) 126 | 127 | print('数据集构造完毕,信息如下:') 128 | print('训练集样本数目:', len(x_train)) 129 | print('训练集标签数目:', len(y_train)) 130 | print('开发集样本数目:', len(x_dev)) 131 | print('开发集标签数目:', len(y_dev)) 132 | # print(type(y_dev)) 133 | 134 | del x, y, x_shuffled, y_shuffled 135 | 136 | # print('词汇表 Size:', len(vocab_processor.vocabulary_)) 137 | # print(vocab_processor.vocabulary_) 138 | 139 | print('x的数据类型:', type(x_train[1][1])) 140 | print('y的数据类型:', type(y_train[1])) 141 | 142 | # return x_train, x_dev, y_train, y_dev, vocab_processor 143 | return x_train, y_train, x_dev, y_dev, word_index 144 | 145 | def load_glove_model(): 146 | print('Indexing word vectors.') 147 | 148 | embeddings_index = {} 149 | with open(FLAGS.glove_dir, 'r', encoding='utf-8') as f: 150 | for line in f: 151 | values = line.split() 152 | word = values[0] 153 | coefs = np.asarray(values[1:], dtype='float32') 154 | embeddings_index[word] = coefs 155 | 156 | print('Found %s word vectors.' % len(embeddings_index)) 157 | return embeddings_index 158 | 159 | def word_embedding(Max_Sequence_Length, embedding_dim, word_index, embeddings_index): 160 | # prepare embedding matrix 161 | # num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 162 | num_words = len(word_index) + 1 163 | word_index = word_index 164 | embedding_matrix = np.zeros((num_words, embedding_dim)) 165 | for word, i in word_index.items(): 166 | # if i > MAX_NUM_WORDS: 167 | # continue 168 | embedding_vector = embeddings_index.get(word) 169 | if embedding_vector is not None: 170 | # words not found in embedding index will be all-zeros. 171 | embedding_matrix[i] = embedding_vector 172 | 173 | # load pre-trained word embeddings into an Embedding layer 174 | # note that we set trainable = False so as to keep the embeddings fixed 175 | embedding_layer = Embedding(num_words, 176 | embedding_dim, 177 | embeddings_initializer=Constant(embedding_matrix), 178 | input_length=Max_Sequence_Length, 179 | trainable=False) 180 | return embedding_layer 181 | 182 | if __name__ == '__main__': 183 | print('Load dataset...') 184 | x_train, y_train, x_dev, y_dev, word_index = construct_dataset() 185 | Max_Sequence_Length = x_train.shape[1] 186 | print('Max_Sequence_Length: ', Max_Sequence_Length) #202 187 | print('x_train.shape: ', np.shape(x_train)) 188 | print('y_dev.shape: ', np.shape(y_dev)) 189 | 190 | print('Load glove word vector...') 191 | embeddings_index = load_glove_model() 192 | embedding_dim = 100 193 | # print(word_index) 194 | 195 | embedding_layer = word_embedding(Max_Sequence_Length, embedding_dim, word_index, embeddings_index) 196 | 197 | sequence_input = Input(shape=(Max_Sequence_Length,), dtype=tf.int32) 198 | embeddings = embedding_layer(sequence_input) 199 | 200 | x = Dropout(0.2)(embeddings) 201 | x = Conv1D(FLAGS.filters, 202 | FLAGS.kernel_size, 203 | padding='valid', 204 | activation='relu', 205 | strides=1)(x) 206 | x= GlobalMaxPooling1D()(x) 207 | x = Dense(FLAGS.hidden_dims)(x) 208 | x= Dropout(0.2)(x) 209 | x = Activation('relu')(x) 210 | x = Dense(1)(x) 211 | 212 | preds = Activation('sigmoid')(x) 213 | model = Model(sequence_input, preds) 214 | 215 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 216 | 217 | print('Train...') 218 | model.fit(x_train, y_train, 219 | batch_size=FLAGS.batch_size, 220 | validation_data=[x_dev, y_dev], 221 | epochs=FLAGS.epochs, 222 | callbacks=[tf_board_callback]) 223 | 
-------------------------------------------------------------------------------- /CNN/CNN+random+Sentiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: CNN+random+Sentiment 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | 14 | from keras.preprocessing import sequence 15 | import numpy as np 16 | import jieba 17 | import tensorflow as tf 18 | import keras 19 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1000, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 20 | from sklearn.model_selection import train_test_split 21 | from keras.initializers import Constant 22 | from keras.preprocessing.text import Tokenizer 23 | from keras.preprocessing.sequence import pad_sequences 24 | from keras.models import Sequential 25 | from keras.layers import Dense, Dropout, Activation 26 | from keras.layers import Embedding 27 | from keras.layers import Conv1D, GlobalMaxPooling1D 28 | 29 | #读取数据参数设置 30 | # tf.flags.DEFINE_float('dev_sample_percentage', .1, 'Percentage of the training data to use for validation') 31 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 32 | tf.flags.DEFINE_string('positive_data_file', '../dataset/weibo60000/pos60000_utf8.txt_updated', 'Data source for the positive data') 33 | tf.flags.DEFINE_string('negative_data_file', '../dataset/weibo60000/neg60000_utf8.txt_updated', 'Data source for the negative data') 34 | tf.flags.DEFINE_string('glove_dir', '../dataset/glove.6B.100d.txt', 'Data source for the pretrained glove word vector') 35 | tf.flags.DEFINE_integer('embedding_dims', '100', '随机初始化的词嵌入矩阵的维度') 36 | tf.flags.DEFINE_integer('filters', '250', 'CNN的卷积核的数目') 37 | tf.flags.DEFINE_integer('kernel_size', '3', 'CNN卷积核的大小') 38 | tf.flags.DEFINE_integer('hidden_dims', '250', '全连阶层的单元数目') 39 | tf.flags.DEFINE_integer('epochs', '5', 'The number of epoch') 40 | tf.flags.DEFINE_integer('batch_size', '64', '批量大小') 41 | 42 | 43 | # FLAGS = tf.flags.FLAGS 44 | FLAGS = tf.flags.FLAGS 45 | 46 | """从文件中读取数据和标签""" 47 | def load_data_and_label(pos_filename, neg_filename): 48 | """读取积极类别的数据""" 49 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 50 | # print(positive_texts) 51 | # positive_texts = open(positive_filename, 'rb').readlines() 52 | positive_texts = [' '.join(list(jieba.cut(line.strip()))) for line in positive_texts] 53 | print('积极句子数目:', len(positive_texts)) 54 | # print(len(positive_texts)) 55 | """读取消极类别的数据""" 56 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 57 | # negative_texts = open(positive_filename, 'rb').readlines() 58 | negative_texts = [' '.join(list(jieba.cut(line.strip()))) for line in negative_texts] 59 | print('消极句子数目:', len(negative_texts)) 60 | 61 | """拼接""" 62 | x_text = positive_texts + negative_texts 63 | # print(x_text) 64 | print('全部句子数目:', len(x_text)) 65 | 66 | """生成标签""" 67 | positive_labels = [1 for _ in negative_texts] 68 | negative_labels = [0 for _ in negative_texts] 69 | y = np.concatenate([positive_labels, negative_labels], 0) 70 | print('标签数目:', len(y)) 71 | # print(y) 72 | # for mat in y: 73 | # print(mat) 74 | return [x_text, y] 75 | 76 
| def construct_dataset(): 77 | print('加载数据......') 78 | # positive_filename = './data/rt-polaritydata/rt-polarity.pos' 79 | # negative_filename = './data/rt-polaritydata/rt-polarity.neg' 80 | # positive_filename = './data/rt-polarity.pos' 81 | # negative_filename = './data/rt-polarity.neg' 82 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file) 83 | 84 | """建立词汇表""" 85 | max_sentence_length = max([len(text.split(' ')) for text in x_text]) 86 | print('最长句子长度:', max_sentence_length) 87 | 88 | 89 | #tf.contrib.learn.preprocessing.VocabularyProcessor:生成词汇表,每一个文档/句子的长度<=max_sentnce_length,记录的是单词的位置信息 90 | # vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length) 91 | # #x:每一个句子中的单词对应词汇表的位置,word2id 92 | # x = np.array(list(vocab_processor.fit_transform(x_text))) 93 | 94 | tokenizer = Tokenizer() 95 | tokenizer.fit_on_texts(x_text) 96 | sequences = tokenizer.texts_to_sequences(x_text) 97 | 98 | word_index = tokenizer.word_index 99 | print('词表大小:', len(word_index)) 100 | 101 | x = pad_sequences(sequences, maxlen=max_sentence_length) 102 | 103 | print('词汇表建立完毕!') 104 | print('len(x):',len(x)) 105 | print('x:',x) 106 | print('x.shape:', x.shape) 107 | print('type(x):', type(x)) 108 | 109 | """随机模糊数据,即打乱各个元素的顺序,重新洗牌""" 110 | np.random.seed(10) 111 | #np.range()返回的是range object,而np.nrange()返回的是numpy.ndarray() 112 | shuffle_indices = np.random.permutation(np.arange(len(y))) 113 | x_shuffled = x[shuffle_indices] 114 | y_shuffled = y[shuffle_indices] 115 | 116 | """划分训练集/测试集,此处直接切分""" 117 | #此处加负号表示是从列表的后面开始查找对应位置 118 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 119 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 120 | # print('划分索引:', dev_sample_index) 121 | # x_train, x_dev = x_shuffled[:dev_sample_index], x[dev_sample_index:] 122 | # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 123 | 124 | """使用sklearn中的cross-validation划分数据集""" 125 | x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=10) 126 | 127 | print('数据集构造完毕,信息如下:') 128 | print('训练集样本数目:', len(x_train)) 129 | print('训练集标签数目:', len(y_train)) 130 | print('开发集样本数目:', len(x_dev)) 131 | print('开发集标签数目:', len(y_dev)) 132 | # print(type(y_dev)) 133 | 134 | del x, y, x_shuffled, y_shuffled 135 | 136 | # print('词汇表 Size:', len(vocab_processor.vocabulary_)) 137 | # print(vocab_processor.vocabulary_) 138 | 139 | print('x的数据类型:', type(x_train[1][1])) 140 | print('y的数据类型:', type(y_train[1])) 141 | 142 | # return x_train, x_dev, y_train, y_dev, vocab_processor 143 | return x_train, y_train, x_dev, y_dev, word_index 144 | 145 | if __name__ == '__main__': 146 | print('Load dataset...') 147 | x_train, y_train, x_dev, y_dev, word_index = construct_dataset() 148 | Max_Sequence_Length = x_train.shape[1] 149 | print('Max_Sequence_Length: ', Max_Sequence_Length) #202 150 | print('x_train.shape: ', np.shape(x_train)) 151 | print('y_dev.shape: ', np.shape(y_dev)) 152 | 153 | model = Sequential() 154 | 155 | # we start off with an efficient embedding layer which maps 156 | # our vocab indices into embedding_dims dimensions 157 | model.add(Embedding(len(word_index)+1, 158 | FLAGS.embedding_dims, 159 | input_length=Max_Sequence_Length)) 160 | model.add(Dropout(0.2)) 161 | 162 | # we add a Convolution1D, which will learn filters 163 | # word group filters of size filter_length: 164 | model.add(Conv1D(FLAGS.filters, 165 | FLAGS.kernel_size, 166 | padding='valid', 167 | activation='relu', 168 | 
strides=1)) 169 | # we use max pooling: 170 | model.add(GlobalMaxPooling1D()) 171 | 172 | # We add a vanilla hidden layer: 173 | model.add(Dense(FLAGS.hidden_dims)) 174 | model.add(Dropout(0.2)) 175 | model.add(Activation('relu')) 176 | 177 | # We project onto a single unit output layer, and squash it with a sigmoid: 178 | model.add(Dense(1)) 179 | model.add(Activation('sigmoid')) 180 | 181 | model.compile(loss='binary_crossentropy', 182 | optimizer='adam', 183 | metrics=['accuracy']) 184 | model.fit(x_train, y_train, 185 | batch_size=FLAGS.batch_size, 186 | epochs=FLAGS.epochs, 187 | validation_data=(x_dev, y_dev), 188 | callbacks=[tf_board_callback]) 189 | 190 | -------------------------------------------------------------------------------- /CNN_LSTM/CNNLSTM+Random+Sentiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: CNNLSTM+Random+Sentiment 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | from keras.preprocessing import sequence 14 | import numpy as np 15 | import jieba 16 | import tensorflow as tf 17 | from sklearn.model_selection import train_test_split 18 | from keras.initializers import Constant 19 | from keras.preprocessing.text import Tokenizer 20 | from keras.preprocessing.sequence import pad_sequences 21 | from keras.models import Sequential 22 | from keras.layers import Dense, Dropout, Activation 23 | from keras.layers import Embedding 24 | from keras.layers import Conv1D, MaxPooling1D, LSTM 25 | import keras 26 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1000, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 27 | 28 | #读取数据参数设置 29 | # tf.flags.DEFINE_float('dev_sample_percentage', .1, 'Percentage of the training data to use for validation') 30 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 31 | tf.flags.DEFINE_string('positive_data_file', '../dataset/weibo60000/pos60000_utf8.txt_updated', 'Data source for the positive data') 32 | tf.flags.DEFINE_string('negative_data_file', '../dataset/weibo60000/neg60000_utf8.txt_updated', 'Data source for the negative data') 33 | tf.flags.DEFINE_string('glove_dir', '../dataset/glove.6B.100d.txt', 'Data source for the pretrained glove word vector') 34 | tf.flags.DEFINE_integer('embedding_size', '100', '随机初始化的词嵌入矩阵的维度') 35 | tf.flags.DEFINE_integer('filters', '64', 'CNN的卷积核的数目') 36 | tf.flags.DEFINE_integer('kernel_size', '3', 'CNN卷积核的大小') 37 | tf.flags.DEFINE_integer('pool_size', '4', '池化窗口大小') 38 | tf.flags.DEFINE_integer('lstm_output_size', '70', '单向LSTM网络的单元数目') 39 | tf.flags.DEFINE_integer('epochs', '5', 'The number of epoch') 40 | tf.flags.DEFINE_integer('batch_size', '64', '批量大小') 41 | 42 | 43 | # FLAGS = tf.flags.FLAGS 44 | FLAGS = tf.flags.FLAGS 45 | 46 | """从文件中读取数据和标签""" 47 | def load_data_and_label(pos_filename, neg_filename): 48 | """读取积极类别的数据""" 49 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 50 | # print(positive_texts) 51 | # positive_texts = open(positive_filename, 'rb').readlines() 52 | positive_texts = [' '.join(list(jieba.cut(line.strip()))) for line in positive_texts] 53 | print('积极句子数目:', len(positive_texts)) 54 | # 
print(len(positive_texts)) 55 | """读取消极类别的数据""" 56 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 57 | # negative_texts = open(positive_filename, 'rb').readlines() 58 | negative_texts = [' '.join(list(jieba.cut(line.strip()))) for line in negative_texts] 59 | print('消极句子数目:', len(negative_texts)) 60 | 61 | """拼接""" 62 | x_text = positive_texts + negative_texts 63 | # print(x_text) 64 | print('全部句子数目:', len(x_text)) 65 | 66 | """生成标签""" 67 | positive_labels = [1 for _ in negative_texts] 68 | negative_labels = [0 for _ in negative_texts] 69 | y = np.concatenate([positive_labels, negative_labels], 0) 70 | print('标签数目:', len(y)) 71 | # print(y) 72 | # for mat in y: 73 | # print(mat) 74 | return [x_text, y] 75 | 76 | def construct_dataset(): 77 | print('加载数据......') 78 | # positive_filename = './data/rt-polaritydata/rt-polarity.pos' 79 | # negative_filename = './data/rt-polaritydata/rt-polarity.neg' 80 | # positive_filename = './data/rt-polarity.pos' 81 | # negative_filename = './data/rt-polarity.neg' 82 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file) 83 | 84 | """建立词汇表""" 85 | max_sentence_length = max([len(text.split(' ')) for text in x_text]) 86 | print('最长句子长度:', max_sentence_length) 87 | 88 | 89 | #tf.contrib.learn.preprocessing.VocabularyProcessor:生成词汇表,每一个文档/句子的长度<=max_sentnce_length,记录的是单词的位置信息 90 | # vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length) 91 | # #x:每一个句子中的单词对应词汇表的位置,word2id 92 | # x = np.array(list(vocab_processor.fit_transform(x_text))) 93 | 94 | tokenizer = Tokenizer() 95 | tokenizer.fit_on_texts(x_text) 96 | sequences = tokenizer.texts_to_sequences(x_text) 97 | 98 | word_index = tokenizer.word_index 99 | print('词表大小:', len(word_index)) 100 | 101 | x = pad_sequences(sequences, maxlen=max_sentence_length) 102 | 103 | print('词汇表建立完毕!') 104 | print('len(x):',len(x)) 105 | print('x:',x) 106 | print('x.shape:', x.shape) 107 | print('type(x):', type(x)) 108 | 109 | """随机模糊数据,即打乱各个元素的顺序,重新洗牌""" 110 | np.random.seed(10) 111 | #np.range()返回的是range object,而np.nrange()返回的是numpy.ndarray() 112 | shuffle_indices = np.random.permutation(np.arange(len(y))) 113 | x_shuffled = x[shuffle_indices] 114 | y_shuffled = y[shuffle_indices] 115 | 116 | """划分训练集/测试集,此处直接切分""" 117 | #此处加负号表示是从列表的后面开始查找对应位置 118 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 119 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 120 | # print('划分索引:', dev_sample_index) 121 | # x_train, x_dev = x_shuffled[:dev_sample_index], x[dev_sample_index:] 122 | # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 123 | 124 | """使用sklearn中的cross-validation划分数据集""" 125 | x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=10) 126 | 127 | print('数据集构造完毕,信息如下:') 128 | print('训练集样本数目:', len(x_train)) 129 | print('训练集标签数目:', len(y_train)) 130 | print('开发集样本数目:', len(x_dev)) 131 | print('开发集标签数目:', len(y_dev)) 132 | # print(type(y_dev)) 133 | 134 | del x, y, x_shuffled, y_shuffled 135 | 136 | # print('词汇表 Size:', len(vocab_processor.vocabulary_)) 137 | # print(vocab_processor.vocabulary_) 138 | 139 | print('x的数据类型:', type(x_train[1][1])) 140 | print('y的数据类型:', type(y_train[1])) 141 | 142 | # return x_train, x_dev, y_train, y_dev, vocab_processor 143 | return x_train, y_train, x_dev, y_dev, word_index 144 | 145 | if __name__ == '__main__': 146 | print('Load dataset...') 147 | x_train, y_train, x_dev, y_dev, word_index = 
construct_dataset() 148 | Max_Sequence_Length = x_train.shape[1] 149 | print('Max_Sequence_Length: ', Max_Sequence_Length) #202 150 | print('x_train.shape: ', np.shape(x_train)) 151 | print('y_dev.shape: ', np.shape(y_dev)) 152 | 153 | model = Sequential() 154 | 155 | model = Sequential() 156 | model.add(Embedding(len(word_index)+1, FLAGS.embedding_size, input_length=Max_Sequence_Length)) 157 | model.add(Dropout(0.25)) 158 | model.add(Conv1D(FLAGS.filters, 159 | FLAGS.kernel_size, 160 | padding='valid', 161 | activation='relu', 162 | strides=1)) 163 | model.add(MaxPooling1D(pool_size=FLAGS.pool_size)) 164 | model.add(LSTM(FLAGS.lstm_output_size)) 165 | model.add(Dense(1)) 166 | model.add(Activation('sigmoid')) 167 | 168 | model.compile(loss='binary_crossentropy', 169 | optimizer='adam', 170 | metrics=['accuracy']) 171 | 172 | print('Train...') 173 | model.fit(x_train, y_train, 174 | batch_size=FLAGS.batch_size, 175 | epochs=FLAGS.epochs, 176 | validation_data=(x_dev, y_dev), 177 | callbacks=[tf_board_callback]) 178 | score, acc = model.evaluate(x_dev, y_dev, batch_size=FLAGS.batch_size) 179 | print('Test score:', score) 180 | print('Test accuracy:', acc) 181 | 182 | 183 | -------------------------------------------------------------------------------- /CNN_LSTM/CNNLSTM+glove+Sentiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: CNNLSTM+glove+Sentiment 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | from keras.preprocessing import sequence 14 | import numpy as np 15 | import jieba 16 | import tensorflow as tf 17 | from sklearn.model_selection import train_test_split 18 | from keras.initializers import Constant 19 | from keras.preprocessing.text import Tokenizer 20 | from keras.preprocessing.sequence import pad_sequences 21 | from keras.models import Sequential 22 | from keras.layers import Dense, Dropout, Activation 23 | from keras.layers import Embedding, Input 24 | from keras.layers import Conv1D, MaxPooling1D, LSTM 25 | from keras.models import Model 26 | import keras 27 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1000, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 28 | 29 | #读取数据参数设置 30 | # tf.flags.DEFINE_float('dev_sample_percentage', .1, 'Percentage of the training data to use for validation') 31 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 32 | tf.flags.DEFINE_string('positive_data_file', '../dataset/weibo60000/pos60000_utf8.txt_updated', 'Data source for the positive data') 33 | tf.flags.DEFINE_string('negative_data_file', '../dataset/weibo60000/neg60000_utf8.txt_updated', 'Data source for the negative data') 34 | tf.flags.DEFINE_string('glove_dir', '../dataset/glove.6B.100d.txt', 'Data source for the pretrained glove word vector') 35 | tf.flags.DEFINE_integer('embedding_size', '100', '随机初始化的词嵌入矩阵的维度') 36 | tf.flags.DEFINE_integer('filters', '64', 'CNN的卷积核的数目') 37 | tf.flags.DEFINE_integer('kernel_size', '3', 'CNN卷积核的大小') 38 | tf.flags.DEFINE_integer('pool_size', '4', '池化窗口大小') 39 | tf.flags.DEFINE_integer('lstm_output_size', '70', '单向LSTM网络的单元数目') 40 | tf.flags.DEFINE_integer('epochs', '5', 'The 
number of epoch') 41 | tf.flags.DEFINE_integer('batch_size', '64', '批量大小') 42 | 43 | 44 | # FLAGS = tf.flags.FLAGS 45 | FLAGS = tf.flags.FLAGS 46 | 47 | """从文件中读取数据和标签""" 48 | def load_data_and_label(pos_filename, neg_filename): 49 | """读取积极类别的数据""" 50 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 51 | # print(positive_texts) 52 | # positive_texts = open(positive_filename, 'rb').readlines() 53 | positive_texts = [' '.join(list(jieba.cut(line.strip()))) for line in positive_texts] 54 | print('积极句子数目:', len(positive_texts)) 55 | # print(len(positive_texts)) 56 | """读取消极类别的数据""" 57 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 58 | # negative_texts = open(positive_filename, 'rb').readlines() 59 | negative_texts = [' '.join(list(jieba.cut(line.strip()))) for line in negative_texts] 60 | print('消极句子数目:', len(negative_texts)) 61 | 62 | """拼接""" 63 | x_text = positive_texts + negative_texts 64 | # print(x_text) 65 | print('全部句子数目:', len(x_text)) 66 | 67 | """生成标签""" 68 | positive_labels = [1 for _ in negative_texts] 69 | negative_labels = [0 for _ in negative_texts] 70 | y = np.concatenate([positive_labels, negative_labels], 0) 71 | print('标签数目:', len(y)) 72 | # print(y) 73 | # for mat in y: 74 | # print(mat) 75 | return [x_text, y] 76 | 77 | def construct_dataset(): 78 | print('加载数据......') 79 | # positive_filename = './data/rt-polaritydata/rt-polarity.pos' 80 | # negative_filename = './data/rt-polaritydata/rt-polarity.neg' 81 | # positive_filename = './data/rt-polarity.pos' 82 | # negative_filename = './data/rt-polarity.neg' 83 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file) 84 | 85 | """建立词汇表""" 86 | max_sentence_length = max([len(text.split(' ')) for text in x_text]) 87 | print('最长句子长度:', max_sentence_length) 88 | 89 | 90 | #tf.contrib.learn.preprocessing.VocabularyProcessor:生成词汇表,每一个文档/句子的长度<=max_sentnce_length,记录的是单词的位置信息 91 | # vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length) 92 | # #x:每一个句子中的单词对应词汇表的位置,word2id 93 | # x = np.array(list(vocab_processor.fit_transform(x_text))) 94 | 95 | tokenizer = Tokenizer() 96 | tokenizer.fit_on_texts(x_text) 97 | sequences = tokenizer.texts_to_sequences(x_text) 98 | 99 | word_index = tokenizer.word_index 100 | print('词表大小:', len(word_index)) 101 | 102 | x = pad_sequences(sequences, maxlen=max_sentence_length) 103 | 104 | print('词汇表建立完毕!') 105 | print('len(x):',len(x)) 106 | print('x:',x) 107 | print('x.shape:', x.shape) 108 | print('type(x):', type(x)) 109 | 110 | """随机模糊数据,即打乱各个元素的顺序,重新洗牌""" 111 | np.random.seed(10) 112 | #np.range()返回的是range object,而np.nrange()返回的是numpy.ndarray() 113 | shuffle_indices = np.random.permutation(np.arange(len(y))) 114 | x_shuffled = x[shuffle_indices] 115 | y_shuffled = y[shuffle_indices] 116 | 117 | """划分训练集/测试集,此处直接切分""" 118 | #此处加负号表示是从列表的后面开始查找对应位置 119 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 120 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 121 | # print('划分索引:', dev_sample_index) 122 | # x_train, x_dev = x_shuffled[:dev_sample_index], x[dev_sample_index:] 123 | # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 124 | 125 | """使用sklearn中的cross-validation划分数据集""" 126 | x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=10) 127 | 128 | print('数据集构造完毕,信息如下:') 129 | print('训练集样本数目:', len(x_train)) 130 | print('训练集标签数目:', len(y_train)) 131 | print('开发集样本数目:', len(x_dev)) 132 | 
print('开发集标签数目:', len(y_dev)) 133 | # print(type(y_dev)) 134 | 135 | del x, y, x_shuffled, y_shuffled 136 | 137 | # print('词汇表 Size:', len(vocab_processor.vocabulary_)) 138 | # print(vocab_processor.vocabulary_) 139 | 140 | print('x的数据类型:', type(x_train[1][1])) 141 | print('y的数据类型:', type(y_train[1])) 142 | 143 | # return x_train, x_dev, y_train, y_dev, vocab_processor 144 | return x_train, y_train, x_dev, y_dev, word_index 145 | 146 | def load_glove_model(): 147 | print('Indexing word vectors.') 148 | 149 | embeddings_index = {} 150 | with open(FLAGS.glove_dir, 'r', encoding='utf-8') as f: 151 | for line in f: 152 | values = line.split() 153 | word = values[0] 154 | coefs = np.asarray(values[1:], dtype='float32') 155 | embeddings_index[word] = coefs 156 | 157 | print('Found %s word vectors.' % len(embeddings_index)) 158 | return embeddings_index 159 | 160 | def word_embedding(Max_Sequence_Length, embedding_dim, word_index, embeddings_index): 161 | # prepare embedding matrix 162 | # num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 163 | num_words = len(word_index) + 1 164 | word_index = word_index 165 | embedding_matrix = np.zeros((num_words, embedding_dim)) 166 | for word, i in word_index.items(): 167 | # if i > MAX_NUM_WORDS: 168 | # continue 169 | embedding_vector = embeddings_index.get(word) 170 | if embedding_vector is not None: 171 | # words not found in embedding index will be all-zeros. 172 | embedding_matrix[i] = embedding_vector 173 | 174 | # load pre-trained word embeddings into an Embedding layer 175 | # note that we set trainable = False so as to keep the embeddings fixed 176 | embedding_layer = Embedding(num_words, 177 | embedding_dim, 178 | embeddings_initializer=Constant(embedding_matrix), 179 | input_length=Max_Sequence_Length, 180 | trainable=False) 181 | return embedding_layer 182 | 183 | if __name__ == '__main__': 184 | print('Load dataset...') 185 | x_train, y_train, x_dev, y_dev, word_index = construct_dataset() 186 | Max_Sequence_Length = x_train.shape[1] 187 | print('Max_Sequence_Length: ', Max_Sequence_Length) #202 188 | print('x_train.shape: ', np.shape(x_train)) 189 | print('y_dev.shape: ', np.shape(y_dev)) 190 | 191 | print('Load glove word vector...') 192 | embeddings_index = load_glove_model() 193 | embedding_dim = 100 194 | # print(word_index) 195 | 196 | embedding_layer = word_embedding(Max_Sequence_Length, embedding_dim, word_index, embeddings_index) 197 | 198 | sequence_input = Input(shape=(Max_Sequence_Length,), dtype=tf.int32) 199 | embeddings = embedding_layer(sequence_input) 200 | 201 | x = Dropout(0.25)(embeddings) 202 | x= Conv1D(FLAGS.filters, 203 | FLAGS.kernel_size, 204 | padding='valid', 205 | activation='relu', 206 | strides=1)(x) 207 | x = MaxPooling1D(pool_size=FLAGS.pool_size)(x) 208 | x = LSTM(FLAGS.lstm_output_size)(x) 209 | x = Dense(1)(x) 210 | preds = Activation('sigmoid')(x) 211 | 212 | model = Model(sequence_input, preds) 213 | 214 | model.compile(loss='binary_crossentropy', 215 | optimizer='adam', 216 | metrics=['accuracy']) 217 | 218 | print('Train...') 219 | model.fit(x_train, y_train, 220 | batch_size=FLAGS.batch_size, 221 | epochs=FLAGS.epochs, 222 | validation_data=(x_dev, y_dev), 223 | callbacks=[tf_board_callback]) 224 | score, acc = model.evaluate(x_dev, y_dev, batch_size=FLAGS.batch_size) 225 | print('Test score:', score) 226 | print('Test accuracy:', acc) 227 | -------------------------------------------------------------------------------- /LSTM/LSTM+glove+Sentiment.py: 
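None of the training scripts above show how a fitted model is applied to new text. The sketch below is a usage example under stated assumptions, not part of the original code: `tokenizer` stands for the Tokenizer fitted in construct_dataset, `model` for one of the trained Keras models, and `max_sentence_length` for the padding length used during training; the scripts keep all three local, so these names are placeholders.

# Hedged usage sketch (assumed names: tokenizer, model, max_sentence_length).
import jieba
from keras.preprocessing.sequence import pad_sequences

def predict_sentiment(raw_texts, tokenizer, model, max_sentence_length):
    # Segment with jieba exactly as at training time, map to indices, pad, and score.
    segmented = [' '.join(jieba.cut(t.strip())) for t in raw_texts]
    sequences = tokenizer.texts_to_sequences(segmented)
    padded = pad_sequences(sequences, maxlen=max_sentence_length)
    # Sigmoid scores: values near 1 -> positive, near 0 -> negative.
    return model.predict(padded)

# Example:
# scores = predict_sentiment(['今天心情很好', '这个产品太差了'], tokenizer, model, 202)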
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: LSTM+glove+Sentiment 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | 14 | import numpy as np 15 | import jieba 16 | import re 17 | from tensorflow.contrib import learn 18 | import tensorflow as tf 19 | from sklearn.model_selection import train_test_split 20 | import os 21 | import keras 22 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1000, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 23 | from keras.initializers import Constant 24 | from keras.preprocessing.text import Tokenizer 25 | from keras.preprocessing.sequence import pad_sequences 26 | from keras.layers import Dense, Input, GlobalMaxPooling1D 27 | from keras.layers import Conv1D, MaxPooling1D, Embedding 28 | from keras.models import Model 29 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional 30 | 31 | 32 | #读取数据参数设置 33 | # tf.flags.DEFINE_float('dev_sample_percentage', .1, 'Percentage of the training data to use for validation') 34 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 35 | tf.flags.DEFINE_string('positive_data_file', '../dataset/weibo60000/pos60000_utf8.txt_updated', 'Data source for the positive data') 36 | tf.flags.DEFINE_string('negative_data_file', '../dataset/weibo60000/neg60000_utf8.txt_updated', 'Data source for the negative data') 37 | tf.flags.DEFINE_string('glove_dir', '../dataset/glove.6B.100d.txt', 'Data source for the pretrained glove word vector') 38 | tf.flags.DEFINE_integer('max_num_words', '40000', '出现频率最高的40000个词语保留在词表中') 39 | tf.flags.DEFINE_integer('embedding_dim', '100', 'embedding矩阵的维度') 40 | 41 | # FLAGS = tf.flags.FLAGS 42 | FLAGS = tf.flags.FLAGS 43 | 44 | """从文件中读取数据和标签""" 45 | def load_data_and_label(pos_filename, neg_filename): 46 | """读取积极类别的数据""" 47 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 48 | # print(positive_texts) 49 | # positive_texts = open(positive_filename, 'rb').readlines() 50 | positive_texts = [' '.join(list(jieba.cut(line.strip()))) for line in positive_texts] 51 | print('积极句子数目:', len(positive_texts)) 52 | # print(len(positive_texts)) 53 | """读取消极类别的数据""" 54 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 55 | # negative_texts = open(positive_filename, 'rb').readlines() 56 | negative_texts = [' '.join(list(jieba.cut(line.strip()))) for line in negative_texts] 57 | print('消极句子数目:', len(negative_texts)) 58 | 59 | """拼接""" 60 | x_text = positive_texts + negative_texts 61 | # print(x_text) 62 | print('全部句子数目:', len(x_text)) 63 | 64 | """生成标签""" 65 | positive_labels = [1 for _ in negative_texts] 66 | negative_labels = [0 for _ in negative_texts] 67 | y = np.concatenate([positive_labels, negative_labels], 0) 68 | print('标签数目:', len(y)) 69 | # print(y) 70 | # for mat in y: 71 | # print(mat) 72 | return [x_text, y] 73 | 74 | def construct_dataset(): 75 | print('加载数据......') 76 | # positive_filename = './data/rt-polaritydata/rt-polarity.pos' 77 | # negative_filename = './data/rt-polaritydata/rt-polarity.neg' 78 | # positive_filename = './data/rt-polarity.pos' 79 | # negative_filename = 
'./data/rt-polarity.neg' 80 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file) 81 | 82 | """建立词汇表""" 83 | max_sentence_length = max([len(text.split(' ')) for text in x_text]) 84 | print('最长句子长度:', max_sentence_length) 85 | 86 | 87 | #tf.contrib.learn.preprocessing.VocabularyProcessor:生成词汇表,每一个文档/句子的长度<=max_sentnce_length,记录的是单词的位置信息 88 | # vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length) 89 | # #x:每一个句子中的单词对应词汇表的位置,word2id 90 | # x = np.array(list(vocab_processor.fit_transform(x_text))) 91 | 92 | tokenizer = Tokenizer() 93 | tokenizer.fit_on_texts(x_text) 94 | sequences = tokenizer.texts_to_sequences(x_text) 95 | 96 | word_index = tokenizer.word_index 97 | print('词表大小:', len(word_index)) 98 | 99 | x = pad_sequences(sequences, maxlen=max_sentence_length) 100 | 101 | print('词汇表建立完毕!') 102 | print('len(x):',len(x)) 103 | print('x:',x) 104 | print('x.shape:', x.shape) 105 | print('type(x):', type(x)) 106 | 107 | """随机模糊数据,即打乱各个元素的顺序,重新洗牌""" 108 | np.random.seed(10) 109 | #np.range()返回的是range object,而np.nrange()返回的是numpy.ndarray() 110 | shuffle_indices = np.random.permutation(np.arange(len(y))) 111 | x_shuffled = x[shuffle_indices] 112 | y_shuffled = y[shuffle_indices] 113 | 114 | """划分训练集/测试集,此处直接切分""" 115 | #此处加负号表示是从列表的后面开始查找对应位置 116 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 117 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 118 | # print('划分索引:', dev_sample_index) 119 | # x_train, x_dev = x_shuffled[:dev_sample_index], x[dev_sample_index:] 120 | # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 121 | 122 | """使用sklearn中的cross-validation划分数据集""" 123 | x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=10) 124 | 125 | print('数据集构造完毕,信息如下:') 126 | print('训练集样本数目:', len(x_train)) 127 | print('训练集标签数目:', len(y_train)) 128 | print('开发集样本数目:', len(x_dev)) 129 | print('开发集标签数目:', len(y_dev)) 130 | # print(type(y_dev)) 131 | 132 | del x, y, x_shuffled, y_shuffled 133 | 134 | # print('词汇表 Size:', len(vocab_processor.vocabulary_)) 135 | # print(vocab_processor.vocabulary_) 136 | 137 | print('x的数据类型:', type(x_train[1][1])) 138 | print('y的数据类型:', type(y_train[1])) 139 | 140 | # return x_train, x_dev, y_train, y_dev, vocab_processor 141 | return x_train, y_train, x_dev, y_dev, word_index 142 | 143 | def load_glove_model(): 144 | print('Indexing word vectors.') 145 | 146 | embeddings_index = {} 147 | with open(FLAGS.glove_dir, 'r', encoding='utf-8') as f: 148 | for line in f: 149 | values = line.split() 150 | word = values[0] 151 | coefs = np.asarray(values[1:], dtype='float32') 152 | embeddings_index[word] = coefs 153 | 154 | print('Found %s word vectors.' % len(embeddings_index)) 155 | return embeddings_index 156 | 157 | def word_embedding(Max_Sequence_Length, embedding_dim, word_index, embeddings_index): 158 | # prepare embedding matrix 159 | # num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 160 | num_words = len(word_index) + 1 161 | word_index = word_index 162 | embedding_matrix = np.zeros((num_words, embedding_dim)) 163 | for word, i in word_index.items(): 164 | # if i > MAX_NUM_WORDS: 165 | # continue 166 | embedding_vector = embeddings_index.get(word) 167 | if embedding_vector is not None: 168 | # words not found in embedding index will be all-zeros. 
169 | embedding_matrix[i] = embedding_vector 170 | 171 | # load pre-trained word embeddings into an Embedding layer 172 | # note that we set trainable = False so as to keep the embeddings fixed 173 | embedding_layer = Embedding(num_words, 174 | embedding_dim, 175 | embeddings_initializer=Constant(embedding_matrix), 176 | input_length=Max_Sequence_Length, 177 | trainable=False) 178 | return embedding_layer 179 | 180 | 181 | if __name__ == '__main__': 182 | print('Load dataset...') 183 | x_train, y_train, x_dev, y_dev, word_index = construct_dataset() 184 | Max_Sequence_Length = x_train.shape[1] 185 | print('Max_Sequence_Length: ', Max_Sequence_Length) #202 186 | print('x_train.shape: ', np.shape(x_train)) 187 | print('y_dev.shape: ', np.shape(y_dev)) 188 | print('Load glove word vector...') 189 | embeddings_index = load_glove_model() 190 | embedding_dim = FLAGS.embedding_dim 191 | # print(word_index) 192 | 193 | embedding_layer = word_embedding(Max_Sequence_Length, embedding_dim, word_index, embeddings_index) 194 | 195 | sequence_input = Input(shape=(Max_Sequence_Length,), dtype=tf.int32) 196 | embeddings = embedding_layer(sequence_input) 197 | 198 | x = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(embeddings) 199 | x = Dropout(0.5)(x) 200 | preds = Dense(1, activation='sigmoid')(x) 201 | model = Model(sequence_input, preds) 202 | 203 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 204 | 205 | print('Train...') 206 | model.fit(x_train, y_train, batch_size=64, validation_data=[x_dev, y_dev], epochs=5, callbacks=[tf_board_callback]) 207 | 208 | -------------------------------------------------------------------------------- /LSTM/LSTM+random+Sentiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: LSTM+random+Sentiment 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/25: 11 | ------------------------------------------------- 12 | """ 13 | 14 | import numpy as np 15 | import jieba 16 | import tensorflow as tf 17 | from sklearn.model_selection import train_test_split 18 | import os 19 | import keras 20 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1000, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 21 | 22 | from keras.initializers import Constant 23 | from keras.preprocessing.text import Tokenizer 24 | from keras.preprocessing.sequence import pad_sequences 25 | import numpy as np 26 | 27 | from keras.preprocessing import sequence 28 | from keras.models import Sequential 29 | from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional 30 | from keras.datasets import imdb 31 | 32 | 33 | #读取数据参数设置 34 | # tf.flags.DEFINE_float('dev_sample_percentage', .1, 'Percentage of the training data to use for validation') 35 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 36 | tf.flags.DEFINE_string('positive_data_file', '../dataset//weibo60000/pos60000_utf8.txt_updated', 'Data source for the positive data') 37 | tf.flags.DEFINE_string('negative_data_file', '../dataset/weibo60000/neg60000_utf8.txt_updated', 'Data source for the negative data') 38 | tf.flags.DEFINE_string('glove_dir', '../dataset/glove.6B.100d.txt', 'Data source for the pretrained glove word 
vector') 39 | tf.flags.DEFINE_integer('max_num_words', '40000', '出现频率最高的40000个词语保留在词表中') 40 | 41 | # FLAGS = tf.flags.FLAGS 42 | FLAGS = tf.flags.FLAGS 43 | 44 | """从文件中读取数据和标签""" 45 | def load_data_and_label(pos_filename, neg_filename): 46 | """读取积极类别的数据""" 47 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 48 | # print(positive_texts) 49 | # positive_texts = open(positive_filename, 'rb').readlines() 50 | positive_texts = [' '.join(list(jieba.cut(line.strip()))) for line in positive_texts] 51 | print('积极句子数目:', len(positive_texts)) 52 | # print(len(positive_texts)) 53 | """读取消极类别的数据""" 54 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 55 | # negative_texts = open(positive_filename, 'rb').readlines() 56 | negative_texts = [' '.join(list(jieba.cut(line.strip()))) for line in negative_texts] 57 | print('消极句子数目:', len(negative_texts)) 58 | 59 | """拼接""" 60 | x_text = positive_texts + negative_texts 61 | # print(x_text) 62 | print('全部句子数目:', len(x_text)) 63 | 64 | """生成标签""" 65 | positive_labels = [1 for _ in negative_texts] 66 | negative_labels = [0 for _ in negative_texts] 67 | y = np.concatenate([positive_labels, negative_labels], 0) 68 | print('标签数目:', len(y)) 69 | # print(y) 70 | # for mat in y: 71 | # print(mat) 72 | return [x_text, y] 73 | 74 | def construct_dataset(): 75 | print('加载数据......') 76 | # positive_filename = './data/rt-polaritydata/rt-polarity.pos' 77 | # negative_filename = './data/rt-polaritydata/rt-polarity.neg' 78 | # positive_filename = './data/rt-polarity.pos' 79 | # negative_filename = './data/rt-polarity.neg' 80 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file) 81 | 82 | """建立词汇表""" 83 | max_sentence_length = max([len(text.split(' ')) for text in x_text]) 84 | print('最长句子长度:', max_sentence_length) 85 | 86 | 87 | #tf.contrib.learn.preprocessing.VocabularyProcessor:生成词汇表,每一个文档/句子的长度<=max_sentnce_length,记录的是单词的位置信息 88 | # vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length) 89 | # #x:每一个句子中的单词对应词汇表的位置,word2id 90 | # x = np.array(list(vocab_processor.fit_transform(x_text))) 91 | 92 | tokenizer = Tokenizer() 93 | tokenizer.fit_on_texts(x_text) 94 | sequences = tokenizer.texts_to_sequences(x_text) 95 | 96 | word_index = tokenizer.word_index 97 | print('词表大小:', len(word_index)) 98 | 99 | x = pad_sequences(sequences, maxlen=max_sentence_length) 100 | 101 | print('词汇表建立完毕!') 102 | print('len(x):',len(x)) 103 | print('x:',x) 104 | print('x.shape:', x.shape) 105 | print('type(x):', type(x)) 106 | 107 | """随机模糊数据,即打乱各个元素的顺序,重新洗牌""" 108 | np.random.seed(10) 109 | #np.range()返回的是range object,而np.nrange()返回的是numpy.ndarray() 110 | shuffle_indices = np.random.permutation(np.arange(len(y))) 111 | x_shuffled = x[shuffle_indices] 112 | y_shuffled = y[shuffle_indices] 113 | 114 | """划分训练集/测试集,此处直接切分""" 115 | #此处加负号表示是从列表的后面开始查找对应位置 116 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 117 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 118 | # print('划分索引:', dev_sample_index) 119 | # x_train, x_dev = x_shuffled[:dev_sample_index], x[dev_sample_index:] 120 | # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 121 | 122 | """使用sklearn中的cross-validation划分数据集""" 123 | x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=10) 124 | 125 | print('数据集构造完毕,信息如下:') 126 | print('训练集样本数目:', len(x_train)) 127 | print('训练集标签数目:', len(y_train)) 128 | print('开发集样本数目:', len(x_dev)) 
129 | print('开发集标签数目:', len(y_dev)) 130 | # print(type(y_dev)) 131 | 132 | del x, y, x_shuffled, y_shuffled 133 | 134 | # print('词汇表 Size:', len(vocab_processor.vocabulary_)) 135 | # print(vocab_processor.vocabulary_) 136 | 137 | print('x的数据类型:', type(x_train[1][1])) 138 | print('y的数据类型:', type(y_train[1])) 139 | 140 | # return x_train, x_dev, y_train, y_dev, vocab_processor 141 | return x_train, y_train, x_dev, y_dev, word_index 142 | 143 | if __name__ == '__main__': 144 | print('Load dataset...') 145 | x_train, y_train, x_dev, y_dev, word_index = construct_dataset() 146 | Max_Sequence_Length = x_train.shape[1] 147 | print('Max_Sequence_Length: ', Max_Sequence_Length) #202 148 | print('x_train.shape: ', np.shape(x_train)) 149 | print('y_dev.shape: ', np.shape(y_dev)) 150 | 151 | max_features = FLAGS.max_num_words 152 | # cut texts after this number of words 153 | # (among top max_features most common words) 154 | maxlen = Max_Sequence_Length 155 | batch_size = 32 156 | 157 | model = Sequential() 158 | model.add(Embedding(len(word_index)+1, 128, input_length=maxlen)) 159 | model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 160 | model.add(Dropout(0.5)) 161 | model.add(Dense(1, activation='sigmoid')) 162 | 163 | # try using different optimizers and different optimizer configs 164 | model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) 165 | 166 | print('Train...') 167 | model.fit(x_train, y_train, 168 | batch_size=batch_size, 169 | epochs=4, 170 | validation_data=[x_dev, y_dev], 171 | callbacks=[tf_board_callback]) 172 | -------------------------------------------------------------------------------- /Transformer_ATT/Transformer_ATT_sentiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: test 5 | Description : 6 | Author : Stephen.Lau 7 | date: 2019/3/22 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/22: 11 | ------------------------------------------------- 12 | """ 13 | import numpy as np 14 | import jieba 15 | import re 16 | from tensorflow.contrib import learn 17 | import tensorflow as tf 18 | from sklearn.model_selection import train_test_split 19 | import os 20 | import keras 21 | tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1000, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) 22 | 23 | from keras.models import Model 24 | from keras.layers import Embedding, Input, GlobalAveragePooling1D, Dropout, Dense 25 | from keras.initializers import Constant 26 | from keras.preprocessing.text import Tokenizer 27 | from keras.preprocessing.sequence import pad_sequences 28 | from Transformer_Attention import Position_Embedding, Attention 29 | 30 | #读取数据参数设置 31 | # tf.flags.DEFINE_float('dev_sample_percentage', .1, 'Percentage of the training data to use for validation') 32 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 33 | tf.flags.DEFINE_string('positive_data_file', '../dataset/weibo60000/pos60000_utf8.txt_updated', 'Data source for the positive data') 34 | tf.flags.DEFINE_string('negative_data_file', '../dataset//weibo60000/neg60000_utf8.txt_updated', 'Data source for the negative data') 35 | tf.flags.DEFINE_string('glove_dir', '../dataset/glove.6B.100d.txt', 'Data source for the pretrained glove word vector') 36 | 
tf.flags.DEFINE_integer('max_num_words', 40000, 'Keep only the 40000 most frequent words in the vocabulary') 37 | 38 | # FLAGS = tf.flags.FLAGS 39 | FLAGS = tf.flags.FLAGS 40 | 41 | """Read data and labels from the files""" 42 | def load_data_and_label(pos_filename, neg_filename): 43 | """Read the positive-class data""" 44 | positive_texts = open(pos_filename, 'r', encoding='utf-8').readlines() 45 | # print(positive_texts) 46 | # positive_texts = open(positive_filename, 'rb').readlines() 47 | positive_texts = [' '.join(list(jieba.cut(line.strip()))) for line in positive_texts] 48 | print('Number of positive sentences:', len(positive_texts)) 49 | # print(len(positive_texts)) 50 | """Read the negative-class data""" 51 | negative_texts = open(neg_filename, 'r', encoding='utf-8').readlines() 52 | # negative_texts = open(positive_filename, 'rb').readlines() 53 | negative_texts = [' '.join(list(jieba.cut(line.strip()))) for line in negative_texts] 54 | print('Number of negative sentences:', len(negative_texts)) 55 | 56 | """Concatenate the two classes""" 57 | x_text = positive_texts + negative_texts 58 | # print(x_text) 59 | print('Total number of sentences:', len(x_text)) 60 | 61 | """Generate the labels""" 62 | positive_labels = [1 for _ in positive_texts] 63 | negative_labels = [0 for _ in negative_texts] 64 | y = np.concatenate([positive_labels, negative_labels], 0) 65 | print('Number of labels:', len(y)) 66 | # print(y) 67 | # for mat in y: 68 | # print(mat) 69 | return [x_text, y] 70 | 71 | def construct_dataset(): 72 | print('Loading data......') 73 | # positive_filename = './data/rt-polaritydata/rt-polarity.pos' 74 | # negative_filename = './data/rt-polaritydata/rt-polarity.neg' 75 | # positive_filename = './data/rt-polarity.pos' 76 | # negative_filename = './data/rt-polarity.neg' 77 | x_text, y = load_data_and_label(FLAGS.positive_data_file, FLAGS.negative_data_file) 78 | 79 | """Build the vocabulary""" 80 | max_sentence_length = max([len(text.split(' ')) for text in x_text]) 81 | print('Longest sentence length:', max_sentence_length) 82 | 83 | 84 | # tf.contrib.learn.preprocessing.VocabularyProcessor: builds a vocabulary; every document/sentence is mapped to word indices with length <= max_sentence_length 85 | # vocab_processor = learn.preprocessing.VocabularyProcessor(max_sentence_length) 86 | # # x: every word of a sentence mapped to its index in the vocabulary (word2id) 87 | # x = np.array(list(vocab_processor.fit_transform(x_text))) 88 | 89 | tokenizer = Tokenizer() 90 | tokenizer.fit_on_texts(x_text) 91 | sequences = tokenizer.texts_to_sequences(x_text) 92 | 93 | word_index = tokenizer.word_index 94 | print('Vocabulary size:', len(word_index)) 95 | 96 | x = pad_sequences(sequences, maxlen=max_sentence_length) 97 | 98 | print('Vocabulary built!') 99 | print('len(x):', len(x)) 100 | print('x:', x) 101 | print('x.shape:', x.shape) 102 | print('type(x):', type(x)) 103 | 104 | """Shuffle the data, i.e. randomly permute the order of the samples""" 105 | np.random.seed(10) 106 | # range() returns a range object, while np.arange() returns a numpy.ndarray 107 | shuffle_indices = np.random.permutation(np.arange(len(y))) 108 | x_shuffled = x[shuffle_indices] 109 | y_shuffled = y[shuffle_indices] 110 | 111 | """Split train/dev set, here by slicing directly""" 112 | # The negative index means counting from the end of the list 113 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 114 | # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) 115 | # print('Split index:', dev_sample_index) 116 | # x_train, x_dev = x_shuffled[:dev_sample_index], x[dev_sample_index:] 117 | # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 118 | 119 | """Split the dataset with sklearn's train_test_split""" 120 | x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1, random_state=10) 121 | 122 | print('Dataset constructed, summary:') 123 | print('Number of training samples:', len(x_train)) 124 | print('Number of training labels:', len(y_train)) 125 | print('Number of dev samples:', len(x_dev)) 126 |
print('Number of dev labels:', len(y_dev)) 127 | # print(type(y_dev)) 128 | 129 | del x, y, x_shuffled, y_shuffled 130 | 131 | # print('Vocabulary size:', len(vocab_processor.vocabulary_)) 132 | # print(vocab_processor.vocabulary_) 133 | 134 | print('dtype of x:', type(x_train[1][1])) 135 | print('dtype of y:', type(y_train[1])) 136 | 137 | # return x_train, x_dev, y_train, y_dev, vocab_processor 138 | return x_train, y_train, x_dev, y_dev, word_index 139 | 140 | def load_glove_model(): 141 | print('Indexing word vectors.') 142 | 143 | embeddings_index = {} 144 | with open(FLAGS.glove_dir, 'r', encoding='utf-8') as f: 145 | for line in f: 146 | values = line.split() 147 | word = values[0] 148 | coefs = np.asarray(values[1:], dtype='float32') 149 | embeddings_index[word] = coefs 150 | 151 | print('Found %s word vectors.' % len(embeddings_index)) 152 | return embeddings_index 153 | 154 | def word_embedding(Max_Sequence_Length, embedding_dim, word_index): 155 | # prepare embedding matrix 156 | # num_words = min(MAX_NUM_WORDS, len(word_index)) + 1 157 | num_words = len(word_index) + 1 158 | word_index = word_index 159 | embedding_matrix = np.zeros((num_words, embedding_dim)) 160 | for word, i in word_index.items(): 161 | # if i > MAX_NUM_WORDS: 162 | # continue 163 | embedding_vector = embeddings_index.get(word) 164 | if embedding_vector is not None: 165 | # words not found in embedding index will be all-zeros. 166 | embedding_matrix[i] = embedding_vector 167 | 168 | # load pre-trained word embeddings into an Embedding layer 169 | # note that we set trainable = False so as to keep the embeddings fixed 170 | embedding_layer = Embedding(num_words, 171 | embedding_dim, 172 | embeddings_initializer=Constant(embedding_matrix), 173 | input_length=Max_Sequence_Length, 174 | trainable=False) 175 | return embedding_layer 176 | 177 | 178 | if __name__ == '__main__': 179 | print('Load dataset...') 180 | x_train, y_train, x_dev, y_dev, word_index = construct_dataset() 181 | Max_Sequence_Length = x_train.shape[1] 182 | print('Max_Sequence_Length: ', Max_Sequence_Length) 183 | print('x_train.shape: ', np.shape(x_train)) 184 | print('y_dev.shape: ', np.shape(y_dev)) 185 | print('Load glove word vector...') 186 | embeddings_index = load_glove_model() 187 | embedding_dim = 100 188 | # print(word_index) 189 | 190 | embedding_layer = word_embedding(Max_Sequence_Length, embedding_dim, word_index) 191 | 192 | sequence_input = Input(shape=(Max_Sequence_Length,), dtype=tf.int32) 193 | embeddings = embedding_layer(sequence_input) 194 | 195 | embeddings = Position_Embedding()(embeddings) # adding Position_Embedding gives a small accuracy improvement 196 | O_seq = Attention(8, 16)([embeddings, embeddings, embeddings]) 197 | O_seq = GlobalAveragePooling1D()(O_seq) 198 | O_seq = Dropout(0.5)(O_seq) 199 | outputs = Dense(1, activation='sigmoid')(O_seq) 200 | 201 | model = Model(inputs=sequence_input, outputs=outputs) 202 | # try using different optimizers and different optimizer configs 203 | model.compile(loss='binary_crossentropy', 204 | optimizer='adam', 205 | metrics=['accuracy']) 206 | 207 | print('Train...') 208 | 209 | model.fit(x_train, y_train, 210 | batch_size=128, 211 | epochs=50, 212 | validation_data=(x_dev, y_dev), 213 | callbacks=[tf_board_callback]) 214 | -------------------------------------------------------------------------------- /Transformer_ATT/Transformer_Attention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: 
Transformer_Attention 5 | Description : Implementation of the attention mechanism used in the Transformer network 6 | Author : Stephen.Lau 7 | date: 2019/3/22 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/3/22: 11 | ------------------------------------------------- 12 | """ 13 | 14 | from keras import backend as K 15 | from keras.engine.topology import Layer 16 | 17 | 18 | class Position_Embedding(Layer): 19 | 20 | def __init__(self, size=None, mode='sum', **kwargs): 21 | self.size = size # must be an even number 22 | self.mode = mode 23 | super(Position_Embedding, self).__init__(**kwargs) 24 | 25 | def call(self, x): 26 | if (self.size is None) or (self.mode == 'sum'): 27 | self.size = int(x.shape[-1]) 28 | batch_size, seq_len = K.shape(x)[0], K.shape(x)[1] 29 | position_j = 1. / K.pow(10000., \ 30 | 2 * K.arange(self.size / 2, dtype='float32' \ 31 | ) / self.size) 32 | position_j = K.expand_dims(position_j, 0) 33 | position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1 # K.arange does not support a variable length, so the positions are generated this way 34 | position_i = K.expand_dims(position_i, 2) 35 | position_ij = K.dot(position_i, position_j) 36 | position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2) 37 | if self.mode == 'sum': 38 | return position_ij + x 39 | elif self.mode == 'concat': 40 | return K.concatenate([position_ij, x], 2) 41 | 42 | def compute_output_shape(self, input_shape): 43 | if self.mode == 'sum': 44 | return input_shape 45 | elif self.mode == 'concat': 46 | return (input_shape[0], input_shape[1], input_shape[2] + self.size) 47 | 48 | 49 | class Attention(Layer): 50 | 51 | def __init__(self, nb_head, size_per_head, **kwargs): 52 | self.nb_head = nb_head 53 | self.size_per_head = size_per_head 54 | self.output_dim = nb_head * size_per_head 55 | super(Attention, self).__init__(**kwargs) 56 | 57 | def build(self, input_shape): 58 | self.WQ = self.add_weight(name='WQ', 59 | shape=(input_shape[0][-1], self.output_dim), 60 | initializer='glorot_uniform', 61 | trainable=True) 62 | self.WK = self.add_weight(name='WK', 63 | shape=(input_shape[1][-1], self.output_dim), 64 | initializer='glorot_uniform', 65 | trainable=True) 66 | self.WV = self.add_weight(name='WV', 67 | shape=(input_shape[2][-1], self.output_dim), 68 | initializer='glorot_uniform', 69 | trainable=True) 70 | super(Attention, self).build(input_shape) 71 | 72 | def Mask(self, inputs, seq_len, mode='mul'): 73 | if seq_len is None: 74 | return inputs 75 | else: 76 | mask = K.one_hot(seq_len[:, 0], K.shape(inputs)[1]) 77 | mask = 1 - K.cumsum(mask, 1) 78 | for _ in range(len(inputs.shape) - 2): 79 | mask = K.expand_dims(mask, 2) 80 | if mode == 'mul': 81 | return inputs * mask 82 | if mode == 'add': 83 | return inputs - (1 - mask) * 1e12 84 | 85 | def call(self, x): 86 | # If only Q_seq, K_seq, V_seq are passed in, no mask is applied 87 | # If Q_seq, K_seq, V_seq, Q_len, V_len are all passed in, positions beyond the valid lengths are masked 88 | if len(x) == 3: 89 | Q_seq, K_seq, V_seq = x 90 | Q_len, V_len = None, None 91 | elif len(x) == 5: 92 | Q_seq, K_seq, V_seq, Q_len, V_len = x 93 | # Apply the linear projections to Q, K and V 94 | Q_seq = K.dot(Q_seq, self.WQ) 95 | Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head)) 96 | Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3)) 97 | K_seq = K.dot(K_seq, self.WK) 98 | K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head)) 99 | K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3)) 100 | V_seq = K.dot(V_seq, self.WV) 101 | V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head)) 102 | V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3)) 103 | # Compute the scaled dot-product scores, then mask, then softmax
104 | A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5 105 | A = K.permute_dimensions(A, (0, 3, 2, 1)) 106 | A = self.Mask(A, V_len, 'add') 107 | A = K.permute_dimensions(A, (0, 3, 2, 1)) 108 | A = K.softmax(A) 109 | # Compute the output and apply the query mask 110 | O_seq = K.batch_dot(A, V_seq, axes=[3, 2]) 111 | O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3)) 112 | O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim)) 113 | O_seq = self.Mask(O_seq, Q_len, 'mul') 114 | return O_seq 115 | 116 | def compute_output_shape(self, input_shape): 117 | return (input_shape[0][0], input_shape[0][1], self.output_dim) 118 | --------------------------------------------------------------------------------
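Usage note (an addition, not part of the original repository): Transformer_Attention.py defines the two layers but ships no standalone example, so the sketch below shows how Position_Embedding and Attention compose into a small classifier head, mirroring the 8-head, 16-dimensions-per-head configuration that Transformer_ATT_sentiment.py uses. It assumes Keras 2.x with the TensorFlow 1.x backend used throughout this repo and is meant to be run from inside Transformer_ATT/; the sequence length and embedding size are illustrative values, not taken from the scripts.

# usage_sketch.py (hypothetical helper, run from inside Transformer_ATT/)
import numpy as np
from keras.models import Model
from keras.layers import Input, GlobalAveragePooling1D, Dense
from Transformer_Attention import Position_Embedding, Attention

seq_len, embed_dim = 50, 128                  # illustrative shapes only
inputs = Input(shape=(seq_len, embed_dim))    # already-embedded token sequences
x = Position_Embedding()(inputs)              # add the sinusoidal position signal (mode='sum')
x = Attention(8, 16)([x, x, x])               # self-attention: 8 heads * 16 dims per head -> 128-dim output
x = GlobalAveragePooling1D()(x)               # pool over the time axis
outputs = Dense(1, activation='sigmoid')(x)   # binary sentiment score

model = Model(inputs, outputs)
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

# A forward pass on random data just to confirm the shapes: (batch, seq_len, embed_dim) -> (batch, 1)
print(model.predict(np.random.rand(4, seq_len, embed_dim).astype('float32')).shape)

Passing [x, x, x, q_len, v_len] instead of [x, x, x] would additionally mask positions beyond each sample's true length, which is what Attention.Mask handles.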