├── Ebusiness.csv
├── README.md
├── data_preprocess.py
├── demo.py
├── net.py
└── text_classification.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# public-opinion-analysis

This project is an NLP sentiment-analysis task, framed as binary classification. The data are review comments that a public-opinion monitoring system crawled from an e-commerce platform, labeled by hand into two classes: positive and negative. After comparing a number of models, a convolutional network was chosen and achieved good results.

# Text preprocessing: data_preprocess.py

The e-commerce data come as a CSV file with two fields, evaluation and label, holding the user review text and the positive/negative tag respectively. The raw text is preprocessed by word segmentation, integer encoding, and padding (a minimal sketch appears at the end of this README).

# Model training: net.py and text_classification.py

net.py: the CNN model and its parameters
text_classification.py: model training

# Model prediction: demo.py

Saves the trained classifier; the reported accuracy score is 0.9334.
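
A minimal standalone sketch of the preprocessing step (the two example sentences are made up for illustration):

```python
from data_preprocess import DataPreprocess

dp = DataPreprocess()
# jieba word segmentation; word_len=1 keeps single-character words as well
texts_cut = dp.cut_texts(['质量很好', '物流太慢了'], word_len=1)
dp.train_tokenizer(texts_cut, num_words=2000)   # fit the word-to-index mapping
seqs = dp.text2seq(texts_cut, sentence_len=50)  # pad/truncate to length 50
print(seqs.shape)  # (2, 50)
```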
--------------------------------------------------------------------------------
/data_preprocess.py:
--------------------------------------------------------------------------------
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

jieba.setLogLevel('WARN')


class DataPreprocess():
    def __init__(self, tokenizer=None, label_set=None):
        self.tokenizer = tokenizer
        self.num_words = None
        self.label_set = label_set
        self.sentence_len = None
        self.word_len = None

    def cut_texts(self, texts=None, word_len=1):
        """Segment texts with jieba, keeping only words of at least word_len characters."""
        if word_len > 1:
            texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len]
                         for text in texts]
        else:
            texts_cut = [jieba.lcut(one_text) for one_text in texts]
        self.word_len = word_len
        return texts_cut

    def train_tokenizer(self, texts_cut=None, num_words=2000):
        """Fit a Keras Tokenizer on the segmented texts."""
        tokenizer = Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(texts=texts_cut)
        self.tokenizer = tokenizer
        # Cap the vocabulary size at the number of words actually seen.
        self.num_words = min(num_words, len(tokenizer.word_index) + 1)

    def text2seq(self, texts_cut, sentence_len=30):
        """Map segmented texts to integer sequences, padded/truncated to sentence_len."""
        texts_seq = self.tokenizer.texts_to_sequences(texts=texts_cut)
        texts_pad_seq = pad_sequences(texts_seq,
                                      maxlen=sentence_len,
                                      padding='post',
                                      truncating='post')
        self.sentence_len = sentence_len
        return texts_pad_seq

    def creat_label_set(self, labels):
        """Collect the set of all labels seen in the training data."""
        label_set = set()
        for i in labels:
            label_set = label_set.union(set(i))
        self.label_set = np.array(list(label_set))

    def creat_label(self, label):
        """One-hot encode a single sample's label(s) against the label set."""
        label_zero = np.zeros(len(self.label_set))
        label_zero[np.in1d(self.label_set, label)] = 1
        return label_zero

    def creat_labels(self, labels=None):
        """One-hot encode a list of labels."""
        return np.array([self.creat_label(label) for label in labels])
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from text_classification import TextClassification

# Load the crawled reviews: `evaluation` is the review text, `label` the tag.
data = pd.read_csv('Ebusiness.csv', encoding='utf-8')
x = data['evaluation']
y = [[i] for i in data['label']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Preprocess (segment, tokenize, pad) and train the CNN.
clf = TextClassification()
texts_seq, texts_labels = clf.get_preprocess(x_train, y_train,
                                             word_len=1,
                                             num_words=2000,
                                             sentence_len=50)
clf.fit(texts_seq=texts_seq,
        texts_labels=texts_labels,
        epochs=10,
        batch_size=64,
        model=None)

# Persist the whole classifier (preprocessing state plus model).
# The filename is arbitrary.
model_name = 'ebusiness'
with open('./%s.pkl' % model_name, 'wb') as f:
    pickle.dump(clf, f)

# Reload and evaluate accuracy on the held-out set.
with open('./%s.pkl' % model_name, 'rb') as f:
    clf = pickle.load(f)
y_predict = clf.predict(x_test)
y_predict = [[clf.preprocess.label_set[i.argmax()]] for i in y_predict]
score = np.mean(np.array(y_predict) == np.array(y_test))
print(score)
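
# Illustration (an addition, not in the original script): classify one new
# review and map the probabilities to a tag via label2tag. The review text
# below is made up.
new_probs = clf.predict(['质量不错,物流也很快'])
print(clf.label2tag(new_probs, clf.preprocess.label_set))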
--------------------------------------------------------------------------------
/net.py:
--------------------------------------------------------------------------------
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import Conv1D, GlobalMaxPool1D, Dropout


def CNN(input_dim,
        input_length,
        vec_size,
        output_shape):
    """Text CNN: embedding -> 1D convolution -> global max pooling -> dense softmax."""
    data_input = Input(shape=[input_length])
    # input_dim + 1 because index 0 is reserved for padding.
    word_vec = Embedding(input_dim=input_dim + 1,
                         input_length=input_length,
                         output_dim=vec_size)(data_input)
    x = Conv1D(filters=128,
               kernel_size=3,
               strides=1,
               padding='same',
               activation='relu')(word_vec)
    x = GlobalMaxPool1D()(x)
    x = Dense(500, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(output_shape, activation='softmax')(x)

    model = Model(inputs=data_input, outputs=x)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    return model


if __name__ == '__main__':
    model = CNN(input_dim=10, input_length=10, vec_size=10, output_shape=10)
    model.summary()
--------------------------------------------------------------------------------
/text_classification.py:
--------------------------------------------------------------------------------
from data_preprocess import DataPreprocess
from net import CNN
import numpy as np


class TextClassification():
    def __init__(self):
        self.preprocess = None
        self.model = None

    def get_preprocess(self, texts, labels, word_len=1, num_words=2000, sentence_len=30):
        """Segment, tokenize and pad the texts; one-hot encode the labels."""
        preprocess = DataPreprocess()

        texts_cut = preprocess.cut_texts(texts, word_len)
        preprocess.train_tokenizer(texts_cut, num_words)
        texts_seq = preprocess.text2seq(texts_cut, sentence_len)

        preprocess.creat_label_set(labels)
        labels = preprocess.creat_labels(labels)
        self.preprocess = preprocess

        return texts_seq, labels

    def fit(self, texts_seq, texts_labels, epochs, batch_size, model=None):
        """Train the given model, or build the default CNN from the fitted preprocessing state."""
        if model is None:
            preprocess = self.preprocess
            model = CNN(input_dim=preprocess.num_words,
                        input_length=preprocess.sentence_len,
                        vec_size=128,
                        output_shape=len(preprocess.label_set))

        model.fit(texts_seq,
                  texts_labels,
                  epochs=epochs,
                  batch_size=batch_size)
        self.model = model

    def predict(self, texts):
        """Apply the fitted preprocessing to raw texts and return class probabilities."""
        preprocess = self.preprocess
        texts_cut = preprocess.cut_texts(texts, preprocess.word_len)
        texts_seq = preprocess.text2seq(texts_cut, preprocess.sentence_len)
        return self.model.predict(texts_seq)

    def label2toptag(self, predictions, labelset):
        """For each prediction, return the label(s) with the highest probability."""
        labels = []
        for prediction in predictions:
            label = labelset[prediction == prediction.max()]
            labels.append(label.tolist())
        return labels

    def label2half(self, predictions, labelset):
        """For each prediction, return every label whose probability exceeds 0.5."""
        labels = []
        for prediction in predictions:
            label = labelset[prediction > 0.5]
            labels.append(label.tolist())
        return labels

    def label2tag(self, predictions, labelset):
        """Prefer the above-0.5 labels; fall back to the top label when none qualify."""
        labels1 = self.label2toptag(predictions, labelset)
        labels2 = self.label2half(predictions, labelset)
        labels = []
        for i in range(len(predictions)):
            if len(labels2[i]) == 0:
                labels.append(labels1[i])
            else:
                labels.append(labels2[i])
        return labels
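

# A toy, self-contained check of the tagging helpers above (an addition;
# the label names and probabilities are made up for illustration).
if __name__ == '__main__':
    tc = TextClassification()
    labelset = np.array(['pos', 'neg'])         # hypothetical label set
    preds = np.array([[0.8, 0.2], [0.5, 0.5]])
    print(tc.label2toptag(preds, labelset))  # [['pos'], ['pos', 'neg']]
    print(tc.label2half(preds, labelset))    # [['pos'], []]
    print(tc.label2tag(preds, labelset))     # [['pos'], ['pos', 'neg']]
--------------------------------------------------------------------------------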