├── Ebusiness.csv
├── README.md
├── data_preprocess.py
├── demo.py
├── net.py
└── text_classification.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# public-opinion-analysis

This project is an NLP sentiment-analysis task, framed as binary classification. The data are review comments that a public-opinion monitoring system crawled from an e-commerce platform, labeled by hand into two classes: positive and negative. After comparing a number of models, a convolutional network was chosen and achieved good results.

# Text preprocessing: data_preprocess.py

The e-commerce data come as a CSV file with two fields, evaluation and label, holding the user review text and the positive/negative tag respectively. The raw text is preprocessed by word segmentation, integer encoding, and padding (a minimal sketch appears at the end of this README).

# Model training: net.py and text_classification.py

net.py: the CNN model and its parameters
text_classification.py: model training

# Model prediction: demo.py

Saves the trained classifier; the reported accuracy score is 0.9334.
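
A minimal standalone sketch of the preprocessing step (the two example sentences are made up for illustration):

```python
from data_preprocess import DataPreprocess

dp = DataPreprocess()
# jieba word segmentation; word_len=1 keeps single-character words as well
texts_cut = dp.cut_texts(['质量很好', '物流太慢了'], word_len=1)
dp.train_tokenizer(texts_cut, num_words=2000)   # fit the word-to-index mapping
seqs = dp.text2seq(texts_cut, sentence_len=50)  # pad/truncate to length 50
print(seqs.shape)  # (2, 50)
```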
--------------------------------------------------------------------------------
/data_preprocess.py:
--------------------------------------------------------------------------------
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

jieba.setLogLevel('WARN')


class DataPreprocess():
    def __init__(self, tokenizer=None, label_set=None):
        self.tokenizer = tokenizer
        self.num_words = None
        self.label_set = label_set
        self.sentence_len = None
        self.word_len = None

    def cut_texts(self, texts=None, word_len=1):
        """Segment texts with jieba, keeping only words of at least word_len characters."""
        if word_len > 1:
            texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len]
                         for text in texts]
        else:
            texts_cut = [jieba.lcut(one_text) for one_text in texts]
        self.word_len = word_len
        return texts_cut

    def train_tokenizer(self, texts_cut=None, num_words=2000):
        """Fit a Keras Tokenizer on the segmented texts."""
        tokenizer = Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(texts=texts_cut)
        self.tokenizer = tokenizer
        # Cap the vocabulary size at the number of words actually seen.
        self.num_words = min(num_words, len(tokenizer.word_index) + 1)

    def text2seq(self, texts_cut, sentence_len=30):
        """Map segmented texts to integer sequences, padded/truncated to sentence_len."""
        texts_seq = self.tokenizer.texts_to_sequences(texts=texts_cut)
        texts_pad_seq = pad_sequences(texts_seq,
                                      maxlen=sentence_len,
                                      padding='post',
                                      truncating='post')
        self.sentence_len = sentence_len
        return texts_pad_seq

    def creat_label_set(self, labels):
        """Collect the set of all labels seen in the training data."""
        label_set = set()
        for i in labels:
            label_set = label_set.union(set(i))
        self.label_set = np.array(list(label_set))

    def creat_label(self, label):
        """One-hot encode a single sample's label(s) against the label set."""
        label_zero = np.zeros(len(self.label_set))
        label_zero[np.in1d(self.label_set, label)] = 1
        return label_zero

    def creat_labels(self, labels=None):
        """One-hot encode a list of labels."""
        return np.array([self.creat_label(label) for label in labels])
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from text_classification import TextClassification

# Load the crawled reviews: `evaluation` is the review text, `label` the tag.
data = pd.read_csv('Ebusiness.csv', encoding='utf-8')
x = data['evaluation']
y = [[i] for i in data['label']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Preprocess (segment, tokenize, pad) and train the CNN.
clf = TextClassification()
texts_seq, texts_labels = clf.get_preprocess(x_train, y_train,
                                             word_len=1,
                                             num_words=2000,
                                             sentence_len=50)
clf.fit(texts_seq=texts_seq,
        texts_labels=texts_labels,
        epochs=10,
        batch_size=64,
        model=None)

# Persist the whole classifier (preprocessing state plus model).
# The filename is arbitrary.
model_name = 'ebusiness'
with open('./%s.pkl' % model_name, 'wb') as f:
    pickle.dump(clf, f)

# Reload and evaluate accuracy on the held-out set.
with open('./%s.pkl' % model_name, 'rb') as f:
    clf = pickle.load(f)
y_predict = clf.predict(x_test)
y_predict = [[clf.preprocess.label_set[i.argmax()]] for i in y_predict]
score = np.mean(np.array(y_predict) == np.array(y_test))
print(score)
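
# Illustration (an addition, not in the original script): classify one new
# review and map the probabilities to a tag via label2tag. The review text
# below is made up.
new_probs = clf.predict(['质量不错,物流也很快'])
print(clf.label2tag(new_probs, clf.preprocess.label_set))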
--------------------------------------------------------------------------------
/net.py:
--------------------------------------------------------------------------------
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import Conv1D, GlobalMaxPool1D, Dropout


def CNN(input_dim,
        input_length,
        vec_size,
        output_shape):
    """Text CNN: embedding -> 1D convolution -> global max pooling -> dense softmax."""
    data_input = Input(shape=[input_length])
    # input_dim + 1 because index 0 is reserved for padding.
    word_vec = Embedding(input_dim=input_dim + 1,
                         input_length=input_length,
                         output_dim=vec_size)(data_input)
    x = Conv1D(filters=128,
               kernel_size=3,
               strides=1,
               padding='same',
               activation='relu')(word_vec)
    x = GlobalMaxPool1D()(x)
    x = Dense(500, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(output_shape, activation='softmax')(x)

    model = Model(inputs=data_input, outputs=x)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    return model


if __name__ == '__main__':
    model = CNN(input_dim=10, input_length=10, vec_size=10, output_shape=10)
    model.summary()
--------------------------------------------------------------------------------
/text_classification.py:
--------------------------------------------------------------------------------
from data_preprocess import DataPreprocess
from net import CNN
import numpy as np


class TextClassification():
    def __init__(self):
        self.preprocess = None
        self.model = None

    def get_preprocess(self, texts, labels, word_len=1, num_words=2000, sentence_len=30):
        """Segment, tokenize and pad the texts; one-hot encode the labels."""
        preprocess = DataPreprocess()

        texts_cut = preprocess.cut_texts(texts, word_len)
        preprocess.train_tokenizer(texts_cut, num_words)
        texts_seq = preprocess.text2seq(texts_cut, sentence_len)

        preprocess.creat_label_set(labels)
        labels = preprocess.creat_labels(labels)
        self.preprocess = preprocess

        return texts_seq, labels

    def fit(self, texts_seq, texts_labels, epochs, batch_size, model=None):
        """Train the given model, or build the default CNN from the fitted preprocessing state."""
        if model is None:
            preprocess = self.preprocess
            model = CNN(input_dim=preprocess.num_words,
                        input_length=preprocess.sentence_len,
                        vec_size=128,
                        output_shape=len(preprocess.label_set))

        model.fit(texts_seq,
                  texts_labels,
                  epochs=epochs,
                  batch_size=batch_size)
        self.model = model

    def predict(self, texts):
        """Apply the fitted preprocessing to raw texts and return class probabilities."""
        preprocess = self.preprocess
        texts_cut = preprocess.cut_texts(texts, preprocess.word_len)
        texts_seq = preprocess.text2seq(texts_cut, preprocess.sentence_len)
        return self.model.predict(texts_seq)

    def label2toptag(self, predictions, labelset):
        """For each prediction, return the label(s) with the highest probability."""
        labels = []
        for prediction in predictions:
            label = labelset[prediction == prediction.max()]
            labels.append(label.tolist())
        return labels

    def label2half(self, predictions, labelset):
        """For each prediction, return every label whose probability exceeds 0.5."""
        labels = []
        for prediction in predictions:
            label = labelset[prediction > 0.5]
            labels.append(label.tolist())
        return labels

    def label2tag(self, predictions, labelset):
        """Prefer the above-0.5 labels; fall back to the top label when none qualify."""
        labels1 = self.label2toptag(predictions, labelset)
        labels2 = self.label2half(predictions, labelset)
        labels = []
        for i in range(len(predictions)):
            if len(labels2[i]) == 0:
                labels.append(labels1[i])
            else:
                labels.append(labels2[i])
        return labels
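

# A toy, self-contained check of the tagging helpers above (an addition;
# the label names and probabilities are made up for illustration).
if __name__ == '__main__':
    tc = TextClassification()
    labelset = np.array(['pos', 'neg'])         # hypothetical label set
    preds = np.array([[0.8, 0.2], [0.5, 0.5]])
    print(tc.label2toptag(preds, labelset))  # [['pos'], ['pos', 'neg']]
    print(tc.label2half(preds, labelset))    # [['pos'], []]
    print(tc.label2tag(preds, labelset))     # [['pos'], ['pos', 'neg']]
--------------------------------------------------------------------------------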