├── CNN.py
├── CNN_acc_loss.png
├── LSTM.py
├── LSTM_acc_loss.png
├── README.md
├── naiveBayes.py
├── sentiwordnet.py
├── svmsenti.py
├── test.py
└── utils.py

/CNN.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch
import numpy as np
from utils import data_process, MyDataset
from torch.utils.data import DataLoader
from LSTM import train, test

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_sizes, kernel_sizes, num_channels,
                 **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_sizes)
        # This embedding layer is not trained
        self.constant_embedding = nn.Embedding(vocab_size, embed_sizes)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # The max-over-time pooling layer has no parameters, so this instance can be shared
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU()
        # Create several one-dimensional convolutional layers
        self.convs = nn.ModuleList()
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(2 * embed_sizes, c, k))

    def forward(self, inputs):
        # Concatenate the outputs of the two embedding layers along the vector dimension;
        # each embedding output has shape (batch size, number of tokens, token vector dimension)
        embeddings = torch.cat((
            self.embedding(inputs), self.constant_embedding(inputs)), dim=2)
        # Rearrange the tensor so that the channels form the second dimension,
        # as required by the one-dimensional convolutional layers
        embeddings = embeddings.permute(0, 2, 1)
        # After max-over-time pooling, each convolutional layer yields a tensor of shape
        # (batch size, number of channels, 1); remove the last dimension and concatenate
        # along the channel dimension
        encoding = torch.cat([
            torch.squeeze(self.relu(self.pool(conv(embeddings))), dim=-1)
            for conv in self.convs], dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

def main():

    train_dir = './aclImdb_v1/aclImdb/train'       # original training set directory
    train_path = './aclImdb_v1/aclImdb/train.txt'  # preprocessed training set file

    test_dir = './aclImdb_v1/aclImdb/test'         # original test set directory
    test_path = './aclImdb_v1/aclImdb/test.txt'    # preprocessed test set file

    vocab = data_process(train_path, train_dir)  # data preprocessing
    data_process(test_path, test_dir)
    np.save('vocab.npy', vocab)  # save the vocabulary locally
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # load the locally stored vocab

    # Build MyDataset instances
    train_data = MyDataset(text_path=train_path)
    test_data = MyDataset(text_path=test_path)

    # Build DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=256, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

    # Build the model
    embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
    net = TextCNN(len(vocab), embed_size, kernel_sizes, nums_channels)

    train(model=net, train_data=train_loader, vocab=vocab, epoch=10, method='CNN')

    # Load the trained model (train() saves it as 'CNN.pkl')
    net.load_state_dict(torch.load('CNN.pkl',
                                   map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))

    # Test results
    acc = test(model=net, test_data=test_loader, vocab=vocab)
    print(acc)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/CNN_acc_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hxy-62/sentimentclassify/021ebfe7227fc417a60eead51d51968fe35da77a/CNN_acc_loss.png
--------------------------------------------------------------------------------
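A quick way to sanity-check the shapes described in the forward() comments above is to run the model on a small random batch. This is a minimal sketch, assuming it is run from the repo root so TextCNN can be imported from CNN.py; the vocabulary size and batch shape below are made up for illustration.

import torch
from CNN import TextCNN  # assumes the repo root is on the Python path

# toy settings: vocabulary of 50 tokens, batch of 4 reviews, 300 tokens each
net = TextCNN(vocab_size=50, embed_sizes=100,
              kernel_sizes=[3, 4, 5], num_channels=[100, 100, 100])
dummy = torch.randint(0, 50, (4, 300))  # (batch size, number of tokens)
logits = net(dummy)
print(logits.shape)                     # torch.Size([4, 2]): one score per class
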
/LSTM.py:
--------------------------------------------------------------------------------
import torch  # torch==1.7.1
import torch.nn as nn
from torch.utils.data import DataLoader
import os
import re
import numpy as np
from tqdm import tqdm
from utils import tokenize, clean_str, data_process, MAX_LEN, text_transform, MyDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Define the LSTM model
class LSTM(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer

        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        # inputs has shape (batch size, number of tokens); the LSTM expects the sequence
        # length (seq_len) as the first dimension, so transpose before looking up word features
        embeddings = self.embedding(inputs.permute(1, 0))  # permute(1, 0) swaps the two dimensions
        # Only the input embeddings are passed to the LSTM, so it returns the hidden states
        # of the last layer at every time step
        # outputs has shape (number of tokens, batch size, number of hidden units)
        outputs, _ = self.encoder(embeddings)
        # Use the hidden state at the final time step as input to the fully connected layer;
        # its shape is (batch size, number of hidden units)
        encoding = outputs[-1]  # take the last time step of the LSTM output
        outs = self.softmax(self.decoder(encoding))  # output layer yields two class probabilities [a, b]
        return outs

# Model training
def train(model, train_data, vocab, epoch=10, method='LSTM'):
    print('train model')
    model = model.to(device)
    # Define the loss function and optimizer
    if method == 'LSTM':
        criterion = torch.nn.NLLLoss()
    elif method == 'CNN':
        criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

    for epoch in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # average loss
        avg_acc = 0   # average accuracy
        for idx, (text, label) in enumerate(tqdm(train_data)):

            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)

            optimizer.zero_grad()
            pred = model(train_x)
            if method == 'LSTM':
                pred = pred.log()  # log of the softmax output, paired with NLLLoss
            loss = criterion(pred, train_y)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # At the end of each epoch, compute the average loss and average accuracy
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)

        print("avg_loss:", avg_loss, " train_avg_acc:", avg_acc)

    # Save the model parameters after training
    if method == 'LSTM':
        torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')
    elif method == 'CNN':
        torch.save(model.state_dict(), 'CNN.pkl')


# Model testing
def test(model, test_data, vocab):
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for idx, (text, label) in enumerate(tqdm(test_data)):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            pred = model(train_x)
            avg_acc += accuracy(pred, train_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc

# Compute prediction accuracy
def accuracy(y_pred, y_true):
    label_pred = y_pred.max(dim=1)[1]
    acc = len(y_pred) - torch.sum(torch.abs(label_pred - y_true))  # number of correct predictions (labels are 0/1)
    return acc.detach().cpu().numpy() / len(y_pred)

def main():

    train_dir = './aclImdb_v1/aclImdb/train'       # original training set directory
    train_path = './aclImdb_v1/aclImdb/train.txt'  # preprocessed training set file

    test_dir = './aclImdb_v1/aclImdb/test'         # original test set directory
    test_path = './aclImdb_v1/aclImdb/test.txt'    # preprocessed test set file

    vocab = data_process(train_path, train_dir)  # data preprocessing
    data_process(test_path, test_dir)
    np.save('vocab.npy', vocab)  # save the vocabulary locally
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # load the locally stored vocab

    # Build MyDataset instances
    train_data = MyDataset(text_path=train_path)
    test_data = MyDataset(text_path=test_path)

    # Build DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=256, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

    # Build the model
    model = LSTM(vocab=vocab, embed_size=300, num_hiddens=128, num_layers=2)  # define the model

    train(model=model, train_data=train_loader, vocab=vocab, epoch=10, method="LSTM")

    # Load the trained model
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl',
                                     map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))

    # Test results
    acc = test(model=model, test_data=test_loader, vocab=vocab)
    print(acc)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/LSTM_acc_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hxy-62/sentimentclassify/021ebfe7227fc417a60eead51d51968fe35da77a/LSTM_acc_loss.png
--------------------------------------------------------------------------------
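The accuracy() helper above relies on the labels being 0/1, so the number of correct predictions can be written as the batch size minus the sum of absolute differences between predicted and true labels. A small self-contained check (the class scores and labels below are made up):

import torch
from LSTM import accuracy  # assumes the repo root is on the Python path

# three samples: predicted classes are 1, 0, 1; true labels are 1, 1, 1
y_pred = torch.tensor([[0.2, 0.8],
                       [0.9, 0.1],
                       [0.4, 0.6]])
y_true = torch.tensor([1, 1, 1])
print(accuracy(y_pred, y_true))  # 2 of 3 correct -> 0.666...
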
/README.md:
--------------------------------------------------------------------------------
sentiwordnet.py performs sentiment classification with the SentiWordNet sentiment lexicon
svmsenti.py performs sentiment analysis with an SVM
LSTM.py performs classification with an LSTM
CNN.py performs classification with a CNN
naiveBayes.py performs classification with naive Bayes

Chengdu Institute of Computer Application, Chinese Academy of Sciences - hxy
--------------------------------------------------------------------------------
/naiveBayes.py:
--------------------------------------------------------------------------------
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import os
import operator
from utils import clean_str, tokenize, get_common
from nltk.corpus import stopwords
import numpy as np

vocab = get_common()
list1 = [0] * len(vocab)  # one counter per word in imdb.vocab
dict1 = dict(zip(vocab, list1))

def get_dict(file_path, word_dict):
    for filename in os.listdir(file_path):
        if filename.endswith('.txt'):
            with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = tokenize(sentence)
                #print(words)
                for i in words:
                    if i in word_dict:
                        word_dict[i] += 1
                    else:
                        continue

get_dict('aclImdb_v1/aclImdb/train/pos', dict1)
get_dict('aclImdb_v1/aclImdb/train/neg', dict1)

list2 = []
# Take the 2000 most frequent words
for k, v in sorted(dict1.items(), key=operator.itemgetter(1), reverse=True)[:2000]:
    list2.append(k)
stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours',
            'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their',
            'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once',
            'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you',
            'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will',
            'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be',
            'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself',
            'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both',
            'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn',
            'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about',
            'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn',
            'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']

feature_words = [w for w in list2 if w not in stoplist]
#print(feature_words,len(feature_words))

documents = []

def get_document(file_path):
    for filename in os.listdir(file_path):
        if filename.endswith('.txt'):
            with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = tokenize(sentence)
                if file_path[-3:] == 'pos':
                    documents.append((words, 'pos'))
                elif file_path[-3:] == 'neg':
                    documents.append((words, 'neg'))

get_document('aclImdb_v1/aclImdb/train/pos')
get_document('aclImdb_v1/aclImdb/train/neg')

# Build the document-term matrix of boolean features
features = np.zeros([len(documents), len(feature_words)], dtype=float)
for i in range(len(documents)):
    document_words = set(documents[i][0])
    for j in range(len(feature_words)):
        features[i, j] = 1 if (feature_words[j] in document_words) else 0


target = [c for (d, c) in documents]
# documents holds all positive reviews first and then all negative ones, so shuffle
# before splitting; otherwise the held-out split would contain only negative reviews
np.random.seed(0)
perm = np.random.permutation(len(documents))
features = features[perm]
target = [target[i] for i in perm]
train_X = features[:18000, :]
train_Y = target[:18000]
test_X = features[18000:, :]
test_Y = target[18000:]

clf = MultinomialNB()
# Train the naive Bayes classifier
clf.fit(train_X, train_Y)
y_pred = clf.predict(test_X)
print("accuracy on test data: ", accuracy_score(test_Y, y_pred))
--------------------------------------------------------------------------------
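naiveBayes.py represents every review as a boolean bag-of-words vector over the selected feature words and feeds those vectors to MultinomialNB. The toy example below illustrates the same idea on a made-up three-word feature set, independent of the IMDB data:

import numpy as np
from sklearn.naive_bayes import MultinomialNB

feature_words = ['great', 'boring', 'awful']      # hypothetical feature words
docs = [('great film great cast', 'pos'),
        ('boring and awful', 'neg'),
        ('awful acting', 'neg'),
        ('great soundtrack', 'pos')]

# boolean document-term matrix: 1 if the feature word occurs in the review
X = np.array([[1.0 if w in text.split() else 0.0 for w in feature_words]
              for text, _ in docs])
y = [label for _, label in docs]

clf = MultinomialNB().fit(X, y)
print(clf.predict(np.array([[1.0, 0.0, 0.0]])))   # a review containing only 'great' -> ['pos']
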
/sentiwordnet.py:
--------------------------------------------------------------------------------
import os
import string

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet
from nltk.corpus import wordnet
from utils import clean_str

# Stop words
stpw = stopwords.words('english')
# Punctuation
punc = list(string.punctuation)
# Words and punctuation that should not be analysed
stop = punc + stpw

# Map the tags produced by pos_tag to the part-of-speech codes expected by senti_synsets
tag_map = {'NN': 'n', 'NNP': 'n', 'NNPS': 'n', 'NNS': 'n', 'UH': 'n',
           'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
           'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
           'RB': 'r', 'RBR': 'r', 'RBS': 'r', 'RP': 'r', 'WRB': 'r'}


path1 = 'aclImdb_v1/aclImdb/train/pos'
path2 = 'aclImdb_v1/aclImdb/train/neg'

def cal_acc(folder_path):
    correct = 0
    total = len(os.listdir(folder_path))
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = word_tokenize(sentence)
                # Filter stop words and punctuation (do not remove items from a list
                # while iterating over it, since that skips elements)
                words = [word for word in words if word.lower() not in stop]
                word_tag = pos_tag(words)
                word_tag = [(t[0], tag_map[t[1]]) if t[1] in tag_map else (t[0], '') for t in word_tag]
                sentiment_synsets = [list(sentiwordnet.senti_synsets(t[0], t[1])) for t in word_tag]
                score = sum(sum([x.pos_score() - x.neg_score() for x in s]) / len(s) for s in sentiment_synsets if len(s) != 0)
                if folder_path[-3:] == 'pos':
                    if score > 0:
                        correct += 1
                elif folder_path[-3:] == 'neg':
                    if score < 0:
                        correct += 1
    acc = correct / total
    return acc, correct, total

# Accuracy on the positive samples:
pos_acc, pos_correct, pos_total = cal_acc(path1)
# Accuracy on the negative samples:
neg_acc, neg_correct, neg_total = cal_acc(path2)
total_acc = (pos_correct + neg_correct) / (pos_total + neg_total)
print("accuracy on positive samples: {:.2%}, accuracy on negative samples: {:.2%}, overall accuracy: {:.2%}".format(pos_acc, neg_acc, total_acc))
--------------------------------------------------------------------------------
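Each word's contribution to a review's score above is the average of pos_score() minus neg_score() over its SentiWordNet synsets for the tagged part of speech. The snippet below shows that per-word score for a single adjective; it assumes the NLTK corpora fetched by test.py are installed, and the word 'good' is just an illustrative choice:

from nltk.corpus import sentiwordnet

synsets = list(sentiwordnet.senti_synsets('good', 'a'))   # adjective senses of 'good'
word_score = sum(s.pos_score() - s.neg_score() for s in synsets) / len(synsets)
print(len(synsets), word_score)   # expected to be positive, since 'good' leans positive
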
/svmsenti.py:
--------------------------------------------------------------------------------
import os
import operator
from utils import clean_str, tokenize, get_common
from nltk.corpus import stopwords
import string
import numpy as np

vocab = get_common()
list1 = [0] * len(vocab)  # one counter per word in imdb.vocab
dict1 = dict(zip(vocab, list1))

def get_dict(file_path, word_dict):
    for filename in os.listdir(file_path):
        if filename.endswith('.txt'):
            with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = tokenize(sentence)
                #print(words)
                for i in words:
                    if i in word_dict:
                        word_dict[i] += 1
                    else:
                        continue

get_dict('./aclImdb_v1/aclImdb/train/pos', dict1)
get_dict('./aclImdb_v1/aclImdb/train/neg', dict1)

list2 = []
# Take the 2000 most frequent words
for k, v in sorted(dict1.items(), key=operator.itemgetter(1), reverse=True)[:2000]:
    list2.append(k)
stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours',
            'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their',
            'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once',
            'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you',
            'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will',
            'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be',
            'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself',
            'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both',
            'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn',
            'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about',
            'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn',
            'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']

feature_words = [w for w in list2 if w not in stoplist]
#print(feature_words,len(feature_words))

documents = []

def get_document(file_path):
    for filename in os.listdir(file_path):
        if filename.endswith('.txt'):
            with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = tokenize(sentence)
                if file_path[-3:] == 'pos':
                    documents.append((words, 'pos'))
                elif file_path[-3:] == 'neg':
                    documents.append((words, 'neg'))

get_document('aclImdb_v1/aclImdb/train/pos')
get_document('aclImdb_v1/aclImdb/train/neg')

# Build the document-term matrix
features = np.zeros([len(documents), len(feature_words)], dtype=float)
for i in range(len(documents)):
    document_words = set(documents[i][0])
    for j in range(len(feature_words)):
        features[i, j] = 1 if (feature_words[j] in document_words) else 0

# Represent a single text as boolean features
def get_document_feature(document):
    document_words = set(document)
    features = np.zeros([1, len(feature_words)], dtype=float)
    for j in range(len(feature_words)):
        features[0, j] = 1 if (feature_words[j] in document_words) else 0
    return features

#print(len(documents))

target = [c for (d, c) in documents]
# documents holds all positive reviews first and then all negative ones, so shuffle
# before splitting; otherwise the held-out split would contain only negative reviews
np.random.seed(0)
perm = np.random.permutation(len(documents))
features = features[perm]
target = [target[i] for i in perm]
train_X = features[:18000, :]
train_Y = target[:18000]
test_X = features[18000:, :]
test_Y = target[18000:]

from sklearn import svm
classifier = svm.SVC(kernel='rbf')  # kernel selects the kernel function; RBF (radial basis function) is the default
classifier.fit(train_X, train_Y)

print('SVM accuracy on the test set:', classifier.score(test_X, test_Y))

text = input('Enter a movie review: ')
print('Sentiment analysis result (pos/neg): ', classifier.predict(get_document_feature(text.split(' '))))
# Note: get_document_feature expects a list of words, so the input has to be tokenized first;
# the quick-and-dirty text.split(' ') is used here
--------------------------------------------------------------------------------
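The closing comment notes that text.split(' ') is a crude way to tokenize the user's input. Since utils.py already ships clean_str and tokenize, the prompt at the end of svmsenti.py could reuse them instead; the sketch below is a possible drop-in replacement for those last two lines, relying on the classifier and get_document_feature defined in the script above:

from utils import clean_str, tokenize

# hypothetical replacement for the interactive prompt in svmsenti.py
text = input('Enter a movie review: ')
words = tokenize(clean_str(text))   # same preprocessing as the training documents
print('Sentiment analysis result (pos/neg): ',
      classifier.predict(get_document_feature(words)))
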
/test.py:
--------------------------------------------------------------------------------
import nltk

# Fetch the NLTK resources this project uses, instead of opening the interactive downloader
nltk.download('punkt')                        # word_tokenize
nltk.download('stopwords')                    # stop word list
nltk.download('wordnet')                      # WordNet
nltk.download('sentiwordnet')                 # SentiWordNet scores
nltk.download('averaged_perceptron_tagger')   # pos_tag
--------------------------------------------------------------------------------
", string) 25 | string = re.sub(r"\s{2,}", " ", string) 26 | string = re.sub(r"\s{2,}", " ", string) 27 | string = re.sub(r"sssss ", " ", string) 28 | return string.strip().lower() 29 | 30 | def tokenize(str): 31 | return str.split() 32 | 33 | #统计训练数据中出现次数最多的前N个词 34 | def get_common(): 35 | with open("aclImdb_v1/aclImdb/imdb.vocab", "r") as f: 36 | data = f.read().splitlines() 37 | #print(data) 38 | #返回词典列表 39 | return data 40 | 41 | # 数据预处理过程 42 | def data_process(text_path, text_dir): # 根据文本路径生成文本的标签 43 | 44 | print("data preprocess") 45 | file_pro = open(text_path,'w',encoding='utf-8') 46 | for root, s_dirs, _ in os.walk(text_dir): # 获取 train文件下各文件夹名称 47 | for sub_dir in s_dirs: 48 | i_dir = os.path.join(root, sub_dir) # 获取train和test文件夹下所有的路径 49 | text_list = os.listdir(i_dir) 50 | tag = os.path.split(i_dir)[-1] # 获取标签 51 | if tag == 'pos': 52 | label = '1' 53 | if tag == 'neg': 54 | label = '0' 55 | if tag =='unsup': 56 | continue 57 | 58 | for i in range(len(text_list)): 59 | if not text_list[i].endswith('txt'): # 判断若不是txt,则跳过 60 | continue 61 | f = open(os.path.join(i_dir, text_list[i]),'r',encoding='utf-8') # 打开文本 62 | raw_line = f.readline() 63 | pro_line = clean_str(raw_line) 64 | tokens = tokenize(pro_line) # 分词统计词数 65 | for token in tokens: 66 | if token in word_count.keys(): 67 | word_count[token] = word_count[token] + 1 68 | else: 69 | word_count[token] = 0 70 | file_pro.write(label + ' ' + pro_line +'\n') 71 | f.close() 72 | file_pro.flush() 73 | file_pro.close() 74 | 75 | print("build vocabulary") 76 | 77 | vocab = {"": 0, "": 1} 78 | 79 | word_count_sort = sorted(word_count.items(), key=lambda item : item[1], reverse=True) # 对词进行排序,过滤低频词,只取前MAX_WORD个高频词 80 | word_number = 1 81 | for word in word_count_sort: 82 | if word[0] not in vocab.keys(): 83 | vocab[word[0]] = len(vocab) 84 | word_number += 1 85 | if word_number > MAX_WORD: 86 | break 87 | return vocab 88 | 89 | # 定义Dataset 90 | class MyDataset(Dataset): 91 | def __init__(self, text_path): 92 | file = open(text_path, 'r', encoding='utf-8') 93 | self.text_with_tag = file.readlines() # 文本标签与内容 94 | file.close() 95 | 96 | def __getitem__(self, index): # 重写getitem 97 | line = self.text_with_tag[index] # 获取一个样本的标签和文本信息 98 | label = int(line[0]) # 标签信息 99 | text = line[2:-1] # 文本信息 100 | return text, label 101 | 102 | def __len__(self): 103 | return len(self.text_with_tag) 104 | 105 | 106 | # 根据vocab将句子转为定长MAX_LEN的tensor 107 | def text_transform(sentence_list, vocab): 108 | sentence_index_list = [] 109 | for sentence in sentence_list: 110 | sentence_idx = [vocab[token] if token in vocab.keys() else vocab[''] for token in tokenize(sentence)] # 句子分词转为id 111 | 112 | if len(sentence_idx) < MAX_LEN: 113 | for i in range(MAX_LEN-len(sentence_idx)): # 对长度不够的句子进行PAD填充 114 | sentence_idx.append(vocab['']) 115 | 116 | sentence_idx = sentence_idx[:MAX_LEN] # 取前MAX_LEN长度 117 | sentence_index_list.append(sentence_idx) 118 | return torch.LongTensor(sentence_index_list) # 将转为idx的词转为tensor --------------------------------------------------------------------------------