├── CNN.py
├── CNN_acc_loss.png
├── LSTM.py
├── LSTM_acc_loss.png
├── README.md
├── naiveBayes.py
├── sentiwordnet.py
├── svmsenti.py
├── test.py
└── utils.py

/CNN.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch
import numpy as np
from utils import data_process, MyDataset
from torch.utils.data import DataLoader
from LSTM import train, test

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_sizes, kernel_sizes, num_channels,
                 **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_sizes)
        # This embedding layer is not trained
        self.constant_embedding = nn.Embedding(vocab_size, embed_sizes)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # The max-over-time pooling layer has no parameters, so this instance can be shared
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU()
        # Create several one-dimensional convolutional layers
        self.convs = nn.ModuleList()
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(2 * embed_sizes, c, k))

    def forward(self, inputs):
        # Concatenate the outputs of the two embedding layers along the vector dimension;
        # each embedding output has shape (batch size, number of tokens, token vector dimension)
        embeddings = torch.cat((
            self.embedding(inputs), self.constant_embedding(inputs)), dim=2)
        # Rearrange the tensor so that the channels form the second dimension,
        # as required by the one-dimensional convolutional layers
        embeddings = embeddings.permute(0, 2, 1)
        # After max-over-time pooling, each convolutional layer yields a tensor of shape
        # (batch size, number of channels, 1); remove the last dimension and concatenate
        # along the channel dimension
        encoding = torch.cat([
            torch.squeeze(self.relu(self.pool(conv(embeddings))), dim=-1)
            for conv in self.convs], dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

def main():

    train_dir = './aclImdb_v1/aclImdb/train'       # original training set directory
    train_path = './aclImdb_v1/aclImdb/train.txt'  # preprocessed training set file

    test_dir = './aclImdb_v1/aclImdb/test'         # original test set directory
    test_path = './aclImdb_v1/aclImdb/test.txt'    # preprocessed test set file

    vocab = data_process(train_path, train_dir)  # data preprocessing
    data_process(test_path, test_dir)
    np.save('vocab.npy', vocab)  # save the vocabulary locally
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # load the locally stored vocab

    # Build MyDataset instances
    train_data = MyDataset(text_path=train_path)
    test_data = MyDataset(text_path=test_path)

    # Build DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=256, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

    # Build the model
    embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
    net = TextCNN(len(vocab), embed_size, kernel_sizes, nums_channels)

    train(model=net, train_data=train_loader, vocab=vocab, epoch=10, method='CNN')

    # Load the trained model (train() saves it as 'CNN.pkl')
    net.load_state_dict(torch.load('CNN.pkl',
                                   map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))

    # Test results
    acc = test(model=net, test_data=test_loader, vocab=vocab)
    print(acc)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/CNN_acc_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hxy-62/sentimentclassify/021ebfe7227fc417a60eead51d51968fe35da77a/CNN_acc_loss.png
--------------------------------------------------------------------------------
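A quick way to sanity-check the shapes described in the forward() comments above is to run the model on a small random batch. This is a minimal sketch, assuming it is run from the repo root so TextCNN can be imported from CNN.py; the vocabulary size and batch shape below are made up for illustration.

import torch
from CNN import TextCNN  # assumes the repo root is on the Python path

# toy settings: vocabulary of 50 tokens, batch of 4 reviews, 300 tokens each
net = TextCNN(vocab_size=50, embed_sizes=100,
              kernel_sizes=[3, 4, 5], num_channels=[100, 100, 100])
dummy = torch.randint(0, 50, (4, 300))  # (batch size, number of tokens)
logits = net(dummy)
print(logits.shape)                     # torch.Size([4, 2]): one score per class
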
/LSTM.py:
--------------------------------------------------------------------------------
import torch  # torch==1.7.1
import torch.nn as nn
from torch.utils.data import DataLoader
import os
import re
import numpy as np
from tqdm import tqdm
from utils import tokenize, clean_str, data_process, MAX_LEN, text_transform, MyDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Define the LSTM model
class LSTM(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer

        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        # inputs has shape (batch size, number of tokens); the LSTM expects the sequence
        # length (seq_len) as the first dimension, so transpose before looking up word features
        embeddings = self.embedding(inputs.permute(1, 0))  # permute(1, 0) swaps the two dimensions
        # Only the input embeddings are passed to the LSTM, so it returns the hidden states
        # of the last layer at every time step
        # outputs has shape (number of tokens, batch size, number of hidden units)
        outputs, _ = self.encoder(embeddings)
        # Use the hidden state at the final time step as input to the fully connected layer;
        # its shape is (batch size, number of hidden units)
        encoding = outputs[-1]  # take the last time step of the LSTM output
        outs = self.softmax(self.decoder(encoding))  # output layer yields two class probabilities [a, b]
        return outs

# Model training
def train(model, train_data, vocab, epoch=10, method='LSTM'):
    print('train model')
    model = model.to(device)
    # Define the loss function and optimizer
    if method == 'LSTM':
        criterion = torch.nn.NLLLoss()
    elif method == 'CNN':
        criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

    for epoch in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # average loss
        avg_acc = 0   # average accuracy
        for idx, (text, label) in enumerate(tqdm(train_data)):

            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)

            optimizer.zero_grad()
            pred = model(train_x)
            if method == 'LSTM':
                pred = pred.log()  # log of the softmax output, paired with NLLLoss
            loss = criterion(pred, train_y)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # At the end of each epoch, compute the average loss and average accuracy
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)

        print("avg_loss:", avg_loss, " train_avg_acc:", avg_acc)

    # Save the model parameters after training
    if method == 'LSTM':
        torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')
    elif method == 'CNN':
        torch.save(model.state_dict(), 'CNN.pkl')


# Model testing
def test(model, test_data, vocab):
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for idx, (text, label) in enumerate(tqdm(test_data)):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            pred = model(train_x)
            avg_acc += accuracy(pred, train_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc

# Compute prediction accuracy
def accuracy(y_pred, y_true):
    label_pred = y_pred.max(dim=1)[1]
    acc = len(y_pred) - torch.sum(torch.abs(label_pred - y_true))  # number of correct predictions (labels are 0/1)
    return acc.detach().cpu().numpy() / len(y_pred)

def main():

    train_dir = './aclImdb_v1/aclImdb/train'       # original training set directory
    train_path = './aclImdb_v1/aclImdb/train.txt'  # preprocessed training set file

    test_dir = './aclImdb_v1/aclImdb/test'         # original test set directory
    test_path = './aclImdb_v1/aclImdb/test.txt'    # preprocessed test set file

    vocab = data_process(train_path, train_dir)  # data preprocessing
    data_process(test_path, test_dir)
    np.save('vocab.npy', vocab)  # save the vocabulary locally
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # load the locally stored vocab

    # Build MyDataset instances
    train_data = MyDataset(text_path=train_path)
    test_data = MyDataset(text_path=test_path)

    # Build DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=256, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

    # Build the model
    model = LSTM(vocab=vocab, embed_size=300, num_hiddens=128, num_layers=2)  # define the model

    train(model=model, train_data=train_loader, vocab=vocab, epoch=10, method="LSTM")

    # Load the trained model
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl',
                                     map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))

    # Test results
    acc = test(model=model, test_data=test_loader, vocab=vocab)
    print(acc)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/LSTM_acc_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hxy-62/sentimentclassify/021ebfe7227fc417a60eead51d51968fe35da77a/LSTM_acc_loss.png
--------------------------------------------------------------------------------
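The accuracy() helper above relies on the labels being 0/1, so the number of correct predictions can be written as the batch size minus the sum of absolute differences between predicted and true labels. A small self-contained check (the class scores and labels below are made up):

import torch
from LSTM import accuracy  # assumes the repo root is on the Python path

# three samples: predicted classes are 1, 0, 1; true labels are 1, 1, 1
y_pred = torch.tensor([[0.2, 0.8],
                       [0.9, 0.1],
                       [0.4, 0.6]])
y_true = torch.tensor([1, 1, 1])
print(accuracy(y_pred, y_true))  # 2 of 3 correct -> 0.666...
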
/README.md:
--------------------------------------------------------------------------------
sentiwordnet.py performs sentiment classification with the SentiWordNet sentiment lexicon
svmsenti.py performs sentiment analysis with an SVM
LSTM.py performs classification with an LSTM
CNN.py performs classification with a CNN
naiveBayes.py performs classification with naive Bayes

Chengdu Institute of Computer Application, Chinese Academy of Sciences - hxy
--------------------------------------------------------------------------------
/naiveBayes.py:
--------------------------------------------------------------------------------
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import os
import operator
from utils import clean_str, tokenize, get_common
from nltk.corpus import stopwords
import numpy as np

vocab = get_common()
list1 = [0] * len(vocab)  # one counter per word in imdb.vocab
dict1 = dict(zip(vocab, list1))

def get_dict(file_path, word_dict):
    for filename in os.listdir(file_path):
        if filename.endswith('.txt'):
            with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = tokenize(sentence)
                #print(words)
                for i in words:
                    if i in word_dict:
                        word_dict[i] += 1
                    else:
                        continue

get_dict('aclImdb_v1/aclImdb/train/pos', dict1)
get_dict('aclImdb_v1/aclImdb/train/neg', dict1)

list2 = []
# Take the 2000 most frequent words
for k, v in sorted(dict1.items(), key=operator.itemgetter(1), reverse=True)[:2000]:
    list2.append(k)
stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours',
            'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their',
            'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once',
            'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you',
            'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will',
            'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be',
            'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself',
            'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both',
            'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn',
            'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about',
            'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn',
            'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']

feature_words = [w for w in list2 if w not in stoplist]
#print(feature_words,len(feature_words))

documents = []

def get_document(file_path):
    for filename in os.listdir(file_path):
        if filename.endswith('.txt'):
            with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = tokenize(sentence)
                if file_path[-3:] == 'pos':
                    documents.append((words, 'pos'))
                elif file_path[-3:] == 'neg':
                    documents.append((words, 'neg'))

get_document('aclImdb_v1/aclImdb/train/pos')
get_document('aclImdb_v1/aclImdb/train/neg')

# Build the document-term matrix of boolean features
features = np.zeros([len(documents), len(feature_words)], dtype=float)
for i in range(len(documents)):
    document_words = set(documents[i][0])
    for j in range(len(feature_words)):
        features[i, j] = 1 if (feature_words[j] in document_words) else 0


target = [c for (d, c) in documents]
# documents holds all positive reviews first and then all negative ones, so shuffle
# before splitting; otherwise the held-out split would contain only negative reviews
np.random.seed(0)
perm = np.random.permutation(len(documents))
features = features[perm]
target = [target[i] for i in perm]
train_X = features[:18000, :]
train_Y = target[:18000]
test_X = features[18000:, :]
test_Y = target[18000:]

clf = MultinomialNB()
# Train the naive Bayes classifier
clf.fit(train_X, train_Y)
y_pred = clf.predict(test_X)
print("accuracy on test data: ", accuracy_score(test_Y, y_pred))
--------------------------------------------------------------------------------
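naiveBayes.py represents every review as a boolean bag-of-words vector over the selected feature words and feeds those vectors to MultinomialNB. The toy example below illustrates the same idea on a made-up three-word feature set, independent of the IMDB data:

import numpy as np
from sklearn.naive_bayes import MultinomialNB

feature_words = ['great', 'boring', 'awful']      # hypothetical feature words
docs = [('great film great cast', 'pos'),
        ('boring and awful', 'neg'),
        ('awful acting', 'neg'),
        ('great soundtrack', 'pos')]

# boolean document-term matrix: 1 if the feature word occurs in the review
X = np.array([[1.0 if w in text.split() else 0.0 for w in feature_words]
              for text, _ in docs])
y = [label for _, label in docs]

clf = MultinomialNB().fit(X, y)
print(clf.predict(np.array([[1.0, 0.0, 0.0]])))   # a review containing only 'great' -> ['pos']
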
/sentiwordnet.py:
--------------------------------------------------------------------------------
import os
import string

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet
from nltk.corpus import wordnet
from utils import clean_str

# Stop words
stpw = stopwords.words('english')
# Punctuation
punc = list(string.punctuation)
# Words and punctuation that should not be analysed
stop = punc + stpw

# Map the tags produced by pos_tag to the part-of-speech codes expected by senti_synsets
tag_map = {'NN': 'n', 'NNP': 'n', 'NNPS': 'n', 'NNS': 'n', 'UH': 'n',
           'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
           'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
           'RB': 'r', 'RBR': 'r', 'RBS': 'r', 'RP': 'r', 'WRB': 'r'}


path1 = 'aclImdb_v1/aclImdb/train/pos'
path2 = 'aclImdb_v1/aclImdb/train/neg'

def cal_acc(folder_path):
    correct = 0
    total = len(os.listdir(folder_path))
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = word_tokenize(sentence)
                # Filter stop words and punctuation (do not remove items from a list
                # while iterating over it, since that skips elements)
                words = [word for word in words if word.lower() not in stop]
                word_tag = pos_tag(words)
                word_tag = [(t[0], tag_map[t[1]]) if t[1] in tag_map else (t[0], '') for t in word_tag]
                sentiment_synsets = [list(sentiwordnet.senti_synsets(t[0], t[1])) for t in word_tag]
                score = sum(sum([x.pos_score() - x.neg_score() for x in s]) / len(s) for s in sentiment_synsets if len(s) != 0)
                if folder_path[-3:] == 'pos':
                    if score > 0:
                        correct += 1
                elif folder_path[-3:] == 'neg':
                    if score < 0:
                        correct += 1
    acc = correct / total
    return acc, correct, total

# Accuracy on the positive samples:
pos_acc, pos_correct, pos_total = cal_acc(path1)
# Accuracy on the negative samples:
neg_acc, neg_correct, neg_total = cal_acc(path2)
total_acc = (pos_correct + neg_correct) / (pos_total + neg_total)
print("accuracy on positive samples: {:.2%}, accuracy on negative samples: {:.2%}, overall accuracy: {:.2%}".format(pos_acc, neg_acc, total_acc))
--------------------------------------------------------------------------------
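Each word's contribution to a review's score above is the average of pos_score() minus neg_score() over its SentiWordNet synsets for the tagged part of speech. The snippet below shows that per-word score for a single adjective; it assumes the NLTK corpora fetched by test.py are installed, and the word 'good' is just an illustrative choice:

from nltk.corpus import sentiwordnet

synsets = list(sentiwordnet.senti_synsets('good', 'a'))   # adjective senses of 'good'
word_score = sum(s.pos_score() - s.neg_score() for s in synsets) / len(synsets)
print(len(synsets), word_score)   # expected to be positive, since 'good' leans positive
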
/svmsenti.py:
--------------------------------------------------------------------------------
import os
import operator
from utils import clean_str, tokenize, get_common
from nltk.corpus import stopwords
import string
import numpy as np

vocab = get_common()
list1 = [0] * len(vocab)  # one counter per word in imdb.vocab
dict1 = dict(zip(vocab, list1))

def get_dict(file_path, word_dict):
    for filename in os.listdir(file_path):
        if filename.endswith('.txt'):
            with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = tokenize(sentence)
                #print(words)
                for i in words:
                    if i in word_dict:
                        word_dict[i] += 1
                    else:
                        continue

get_dict('./aclImdb_v1/aclImdb/train/pos', dict1)
get_dict('./aclImdb_v1/aclImdb/train/neg', dict1)

list2 = []
# Take the 2000 most frequent words
for k, v in sorted(dict1.items(), key=operator.itemgetter(1), reverse=True)[:2000]:
    list2.append(k)
stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours',
            'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their',
            'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once',
            'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you',
            'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will',
            'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be',
            'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself',
            'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both',
            'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn',
            'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about',
            'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn',
            'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']

feature_words = [w for w in list2 if w not in stoplist]
#print(feature_words,len(feature_words))

documents = []

def get_document(file_path):
    for filename in os.listdir(file_path):
        if filename.endswith('.txt'):
            with open(os.path.join(file_path, filename), 'r', encoding='utf-8') as file:
                sentence = clean_str(file.readline())
                words = tokenize(sentence)
                if file_path[-3:] == 'pos':
                    documents.append((words, 'pos'))
                elif file_path[-3:] == 'neg':
                    documents.append((words, 'neg'))

get_document('aclImdb_v1/aclImdb/train/pos')
get_document('aclImdb_v1/aclImdb/train/neg')

# Build the document-term matrix
features = np.zeros([len(documents), len(feature_words)], dtype=float)
for i in range(len(documents)):
    document_words = set(documents[i][0])
    for j in range(len(feature_words)):
        features[i, j] = 1 if (feature_words[j] in document_words) else 0

# Represent a single text as boolean features
def get_document_feature(document):
    document_words = set(document)
    features = np.zeros([1, len(feature_words)], dtype=float)
    for j in range(len(feature_words)):
        features[0, j] = 1 if (feature_words[j] in document_words) else 0
    return features

#print(len(documents))

target = [c for (d, c) in documents]
# documents holds all positive reviews first and then all negative ones, so shuffle
# before splitting; otherwise the held-out split would contain only negative reviews
np.random.seed(0)
perm = np.random.permutation(len(documents))
features = features[perm]
target = [target[i] for i in perm]
train_X = features[:18000, :]
train_Y = target[:18000]
test_X = features[18000:, :]
test_Y = target[18000:]

from sklearn import svm
classifier = svm.SVC(kernel='rbf')  # kernel selects the kernel function; RBF (radial basis function) is the default
classifier.fit(train_X, train_Y)

print('SVM accuracy on the test set:', classifier.score(test_X, test_Y))

text = input('Enter a movie review: ')
print('Sentiment analysis result (pos/neg): ', classifier.predict(get_document_feature(text.split(' '))))
# Note: get_document_feature expects a list of words, so the input has to be tokenized first;
# the quick-and-dirty text.split(' ') is used here
--------------------------------------------------------------------------------
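The closing comment notes that text.split(' ') is a crude way to tokenize the user's input. Since utils.py already ships clean_str and tokenize, the prompt at the end of svmsenti.py could reuse them instead; the sketch below is a possible drop-in replacement for those last two lines, relying on the classifier and get_document_feature defined in the script above:

from utils import clean_str, tokenize

# hypothetical replacement for the interactive prompt in svmsenti.py
text = input('Enter a movie review: ')
words = tokenize(clean_str(text))   # same preprocessing as the training documents
print('Sentiment analysis result (pos/neg): ',
      classifier.predict(get_document_feature(words)))
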
/test.py:
--------------------------------------------------------------------------------
import nltk

# Fetch the NLTK resources this project uses, instead of opening the interactive downloader
nltk.download('punkt')                        # word_tokenize
nltk.download('stopwords')                    # stop word list
nltk.download('wordnet')                      # WordNet
nltk.download('sentiwordnet')                 # SentiWordNet scores
nltk.download('averaged_perceptron_tagger')   # pos_tag
--------------------------------------------------------------------------------
", string) 25 | string = re.sub(r"\s{2,}", " ", string) 26 | string = re.sub(r"\s{2,}", " ", string) 27 | string = re.sub(r"sssss ", " ", string) 28 | return string.strip().lower() 29 | 30 | def tokenize(str): 31 | return str.split() 32 | 33 | #统计训练数据中出现次数最多的前N个词 34 | def get_common(): 35 | with open("aclImdb_v1/aclImdb/imdb.vocab", "r") as f: 36 | data = f.read().splitlines() 37 | #print(data) 38 | #返回词典列表 39 | return data 40 | 41 | # 数据预处理过程 42 | def data_process(text_path, text_dir): # 根据文本路径生成文本的标签 43 | 44 | print("data preprocess") 45 | file_pro = open(text_path,'w',encoding='utf-8') 46 | for root, s_dirs, _ in os.walk(text_dir): # 获取 train文件下各文件夹名称 47 | for sub_dir in s_dirs: 48 | i_dir = os.path.join(root, sub_dir) # 获取train和test文件夹下所有的路径 49 | text_list = os.listdir(i_dir) 50 | tag = os.path.split(i_dir)[-1] # 获取标签 51 | if tag == 'pos': 52 | label = '1' 53 | if tag == 'neg': 54 | label = '0' 55 | if tag =='unsup': 56 | continue 57 | 58 | for i in range(len(text_list)): 59 | if not text_list[i].endswith('txt'): # 判断若不是txt,则跳过 60 | continue 61 | f = open(os.path.join(i_dir, text_list[i]),'r',encoding='utf-8') # 打开文本 62 | raw_line = f.readline() 63 | pro_line = clean_str(raw_line) 64 | tokens = tokenize(pro_line) # 分词统计词数 65 | for token in tokens: 66 | if token in word_count.keys(): 67 | word_count[token] = word_count[token] + 1 68 | else: 69 | word_count[token] = 0 70 | file_pro.write(label + ' ' + pro_line +'\n') 71 | f.close() 72 | file_pro.flush() 73 | file_pro.close() 74 | 75 | print("build vocabulary") 76 | 77 | vocab = {"": 0, "": 1} 78 | 79 | word_count_sort = sorted(word_count.items(), key=lambda item : item[1], reverse=True) # 对词进行排序,过滤低频词,只取前MAX_WORD个高频词 80 | word_number = 1 81 | for word in word_count_sort: 82 | if word[0] not in vocab.keys(): 83 | vocab[word[0]] = len(vocab) 84 | word_number += 1 85 | if word_number > MAX_WORD: 86 | break 87 | return vocab 88 | 89 | # 定义Dataset 90 | class MyDataset(Dataset): 91 | def __init__(self, text_path): 92 | file = open(text_path, 'r', encoding='utf-8') 93 | self.text_with_tag = file.readlines() # 文本标签与内容 94 | file.close() 95 | 96 | def __getitem__(self, index): # 重写getitem 97 | line = self.text_with_tag[index] # 获取一个样本的标签和文本信息 98 | label = int(line[0]) # 标签信息 99 | text = line[2:-1] # 文本信息 100 | return text, label 101 | 102 | def __len__(self): 103 | return len(self.text_with_tag) 104 | 105 | 106 | # 根据vocab将句子转为定长MAX_LEN的tensor 107 | def text_transform(sentence_list, vocab): 108 | sentence_index_list = [] 109 | for sentence in sentence_list: 110 | sentence_idx = [vocab[token] if token in vocab.keys() else vocab[''] for token in tokenize(sentence)] # 句子分词转为id 111 | 112 | if len(sentence_idx) < MAX_LEN: 113 | for i in range(MAX_LEN-len(sentence_idx)): # 对长度不够的句子进行PAD填充 114 | sentence_idx.append(vocab['']) 115 | 116 | sentence_idx = sentence_idx[:MAX_LEN] # 取前MAX_LEN长度 117 | sentence_index_list.append(sentence_idx) 118 | return torch.LongTensor(sentence_index_list) # 将转为idx的词转为tensor --------------------------------------------------------------------------------