├── .gitattributes ├── CNN ├── __pycache__ │ ├── dataLoad.cpython-36.pyc │ ├── textCNN_model.cpython-36.pyc │ └── train.cpython-36.pyc ├── cnn_model_freeze │ └── textCNN_model.pt ├── dataLoad.py ├── main.py ├── model │ └── textCNN_model.pt ├── split.py ├── textCNN_model.py └── train.py ├── README.md ├── RNN ├── __pycache__ │ ├── dataLoad.cpython-36.pyc │ ├── textRNN_model.cpython-36.pyc │ └── train.cpython-36.pyc ├── dataLoad.py ├── main.py ├── model │ ├── .DS_Store │ ├── lstm │ │ ├── textRNN_lstm2_model.pt │ │ └── textRnn_lstm1_model.pt │ └── rnn │ │ ├── textRNN_rnn2_model.pt │ │ └── textRnn_rnn1_model.pt ├── textRNN_model.py └── train.py ├── data ├── .DS_Store ├── test_data.tsv ├── train.tsv ├── train_data.tsv └── val_data.tsv └── glove └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /CNN/__pycache__/dataLoad.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/CNN/__pycache__/dataLoad.cpython-36.pyc -------------------------------------------------------------------------------- /CNN/__pycache__/textCNN_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/CNN/__pycache__/textCNN_model.cpython-36.pyc -------------------------------------------------------------------------------- /CNN/__pycache__/train.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/CNN/__pycache__/train.cpython-36.pyc -------------------------------------------------------------------------------- /CNN/cnn_model_freeze/textCNN_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/CNN/cnn_model_freeze/textCNN_model.pt -------------------------------------------------------------------------------- /CNN/dataLoad.py: -------------------------------------------------------------------------------- 1 | from torchtext.legacy import data 2 | from torchtext.vocab import Vectors 3 | import torch 4 | 5 | 6 | def dataLoad(train_data, val_data, test_data, batch_size): 7 | # print("batch_size=", batch_size) 8 | tokenize = lambda x: x.split() 9 | # 这里一定要指定一个fix_length进行padding,因为数据集中有非常短的phrase,比如一个单词,如果不padding,会出现卷积核的h维度大于token数,导致runtimeerror 10 | TEXT = data.Field(sequential=True, tokenize=tokenize, stop_words='english', fix_length=60) 11 | LABEL = data.Field(sequential=False, use_vocab=False) 12 | 13 | # 载入数据 14 | fields = [('PhraseId', None), ('SentenceId', None), ('Phrase', TEXT), ('Sentiment', LABEL)] 15 | train_data, val_data, test_data = data.TabularDataset.splits( 16 | path='../data', 17 | skip_header=True, 18 | train=train_data, 19 | validation=val_data, 20 | test=test_data, 21 | format='tsv', 22 | fields=fields 23 | ) 24 | 25 | # 加载glove预训练的词向量进行word Embedding 26 | vectors = Vectors(name='../glove/glove.6B.200d.txt') 27 | TEXT.build_vocab(train_data, val_data, test_data, vectors=vectors) 28 | LABEL.build_vocab(train_data, val_data, test_data) 29 | weights = TEXT.vocab.vectors # 嵌入矩阵的初始权重 30 | # 如果不使用预训练模型进行word embedding, 在搭建网络的时候,指明word_num和embedding_dim进行随机初始化 31 | # TEXT.build_vocab(train_data, val_data, test_data) 32 | # LABEL.build_vocab(train_data, val_data, test_data) 33 | 34 | # 设置迭代器 35 | train_itr, val_itr = data.Iterator.splits( 36 | (train_data, val_data), 37 | batch_sizes=(batch_size, batch_size), 38 | sort_key=lambda x: len(x.Phrase), 39 | device=-1 40 | ) 41 | test_itr = data.Iterator( 42 | test_data, 43 | batch_size=batch_size, 44 | sort=False, 45 | device=-1 46 | ) 47 | 48 | return train_itr, val_itr, test_itr, weights -------------------------------------------------------------------------------- /CNN/main.py: -------------------------------------------------------------------------------- 1 | import dataLoad 2 | import textCNN_model 3 | import train 4 | import argparse 5 | import torch 6 | 7 | 8 | if __name__ == '__main__': 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--lr', type=float, default=0.01) 12 | parser.add_argument('--epochs', type=int, default=8) 13 | parser.add_argument('--batch_size', type=int, default=64) 14 | parser.add_argument('--kernel_size', type=str, default='2,3,4') 15 | parser.add_argument('--kernel_num', type=int, default=100) 16 | parser.add_argument('--dropout', type=float, default=0.5) 17 | parser.add_argument('--device', type=int, default=-1) 18 | parser.add_argument('--test', action='store_true', default=False) 19 | args = parser.parse_args() 20 | 21 | train_data, val_data, test_data = 'train_data.tsv', 'val_data.tsv', 'test_data.tsv' 22 | train_itr, val_itr, test_itr, weight = dataLoad.dataLoad(train_data, val_data, test_data, args.batch_size) 23 | 24 | args.weight = weight 25 | args.label_num = 5 26 | args.kernel_size = [int(k) for k in args.kernel_size.split(',')] 27 | args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 28 | 29 | textCNN = textCNN_model.TextCNN(args) 30 | textCNN.to(args.device) 31 | 32 | if args.test: 33 | print('-----Loading Model...-----') 34 | textCNN = torch.load('model/textCNN_model.pt') 35 | print('-----Testing...-----') 36 | train.eval(test_itr, textCNN, args) 37 | else: 38 | train.train(train_itr, val_itr, textCNN, args) 39 | -------------------------------------------------------------------------------- /CNN/model/textCNN_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/CNN/model/textCNN_model.pt -------------------------------------------------------------------------------- /CNN/split.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pandas as pd 3 | data = pd.read_csv('../data/train.tsv', delimiter='\t') 4 | 5 | # 数据分割 train:val:test = 6:2:2 6 | splitNum1 = int(len(data) * 0.6) 7 | splitNum2 = int(len(data) * 0.8) 8 | train_data = data.head(splitNum1) 9 | cv_data = data.loc[splitNum1: splitNum2] 10 | test_data = data.loc[splitNum2: len(data)] 11 | 12 | with open('../data/train_data.tsv', 'w') as fw1: 13 | fw1.write(train_data.to_csv(sep='\t', index=False)) 14 | 15 | with open('../data/val_data.tsv', 'w') as fw2: 16 | fw2.write(cv_data.to_csv(sep='\t', index=False)) 17 | 18 | with open('../data/test_data.tsv', 'w') as fw3: 19 | fw3.write(test_data.to_csv(sep='\t', index=False)) 20 | 21 | -------------------------------------------------------------------------------- /CNN/textCNN_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class TextCNN(nn.Module): 7 | def __init__(self, args): 8 | super(TextCNN, self).__init__() 9 | self.args = args 10 | 11 | # 输入通道数 12 | in_chaneels = 1 13 | # 类别数 14 | label_num = args.label_num 15 | # 卷积核的数量和尺寸 16 | kernel_num = args.kernel_num # 等价于输出通道数 17 | kernel_size = args.kernel_size 18 | # 用于Embedding的权重矩阵 19 | weight = args.weight 20 | # embedding向量维度和embedding词典大小 21 | self.embedding_dim = weight.size(1) 22 | num_embeddings = weight.size(0) 23 | 24 | # embedding层,用预训练模型进行Embedding 25 | # self.embedding = nn.Embedding(num_embeddings, embedding_dim) 26 | self.embedding = nn.Embedding.from_pretrained(weight, freeze=False) # 是否微调 27 | # 如果随机embedding 28 | # self.embedding = nn.Embedding(num_embeddings, embedding_dim) 29 | # 卷积层 输入通道数、输出通道数/每个卷积核的数量,卷积核的大小(卷积核的尺寸,嵌入维度) 30 | self.convs = nn.ModuleList([nn.Conv2d(in_chaneels, kernel_num, (ks, self.embedding_dim)) for ks in kernel_size]) 31 | # drop层 32 | self.dropout = nn.Dropout(args.dropout) 33 | # 全连接层 34 | self.fullconnection = nn.Linear(len(kernel_size) * kernel_num, label_num) 35 | 36 | def forward(self, x): 37 | # print("x.shape", x.size()) 38 | # x: (batch_size * max_length) 39 | # embedding操作,x: (batch_size * max_length * embedding_dim) 40 | x = self.embedding(x) 41 | # 在第二个维度增加一个维度 x: (batch_size, channel_num, max_length, embedding_dim) 42 | # x = x.unsqueeze(1) 43 | x = x.view(x.size(0), 1, x.size(1), self.embedding_dim) 44 | # 卷积操作, x:(batch_size, out_channel, width, height=1), width为卷积运算后的向量宽度 45 | x = [F.relu(conv(x)) for conv in self.convs] 46 | # 最大池化 x:(batch, out_channel, 1, 1) width经过最大池化为1 47 | x = [F.max_pool2d(input=x_item, kernel_size=(x_item.size(2), x_item.size(3))) for x_item in x] 48 | # 展平 x:(batch, (out_channel * 1 * 1 )) 49 | x = [x_item.view(x_item.size(0), -1) for x_item in x] 50 | # 特征组合 51 | x = torch.cat(x, 1) 52 | # dropout层 53 | x = self.dropout(x) 54 | # 全连接层 55 | logits = self.fullconnection(x) 56 | # 输出 57 | return logits 58 | 59 | -------------------------------------------------------------------------------- /CNN/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def train(train_itr, val_itr, model, args): 6 | 7 | model.to(args.device) 8 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) # 优化参数,指定learning rate 9 | 10 | step = 0 11 | best_acc = 0 12 | save_path = 'model/textCNN_model.pt' 13 | 14 | model.train() 15 | print('-----Training...-----') 16 | for epoch in range(1, args.epochs+1): 17 | print('Epoch: {}/{}'.format(epoch, args.epochs)) 18 | for batch in train_itr: 19 | text, label = batch.Phrase, batch.Sentiment 20 | text.t_() 21 | # label.sub_(1) 22 | text = text.to(args.device) 23 | label = label.to(args.device) 24 | 25 | optimizer.zero_grad() # 将模型梯度置零 26 | predict = model(text) # 预测值 27 | loss = F.cross_entropy(predict, label) # 交叉熵作为损失函数 28 | loss.backward() # 反向传播 29 | optimizer.step() # 更新 30 | 31 | step += 1 32 | if step % 1000 == 0: 33 | predict_y = torch.max(predict, 1)[1].view(label.size()) 34 | accuracy = (predict_y.data == label.data).sum() / batch.batch_size 35 | print('\rBatch[{}] - loss: {:.6f} acc: {:.4f}'.format(step, loss.data.item(), accuracy)) 36 | # sys.stdout.write('\rBatch[{}] - loss: {:.6f} acc: {:.4f}'.format(step, loss.data.item(), accuracy)) 37 | 38 | if step % 1000 == 0: 39 | val_acc = eval(val_itr, model, args) 40 | if val_acc > best_acc: 41 | best_acc = val_acc 42 | torch.save(model, save_path) 43 | elif step % 500 == 0: 44 | torch.save(model, save_path) 45 | 46 | 47 | def eval(val_itr, model, args): 48 | model.eval() 49 | val_loss = 0 50 | val_correct = 0 51 | for batch in val_itr: 52 | text, label = batch.Phrase, batch.Sentiment 53 | text.t_() 54 | text = text.to(args.device) 55 | label = label.to(args.device) 56 | 57 | predict = model(text) # 预测值 58 | loss = F.cross_entropy(predict, label) # 交叉熵作为损失函数 59 | val_loss += loss.data.item() 60 | 61 | predict_y = torch.max(predict, 1)[1].view(label.size()) 62 | val_correct += (predict_y.data == label.data).sum() 63 | 64 | data_size = len(val_itr.dataset) 65 | val_loss /= data_size 66 | val_acc = val_correct / data_size 67 | print('Evaluation - loss: {:.6f} acc: {:.4f}\n'.format(val_loss, val_acc)) 68 | # sys.stdout.write('\n Evaluation - loss: {:.6f} acc: {:.4f}'.format(val_loss, val_acc)) 69 | 70 | return val_acc 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nlp-beginner-task2-TextClassification-TextCNN-TextRNN 2 | 基于深度学习的文本分类,实现基于CNN和RNN的文本分类 3 | 4 | ### 一、问题描述 5 | 6 | 1. 实现基于深度学习的文本分类,使用卷积神经网络CNN和循环神经网络RNN 7 | 8 | 2. 数据集:https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews 9 | 10 | 3. 分类要求,将sentence进行情感分类,一共五类 11 | 12 | 0: negative 13 | 14 | 1: somewhat negative 15 | 16 | 2: neutral 17 | 18 | 3: somewhat positive 19 | 20 | 4: positive 21 | 22 | ### 二、数据处理 23 | 24 | #### 1. 数据集划分 25 | 26 | ​ 将数据集按train: crossValidation: test=6:2:2的比例划分,将train.tsv分为train_data.tsv, val_data.tsv, test_data.tsv三个文件,代码见CNN/split.py 27 | 28 | #### 2. 文本处理 29 | 30 | 使用torchtext进行文本处理 31 | 32 | 1. 设置field 33 | 34 | ``` 35 | TEXT = data.Field(sequential=True, tokenize=tokenize, stop_words='english', fix_length=60) 36 | LABEL = data.Field(sequential=False, use_vocab=False) 37 | fields = [('PhraseId', None), ('SentenceId', None), ('Phrase', TEXT), ('Sentiment', LABEL)] 38 | ``` 39 | 40 | 这里一定要指定fix_length,因为数据集中有非常短的的phrase,比如一个单词,如果不padding,会出现卷积核的h维度大于token数,导致runtimeerror. 此处我设置的是60,原因是数据集中最长的phrase为52个token. 41 | 42 | 2. 数据载入 43 | 44 | ``` 45 | data.TabularDataset.splits() 46 | ``` 47 | 48 | 3. 构建词表 49 | 50 | ```python 51 | vectors = Vectors(name='../glove/glove.6B.200d.txt') 52 | TEXT.build_vocab(train_data, val_data, test_data, vectors=vectors) 53 | LABEL.build_vocab(train_data, val_data, test_data) 54 | weights = TEXT.vocab.vectors # 嵌入矩阵的初始权重 55 | ``` 56 | 57 | 使用glove预训练的词向量构建,需要保存weight,用于网络中进行embedding 58 | 59 | 也可以不使用预训练向量 60 | 61 | 4. 构建迭代器 62 | 63 | ``` 64 | data.Iterator.splits() 65 | ``` 66 | 67 | ### 三、模型搭建 68 | 69 | #### 1. TextCNN 70 | 71 | embedding层:进行WordEmbedding,可以选择随机embedding和使用预训练的词向量进行embedding 72 | 73 | 卷积层:进行卷积操作,需要指定卷积每个核的尺寸和数量 74 | 75 | 全连接层:线性操作 76 | 77 | #### 2. TextRNN 78 | 79 | 用两种模型来定义TextRNN,RNN和LSTM,可以选择是否使用双向, 80 | 81 | 网络结构由embedding层和循环层和全连接层 82 | 83 | ### 四、训练脚本 84 | 85 | 1. train() 进行模型的训练 86 | 87 | 在训练时,要注意将text进行转置,将batch_size作为第一个维度,否则在网络中进行embedding时,会出现尺寸不匹配的情况 88 | 89 | ``` 90 | text, label = batch.Phrase, batch.Sentiment 91 | text.t_() 92 | ``` 93 | 94 | 2. eval()进行交叉验证和测试 95 | 96 | ### 五、main() 97 | 98 | 设置两种不同网络需要的各种参数,载入数据,进行模型的训练、保存和测试 99 | -------------------------------------------------------------------------------- /RNN/__pycache__/dataLoad.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/RNN/__pycache__/dataLoad.cpython-36.pyc -------------------------------------------------------------------------------- /RNN/__pycache__/textRNN_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/RNN/__pycache__/textRNN_model.cpython-36.pyc -------------------------------------------------------------------------------- /RNN/__pycache__/train.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/RNN/__pycache__/train.cpython-36.pyc -------------------------------------------------------------------------------- /RNN/dataLoad.py: -------------------------------------------------------------------------------- 1 | from torchtext.legacy import data 2 | from torchtext.vocab import Vectors 3 | import torch 4 | 5 | 6 | def dataLoad(train_data, val_data, test_data, batch_size): 7 | # print("batch_size=", batch_size) 8 | tokenize = lambda x: x.split() 9 | # 这里一定要指定一个fix_length进行padding,因为数据集中有非常短的phrase,比如一个单词,如果不padding,会出现卷积核的h维度大于token数,导致runtimeerror 10 | TEXT = data.Field(sequential=True, tokenize=tokenize, stop_words='english', fix_length=60) 11 | LABEL = data.Field(sequential=False, use_vocab=False) 12 | 13 | # 载入数据 14 | fields = [('PhraseId', None), ('SentenceId', None), ('Phrase', TEXT), ('Sentiment', LABEL)] 15 | train_data, val_data, test_data = data.TabularDataset.splits( 16 | path='../data', 17 | skip_header=True, 18 | train=train_data, 19 | validation=val_data, 20 | test=test_data, 21 | format='tsv', 22 | fields=fields 23 | ) 24 | 25 | # 加载glove预训练的词向量进行word Embedding 26 | vectors = Vectors(name='../glove/glove.6B.200d.txt') 27 | TEXT.build_vocab(train_data, val_data, test_data, vectors=vectors) 28 | LABEL.build_vocab(train_data, val_data, test_data) 29 | weights = TEXT.vocab.vectors # 嵌入矩阵的初始权重 30 | # 如果不使用预训练模型进行word embedding, 在搭建网络的时候,指明word_num和embedding_dim进行随机初始化 31 | # TEXT.build_vocab(train_data, val_data, test_data) 32 | # LABEL.build_vocab(train_data, val_data, test_data) 33 | 34 | # 设置迭代器 35 | train_itr, val_itr = data.Iterator.splits( 36 | (train_data, val_data), 37 | batch_sizes=(batch_size, batch_size), 38 | sort_key=lambda x: len(x.Phrase), 39 | device=-1 40 | ) 41 | test_itr = data.Iterator( 42 | test_data, 43 | batch_size=batch_size, 44 | sort=False, 45 | device=-1 46 | ) 47 | 48 | return train_itr, val_itr, test_itr, weights -------------------------------------------------------------------------------- /RNN/main.py: -------------------------------------------------------------------------------- 1 | import dataLoad 2 | import textRNN_model 3 | import train 4 | import argparse 5 | import torch 6 | 7 | 8 | if __name__ == '__main__': 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--lr', type=float, default=0.01) 12 | parser.add_argument('--epochs', type=int, default=8) 13 | parser.add_argument('--batch_size', type=int, default=64) 14 | parser.add_argument('--rnn_type', type=str, default='rnn') 15 | parser.add_argument('--hidden_size', type=int, default=20) 16 | parser.add_argument('--num_layers', type=int, default=1) 17 | parser.add_argument('--bidirectional', type=bool, default=False) 18 | parser.add_argument('--device', type=int, default=-1) 19 | parser.add_argument('--test', action='store_true', default=False) 20 | args = parser.parse_args() 21 | 22 | print('-----Data Loading...-----') 23 | train_data, val_data, test_data = 'train_data.tsv', 'val_data.tsv', 'test_data.tsv' 24 | train_itr, val_itr, test_itr, weight = dataLoad.dataLoad(train_data, val_data, test_data, args.batch_size) 25 | print('------Data Loaded.-------') 26 | 27 | args.weight = weight 28 | args.label_num = 5 29 | args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 30 | 31 | textRNN = textRNN_model.TextRNN(args) 32 | textRNN.to(args.device) 33 | 34 | if args.test: 35 | save_path = '' 36 | if args.rnn_type == 'rnn': 37 | if args.bidirectional: 38 | save_path = 'model/rnn/textRNN_rnn2_model.pt' 39 | else: 40 | save_path = 'model/rnn/textRnn_rnn1_model.pt' 41 | elif args.rnn_type == 'lstm': 42 | if args.bidirectional: 43 | save_path = 'model/lstm/textRNN_lstm2_model.pt' 44 | else: 45 | save_path = 'model/lstm/textRnn_lstm1_model.pt' 46 | print('-----Loading Model...-----') 47 | textRNN = torch.load(save_path) 48 | print('-----Testing by textRNN...-----') 49 | train.eval(test_itr, textRNN, args) 50 | else: 51 | train.train(train_itr, val_itr, textRNN, args) 52 | -------------------------------------------------------------------------------- /RNN/model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/RNN/model/.DS_Store -------------------------------------------------------------------------------- /RNN/model/lstm/textRNN_lstm2_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/RNN/model/lstm/textRNN_lstm2_model.pt -------------------------------------------------------------------------------- /RNN/model/lstm/textRnn_lstm1_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/RNN/model/lstm/textRnn_lstm1_model.pt -------------------------------------------------------------------------------- /RNN/model/rnn/textRNN_rnn2_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/RNN/model/rnn/textRNN_rnn2_model.pt -------------------------------------------------------------------------------- /RNN/model/rnn/textRnn_rnn1_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/RNN/model/rnn/textRnn_rnn1_model.pt -------------------------------------------------------------------------------- /RNN/textRNN_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class TextRNN(nn.Module): 6 | def __init__(self, args): 7 | super(TextRNN, self).__init__() 8 | # 类别数 9 | label_num = args.label_num 10 | # 用于embedding的权重矩阵 11 | weight = args.weight 12 | # 字典的词数和Embedding维数 13 | embedding_dim = weight.size(1) 14 | num_embeddings = weight.size(0) 15 | # 循环网络的种类 16 | self.rnn_type = args.rnn_type 17 | # 隐层的维度 18 | self.hidden_size = args.hidden_size 19 | # 循环神经网络的层数 20 | self.num_layers = args.num_layers 21 | # 是否使用双向 22 | self.bidirectional = args.bidirectional 23 | 24 | # Embedding层, 使用预训练的词向量进行word Embedding 25 | self.embedding = nn.Embedding.from_pretrained(weight, freeze=False) 26 | # LSTM 27 | if self.rnn_type == 'rnn': 28 | self.rnn = nn.RNN(input_size=embedding_dim, 29 | hidden_size=self.hidden_size, 30 | num_layers=self.num_layers, 31 | batch_first=True, 32 | bidirectional=self.bidirectional) 33 | elif self.rnn_type == 'lstm': 34 | self.lstm = nn.LSTM(input_size=embedding_dim, 35 | hidden_size=self.hidden_size, 36 | num_layers=self.num_layers, 37 | batch_first=True, 38 | bidirectional=self.bidirectional) 39 | # 全连接层 40 | if self.bidirectional: 41 | self.fullconnection = nn.Linear(self.hidden_size * 2, label_num) 42 | else: 43 | self.fullconnection = nn.Linear(self.hidden_size, label_num) 44 | 45 | def forward(self, x): 46 | # word embedding x: (batch_size, max_len) to x: (batch_size, max_len, embedding_dim) 47 | x = self.embedding(x) 48 | 49 | # 隐层 50 | if self.rnn_type == 'rnn': 51 | if self.bidirectional: 52 | h0 = torch.randn(self.num_layers * 2, x.size(0), self.hidden_size) 53 | else: 54 | h0 = torch.randn(self.num_layers, x.size(0), self.hidden_size) 55 | out, hn = self.rnn(x, h0) 56 | elif self.rnn_type == 'lstm': 57 | if self.bidirectional: 58 | h0 = torch.randn(self.num_layers * 2, x.size(0), self.hidden_size) 59 | c0 = torch.randn(self.num_layers * 2, x.size(0), self.hidden_size) 60 | else: 61 | h0 = torch.randn(self.num_layers, x.size(0), self.hidden_size) 62 | c0 = torch.randn(self.num_layers, x.size(0), self.hidden_size) 63 | out, (hn, cn) = self.lstm(x, (h0, c0)) 64 | 65 | # 全连接层 66 | logits = self.fullconnection(out[:, -1, :]) 67 | 68 | return logits -------------------------------------------------------------------------------- /RNN/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def train(train_itr, val_itr, model, args): 6 | 7 | model.to(args.device) 8 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) # 优化参数,指定learning rate 9 | 10 | step = 0 11 | best_acc = 0 12 | save_path = '' 13 | if args.rnn_type == 'rnn': 14 | if args.bidirectional: 15 | save_path = 'model/rnn/textRNN_rnn2_model.pt' 16 | else: 17 | save_path = 'model/rnn/textRnn_rnn1_model.pt' 18 | elif args.rnn_type == 'lstm': 19 | if args.bidirectional: 20 | save_path = 'model/lstm/textRNN_lstm2_model.pt' 21 | else: 22 | save_path = 'model/lstm/textRnn_lstm1_model.pt' 23 | 24 | 25 | model.train() 26 | print('-----Training textRNN...-----') 27 | print('model_info: {} {}'.format(args.rnn_type, args.bidirectional)) 28 | for epoch in range(1, args.epochs+1): 29 | print('Epoch: {}/{}'.format(epoch, args.epochs)) 30 | for batch in train_itr: 31 | text, label = batch.Phrase, batch.Sentiment 32 | text.t_() 33 | # label.sub_(1) 34 | text = text.to(args.device) 35 | label = label.to(args.device) 36 | 37 | optimizer.zero_grad() # 将模型梯度置零 38 | predict = model(text) # 预测值 39 | loss = F.cross_entropy(predict, label) # 交叉熵作为损失函数 40 | loss.backward() # 反向传播 41 | optimizer.step() # 更新 42 | 43 | step += 1 44 | if step % 1000 == 0: 45 | predict_y = torch.max(predict, 1)[1].view(label.size()) 46 | accuracy = (predict_y.data == label.data).sum() / batch.batch_size 47 | print('\rBatch[{}] - loss: {:.6f} acc: {:.4f}'.format(step, loss.data.item(), accuracy)) 48 | # sys.stdout.write('\rBatch[{}] - loss: {:.6f} acc: {:.4f}'.format(step, loss.data.item(), accuracy)) 49 | 50 | if step % 1000 == 0: 51 | val_acc = eval(val_itr, model, args) 52 | if val_acc > best_acc: 53 | best_acc = val_acc 54 | torch.save(model, save_path) 55 | elif step % 500 == 0: 56 | torch.save(model, save_path) 57 | print('-----Train textRNN Finished-----') 58 | 59 | 60 | def eval(val_itr, model, args): 61 | model.eval() 62 | val_loss = 0 63 | val_correct = 0 64 | for batch in val_itr: 65 | text, label = batch.Phrase, batch.Sentiment 66 | text.t_() 67 | text = text.to(args.device) 68 | label = label.to(args.device) 69 | 70 | predict = model(text) # 预测值 71 | loss = F.cross_entropy(predict, label) # 交叉熵作为损失函数 72 | val_loss += loss.data.item() 73 | 74 | predict_y = torch.max(predict, 1)[1].view(label.size()) 75 | val_correct += (predict_y.data == label.data).sum() 76 | 77 | data_size = len(val_itr.dataset) 78 | val_loss /= data_size 79 | val_acc = val_correct / data_size 80 | print('Evaluation - loss: {:.6f} acc: {:.4f}\n'.format(val_loss, val_acc)) 81 | 82 | return val_acc 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ljqiang17/nlp-beginner-task2-TextClassification-TextCNN-TextRNN/f770913dce58e3a83c79fbdc75980682ba0e495f/data/.DS_Store -------------------------------------------------------------------------------- /glove/README.md: -------------------------------------------------------------------------------- 1 | ### 请将下载的解压后的glove.6b.50d.txt、glove.6b.100d.txt、glove.6b.200d.txt、glove.6b.300d.txt放在这个目录下 2 | --------------------------------------------------------------------------------