├── README.md
├── vocab.py
├── data_loader.py
├── check_loader.ipynb
└── train.py

/README.md:
--------------------------------------------------------------------------------
# Sequential-Data-Loader-and-Model-for-Variable-Length-Data
Efficient data loader for a text dataset using torch.utils.data.Dataset, a custom collate_fn and torch.utils.data.DataLoader.
Efficient model for text using torch.nn.utils.rnn.pack_padded_sequence and torch.nn.utils.rnn.pad_packed_sequence (see the short sketch below).
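
The core pattern is small enough to sketch here. This is a minimal, self-contained illustration with toy tensors (not this repo's actual model); the real code lives in data_loader.py and train.py:
```
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Two sequences of lengths 4 and 2, already sorted by decreasing length and zero-padded.
padded = torch.tensor([[3, 7, 2, 9],
                       [5, 1, 0, 0]])      # (batch, max_len)
lengths = torch.tensor([4, 2])

emb = nn.Embedding(10, 8, padding_idx=0)
rnn = nn.RNN(8, 16)

x = emb(padded.permute(1, 0))              # (max_len, batch, emb_dim)
packed = pack_padded_sequence(x, lengths)  # the RNN will skip the padded steps
packed_out, hidden = rnn(packed)
out, out_lengths = pad_packed_sequence(packed_out)
print(out.shape, hidden.shape)             # (max_len, batch, 16), (1, batch, 16)
```
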
This model is used for sentiment classification on the [IMDB dataset](http://ai.stanford.edu/~amaas/data/sentiment/).
For a different dataset you have to modify the "reader" function in **data_loader.py** and **vocab.py**.

# Requirements

* [PyTorch](https://pytorch.org)
* [Gensim](https://radimrehurek.com/gensim/index.html)
* [tqdm](https://github.com/tqdm/tqdm)

# Usage
Download and extract the IMDB dataset into the project folder, so that ./aclImdb/train and ./aclImdb/test exist.
* To build the dictionary:
```
$ python vocab.py
```
* To train the model:
```
$ python train.py
```
* To just see the data loader in action:
```
Use check_loader.ipynb
```

--------------------------------------------------------------------------------
/vocab.py:
--------------------------------------------------------------------------------
import os
import gensim
from collections import Counter
import json

train_path = "./aclImdb/train"
test_path = "./aclImdb/test"

# Simple function which reads the data from the directory and returns the data and labels.
# You can write your own reader for another dataset.
def reader(path):
    pos_path = os.path.join(path, "pos")
    neg_path = os.path.join(path, "neg")
    data = []
    label = []
    for file in os.listdir(pos_path):
        with open(os.path.join(pos_path, file)) as f:
            data.append(f.read())
        label.append(1)
    for file in os.listdir(neg_path):
        with open(os.path.join(neg_path, file)) as f:
            data.append(f.read())
        label.append(0)
    # print(data[:1])
    return data, label

def build_vocab(data, min_word_count=5):
    counter = Counter()
    for line in data:
        l = gensim.utils.simple_preprocess(line)
        counter.update(l)
    # initialise the dictionary (lookup table) with the padding and unknown-word tokens
    word2id = {}
    word2id['<pad>'] = 0
    word2id['<unk>'] = 1
    # include only those words in the dictionary which occurred more than min_word_count times in the entire data
    words = [word for word, count in counter.items() if count > min_word_count]

    for i, word in enumerate(words):
        word2id[word] = i + 2

    with open("word2id.json", 'w') as f:
        json.dump(word2id, f)
    return word2id

data, label = reader(train_path)
word2id = build_vocab(data)
print("Dictionary formed and saved. The length of the dictionary is:", len(word2id))

--------------------------------------------------------------------------------
/data_loader.py:
--------------------------------------------------------------------------------
import torch
import torch.utils.data as D
import numpy as np
import os
import gensim

# Simple function which reads the data from the directory and returns the data and labels.
# You can write your own reader for another dataset.
def reader(path):
    pos_path = os.path.join(path, "pos")
    neg_path = os.path.join(path, "neg")
    data = []
    label = []
    for file in os.listdir(pos_path):
        with open(os.path.join(pos_path, file)) as f:
            data.append(f.read())
        label.append(1)
    for file in os.listdir(neg_path):
        with open(os.path.join(neg_path, file)) as f:
            data.append(f.read())
        label.append(0)
    # print(data[:1])
    return data, label

# Custom dataset class for PyTorch.
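# A map-style torch.utils.data.Dataset only needs __getitem__ and __len__;
# DataLoader then handles shuffling and batching, and the collate_fn defined
# further down merges the variable-length sequences of a batch into one padded tensor.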
class Dataset(D.Dataset):
    def __init__(self, word2id, train_path):
        self.word2id = word2id
        self.train_path = train_path
        # read the data and labels
        self.data, self.label = reader(train_path)

    def __getitem__(self, index):
        # return the sequence and its label
        seq = self.preprocess(self.data[index])
        label = self.label[index]
        return seq, label

    def __len__(self):
        return len(self.data)

    def preprocess(self, text):
        # convert a line into tokens and then into their corresponding numerical values using word2id
        line = gensim.utils.simple_preprocess(text)
        seq = []
        for word in line:
            if word in self.word2id:
                seq.append(self.word2id[word])
            else:
                seq.append(self.word2id['<unk>'])
        # convert the list into a tensor
        seq = torch.from_numpy(np.array(seq))
        return seq

def collate_fn(data):
    '''
    We build a custom collate_fn rather than using the default one because every
    sentence has a different length, and merging variable-length sequences
    (including padding) is not supported by the default collate_fn.

    Args:
        data: list of tuples (training sequence, label)
    Returns:
        padded_seq - padded sequences, tensor of shape (batch_size, padded_length)
        length - original length of each sequence (without padding), tensor of shape (batch_size)
        label - tensor of shape (batch_size)
    '''

    # Sorting by decreasing length is important for pack_padded_sequence (used in the model).
    data.sort(key=lambda x: len(x[0]), reverse=True)
    sequences, label = zip(*data)
    length = [len(seq) for seq in sequences]
    padded_seq = torch.zeros(len(sequences), max(length)).long()
    for i, seq in enumerate(sequences):
        end = length[i]
        padded_seq[i, :end] = seq
    return padded_seq, torch.from_numpy(np.array(length)), torch.from_numpy(np.array(label))


# Generates the dataloaders.
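# Because collate_fn is passed to DataLoader, every batch comes back as
# (padded_seq, lengths, labels), already sorted by decreasing sequence length.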
def dataloader(word2id, train_path, test_path, batch_size=200):
    train_dataset = Dataset(word2id, train_path)
    test_dataset = Dataset(word2id, test_path)
    train_dataloader = D.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_dataloader = D.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_dataloader, test_dataloader

--------------------------------------------------------------------------------
/check_loader.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from data_loader import dataloader\n",
    "import json "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path = \"./aclImdb/train\"\n",
    "test_path = \"./aclImdb/test\"\n",
    "\n",
    "with open('./word2id.json', 'r') as f:\n",
    "    word2id = json.load(f)\n",
    "\n",
    "train_data, test_data = dataloader(word2id, train_path, test_path)\n",
    "data_iter = iter(train_data)\n",
    "\n",
    "seq, length, label = next(data_iter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([100, 882])\n",
      "tensor([[ 3335, 3335, 567, ..., 7, 59, 212],\n",
      " [17728, 78, 8828, ..., 0, 0, 0],\n",
      " [ 22, 107, 92, ..., 0, 0, 0],\n",
      " ...,\n",
      " [ 8, 31, 83, ..., 0, 0, 0],\n",
      " [ 1228, 8, 193, ..., 0, 0, 0],\n",
      " [ 37, 7988, 13193, ..., 0, 0, 0]])\n"
     ]
    }
   ],
   "source": [
    "#seq is padded tensor of shape (batch_size, padded length)\n",
    "print(seq.shape)\n",
    "print(seq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([882, 871, 764, 661, 562, 545, 505, 477, 460, 449, 444, 439, 398, 349,\n",
      " 335, 332, 328, 325, 325, 315, 315, 313, 303, 301, 297, 271, 270, 251,\n",
      " 247, 241, 235, 232, 229, 224, 218, 218, 211, 208, 207, 203, 202, 196,\n",
      " 196, 190, 181, 181, 181, 167, 165, 163, 160, 159, 158, 157, 156, 151,\n",
      " 149, 145, 144, 142, 142, 142, 141, 140, 139, 139, 135, 135, 131, 131,\n",
      " 130, 126, 126, 121, 120, 119, 115, 115, 115, 103, 102, 99, 96, 96,\n",
      " 95, 94, 91, 91, 87, 87, 79, 78, 73, 68, 60, 52, 48, 44,\n",
      " 39, 33])\n"
     ]
    }
   ],
   "source": [
    "#length will tell us length of each sequence without padding, in decreasing order\n",
    "print(length)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([100])\n"
     ]
    }
   ],
   "source": [
    "print(label.shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
"nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.6.9" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 4 121 | } 122 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from data_loader import dataloader 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import json 7 | from tqdm import tqdm 8 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 9 | 10 | class RNN(nn.Module): 11 | def __init__(self,word2id, input_dim, embedding_dim, hidden_dim, output_dim): 12 | super().__init__() 13 | self.word2id = word2id 14 | #input dimension is lenght of your dictionary 15 | self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = self.word2id['']) 16 | self.rnn = nn.RNN(embedding_dim, hidden_dim) 17 | self.fc = nn.Linear(hidden_dim, output_dim) 18 | 19 | def forward(self, text, length): 20 | #text = [batch size, sent len] 21 | 22 | text = text.permute(1,0) 23 | #text = [sent len, batch size] 24 | embedded = self.embedding(text) 25 | #embedded = [sent len, batch size, emb dim] 26 | 27 | # since we have output of different length with zero padded, when we use pack padded sequence then LSTM or RNN will only process non paded elements of our sequence. 28 | # The RNN will return a packed output (which is nothing but hidden state at all non paded elements) as well as the last hidden state of our element. 29 | # Without packed padded sequences, hidden is tensors from the last element in the sequence, which will most probably be a pad token, 30 | # however when using packed padded sequences they are both from the last non-padded element in the sequence. 
        embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, length.cpu())

        packed_output, hidden = self.rnn(embedded)
        output, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(packed_output)
        # output = [sent len, batch size, hid dim]
        # output over padding tokens will be zero

        # hidden = [1, batch size, hid dim]
        # the last output and hidden should be the same; to check that, uncomment the code below

        # # convert length to index
        # l = [lengths - 1 for lengths in length]
        # for i, length in enumerate(l):
        #     assert torch.equal(output[length, i, :], hidden.squeeze(0)[i])
        out = self.fc(hidden.squeeze(0))
        # no softmax here, as we are using BCEWithLogitsLoss
        return out

def accuracy(prediction, labels):
    rounded_preds = torch.round(torch.sigmoid(prediction))
    correct = (rounded_preds == labels).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc


def train(train_data, model, optimizer, criterion):
    avg_loss = 0
    avg_acc = 0
    # print(next(model.parameters()).is_cuda)
    model.train()
    for pad_seq, length, label in tqdm(train_data):

        optimizer.zero_grad()

        pad_seq = pad_seq.to(device)
        label = label.to(device)
        length = length.to(device)
        # BCEWithLogitsLoss expects float targets
        label = label.float()

        # print(label, pad_seq.is_cuda, label.is_cuda)
        output = model(pad_seq, length)
        # output = [batch_size, 1]
        output = output.reshape(output.size(0))
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        acc = accuracy(output, label)
        avg_loss += loss.item()
        avg_acc += acc.item()

    return (1.0 * avg_loss) / len(train_data), (1.0 * avg_acc) / len(train_data)

def evaluate(test_data, model, criterion):
    avg_loss = 0
    avg_acc = 0
    # print(next(model.parameters()).is_cuda)
    model.eval()
    with torch.no_grad():
        for pad_seq, length, label in tqdm(test_data):
            pad_seq = pad_seq.to(device)
            label = label.to(device)
            length = length.to(device)
            # BCEWithLogitsLoss expects float targets
            label = label.float()
            # print(label, pad_seq.is_cuda, label.is_cuda)
            output = model(pad_seq, length)
            # output = [batch_size, 1]
            output = output.reshape(output.size(0))
            loss = criterion(output, label)

            acc = accuracy(output, label)
            avg_loss += loss.item()
            avg_acc += acc.item()

    return (1.0 * avg_loss) / len(test_data), (1.0 * avg_acc) / len(test_data)

if __name__ == '__main__':

    train_path = "./aclImdb/train"
    test_path = "./aclImdb/test"

    with open('./word2id.json', 'r') as f:
        word2id = json.load(f)

    train_data, test_data = dataloader(word2id, train_path, test_path)
    model = RNN(word2id, len(word2id), 100, 256, 1)
    model.to(device)
    criterion = nn.BCEWithLogitsLoss()
    criterion.to(device)
    optimizer = optim.SGD(model.parameters(), lr=1e-3)

    num_epochs = 20
    for i in range(num_epochs):
        print("Training")
        train_loss, train_acc = train(train_data, model, optimizer, criterion)
        print("Evaluating")
        eval_loss, eval_acc = evaluate(test_data, model, criterion)
        print("Training loss: {}, Evaluation loss: {}, Training accuracy: {}, Evaluation accuracy: {}".
              format(train_loss, eval_loss, train_acc, eval_acc))
--------------------------------------------------------------------------------