├── README.md
├── vocab.py
├── data_loader.py
├── check_loader.ipynb
└── train.py
/README.md:
--------------------------------------------------------------------------------
1 | # Sequential-Data-Loader-and-Model-for-Variable-Length-Data
2 | An efficient data loader for text datasets using torch.utils.data.Dataset, a custom collate_fn, and torch.utils.data.DataLoader.
3 | An efficient model for variable-length text using torch.nn.utils.rnn.pack_padded_sequence and torch.nn.utils.rnn.pad_packed_sequence.
4 | The model is used for sentiment classification on the [IMDB dataset](http://ai.stanford.edu/~amaas/data/sentiment/).
5 | To use a different dataset, modify the `reader` function in **data_loader.py** and **vocab.py** (a sketch of a custom reader is shown below).
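
A custom `reader` only needs to return a list of raw text strings and a parallel list of integer labels. Below is a minimal sketch, assuming a hypothetical CSV file with `text` and `label` columns (the file name and column names are purely illustrative, not part of this repo):

```
import csv

def reader(path):
    """Return (data, label): raw text strings and integer class labels."""
    data, label = [], []
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            data.append(row["text"])           # raw text; tokenised later with gensim
            label.append(int(row["label"]))    # e.g. 1 = positive, 0 = negative
    return data, label
```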
6 |
7 | # Requirements
8 |
9 | * [PyTorch](https://pytorch.org)
10 | * [Gensim](https://radimrehurek.com/gensim/index.html)
11 | * [tqdm](https://github.com/tqdm/tqdm)
12 |
13 | # Usage
14 | Download the [IMDB dataset](http://ai.stanford.edu/~amaas/data/sentiment/) and extract it into this folder, so that `./aclImdb/train` and `./aclImdb/test` exist.
15 | * To build the vocabulary:
16 | ```
17 | $ python vocab.py
18 | ```
19 | * To train the model:
20 | ```
21 | $ python train.py
22 | ```
23 | * To see the data loader in action, open **check_loader.ipynb**; a minimal inline equivalent is shown below.
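
The snippet below mirrors **check_loader.ipynb** and assumes `word2id.json` has already been created by `vocab.py`:

```
import json
from data_loader import dataloader

train_path = "./aclImdb/train"
test_path = "./aclImdb/test"

with open("./word2id.json", "r") as f:
    word2id = json.load(f)

train_data, test_data = dataloader(word2id, train_path, test_path)
seq, length, label = next(iter(train_data))

print(seq.shape)    # padded batch: (batch_size, padded_length)
print(length)       # original lengths, sorted in decreasing order
print(label.shape)  # (batch_size,)
```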
27 |
28 |
--------------------------------------------------------------------------------
/vocab.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gensim
3 | from collections import Counter
4 | import json
5 |
6 | train_path = "./aclImdb/train"
7 | test_path = "./aclImdb/test"
8 |
9 | # Simple reader: walks the IMDB directory layout (pos/ and neg/ sub-folders)
10 | # and returns the raw review texts together with their labels (1 = positive, 0 = negative).
11 | # Write your own reader for a different dataset.
12 | def reader(path):
13 |     pos_path = os.path.join(path, "pos")
14 |     neg_path = os.path.join(path, "neg")
15 |     data = []
16 |     label = []
17 |     for file in os.listdir(pos_path):
18 |         with open(os.path.join(pos_path, file), encoding="utf-8") as f:
19 |             data.append(f.read())
20 |         label.append(1)
21 |     for file in os.listdir(neg_path):
22 |         with open(os.path.join(neg_path, file), encoding="utf-8") as f:
23 |             data.append(f.read())
24 |         label.append(0)
25 |     return data, label
26 |
27 | def build_vocab(data, min_word_count = 5):
28 | counter = Counter()
29 | for line in data:
30 | l = gensim.utils.simple_preprocess(line)
31 | counter.update(l)
32 |     # initialise the look-up table; index 0 is reserved for padding and index 1 for unknown words
33 |     word2id = {}
34 |     word2id['<pad>'] = 0
35 |     word2id['<unk>'] = 1
36 |     # keep only words that occur more than min_word_count times in the entire corpus
37 |     words = [word for word, count in counter.items() if count > min_word_count]
38 |
39 |     for i, word in enumerate(words):
40 |         word2id[word] = i + 2
41 |
42 | with open("word2id.json", 'w') as f:
43 | json.dump(word2id, f)
44 | return word2id
45 |
46 | data, label = reader(train_path)
47 | word2id = build_vocab(data)
48 | print("Vocabulary built and saved to word2id.json. Dictionary size:", len(word2id))
49 |
--------------------------------------------------------------------------------
/data_loader.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data as D
3 | import numpy as np
4 | import os
5 | import gensim
6 |
7 | # Simple reader: walks the IMDB directory layout (pos/ and neg/ sub-folders)
8 | # and returns the raw review texts together with their labels (1 = positive, 0 = negative).
9 | # Write your own reader for a different dataset.
10 | def reader(path):
11 |     pos_path = os.path.join(path, "pos")
12 |     neg_path = os.path.join(path, "neg")
13 |     data = []
14 |     label = []
15 |     for file in os.listdir(pos_path):
16 |         with open(os.path.join(pos_path, file), encoding="utf-8") as f:
17 |             data.append(f.read())
18 |         label.append(1)
19 |     for file in os.listdir(neg_path):
20 |         with open(os.path.join(neg_path, file), encoding="utf-8") as f:
21 |             data.append(f.read())
22 |         label.append(0)
23 |     return data, label
24 |
25 | # Custom Dataset for PyTorch: returns one (numericalised sequence, label) pair per index.
26 | class Dataset(D.Dataset):
27 | def __init__(self, word2id, train_path):
28 | self.word2id = word2id
29 | self.train_path = train_path
30 | # read the data and label
31 | self.data, self.label = reader(train_path)
32 |
33 | def __getitem__(self, index):
34 | # return the seq and label
35 | seq = self.preprocess(self.data[index])
36 | label = self.label[index]
37 | return seq, label
38 |
39 | def __len__(self):
40 | return(len(self.data))
41 |
42 |     def preprocess(self, text):
43 |         # tokenise the text and map each token to its numerical id using word2id
44 |         line = gensim.utils.simple_preprocess(text)
45 |         seq = []
46 |         for word in line:
47 |             if word in self.word2id:
48 |                 seq.append(self.word2id[word])
49 |             else:
50 |                 seq.append(self.word2id['<unk>'])  # out-of-vocabulary words map to <unk>
51 |         # convert the list into a LongTensor
52 |         seq = torch.from_numpy(np.array(seq))
53 |         return seq
54 |
55 | def collate_fn(data):
56 | '''
57 |
58 |     We need a custom collate_fn rather than the default one, because every
59 |     sentence has a different length and the default collate_fn does not merge
60 |     (pad) variable-length sequences.
61 |
62 |     Args:
63 |         data: list of tuples (training sequence, label)
64 |     Returns:
65 |         padded_seq - padded sequences, tensor of shape (batch_size, padded_length)
66 |         length - original length of each sequence (without padding), tensor of shape (batch_size)
67 |         label - tensor of shape (batch_size)
68 | '''
69 |
70 |     # sort by sequence length in decreasing order; pack_padded_sequence (used in the model) requires this
71 | data.sort(key=lambda x: len(x[0]), reverse=True)
72 | sequences, label = zip(*data)
73 | length = [len(seq) for seq in sequences]
74 | padded_seq = torch.zeros(len(sequences), max(length)).long()
75 | for i, seq in enumerate(sequences):
76 | end = length[i]
77 | padded_seq[i,:end] = seq
78 | return padded_seq, torch.from_numpy(np.array(length)), torch.from_numpy(np.array(label))
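
# Illustrative example (hypothetical values): for a batch of three sequences of
# lengths 5, 3 and 2, collate_fn returns
#   padded_seq : LongTensor of shape (3, 5), zero-padded on the right
#   length     : tensor([5, 3, 2])
#   label      : tensor of shape (3,)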
79 |
80 |
81 | # Builds the train and test DataLoaders from the custom Dataset and collate_fn.
82 | def dataloader(word2id, train_path, test_path, batch_size = 200):
83 | train_dataset = Dataset(word2id, train_path)
84 | test_dataset = Dataset(word2id, test_path)
85 | train_dataloader = D.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,collate_fn=collate_fn)
86 | test_dataloader = D.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
87 |
88 | return train_dataloader, test_dataloader
89 |
--------------------------------------------------------------------------------
/check_loader.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from data_loader import dataloader\n",
10 | "import json "
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "train_path = \"./aclImdb/train\"\n",
20 | "test_path = \"./aclImdb/test\"\n",
21 | "\n",
22 | "with open('./word2id.json', 'r') as f:\n",
23 | " word2id = json.load(f)\n",
24 | "\n",
25 | "train_data, test_data = dataloader(word2id, train_path, test_path)\n",
26 | "data_iter = iter(train_data)\n",
27 | "\n",
28 | "seq, length, label = next(data_iter)"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "torch.Size([100, 882])\n",
41 | "tensor([[ 3335, 3335, 567, ..., 7, 59, 212],\n",
42 | " [17728, 78, 8828, ..., 0, 0, 0],\n",
43 | " [ 22, 107, 92, ..., 0, 0, 0],\n",
44 | " ...,\n",
45 | " [ 8, 31, 83, ..., 0, 0, 0],\n",
46 | " [ 1228, 8, 193, ..., 0, 0, 0],\n",
47 | " [ 37, 7988, 13193, ..., 0, 0, 0]])\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "#seq is padded tensor of shape (batch_size, padded length)\n",
53 | "print(seq.shape)\n",
54 | "print(seq)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 4,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "tensor([882, 871, 764, 661, 562, 545, 505, 477, 460, 449, 444, 439, 398, 349,\n",
67 | " 335, 332, 328, 325, 325, 315, 315, 313, 303, 301, 297, 271, 270, 251,\n",
68 | " 247, 241, 235, 232, 229, 224, 218, 218, 211, 208, 207, 203, 202, 196,\n",
69 | " 196, 190, 181, 181, 181, 167, 165, 163, 160, 159, 158, 157, 156, 151,\n",
70 | " 149, 145, 144, 142, 142, 142, 141, 140, 139, 139, 135, 135, 131, 131,\n",
71 | " 130, 126, 126, 121, 120, 119, 115, 115, 115, 103, 102, 99, 96, 96,\n",
72 | " 95, 94, 91, 91, 87, 87, 79, 78, 73, 68, 60, 52, 48, 44,\n",
73 | " 39, 33])\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "#length will tell us length of each sequence without padding, in decreasing order\n",
79 | "print(length)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 5,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "torch.Size([100])\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "print(label.shape)"
97 | ]
98 | }
99 | ],
100 | "metadata": {
101 | "kernelspec": {
102 | "display_name": "Python 3",
103 | "language": "python",
104 | "name": "python3"
105 | },
106 | "language_info": {
107 | "codemirror_mode": {
108 | "name": "ipython",
109 | "version": 3
110 | },
111 | "file_extension": ".py",
112 | "mimetype": "text/x-python",
113 | "name": "python",
114 | "nbconvert_exporter": "python",
115 | "pygments_lexer": "ipython3",
116 | "version": "3.6.9"
117 | }
118 | },
119 | "nbformat": 4,
120 | "nbformat_minor": 4
121 | }
122 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | from data_loader import dataloader
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import torch.optim as optim
6 | import json
7 | from tqdm import tqdm
8 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
9 |
10 | class RNN(nn.Module):
11 | def __init__(self,word2id, input_dim, embedding_dim, hidden_dim, output_dim):
12 | super().__init__()
13 | self.word2id = word2id
14 |         # input_dim is the size of your dictionary (vocabulary)
15 |         self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=self.word2id['<pad>'])
16 | self.rnn = nn.RNN(embedding_dim, hidden_dim)
17 | self.fc = nn.Linear(hidden_dim, output_dim)
18 |
19 | def forward(self, text, length):
20 | #text = [batch size, sent len]
21 |
22 | text = text.permute(1,0)
23 | #text = [sent len, batch size]
24 | embedded = self.embedding(text)
25 | #embedded = [sent len, batch size, emb dim]
26 |
27 |         # The sequences are zero-padded. With pack_padded_sequence the RNN processes only the
28 |         # non-padded elements; it returns a packed output (the hidden states at all non-padded
29 |         # positions) as well as the final hidden state. Without packing, `hidden` would be taken from
30 |         # the last element of the padded sequence (usually a pad token); with packing it comes from the last non-padded element.
31 |         embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, length.cpu())  # lengths must be a CPU tensor
32 |
33 | packed_output, hidden = self.rnn(embedded)
34 | output, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(packed_output)
35 | #output=[sent len, batch size, hid dim]
36 | #output over padding token will be zero
37 |
38 | #hidden = [1, batch size, hid dim]
39 |         # the last non-padded output and `hidden` should be identical; uncomment the check below to verify
40 |
41 | # # convert length to index
42 | # l = [lengths-1 for lengths in length]
43 | # for i, length in enumerate(l):
44 | # assert torch.equal(output[length,i,:], hidden.squeeze(0)[i])
45 | out = self.fc(hidden.squeeze(0))
46 | # No softmax as we are using BCEWithLogitsLoss
47 | return out
48 |
49 | def accuracy(prediction, labels):
50 | rounded_preds = torch.round(torch.sigmoid(prediction))
51 | correct = (rounded_preds == labels).float() #convert into float for division
52 | acc = correct.sum() / len(correct)
53 | return acc
54 |
55 |
56 | def train(train_data, model, optimizer, criterion):
57 | avg_loss = 0
58 | avg_acc = 0
59 | # print(next(model.parameters()).is_cuda)
60 | model.train()
61 | for pad_seq, length, label in tqdm(train_data):
62 |
63 | optimizer.zero_grad()
64 |
65 |         pad_seq = pad_seq.to(device)
66 |         label = label.to(device)
67 |         # length is left on the CPU: pack_padded_sequence expects CPU lengths
68 |         label = label.float()  # BCEWithLogitsLoss expects float targets
69 |
70 | # print(label, pad_seq.is_cuda, label.is_cuda)
71 | output = model(pad_seq, length)
72 | #output =[batch_size, 1]
73 | output = output.reshape(output.size(0))
74 | loss = criterion(output, label)
75 | loss.backward()
76 | optimizer.step()
77 |
78 | acc = accuracy(output, label)
79 | avg_loss += loss.item()
80 | avg_acc += acc.item()
81 |
82 | return (1.0 * avg_loss)/len(train_data), (1.0*avg_acc)/len(train_data)
83 |
84 | @torch.no_grad()  # gradients are not needed during evaluation
85 | def evaluate(test_data, model, criterion):
86 |     avg_loss = 0
87 |     avg_acc = 0
88 |     model.eval()
89 | for pad_seq, length, label in tqdm(test_data):
90 |         pad_seq = pad_seq.to(device)
91 |         label = label.to(device)
92 |         # length is left on the CPU: pack_padded_sequence expects CPU lengths
93 |         label = label.float()  # BCEWithLogitsLoss expects float targets
94 | # print(label, pad_seq.is_cuda, label.is_cuda)
95 | output = model(pad_seq, length)
96 | #output =[batch_size, 1]
97 | output = output.reshape(output.size(0))
98 | loss = criterion(output, label)
99 |
100 | acc = accuracy(output, label)
101 | avg_loss += loss.item()
102 | avg_acc += acc.item()
103 |
104 | return (1.0 * avg_loss)/len(test_data), (1.0*avg_acc)/len(test_data)
105 |
106 | if __name__ == '__main__':
107 |
108 | train_path = "./aclImdb/train"
109 | test_path = "./aclImdb/test"
110 |
111 | with open('./word2id.json', 'r') as f:
112 | word2id = json.load(f)
113 |
114 | train_data, test_data = dataloader(word2id, train_path, test_path)
115 | model = RNN(word2id, len(word2id), 100, 256, 1)
116 | optimizer = optim.SGD(model.parameters(), lr=1e-3)
117 | criterion = nn.BCEWithLogitsLoss()
118 |
119 |     model.to(device)
120 |     criterion.to(device)
121 |
122 |     num_epochs = 20
123 |     for i in range(num_epochs):
124 |         print("Training")
125 | train_loss, train_acc = train(train_data, model, optimizer, criterion)
126 | print("Evaluating")
127 | eval_loss, eval_acc = evaluate(test_data, model, criterion)
128 |         print("Training loss: {}, Evaluation loss: {}, Training accuracy: {}, Evaluation accuracy: {}".
129 |               format(train_loss, eval_loss, train_acc, eval_acc))
--------------------------------------------------------------------------------