├── README.md
├── vocab.py
├── data_loader.py
├── check_loader.ipynb
└── train.py
/README.md:
--------------------------------------------------------------------------------
1 | # Sequential-Data-Loader-and-Model-for-Variable-Length-Data
2 | An efficient data loader for text datasets using torch.utils.data.Dataset, a custom collate_fn, and torch.utils.data.DataLoader.
3 | An efficient model for variable-length text using torch.nn.utils.rnn.pack_padded_sequence and torch.nn.utils.rnn.pad_packed_sequence.
4 | The model is used for sentiment classification on the [IMDB dataset](http://ai.stanford.edu/~amaas/data/sentiment/).
5 | To use a different dataset, modify the `reader` function in **data_loader.py** and **vocab.py** (a sketch of a custom reader is shown below).
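
A custom `reader` only needs to return a list of raw text strings and a parallel list of integer labels. Below is a minimal sketch, assuming a hypothetical CSV file with `text` and `label` columns (the file name and column names are purely illustrative, not part of this repo):

```
import csv

def reader(path):
    """Return (data, label): raw text strings and integer class labels."""
    data, label = [], []
    with open(path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            data.append(row["text"])           # raw text; tokenised later with gensim
            label.append(int(row["label"]))    # e.g. 1 = positive, 0 = negative
    return data, label
```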
6 |
7 | # Requirements
8 |
9 | * [PyTorch](https://pytorch.org)
10 | * [Gensim](https://radimrehurek.com/gensim/index.html)
11 | * [tqdm](https://github.com/tqdm/tqdm)
12 |
13 | # Usage
14 | Download the [IMDB dataset](http://ai.stanford.edu/~amaas/data/sentiment/) and extract it into this folder, so that `./aclImdb/train` and `./aclImdb/test` exist.
15 | * To build the vocabulary:
16 | ```
17 | $ python vocab.py
18 | ```
19 | * To train the model:
20 | ```
21 | $ python train.py
22 | ```
23 | * To see the data loader in action, open **check_loader.ipynb**; a minimal inline equivalent is shown below.
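
The snippet below mirrors **check_loader.ipynb** and assumes `word2id.json` has already been created by `vocab.py`:

```
import json
from data_loader import dataloader

train_path = "./aclImdb/train"
test_path = "./aclImdb/test"

with open("./word2id.json", "r") as f:
    word2id = json.load(f)

train_data, test_data = dataloader(word2id, train_path, test_path)
seq, length, label = next(iter(train_data))

print(seq.shape)    # padded batch: (batch_size, padded_length)
print(length)       # original lengths, sorted in decreasing order
print(label.shape)  # (batch_size,)
```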
27 |
28 |
--------------------------------------------------------------------------------
/vocab.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gensim
3 | from collections import Counter
4 | import json
5 |
6 | train_path = "./aclImdb/train"
7 | test_path = "./aclImdb/test"
8 |
9 | # Simple reader: walks the IMDB directory layout (pos/ and neg/ sub-folders)
10 | # and returns the raw review texts together with their labels (1 = positive, 0 = negative).
11 | # Write your own reader for a different dataset.
12 | def reader(path):
13 |     pos_path = os.path.join(path, "pos")
14 |     neg_path = os.path.join(path, "neg")
15 |     data = []
16 |     label = []
17 |     for file in os.listdir(pos_path):
18 |         with open(os.path.join(pos_path, file), encoding="utf-8") as f:
19 |             data.append(f.read())
20 |         label.append(1)
21 |     for file in os.listdir(neg_path):
22 |         with open(os.path.join(neg_path, file), encoding="utf-8") as f:
23 |             data.append(f.read())
24 |         label.append(0)
25 |     return data, label
26 |
27 | def build_vocab(data, min_word_count = 5):
28 | counter = Counter()
29 | for line in data:
30 | l = gensim.utils.simple_preprocess(line)
31 | counter.update(l)
32 |     # initialise the look-up table; index 0 is reserved for padding and index 1 for unknown words
33 |     word2id = {}
34 |     word2id['<pad>'] = 0
35 |     word2id['<unk>'] = 1
36 |     # keep only words that occur more than min_word_count times in the entire corpus
37 |     words = [word for word, count in counter.items() if count > min_word_count]
38 |
39 |     for i, word in enumerate(words):
40 |         word2id[word] = i + 2
41 |
42 | with open("word2id.json", 'w') as f:
43 | json.dump(word2id, f)
44 | return word2id
45 |
46 | data, label = reader(train_path)
47 | word2id = build_vocab(data)
48 | print("Vocabulary built and saved to word2id.json. Dictionary size:", len(word2id))
49 |
--------------------------------------------------------------------------------
/data_loader.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data as D
3 | import numpy as np
4 | import os
5 | import gensim
6 |
7 | # Simple reader: walks the IMDB directory layout (pos/ and neg/ sub-folders)
8 | # and returns the raw review texts together with their labels (1 = positive, 0 = negative).
9 | # Write your own reader for a different dataset.
10 | def reader(path):
11 |     pos_path = os.path.join(path, "pos")
12 |     neg_path = os.path.join(path, "neg")
13 |     data = []
14 |     label = []
15 |     for file in os.listdir(pos_path):
16 |         with open(os.path.join(pos_path, file), encoding="utf-8") as f:
17 |             data.append(f.read())
18 |         label.append(1)
19 |     for file in os.listdir(neg_path):
20 |         with open(os.path.join(neg_path, file), encoding="utf-8") as f:
21 |             data.append(f.read())
22 |         label.append(0)
23 |     return data, label
24 |
25 | # Custom Dataset for PyTorch: returns one (numericalised sequence, label) pair per index.
26 | class Dataset(D.Dataset):
27 | def __init__(self, word2id, train_path):
28 | self.word2id = word2id
29 | self.train_path = train_path
30 | # read the data and label
31 | self.data, self.label = reader(train_path)
32 |
33 | def __getitem__(self, index):
34 | # return the seq and label
35 | seq = self.preprocess(self.data[index])
36 | label = self.label[index]
37 | return seq, label
38 |
39 | def __len__(self):
40 | return(len(self.data))
41 |
42 |     def preprocess(self, text):
43 |         # tokenise the text and map each token to its numerical id using word2id
44 |         line = gensim.utils.simple_preprocess(text)
45 |         seq = []
46 |         for word in line:
47 |             if word in self.word2id:
48 |                 seq.append(self.word2id[word])
49 |             else:
50 |                 seq.append(self.word2id['<unk>'])  # out-of-vocabulary words map to <unk>
51 |         # convert the list into a LongTensor
52 |         seq = torch.from_numpy(np.array(seq))
53 |         return seq
54 |
55 | def collate_fn(data):
56 | '''
57 |
58 |     We need a custom collate_fn rather than the default one, because every
59 |     sentence has a different length and the default collate_fn does not merge
60 |     (pad) variable-length sequences.
61 |
62 |     Args:
63 |         data: list of tuples (training sequence, label)
64 |     Returns:
65 |         padded_seq - padded sequences, tensor of shape (batch_size, padded_length)
66 |         length - original length of each sequence (without padding), tensor of shape (batch_size)
67 |         label - tensor of shape (batch_size)
68 | '''
69 |
70 |     # sort by sequence length in decreasing order; pack_padded_sequence (used in the model) requires this
71 | data.sort(key=lambda x: len(x[0]), reverse=True)
72 | sequences, label = zip(*data)
73 | length = [len(seq) for seq in sequences]
74 | padded_seq = torch.zeros(len(sequences), max(length)).long()
75 | for i, seq in enumerate(sequences):
76 | end = length[i]
77 | padded_seq[i,:end] = seq
78 | return padded_seq, torch.from_numpy(np.array(length)), torch.from_numpy(np.array(label))
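
# Illustrative example (hypothetical values): for a batch of three sequences of
# lengths 5, 3 and 2, collate_fn returns
#   padded_seq : LongTensor of shape (3, 5), zero-padded on the right
#   length     : tensor([5, 3, 2])
#   label      : tensor of shape (3,)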
79 |
80 |
81 | # Builds the train and test DataLoaders from the custom Dataset and collate_fn.
82 | def dataloader(word2id, train_path, test_path, batch_size = 200):
83 | train_dataset = Dataset(word2id, train_path)
84 | test_dataset = Dataset(word2id, test_path)
85 | train_dataloader = D.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,collate_fn=collate_fn)
86 | test_dataloader = D.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
87 |
88 | return train_dataloader, test_dataloader
89 |
--------------------------------------------------------------------------------
/check_loader.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from data_loader import dataloader\n",
10 | "import json "
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "train_path = \"./aclImdb/train\"\n",
20 | "test_path = \"./aclImdb/test\"\n",
21 | "\n",
22 | "with open('./word2id.json', 'r') as f:\n",
23 | " word2id = json.load(f)\n",
24 | "\n",
25 | "train_data, test_data = dataloader(word2id, train_path, test_path)\n",
26 | "data_iter = iter(train_data)\n",
27 | "\n",
28 | "seq, length, label = next(data_iter)"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "torch.Size([100, 882])\n",
41 | "tensor([[ 3335, 3335, 567, ..., 7, 59, 212],\n",
42 | " [17728, 78, 8828, ..., 0, 0, 0],\n",
43 | " [ 22, 107, 92, ..., 0, 0, 0],\n",
44 | " ...,\n",
45 | " [ 8, 31, 83, ..., 0, 0, 0],\n",
46 | " [ 1228, 8, 193, ..., 0, 0, 0],\n",
47 | " [ 37, 7988, 13193, ..., 0, 0, 0]])\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "#seq is padded tensor of shape (batch_size, padded length)\n",
53 | "print(seq.shape)\n",
54 | "print(seq)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 4,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "name": "stdout",
64 | "output_type": "stream",
65 | "text": [
66 | "tensor([882, 871, 764, 661, 562, 545, 505, 477, 460, 449, 444, 439, 398, 349,\n",
67 | " 335, 332, 328, 325, 325, 315, 315, 313, 303, 301, 297, 271, 270, 251,\n",
68 | " 247, 241, 235, 232, 229, 224, 218, 218, 211, 208, 207, 203, 202, 196,\n",
69 | " 196, 190, 181, 181, 181, 167, 165, 163, 160, 159, 158, 157, 156, 151,\n",
70 | " 149, 145, 144, 142, 142, 142, 141, 140, 139, 139, 135, 135, 131, 131,\n",
71 | " 130, 126, 126, 121, 120, 119, 115, 115, 115, 103, 102, 99, 96, 96,\n",
72 | " 95, 94, 91, 91, 87, 87, 79, 78, 73, 68, 60, 52, 48, 44,\n",
73 | " 39, 33])\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "#length will tell us length of each sequence without padding, in decreasing order\n",
79 | "print(length)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 5,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "torch.Size([100])\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "print(label.shape)"
97 | ]
98 | }
99 | ],
100 | "metadata": {
101 | "kernelspec": {
102 | "display_name": "Python 3",
103 | "language": "python",
104 | "name": "python3"
105 | },
106 | "language_info": {
107 | "codemirror_mode": {
108 | "name": "ipython",
109 | "version": 3
110 | },
111 | "file_extension": ".py",
112 | "mimetype": "text/x-python",
113 | "name": "python",
114 | "nbconvert_exporter": "python",
115 | "pygments_lexer": "ipython3",
116 | "version": "3.6.9"
117 | }
118 | },
119 | "nbformat": 4,
120 | "nbformat_minor": 4
121 | }
122 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | from data_loader import dataloader
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import torch.optim as optim
6 | import json
7 | from tqdm import tqdm
8 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
9 |
10 | class RNN(nn.Module):
11 | def __init__(self,word2id, input_dim, embedding_dim, hidden_dim, output_dim):
12 | super().__init__()
13 | self.word2id = word2id
14 |         # input_dim is the size of your dictionary (vocabulary)
15 |         self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=self.word2id['<pad>'])
16 | self.rnn = nn.RNN(embedding_dim, hidden_dim)
17 | self.fc = nn.Linear(hidden_dim, output_dim)
18 |
19 | def forward(self, text, length):
20 | #text = [batch size, sent len]
21 |
22 | text = text.permute(1,0)
23 | #text = [sent len, batch size]
24 | embedded = self.embedding(text)
25 | #embedded = [sent len, batch size, emb dim]
26 |
27 |         # The sequences are zero-padded. With pack_padded_sequence the RNN processes only the
28 |         # non-padded elements; it returns a packed output (the hidden states at all non-padded
29 |         # positions) as well as the final hidden state. Without packing, `hidden` would be taken from
30 |         # the last element of the padded sequence (usually a pad token); with packing it comes from the last non-padded element.
31 |         embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, length.cpu())  # lengths must be a CPU tensor
32 |
33 | packed_output, hidden = self.rnn(embedded)
34 | output, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(packed_output)
35 | #output=[sent len, batch size, hid dim]
36 | #output over padding token will be zero
37 |
38 | #hidden = [1, batch size, hid dim]
39 |         # the last non-padded output and `hidden` should be identical; uncomment the check below to verify
40 |
41 | # # convert length to index
42 | # l = [lengths-1 for lengths in length]
43 | # for i, length in enumerate(l):
44 | # assert torch.equal(output[length,i,:], hidden.squeeze(0)[i])
45 | out = self.fc(hidden.squeeze(0))
46 | # No softmax as we are using BCEWithLogitsLoss
47 | return out
48 |
49 | def accuracy(prediction, labels):
50 | rounded_preds = torch.round(torch.sigmoid(prediction))
51 | correct = (rounded_preds == labels).float() #convert into float for division
52 | acc = correct.sum() / len(correct)
53 | return acc
54 |
55 |
56 | def train(train_data, model, optimizer, criterion):
57 | avg_loss = 0
58 | avg_acc = 0
59 | # print(next(model.parameters()).is_cuda)
60 | model.train()
61 | for pad_seq, length, label in tqdm(train_data):
62 |
63 | optimizer.zero_grad()
64 |
65 |         pad_seq = pad_seq.to(device)
66 |         label = label.to(device)
67 |         # length is left on the CPU: pack_padded_sequence expects CPU lengths
68 |         label = label.float()  # BCEWithLogitsLoss expects float targets
69 |
70 | # print(label, pad_seq.is_cuda, label.is_cuda)
71 | output = model(pad_seq, length)
72 | #output =[batch_size, 1]
73 | output = output.reshape(output.size(0))
74 | loss = criterion(output, label)
75 | loss.backward()
76 | optimizer.step()
77 |
78 | acc = accuracy(output, label)
79 | avg_loss += loss.item()
80 | avg_acc += acc.item()
81 |
82 | return (1.0 * avg_loss)/len(train_data), (1.0*avg_acc)/len(train_data)
83 |
84 | @torch.no_grad()  # gradients are not needed during evaluation
85 | def evaluate(test_data, model, criterion):
86 |     avg_loss = 0
87 |     avg_acc = 0
88 |     model.eval()
89 | for pad_seq, length, label in tqdm(test_data):
90 |         pad_seq = pad_seq.to(device)
91 |         label = label.to(device)
92 |         # length is left on the CPU: pack_padded_sequence expects CPU lengths
93 |         label = label.float()  # BCEWithLogitsLoss expects float targets
94 | # print(label, pad_seq.is_cuda, label.is_cuda)
95 | output = model(pad_seq, length)
96 | #output =[batch_size, 1]
97 | output = output.reshape(output.size(0))
98 | loss = criterion(output, label)
99 |
100 | acc = accuracy(output, label)
101 | avg_loss += loss.item()
102 | avg_acc += acc.item()
103 |
104 | return (1.0 * avg_loss)/len(test_data), (1.0*avg_acc)/len(test_data)
105 |
106 | if __name__ == '__main__':
107 |
108 | train_path = "./aclImdb/train"
109 | test_path = "./aclImdb/test"
110 |
111 | with open('./word2id.json', 'r') as f:
112 | word2id = json.load(f)
113 |
114 | train_data, test_data = dataloader(word2id, train_path, test_path)
115 | model = RNN(word2id, len(word2id), 100, 256, 1)
116 | optimizer = optim.SGD(model.parameters(), lr=1e-3)
117 | criterion = nn.BCEWithLogitsLoss()
118 |
119 |     model.to(device)
120 |     criterion.to(device)
121 |
122 |     num_epochs = 20
123 |     for i in range(num_epochs):
124 |         print("Training")
125 | train_loss, train_acc = train(train_data, model, optimizer, criterion)
126 | print("Evaluating")
127 | eval_loss, eval_acc = evaluate(test_data, model, criterion)
128 |         print("Training loss: {}, Evaluation loss: {}, Training accuracy: {}, Evaluation accuracy: {}".
129 |               format(train_loss, eval_loss, train_acc, eval_acc))
--------------------------------------------------------------------------------