├── README.md
├── dataloader.py
├── main.py
├── model.py
├── preprocess.py
├── requirements.txt
├── util.py
└── vocab.py

/README.md:
--------------------------------------------------------------------------------
1 | # RNN-based short text classification
2 |
3 | - This is for multi-class short text classification.
4 | - The model combines a word embedding layer, an LSTM (or GRU), and a fully-connected layer, built with [PyTorch](http://pytorch.org).
5 | - Mini-batches are created by zero-padding and processed with torch.nn.utils.rnn.PackedSequence.
6 | - Cross-entropy loss + Adam optimizer.
7 | - Supports pretrained word embeddings ([GloVe](https://nlp.stanford.edu/projects/glove/)).
8 | ## Model
9 | - Embedding --> Dropout --> LSTM(GRU) --> Dropout --> FC.
10 |
11 |
12 |
13 | ## Preprocessing
14 | - The following command downloads the dataset used in
15 | [Learning to Classify Short and Sparse Text & Web with Hidden Topics from Large-scale Data Collections](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.332.6000&rep=rep1&type=pdf)
16 | from [here](http://jwebpro.sourceforge.net/data-web-snippets.tar.gz) and processes it for training.
17 | - It also downloads the GloVe embeddings.
18 | ```
19 | python preprocess.py
20 | ```
21 |
22 | ## Training
23 |
24 | - The following command starts training. Run it with ```-h``` for optional arguments.
25 |
26 | ```
27 | python main.py
28 | ```
29 |
--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import torch
3 | import pandas as pd
4 | import numpy as np
5 |
6 | import util as ut
7 |
8 | class TextClassDataLoader(object):
9 |
10 |     def __init__(self, path_file, word_to_index, batch_size=32):
11 |         """
12 |
13 |         Args:
14 |             path_file: path to a tab-separated file with 'label' and 'body' columns
15 |             word_to_index: dict mapping a token to its vocabulary index
16 |             batch_size: mini-batch size
17 |         """
18 |
19 |         self.batch_size = batch_size
20 |         self.word_to_index = word_to_index
21 |
22 |         # read file
23 |         df = pd.read_csv(path_file, delimiter='\t')
24 |         df['body'] = df['body'].apply(ut._tokenize)
25 |         df['body'] = df['body'].apply(self.generate_indexifyer())
26 |         self.samples = df.values.tolist()
27 |
28 |         # for batch
29 |         self.n_samples = len(self.samples)
30 |         self.n_batches = int(self.n_samples / self.batch_size)
31 |         self.max_length = self._get_max_length()
32 |         self._shuffle_indices()
33 |
34 |         self.report()
35 |
36 |     def _shuffle_indices(self):
37 |         self.indices = np.random.permutation(self.n_samples)
38 |         self.index = 0
39 |         self.batch_index = 0
40 |
41 |     def _get_max_length(self):
42 |         length = 0
43 |         for sample in self.samples:
44 |             length = max(length, len(sample[1]))
45 |         return length
46 |
47 |     def generate_indexifyer(self):
48 |
49 |         def indexify(lst_text):
50 |             indices = []
51 |             for word in lst_text:
52 |                 if word in self.word_to_index:
53 |                     indices.append(self.word_to_index[word])
54 |                 else:
55 |                     indices.append(self.word_to_index['__UNK__'])
56 |             return indices
57 |
58 |         return indexify
59 |
60 |     @staticmethod
61 |     def _padding(batch_x):
62 |         batch_s = sorted(batch_x, key=lambda x: len(x))
63 |         size = len(batch_s[-1])
64 |         for i, x in enumerate(batch_x):
65 |             missing = size - len(x)
66 |             batch_x[i] = batch_x[i] + [0 for _ in range(missing)]
67 |         return batch_x
68 |
69 |     def _create_batch(self):
70 |         batch = []
71 |         n = 0
72 |         while n < self.batch_size:
73 |             _index = self.indices[self.index]
74 |             batch.append(self.samples[_index])
75 |             self.index += 1
76 |             n += 1
77 |         self.batch_index += 1
78 |
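        # The remainder of this method (below) turns `batch`, a list of
        # [label, indexed-token-list] rows, into tensors: it unzips labels and
        # sequences, records each sequence length, copies every sequence into a
        # zero-initialized LongTensor (so shorter sequences are right-padded with
        # the padding index 0), and sorts the batch by length in descending order,
        # which torch.nn.utils.rnn.pack_padded_sequence expects.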
79 |         label, string = tuple(zip(*batch))
80 |
81 |         # get the length of each seq in your batch
82 |         seq_lengths = torch.LongTensor(list(map(len, string)))  # list() needed: map is lazy in Python 3
83 |
84 |         # dump padding everywhere, and place seqs on the left.
85 |         # NOTE: you only need a tensor as big as your longest sequence
86 |         seq_tensor = torch.zeros((len(string), seq_lengths.max())).long()
87 |         for idx, (seq, seqlen) in enumerate(zip(string, seq_lengths)):
88 |             seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
89 |
90 |         # SORT YOUR TENSORS BY LENGTH!
91 |         seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
92 |         seq_tensor = seq_tensor[perm_idx]
93 |         # seq_tensor = seq_tensor.transpose(0, 1)
94 |
95 |         label = torch.LongTensor(label)
96 |         label = label[perm_idx]
97 |
98 |         return seq_tensor, label, seq_lengths
99 |
100 |     def __len__(self):
101 |         return self.n_batches
102 |
103 |     def __iter__(self):
104 |         self._shuffle_indices()
105 |         for i in range(self.n_batches):
106 |             if self.batch_index == self.n_batches:
107 |                 return  # PEP 479: return ends the generator; raising StopIteration here is an error in Python 3.7+
108 |             yield self._create_batch()
109 |
110 |     def show_samples(self, n=10):
111 |         for sample in self.samples[:n]:
112 |             print(sample)
113 |
114 |     def report(self):
115 |         print('# samples: {}'.format(len(self.samples)))
116 |         print('max len: {}'.format(self.max_length))
117 |         print('# vocab: {}'.format(len(self.word_to_index)))
118 |         print('# batches: {} (batch_size = {})'.format(self.n_batches, self.batch_size))
119 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import time
4 | import gc
5 | import os
6 | import argparse
7 |
8 | import numpy as np
9 | from sklearn.externals import joblib
10 | import torch
11 | from torch import nn
12 | import torch.backends.cudnn as cudnn
13 |
14 | from vocab import VocabBuilder, GloveVocabBuilder
15 | from dataloader import TextClassDataLoader
16 | from model import RNN
17 | from util import AverageMeter, accuracy
18 | from util import adjust_learning_rate
19 |
20 | np.random.seed(0)
21 | torch.manual_seed(0)
22 |
23 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
24 | parser.add_argument('--epochs', default=50, type=int, metavar='N', help='number of total epochs to run')
25 | parser.add_argument('-b', '--batch-size', default=128, type=int, metavar='N', help='mini-batch size')
26 | parser.add_argument('--lr', '--learning-rate', default=0.005, type=float, metavar='LR', help='initial learning rate')
27 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, metavar='W', help='weight decay')
28 | parser.add_argument('--print-freq', '-p', default=10, type=int, metavar='N', help='print frequency')
29 | parser.add_argument('--save-freq', '-sf', default=10, type=int, metavar='N', help='model save frequency (epochs)')
30 | parser.add_argument('--embedding-size', default=50, type=int, metavar='N', help='embedding size')
31 | parser.add_argument('--hidden-size', default=128, type=int, metavar='N', help='rnn hidden size')
32 | parser.add_argument('--layers', default=2, type=int, metavar='N', help='number of rnn layers')
33 | parser.add_argument('--classes', default=8, type=int, metavar='N', help='number of output classes')
34 | parser.add_argument('--min-samples', default=5, type=int, metavar='N', help='min occurrences for a word to be kept in the vocab')
35 | parser.add_argument('--cuda', default=False, action='store_true', help='use cuda')
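# Illustrative invocations (values are hypothetical; every flag used here is defined
# in this argument block):
#   python main.py --epochs 30 --batch-size 64 --rnn GRU --cuda
#   python main.py --glove glove/glove.6B.50d.txt --mean_seq --lr 0.001
# The second example assumes preprocess.py has already unpacked the GloVe files into glove/.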
36 | parser.add_argument('--glove', default='glove/glove.6B.100d.txt', help='path to glove txt')
37 | parser.add_argument('--rnn', default='LSTM', choices=['LSTM', 'GRU'], help='rnn module type')
38 | parser.add_argument('--mean_seq', default=False, action='store_true', help='use mean of rnn output')
39 | parser.add_argument('--clip', type=float, default=0.25, help='gradient clipping')
40 | args = parser.parse_args()
41 |
42 |
43 | # create vocab
44 | print("===> creating vocabs ...")
45 | end = time.time()
46 | v_builder, d_word_index, embed = None, None, None
47 | if os.path.exists(args.glove):
48 |     v_builder = GloveVocabBuilder(path_glove=args.glove)
49 |     d_word_index, embed = v_builder.get_word_index()
50 |     args.embedding_size = embed.size(1)
51 | else:
52 |     v_builder = VocabBuilder(path_file='data/train.tsv')
53 |     d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
54 |
55 | if not os.path.exists('gen'):
56 |     os.mkdir('gen')
57 | joblib.dump(d_word_index, 'gen/d_word_index.pkl', compress=3)
58 | print('===> vocab creation: {t:.3f}'.format(t=time.time()-end))
59 |
60 | print('args: ', args)
61 |
62 | # create trainer
63 | print("===> creating dataloaders ...")
64 | end = time.time()
65 | train_loader = TextClassDataLoader('data/train.tsv', d_word_index, batch_size=args.batch_size)
66 | val_loader = TextClassDataLoader('data/test.tsv', d_word_index, batch_size=args.batch_size)
67 | print('===> dataloader creation: {t:.3f}'.format(t=time.time()-end))
68 |
69 |
70 | # create model
71 | print("===> creating rnn model ...")
72 | vocab_size = len(d_word_index)
73 | model = RNN(vocab_size=vocab_size, embed_size=args.embedding_size, num_output=args.classes, rnn_model=args.rnn,
74 |             use_last=(not args.mean_seq),
75 |             hidden_size=args.hidden_size, embedding_tensor=embed, num_layers=args.layers, batch_first=True)
76 | print(model)
77 |
78 | # optimizer and loss
79 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.weight_decay)
80 |
81 | criterion = nn.CrossEntropyLoss()
82 | print(optimizer)
83 | print(criterion)
84 |
85 | if args.cuda:
86 |     torch.backends.cudnn.enabled = True
87 |     cudnn.benchmark = True
88 |     model.cuda()
89 |     criterion = criterion.cuda()
90 |
91 |
92 | def train(train_loader, model, criterion, optimizer, epoch):
93 |     batch_time = AverageMeter()
94 |     data_time = AverageMeter()
95 |     losses = AverageMeter()
96 |     top1 = AverageMeter()
97 |
98 |     # switch to train mode
99 |     model.train()
100 |
101 |     end = time.time()
102 |     for i, (input, target, seq_lengths) in enumerate(train_loader):
103 |         # measure data loading time
104 |         data_time.update(time.time() - end)
105 |
106 |         if args.cuda:
107 |             input = input.cuda(non_blocking=True)
108 |             target = target.cuda(non_blocking=True)
109 |
110 |         # compute output
111 |         output = model(input, seq_lengths)
112 |         loss = criterion(output, target)
113 |
114 |         # measure accuracy and record loss
115 |         prec1 = accuracy(output.data, target, topk=(1,))
116 |         losses.update(loss.item(), input.size(0))
117 |         top1.update(prec1[0][0], input.size(0))
118 |
119 |         # compute gradient and do SGD step
120 |         optimizer.zero_grad()
121 |         loss.backward()
122 |
123 |         torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
124 |         optimizer.step()
125 |
126 |         # measure elapsed time
127 |         batch_time.update(time.time() - end)
128 |         end = time.time()
129 |
130 |         if i != 0 and i % args.print_freq == 0:
131 |             print('Epoch: [{0}][{1}/{2}] Time {batch_time.val:.3f} ({batch_time.avg:.3f}) '
132 |                   'Data {data_time.val:.3f} ({data_time.avg:.3f}) Loss {loss.val:.4f} ({loss.avg:.4f}) '
133 |                   'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
134 |                       epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1))
135 |             gc.collect()
136 |
137 |
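# A minimal single-sentence inference sketch (not part of the original script; the
# helper below is illustrative only). It mirrors the preprocessing in
# TextClassDataLoader: tokenize, map tokens to indices via d_word_index (falling back
# to '__UNK__'), and hand the model the sequence length for packing. It assumes a
# CPU model; move the tensors with .cuda() when args.cuda is set.
def predict_sentence(model, d_word_index, text):
    import util as ut  # repo-local tokenizer used by the dataloader
    tokens = ut._tokenize(text)
    indices = [d_word_index.get(t, d_word_index['__UNK__']) for t in tokens]
    seq_tensor = torch.LongTensor(indices).unsqueeze(0)  # shape (1, seq_len)
    seq_lengths = torch.LongTensor([len(indices)])       # shape (1,)
    model.eval()                                          # BatchNorm needs eval mode for batch size 1
    with torch.no_grad():
        logits = model(seq_tensor, seq_lengths)
    return logits.argmax(dim=1).item()                    # predicted class index (see util.LABEL_TO_INDEX)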
138 | def test(val_loader, model, criterion):
139 |     batch_time = AverageMeter()
140 |     losses = AverageMeter()
141 |     top1 = AverageMeter()
142 |
143 |     # switch to evaluate mode
144 |     model.eval()
145 |     end = time.time()
146 |     for i, (input, target, seq_lengths) in enumerate(val_loader):
147 |
148 |         if args.cuda:
149 |             input = input.cuda(non_blocking=True)
150 |             target = target.cuda(non_blocking=True)
151 |
152 |         # compute output
153 |         output = model(input, seq_lengths)
154 |         loss = criterion(output, target)
155 |
156 |         # measure accuracy and record loss
157 |         prec1 = accuracy(output.data, target, topk=(1,))
158 |         losses.update(loss.item(), input.size(0))
159 |         top1.update(prec1[0][0], input.size(0))
160 |
161 |         # measure elapsed time
162 |         batch_time.update(time.time() - end)
163 |         end = time.time()
164 |
165 |         if i != 0 and i % args.print_freq == 0:
166 |             print('Test: [{0}/{1}] Time {batch_time.val:.3f} ({batch_time.avg:.3f}) '
167 |                   'Loss {loss.val:.4f} ({loss.avg:.4f}) Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
168 |                       i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1))
169 |             gc.collect()
170 |
171 |     print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))
172 |     return top1.avg
173 |
174 |
175 | # training and testing
176 | for epoch in range(1, args.epochs+1):
177 |
178 |     adjust_learning_rate(args.lr, optimizer, epoch)
179 |     train(train_loader, model, criterion, optimizer, epoch)
180 |     test(val_loader, model, criterion)
181 |
182 |     # save current model
183 |     if epoch % args.save_freq == 0:
184 |         name_model = 'rnn_{}.pkl'.format(epoch)
185 |         path_save_model = os.path.join('gen', name_model)
186 |         joblib.dump(model.float(), path_save_model, compress=2)
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
4 |
5 |
6 | class RNN(nn.Module):
7 |
8 |     def __init__(self, vocab_size, embed_size, num_output, rnn_model='LSTM', use_last=True, embedding_tensor=None,
9 |                  padding_index=0, hidden_size=64, num_layers=1, batch_first=True):
10 |         """
11 |
12 |         Args:
13 |             vocab_size: vocab size
14 |             embed_size: embedding size
15 |             num_output: number of output (classes)
16 |             rnn_model: LSTM or GRU
17 |             use_last: if True, classify from the last rnn output; otherwise use the mean over time steps
18 |             embedding_tensor: optional pretrained embedding weights (e.g. GloVe); frozen when given
19 |             padding_index: vocab index of the padding token
20 |             hidden_size: hidden size of rnn module
21 |             num_layers: number of layers in rnn module
22 |             batch_first: batch first option
23 |         """
24 |
25 |         super(RNN, self).__init__()
26 |         self.use_last = use_last
27 |         # embedding
28 |         self.encoder = None
29 |         if torch.is_tensor(embedding_tensor):
30 |             self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=padding_index, _weight=embedding_tensor)
31 |             self.encoder.weight.requires_grad = False
32 |         else:
33 |             self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=padding_index)
34 |
35 |         self.drop_en = nn.Dropout(p=0.6)
36 |
37 |         # rnn module
38 |         if rnn_model == 'LSTM':
39 |             self.rnn = nn.LSTM( input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers, dropout=0.5,
40 |                                 batch_first=True, bidirectional=True)
41 |         elif rnn_model == 'GRU':
42 |             self.rnn = nn.GRU( input_size=embed_size,
hidden_size=hidden_size, num_layers=num_layers, dropout=0.5, 43 | batch_first=True, bidirectional=True) 44 | else: 45 | raise LookupError(' only support LSTM and GRU') 46 | 47 | 48 | self.bn2 = nn.BatchNorm1d(hidden_size*2) 49 | self.fc = nn.Linear(hidden_size*2, num_output) 50 | 51 | def forward(self, x, seq_lengths): 52 | ''' 53 | Args: 54 | x: (batch, time_step, input_size) 55 | 56 | Returns: 57 | num_output size 58 | ''' 59 | 60 | x_embed = self.encoder(x) 61 | x_embed = self.drop_en(x_embed) 62 | packed_input = pack_padded_sequence(x_embed, seq_lengths.cpu().numpy(),batch_first=True) 63 | 64 | # r_out shape (batch, time_step, output_size) 65 | # None is for initial hidden state 66 | packed_output, ht = self.rnn(packed_input, None) 67 | out_rnn, _ = pad_packed_sequence(packed_output, batch_first=True) 68 | 69 | row_indices = torch.arange(0, x.size(0)).long() 70 | col_indices = seq_lengths - 1 71 | if next(self.parameters()).is_cuda: 72 | row_indices = row_indices.cuda() 73 | col_indices = col_indices.cuda() 74 | 75 | if self.use_last: 76 | last_tensor=out_rnn[row_indices, col_indices, :] 77 | else: 78 | # use mean 79 | last_tensor = out_rnn[row_indices, :, :] 80 | last_tensor = torch.mean(last_tensor, dim=1) 81 | 82 | fc_input = self.bn2(last_tensor) 83 | out = self.fc(fc_input) 84 | return out 85 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from util import create_tsv_file 4 | 5 | if __name__ == '__main__': 6 | 7 | subprocess.call(['bash', '-c', 'wget http://jwebpro.sourceforge.net/data-web-snippets.tar.gz']) 8 | subprocess.call(['bash', '-c', 'tar xvzf data-web-snippets.tar.gz']) 9 | 10 | subprocess.call(['bash','-c', 'wget http://nlp.stanford.edu/data/glove.6B.zip']) 11 | subprocess.call(['bash', '-c', 'unzip glove.6B.zip']) 12 | 13 | if not os.path.exists('data'): 14 | os.mkdir('data') 15 | 16 | if not os.path.exists('glove'): 17 | os.mkdir('glove') 18 | 19 | subprocess.call(['bash', '-c', 'mv glove.6B.*d.txt glove']) 20 | create_tsv_file('data-web-snippets/train.txt', 'data/train.tsv') 21 | create_tsv_file('data-web-snippets/test.txt', 'data/test.tsv') 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | scikit-learn==0.19.1 3 | torch==1.13.1 4 | torchvision==0.2.1 5 | torchwordemb==0.0.8 6 | pandas==0.22.0 -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | # import nltk 4 | 5 | LABEL_TO_INDEX = { 6 | 'business': 0, 7 | 'computers': 1, 8 | 'culture-arts-entertainment':2, 9 | 'education-science': 3, 10 | 'engineering': 4, 11 | 'health': 5, 12 | 'politics-society': 6, 13 | 'sports': 7 14 | } 15 | 16 | def create_tsv_file(path_in, path_out): 17 | 18 | with open(path_in,'r', encoding = 'utf-8') as f, open(path_out,'w',encoding = 'utf-8') as fw: 19 | writer = csv.writer(fw, delimiter='\t') 20 | writer.writerow(['label','body']) 21 | for line in f: 22 | tokens = [x.lower() for x in line.split()] 23 | label = LABEL_TO_INDEX[tokens[-1]] 24 | body = ' '.join(tokens[:-1]) 25 | writer.writerow([label, body]) 26 | 27 | 28 | def _tokenize(text): 29 | # return [x.lower() for x in nltk.word_tokenize(text)] 30 | return [ x.lower() 
for x in text.split() ] 31 | 32 | 33 | ''' from https://github.com/pytorch/examples/blob/master/imagenet/main.py''' 34 | class AverageMeter(object): 35 | 36 | """Computes and stores the average and current value""" 37 | 38 | def __init__(self): 39 | self.reset() 40 | 41 | def reset(self): 42 | self.val = 0 43 | self.avg = 0 44 | self.sum = 0 45 | self.count = 0 46 | 47 | def update(self, val, n=1): 48 | self.val = val 49 | self.sum += val * n 50 | self.count += n 51 | self.avg = self.sum / self.count 52 | 53 | 54 | def accuracy(output, target, topk=(1,)): 55 | """Computes the precision@k for the specified values of k""" 56 | maxk = max(topk) 57 | batch_size = target.size(0) 58 | 59 | _, pred = output.topk(maxk, 1, True, True) 60 | pred = pred.t() 61 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 62 | 63 | res = [] 64 | for k in topk: 65 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 66 | res.append(correct_k.mul_(100.0 / batch_size)) 67 | return res 68 | 69 | 70 | def adjust_learning_rate(lr, optimizer, epoch): 71 | """Sets the learning rate to the initial LR decayed by 10 every 8 epochs""" 72 | lr = lr * (0.1 ** (epoch // 8)) 73 | for param_group in optimizer.param_groups: 74 | param_group['lr'] = lr 75 | 76 | -------------------------------------------------------------------------------- /vocab.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import pandas as pd 4 | import torchwordemb 5 | import torch 6 | 7 | import util as ut 8 | 9 | 10 | class VocabBuilder(object): 11 | ''' 12 | Read file and create word_to_index dictionary. 13 | This can truncate low-frequency words with min_sample option. 14 | ''' 15 | def __init__(self, path_file=None): 16 | # word count 17 | self.word_count = VocabBuilder.count_from_file(path_file) 18 | self.word_to_index = {} 19 | 20 | @staticmethod 21 | def count_from_file(path_file, tokenizer=ut._tokenize): 22 | """ 23 | count word frequencies in a file. 24 | Args: 25 | path_file: 26 | 27 | Returns: 28 | dict: {word_n :count_n, ...} 29 | 30 | """ 31 | df = pd.read_csv(path_file, delimiter='\t') 32 | # tokenize 33 | df['body'] = df['body'].apply(tokenizer) 34 | # count 35 | word_count = Counter([tkn for sample in df['body'].values.tolist() for tkn in sample]) 36 | print('Original Vocab size:{}'.format(len(word_count))) 37 | return word_count 38 | 39 | def get_word_index(self, min_sample=1, padding_marker='__PADDING__', unknown_marker='__UNK__',): 40 | """ 41 | create word-to-index mapping. Padding and unknown are added to last 2 indices. 42 | 43 | Args: 44 | min_sample: for Truncation 45 | padding_marker: padding mark 46 | unknown_marker: unknown-word mark 47 | 48 | Returns: 49 | dict: {word_n: index_n, ... 
}
50 |
51 |         """
52 |         # truncate low fq word
53 |         _word_count = filter(lambda x: min_sample<=x[1], self.word_count.items())
54 |         tokens = [tkn for tkn, cnt in _word_count]  # Python 3: filter is a lazy iterator, so materialize the words
55 |
56 |         # insert padding and unknown
57 |         self.word_to_index = { tkn: i for i, tkn in enumerate([padding_marker, unknown_marker] + sorted(tokens))}
58 |         print('Truncated vocab size:{} (removed:{})'.format(len(self.word_to_index),
59 |                                                             len(self.word_count) - len(self.word_to_index)))
60 |         return self.word_to_index, None
61 |
62 |
63 | class GloveVocabBuilder(object):
64 |
65 |     def __init__(self, path_glove):
66 |         self.vec = None
67 |         self.vocab = None
68 |         self.path_glove = path_glove
69 |
70 |     def get_word_index(self, padding_marker='__PADDING__', unknown_marker='__UNK__',):
71 |         _vocab, _vec = torchwordemb.load_glove_text(self.path_glove)
72 |         vocab = {padding_marker:0, unknown_marker:1}
73 |         for tkn, indx in _vocab.items():
74 |             vocab[tkn] = indx + 2
75 |         vec_2 = torch.zeros((2, _vec.size(1)))
76 |         vec_2[1].normal_()
77 |         self.vec = torch.cat((vec_2, _vec))
78 |         self.vocab = vocab
79 |         return self.vocab, self.vec
80 |
81 |
82 |
83 |
84 |
85 |
86 | if __name__ == "__main__":
87 |
88 |     # v_builder = VocabBuilder(path_file='data/train.tsv')
89 |     # d = v_builder.get_word_index(min_sample=10)
90 |     # print (d['__UNK__'])
91 |     # for k, v in sorted(d.items())[:100]:
92 |     #     print (k, v)
93 |
94 |     v_builder = GloveVocabBuilder(path_glove='glove/glove.6B.100d.txt')  # file placed there by preprocess.py
95 |     d, vec = v_builder.get_word_index()
96 |     print (d['__UNK__'])
97 |     for k, v in sorted(d.items())[:100]:
98 |         print (k, v)
99 |     print(v)
100 |
101 |
--------------------------------------------------------------------------------