├── README.md
├── dataloader.py
├── main.py
├── model.py
├── preprocess.py
├── requirements.txt
├── util.py
└── vocab.py

/README.md:
--------------------------------------------------------------------------------
1 | # RNN-based short text classification
2 |
3 | - This is for multi-class short text classification.
4 | - The model combines a word embedding layer, an LSTM (or GRU), and a fully-connected layer, built with [PyTorch](http://pytorch.org).
5 | - Mini-batches are created by zero-padding and processed with torch.nn.utils.rnn.PackedSequence.
6 | - Cross-entropy loss + Adam optimizer.
7 | - Supports pretrained word embeddings ([GloVe](https://nlp.stanford.edu/projects/glove/)).
8 | ## Model
9 | - Embedding --> Dropout --> LSTM(GRU) --> Dropout --> FC.
10 |
11 |
12 |
13 | ## Preprocessing
14 | - The following command downloads the dataset used in
15 | [Learning to Classify Short and Sparse Text & Web with Hidden Topics from Large-scale Data Collections](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.332.6000&rep=rep1&type=pdf)
16 | from [here](http://jwebpro.sourceforge.net/data-web-snippets.tar.gz) and processes it for training.
17 | - It also downloads the GloVe embeddings.
18 | ```
19 | python preprocess.py
20 | ```
21 |
22 | ## Training
23 |
24 | - The following command starts training. Run it with ```-h``` for optional arguments.
25 |
26 | ```
27 | python main.py
28 | ```
29 |
--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import torch
3 | import pandas as pd
4 | import numpy as np
5 |
6 | import util as ut
7 |
8 | class TextClassDataLoader(object):
9 |
10 |     def __init__(self, path_file, word_to_index, batch_size=32):
11 |         """
12 |
13 |         Args:
14 |             path_file: path to a tab-separated file with 'label' and 'body' columns
15 |             word_to_index: dict mapping a token to its vocabulary index
16 |             batch_size: mini-batch size
17 |         """
18 |
19 |         self.batch_size = batch_size
20 |         self.word_to_index = word_to_index
21 |
22 |         # read file
23 |         df = pd.read_csv(path_file, delimiter='\t')
24 |         df['body'] = df['body'].apply(ut._tokenize)
25 |         df['body'] = df['body'].apply(self.generate_indexifyer())
26 |         self.samples = df.values.tolist()
27 |
28 |         # for batch
29 |         self.n_samples = len(self.samples)
30 |         self.n_batches = int(self.n_samples / self.batch_size)
31 |         self.max_length = self._get_max_length()
32 |         self._shuffle_indices()
33 |
34 |         self.report()
35 |
36 |     def _shuffle_indices(self):
37 |         self.indices = np.random.permutation(self.n_samples)
38 |         self.index = 0
39 |         self.batch_index = 0
40 |
41 |     def _get_max_length(self):
42 |         length = 0
43 |         for sample in self.samples:
44 |             length = max(length, len(sample[1]))
45 |         return length
46 |
47 |     def generate_indexifyer(self):
48 |
49 |         def indexify(lst_text):
50 |             indices = []
51 |             for word in lst_text:
52 |                 if word in self.word_to_index:
53 |                     indices.append(self.word_to_index[word])
54 |                 else:
55 |                     indices.append(self.word_to_index['__UNK__'])
56 |             return indices
57 |
58 |         return indexify
59 |
60 |     @staticmethod
61 |     def _padding(batch_x):
62 |         batch_s = sorted(batch_x, key=lambda x: len(x))
63 |         size = len(batch_s[-1])
64 |         for i, x in enumerate(batch_x):
65 |             missing = size - len(x)
66 |             batch_x[i] = batch_x[i] + [0 for _ in range(missing)]
67 |         return batch_x
68 |
69 |     def _create_batch(self):
70 |         batch = []
71 |         n = 0
72 |         while n < self.batch_size:
73 |             _index = self.indices[self.index]
74 |             batch.append(self.samples[_index])
75 |             self.index += 1
76 |             n += 1
77 |         self.batch_index += 1
78 |
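        # The remainder of this method (below) turns `batch`, a list of
        # [label, indexed-token-list] rows, into tensors: it unzips labels and
        # sequences, records each sequence length, copies every sequence into a
        # zero-initialized LongTensor (so shorter sequences are right-padded with
        # the padding index 0), and sorts the batch by length in descending order,
        # which torch.nn.utils.rnn.pack_padded_sequence expects.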
79 |         label, string = tuple(zip(*batch))
80 |
81 |         # get the length of each seq in your batch
82 |         seq_lengths = torch.LongTensor(list(map(len, string)))  # list() needed: map is lazy in Python 3
83 |
84 |         # dump padding everywhere, and place seqs on the left.
85 |         # NOTE: you only need a tensor as big as your longest sequence
86 |         seq_tensor = torch.zeros((len(string), seq_lengths.max())).long()
87 |         for idx, (seq, seqlen) in enumerate(zip(string, seq_lengths)):
88 |             seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
89 |
90 |         # SORT YOUR TENSORS BY LENGTH!
91 |         seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
92 |         seq_tensor = seq_tensor[perm_idx]
93 |         # seq_tensor = seq_tensor.transpose(0, 1)
94 |
95 |         label = torch.LongTensor(label)
96 |         label = label[perm_idx]
97 |
98 |         return seq_tensor, label, seq_lengths
99 |
100 |     def __len__(self):
101 |         return self.n_batches
102 |
103 |     def __iter__(self):
104 |         self._shuffle_indices()
105 |         for i in range(self.n_batches):
106 |             if self.batch_index == self.n_batches:
107 |                 return  # PEP 479: return ends the generator; raising StopIteration here is an error in Python 3.7+
108 |             yield self._create_batch()
109 |
110 |     def show_samples(self, n=10):
111 |         for sample in self.samples[:n]:
112 |             print(sample)
113 |
114 |     def report(self):
115 |         print('# samples: {}'.format(len(self.samples)))
116 |         print('max len: {}'.format(self.max_length))
117 |         print('# vocab: {}'.format(len(self.word_to_index)))
118 |         print('# batches: {} (batch_size = {})'.format(self.n_batches, self.batch_size))
119 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import time
4 | import gc
5 | import os
6 | import argparse
7 |
8 | import numpy as np
9 | from sklearn.externals import joblib
10 | import torch
11 | from torch import nn
12 | import torch.backends.cudnn as cudnn
13 |
14 | from vocab import VocabBuilder, GloveVocabBuilder
15 | from dataloader import TextClassDataLoader
16 | from model import RNN
17 | from util import AverageMeter, accuracy
18 | from util import adjust_learning_rate
19 |
20 | np.random.seed(0)
21 | torch.manual_seed(0)
22 |
23 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
24 | parser.add_argument('--epochs', default=50, type=int, metavar='N', help='number of total epochs to run')
25 | parser.add_argument('-b', '--batch-size', default=128, type=int, metavar='N', help='mini-batch size')
26 | parser.add_argument('--lr', '--learning-rate', default=0.005, type=float, metavar='LR', help='initial learning rate')
27 | parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float, metavar='W', help='weight decay')
28 | parser.add_argument('--print-freq', '-p', default=10, type=int, metavar='N', help='print frequency')
29 | parser.add_argument('--save-freq', '-sf', default=10, type=int, metavar='N', help='model save frequency (epochs)')
30 | parser.add_argument('--embedding-size', default=50, type=int, metavar='N', help='embedding size')
31 | parser.add_argument('--hidden-size', default=128, type=int, metavar='N', help='rnn hidden size')
32 | parser.add_argument('--layers', default=2, type=int, metavar='N', help='number of rnn layers')
33 | parser.add_argument('--classes', default=8, type=int, metavar='N', help='number of output classes')
34 | parser.add_argument('--min-samples', default=5, type=int, metavar='N', help='min occurrences for a word to be kept in the vocab')
35 | parser.add_argument('--cuda', default=False, action='store_true', help='use cuda')
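# Illustrative invocations (values are hypothetical; every flag used here is defined
# in this argument block):
#   python main.py --epochs 30 --batch-size 64 --rnn GRU --cuda
#   python main.py --glove glove/glove.6B.50d.txt --mean_seq --lr 0.001
# The second example assumes preprocess.py has already unpacked the GloVe files into glove/.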
36 | parser.add_argument('--glove', default='glove/glove.6B.100d.txt', help='path to glove txt')
37 | parser.add_argument('--rnn', default='LSTM', choices=['LSTM', 'GRU'], help='rnn module type')
38 | parser.add_argument('--mean_seq', default=False, action='store_true', help='use mean of rnn output')
39 | parser.add_argument('--clip', type=float, default=0.25, help='gradient clipping')
40 | args = parser.parse_args()
41 |
42 |
43 | # create vocab
44 | print("===> creating vocabs ...")
45 | end = time.time()
46 | v_builder, d_word_index, embed = None, None, None
47 | if os.path.exists(args.glove):
48 |     v_builder = GloveVocabBuilder(path_glove=args.glove)
49 |     d_word_index, embed = v_builder.get_word_index()
50 |     args.embedding_size = embed.size(1)
51 | else:
52 |     v_builder = VocabBuilder(path_file='data/train.tsv')
53 |     d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
54 |
55 | if not os.path.exists('gen'):
56 |     os.mkdir('gen')
57 | joblib.dump(d_word_index, 'gen/d_word_index.pkl', compress=3)
58 | print('===> vocab creation: {t:.3f}'.format(t=time.time()-end))
59 |
60 | print('args: ', args)
61 |
62 | # create trainer
63 | print("===> creating dataloaders ...")
64 | end = time.time()
65 | train_loader = TextClassDataLoader('data/train.tsv', d_word_index, batch_size=args.batch_size)
66 | val_loader = TextClassDataLoader('data/test.tsv', d_word_index, batch_size=args.batch_size)
67 | print('===> dataloader creation: {t:.3f}'.format(t=time.time()-end))
68 |
69 |
70 | # create model
71 | print("===> creating rnn model ...")
72 | vocab_size = len(d_word_index)
73 | model = RNN(vocab_size=vocab_size, embed_size=args.embedding_size, num_output=args.classes, rnn_model=args.rnn,
74 |             use_last=(not args.mean_seq),
75 |             hidden_size=args.hidden_size, embedding_tensor=embed, num_layers=args.layers, batch_first=True)
76 | print(model)
77 |
78 | # optimizer and loss
79 | optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.weight_decay)
80 |
81 | criterion = nn.CrossEntropyLoss()
82 | print(optimizer)
83 | print(criterion)
84 |
85 | if args.cuda:
86 |     torch.backends.cudnn.enabled = True
87 |     cudnn.benchmark = True
88 |     model.cuda()
89 |     criterion = criterion.cuda()
90 |
91 |
92 | def train(train_loader, model, criterion, optimizer, epoch):
93 |     batch_time = AverageMeter()
94 |     data_time = AverageMeter()
95 |     losses = AverageMeter()
96 |     top1 = AverageMeter()
97 |
98 |     # switch to train mode
99 |     model.train()
100 |
101 |     end = time.time()
102 |     for i, (input, target, seq_lengths) in enumerate(train_loader):
103 |         # measure data loading time
104 |         data_time.update(time.time() - end)
105 |
106 |         if args.cuda:
107 |             input = input.cuda(non_blocking=True)
108 |             target = target.cuda(non_blocking=True)
109 |
110 |         # compute output
111 |         output = model(input, seq_lengths)
112 |         loss = criterion(output, target)
113 |
114 |         # measure accuracy and record loss
115 |         prec1 = accuracy(output.data, target, topk=(1,))
116 |         losses.update(loss.item(), input.size(0))
117 |         top1.update(prec1[0][0], input.size(0))
118 |
119 |         # compute gradient and do SGD step
120 |         optimizer.zero_grad()
121 |         loss.backward()
122 |
123 |         torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
124 |         optimizer.step()
125 |
126 |         # measure elapsed time
127 |         batch_time.update(time.time() - end)
128 |         end = time.time()
129 |
130 |         if i != 0 and i % args.print_freq == 0:
131 |             print('Epoch: [{0}][{1}/{2}] Time {batch_time.val:.3f} ({batch_time.avg:.3f}) '
132 |                   'Data {data_time.val:.3f} ({data_time.avg:.3f}) Loss {loss.val:.4f} ({loss.avg:.4f}) '
133 |                   'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
134 |                       epoch, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1))
135 |             gc.collect()
136 |
137 |
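# A minimal single-sentence inference sketch (not part of the original script; the
# helper below is illustrative only). It mirrors the preprocessing in
# TextClassDataLoader: tokenize, map tokens to indices via d_word_index (falling back
# to '__UNK__'), and hand the model the sequence length for packing. It assumes a
# CPU model; move the tensors with .cuda() when args.cuda is set.
def predict_sentence(model, d_word_index, text):
    import util as ut  # repo-local tokenizer used by the dataloader
    tokens = ut._tokenize(text)
    indices = [d_word_index.get(t, d_word_index['__UNK__']) for t in tokens]
    seq_tensor = torch.LongTensor(indices).unsqueeze(0)  # shape (1, seq_len)
    seq_lengths = torch.LongTensor([len(indices)])       # shape (1,)
    model.eval()                                          # BatchNorm needs eval mode for batch size 1
    with torch.no_grad():
        logits = model(seq_tensor, seq_lengths)
    return logits.argmax(dim=1).item()                    # predicted class index (see util.LABEL_TO_INDEX)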
138 | def test(val_loader, model, criterion):
139 |     batch_time = AverageMeter()
140 |     losses = AverageMeter()
141 |     top1 = AverageMeter()
142 |
143 |     # switch to evaluate mode
144 |     model.eval()
145 |     end = time.time()
146 |     for i, (input, target, seq_lengths) in enumerate(val_loader):
147 |
148 |         if args.cuda:
149 |             input = input.cuda(non_blocking=True)
150 |             target = target.cuda(non_blocking=True)
151 |
152 |         # compute output
153 |         output = model(input, seq_lengths)
154 |         loss = criterion(output, target)
155 |
156 |         # measure accuracy and record loss
157 |         prec1 = accuracy(output.data, target, topk=(1,))
158 |         losses.update(loss.item(), input.size(0))
159 |         top1.update(prec1[0][0], input.size(0))
160 |
161 |         # measure elapsed time
162 |         batch_time.update(time.time() - end)
163 |         end = time.time()
164 |
165 |         if i != 0 and i % args.print_freq == 0:
166 |             print('Test: [{0}/{1}] Time {batch_time.val:.3f} ({batch_time.avg:.3f}) '
167 |                   'Loss {loss.val:.4f} ({loss.avg:.4f}) Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
168 |                       i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1))
169 |             gc.collect()
170 |
171 |     print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))
172 |     return top1.avg
173 |
174 |
175 | # training and testing
176 | for epoch in range(1, args.epochs+1):
177 |
178 |     adjust_learning_rate(args.lr, optimizer, epoch)
179 |     train(train_loader, model, criterion, optimizer, epoch)
180 |     test(val_loader, model, criterion)
181 |
182 |     # save current model
183 |     if epoch % args.save_freq == 0:
184 |         name_model = 'rnn_{}.pkl'.format(epoch)
185 |         path_save_model = os.path.join('gen', name_model)
186 |         joblib.dump(model.float(), path_save_model, compress=2)
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
4 |
5 |
6 | class RNN(nn.Module):
7 |
8 |     def __init__(self, vocab_size, embed_size, num_output, rnn_model='LSTM', use_last=True, embedding_tensor=None,
9 |                  padding_index=0, hidden_size=64, num_layers=1, batch_first=True):
10 |         """
11 |
12 |         Args:
13 |             vocab_size: vocab size
14 |             embed_size: embedding size
15 |             num_output: number of output (classes)
16 |             rnn_model: LSTM or GRU
17 |             use_last: if True, classify from the last rnn output; otherwise use the mean over time steps
18 |             embedding_tensor: optional pretrained embedding weights (e.g. GloVe); frozen when given
19 |             padding_index: vocab index of the padding token
20 |             hidden_size: hidden size of rnn module
21 |             num_layers: number of layers in rnn module
22 |             batch_first: batch first option
23 |         """
24 |
25 |         super(RNN, self).__init__()
26 |         self.use_last = use_last
27 |         # embedding
28 |         self.encoder = None
29 |         if torch.is_tensor(embedding_tensor):
30 |             self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=padding_index, _weight=embedding_tensor)
31 |             self.encoder.weight.requires_grad = False
32 |         else:
33 |             self.encoder = nn.Embedding(vocab_size, embed_size, padding_idx=padding_index)
34 |
35 |         self.drop_en = nn.Dropout(p=0.6)
36 |
37 |         # rnn module
38 |         if rnn_model == 'LSTM':
39 |             self.rnn = nn.LSTM( input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers, dropout=0.5,
40 |                                 batch_first=True, bidirectional=True)
41 |         elif rnn_model == 'GRU':
42 |             self.rnn = nn.GRU( input_size=embed_size,
hidden_size=hidden_size, num_layers=num_layers, dropout=0.5, 43 | batch_first=True, bidirectional=True) 44 | else: 45 | raise LookupError(' only support LSTM and GRU') 46 | 47 | 48 | self.bn2 = nn.BatchNorm1d(hidden_size*2) 49 | self.fc = nn.Linear(hidden_size*2, num_output) 50 | 51 | def forward(self, x, seq_lengths): 52 | ''' 53 | Args: 54 | x: (batch, time_step, input_size) 55 | 56 | Returns: 57 | num_output size 58 | ''' 59 | 60 | x_embed = self.encoder(x) 61 | x_embed = self.drop_en(x_embed) 62 | packed_input = pack_padded_sequence(x_embed, seq_lengths.cpu().numpy(),batch_first=True) 63 | 64 | # r_out shape (batch, time_step, output_size) 65 | # None is for initial hidden state 66 | packed_output, ht = self.rnn(packed_input, None) 67 | out_rnn, _ = pad_packed_sequence(packed_output, batch_first=True) 68 | 69 | row_indices = torch.arange(0, x.size(0)).long() 70 | col_indices = seq_lengths - 1 71 | if next(self.parameters()).is_cuda: 72 | row_indices = row_indices.cuda() 73 | col_indices = col_indices.cuda() 74 | 75 | if self.use_last: 76 | last_tensor=out_rnn[row_indices, col_indices, :] 77 | else: 78 | # use mean 79 | last_tensor = out_rnn[row_indices, :, :] 80 | last_tensor = torch.mean(last_tensor, dim=1) 81 | 82 | fc_input = self.bn2(last_tensor) 83 | out = self.fc(fc_input) 84 | return out 85 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from util import create_tsv_file 4 | 5 | if __name__ == '__main__': 6 | 7 | subprocess.call(['bash', '-c', 'wget http://jwebpro.sourceforge.net/data-web-snippets.tar.gz']) 8 | subprocess.call(['bash', '-c', 'tar xvzf data-web-snippets.tar.gz']) 9 | 10 | subprocess.call(['bash','-c', 'wget http://nlp.stanford.edu/data/glove.6B.zip']) 11 | subprocess.call(['bash', '-c', 'unzip glove.6B.zip']) 12 | 13 | if not os.path.exists('data'): 14 | os.mkdir('data') 15 | 16 | if not os.path.exists('glove'): 17 | os.mkdir('glove') 18 | 19 | subprocess.call(['bash', '-c', 'mv glove.6B.*d.txt glove']) 20 | create_tsv_file('data-web-snippets/train.txt', 'data/train.tsv') 21 | create_tsv_file('data-web-snippets/test.txt', 'data/test.tsv') 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | scikit-learn==0.19.1 3 | torch==1.13.1 4 | torchvision==0.2.1 5 | torchwordemb==0.0.8 6 | pandas==0.22.0 -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | # import nltk 4 | 5 | LABEL_TO_INDEX = { 6 | 'business': 0, 7 | 'computers': 1, 8 | 'culture-arts-entertainment':2, 9 | 'education-science': 3, 10 | 'engineering': 4, 11 | 'health': 5, 12 | 'politics-society': 6, 13 | 'sports': 7 14 | } 15 | 16 | def create_tsv_file(path_in, path_out): 17 | 18 | with open(path_in,'r', encoding = 'utf-8') as f, open(path_out,'w',encoding = 'utf-8') as fw: 19 | writer = csv.writer(fw, delimiter='\t') 20 | writer.writerow(['label','body']) 21 | for line in f: 22 | tokens = [x.lower() for x in line.split()] 23 | label = LABEL_TO_INDEX[tokens[-1]] 24 | body = ' '.join(tokens[:-1]) 25 | writer.writerow([label, body]) 26 | 27 | 28 | def _tokenize(text): 29 | # return [x.lower() for x in nltk.word_tokenize(text)] 30 | return [ x.lower() 
for x in text.split() ] 31 | 32 | 33 | ''' from https://github.com/pytorch/examples/blob/master/imagenet/main.py''' 34 | class AverageMeter(object): 35 | 36 | """Computes and stores the average and current value""" 37 | 38 | def __init__(self): 39 | self.reset() 40 | 41 | def reset(self): 42 | self.val = 0 43 | self.avg = 0 44 | self.sum = 0 45 | self.count = 0 46 | 47 | def update(self, val, n=1): 48 | self.val = val 49 | self.sum += val * n 50 | self.count += n 51 | self.avg = self.sum / self.count 52 | 53 | 54 | def accuracy(output, target, topk=(1,)): 55 | """Computes the precision@k for the specified values of k""" 56 | maxk = max(topk) 57 | batch_size = target.size(0) 58 | 59 | _, pred = output.topk(maxk, 1, True, True) 60 | pred = pred.t() 61 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 62 | 63 | res = [] 64 | for k in topk: 65 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 66 | res.append(correct_k.mul_(100.0 / batch_size)) 67 | return res 68 | 69 | 70 | def adjust_learning_rate(lr, optimizer, epoch): 71 | """Sets the learning rate to the initial LR decayed by 10 every 8 epochs""" 72 | lr = lr * (0.1 ** (epoch // 8)) 73 | for param_group in optimizer.param_groups: 74 | param_group['lr'] = lr 75 | 76 | -------------------------------------------------------------------------------- /vocab.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import pandas as pd 4 | import torchwordemb 5 | import torch 6 | 7 | import util as ut 8 | 9 | 10 | class VocabBuilder(object): 11 | ''' 12 | Read file and create word_to_index dictionary. 13 | This can truncate low-frequency words with min_sample option. 14 | ''' 15 | def __init__(self, path_file=None): 16 | # word count 17 | self.word_count = VocabBuilder.count_from_file(path_file) 18 | self.word_to_index = {} 19 | 20 | @staticmethod 21 | def count_from_file(path_file, tokenizer=ut._tokenize): 22 | """ 23 | count word frequencies in a file. 24 | Args: 25 | path_file: 26 | 27 | Returns: 28 | dict: {word_n :count_n, ...} 29 | 30 | """ 31 | df = pd.read_csv(path_file, delimiter='\t') 32 | # tokenize 33 | df['body'] = df['body'].apply(tokenizer) 34 | # count 35 | word_count = Counter([tkn for sample in df['body'].values.tolist() for tkn in sample]) 36 | print('Original Vocab size:{}'.format(len(word_count))) 37 | return word_count 38 | 39 | def get_word_index(self, min_sample=1, padding_marker='__PADDING__', unknown_marker='__UNK__',): 40 | """ 41 | create word-to-index mapping. Padding and unknown are added to last 2 indices. 42 | 43 | Args: 44 | min_sample: for Truncation 45 | padding_marker: padding mark 46 | unknown_marker: unknown-word mark 47 | 48 | Returns: 49 | dict: {word_n: index_n, ... 
}
50 |
51 |         """
52 |         # truncate low fq word
53 |         _word_count = filter(lambda x: min_sample<=x[1], self.word_count.items())
54 |         tokens = [tkn for tkn, cnt in _word_count]  # Python 3: filter is a lazy iterator, so materialize the words
55 |
56 |         # insert padding and unknown
57 |         self.word_to_index = { tkn: i for i, tkn in enumerate([padding_marker, unknown_marker] + sorted(tokens))}
58 |         print('Truncated vocab size:{} (removed:{})'.format(len(self.word_to_index),
59 |                                                             len(self.word_count) - len(self.word_to_index)))
60 |         return self.word_to_index, None
61 |
62 |
63 | class GloveVocabBuilder(object):
64 |
65 |     def __init__(self, path_glove):
66 |         self.vec = None
67 |         self.vocab = None
68 |         self.path_glove = path_glove
69 |
70 |     def get_word_index(self, padding_marker='__PADDING__', unknown_marker='__UNK__',):
71 |         _vocab, _vec = torchwordemb.load_glove_text(self.path_glove)
72 |         vocab = {padding_marker:0, unknown_marker:1}
73 |         for tkn, indx in _vocab.items():
74 |             vocab[tkn] = indx + 2
75 |         vec_2 = torch.zeros((2, _vec.size(1)))
76 |         vec_2[1].normal_()
77 |         self.vec = torch.cat((vec_2, _vec))
78 |         self.vocab = vocab
79 |         return self.vocab, self.vec
80 |
81 |
82 |
83 |
84 |
85 |
86 | if __name__ == "__main__":
87 |
88 |     # v_builder = VocabBuilder(path_file='data/train.tsv')
89 |     # d = v_builder.get_word_index(min_sample=10)
90 |     # print (d['__UNK__'])
91 |     # for k, v in sorted(d.items())[:100]:
92 |     #     print (k, v)
93 |
94 |     v_builder = GloveVocabBuilder(path_glove='glove/glove.6B.100d.txt')  # file placed there by preprocess.py
95 |     d, vec = v_builder.get_word_index()
96 |     print (d['__UNK__'])
97 |     for k, v in sorted(d.items())[:100]:
98 |         print (k, v)
99 |     print(v)
100 |
101 |
--------------------------------------------------------------------------------