├── .gitignore ├── README.md ├── gradient_reversal ├── README.md └── gradient_reversal.py ├── jmt ├── README.md ├── code │ ├── data.py │ ├── model.py │ ├── train.py │ └── utils.py ├── dataset │ └── pos │ │ ├── pos_wsj.sample.dev │ │ └── pos_wsj.sample.train └── download_embeddings.sh ├── nmt ├── README.md ├── code │ ├── bleu.sh │ ├── data.py │ ├── model.py │ ├── setup.sh │ ├── train.py │ └── utils.py └── code_0.4 │ ├── bleu.sh │ ├── data.py │ ├── data │ ├── sample.en.voc │ └── sample.ja.voc │ ├── model.py │ ├── setup.sh │ ├── train.py │ └── utils.py └── text_classifier ├── README.md ├── data.py ├── dataset ├── stanford_sentiment_sample.dev └── stanford_sentiment_sample.train ├── model.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # emacs alias 104 | *~ 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pytorch-playground 2 | My PyTorch playground for NLP 3 | 4 | ## Setup commands I used 5 | * Installing Anaconda (Python3.6 will also be installed)
6 | `wget https://repo.continuum.io/archive/Anaconda3-4.2.0-Linux-x86_64.sh`
7 | `bash Anaconda3-4.2.0-Linux-x86_64.sh # prefix: $ANACONDA_PATH` 8 | 9 | * Installing PyTorch (0.2.0) in an Anaconda env
10 | `conda create --name pytorch_test`
11 | `source $ANACONDA_PATH/envs/pytorch_test/bin/activate pytorch_test`
12 | `conda install pytorch torchvision cuda80 -c soumith` 13 | 14 | ## Models 15 | * A Joint Many-Task Model (./jmt)
16 | An implementation of the JMT model proposed in our EMNLP 2017 paper 17 | 18 | * Text Classifier (./text_classifier)
19 | Classifying input text (words, phrases, sentences, or documents) using LSTM 20 | 21 | ## Questions or comments? 22 | Feel free to e-mail me (hassy@logos.t.u-tokyo.ac.jp). 23 | -------------------------------------------------------------------------------- /gradient_reversal/README.md: -------------------------------------------------------------------------------- 1 | # Gradient Reversal Layer 2 | An implementation of the gradient reversal layer [1] 3 | 4 | ## Reference ## 5 | [1] Yaroslav Ganin and Victor Lempitsky. 2014. Unsupervised Domain Adaptation by Backpropagation. arXiv stat.ML 1409.7495. 6 | -------------------------------------------------------------------------------- /gradient_reversal/gradient_reversal.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class GradientReversal(torch.autograd.Function): 4 | 5 | def __init__(self, scale_): 6 | super(GradientReversal, self).__init__() 7 | 8 | self.scale = scale_ 9 | 10 | def forward(self, inp): 11 | return inp.clone() # the forward pass is the identity 12 | 13 | def backward(self, grad_out): 14 | return -self.scale * grad_out.clone() # the backward pass flips (and scales) the gradient 15 | -------------------------------------------------------------------------------- /jmt/README.md: -------------------------------------------------------------------------------- 1 | # A Joint Many-Task Model (JMT) 2 | An implementation of the JMT model proposed in our EMNLP 2017 paper [1] 3 | 4 | ## Usage 5 | First download the pre-trained word and character n-gram embeddings used in our paper:<br>
6 | `./download_embeddings.sh`
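This script creates an `embedding` directory and unpacks the pre-trained vectors there as `word.txt` and `charNgram.txt`, which `code/train.py` reads from `../embedding/`, so run the training command below from inside `code/`. Each line of these files is a token followed by its vector values, separated by spaces; `utils.loadEmbeddings` skips any row whose dimensionality does not match the embedding size, so `--embedDim`/`--charDim` must agree with the files. An entry has this shape (the values here are made up for illustration):<br>
`language 0.013 -0.204 0.117 ... 0.089`<br>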
7 | 8 | Then we can run experiments:
9 | `python train.py`
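All hyperparameters are exposed as `argparse` options in `code/train.py`; for example, the following spells out the defaults:<br>
`python train.py --embedDim 100 --charDim 100 --hiddenDim 100 --batchSize 32 --lr 1.0 --gpuId 0`<br>
Two other switches are worth knowing: `--random` trains with randomly initialized embeddings instead of the pre-trained ones, and `--test` reloads the saved `tagger-<gpuId>` and `embedding-<gpuId>` parameter files and only evaluates on the dev set.<br>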
10 | 11 | ## Notes 12 | * Currently, only the single-task tagging model is implemented; eventually, all five task models will be available here.<br>
13 | 14 | * We can replicate almost the same POS tagging results as reported in our paper. We should also be able to replicate the chunking results, but the F1 evaluation metric has not yet been implemented.<br>
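* The expected dataset format (as parsed in `code/data.py`) is one sentence per line, with the token sequence and the label sequence separated by a tab: `w1 w2 ... wn\tl1 l2 ... ln`. A made-up POS tagging line would look like `The cat sat\tDT NN VBD`.<br>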
15 | 16 | ## Reference ## 17 | [1] Kazuma Hashimoto, Caiming Xiong, Yoshimasa Tsuruoka, and Richard Socher. 2017. A Joint Many-Task Model: Growing a Neural Network for Multiple NLP Tasks. In Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing (EMNLP 2017), arXiv cs.CL 1611.01587. 18 | 19 | @InProceedings{hashimoto-jmt:2017:EMNLP2017, 20 | author = {Hashimoto, Kazuma and Xiong, Caiming and Tsuruoka, Yoshimasa and Socher, Richard}, 21 | title = {{A Joint Many-Task Model: Growing a Neural Network for Multiple NLP Tasks}}, 22 | booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing (EMNLP)}, 23 | month = {September}, 24 | year = {2017}, 25 | address = {Copenhagen, Denmark}, 26 | publisher = {Association for Computational Linguistics}, 27 | pages = {446--456}, 28 | url = {http://arxiv.org/abs/1611.01587} 29 | } -------------------------------------------------------------------------------- /jmt/code/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from torch.autograd import Variable 5 | 6 | import utils 7 | 8 | class Token: 9 | def __init__(self, str_ = '', count_ = 0): 10 | self.str = str_ 11 | self.count = count_ 12 | 13 | class Vocabulary: 14 | def __init__(self): 15 | self.UNK = '<UNK>' # unknown words 16 | self.PAD = '<PAD>' # padding 17 | self.unkIndex = -1 18 | self.padIndex = -1 19 | self.tokenIndex = {} 20 | self.tokenList = [] 21 | 22 | def getTokenIndex(self, str): 23 | if str in self.tokenIndex: 24 | return self.tokenIndex[str] 25 | else: 26 | return self.tokenIndex[self.UNK] 27 | 28 | def add(self, str, count): 29 | if str not in self.tokenIndex: 30 | self.tokenList.append(Token(str, count)) 31 | self.tokenIndex[str] = len(self.tokenList)-1 32 | 33 | def size(self): 34 | return len(self.tokenList) 35 | 36 | class Data: 37 | def __init__(self, tokenIndices_, charNgramIndices_, labelIndices_): 38 | self.tokenIndices = tokenIndices_ 39 | self.charNgramIndices = charNgramIndices_ 40 | self.labelIndices = labelIndices_ 41 | 42 | class Corpus: 43 | def __init__(self, trainFile, devFile, wordDropoutCoeff): 44 | self.voc = Vocabulary() 45 | self.classVoc = Vocabulary() 46 | self.charVoc = Vocabulary() 47 | 48 | self.buildVoc(trainFile, wordDropoutCoeff) 49 | self.trainData = self.buildDataset(trainFile) 50 | self.devData = self.buildDataset(devFile) 51 | 52 | def buildVoc(self, fileName, wordDropoutCoeff): 53 | assert os.path.exists(fileName) 54 | 55 | with open(fileName, 'r') as f: 56 | tokenCount = {} 57 | charNgramCount = {} 58 | labelCount = {} 59 | 60 | for line in f: 61 | fields = line.split('\t') 62 | tokens = fields[0].split() # w1 w2 ... wn \t l1 l2 ... 
ln \n 63 | labels = fields[1].split() 64 | assert len(tokens) == len(labels) 65 | 66 | for t in tokens: 67 | for c in utils.getCharNgram(t): 68 | if c in charNgramCount: 69 | charNgramCount[c] += 1 70 | else: 71 | charNgramCount[c] = 1 72 | 73 | t = t.lower() 74 | if t in tokenCount: 75 | tokenCount[t] += 1 76 | else: 77 | tokenCount[t] = 1 78 | 79 | for l in labels: 80 | if l in labelCount: 81 | labelCount[l] += 1 82 | else: 83 | labelCount[l] = 1 84 | 85 | tokenList = sorted(tokenCount.items(), key = lambda x: -x[1]) # sort by value 86 | charNgramList = sorted(charNgramCount.items(), key = lambda x: -x[1]) # sort by value 87 | labelList = sorted(labelCount.items(), key = lambda x: -x[1]) # sort by value 88 | 89 | for t in tokenList: 90 | self.voc.add(t[0], t[1]) 91 | for c in charNgramList: 92 | self.charVoc.add(c[0], c[1]) 93 | for l in labelList: 94 | self.classVoc.add(l[0], l[1]) 95 | 96 | ''' 97 | Add special tokens 98 | ''' 99 | self.voc.add(self.voc.UNK, 0) 100 | self.voc.add(self.voc.PAD, 0) 101 | self.voc.unkIndex = self.voc.getTokenIndex(self.voc.UNK) 102 | self.voc.padIndex = self.voc.getTokenIndex(self.voc.PAD) 103 | self.charVoc.add(self.charVoc.UNK, 0) 104 | self.charVoc.add(self.charVoc.PAD, 0) # use this for padding 105 | self.charVoc.unkIndex = self.charVoc.getTokenIndex(self.charVoc.UNK) 106 | self.charVoc.padIndex = self.charVoc.getTokenIndex(self.charVoc.PAD) 107 | 108 | ''' 109 | Prob for UNK word-dropout 110 | ''' 111 | alpha = wordDropoutCoeff 112 | for t in self.voc.tokenList: 113 | t.count = alpha/(t.count + alpha) 114 | 115 | def buildDataset(self, fileName): 116 | assert os.path.exists(fileName) 117 | 118 | with open(fileName, 'r') as f: 119 | dataset = [] 120 | 121 | for line in f: 122 | fields = line.split('\t') 123 | tokens = fields[0].split() # w1 w2 ... wn \t l1 l2 ... ln \n 124 | labels = fields[1].split() # w1 w2 ... wn \t l1 l2 ... 
ln \n 125 | assert len(tokens) == len(labels) 126 | tokenIndices = [] 127 | charNgramIndices = [] 128 | labelIndices = [] 129 | 130 | for i in range(len(tokens)): 131 | charNgramIndices.append([]) 132 | for c in utils.getCharNgram(tokens[i]): 133 | ci = self.charVoc.getTokenIndex(c) 134 | if ci != self.charVoc.unkIndex: 135 | charNgramIndices[i].append(ci) 136 | if len(charNgramIndices[i]) == 0: 137 | charNgramIndices[i].append(self.charVoc.unkIndex) 138 | 139 | tokenIndices.append(self.voc.getTokenIndex(tokens[i].lower())) 140 | labelIndices.append(self.classVoc.getTokenIndex(labels[i])) 141 | 142 | dataset.append(Data(tokenIndices, charNgramIndices, labelIndices)) 143 | 144 | return dataset 145 | 146 | ''' 147 | input: w1, w2, ..., wn 148 | target: l1, l2, ..., ln 149 | ''' 150 | def processBatchInfo(self, batch, train, hiddenDim, useGpu): 151 | begin = batch[0] 152 | end = batch[1] 153 | batchSize = end-begin+1 154 | if train: 155 | data = sorted(self.trainData[begin:end+1], key = lambda x: -len(x.tokenIndices)) 156 | else: 157 | data = sorted(self.devData[begin:end+1], key = lambda x: -len(x.tokenIndices)) 158 | maxLen = len(data[0].tokenIndices) 159 | batchInput = torch.LongTensor(batchSize, maxLen).fill_(self.voc.padIndex) 160 | batchTarget = torch.LongTensor(batchSize*maxLen).fill_(-1) 161 | lengths = [] 162 | targetIndex = 0 163 | tokenCount = 0 164 | 165 | for i in range(batchSize): 166 | l = len(data[i].tokenIndices) 167 | lengths.append(l) 168 | tokenCount += l 169 | 170 | for j in range(l): 171 | batchInput[i][j] = data[i].tokenIndices[j] 172 | 173 | for j in range(maxLen): 174 | if j < l: 175 | batchTarget[targetIndex] = data[i].labelIndices[j] 176 | targetIndex += 1 177 | 178 | ''' 179 | UNK word-dropout 180 | ''' 181 | if train: 182 | rnd = torch.FloatTensor(l).uniform_(0.0, 1.0) 183 | for j in range(l): 184 | if rnd[j] < self.voc.tokenList[batchInput[i][j]].count: 185 | batchInput[i][j] = self.voc.unkIndex 186 | assert(targetIndex == batchSize*maxLen) 187 | 188 | batchInput = Variable(batchInput, requires_grad = False) 189 | batchTarget = Variable(batchTarget, requires_grad = False) 190 | 191 | ''' 192 | Char n-gram 193 | ''' 194 | batchCharInput = [] 195 | batchCharOffset = [] 196 | offsetPos = 0 197 | for i in range(batchSize): 198 | for j in range(maxLen): 199 | batchCharOffset.append(offsetPos) 200 | if j < lengths[i]: 201 | index = data[i].tokenIndices[j] 202 | offsetPos += len(data[i].charNgramIndices[j]) 203 | batchCharInput += data[i].charNgramIndices[j] 204 | else: 205 | offsetPos += 1 206 | batchCharInput.append(self.charVoc.padIndex) 207 | 208 | batchCharInput = Variable(torch.LongTensor(batchCharInput), requires_grad = False) 209 | batchCharOffset = Variable(torch.LongTensor(batchCharOffset), requires_grad = False) 210 | 211 | shape = 2, batchSize, hiddenDim 212 | h0 = c0 = Variable(torch.zeros(*shape), requires_grad = False) 213 | 214 | if useGpu: 215 | batchInput = batchInput.cuda() 216 | batchCharInput = batchCharInput.cuda() 217 | batchCharOffset = batchCharOffset.cuda() 218 | batchTarget = batchTarget.cuda() 219 | h0 = h0.cuda() 220 | c0 = c0.cuda() 221 | 222 | return batchInput, (batchCharInput, batchCharOffset), batchTarget, lengths, (h0, c0), tokenCount 223 | -------------------------------------------------------------------------------- /jmt/code/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | 7 | import 
utils 8 | 9 | ''' 10 | Word and character n-gram embeddings 11 | ''' 12 | class Embedding(nn.Module): 13 | 14 | def __init__(self, wordVocSize, charVocSize, wordDim, charDim): 15 | super(Embedding, self).__init__() 16 | 17 | self.wordEmbedding = nn.Embedding(wordVocSize, wordDim) 18 | self.charEmbedding = nn.EmbeddingBag(charVocSize, charDim) 19 | 20 | self.wordDim = wordDim 21 | self.charDim = charDim 22 | 23 | self.initWeights() 24 | 25 | def initWeights(self): 26 | self.wordEmbedding.weight.data.uniform_(-1.0/self.wordDim, 1.0/self.wordDim) 27 | self.charEmbedding.weight.data.uniform_(-1.0/self.charDim, 1.0/self.charDim) 28 | 29 | ''' 30 | Get mini-batched embeddings 31 | ''' 32 | def getBatchedEmbedding(self, batchInput, batchChar): 33 | wordInput = self.wordEmbedding(batchInput) 34 | 35 | charInput = self.charEmbedding(batchChar[0], batchChar[1]) 36 | charInput = charInput.view(wordInput.size(0), wordInput.size(1), charInput.size(1)) 37 | 38 | return torch.cat((wordInput, charInput), dim = 2) 39 | 40 | 41 | ''' 42 | Sequential tagging model 43 | - Input: w1 w2 ... wn 44 | - Output: l1 l2 ... ln 45 | ''' 46 | class Tagger(nn.Module): 47 | 48 | ''' 49 | Initialize the tagging model 50 | ''' 51 | def __init__(self, inputDim, hiddenDim, classNum, inputDropoutRate, outputDropoutRate): 52 | super(Tagger, self).__init__() 53 | 54 | self.encoder = nn.LSTM(input_size = inputDim, 55 | hidden_size = hiddenDim, 56 | num_layers = 1, 57 | dropout = 0.0, 58 | bidirectional = True) 59 | 60 | self.inputDropout = nn.Dropout(p = inputDropoutRate) 61 | self.outputDropout = nn.Dropout(p = outputDropoutRate) 62 | 63 | classifierDim = 2*hiddenDim 64 | self.hiddenLayer = nn.Linear(classifierDim, classifierDim) 65 | self.hiddenAct = nn.ReLU() 66 | 67 | self.softmaxLayer = nn.Linear(classifierDim, classNum) 68 | 69 | self.inputDim = inputDim 70 | self.hiddenDim = hiddenDim 71 | self.classifierDim = classifierDim 72 | 73 | self.initWeights() 74 | 75 | ''' 76 | Initialize the model parameters 77 | ''' 78 | def initWeights(self): 79 | initScale = math.sqrt(6.0)/math.sqrt(self.hiddenDim+(self.inputDim+self.hiddenDim)) 80 | initScale2 = math.sqrt(6.0)/math.sqrt(self.classifierDim+(self.classifierDim)) 81 | 82 | self.encoder.weight_ih_l0.data.uniform_(-initScale, initScale) 83 | self.encoder.weight_hh_l0.data.uniform_(-initScale, initScale) 84 | self.encoder.bias_ih_l0.data.zero_() 85 | self.encoder.bias_hh_l0.data.zero_() 86 | self.encoder.bias_hh_l0.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 87 | 88 | self.encoder.weight_ih_l0_reverse.data.uniform_(-initScale, initScale) 89 | self.encoder.weight_hh_l0_reverse.data.uniform_(-initScale, initScale) 90 | self.encoder.bias_ih_l0_reverse.data.zero_() 91 | self.encoder.bias_hh_l0_reverse.data.zero_() 92 | self.encoder.bias_hh_l0_reverse.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 93 | 94 | self.hiddenLayer.weight.data.uniform_(-initScale2, initScale2) 95 | self.hiddenLayer.bias.data.zero_() 96 | 97 | self.softmaxLayer.weight.data.zero_() 98 | self.softmaxLayer.bias.data.zero_() 99 | 100 | ''' 101 | Compute feature vectors 102 | ''' 103 | def encode(self, input, lengths, hidden0): 104 | packedInput = nn.utils.rnn.pack_padded_sequence(self.inputDropout(input), lengths, batch_first = True) 105 | 106 | h, (hn, cn) = self.encoder(packedInput, hidden0) 107 | h, _ = nn.utils.rnn.pad_packed_sequence(h, batch_first = True) 108 | 109 | h = h.contiguous().view(h.size(0)*h.size(1), h.size(2)) 110 | h = 
self.hiddenLayer(self.outputDropout(h)) 111 | return self.hiddenAct(h) 112 | 113 | ''' 114 | Compute class scores 115 | ''' 116 | def forward(self, input, lengths, hidden0): 117 | encoded = self.encode(input, lengths, hidden0) 118 | return self.softmaxLayer(self.outputDropout(encoded)) 119 | 120 | 121 | class Parser(nn.Module): 122 | 123 | def __init__(self, inputDim, hiddenDim, classNum, inputDropoutRate, outputDropoutRate): 124 | super(Parser, self).__init__() 125 | 126 | self.encoder = nn.LSTM(input_size = inputDim, 127 | hidden_size = hiddenDim, 128 | num_layers = 1, 129 | dropout = 0.0, 130 | bidirectional = True) 131 | 132 | self.inputDropout = nn.Dropout(p = inputDropoutRate) 133 | self.outputDropout = nn.Dropout(p = outputDropoutRate) 134 | 135 | concatDim = 2*hiddenDim 136 | self.depMatchWeight = nn.Linear(concatDim, concatDim, bias = False) 137 | self.rootVec = nn.Parameter(torch.FloatTensor(concatDim)) 138 | 139 | classifierDim = 2*concatDim 140 | self.hiddenLayer = nn.Linear(classifierDim, classifierDim) 141 | self.hiddenAct = nn.ReLU() 142 | 143 | self.softmaxLayer = nn.Linear(classifierDim, classNum) 144 | 145 | self.inputDim = inputDim 146 | self.hiddenDim = hiddenDim 147 | self.concatDim = concatDim 148 | self.classifierDim = classifierDim 149 | 150 | self.initWeights() 151 | 152 | def initWeights(self): 153 | initScale = math.sqrt(6.0)/math.sqrt(self.hiddenDim+(self.inputDim+self.hiddenDim)) 154 | initScale2 = math.sqrt(6.0)/math.sqrt(self.classifierDim+(self.classifierDim)) 155 | 156 | self.encoder.weight_ih_l0.data.uniform_(-initScale, initScale) 157 | self.encoder.weight_hh_l0.data.uniform_(-initScale, initScale) 158 | self.encoder.bias_ih_l0.data.zero_() 159 | self.encoder.bias_hh_l0.data.zero_() 160 | self.encoder.bias_hh_l0.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 161 | 162 | self.encoder.weight_ih_l0_reverse.data.uniform_(-initScale, initScale) 163 | self.encoder.weight_hh_l0_reverse.data.uniform_(-initScale, initScale) 164 | self.encoder.bias_ih_l0_reverse.data.zero_() 165 | self.encoder.bias_hh_l0_reverse.data.zero_() 166 | self.encoder.bias_hh_l0_reverse.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 167 | 168 | self.depMatchWeight.weight.data.zero_() 169 | self.rootVec.data.zero_() 170 | 171 | self.hiddenLayer.weight.data.uniform_(-initScale2, initScale2) 172 | self.hiddenLayer.bias.data.zero_() 173 | 174 | self.softmaxLayer.weight.data.zero_() 175 | self.softmaxLayer.bias.data.zero_() 176 | -------------------------------------------------------------------------------- /jmt/code/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | from torch.autograd import Variable 5 | 6 | import random 7 | import math 8 | import argparse 9 | import itertools 10 | import os 11 | 12 | from data import Corpus 13 | import utils 14 | from model import Embedding 15 | from model import Tagger 16 | 17 | parser = argparse.ArgumentParser(description = 'A Joint Many-Task Model') 18 | parser.add_argument('--embedDim', type = int, default = 100, 19 | help='Size of word embeddings') 20 | parser.add_argument('--charDim', type = int, default = 100, 21 | help='Size of char embeddings') 22 | parser.add_argument('--hiddenDim', type = int, default = 100, 23 | help='Size of hidden layers') 24 | parser.add_argument('--batchSize', type = int, default = 32, 25 | help='Mini-batch size') 26 | parser.add_argument('--lr', type = float, default 
= 1.0, 27 | help='Initial learning rate') 28 | parser.add_argument('--lrDecay', type = float, default = 0.3, 29 | help='Learning rate decay per epoch') 30 | parser.add_argument('--lstmWeightDecay', type = float, default = 1.0e-06, 31 | help='Weight decay for LSTM weights') 32 | parser.add_argument('--mlpWeightDecay', type = float, default = 1.0e-05, 33 | help='Weight decay for MLP weights') 34 | parser.add_argument('--epoch', type = int, default = 20, 35 | help='Maximum number of training epochs') 36 | parser.add_argument('--seed', type = int, default = 1, 37 | help='Random seed') 38 | parser.add_argument('--gpuId', type = int, default = 0, 39 | help='GPU id') 40 | parser.add_argument('--inputDropout', type = float, default = 0.2, 41 | help='Dropout rate for input vectors') 42 | parser.add_argument('--outputDropout', type = float, default = 0.2, 43 | help='Dropout rate for output vectors') 44 | parser.add_argument('--wordDropoutCoeff', type = float, default = 0.25, 45 | help='Coefficient for word dropout') 46 | parser.add_argument('--clip', type = float, default = 1.0, 47 | help='Gradient clipping value') 48 | parser.add_argument('--random', action = 'store_true', 49 | help='Use randomly initialized embeddings or not') 50 | parser.add_argument('--test', action = 'store_true', 51 | help = 'Test mode or not') 52 | 53 | args = parser.parse_args() 54 | print(args) 55 | print() 56 | 57 | embedDim = args.embedDim 58 | charDim = args.charDim 59 | hiddenDim = args.hiddenDim 60 | batchSize = args.batchSize 61 | initialLearningRate = args.lr 62 | lrDecay = args.lrDecay 63 | lstmWeightDecay = args.lstmWeightDecay 64 | mlpWeightDecay = args.mlpWeightDecay 65 | maxEpoch = args.epoch 66 | seed = args.seed 67 | inputDropoutRate = args.inputDropout 68 | outputDropoutRate = args.outputDropout 69 | wordDropoutCoeff = args.wordDropoutCoeff 70 | gradClip = args.clip 71 | useGpu = True 72 | gpuId = args.gpuId 73 | test = args.test 74 | 75 | trainFile = '../dataset/pos/pos_wsj.sample.train' 76 | devFile = '../dataset/pos/pos_wsj.sample.dev' 77 | 78 | wordEmbeddingFile = '../embedding/word.txt' 79 | charEmbeddingFile = '../embedding/charNgram.txt' 80 | 81 | taggerParamsFile = 'tagger-'+str(gpuId) 82 | embeddingParamsFile = 'embedding-'+str(gpuId) 83 | wordParamsFile = 'word_params-'+str(gpuId) # for pre-trained embeddings 84 | charParamsFile = 'char_params-'+str(gpuId) # for pre-trained embeddings 85 | 86 | torch.manual_seed(seed) 87 | random.seed(seed) 88 | 89 | corpus = Corpus(trainFile, devFile, wordDropoutCoeff) 90 | 91 | print('Vocabulary size: '+str(corpus.voc.size())) 92 | print('# of classes: '+str(corpus.classVoc.size())) 93 | print() 94 | print('# of training samples: '+str(len(corpus.trainData))) 95 | print('# of dev samples: '+str(len(corpus.devData))) 96 | print() 97 | 98 | embedding = Embedding(corpus.voc.size(), corpus.charVoc.size(), embedDim, charDim) 99 | tagger = Tagger(embedDim+charDim, hiddenDim, corpus.classVoc.size(), 100 | inputDropoutRate, outputDropoutRate) 101 | 102 | if not test and not args.random: 103 | if os.path.exists(wordParamsFile): 104 | embedding.wordEmbedding.load_state_dict(torch.load(wordParamsFile)) 105 | else: 106 | utils.loadEmbeddings(embedding.wordEmbedding, corpus.voc, wordEmbeddingFile) 107 | torch.save(embedding.wordEmbedding.state_dict(), wordParamsFile) 108 | 109 | if os.path.exists(charParamsFile): 110 | embedding.charEmbedding.load_state_dict(torch.load(charParamsFile)) 111 | else: 112 | utils.loadEmbeddings(embedding.charEmbedding, corpus.charVoc, 
charEmbeddingFile) 113 | torch.save(embedding.charEmbedding.state_dict(), charParamsFile) 114 | if test: 115 | tagger.load_state_dict(torch.load(taggerParamsFile)) 116 | embedding.load_state_dict(torch.load(embeddingParamsFile)) 117 | 118 | if useGpu: 119 | if torch.cuda.is_available(): 120 | torch.cuda.set_device(args.gpuId) 121 | torch.cuda.manual_seed(seed) 122 | embedding.cuda() 123 | tagger.cuda() 124 | print('**** Running with GPU-' + str(args.gpuId) + ' ****\n') 125 | else: 126 | useGpu = False 127 | print('**** Warning: GPU is not available ****\n') 128 | 129 | criterionTagger = nn.CrossEntropyLoss(size_average = False, ignore_index = -1) 130 | 131 | batchListTrain = utils.buildBatchList(len(corpus.trainData), batchSize) 132 | batchListDev = utils.buildBatchList(len(corpus.devData), batchSize) 133 | 134 | totalParams = list(embedding.parameters())+list(tagger.parameters()) 135 | lstmParams = [] 136 | mlpParams = [] 137 | withoutWeightDecay = [] 138 | for name, param in list(embedding.named_parameters())+list(tagger.named_parameters()): 139 | if not param.requires_grad: 140 | continue 141 | if 'bias' in name or 'Embedding' in name: 142 | withoutWeightDecay += [param] 143 | elif 'encoder' in name: 144 | lstmParams += [param] 145 | else: 146 | mlpParams += [param] 147 | optParams = [{'params': lstmParams, 'weight_decay': lstmWeightDecay}, 148 | {'params': mlpParams, 'weight_decay': mlpWeightDecay}, 149 | {'params': withoutWeightDecay, 'weight_decay': 0.0}] 150 | 151 | opt = optim.SGD(optParams, 152 | lr = initialLearningRate) 153 | 154 | maxDevAcc = -100.0 155 | epoch = 0 156 | 157 | while epoch < maxEpoch and not test: 158 | trainAcc = 0.0 159 | trainTokenCount = 0.0 160 | batchProcessed = 0 161 | 162 | for paramGroup in opt.param_groups: 163 | paramGroup['lr'] = initialLearningRate/(1.0+lrDecay*epoch) 164 | 165 | epoch += 1 166 | print('--- Epoch '+str(epoch)) 167 | 168 | random.shuffle(corpus.trainData) 169 | embedding.train() 170 | tagger.train() 171 | 172 | ''' 173 | Mini-batch training 174 | ''' 175 | for batch in batchListTrain: 176 | opt.zero_grad() 177 | batchInput, batchChar, batchTarget, lengths, hidden0, tokenCount = corpus.processBatchInfo(batch, True, hiddenDim, useGpu) 178 | trainTokenCount += tokenCount 179 | 180 | output = tagger(embedding.getBatchedEmbedding(batchInput, batchChar), lengths, hidden0) 181 | loss = criterionTagger(output, batchTarget) 182 | loss /= (batch[1]-batch[0]+1.0) 183 | loss.backward() 184 | nn.utils.clip_grad_norm(totalParams, gradClip) 185 | opt.step() 186 | 187 | _, prediction = torch.max(output, 1) 188 | trainAcc += (prediction.data == batchTarget.data).sum() 189 | 190 | batchProcessed += 1 191 | ''' 192 | Mini-batch test 193 | ''' 194 | if batchProcessed == len(batchListTrain)//20: 195 | batchProcessed = 0 196 | devAcc = 0.0 197 | devTokenCount = 0.0 198 | 199 | embedding.eval() 200 | tagger.eval() 201 | for batch in batchListDev: 202 | batchInput, batchChar, batchTarget, lengths, hidden0, tokenCount = corpus.processBatchInfo(batch, False, hiddenDim, useGpu) 203 | devTokenCount += tokenCount 204 | 205 | output = tagger(embedding.getBatchedEmbedding(batchInput, batchChar), lengths, hidden0) 206 | _, prediction = torch.max(output, 1) 207 | devAcc += (prediction.data == batchTarget.data).sum() 208 | embedding.train() 209 | tagger.train() 210 | 211 | devAcc = 100.0*devAcc/devTokenCount 212 | print('Dev acc.: '+str(devAcc)) 213 | 214 | if devAcc > maxDevAcc: 215 | maxDevAcc = devAcc 216 | 217 | stateDict = tagger.state_dict() 218 | for elem in 
stateDict: 219 | stateDict[elem].cpu() 220 | torch.save(stateDict, taggerParamsFile) 221 | 222 | stateDict = embedding.state_dict() 223 | for elem in stateDict: 224 | stateDict[elem].cpu() 225 | torch.save(stateDict, embeddingParamsFile) 226 | 227 | print('Train acc.: '+str(100.0*trainAcc/trainTokenCount)) 228 | 229 | if test: 230 | embedding.eval() 231 | tagger.eval() 232 | 233 | devAcc = 0.0 234 | devTokenCount = 0.0 235 | for batch in batchListDev: 236 | batchInput, batchChar, batchTarget, lengths, hidden0, tokenCount = corpus.processBatchInfo(batch, False, hiddenDim, useGpu) 237 | devTokenCount += tokenCount 238 | 239 | output = tagger(embedding.getBatchedEmbedding(batchInput, batchChar), lengths, hidden0) 240 | _, prediction = torch.max(output, 1) 241 | devAcc += (prediction.data == batchTarget.data).sum() 242 | 243 | devAcc = 100.0*devAcc/devTokenCount 244 | print('Dev acc.: '+str(devAcc)) 245 | 246 | embedding.train() 247 | tagger.train() 248 | -------------------------------------------------------------------------------- /jmt/code/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def buildBatchList(dataSize, batchSize): 4 | batchList = [] 5 | numBatch = int(dataSize/batchSize) 6 | 7 | for i in range(numBatch): 8 | batch = [] 9 | batch.append(i*batchSize) 10 | if i == numBatch-1: 11 | batch.append(dataSize-1) 12 | else: 13 | batch.append((i+1)*batchSize-1) 14 | batchList.append(batch) 15 | 16 | return batchList 17 | 18 | def loadEmbeddings(embedding, voc, fileName): 19 | assert os.path.exists(fileName) 20 | 21 | print('Loading embeddings from '+fileName) 22 | 23 | with open(fileName, 'r') as f: 24 | counter = 0 25 | 26 | for line in f: 27 | fields = line.split() 28 | 29 | if len(fields)-1 != embedding.weight.size(1): 30 | continue 31 | 32 | tokenIndex = voc.getTokenIndex(fields[0]) 33 | 34 | if tokenIndex == voc.tokenIndex[voc.UNK]: 35 | continue 36 | 37 | counter += 1 38 | 39 | for i in range(len(fields)-1): 40 | embedding.weight[tokenIndex][i].data.fill_(float(fields[i+1])) 41 | 42 | print(str(counter)+' embeddings are initialized') 43 | print() 44 | 45 | def getCharNgram(token): 46 | BEG = '#BEGIN#' 47 | END = '#END#' 48 | result = [] 49 | 50 | chars = [BEG] + list(token) + [END] 51 | 52 | for n in [2, 3, 4]: 53 | for i in range(len(chars)-n+1): 54 | result.append(str(n) + 'gram-' + ''.join(chars[i:i+n])) 55 | 56 | return result 57 | -------------------------------------------------------------------------------- /jmt/download_embeddings.sh: -------------------------------------------------------------------------------- 1 | mkdir embedding 2 | cd embedding 3 | wget http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/jmt_pre-trained_embeddings.tar.gz 4 | tar zxvf jmt_pre-trained_embeddings.tar.gz 5 | cd .. 
6 | -------------------------------------------------------------------------------- /nmt/README.md: -------------------------------------------------------------------------------- 1 | # A Neural Machine Translation (NMT) implementation 2 | -------------------------------------------------------------------------------- /nmt/code/bleu.sh: -------------------------------------------------------------------------------- 1 | T=./trans.txt 2 | G=./gold.txt 3 | 4 | perl ./tools/multi-bleu.perl ${G} < ${T} > bleu.txt 5 | -------------------------------------------------------------------------------- /nmt/code/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from torch.autograd import Variable 5 | 6 | class Token: 7 | def __init__(self, str_ = '', count_ = 0): 8 | self.str = str_ 9 | self.count = count_ 10 | 11 | class Vocabulary: 12 | def __init__(self): 13 | self.UNK = 'UNK' # unknown words 14 | self.EOS = '<EOS>' # the end-of-sequence token 15 | self.BOS = '<BOS>' # the beginning-of-sequence token 16 | self.PAD = '<PAD>' # padding 17 | 18 | self.unkIndex = -1 19 | self.eosIndex = -1 20 | self.bosIndex = -1 21 | self.padIndex = -1 22 | 23 | self.tokenIndex = {} 24 | self.tokenList = [] 25 | 26 | def getTokenIndex(self, str): 27 | if str in self.tokenIndex: 28 | return self.tokenIndex[str] 29 | else: 30 | return self.unkIndex 31 | 32 | def add(self, str, count): 33 | if str not in self.tokenIndex: 34 | self.tokenList.append(Token(str, count)) 35 | self.tokenIndex[str] = len(self.tokenList)-1 36 | 37 | def size(self): 38 | return len(self.tokenList) 39 | 40 | def outputTokenList(self, fileName): 41 | f = open(fileName, 'w') 42 | for t in self.tokenList: 43 | f.write(t.str + '\n') 44 | f.close() 45 | 46 | 47 | class Data: 48 | def __init__(self, sourceText_, targetText_, sourceOrigStr_ = None, targetUnkMap_ = None): 49 | self.sourceText = sourceText_ 50 | self.sourceOrigStr = sourceOrigStr_ 51 | 52 | self.targetText = targetText_ 53 | self.targetUnkMap = targetUnkMap_ 54 | 55 | 56 | class Corpus: 57 | def __init__(self, sourceTrainFile = '', sourceOrigTrainFile = '', targetTrainFile = '', sourceDevFile = '', sourceOrigDevFile = '', targetDevFile = '', minFreqSource = 1, minFreqTarget = 1, maxTokenLen = 100000): 58 | self.sourceVoc = Vocabulary() 59 | self.targetVoc = Vocabulary() 60 | 61 | self.buildVoc(sourceTrainFile, minFreqSource, source = True)#, maxLen = maxTokenLen) 62 | self.buildVoc(targetTrainFile, minFreqTarget, source = False)#, maxLen = maxTokenLen) 63 | 64 | self.trainData = self.buildDataset(sourceTrainFile, sourceOrigTrainFile, targetTrainFile, train = True, maxLen = maxTokenLen) 65 | self.devData = self.buildDataset(sourceDevFile, sourceOrigDevFile, targetDevFile, train = False) 66 | 67 | def buildVoc(self, fileName, minFreq, source, maxLen = 100000): 68 | assert os.path.exists(fileName) 69 | 70 | if source: 71 | voc = self.sourceVoc 72 | else: 73 | voc = self.targetVoc 74 | 75 | with open(fileName, 'r') as f: 76 | tokenCount = {} 77 | unkCount = 0 78 | eosCount = 0 79 | 80 | for line in f: 81 | tokens = line.split() # w1 w2 ... 
\n 82 | 83 | if len(tokens) > maxLen: 84 | continue 85 | 86 | eosCount += 1 87 | 88 | for t in tokens: 89 | if t in tokenCount: 90 | tokenCount[t] += 1 91 | else: 92 | tokenCount[t] = 1 93 | 94 | tokenList = sorted(tokenCount.items(), key = lambda x: -x[1]) # sort by value 95 | 96 | for t in tokenList: 97 | if t[1] >= minFreq: 98 | voc.add(t[0], t[1]) 99 | else: 100 | unkCount += t[1] 101 | 102 | ''' 103 | Add special tokens 104 | ''' 105 | voc.add(voc.UNK, unkCount) 106 | voc.add(voc.BOS, eosCount) 107 | voc.add(voc.EOS, eosCount) 108 | voc.add(voc.PAD, 0) 109 | 110 | voc.unkIndex = voc.getTokenIndex(voc.UNK) 111 | voc.bosIndex = voc.getTokenIndex(voc.BOS) 112 | voc.eosIndex = voc.getTokenIndex(voc.EOS) 113 | voc.padIndex = voc.getTokenIndex(voc.PAD) 114 | 115 | def buildDataset(self, sourceFileName, sourceOrigFileName, targetFileName, train, maxLen = 100000): 116 | assert os.path.exists(sourceFileName) and os.path.exists(targetFileName) 117 | assert os.path.exists(sourceOrigFileName) 118 | 119 | with open(sourceFileName, 'r') as fs, open(sourceOrigFileName, 'r') as fsOrig, open(targetFileName, 'r') as ft: 120 | dataset = [] 121 | 122 | for (lineSource, lineSourceOrig, lineTarget) in zip(fs, fsOrig, ft): 123 | tokensSource = lineSource.split() # w1 w2 ... \n 124 | if train: 125 | tokensSourceOrig = None 126 | else: 127 | tokensSourceOrig = lineSourceOrig.split() # w1 w2 ... \n 128 | tokensTarget = lineTarget.split() # w1 w2 ... \n 129 | 130 | if len(tokensSource) > maxLen or len(tokensTarget) > maxLen or len(tokensSource) == 0 or len(tokensTarget) == 0: 131 | continue 132 | 133 | tokenIndicesSource = torch.LongTensor(len(tokensSource)) 134 | tokenIndicesTarget = torch.LongTensor(len(tokensTarget)) 135 | unkMapTarget = {} 136 | 137 | for i in range(len(tokensSource)): 138 | t = tokensSource[i] 139 | tokenIndicesSource[i] = self.sourceVoc.getTokenIndex(t) 140 | 141 | for i in range(len(tokensTarget)): 142 | t = tokensTarget[i] 143 | tokenIndicesTarget[i] = self.targetVoc.getTokenIndex(t) 144 | if tokenIndicesTarget[i] == self.targetVoc.unkIndex: 145 | unkMapTarget[i] = t 146 | 147 | dataset.append(Data(tokenIndicesSource, tokenIndicesTarget, tokensSourceOrig, unkMapTarget)) 148 | 149 | return dataset 150 | 151 | 152 | def processBatchInfoNMT(self, batch, train, volatile = False): 153 | begin = batch[0] 154 | end = batch[1] 155 | batchSize = end-begin+1 156 | 157 | ''' 158 | Process source info 159 | ''' 160 | if train: 161 | data = sorted(self.trainData[begin:end+1], key = lambda x: -len(x.sourceText)) 162 | else: 163 | data = sorted(self.devData[begin:end+1], key = lambda x: -len(x.sourceText)) 164 | 165 | maxLen = len(data[0].sourceText) 166 | batchInputSource = torch.LongTensor(batchSize, maxLen) 167 | batchInputSource.fill_(self.sourceVoc.padIndex) 168 | lengthsSource = [] 169 | 170 | for i in range(batchSize): 171 | l = len(data[i].sourceText) 172 | lengthsSource.append(l) 173 | 174 | for j in range(l): 175 | batchInputSource[i, j] = data[i].sourceText[j] 176 | 177 | batchInputSource = Variable(batchInputSource, volatile = volatile) 178 | batchInputSource = batchInputSource.cuda() 179 | 180 | ''' 181 | Process target info 182 | ''' 183 | data_ = sorted(data, key = lambda x: -len(x.targetText)) 184 | 185 | maxLen = len(data_[0].targetText)+1 # for BOS or EOS 186 | batchInputTarget = torch.LongTensor(batchSize, maxLen) 187 | batchInputTarget.fill_(self.targetVoc.padIndex) 188 | lengthsTarget = [] 189 | batchTarget = torch.LongTensor(maxLen*batchSize).fill_(-1) 190 | targetIndexOffset = 0 
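# batchTarget is one flat vector of length maxLen*batchSize, pre-filled with -1: positions left at -1
# are skipped by the loss, since WordPredictor builds its CrossEntropyLoss with ignore_index = -1;
# targetIndexOffset advances by maxLen so that each example fills its own block of the vector.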
191 | tokenCount = 0.0 192 | 193 | for i in range(batchSize): 194 | l = len(data[i].targetText) 195 | lengthsTarget.append(l+1) 196 | batchInputTarget[i, 0] = self.targetVoc.bosIndex 197 | for j in range(l): 198 | batchInputTarget[i, j+1] = data[i].targetText[j] 199 | batchTarget[targetIndexOffset+j] = data[i].targetText[j] 200 | batchTarget[targetIndexOffset+l] = self.targetVoc.eosIndex 201 | targetIndexOffset += maxLen 202 | tokenCount += (l+1) 203 | 204 | batchInputTarget = Variable(batchInputTarget, requires_grad = False, volatile = volatile) 205 | batchTarget = Variable(batchTarget, requires_grad = False, volatile = volatile) 206 | 207 | return batchInputSource, lengthsSource, batchInputTarget, batchTarget, lengthsTarget, tokenCount, data 208 | -------------------------------------------------------------------------------- /nmt/code/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import torch.nn.functional as F 5 | 6 | class Embedding(nn.Module): 7 | 8 | def __init__(self, sourceEmbedDim, targetEmbedDim, sourceVocSize, targetVocSize): 9 | super(Embedding, self).__init__() 10 | 11 | self.sourceEmbedding = nn.Embedding(sourceVocSize, sourceEmbedDim) 12 | self.targetEmbedding = nn.Embedding(targetVocSize, targetEmbedDim) 13 | 14 | self.initWeights() 15 | 16 | def initWeights(self): 17 | initScale = 0.1 18 | 19 | self.sourceEmbedding.weight.data.uniform_(-initScale, initScale) 20 | self.targetEmbedding.weight.data.uniform_(-initScale, initScale) 21 | 22 | def getBatchedSourceEmbedding(self, batchInput): 23 | return self.sourceEmbedding(batchInput) 24 | 25 | def getBatchedTargetEmbedding(self, batchInput): 26 | return self.targetEmbedding(batchInput) 27 | 28 | 29 | class WordPredictor(nn.Module): 30 | 31 | def __init__(self, inputDim, outputDim, ignoreIndex = -1): 32 | super(WordPredictor, self).__init__() 33 | 34 | self.softmaxLayer = nn.Linear(inputDim, outputDim) 35 | self.loss = nn.CrossEntropyLoss(size_average = False, ignore_index = ignoreIndex) 36 | 37 | self.initWeight() 38 | 39 | def initWeight(self): 40 | self.softmaxLayer.weight.data.zero_() 41 | self.softmaxLayer.bias.data.zero_() 42 | 43 | def forward(self, input, target = None): 44 | output = self.softmaxLayer(input) 45 | if target is not None: 46 | return self.loss(output, target) 47 | else: 48 | return output 49 | 50 | 51 | class DecCand: 52 | def __init__(self, score_ = 0.0, fin_ = False, sentence_ = [], attenIndex_ = []): 53 | self.score = score_ 54 | self.fin = fin_ 55 | self.sentence = sentence_ 56 | self.attenIndex = attenIndex_ 57 | 58 | 59 | class EncDec(nn.Module): 60 | 61 | def __init__(self, sourceEmbedDim, targetEmbedDim, hiddenDim, targetVocSize, dropoutRate = 0.2, numLayers = 1): 62 | super(EncDec, self).__init__() 63 | 64 | self.numLayers = numLayers 65 | self.dropout = nn.Dropout(p = dropoutRate) 66 | 67 | self.encoder = nn.LSTM(input_size = sourceEmbedDim, hidden_size = hiddenDim, 68 | num_layers = self.numLayers, dropout = 0.0, bidirectional = True) 69 | 70 | self.decoder = nn.LSTM(input_size = targetEmbedDim + hiddenDim, hidden_size = hiddenDim, 71 | num_layers = self.numLayers, dropout = 0.0, bidirectional = False, batch_first = True) 72 | 73 | self.attentionLayer = nn.Linear(2*hiddenDim, hiddenDim, bias = False) 74 | self.finalHiddenLayer = nn.Linear(3*hiddenDim, targetEmbedDim) 75 | self.finalHiddenAct = nn.Tanh() 76 | 77 | self.wordPredictor = WordPredictor(targetEmbedDim, 
targetVocSize) 78 | 79 | self.targetEmbedDim = targetEmbedDim 80 | self.hiddenDim = hiddenDim 81 | 82 | self.initWeight() 83 | 84 | def initWeight(self): 85 | initScale = 0.1 86 | 87 | self.encoder.weight_ih_l0.data.uniform_(-initScale, initScale) 88 | self.encoder.weight_hh_l0.data.uniform_(-initScale, initScale) 89 | self.encoder.bias_ih_l0.data.zero_() 90 | self.encoder.bias_hh_l0.data.zero_() 91 | self.encoder.bias_hh_l0.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 92 | 93 | self.encoder.weight_ih_l0_reverse.data.uniform_(-initScale, initScale) 94 | self.encoder.weight_hh_l0_reverse.data.uniform_(-initScale, initScale) 95 | self.encoder.bias_ih_l0_reverse.data.zero_() 96 | self.encoder.bias_hh_l0_reverse.data.zero_() 97 | self.encoder.bias_hh_l0_reverse.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 98 | 99 | self.decoder.weight_ih_l0.data.uniform_(-initScale, initScale) 100 | self.decoder.weight_hh_l0.data.uniform_(-initScale, initScale) 101 | self.decoder.bias_ih_l0.data.zero_() 102 | self.decoder.bias_hh_l0.data.zero_() 103 | self.decoder.bias_hh_l0.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 104 | 105 | if self.numLayers == 2: 106 | self.encoder.weight_ih_l1.data.uniform_(-initScale, initScale) 107 | self.encoder.weight_hh_l1.data.uniform_(-initScale, initScale) 108 | self.encoder.bias_ih_l1.data.zero_() 109 | self.encoder.bias_hh_l1.data.zero_() 110 | self.encoder.bias_hh_l1.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 111 | 112 | self.encoder.weight_ih_l1_reverse.data.uniform_(-initScale, initScale) 113 | self.encoder.weight_hh_l1_reverse.data.uniform_(-initScale, initScale) 114 | self.encoder.bias_ih_l1_reverse.data.zero_() 115 | self.encoder.bias_hh_l1_reverse.data.zero_() 116 | self.encoder.bias_hh_l1_reverse.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 117 | 118 | self.decoder.weight_ih_l1.data.uniform_(-initScale, initScale) 119 | self.decoder.weight_hh_l1.data.uniform_(-initScale, initScale) 120 | self.decoder.bias_ih_l1.data.zero_() 121 | self.decoder.bias_hh_l1.data.zero_() 122 | self.decoder.bias_hh_l1.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 123 | 124 | self.attentionLayer.weight.data.zero_() 125 | 126 | self.finalHiddenLayer.weight.data.uniform_(-initScale, initScale) 127 | self.finalHiddenLayer.bias.data.zero_() 128 | 129 | def encode(self, inputSource, lengthsSource): 130 | packedInput = nn.utils.rnn.pack_padded_sequence(inputSource, lengthsSource, batch_first = True) 131 | 132 | h, (hn, cn) = self.encoder(packedInput) # hn, ch: (layers*direction, B, Ds) 133 | h, _ = nn.utils.rnn.pad_packed_sequence(h, batch_first = True) 134 | 135 | if self.numLayers == 1: 136 | hn = (hn[0]+hn[1]).unsqueeze(0) 137 | cn = (cn[0]+cn[1]).unsqueeze(0) 138 | else: 139 | hn0 = (hn[0]+hn[1]).unsqueeze(0) 140 | cn0 = (cn[0]+cn[1]).unsqueeze(0) 141 | hn1 = (hn[2]+hn[3]).unsqueeze(0) 142 | cn1 = (cn[2]+cn[3]).unsqueeze(0) 143 | 144 | hn = torch.cat((hn0, hn1), dim = 0) 145 | cn = torch.cat((cn0, cn1), dim = 0) 146 | 147 | return h, (hn, cn) 148 | 149 | def forward(self, inputTarget, lengthsTarget, lengthsSource, hidden0Target, sourceH, target = None): 150 | batchSize = sourceH.size(0) 151 | maxLen = lengthsTarget[0] 152 | 153 | for i in range(batchSize): 154 | maxLen = max(maxLen, lengthsTarget[i]) 155 | 156 | finalHidden = Variable(inputTarget.data.new(batchSize, maxLen, self.targetEmbedDim), requires_grad = False) 157 | prevFinalHidden = 
Variable(inputTarget.data.new(batchSize, 1, self.targetEmbedDim).zero_(), requires_grad = False) 158 | 159 | newShape = sourceH.size(0), sourceH.size(1), self.hiddenDim # (B, Ls, Dt) 160 | sourceHtrans = sourceH.contiguous().view(sourceH.size(0)*sourceH.size(1), sourceH.size(2)) # (B*Ls, Ds) 161 | sourceHtrans = self.attentionLayer(sourceHtrans) # (B*Ls, Dt) 162 | sourceHtrans = sourceHtrans.view(*newShape).transpose(1, 2) # (B, Dt, Ls) 163 | 164 | for i in range(maxLen): 165 | hi, hidden0Target = self.decoder(torch.cat((inputTarget[:, i, :].unsqueeze(1), prevFinalHidden), dim = 2), hidden0Target) # hi: (B, 1, Dt) 166 | 167 | if self.numLayers != 1: # residual connection for this decoder 168 | hi = hidden0Target[0][0]+hidden0Target[0][1] 169 | hi = hi.unsqueeze(1) 170 | 171 | attentionScores_ = torch.bmm(hi, sourceHtrans).transpose(1, 2) # (B, Ls, 1) 172 | 173 | attentionScores = attentionScores_.data.new(attentionScores_.size()).fill_(-1024.0) 174 | for j in range(batchSize): 175 | attentionScores[j, :lengthsSource[j]].zero_() 176 | attentionScores = Variable(attentionScores, requires_grad = False) 177 | attentionScores += attentionScores_ 178 | 179 | attentionScores = attentionScores.transpose(1, 2) # (B, 1, Ls) 180 | attentionScores = F.softmax(attentionScores.transpose(0, 2)).transpose(0, 2) 181 | 182 | contextVec = torch.bmm(attentionScores, sourceH) # (B, 1, Ds) 183 | 184 | prevFinalHidden = torch.cat((hi, contextVec), dim = 2) # (B, 1, Ds+Dt) 185 | prevFinalHidden = self.dropout(prevFinalHidden) 186 | prevFinalHidden = self.finalHiddenLayer(prevFinalHidden) 187 | prevFinalHidden = self.finalHiddenAct(prevFinalHidden) 188 | prevFinalHidden = self.dropout(prevFinalHidden) 189 | 190 | finalHidden[:, i, :] = prevFinalHidden 191 | 192 | finalHidden = finalHidden.contiguous().view(finalHidden.size(0)*finalHidden.size(1), finalHidden.size(2)) 193 | output = self.wordPredictor(finalHidden, target) 194 | 195 | return output 196 | 197 | def greedyTrans(self, bosIndex, eosIndex, lengthsSource, targetEmbedding, sourceH, hidden0Target, maxGenLen = 100): 198 | batchSize = sourceH.size(0) 199 | i = 1 200 | eosCount = 0 201 | targetWordIndices = Variable(torch.LongTensor(batchSize, maxGenLen).fill_(bosIndex), requires_grad = False).cuda() 202 | attentionIndices = targetWordIndices.data.new(targetWordIndices.size()) 203 | targetWordLengths = torch.LongTensor(batchSize).fill_(0) 204 | fin = [False]*batchSize 205 | 206 | newShape = sourceH.size(0), sourceH.size(1), hidden0Target[0].size(2) # (B, Ls, Dt) 207 | sourceHtrans = sourceH.contiguous().view(sourceH.size(0)*sourceH.size(1), sourceH.size(2)) # (B*Ls, Ds) 208 | sourceHtrans = self.attentionLayer(sourceHtrans) # (B*Ls, Dt) 209 | sourceHtrans = sourceHtrans.view(*newShape).transpose(1, 2) # (B, Dt, Ls) 210 | 211 | prevFinalHidden = Variable(sourceH.data.new(batchSize, 1, self.targetEmbedDim).zero_(), requires_grad = False) 212 | 213 | while (i < maxGenLen) and (eosCount < batchSize): 214 | inputTarget = targetEmbedding(targetWordIndices[:, i-1].unsqueeze(1)) 215 | hi, hidden0Target = self.decoder(torch.cat((inputTarget, prevFinalHidden), dim = 2), hidden0Target) # hi: (B, 1, Dt) 216 | 217 | if self.numLayers != 1: 218 | hi = hidden0Target[0][0]+hidden0Target[0][1] 219 | hi = hi.unsqueeze(1) 220 | 221 | attentionScores_ = torch.bmm(hi, sourceHtrans).transpose(1, 2) # (B, Ls, 1) 222 | 223 | attentionScores = attentionScores_.data.new(attentionScores_.size()).fill_(-1024.0) 224 | for j in range(batchSize): 225 | attentionScores[j, 
:lengthsSource[j]].zero_() 226 | attentionScores = Variable(attentionScores, requires_grad = False) 227 | attentionScores += attentionScores_ 228 | 229 | attentionScores = attentionScores.transpose(1, 2) # (B, 1, Ls) 230 | attentionScores = F.softmax(attentionScores.transpose(0, 2)).transpose(0, 2) 231 | 232 | attnProb, attnIndex = torch.max(attentionScores, dim = 2) 233 | for j in range(batchSize): 234 | attentionIndices[j, i-1] = attnIndex.data[j, 0] 235 | 236 | contextVec = torch.bmm(attentionScores, sourceH) # (B, 1, Ds) 237 | finalHidden = torch.cat((hi, contextVec), 2) # (B, 1, Dt+Ds) 238 | finalHidden = self.dropout(finalHidden) 239 | finalHidden = self.finalHiddenLayer(finalHidden) 240 | finalHidden = self.finalHiddenAct(finalHidden) 241 | finalHidden = self.dropout(finalHidden) 242 | prevFinalHidden = finalHidden # (B, 1, Dt) 243 | 244 | finalHidden = finalHidden.contiguous().view(finalHidden.size(0)*finalHidden.size(1), finalHidden.size(2)) 245 | output = self.wordPredictor(finalHidden) 246 | 247 | maxProb, sampledIndex = torch.max(output, dim = 1) 248 | targetWordIndices.data[:, i].copy_(sampledIndex.data) 249 | sampledIndex = sampledIndex.data 250 | 251 | for j in range(batchSize): 252 | if not fin[j] and targetWordIndices.data[j, i-1] != eosIndex: 253 | targetWordLengths[j] += 1 254 | if sampledIndex[j] == eosIndex: 255 | eosCount += 1 256 | fin[j] = True 257 | 258 | i += 1 259 | 260 | targetWordIndices = targetWordIndices[:, 1:i] # i-1: no EOS 261 | 262 | return targetWordIndices, list(targetWordLengths), attentionIndices 263 | 264 | def beamSearch(self, bosIndex, eosIndex, lengthsSource, targetEmbedding, sourceH, hidden0Target, beamSize = 1, penalty = 0.75, maxGenLen = 100): 265 | batchSize = sourceH.size(0) 266 | 267 | targetWordIndices = Variable(torch.LongTensor(batchSize, maxGenLen).fill_(bosIndex), requires_grad = False, volatile = True).cuda() 268 | attentionIndices = targetWordIndices.data.new(targetWordIndices.size()) 269 | targetWordLengths = torch.LongTensor(batchSize).fill_(0) 270 | 271 | newShape = sourceH.size(0), sourceH.size(1), hidden0Target[0].size(2) # (B, Ls, Dt) 272 | sourceHtrans = sourceH.contiguous().view(sourceH.size(0)*sourceH.size(1), sourceH.size(2)) # (B*Ls, Ds) 273 | sourceHtrans = self.attentionLayer(sourceHtrans) # (B*Ls, Dt) 274 | sourceHtrans = sourceHtrans.view(*newShape).transpose(1, 2) # (B, Dt, Ls) 275 | 276 | sourceHtrans_ = Variable(sourceHtrans.data.new(beamSize, sourceHtrans.size(1), sourceHtrans.size(2)), requires_grad = False, volatile = True) 277 | sourceH_ = Variable(sourceH.data.new(beamSize, sourceH.size(1), sourceH.size(2)), requires_grad = False, volatile = True) 278 | prevFinalHidden = Variable(sourceH.data.new(beamSize, 1, self.targetEmbedDim).zero_(), requires_grad = False, volatile = True) 279 | 280 | sampledIndex = torch.LongTensor(beamSize).zero_() 281 | 282 | h0 = hidden0Target[0] 283 | c0 = hidden0Target[1] 284 | h0_ = Variable(h0.data.new(h0.size(0), beamSize, h0.size(2)), requires_grad = False, volatile = True) 285 | c0_ = Variable(c0.data.new(c0.size(0), beamSize, c0.size(2)), requires_grad = False, volatile = True) 286 | 287 | for dataIndex in range(batchSize): 288 | i = 1 289 | prevFinalHidden.data.zero_() 290 | sourceHtrans_.data.zero_() 291 | sourceHtrans_.data += sourceHtrans.data[dataIndex] 292 | sourceH_.data.zero_() 293 | sourceH_.data += sourceH.data[dataIndex] 294 | h0_.data.zero_() 295 | c0_.data.zero_() 296 | h0_.data += h0.data[:, dataIndex, :].unsqueeze(1) 297 | c0_.data += c0.data[:, dataIndex, 
:].unsqueeze(1) 298 | hidden0Target_ = (h0_, c0_) 299 | 300 | cand = [] 301 | for j in range(beamSize): 302 | cand.append(DecCand(sentence_ = [bosIndex])) 303 | 304 | while i < maxGenLen and not cand[0].fin: 305 | index = [] 306 | for j in range(beamSize): 307 | index.append([cand[j].sentence[-1]]) 308 | index = Variable(torch.LongTensor(index), requires_grad = False, volatile = True).cuda() 309 | inputTarget = targetEmbedding(index) 310 | 311 | hi, hidden0Target_ = self.decoder(torch.cat((inputTarget, prevFinalHidden), dim = 2), hidden0Target_) # hi: (B, 1, Dt) 312 | 313 | if self.numLayers != 1: 314 | hi = hidden0Target_[0][0]+hidden0Target_[0][1] 315 | hi = hi.unsqueeze(1) 316 | 317 | attentionScores_ = torch.bmm(hi, sourceHtrans_).transpose(1, 2) # (B, Ls, 1) 318 | 319 | attentionScores = attentionScores_.data.new(attentionScores_.size()).fill_(-1024.0) 320 | attentionScores[:, :lengthsSource[dataIndex]].zero_() 321 | attentionScores = Variable(attentionScores, requires_grad = False, volatile = True) 322 | attentionScores += attentionScores_ 323 | 324 | attentionScores = attentionScores.transpose(1, 2) # (B, 1, Ls) 325 | attentionScores = F.softmax(attentionScores.transpose(0, 2)).transpose(0, 2) 326 | 327 | attnProb, attnIndex = torch.max(attentionScores, dim = 2) 328 | 329 | contextVec = torch.bmm(attentionScores, sourceH_) # (B, 1, Ds) 330 | finalHidden = torch.cat((hi, contextVec), 2) # (B, 1, Dt+Ds) 331 | finalHidden = self.dropout(finalHidden) 332 | finalHidden = self.finalHiddenLayer(finalHidden) 333 | finalHidden = self.finalHiddenAct(finalHidden) 334 | finalHidden = self.dropout(finalHidden) 335 | prevFinalHidden = finalHidden # (B, 1, Dt) 336 | 337 | finalHidden = finalHidden.contiguous().view(finalHidden.size(0)*finalHidden.size(1), finalHidden.size(2)) 338 | output = self.wordPredictor(finalHidden) 339 | 340 | output = F.log_softmax(output)+penalty 341 | 342 | for j in range(beamSize): 343 | if cand[j].fin: 344 | output.data[j].fill_(cand[j].score) 345 | else: 346 | output.data[j] += cand[j].score 347 | 348 | updatedCand = [] 349 | updatedPrevFinalHidden = Variable(prevFinalHidden.data.new(prevFinalHidden.size()).zero_(), requires_grad = False, volatile = True) 350 | updatedH0 = Variable(h0_.data.new(h0_.size()).zero_(), requires_grad = False, volatile = True) 351 | updatedC0 = Variable(c0_.data.new(c0_.size()).zero_(), requires_grad = False, volatile = True) 352 | 353 | for j in range(beamSize): 354 | maxScore, maxIndex = torch.topk(output.view(output.size(0)*output.size(1)), k = 1) 355 | 356 | row = maxIndex.data[0] // output.size(1) 357 | col = maxIndex.data[0] % output.size(1) 358 | score = maxScore.data[0] 359 | sampledIndex[j] = col 360 | 361 | if cand[row].fin: 362 | updatedCand.append(DecCand(score, True, cand[row].sentence, cand[row].attenIndex)) 363 | output.data[row].fill_(-1024.0) 364 | continue 365 | 366 | updatedCand.append(DecCand(score, False, cand[row].sentence+[], cand[row].attenIndex+[attnIndex.data[row, 0]])) 367 | updatedPrevFinalHidden[j] = prevFinalHidden[row] 368 | updatedH0[:, j, :] = hidden0Target_[0][:, row, :].unsqueeze(1) 369 | updatedC0[:, j, :] = hidden0Target_[1][:, row, :].unsqueeze(1) 370 | 371 | if i == 1: 372 | output.data[:, col].fill_(-1024.0) 373 | else: 374 | output.data[row, col] = -1024.0 375 | 376 | for j in range(beamSize): 377 | if updatedCand[j].fin: 378 | continue 379 | 380 | if sampledIndex[j] == eosIndex: 381 | updatedCand[j].fin = True 382 | 383 | updatedCand[j].sentence.append(sampledIndex[j]) 384 | 385 | #cand = 
sorted(updatedCand, key = lambda x: -x.score/len(x.sentence)) 386 | cand = updatedCand 387 | prevFinalHidden = updatedPrevFinalHidden 388 | hidden0Target_ = (updatedH0, updatedC0) 389 | i += 1 390 | 391 | targetWordLengths[dataIndex] = len(cand[0].sentence)-1 392 | for j in range(targetWordLengths[dataIndex]): 393 | targetWordIndices[dataIndex, j] = cand[0].sentence[j] 394 | attentionIndices[dataIndex, j] = cand[0].attenIndex[j] 395 | 396 | return targetWordIndices[:, 1:], list(targetWordLengths), attentionIndices 397 | -------------------------------------------------------------------------------- /nmt/code/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir params 2 | mkdir tools 3 | cd tools 4 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl 5 | cd .. 6 | mkdir data 7 | cd ./data 8 | wget https://raw.githubusercontent.com/hassyGo/N3LP/master/corpus/sample.en 9 | wget https://raw.githubusercontent.com/hassyGo/N3LP/master/corpus/sample.ja 10 | wget https://raw.githubusercontent.com/hassyGo/N3LP/master/corpus/sample.en.dev 11 | wget https://raw.githubusercontent.com/hassyGo/N3LP/master/corpus/sample.ja.dev 12 | cd .. 13 | -------------------------------------------------------------------------------- /nmt/code/train.py: -------------------------------------------------------------------------------- 1 | from data import Corpus 2 | from model import Embedding 3 | from model import EncDec 4 | import utils 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | 12 | import random 13 | import math 14 | import os 15 | import time 16 | import sys 17 | 18 | # En-Ja dev 19 | sourceDevFile = './data/sample.en.dev' 20 | sourceOrigDevFile = './data/sample.en.dev' 21 | targetDevFile = './data/sample.ja.dev' 22 | 23 | # En-Ja train 24 | sourceTrainFile = './data/sample.en' 25 | sourceOrigTrainFile = './data/sample.en' 26 | targetTrainFile = './data/sample.ja' 27 | 28 | minFreqSource = 2 # use source-side words which appear at least N times in the training data 29 | minFreqTarget = 2 # use target-side words which appear at least N times in the training data 30 | hiddenDim = 128 # dimensionality of hidden states and embeddings 31 | decay = 0.5 # learning rate decay rate for SGD 32 | gradClip = 1.0 # clipping value for gradient-norm clipping 33 | dropoutRate = 0.2 # dropout rate for output MLP 34 | numLayers = 1 # number of LSTM layers (1 or 2) 35 | 36 | maxLen = 100 # use sentence pairs whose maximum lengths are 100 in both source and target sides 37 | maxEpoch = 20 38 | decayStart = 5 39 | 40 | sourceEmbedDim = hiddenDim 41 | targetEmbedDim = hiddenDim 42 | 43 | batchSize = 16 # "128" is typically used 44 | learningRate = 1.0 45 | momentumRate = 0.75 46 | 47 | gpuId = [0, 1, 2, 3] 48 | seed = int(sys.argv[1]) 49 | 50 | weightDecay = 1.0e-06 51 | 52 | train = True 53 | 54 | beamSize = 10 55 | 56 | if not train: 57 | batchSize = 1 58 | 59 | torch.set_num_threads(1) 60 | 61 | torch.manual_seed(seed) 62 | random.seed(seed) 63 | torch.cuda.set_device(gpuId[0]) 64 | torch.cuda.manual_seed(seed) 65 | 66 | corpus = Corpus(sourceTrainFile, sourceOrigTrainFile, targetTrainFile, sourceDevFile, sourceOrigDevFile, targetDevFile, minFreqSource, minFreqTarget, maxLen) 67 | 68 | print('Source vocabulary size: '+str(corpus.sourceVoc.size())) 69 | print('Target vocabulary size: '+str(corpus.targetVoc.size())) 70 | 
print() 71 | print('# of training samples: '+str(len(corpus.trainData))) 72 | print('# of develop samples: '+str(len(corpus.devData))) 73 | print('SEED: ', str(seed)) 74 | print() 75 | 76 | embedding = Embedding(sourceEmbedDim, targetEmbedDim, corpus.sourceVoc.size(), corpus.targetVoc.size()) 77 | encdec = EncDec(sourceEmbedDim, targetEmbedDim, hiddenDim, corpus.targetVoc.size(), dropoutRate = dropoutRate, numLayers = numLayers) 78 | 79 | encdec.wordPredictor.softmaxLayer.weight = embedding.targetEmbedding.weight 80 | encdec.wordPredictor = nn.DataParallel(encdec.wordPredictor, gpuId) 81 | 82 | if train: 83 | embedding.cuda() 84 | encdec.cuda() 85 | 86 | batchListTrain = utils.buildBatchList(len(corpus.trainData), batchSize) 87 | batchListDev = utils.buildBatchList(len(corpus.devData), batchSize) 88 | 89 | withoutWeightDecay = [] 90 | withWeightDecay = [] 91 | for name, param in list(embedding.named_parameters())+list(encdec.named_parameters()): 92 | if 'bias' in name or 'Embedding' in name: 93 | withoutWeightDecay += [param] 94 | elif 'softmax' not in name: 95 | withWeightDecay += [param] 96 | optParams = [{'params': withWeightDecay, 'weight_decay': weightDecay}, 97 | {'params': withoutWeightDecay, 'weight_decay': 0.0}] 98 | totalParamsNMT = withoutWeightDecay+withWeightDecay 99 | 100 | opt = optim.SGD(optParams, momentum = momentumRate, lr = learningRate) 101 | 102 | bestDevGleu = -1.0 103 | prevDevGleu = -1.0 104 | 105 | for epoch in range(maxEpoch): 106 | if not train: 107 | break 108 | 109 | batchProcessed = 0 110 | totalLoss = 0.0 111 | totalTrainTokenCount = 0.0 112 | 113 | print('--- Epoch ' + str(epoch+1)) 114 | startTime = time.time() 115 | 116 | random.shuffle(corpus.trainData) 117 | 118 | embedding.train() 119 | encdec.train() 120 | 121 | for batch in batchListTrain: 122 | print('\r', end = '') 123 | print(batchProcessed+1, '/', len(batchListTrain), end = '') 124 | 125 | batchSize = batch[1]-batch[0]+1 126 | 127 | opt.zero_grad() 128 | 129 | batchInputSource, lengthsSource, batchInputTarget, batchTarget, lengthsTarget, tokenCount, batchData = corpus.processBatchInfoNMT(batch, train = True) 130 | 131 | inputSource = embedding.getBatchedSourceEmbedding(batchInputSource) 132 | sourceH, (hn, cn) = encdec.encode(inputSource, lengthsSource) 133 | 134 | batchInputTarget = batchInputTarget.cuda() 135 | batchTarget = batchTarget.cuda() 136 | inputTarget = embedding.getBatchedTargetEmbedding(batchInputTarget) 137 | 138 | loss = encdec(inputTarget, lengthsTarget, lengthsSource, (hn, cn), sourceH, batchTarget) 139 | loss = loss.sum() 140 | 141 | totalLoss += loss.data[0] 142 | totalTrainTokenCount += tokenCount 143 | 144 | loss /= batchSize 145 | loss.backward() 146 | nn.utils.clip_grad_norm(totalParamsNMT, gradClip) 147 | opt.step() 148 | 149 | batchProcessed += 1 150 | if batchProcessed == len(batchListTrain)//2 or batchProcessed == len(batchListTrain): 151 | devPerp = 0.0 152 | devGleu = 0.0 153 | totalTokenCount = 0.0 154 | 155 | embedding.eval() 156 | encdec.eval() 157 | 158 | print() 159 | print('Training time: ' + str(time.time()-startTime) + ' sec') 160 | print('Train perp: ' + str(math.exp(totalLoss/totalTrainTokenCount))) 161 | 162 | f_trans = open('./trans.txt', 'w') 163 | f_gold = open('./gold.txt', 'w') 164 | 165 | for batch in batchListDev: 166 | batchSize = batch[1]-batch[0]+1 167 | batchInputSource, lengthsSource, batchInputTarget, batchTarget, lengthsTarget, tokenCount, batchData = corpus.processBatchInfoNMT(batch, train = False, volatile = True) 168 | 169 | inputSource = 
embedding.getBatchedSourceEmbedding(batchInputSource) 170 | sourceH, (hn, cn) = encdec.encode(inputSource, lengthsSource) 171 | 172 | indicesGreedy, lengthsGreedy, attentionIndices = encdec.greedyTrans(corpus.targetVoc.bosIndex, corpus.targetVoc.eosIndex, lengthsSource, embedding.targetEmbedding, sourceH, (hn, cn), maxGenLen = maxLen) 173 | indicesGreedy = indicesGreedy.cpu() 174 | 175 | for i in range(batchSize): 176 | for k in range(lengthsGreedy[i]-1): 177 | index = indicesGreedy.data[i, k] 178 | if index == corpus.targetVoc.unkIndex: 179 | index = attentionIndices[i, k] 180 | f_trans.write(batchData[i].sourceOrigStr[index] + ' ') 181 | else: 182 | f_trans.write(corpus.targetVoc.tokenList[index].str + ' ') 183 | f_trans.write('\n') 184 | 185 | for k in range(lengthsTarget[i]-1): 186 | index = batchInputTarget.data[i, k+1] 187 | if index == corpus.targetVoc.unkIndex: 188 | f_gold.write(batchData[i].targetUnkMap[k] + ' ') 189 | else: 190 | f_gold.write(corpus.targetVoc.tokenList[index].str + ' ') 191 | f_gold.write('\n') 192 | 193 | batchInputTarget = batchInputTarget.cuda() 194 | batchTarget = batchTarget.cuda() 195 | inputTarget = embedding.getBatchedTargetEmbedding(batchInputTarget) 196 | 197 | loss = encdec(inputTarget, lengthsTarget, lengthsSource, (hn, cn), sourceH, batchTarget) 198 | loss = loss.sum() 199 | devPerp += loss.data[0] 200 | 201 | totalTokenCount += tokenCount 202 | 203 | f_trans.close() 204 | f_gold.close() 205 | os.system("./bleu.sh 2> DUMMY") 206 | f_trans = open('./bleu.txt', 'r') 207 | for line in f_trans: 208 | devGleu = float(line.split()[2][0:-1]) 209 | break 210 | f_trans.close() 211 | 212 | devPerp = math.exp(devPerp/totalTokenCount) 213 | print("Dev perp:", devPerp) 214 | print("Dev BLEU:", devGleu) 215 | 216 | embedding.train() 217 | encdec.train() 218 | 219 | if epoch > decayStart and devGleu < prevDevGleu: 220 | print('lr -> ' + str(learningRate*decay)) 221 | learningRate *= decay 222 | 223 | for paramGroup in opt.param_groups: 224 | paramGroup['lr'] = learningRate 225 | 226 | elif devGleu >= bestDevGleu: 227 | bestDevGleu = devGleu 228 | 229 | stateDict = embedding.state_dict() 230 | for elem in stateDict: 231 | stateDict[elem] = stateDict[elem].cpu() 232 | torch.save(stateDict, './params/embedding.bin') 233 | 234 | stateDict = encdec.state_dict() 235 | for elem in stateDict: 236 | stateDict[elem] = stateDict[elem].cpu() 237 | torch.save(stateDict, './params/encdec.bin') 238 | 239 | prevDevGleu = devGleu 240 | 241 | if train: 242 | exit(0) 243 | 244 | embedding.load_state_dict(torch.load('./params/embedding.bin')) 245 | encdec.load_state_dict(torch.load('./params/encdec.bin')) 246 | 247 | embedding.cuda() 248 | encdec.cuda() 249 | 250 | embedding.eval() 251 | encdec.eval() 252 | 253 | f_trans = open('./trans.txt', 'w') 254 | f_gold = open('./gold.txt', 'w') 255 | 256 | devPerp = 0.0 257 | totalTokenCount = 0.0 258 | 259 | for batch in batchListDev: 260 | batchSize = batch[1]-batch[0]+1 261 | batchInputSource, lengthsSource, batchInputTarget, batchTarget, lengthsTarget, tokenCount, batchData = corpus.processBatchInfoNMT(batch, train = False, volatile = True) 262 | 263 | inputSource = embedding.getBatchedSourceEmbedding(batchInputSource) 264 | sourceH, (hn, cn) = encdec.encode(inputSource, lengthsSource) 265 | 266 | if beamSize == 1: 267 | indicesGreedy, lengthsGreedy, attentionIndices = encdec.greedyTrans(corpus.targetVoc.bosIndex, corpus.targetVoc.eosIndex, lengthsSource, embedding.targetEmbedding, sourceH, (hn, cn), maxGenLen = maxLen) 268 | else: 269 | 
indicesGreedy, lengthsGreedy, attentionIndices = encdec.beamSearch(corpus.targetVoc.bosIndex, corpus.targetVoc.eosIndex, lengthsSource, embedding.targetEmbedding, sourceH, (hn, cn), beamSize = beamSize, maxGenLen = maxLen) 270 | indicesGreedy = indicesGreedy.cpu() 271 | 272 | for i in range(batchSize): 273 | for k in range(lengthsGreedy[i]-1): 274 | index = indicesGreedy.data[i, k] 275 | if index == corpus.targetVoc.unkIndex: 276 | index = attentionIndices[i, k] 277 | f_trans.write(batchData[i].sourceOrigStr[index] + ' ') 278 | else: 279 | f_trans.write(corpus.targetVoc.tokenList[index].str + ' ') 280 | f_trans.write('\n') 281 | 282 | for k in range(lengthsTarget[i]-1): 283 | index = batchInputTarget.data[i, k+1] 284 | if index == corpus.targetVoc.unkIndex: 285 | f_gold.write(batchData[i].targetUnkMap[k] + ' ') 286 | else: 287 | f_gold.write(corpus.targetVoc.tokenList[index].str + ' ') 288 | f_gold.write('\n') 289 | 290 | batchInputTarget = batchInputTarget.cuda() 291 | batchTarget = batchTarget.cuda() 292 | inputTarget = embedding.getBatchedTargetEmbedding(batchInputTarget) 293 | 294 | loss = encdec(inputTarget, lengthsTarget, lengthsSource, (hn, cn), sourceH, batchTarget) 295 | loss = loss.sum() 296 | devPerp += loss.data[0] 297 | 298 | totalTokenCount += tokenCount 299 | 300 | f_trans.close() 301 | f_gold.close() 302 | os.system("./bleu.sh 2> DUMMY") 303 | f_trans = open('./bleu.txt', 'r') 304 | for line in f_trans: 305 | devGleu = float(line.split()[2][0:-1]) 306 | break 307 | f_trans.close() 308 | 309 | devPerp = math.exp(devPerp/totalTokenCount) 310 | print("Dev perp:", devPerp) 311 | print("Dev BLEU:", devGleu) 312 | -------------------------------------------------------------------------------- /nmt/code/utils.py: -------------------------------------------------------------------------------- 1 | 2 | def buildBatchList(dataSize, batchSize): 3 | batchList = [] 4 | if dataSize%batchSize == 0: 5 | numBatch = dataSize//batchSize 6 | else: 7 | numBatch = int(dataSize/batchSize)+1 8 | 9 | for i in range(numBatch): 10 | batch = [] 11 | batch.append(i*batchSize) 12 | if i == numBatch-1: 13 | batch.append(dataSize-1) 14 | else: 15 | batch.append((i+1)*batchSize-1) 16 | batchList.append(batch) 17 | 18 | return batchList 19 | -------------------------------------------------------------------------------- /nmt/code_0.4/bleu.sh: -------------------------------------------------------------------------------- 1 | T=./trans.txt 2 | G=./gold.txt 3 | 4 | perl ./tools/multi-bleu.perl ${G} < ${T} > bleu.txt 5 | -------------------------------------------------------------------------------- /nmt/code_0.4/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from torch.autograd import Variable 5 | 6 | class Token: 7 | def __init__(self, str_ = '', count_ = 0): 8 | self.str = str_ 9 | self.count = count_ 10 | 11 | class Vocabulary: 12 | def __init__(self): 13 | self.UNK = 'UNK' # unknown words 14 | self.EOS = '<EOS>' # the end-of-sequence token 15 | self.BOS = '<BOS>' # the beginning-of-sequence token 16 | self.PAD = '<PAD>' # padding 17 | 18 | self.unkIndex = -1 19 | self.eosIndex = -1 20 | self.bosIndex = -1 21 | self.padIndex = -1 22 | 23 | self.tokenIndex = {} 24 | self.tokenList = [] 25 | 26 | def getTokenIndex(self, str): 27 | if str in self.tokenIndex: 28 | return self.tokenIndex[str] 29 | else: 30 | return self.unkIndex 31 | 32 | def add(self, str, count): 33 | if str not in self.tokenIndex: 34 | self.tokenList.append(Token(str, 
count)) 35 | self.tokenIndex[str] = len(self.tokenList)-1 36 | 37 | def size(self): 38 | return len(self.tokenList) 39 | 40 | def outputTokenList(self, fileName): 41 | f = open(fileName, 'w') 42 | for t in self.tokenList: 43 | f.write(t.str + '\n') 44 | f.close() 45 | 46 | 47 | class Data: 48 | def __init__(self, sourceText_, targetText_, sourceOrigStr_ = None, targetUnkMap_ = None): 49 | self.sourceText = sourceText_ 50 | self.sourceOrigStr = sourceOrigStr_ 51 | 52 | self.targetText = targetText_ 53 | self.targetUnkMap = targetUnkMap_ 54 | 55 | 56 | class Corpus: 57 | def __init__(self, sourceVocFile = '', targetVocFile = '', sourceTrainFile = '', sourceOrigTrainFile = '', targetTrainFile = '', sourceDevFile = '', sourceOrigDevFile = '', targetDevFile = '', minFreqSource = 1, minFreqTarget = 1, maxTokenLen = 100000): 58 | self.sourceVoc = Vocabulary() 59 | self.targetVoc = Vocabulary() 60 | 61 | if sourceVocFile == '' or targetVocFile == '': 62 | self.buildVoc(sourceTrainFile, minFreqSource, source = True)#, maxLen = maxTokenLen) 63 | self.buildVoc(targetTrainFile, minFreqTarget, source = False)#, maxLen = maxTokenLen) 64 | else: 65 | self.readVoc(sourceVocFile, source = True) 66 | self.readVoc(targetVocFile, source = False) 67 | 68 | self.trainData = self.buildDataset(sourceTrainFile, sourceOrigTrainFile, targetTrainFile, train = True, maxLen = maxTokenLen) 69 | self.devData = self.buildDataset(sourceDevFile, sourceOrigDevFile, targetDevFile, train = False) 70 | 71 | def buildVoc(self, fileName, minFreq, source, maxLen = 100000): 72 | assert os.path.exists(fileName) 73 | 74 | if source: 75 | voc = self.sourceVoc 76 | else: 77 | voc = self.targetVoc 78 | 79 | with open(fileName, 'r') as f: 80 | tokenCount = {} 81 | unkCount = 0 82 | eosCount = 0 83 | 84 | for line in f: 85 | tokens = line.split() # w1 w2 ... 
\n 86 | 87 | if len(tokens) > maxLen: 88 | continue 89 | 90 | eosCount += 1 91 | 92 | for t in tokens: 93 | if t in tokenCount: 94 | tokenCount[t] += 1 95 | else: 96 | tokenCount[t] = 1 97 | 98 | tokenList = sorted(tokenCount.items(), key = lambda x: -x[1]) # sort by value 99 | 100 | for t in tokenList: 101 | if t[1] >= minFreq: 102 | voc.add(t[0], t[1]) 103 | else: 104 | unkCount += t[1] 105 | 106 | ''' 107 | Add special tokens 108 | ''' 109 | voc.add(voc.UNK, unkCount) 110 | voc.add(voc.BOS, eosCount) 111 | voc.add(voc.EOS, eosCount) 112 | voc.add(voc.PAD, 0) 113 | 114 | voc.unkIndex = voc.getTokenIndex(voc.UNK) 115 | voc.bosIndex = voc.getTokenIndex(voc.BOS) 116 | voc.eosIndex = voc.getTokenIndex(voc.EOS) 117 | voc.padIndex = voc.getTokenIndex(voc.PAD) 118 | 119 | def readVoc(self, vocFile, source): 120 | assert os.path.exists(vocFile) 121 | 122 | if source: 123 | voc = self.sourceVoc 124 | else: 125 | voc = self.targetVoc 126 | 127 | with open(vocFile, 'r') as f: 128 | for line in f: 129 | line = line.rstrip() 130 | 131 | voc.add(line, 0) 132 | 133 | ''' 134 | Add special tokens 135 | ''' 136 | voc.add(voc.UNK, 0) 137 | voc.add(voc.BOS, 0) 138 | voc.add(voc.EOS, 0) 139 | voc.add(voc.PAD, 0) 140 | 141 | voc.unkIndex = voc.getTokenIndex(voc.UNK) 142 | voc.bosIndex = voc.getTokenIndex(voc.BOS) 143 | voc.eosIndex = voc.getTokenIndex(voc.EOS) 144 | voc.padIndex = voc.getTokenIndex(voc.PAD) 145 | 146 | 147 | def buildDataset(self, sourceFileName, sourceOrigFileName, targetFileName, train, maxLen = 100000): 148 | assert os.path.exists(sourceFileName) and os.path.exists(targetFileName) 149 | assert os.path.exists(sourceOrigFileName) 150 | 151 | with open(sourceFileName, 'r') as fs, open(sourceOrigFileName, 'r') as fsOrig, open(targetFileName, 'r') as ft: 152 | dataset = [] 153 | 154 | for (lineSource, lineSourceOrig, lineTarget) in zip(fs, fsOrig, ft): 155 | tokensSource = lineSource.split() # w1 w2 ... \n 156 | if train: 157 | tokensSourceOrig = None 158 | else: 159 | tokensSourceOrig = lineSourceOrig.split() # w1 w2 ... \n 160 | tokensTarget = lineTarget.split() # w1 w2 ... 
\n 161 | 162 | if len(tokensSource) > maxLen or len(tokensTarget) > maxLen or len(tokensSource) == 0 or len(tokensTarget) == 0: 163 | continue 164 | 165 | tokenIndicesSource = torch.LongTensor(len(tokensSource)) 166 | tokenIndicesTarget = torch.LongTensor(len(tokensTarget)) 167 | unkMapTarget = {} 168 | 169 | for i in range(len(tokensSource)): 170 | t = tokensSource[i] 171 | tokenIndicesSource[i] = self.sourceVoc.getTokenIndex(t) 172 | 173 | for i in range(len(tokensTarget)): 174 | t = tokensTarget[i] 175 | tokenIndicesTarget[i] = self.targetVoc.getTokenIndex(t) 176 | if tokenIndicesTarget[i] == self.targetVoc.unkIndex: 177 | unkMapTarget[i] = t 178 | 179 | dataset.append(Data(tokenIndicesSource, tokenIndicesTarget, tokensSourceOrig, unkMapTarget)) 180 | 181 | return dataset 182 | 183 | 184 | def processBatchInfoNMT(self, batch, train, device): 185 | begin = batch[0] 186 | end = batch[1] 187 | batchSize = end-begin+1 188 | 189 | ''' 190 | Process source info 191 | ''' 192 | if train: 193 | data = sorted(self.trainData[begin:end+1], key = lambda x: -len(x.sourceText)) 194 | else: 195 | data = sorted(self.devData[begin:end+1], key = lambda x: -len(x.sourceText)) 196 | 197 | maxLen = len(data[0].sourceText) 198 | batchInputSource = torch.LongTensor(batchSize, maxLen) 199 | batchInputSource.fill_(self.sourceVoc.padIndex) 200 | lengthsSource = [] 201 | 202 | for i in range(batchSize): 203 | l = len(data[i].sourceText) 204 | lengthsSource.append(l) 205 | 206 | for j in range(l): 207 | batchInputSource[i, j] = data[i].sourceText[j] 208 | 209 | batchInputSource = batchInputSource.to(device) 210 | 211 | ''' 212 | Process target info 213 | ''' 214 | data_ = sorted(data, key = lambda x: -len(x.targetText)) 215 | 216 | maxLen = len(data_[0].targetText)+1 # for BOS or EOS 217 | batchInputTarget = torch.LongTensor(batchSize, maxLen) 218 | batchInputTarget.fill_(self.targetVoc.padIndex) 219 | lengthsTarget = [] 220 | batchTarget = torch.LongTensor(maxLen*batchSize).fill_(-1) 221 | targetIndexOffset = 0 222 | tokenCount = 0.0 223 | 224 | for i in range(batchSize): 225 | l = len(data[i].targetText) 226 | lengthsTarget.append(l+1) 227 | batchInputTarget[i, 0] = self.targetVoc.bosIndex 228 | for j in range(l): 229 | batchInputTarget[i, j+1] = data[i].targetText[j] 230 | batchTarget[targetIndexOffset+j] = data[i].targetText[j] 231 | batchTarget[targetIndexOffset+l] = self.targetVoc.eosIndex 232 | targetIndexOffset += maxLen 233 | tokenCount += (l+1) 234 | 235 | return batchInputSource, lengthsSource, batchInputTarget, batchTarget, lengthsTarget, tokenCount, data 236 | -------------------------------------------------------------------------------- /nmt/code_0.4/data/sample.en.voc: -------------------------------------------------------------------------------- 1 | this 2 | is 3 | a 4 | new 5 | book 6 | . 7 | dictionary 8 | that 9 | an 10 | important 11 | ? 12 | yes 13 | , 14 | it 15 | no 16 | n't 17 | not 18 | long 19 | -------------------------------------------------------------------------------- /nmt/code_0.4/data/sample.ja.voc: -------------------------------------------------------------------------------- 1 | これ 2 | は 3 | 新し 4 | い 5 | 本 6 | で 7 | す 8 | 。 9 | この 10 | 辞書 11 | あれ 12 | 大切 13 | か 14 | ? 
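The two sample `.voc` files above hold one surface token per line; when they are passed to `Corpus`, `readVoc` turns each line into an index and appends the special tokens afterwards. As a minimal sketch of the resulting lookup, assuming it is run from `nmt/code_0.4/` where `data/sample.en.voc` ships (the special-token strings mirror the `Vocabulary` class in `data.py` above):

# Sketch of readVoc's effect: each line of the .voc file becomes an index,
# and the special tokens are appended after the file contents.
tokenIndex = {}
with open('./data/sample.en.voc', 'r') as f:
    for line in f:
        token = line.rstrip()
        if token not in tokenIndex:
            tokenIndex[token] = len(tokenIndex)

for special in ['UNK', '<BOS>', '<EOS>', '<PAD>']:
    tokenIndex[special] = len(tokenIndex)

unkIndex = tokenIndex['UNK']
print([tokenIndex.get(w, unkIndex) for w in 'this is a new book .'.split()])
# In-vocabulary words map to their 0-based line numbers; anything else maps to UNK.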
15 | -------------------------------------------------------------------------------- /nmt/code_0.4/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import math 6 | 7 | class Embedding(nn.Module): 8 | 9 | def __init__(self, sourceEmbedDim, targetEmbedDim, sourceVocSize, targetVocSize): 10 | super(Embedding, self).__init__() 11 | 12 | self.sourceEmbedding = nn.Embedding(sourceVocSize, sourceEmbedDim) 13 | self.targetEmbedding = nn.Embedding(targetVocSize, targetEmbedDim) 14 | 15 | self.initWeights() 16 | 17 | def initWeights(self): 18 | initScale = 0.1 19 | 20 | self.sourceEmbedding.weight.data.uniform_(-initScale, initScale) 21 | self.targetEmbedding.weight.data.uniform_(-initScale, initScale) 22 | 23 | def getBatchedSourceEmbedding(self, batchInput): 24 | return self.sourceEmbedding(batchInput) 25 | 26 | def getBatchedTargetEmbedding(self, batchInput): 27 | return self.targetEmbedding(batchInput) 28 | 29 | 30 | class WordPredictor(nn.Module): 31 | 32 | def __init__(self, inputDim, outputDim, ignoreIndex = -1): 33 | super(WordPredictor, self).__init__() 34 | 35 | self.softmaxLayer = nn.Linear(inputDim, outputDim) 36 | self.loss = nn.CrossEntropyLoss(reduction = 'sum', ignore_index = ignoreIndex) 37 | 38 | self.initWeight() 39 | 40 | def initWeight(self): 41 | self.softmaxLayer.weight.data.zero_() 42 | self.softmaxLayer.bias.data.zero_() 43 | 44 | def forward(self, input, target = None): 45 | output = self.softmaxLayer(input) 46 | if target is not None: 47 | return self.loss(output, target) 48 | else: 49 | return output 50 | 51 | 52 | class DecCand: 53 | def __init__(self, score_ = 0.0, fin_ = False, sentence_ = [], attenIndex_ = []): 54 | self.score = score_ 55 | self.fin = fin_ 56 | self.sentence = sentence_ 57 | self.attenIndex = attenIndex_ 58 | 59 | 60 | class EncDec(nn.Module): 61 | 62 | def __init__(self, sourceEmbedDim, targetEmbedDim, hiddenDim, targetVocSize, dropoutRate = 0.2, numLayers = 1): 63 | super(EncDec, self).__init__() 64 | 65 | self.numLayers = numLayers 66 | self.dropout = nn.Dropout(p = dropoutRate) 67 | 68 | self.encoder = nn.LSTM(input_size = sourceEmbedDim, hidden_size = hiddenDim, 69 | num_layers = self.numLayers, dropout = 0.0, bidirectional = True) 70 | 71 | self.decoder = nn.LSTM(input_size = targetEmbedDim + hiddenDim, hidden_size = hiddenDim, 72 | num_layers = self.numLayers, dropout = 0.0, bidirectional = False, batch_first = True) 73 | 74 | self.attentionLayer = nn.Linear(2*hiddenDim, hiddenDim, bias = False) 75 | self.finalHiddenLayer = nn.Linear(3*hiddenDim, targetEmbedDim) 76 | self.finalHiddenAct = nn.Tanh() 77 | 78 | self.wordPredictor = WordPredictor(targetEmbedDim, targetVocSize) 79 | 80 | self.targetEmbedDim = targetEmbedDim 81 | self.hiddenDim = hiddenDim 82 | 83 | self.initWeight() 84 | 85 | def initWeight(self): 86 | initScale = 0.1 87 | 88 | self.encoder.weight_ih_l0.data.uniform_(-initScale, initScale) 89 | self.encoder.weight_hh_l0.data.uniform_(-initScale, initScale) 90 | self.encoder.bias_ih_l0.data.zero_() 91 | self.encoder.bias_hh_l0.data.zero_() 92 | self.encoder.bias_hh_l0.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 93 | 94 | self.encoder.weight_ih_l0_reverse.data.uniform_(-initScale, initScale) 95 | self.encoder.weight_hh_l0_reverse.data.uniform_(-initScale, initScale) 96 | self.encoder.bias_ih_l0_reverse.data.zero_() 97 | self.encoder.bias_hh_l0_reverse.data.zero_() 98 | 
self.encoder.bias_hh_l0_reverse.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 99 | 100 | self.decoder.weight_ih_l0.data.uniform_(-initScale, initScale) 101 | self.decoder.weight_hh_l0.data.uniform_(-initScale, initScale) 102 | self.decoder.bias_ih_l0.data.zero_() 103 | self.decoder.bias_hh_l0.data.zero_() 104 | self.decoder.bias_hh_l0.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 105 | 106 | if self.numLayers == 2: 107 | self.encoder.weight_ih_l1.data.uniform_(-initScale, initScale) 108 | self.encoder.weight_hh_l1.data.uniform_(-initScale, initScale) 109 | self.encoder.bias_ih_l1.data.zero_() 110 | self.encoder.bias_hh_l1.data.zero_() 111 | self.encoder.bias_hh_l1.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 112 | 113 | self.encoder.weight_ih_l1_reverse.data.uniform_(-initScale, initScale) 114 | self.encoder.weight_hh_l1_reverse.data.uniform_(-initScale, initScale) 115 | self.encoder.bias_ih_l1_reverse.data.zero_() 116 | self.encoder.bias_hh_l1_reverse.data.zero_() 117 | self.encoder.bias_hh_l1_reverse.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 118 | 119 | self.decoder.weight_ih_l1.data.uniform_(-initScale, initScale) 120 | self.decoder.weight_hh_l1.data.uniform_(-initScale, initScale) 121 | self.decoder.bias_ih_l1.data.zero_() 122 | self.decoder.bias_hh_l1.data.zero_() 123 | self.decoder.bias_hh_l1.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1 124 | 125 | self.attentionLayer.weight.data.zero_() 126 | 127 | self.finalHiddenLayer.weight.data.uniform_(-initScale, initScale) 128 | self.finalHiddenLayer.bias.data.zero_() 129 | 130 | def encode(self, inputSource, lengthsSource): 131 | packedInput = nn.utils.rnn.pack_padded_sequence(inputSource, lengthsSource, batch_first = True) 132 | 133 | h, (hn, cn) = self.encoder(packedInput) # hn, ch: (layers*direction, B, Ds) 134 | h, _ = nn.utils.rnn.pad_packed_sequence(h, batch_first = True) 135 | 136 | if self.numLayers == 1: 137 | hn = (hn[0]+hn[1]).unsqueeze(0) 138 | cn = (cn[0]+cn[1]).unsqueeze(0) 139 | else: 140 | hn0 = (hn[0]+hn[1]).unsqueeze(0) 141 | cn0 = (cn[0]+cn[1]).unsqueeze(0) 142 | hn1 = (hn[2]+hn[3]).unsqueeze(0) 143 | cn1 = (cn[2]+cn[3]).unsqueeze(0) 144 | 145 | hn = torch.cat((hn0, hn1), dim = 0) 146 | cn = torch.cat((cn0, cn1), dim = 0) 147 | 148 | return h, (hn, cn) 149 | 150 | def forward(self, inputTarget, lengthsTarget, lengthsSource, hidden0Target, sourceH, target = None): 151 | batchSize = sourceH.size(0) 152 | maxLen = lengthsTarget[0] 153 | 154 | for i in range(batchSize): 155 | maxLen = max(maxLen, lengthsTarget[i]) 156 | 157 | finalHidden = inputTarget.new(batchSize, maxLen, self.targetEmbedDim) 158 | prevFinalHidden = inputTarget.new(batchSize, 1, self.targetEmbedDim).zero_() 159 | 160 | newShape = sourceH.size(0), sourceH.size(1), self.hiddenDim # (B, Ls, Dt) 161 | sourceHtrans = sourceH.contiguous().view(sourceH.size(0)*sourceH.size(1), sourceH.size(2)) # (B*Ls, Ds) 162 | sourceHtrans = self.attentionLayer(sourceHtrans) # (B*Ls, Dt) 163 | sourceHtrans = sourceHtrans.view(*newShape).transpose(1, 2) # (B, Dt, Ls) 164 | 165 | for i in range(maxLen): 166 | hi, hidden0Target = self.decoder(torch.cat((inputTarget[:, i, :].unsqueeze(1), prevFinalHidden), dim = 2), hidden0Target) # hi: (B, 1, Dt) 167 | 168 | if self.numLayers != 1: # residual connection for this decoder 169 | hi = hidden0Target[0][0]+hidden0Target[0][1] 170 | hi = hi.unsqueeze(1) 171 | 172 | attentionScores_ = torch.bmm(hi, sourceHtrans).transpose(1, 2) # 
(B, Ls, 1) 173 | 174 | attentionScores = attentionScores_.new(attentionScores_.size()).fill_(-1024.0) 175 | for j in range(batchSize): 176 | attentionScores[j, :lengthsSource[j]].zero_() 177 | attentionScores += attentionScores_ 178 | 179 | attentionScores = attentionScores.transpose(1, 2) # (B, 1, Ls) 180 | attentionScores = F.softmax(attentionScores, dim = 2) 181 | 182 | contextVec = torch.bmm(attentionScores, sourceH) # (B, 1, Ds) 183 | 184 | prevFinalHidden = torch.cat((hi, contextVec), dim = 2) # (B, 1, Ds+Dt) 185 | prevFinalHidden = self.dropout(prevFinalHidden) 186 | prevFinalHidden = self.finalHiddenLayer(prevFinalHidden) 187 | prevFinalHidden = self.finalHiddenAct(prevFinalHidden) 188 | prevFinalHidden = self.dropout(prevFinalHidden) 189 | 190 | finalHidden[:, i, :] = prevFinalHidden.squeeze(1) 191 | 192 | finalHidden = finalHidden.contiguous().view(finalHidden.size(0)*finalHidden.size(1), finalHidden.size(2)) 193 | output = self.wordPredictor(finalHidden, target) 194 | 195 | return output 196 | 197 | def greedyTrans(self, bosIndex, eosIndex, lengthsSource, targetEmbedding, sourceH, hidden0Target, device, maxGenLen = 100): 198 | batchSize = sourceH.size(0) 199 | i = 1 200 | eosCount = 0 201 | targetWordIndices = torch.LongTensor(batchSize, maxGenLen).fill_(bosIndex).to(device) 202 | attentionIndices = targetWordIndices.new(targetWordIndices.size()) 203 | targetWordLengths = torch.LongTensor(batchSize).fill_(0) 204 | fin = [False]*batchSize 205 | 206 | newShape = sourceH.size(0), sourceH.size(1), hidden0Target[0].size(2) # (B, Ls, Dt) 207 | sourceHtrans = sourceH.contiguous().view(sourceH.size(0)*sourceH.size(1), sourceH.size(2)) # (B*Ls, Ds) 208 | sourceHtrans = self.attentionLayer(sourceHtrans) # (B*Ls, Dt) 209 | sourceHtrans = sourceHtrans.view(*newShape).transpose(1, 2) # (B, Dt, Ls) 210 | 211 | prevFinalHidden = sourceH.new(batchSize, 1, self.targetEmbedDim).zero_() 212 | 213 | while (i < maxGenLen) and (eosCount < batchSize): 214 | inputTarget = targetEmbedding(targetWordIndices[:, i-1].unsqueeze(1)) 215 | hi, hidden0Target = self.decoder(torch.cat((inputTarget, prevFinalHidden), dim = 2), hidden0Target) # hi: (B, 1, Dt) 216 | 217 | if self.numLayers != 1: 218 | hi = hidden0Target[0][0]+hidden0Target[0][1] 219 | hi = hi.unsqueeze(1) 220 | 221 | attentionScores_ = torch.bmm(hi, sourceHtrans).transpose(1, 2) # (B, Ls, 1) 222 | 223 | attentionScores = attentionScores_.new(attentionScores_.size()).fill_(-1024.0) 224 | for j in range(batchSize): 225 | attentionScores[j, :lengthsSource[j]].zero_() 226 | attentionScores += attentionScores_ 227 | 228 | attentionScores = attentionScores.transpose(1, 2) # (B, 1, Ls) 229 | attentionScores = F.softmax(attentionScores, dim = 2) 230 | 231 | attnProb, attnIndex = torch.max(attentionScores, dim = 2) 232 | for j in range(batchSize): 233 | attentionIndices[j, i-1] = attnIndex.data[j, 0] 234 | 235 | contextVec = torch.bmm(attentionScores, sourceH) # (B, 1, Ds) 236 | finalHidden = torch.cat((hi, contextVec), 2) # (B, 1, Dt+Ds) 237 | finalHidden = self.dropout(finalHidden) 238 | finalHidden = self.finalHiddenLayer(finalHidden) 239 | finalHidden = self.finalHiddenAct(finalHidden) 240 | finalHidden = self.dropout(finalHidden) 241 | prevFinalHidden = finalHidden # (B, 1, Dt) 242 | 243 | finalHidden = finalHidden.contiguous().view(finalHidden.size(0)*finalHidden.size(1), finalHidden.size(2)) 244 | output = self.wordPredictor(finalHidden) 245 | 246 | maxProb, sampledIndex = torch.max(output, dim = 1) 247 | targetWordIndices[:, i].copy_(sampledIndex) 
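# Note on the bookkeeping below: a sentence's length counter stops growing once
# it has produced EOS, and generation halts when every sentence in the batch is
# finished (eosCount == batchSize) or when maxGenLen is reached.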
248 | 249 | for j in range(batchSize): 250 | if not fin[j] and targetWordIndices[j, i-1].item() != eosIndex: 251 | targetWordLengths[j] += 1 252 | if sampledIndex[j].item() == eosIndex: 253 | eosCount += 1 254 | fin[j] = True 255 | 256 | i += 1 257 | 258 | targetWordIndices = targetWordIndices[:, 1:i] # i-1: no EOS 259 | 260 | return targetWordIndices, list(targetWordLengths), attentionIndices 261 | 262 | def beamSearch(self, bosIndex, eosIndex, lengthsSource, targetEmbedding, sourceH, hidden0Target, device, beamSize = 1, penalty = 0.75, maxGenLen = 100): 263 | batchSize = sourceH.size(0) 264 | 265 | targetWordIndices = torch.LongTensor(batchSize, maxGenLen).fill_(bosIndex).to(device) 266 | attentionIndices = targetWordIndices.new(targetWordIndices.size()) 267 | targetWordLengths = torch.LongTensor(batchSize).fill_(0) 268 | 269 | newShape = sourceH.size(0), sourceH.size(1), hidden0Target[0].size(2) # (B, Ls, Dt) 270 | sourceHtrans = sourceH.contiguous().view(sourceH.size(0)*sourceH.size(1), sourceH.size(2)) # (B*Ls, Ds) 271 | sourceHtrans = self.attentionLayer(sourceHtrans) # (B*Ls, Dt) 272 | sourceHtrans = sourceHtrans.view(*newShape).transpose(1, 2) # (B, Dt, Ls) 273 | 274 | sourceHtrans_ = sourceHtrans.new(beamSize, sourceHtrans.size(1), sourceHtrans.size(2)) 275 | sourceH_ = sourceH.new(beamSize, sourceH.size(1), sourceH.size(2)) 276 | prevFinalHidden = sourceH.new(beamSize, 1, self.targetEmbedDim).zero_() 277 | 278 | sampledIndex = torch.LongTensor(beamSize).zero_() 279 | 280 | h0 = hidden0Target[0] 281 | c0 = hidden0Target[1] 282 | h0_ = h0.data.new(h0.size(0), beamSize, h0.size(2)) 283 | c0_ = c0.data.new(c0.size(0), beamSize, c0.size(2)) 284 | 285 | for dataIndex in range(batchSize): 286 | i = 1 287 | prevFinalHidden.zero_() 288 | sourceHtrans_.zero_() 289 | sourceHtrans_ += sourceHtrans[dataIndex] 290 | sourceH_.zero_() 291 | sourceH_ += sourceH[dataIndex] 292 | h0_.zero_() 293 | c0_.zero_() 294 | h0_ += h0[:, dataIndex, :].unsqueeze(1) 295 | c0_ += c0[:, dataIndex, :].unsqueeze(1) 296 | hidden0Target_ = (h0_, c0_) 297 | 298 | cand = [] 299 | for j in range(beamSize): 300 | cand.append(DecCand(sentence_ = [bosIndex])) 301 | 302 | while i < maxGenLen and not cand[0].fin: 303 | index = [] 304 | for j in range(beamSize): 305 | index.append([cand[j].sentence[-1]]) 306 | index = torch.LongTensor(index).to(device) 307 | inputTarget = targetEmbedding(index) 308 | 309 | hi, hidden0Target_ = self.decoder(torch.cat((inputTarget, prevFinalHidden), dim = 2), hidden0Target_) # hi: (B, 1, Dt) 310 | 311 | if self.numLayers != 1: 312 | hi = hidden0Target_[0][0]+hidden0Target_[0][1] 313 | hi = hi.unsqueeze(1) 314 | 315 | attentionScores_ = torch.bmm(hi, sourceHtrans_).transpose(1, 2) # (B, Ls, 1) 316 | 317 | attentionScores = attentionScores_.new(attentionScores_.size()).fill_(-1024.0) 318 | attentionScores[:, :lengthsSource[dataIndex]].zero_() 319 | attentionScores += attentionScores_ 320 | 321 | attentionScores = attentionScores.transpose(1, 2) # (B, 1, Ls) 322 | attentionScores = F.softmax(attentionScores, dim = 2) 323 | 324 | attnProb, attnIndex = torch.max(attentionScores, dim = 2) 325 | 326 | contextVec = torch.bmm(attentionScores, sourceH_) # (B, 1, Ds) 327 | finalHidden = torch.cat((hi, contextVec), 2) # (B, 1, Dt+Ds) 328 | finalHidden = self.finalHiddenLayer(finalHidden) 329 | finalHidden = self.finalHiddenAct(finalHidden) 330 | prevFinalHidden = finalHidden # (B, 1, Dt) 331 | 332 | finalHidden = finalHidden.contiguous().view(finalHidden.size(0)*finalHidden.size(1), finalHidden.size(2)) 
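# Note on the beam update below: the log-probabilities get a constant per-token
# bonus (penalty), topk with k = 1 is applied beamSize times, and each selected
# (row, col) cell is overwritten with -1024 so it cannot be picked again; at the
# first step a whole column is masked because every row still holds an identical
# copy of the BOS hypothesis. Finished hypotheses keep their accumulated score.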
333 | output = self.wordPredictor(finalHidden) 334 | 335 | output = F.log_softmax(output, dim = 1)+penalty 336 | 337 | for j in range(beamSize): 338 | if cand[j].fin: 339 | output.data[j].fill_(cand[j].score) 340 | else: 341 | output.data[j] += cand[j].score 342 | 343 | updatedCand = [] 344 | updatedPrevFinalHidden = prevFinalHidden.new(prevFinalHidden.size()).zero_() 345 | updatedH0 = h0_.new(h0_.size()).zero_() 346 | updatedC0 = c0_.new(c0_.size()).zero_() 347 | 348 | for j in range(beamSize): 349 | maxScore, maxIndex = torch.topk(output.view(output.size(0)*output.size(1)), k = 1) 350 | 351 | row = maxIndex[0].item() // output.size(1) 352 | col = maxIndex[0].item() % output.size(1) 353 | score = maxScore[0].item() 354 | sampledIndex[j] = col 355 | 356 | if cand[row].fin: 357 | updatedCand.append(DecCand(score, True, cand[row].sentence, cand[row].attenIndex)) 358 | output.data[row].fill_(-1024.0) 359 | continue 360 | 361 | updatedCand.append(DecCand(score, False, cand[row].sentence+[], cand[row].attenIndex+[attnIndex[row, 0].item()])) 362 | updatedPrevFinalHidden[j] = prevFinalHidden[row] 363 | updatedH0[:, j, :] = hidden0Target_[0][:, row, :].unsqueeze(1) 364 | updatedC0[:, j, :] = hidden0Target_[1][:, row, :].unsqueeze(1) 365 | 366 | if i == 1: 367 | output[:, col].fill_(-1024.0) 368 | else: 369 | output[row, col] = -1024.0 370 | 371 | for j in range(beamSize): 372 | if updatedCand[j].fin: 373 | continue 374 | 375 | if sampledIndex[j] == eosIndex: 376 | updatedCand[j].fin = True 377 | 378 | updatedCand[j].sentence.append(sampledIndex[j].item()) 379 | 380 | #cand = sorted(updatedCand, key = lambda x: -x.score/len(x.sentence)) 381 | cand = updatedCand 382 | prevFinalHidden = updatedPrevFinalHidden 383 | hidden0Target_ = (updatedH0, updatedC0) 384 | i += 1 385 | 386 | targetWordLengths[dataIndex] = len(cand[0].sentence)-1 387 | for j in range(targetWordLengths[dataIndex]): 388 | targetWordIndices[dataIndex, j] = cand[0].sentence[j] 389 | attentionIndices[dataIndex, j] = cand[0].attenIndex[j] 390 | 391 | return targetWordIndices[:, 1:], list(targetWordLengths), attentionIndices 392 | -------------------------------------------------------------------------------- /nmt/code_0.4/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir params 2 | mkdir tools 3 | cd tools 4 | wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl 5 | cd .. 6 | mkdir data 7 | cd ./data 8 | wget https://raw.githubusercontent.com/hassyGo/N3LP/master/corpus/sample.en 9 | wget https://raw.githubusercontent.com/hassyGo/N3LP/master/corpus/sample.ja 10 | wget https://raw.githubusercontent.com/hassyGo/N3LP/master/corpus/sample.en.dev 11 | wget https://raw.githubusercontent.com/hassyGo/N3LP/master/corpus/sample.ja.dev 12 | cd .. 
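setup.sh above fetches Moses' multi-bleu.perl, which bleu.sh runs over trans.txt and gold.txt to produce a one-line summary in bleu.txt; train.py below recovers the dev BLEU score from that line. A small sketch of the parsing step, assuming the usual multi-bleu.perl output format (the numbers here are made up for illustration):

# multi-bleu.perl writes a summary line such as:
#   BLEU = 26.27, 60.5/32.5/19.4/12.1 (BP=1.000, ratio=1.000, hyp_len=12, ref_len=12)
# train.py takes the third whitespace-separated field and strips the trailing comma.
line = 'BLEU = 26.27, 60.5/32.5/19.4/12.1 (BP=1.000, ratio=1.000, hyp_len=12, ref_len=12)'
devGleu = float(line.split()[2][0:-1])  # '26.27,' -> 26.27
print(devGleu)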
13 | -------------------------------------------------------------------------------- /nmt/code_0.4/train.py: -------------------------------------------------------------------------------- 1 | from data import Corpus 2 | from model import Embedding 3 | from model import EncDec 4 | import utils 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | 11 | import random 12 | import math 13 | import os 14 | import time 15 | import sys 16 | 17 | # En-Ja dev 18 | sourceDevFile = './data/sample.en.dev' 19 | sourceOrigDevFile = './data/sample.en.dev' 20 | targetDevFile = './data/sample.ja.dev' 21 | 22 | # En-Ja train 23 | sourceTrainFile = './data/sample.en' 24 | sourceOrigTrainFile = './data/sample.en' 25 | targetTrainFile = './data/sample.ja' 26 | 27 | # Vocabulary file (if applicable) 28 | sourceVocFile = './data/sample.en.voc' 29 | targetVocFile = './data/sample.ja.voc' 30 | 31 | minFreqSource = 2 # use source-side words which appear at least N times in the training data 32 | minFreqTarget = 2 # use target-side words which appear at least N times in the training data 33 | hiddenDim = 128 # dimensionality of hidden states and embeddings 34 | decay = 0.5 # learning rate decay rate for SGD 35 | gradClip = 1.0 # clipping value for gradient-norm clipping 36 | dropoutRate = 0.2 # dropout rate for output MLP 37 | numLayers = 1 # number of LSTM layers (1 or 2) 38 | 39 | maxLen = 100 # use sentence pairs whose maximum lengths are 100 in both source and target sides 40 | maxEpoch = 20 41 | decayStart = 5 42 | 43 | sourceEmbedDim = hiddenDim 44 | targetEmbedDim = hiddenDim 45 | 46 | batchSize = 16 # "128" is typically used 47 | learningRate = 1.0 48 | momentumRate = 0.75 49 | 50 | gpuId = [0, 1, 2, 3] 51 | seed = int(sys.argv[1]) 52 | 53 | device = torch.device('cuda:'+str(gpuId[0])) 54 | cpu = torch.device('cpu') 55 | 56 | weightDecay = 1.0e-06 57 | 58 | train = True 59 | 60 | beamSize = 10 61 | 62 | if not train: 63 | batchSize = 1 64 | 65 | torch.set_num_threads(1) 66 | 67 | torch.manual_seed(seed) 68 | random.seed(seed) 69 | torch.cuda.set_device(gpuId[0]) 70 | torch.cuda.manual_seed(seed) 71 | 72 | corpus = Corpus(sourceVocFile, targetVocFile, 73 | sourceTrainFile, sourceOrigTrainFile, targetTrainFile, 74 | sourceDevFile, sourceOrigDevFile, targetDevFile, 75 | minFreqSource, minFreqTarget, maxLen) 76 | 77 | print('Source vocabulary size: '+str(corpus.sourceVoc.size())) 78 | print('Target vocabulary size: '+str(corpus.targetVoc.size())) 79 | print() 80 | print('# of training samples: '+str(len(corpus.trainData))) 81 | print('# of develop samples: '+str(len(corpus.devData))) 82 | print('SEED: ', str(seed)) 83 | print() 84 | 85 | embedding = Embedding(sourceEmbedDim, targetEmbedDim, corpus.sourceVoc.size(), corpus.targetVoc.size()) 86 | encdec = EncDec(sourceEmbedDim, targetEmbedDim, hiddenDim, corpus.targetVoc.size(), dropoutRate = dropoutRate, numLayers = numLayers) 87 | 88 | encdec.wordPredictor.softmaxLayer.weight = embedding.targetEmbedding.weight 89 | encdec.wordPredictor = nn.DataParallel(encdec.wordPredictor, gpuId) 90 | 91 | if train: 92 | embedding.to(device) 93 | encdec.to(device) 94 | 95 | batchListTrain = utils.buildBatchList(len(corpus.trainData), batchSize) 96 | batchListDev = utils.buildBatchList(len(corpus.devData), batchSize) 97 | 98 | withoutWeightDecay = [] 99 | withWeightDecay = [] 100 | for name, param in list(embedding.named_parameters())+list(encdec.named_parameters()): 101 | if 'bias' in name or 'Embedding' in 
name: 102 | withoutWeightDecay += [param] 103 | elif 'softmax' not in name: 104 | withWeightDecay += [param] 105 | optParams = [{'params': withWeightDecay, 'weight_decay': weightDecay}, 106 | {'params': withoutWeightDecay, 'weight_decay': 0.0}] 107 | totalParamsNMT = withoutWeightDecay+withWeightDecay 108 | 109 | opt = optim.SGD(optParams, momentum = momentumRate, lr = learningRate) 110 | 111 | bestDevGleu = -1.0 112 | prevDevGleu = -1.0 113 | 114 | for epoch in range(maxEpoch): 115 | if not train: 116 | break 117 | 118 | batchProcessed = 0 119 | totalLoss = 0.0 120 | totalTrainTokenCount = 0.0 121 | 122 | print('--- Epoch ' + str(epoch+1)) 123 | startTime = time.time() 124 | 125 | random.shuffle(corpus.trainData) 126 | 127 | embedding.train() 128 | encdec.train() 129 | 130 | for batch in batchListTrain: 131 | print('\r', end = '') 132 | print(batchProcessed+1, '/', len(batchListTrain), end = '') 133 | 134 | batchSize = batch[1]-batch[0]+1 135 | 136 | opt.zero_grad() 137 | 138 | batchInputSource, lengthsSource, batchInputTarget, batchTarget, lengthsTarget, tokenCount, batchData = corpus.processBatchInfoNMT(batch, train = True, device = device) 139 | 140 | inputSource = embedding.getBatchedSourceEmbedding(batchInputSource) 141 | sourceH, (hn, cn) = encdec.encode(inputSource, lengthsSource) 142 | 143 | batchInputTarget = batchInputTarget.to(device) 144 | batchTarget = batchTarget.to(device) 145 | inputTarget = embedding.getBatchedTargetEmbedding(batchInputTarget) 146 | 147 | loss = encdec(inputTarget, lengthsTarget, lengthsSource, (hn, cn), sourceH, batchTarget) 148 | loss = loss.sum() 149 | 150 | totalLoss += loss.item() 151 | totalTrainTokenCount += tokenCount 152 | 153 | loss /= batchSize 154 | loss.backward() 155 | nn.utils.clip_grad_norm_(totalParamsNMT, gradClip) 156 | opt.step() 157 | 158 | batchProcessed += 1 159 | if batchProcessed == len(batchListTrain)//2 or batchProcessed == len(batchListTrain): 160 | devPerp = 0.0 161 | devGleu = 0.0 162 | totalTokenCount = 0.0 163 | 164 | embedding.eval() 165 | encdec.eval() 166 | torch.set_grad_enabled(False) 167 | 168 | print() 169 | print('Training time: ' + str(time.time()-startTime) + ' sec') 170 | print('Train perp: ' + str(math.exp(totalLoss/totalTrainTokenCount))) 171 | 172 | f_trans = open('./trans.txt', 'w') 173 | f_gold = open('./gold.txt', 'w') 174 | 175 | for batch in batchListDev: 176 | batchSize = batch[1]-batch[0]+1 177 | batchInputSource, lengthsSource, batchInputTarget, batchTarget, lengthsTarget, tokenCount, batchData = corpus.processBatchInfoNMT(batch, train = False, device = device) 178 | 179 | inputSource = embedding.getBatchedSourceEmbedding(batchInputSource) 180 | sourceH, (hn, cn) = encdec.encode(inputSource, lengthsSource) 181 | 182 | indicesGreedy, lengthsGreedy, attentionIndices = encdec.greedyTrans(corpus.targetVoc.bosIndex, corpus.targetVoc.eosIndex, lengthsSource, embedding.targetEmbedding, sourceH, (hn, cn), device, maxGenLen = maxLen) 183 | indicesGreedy = indicesGreedy.to(cpu) 184 | 185 | for i in range(batchSize): 186 | for k in range(lengthsGreedy[i]-1): 187 | index = indicesGreedy[i, k].item() 188 | if index == corpus.targetVoc.unkIndex: 189 | index = attentionIndices[i, k].item() 190 | f_trans.write(batchData[i].sourceOrigStr[index] + ' ') 191 | else: 192 | f_trans.write(corpus.targetVoc.tokenList[index].str + ' ') 193 | f_trans.write('\n') 194 | 195 | for k in range(lengthsTarget[i]-1): 196 | index = batchInputTarget[i, k+1].item() 197 | if index == corpus.targetVoc.unkIndex: 198 | 
f_gold.write(batchData[i].targetUnkMap[k] + ' ') 199 | else: 200 | f_gold.write(corpus.targetVoc.tokenList[index].str + ' ') 201 | f_gold.write('\n') 202 | 203 | batchInputTarget = batchInputTarget.to(device) 204 | batchTarget = batchTarget.to(device) 205 | inputTarget = embedding.getBatchedTargetEmbedding(batchInputTarget) 206 | 207 | loss = encdec(inputTarget, lengthsTarget, lengthsSource, (hn, cn), sourceH, batchTarget) 208 | loss = loss.sum() 209 | devPerp += loss.item() 210 | 211 | totalTokenCount += tokenCount 212 | 213 | f_trans.close() 214 | f_gold.close() 215 | os.system("./bleu.sh 2> DUMMY") 216 | f_trans = open('./bleu.txt', 'r') 217 | for line in f_trans: 218 | devGleu = float(line.split()[2][0:-1]) 219 | break 220 | f_trans.close() 221 | 222 | devPerp = math.exp(devPerp/totalTokenCount) 223 | print("Dev perp:", devPerp) 224 | print("Dev BLEU:", devGleu) 225 | 226 | embedding.train() 227 | encdec.train() 228 | torch.set_grad_enabled(True) 229 | 230 | if epoch > decayStart and devGleu < prevDevGleu: 231 | print('lr -> ' + str(learningRate*decay)) 232 | learningRate *= decay 233 | 234 | for paramGroup in opt.param_groups: 235 | paramGroup['lr'] = learningRate 236 | 237 | elif devGleu >= bestDevGleu: 238 | bestDevGleu = devGleu 239 | 240 | stateDict = embedding.state_dict() 241 | for elem in stateDict: 242 | stateDict[elem] = stateDict[elem].to(cpu) 243 | torch.save(stateDict, './params/embedding.bin') 244 | 245 | stateDict = encdec.state_dict() 246 | for elem in stateDict: 247 | stateDict[elem] = stateDict[elem].to(cpu) 248 | torch.save(stateDict, './params/encdec.bin') 249 | 250 | prevDevGleu = devGleu 251 | 252 | if train: 253 | exit(0) 254 | 255 | torch.set_grad_enabled(False) 256 | 257 | embedding.load_state_dict(torch.load('./params/embedding.bin')) 258 | encdec.load_state_dict(torch.load('./params/encdec.bin')) 259 | 260 | embedding.to(device) 261 | encdec.to(device) 262 | 263 | embedding.eval() 264 | encdec.eval() 265 | 266 | f_trans = open('./trans.txt', 'w') 267 | f_gold = open('./gold.txt', 'w') 268 | 269 | devPerp = 0.0 270 | totalTokenCount = 0.0 271 | 272 | for batch in batchListDev: 273 | batchSize = batch[1]-batch[0]+1 274 | batchInputSource, lengthsSource, batchInputTarget, batchTarget, lengthsTarget, tokenCount, batchData = corpus.processBatchInfoNMT(batch, train = False, device = device) 275 | 276 | inputSource = embedding.getBatchedSourceEmbedding(batchInputSource) 277 | sourceH, (hn, cn) = encdec.encode(inputSource, lengthsSource) 278 | 279 | if beamSize == 1: 280 | indicesGreedy, lengthsGreedy, attentionIndices = encdec.greedyTrans(corpus.targetVoc.bosIndex, corpus.targetVoc.eosIndex, lengthsSource, embedding.targetEmbedding, sourceH, (hn, cn), device, maxGenLen = maxLen) 281 | else: 282 | indicesGreedy, lengthsGreedy, attentionIndices = encdec.beamSearch(corpus.targetVoc.bosIndex, corpus.targetVoc.eosIndex, lengthsSource, embedding.targetEmbedding, sourceH, (hn, cn), device, beamSize = beamSize, maxGenLen = maxLen) 283 | indicesGreedy = indicesGreedy.to(cpu) 284 | 285 | for i in range(batchSize): 286 | for k in range(lengthsGreedy[i]-1): 287 | index = indicesGreedy[i, k].item() 288 | if index == corpus.targetVoc.unkIndex: 289 | index = attentionIndices[i, k].item() 290 | f_trans.write(batchData[i].sourceOrigStr[index] + ' ') 291 | else: 292 | f_trans.write(corpus.targetVoc.tokenList[index].str + ' ') 293 | f_trans.write('\n') 294 | 295 | for k in range(lengthsTarget[i]-1): 296 | index = batchInputTarget[i, k+1].item() 297 | if index == 
corpus.targetVoc.unkIndex: 298 | f_gold.write(batchData[i].targetUnkMap[k] + ' ') 299 | else: 300 | f_gold.write(corpus.targetVoc.tokenList[index].str + ' ') 301 | f_gold.write('\n') 302 | 303 | batchInputTarget = batchInputTarget.to(device) 304 | batchTarget = batchTarget.to(device) 305 | inputTarget = embedding.getBatchedTargetEmbedding(batchInputTarget) 306 | 307 | loss = encdec(inputTarget, lengthsTarget, lengthsSource, (hn, cn), sourceH, batchTarget) 308 | loss = loss.sum() 309 | devPerp += loss.item() 310 | 311 | totalTokenCount += tokenCount 312 | 313 | f_trans.close() 314 | f_gold.close() 315 | os.system("./bleu.sh 2> DUMMY") 316 | f_trans = open('./bleu.txt', 'r') 317 | for line in f_trans: 318 | devGleu = float(line.split()[2][0:-1]) 319 | break 320 | f_trans.close() 321 | 322 | devPerp = math.exp(devPerp/totalTokenCount) 323 | print("Dev perp:", devPerp) 324 | print("Dev BLEU:", devGleu) 325 | -------------------------------------------------------------------------------- /nmt/code_0.4/utils.py: -------------------------------------------------------------------------------- 1 | 2 | def buildBatchList(dataSize, batchSize): 3 | batchList = [] 4 | if dataSize%batchSize == 0: 5 | numBatch = dataSize//batchSize 6 | else: 7 | numBatch = int(dataSize/batchSize)+1 8 | 9 | for i in range(numBatch): 10 | batch = [] 11 | batch.append(i*batchSize) 12 | if i == numBatch-1: 13 | batch.append(dataSize-1) 14 | else: 15 | batch.append((i+1)*batchSize-1) 16 | batchList.append(batch) 17 | 18 | return batchList 19 | -------------------------------------------------------------------------------- /text_classifier/README.md: -------------------------------------------------------------------------------- 1 | # Text Classifier 2 | Classifying input text (words, phrases, sentences, or documents) using LSTM 3 | 4 | ## Notes 5 | * The input format is shown in the example dataset.
6 | Each line is like "[label] \t [word 1] [word 2] ..." (see the sketch after these notes).
7 | 8 | * The bi-LSTM weights are tied.
9 | I usually give the forward and backward LSTMs separate weights, but here I just followed the default implementation.
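To make the input format concrete, here is a minimal sketch of how `data.py` (below) consumes a line: the label is the field before the tab, the tokens are the whitespace-split field after it, and `buildDataset` wraps the tokens in BOS/EOS before mapping them to indices. The paths are the sample files shipped in `dataset/`, and the script is assumed to run from the `text_classifier/` directory:

from data import Corpus

# A line from the sample data: "[label] \t [word 1] [word 2] ..."
line = "+\tIt 's a lovely film with lovely performances by Buy and Accorsi .\n"
print(line.split('\t')[0])            # label: '+'
print(line.split('\t')[1].split())    # tokens: ['It', "'s", 'a', 'lovely', ...]

# Loading the shipped samples end-to-end.
corpus = Corpus(trainFile = './dataset/stanford_sentiment_sample.train',
                devFile = './dataset/stanford_sentiment_sample.dev',
                minFreq = 2)
print('Vocabulary size:', corpus.voc.size())
print('Number of classes:', corpus.classVoc.size())  # labels seen in training, e.g. 0, +, ++, -
ex = corpus.devData[0]
print(ex.label[0], ex.text)  # class index and BOS w1 ... wN EOS token indices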
10 | -------------------------------------------------------------------------------- /text_classifier/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | class Token: 5 | def __init__(self, str_ = '', count_ = 0): 6 | self.str = str_ 7 | self.count = count_ 8 | 9 | class Vocabulary: 10 | def __init__(self): 11 | self.UNK = '<UNK>' # unknown words 12 | self.EOS = '<EOS>' # the end-of-sequence token 13 | self.BOS = '<BOS>' # the beginning-of-sequence token 14 | self.PAD = '<PAD>' # padding 15 | self.tokenIndex = {} 16 | self.tokenList = [] 17 | 18 | def getTokenIndex(self, str): 19 | if str in self.tokenIndex: 20 | return self.tokenIndex[str] 21 | else: 22 | return self.tokenIndex[self.UNK] 23 | 24 | def add(self, str, count): 25 | if str not in self.tokenIndex: 26 | self.tokenList.append(Token(str, count)) 27 | self.tokenIndex[str] = len(self.tokenList)-1 28 | 29 | def size(self): 30 | return len(self.tokenList) 31 | 32 | class Data: 33 | def __init__(self, text_, label_): 34 | self.text = text_ 35 | self.label = label_ 36 | 37 | class Corpus: 38 | def __init__(self, trainFile = '', devFile = '', minFreq = 2): 39 | self.voc = Vocabulary() 40 | self.classVoc = Vocabulary() 41 | 42 | self.buildVoc(trainFile, minFreq) 43 | self.trainData = self.buildDataset(trainFile) 44 | self.devData = self.buildDataset(devFile) 45 | 46 | def buildVoc(self, fileName, minFreq): 47 | assert os.path.exists(fileName) 48 | 49 | with open(fileName, 'r') as f: 50 | tokenCount = {} 51 | unkCount = 0 52 | eosCount = 0 53 | 54 | labelCount = {} 55 | 56 | for line in f: 57 | tokens = line.split('\t')[1].split() # label \t w1 w2 ... \n 58 | label = line.split('\t')[0] 59 | eosCount += 1 60 | 61 | for t in tokens: 62 | if t in tokenCount: 63 | tokenCount[t] += 1 64 | else: 65 | tokenCount[t] = 1 66 | 67 | if label in labelCount: 68 | labelCount[label] += 1 69 | else: 70 | labelCount[label] = 1 71 | 72 | # select words which appear >= minFreq 73 | tokenList = sorted(tokenCount.items(), key = lambda x: -x[1]) # sort by value 74 | labelList = sorted(labelCount.items(), key = lambda x: -x[1]) # sort by value 75 | 76 | for t in tokenList: 77 | if t[1] >= minFreq: 78 | self.voc.add(t[0], t[1]) 79 | else: 80 | unkCount += t[1] 81 | self.voc.add(self.voc.UNK, unkCount) 82 | self.voc.add(self.voc.BOS, eosCount) 83 | self.voc.add(self.voc.EOS, eosCount) 84 | self.voc.add(self.voc.PAD, 0) 85 | 86 | for l in labelList: 87 | self.classVoc.add(l[0], l[1]) 88 | 89 | def buildDataset(self, fileName): 90 | assert os.path.exists(fileName) 91 | 92 | with open(fileName, 'r') as f: 93 | dataset = [] 94 | 95 | for line in f: 96 | tokens = [self.voc.BOS] + line.split('\t')[1].split() + [self.voc.EOS] # label \t w1 w2 ... \n 97 | tokenIndices = torch.LongTensor(len(tokens)) 98 | label = torch.LongTensor(1) 99 | i = 0 100 | 101 | for t in tokens: 102 | tokenIndices[i] = self.voc.getTokenIndex(t) 103 | i += 1 104 | 105 | label[0] = self.classVoc.getTokenIndex(line.split('\t')[0]) 106 | dataset.append(Data(tokenIndices, label)) 107 | 108 | return dataset 109 | -------------------------------------------------------------------------------- /text_classifier/dataset/stanford_sentiment_sample.dev: -------------------------------------------------------------------------------- 1 | + It 's a lovely film with lovely performances by Buy and Accorsi . 2 | 0 It 3 | ++ 's a lovely film with lovely performances by Buy and Accorsi . 
4 | ++ 's a lovely film with lovely performances by Buy and Accorsi 5 | 0 's 6 | ++ a lovely film with lovely performances by Buy and Accorsi 7 | + a lovely film 8 | 0 a 9 | ++ lovely film 10 | + lovely 11 | 0 film 12 | + with lovely performances by Buy and Accorsi 13 | 0 with 14 | ++ lovely performances by Buy and Accorsi 15 | + lovely performances 16 | + lovely 17 | 0 performances 18 | 0 by Buy and Accorsi 19 | 0 by 20 | 0 Buy and Accorsi 21 | 0 Buy and 22 | 0 Buy 23 | 0 and 24 | 0 Accorsi 25 | 0 . 26 | 0 No one goes unindicted here , which is probably for the best . 27 | 0 No one 28 | - No 29 | 0 one 30 | - goes unindicted here , which is probably for the best . 31 | - goes unindicted here , which is probably for the best 32 | 0 goes 33 | 0 unindicted here , which is probably for the best 34 | - unindicted here , 35 | 0 unindicted here 36 | 0 unindicted 37 | 0 here 38 | 0 , 39 | 0 which is probably for the best 40 | 0 which 41 | + is probably for the best 42 | 0 is probably 43 | 0 is 44 | 0 probably 45 | + for the best 46 | 0 for 47 | ++ the best 48 | 0 the 49 | ++ best 50 | 0 . 51 | + And if you 're not nearly moved to tears by a couple of scenes , you 've got ice water in your veins . 52 | 0 And 53 | ++ if you 're not nearly moved to tears by a couple of scenes , you 've got ice water in your veins . 54 | + if you 're not nearly moved to tears by a couple of scenes 55 | 0 if 56 | - you 're not nearly moved to tears by a couple of scenes 57 | 0 you 58 | - 're not nearly moved to tears by a couple of scenes 59 | 0 're not nearly 60 | 0 're not 61 | 0 're 62 | - not 63 | 0 nearly 64 | ++ moved to tears by a couple of scenes 65 | + moved to tears 66 | + moved 67 | 0 to tears 68 | 0 to 69 | - tears 70 | 0 by a couple of scenes 71 | 0 by 72 | 0 a couple of scenes 73 | 0 a couple 74 | 0 a 75 | 0 couple 76 | 0 of scenes 77 | 0 of 78 | 0 scenes 79 | 0 , you 've got ice water in your veins . 80 | 0 , 81 | 0 you 've got ice water in your veins . 82 | 0 you 83 | 0 've got ice water in your veins . 84 | 0 've got ice water in your veins 85 | 0 've 86 | - got ice water in your veins 87 | 0 got 88 | 0 ice water in your veins 89 | + ice water 90 | 0 ice 91 | 0 water 92 | 0 in your veins 93 | 0 in 94 | 0 your veins 95 | 0 your 96 | 0 veins 97 | 0 . 98 | ++ A warm , funny , engaging film . 99 | ++ A warm , funny 100 | 0 A 101 | -------------------------------------------------------------------------------- /text_classifier/dataset/stanford_sentiment_sample.train: -------------------------------------------------------------------------------- 1 | + The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal . 2 | 0 The Rock 3 | 0 The 4 | 0 Rock 5 | ++ is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal . 
6 | + is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 7 | 0 is 8 | ++ destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 9 | 0 destined 10 | 0 to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 11 | 0 to be the 21st Century 's new `` Conan '' and 12 | 0 to be the 21st Century 's new `` Conan '' 13 | 0 to be the 21st Century 's new `` Conan 14 | 0 to 15 | 0 be the 21st Century 's new `` Conan 16 | 0 be 17 | 0 the 21st Century 's new `` Conan 18 | 0 the 19 | 0 21st Century 's new `` Conan 20 | 0 21st 21 | 0 Century 's new `` Conan 22 | 0 Century 's 23 | 0 Century 24 | 0 's 25 | 0 new `` Conan 26 | + new 27 | 0 `` Conan 28 | 0 `` 29 | 0 Conan 30 | 0 '' 31 | 0 and 32 | + that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 33 | 0 that 34 | + he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 35 | 0 he 36 | + 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 37 | 0 's 38 | + going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 39 | 0 going 40 | + to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 41 | 0 to 42 | ++ make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 43 | + make a splash even greater 44 | 0 make 45 | + a splash even greater 46 | + a splash 47 | 0 a 48 | + splash 49 | 0 even greater 50 | 0 even 51 | + greater 52 | 0 than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 53 | 0 than 54 | 0 Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal 55 | 0 Arnold Schwarzenegger , Jean-Claud Van Damme or 56 | 0 Arnold Schwarzenegger , Jean-Claud Van Damme 57 | 0 Arnold Schwarzenegger , 58 | - Arnold Schwarzenegger 59 | 0 Arnold 60 | 0 Schwarzenegger 61 | 0 , 62 | 0 Jean-Claud Van Damme 63 | 0 Jean-Claud 64 | 0 Van Damme 65 | 0 Van 66 | 0 Damme 67 | 0 or 68 | 0 Steven Segal 69 | 0 Steven 70 | 0 Segal 71 | 0 . 72 | ++ The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth . 73 | ++ The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy 74 | ++ The gorgeously elaborate continuation 75 | 0 The 76 | ++ gorgeously elaborate continuation 77 | + gorgeously 78 | + elaborate continuation 79 | 0 elaborate 80 | 0 continuation 81 | 0 of `` The Lord of the Rings '' trilogy 82 | 0 of `` 83 | 0 of 84 | 0 `` 85 | 0 The Lord of the Rings '' trilogy 86 | 0 The 87 | 0 Lord of the Rings '' trilogy 88 | 0 Lord of the Rings 89 | 0 Lord 90 | 0 of the Rings 91 | 0 of 92 | 0 the Rings 93 | 0 the 94 | 0 Rings 95 | 0 '' trilogy 96 | 0 '' 97 | 0 trilogy 98 | 0 is so huge that a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth . 99 | + is so huge that a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. 
Tolkien 's Middle-earth 100 | 0 is so huge 101 | 0 is 102 | 0 so huge 103 | 0 so 104 | 0 huge 105 | 0 that a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth 106 | 0 that 107 | + a column of words can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth 108 | 0 a column of words 109 | 0 a column 110 | 0 a 111 | 0 column 112 | 0 of words 113 | 0 of 114 | 0 words 115 | 0 can not adequately describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth 116 | 0 can not adequately 117 | 0 can not 118 | 0 can 119 | - not 120 | + adequately 121 | 0 describe co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth 122 | 0 describe 123 | 0 co-writer\/director Peter Jackson 's expanded vision of J.R.R. Tolkien 's Middle-earth 124 | + co-writer\/director Peter Jackson 's expanded vision 125 | 0 co-writer\/director Peter Jackson 's 126 | 0 co-writer\/director 127 | 0 Peter Jackson 's 128 | 0 Peter 129 | + Jackson 's 130 | 0 Jackson 131 | 0 's 132 | + expanded vision 133 | 0 expanded 134 | 0 vision 135 | 0 of J.R.R. Tolkien 's Middle-earth 136 | 0 of 137 | 0 J.R.R. Tolkien 's Middle-earth 138 | 0 J.R.R. Tolkien 's 139 | 0 J.R.R. 140 | 0 Tolkien 's 141 | 0 Tolkien 142 | 0 's 143 | 0 Middle-earth 144 | 0 . 145 | + Singer\/composer Bryan Adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but the whole package certainly captures the intended , er , spirit of the piece . 146 | + Singer\/composer Bryan Adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but the whole package certainly captures the intended , er , spirit of the piece 147 | 0 Singer\/composer Bryan Adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but 148 | 0 Singer\/composer Bryan Adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- 149 | 0 Singer\/composer Bryan Adams contributes a slew of songs 150 | 0 Singer\/composer Bryan Adams 151 | 0 Singer\/composer 152 | 0 Bryan Adams 153 | 0 Bryan 154 | 0 Adams 155 | 0 contributes a slew of songs 156 | 0 contributes 157 | 0 a slew of songs 158 | 0 a slew 159 | 0 a 160 | 0 slew 161 | 0 of songs 162 | 0 of 163 | 0 songs 164 | 0 -- a few potential hits , a few more simply intrusive to the story -- 165 | 0 -- 166 | 0 a few potential hits , a few more simply intrusive to the story -- 167 | 0 a few potential hits , a few more simply intrusive to the story 168 | 0 a few potential 169 | 0 a 170 | 0 few potential 171 | 0 few 172 | + potential 173 | 0 hits , a few more simply intrusive to the story 174 | 0 hits , 175 | 0 hits 176 | 0 , 177 | 0 a few more simply intrusive to the story 178 | 0 a few 179 | 0 a 180 | 0 few 181 | - more simply intrusive to the story 182 | - more simply intrusive 183 | 0 more 184 | - simply intrusive 185 | 0 simply 186 | 0 intrusive 187 | 0 to the story 188 | 0 to 189 | 0 the story 190 | 0 the 191 | 0 story 192 | 0 -- 193 | 0 but 194 | + the whole package certainly captures the intended , er , spirit of the piece 195 | ++ the whole package 196 | 0 the 197 | + whole package 198 | 0 whole 199 | 0 package 200 | 0 certainly captures the intended , er , spirit of the piece 201 | + certainly 202 | + captures the intended , er , spirit of the piece 203 | 0 captures 204 | 0 the 
intended , er , spirit of the piece 205 | - the intended , er , spirit 206 | 0 the 207 | 0 intended , er , spirit 208 | 0 intended , er , 209 | 0 intended 210 | 0 , er , 211 | 0 , 212 | 0 er , 213 | 0 er 214 | 0 , 215 | + spirit 216 | 0 of the piece 217 | 0 of 218 | 0 the piece 219 | 0 the 220 | 0 piece 221 | 0 . 222 | 0 You 'd think by now America would have had enough of plucky British eccentrics with hearts of gold . 223 | 0 You 'd think by now 224 | 0 You 225 | 0 'd think by now 226 | 0 'd 227 | 0 think by now 228 | 0 think 229 | 0 by now 230 | 0 by 231 | 0 now 232 | 0 America would have had enough of plucky British eccentrics with hearts of gold . 233 | 0 America 234 | 0 would have had enough of plucky British eccentrics with hearts of gold . 235 | 0 would have had enough of plucky British eccentrics with hearts of gold 236 | 0 would 237 | - have had enough of plucky British eccentrics with hearts of gold 238 | 0 have 239 | 0 had enough of plucky British eccentrics with hearts of gold 240 | 0 had enough of plucky British eccentrics 241 | 0 had 242 | - enough of plucky British eccentrics 243 | 0 enough 244 | 0 of plucky British eccentrics 245 | 0 of 246 | 0 plucky British eccentrics 247 | 0 plucky 248 | 0 British eccentrics 249 | 0 British 250 | - eccentrics 251 | ++ with hearts of gold 252 | 0 with 253 | ++ hearts of gold 254 | + hearts 255 | + of gold 256 | 0 of 257 | + gold 258 | 0 . 259 | + Yet the act is still charming here . 260 | 0 Yet 261 | + the act is still charming here . 262 | 0 the act 263 | 0 the 264 | 0 act 265 | + is still charming here . 266 | ++ is still charming here 267 | + is still charming 268 | 0 is 269 | + still charming 270 | 0 still 271 | ++ charming 272 | 0 here 273 | 0 . 274 | ++ Whether or not you 're enlightened by any of Derrida 's lectures on `` the other '' and `` the self , '' Derrida is an undeniably fascinating and playful fellow . 275 | 0 Whether or not you 're enlightened by any of Derrida 's lectures on `` the other '' and `` the self 276 | 0 Whether 277 | 0 or not you 're enlightened by any of Derrida 's lectures on `` the other '' and `` the self 278 | 0 or not you 're enlightened by any of Derrida 's 279 | 0 or not 280 | 0 or 281 | - not 282 | + you 're enlightened by any of Derrida 's 283 | 0 you 284 | 0 're enlightened by any of Derrida 's 285 | 0 're 286 | + enlightened by any of Derrida 's 287 | + enlightened 288 | 0 by any of Derrida 's 289 | 0 by 290 | 0 any of Derrida 's 291 | 0 any 292 | 0 of Derrida 's 293 | 0 of 294 | 0 Derrida 's 295 | 0 Derrida 296 | 0 's 297 | 0 lectures on `` the other '' and `` the self 298 | 0 lectures 299 | 0 on `` the other '' and `` the self 300 | 0 on 301 | 0 `` the other '' and `` the self 302 | 0 `` 303 | 0 the other '' and `` the self 304 | 0 the other '' and `` 305 | 0 the other '' and 306 | 0 the other '' 307 | 0 the other 308 | 0 the 309 | 0 other 310 | 0 '' 311 | 0 and 312 | 0 `` 313 | 0 the self 314 | 0 the 315 | 0 self 316 | + , '' Derrida is an undeniably fascinating and playful fellow . 317 | 0 , 318 | + '' Derrida is an undeniably fascinating and playful fellow . 319 | 0 '' 320 | + Derrida is an undeniably fascinating and playful fellow . 321 | 0 Derrida 322 | + is an undeniably fascinating and playful fellow . 
323 | + is an undeniably fascinating and playful fellow 324 | 0 is 325 | ++ an undeniably fascinating and playful fellow 326 | 0 an 327 | ++ undeniably fascinating and playful fellow 328 | ++ undeniably fascinating and playful 329 | 0 undeniably 330 | + fascinating and playful 331 | ++ fascinating and 332 | + fascinating 333 | 0 and 334 | ++ playful 335 | 0 fellow 336 | 0 . 337 | ++ Just the labour involved in creating the layered richness of the imagery in this chiaroscuro of madness and light is astonishing . 338 | + Just the labour involved in creating the layered richness of the imagery in this chiaroscuro of madness and light 339 | 0 Just the labour 340 | 0 Just 341 | 0 the labour 342 | 0 the 343 | 0 labour 344 | + involved in creating the layered richness of the imagery in this chiaroscuro of madness and light 345 | 0 involved 346 | + in creating the layered richness of the imagery in this chiaroscuro of madness and light 347 | 0 in 348 | ++ creating the layered richness of the imagery in this chiaroscuro of madness and light 349 | 0 creating 350 | + the layered richness of the imagery in this chiaroscuro of madness and light 351 | + the layered richness 352 | 0 the 353 | + layered richness 354 | + layered 355 | 0 richness 356 | + of the imagery in this chiaroscuro of madness and light 357 | 0 of 358 | + the imagery in this chiaroscuro of madness and light 359 | 0 the imagery 360 | 0 the 361 | 0 imagery 362 | 0 in this chiaroscuro of madness and light 363 | 0 in 364 | + this chiaroscuro of madness and light 365 | 0 this chiaroscuro 366 | 0 this 367 | 0 chiaroscuro 368 | 0 of madness and light 369 | 0 of 370 | 0 madness and light 371 | 0 madness and 372 | 0 madness 373 | 0 and 374 | 0 light 375 | + is astonishing . 376 | + is astonishing 377 | 0 is 378 | ++ astonishing 379 | 0 . 380 | + Part of the charm of Satin Rouge is that it avoids the obvious with humour and lightness . 381 | + Part of the charm of Satin Rouge 382 | 0 Part 383 | + of the charm of Satin Rouge 384 | 0 of 385 | ++ the charm of Satin Rouge 386 | 0 the charm 387 | 0 the 388 | + charm 389 | 0 of Satin Rouge 390 | 0 of 391 | 0 Satin Rouge 392 | 0 Satin 393 | 0 Rouge 394 | + is that it avoids the obvious with humour and lightness . 395 | + is that it avoids the obvious with humour and lightness 396 | 0 is 397 | + that it avoids the obvious with humour and lightness 398 | 0 that 399 | + it avoids the obvious with humour and lightness 400 | 0 it 401 | 0 avoids the obvious with humour and lightness 402 | - avoids the obvious 403 | 0 avoids 404 | 0 the obvious 405 | 0 the 406 | - obvious 407 | + with humour and lightness 408 | 0 with 409 | + humour and lightness 410 | + humour and 411 | + humour 412 | 0 and 413 | 0 lightness 414 | 0 . 415 | ++ a screenplay more ingeniously constructed than `` Memento '' 416 | 0 a screenplay more 417 | 0 a 418 | 0 screenplay more 419 | 0 screenplay 420 | 0 more 421 | + ingeniously constructed than `` Memento '' 422 | ++ ingeniously 423 | 0 constructed than `` Memento '' 424 | 0 constructed 425 | 0 than `` Memento '' 426 | 0 than `` Memento 427 | 0 than `` 428 | 0 than 429 | 0 `` 430 | 0 Memento 431 | 0 '' 432 | + `` Extreme Ops '' exceeds expectations . 433 | 0 `` 434 | + Extreme Ops '' exceeds expectations . 435 | 0 Extreme Ops 436 | 0 Extreme 437 | 0 Ops 438 | + '' exceeds expectations . 439 | 0 '' 440 | ++ exceeds expectations . 441 | ++ exceeds expectations 442 | + exceeds 443 | 0 expectations 444 | 0 . 
445 | ++ Good fun , good action , good acting , good dialogue , good pace , good cinematography . 446 | + Good fun 447 | + Good 448 | ++ fun 449 | ++ , good action , good acting , good dialogue , good pace , good cinematography . 450 | 0 , 451 | ++ good action , good acting , good dialogue , good pace , good cinematography . 452 | ++ good action , good acting , good dialogue , good pace , good cinematography 453 | ++ good action , good acting , good dialogue , good pace , 454 | ++ good action , good acting , good dialogue , good pace 455 | ++ good action , good acting , good dialogue , 456 | ++ good action , good acting , good dialogue 457 | ++ good action , good acting , 458 | ++ good action , good acting 459 | ++ good action , 460 | + good action 461 | + good 462 | 0 action 463 | 0 , 464 | + good acting 465 | + good 466 | 0 acting 467 | 0 , 468 | + good dialogue 469 | + good 470 | 0 dialogue 471 | 0 , 472 | ++ good pace 473 | + good 474 | 0 pace 475 | 0 , 476 | + good cinematography 477 | + good 478 | 0 cinematography 479 | 0 . 480 | 0 You Should Pay Nine Bucks for This : Because you can hear about suffering Afghan refugees on the news and still be unaffected . 481 | 0 You 482 | 0 Should Pay Nine Bucks for This : Because you can hear about suffering Afghan refugees on the news and still be unaffected . 483 | - Should Pay Nine Bucks for This : Because you can hear about suffering Afghan refugees on the news and still be unaffected 484 | 0 Should 485 | 0 Pay Nine Bucks for This : Because you can hear about suffering Afghan refugees on the news and still be unaffected 486 | 0 Pay Nine Bucks for This : 487 | - Pay Nine Bucks for This 488 | 0 Pay 489 | 0 Nine Bucks for This 490 | 0 Nine Bucks 491 | 0 Nine 492 | 0 Bucks 493 | 0 for This 494 | 0 for 495 | 0 This 496 | 0 : 497 | - Because you can hear about suffering Afghan refugees on the news and still be unaffected 498 | 0 Because 499 | 0 you can hear about suffering Afghan refugees on the news and still be unaffected 500 | 0 you 501 | 0 can hear about suffering Afghan refugees on the news and still be unaffected 502 | 0 can 503 | 0 hear about suffering Afghan refugees on the news and still be unaffected 504 | 0 hear about suffering Afghan refugees on the news and still 505 | 0 hear about suffering Afghan refugees on the news and 506 | 0 hear about suffering Afghan refugees on the news 507 | 0 hear 508 | 0 about suffering Afghan refugees on the news 509 | 0 about 510 | - suffering Afghan refugees on the news 511 | - suffering Afghan refugees 512 | - suffering 513 | 0 Afghan refugees 514 | 0 Afghan 515 | 0 refugees 516 | 0 on the news 517 | 0 on 518 | 0 the news 519 | 0 the 520 | 0 news 521 | 0 and 522 | 0 still 523 | - be unaffected 524 | 0 be 525 | 0 unaffected 526 | 0 . 527 | ++ Dramas like this make it human . 528 | 0 Dramas like this 529 | 0 Dramas 530 | 0 like this 531 | 0 like 532 | 0 this 533 | 0 make it human . 534 | + make it human 535 | 0 make 536 | 0 it human 537 | 0 it 538 | 0 human 539 | 0 . 540 | 0 A thunderous ride at first , quiet cadences of pure finesse are few and far between ; their shortage dilutes the potency of otherwise respectable action . 
541 | 0 A thunderous ride at first , quiet cadences of pure finesse are few and far between ; their shortage dilutes the potency of otherwise respectable action 542 | + A thunderous ride at first , quiet cadences of pure finesse are few and far between ; 543 | 0 A thunderous ride at first , quiet cadences of pure finesse are few and far between 544 | + A thunderous ride at first , quiet cadences of pure finesse 545 | + A thunderous ride 546 | 0 A 547 | + thunderous ride 548 | + thunderous 549 | 0 ride 550 | + at first , quiet cadences of pure finesse 551 | 0 at 552 | + first , quiet cadences of pure finesse 553 | + first , quiet cadences 554 | 0 first 555 | 0 , quiet cadences 556 | 0 , 557 | 0 quiet cadences 558 | 0 quiet 559 | 0 cadences 560 | + of pure finesse 561 | 0 of 562 | ++ pure finesse 563 | ++ pure 564 | + finesse 565 | 0 are few and far between 566 | 0 are few 567 | 0 are 568 | 0 few 569 | 0 and far between 570 | 0 and 571 | 0 far between 572 | 0 far 573 | 0 between 574 | 0 ; 575 | - their shortage dilutes the potency of otherwise respectable action 576 | 0 their shortage 577 | 0 their 578 | - shortage 579 | + dilutes the potency of otherwise respectable action 580 | 0 dilutes 581 | 0 the potency of otherwise respectable action 582 | 0 the potency 583 | 0 the 584 | 0 potency 585 | 0 of otherwise respectable action 586 | 0 of 587 | + otherwise respectable action 588 | + otherwise respectable 589 | 0 otherwise 590 | + respectable 591 | 0 action 592 | 0 . 593 | ++ Still , this flick is fun , and host to some truly excellent sequences . 594 | 0 Still 595 | ++ , this flick is fun , and host to some truly excellent sequences . 596 | 0 , 597 | ++ this flick is fun , and host to some truly excellent sequences . 598 | 0 this flick 599 | 0 this 600 | 0 flick 601 | + is fun , and host to some truly excellent sequences . 602 | ++ is fun , and host to some truly excellent sequences 603 | 0 is 604 | ++ fun , and host to some truly excellent sequences 605 | + fun , and 606 | + fun , 607 | ++ fun 608 | 0 , 609 | 0 and 610 | + host to some truly excellent sequences 611 | 0 host 612 | + to some truly excellent sequences 613 | 0 to 614 | + some truly excellent sequences 615 | 0 some 616 | ++ truly excellent sequences 617 | ++ truly excellent 618 | + truly 619 | ++ excellent 620 | 0 sequences 621 | 0 . 622 | + Australian actor\/director John Polson and award-winning English cinematographer Giles Nuttgens make a terrific effort at disguising the obvious with energy and innovation . 623 | 0 Australian actor\/director John Polson and award-winning English cinematographer Giles Nuttgens 624 | 0 Australian actor\/director John Polson and 625 | 0 Australian actor\/director John Polson 626 | 0 Australian 627 | 0 actor\/director John Polson 628 | 0 actor\/director 629 | 0 John Polson 630 | 0 John 631 | 0 Polson 632 | 0 and 633 | + award-winning English cinematographer Giles Nuttgens 634 | ++ award-winning 635 | 0 English cinematographer Giles Nuttgens 636 | 0 English 637 | 0 cinematographer Giles Nuttgens 638 | 0 cinematographer 639 | 0 Giles Nuttgens 640 | 0 Giles 641 | 0 Nuttgens 642 | 0 make a terrific effort at disguising the obvious with energy and innovation . 
643 | + make a terrific effort at disguising the obvious with energy and innovation 644 | + make a terrific effort 645 | 0 make 646 | ++ a terrific effort 647 | 0 a 648 | ++ terrific effort 649 | ++ terrific 650 | 0 effort 651 | 0 at disguising the obvious with energy and innovation 652 | 0 at 653 | + disguising the obvious with energy and innovation 654 | 0 disguising the obvious 655 | 0 disguising 656 | 0 the obvious 657 | 0 the 658 | - obvious 659 | + with energy and innovation 660 | 0 with 661 | + energy and innovation 662 | 0 energy and 663 | + energy 664 | 0 and 665 | + innovation 666 | 0 . 667 | + You walk out of The Good Girl with mixed emotions -- disapproval of Justine combined with a tinge of understanding for her actions . 668 | 0 You 669 | 0 walk out of The Good Girl with mixed emotions -- disapproval of Justine combined with a tinge of understanding for her actions . 670 | 0 walk out of The Good Girl with mixed emotions -- disapproval of Justine combined with a tinge of understanding for her actions 671 | - walk out of The Good Girl with mixed emotions -- 672 | 0 walk out of The Good Girl with mixed emotions 673 | - walk out of The Good Girl 674 | 0 walk out 675 | 0 walk 676 | - out 677 | 0 of The Good Girl 678 | 0 of 679 | 0 The Good Girl 680 | 0 The 681 | 0 Good Girl 682 | + Good 683 | 0 Girl 684 | 0 with mixed emotions 685 | 0 with 686 | 0 mixed emotions 687 | 0 mixed 688 | 0 emotions 689 | 0 -- 690 | 0 disapproval of Justine combined with a tinge of understanding for her actions 691 | 0 disapproval of Justine 692 | - disapproval 693 | 0 of Justine 694 | 0 of 695 | 0 Justine 696 | + combined with a tinge of understanding for her actions 697 | 0 combined 698 | 0 with a tinge of understanding for her actions 699 | 0 with 700 | 0 a tinge of understanding for her actions 701 | 0 a tinge 702 | 0 a 703 | 0 tinge 704 | 0 of understanding for her actions 705 | 0 of 706 | 0 understanding for her actions 707 | 0 understanding 708 | 0 for her actions 709 | 0 for 710 | 0 her actions 711 | 0 her 712 | 0 actions 713 | 0 . 714 | 0 Post 9\/11 the philosophical message of `` Personal Freedom First '' might not be as palatable as intended . 715 | 0 Post 9\/11 the philosophical message of `` Personal Freedom First '' 716 | 0 Post 9\/11 the philosophical message 717 | 0 Post 9\/11 718 | 0 Post 719 | 0 9\/11 720 | 0 the philosophical message 721 | 0 the 722 | 0 philosophical message 723 | + philosophical 724 | 0 message 725 | 0 of `` Personal Freedom First '' 726 | 0 of `` Personal Freedom First 727 | 0 of `` 728 | 0 of 729 | 0 `` 730 | 0 Personal Freedom First 731 | 0 Personal 732 | 0 Freedom First 733 | ++ Freedom 734 | + First 735 | 0 '' 736 | - might not be as palatable as intended . 737 | - might not be as palatable as intended 738 | 0 might not 739 | 0 might 740 | - not 741 | 0 be as palatable as intended 742 | 0 be 743 | 0 as palatable as intended 744 | 0 as 745 | 0 palatable as intended 746 | 0 palatable 747 | 0 as intended 748 | 0 as 749 | 0 intended 750 | 0 . 751 | ++ Absorbing character study by André Turpin . 752 | 0 Absorbing character 753 | + Absorbing 754 | 0 character 755 | 0 study by André Turpin . 756 | 0 study by André Turpin 757 | 0 study 758 | 0 by André Turpin 759 | 0 by 760 | 0 André Turpin 761 | 0 André 762 | 0 Turpin 763 | 0 . 764 | + If you love reading and\/or poetry , then by all means check it out . 
765 | + If you love reading and\/or poetry 766 | 0 If 767 | + you love reading and\/or poetry 768 | 0 you 769 | + love reading and\/or poetry 770 | ++ love 771 | 0 reading and\/or poetry 772 | 0 reading 773 | 0 and\/or poetry 774 | 0 and\/or 775 | + poetry 776 | + , then by all means check it out . 777 | 0 , 778 | + then by all means check it out . 779 | 0 then by all 780 | 0 then 781 | 0 by all 782 | 0 by 783 | 0 all 784 | + means check it out . 785 | 0 means check it out 786 | 0 means 787 | ++ check it out 788 | 0 check it 789 | + check 790 | 0 it 791 | - out 792 | 0 . 793 | ++ You 'll probably love it . 794 | 0 You 795 | ++ 'll probably love it . 796 | + 'll probably love it 797 | 0 'll probably 798 | 0 'll 799 | 0 probably 800 | ++ love it 801 | ++ love 802 | 0 it 803 | 0 . 804 | + `` Frailty '' has been written so well , that even a simple `` Goddammit ! '' 805 | 0 `` 806 | + Frailty '' has been written so well , that even a simple `` Goddammit ! '' 807 | 0 Frailty 808 | ++ '' has been written so well , that even a simple `` Goddammit ! '' 809 | 0 '' 810 | + has been written so well , that even a simple `` Goddammit ! '' 811 | ++ has been written so well , that even a simple `` Goddammit ! 812 | + has been written so well , that even a simple `` Goddammit 813 | 0 has 814 | + been written so well , that even a simple `` Goddammit 815 | 0 been 816 | + written so well , that even a simple `` Goddammit 817 | ++ written so well , 818 | + written so well 819 | + written 820 | + so well 821 | 0 so 822 | + well 823 | 0 , 824 | 0 that even a simple `` Goddammit 825 | 0 that even 826 | 0 that 827 | 0 even 828 | - a simple `` Goddammit 829 | 0 a simple 830 | 0 a 831 | 0 simple 832 | - `` Goddammit 833 | 0 `` 834 | -- Goddammit 835 | 0 ! 836 | 0 '' 837 | 0 near the end takes on a whole other meaning . 838 | 0 near the end 839 | 0 near 840 | 0 the end 841 | 0 the 842 | 0 end 843 | + takes on a whole other meaning . 844 | + takes on a whole other meaning 845 | 0 takes 846 | 0 on a whole other meaning 847 | 0 on 848 | 0 a whole other meaning 849 | 0 a 850 | 0 whole other meaning 851 | 0 whole 852 | 0 other meaning 853 | 0 other 854 | 0 meaning 855 | 0 . 856 | ++ Grenier is terrific , bringing an unforced , rapid-fire delivery to Toback 's Heidegger - and Nietzsche-referencing dialogue . 857 | 0 Grenier 858 | ++ is terrific , bringing an unforced , rapid-fire delivery to Toback 's Heidegger - and Nietzsche-referencing dialogue . 859 | + is terrific , bringing an unforced , rapid-fire delivery to Toback 's Heidegger - and Nietzsche-referencing dialogue 860 | ++ is terrific , 861 | ++ is terrific 862 | 0 is 863 | ++ terrific 864 | 0 , 865 | + bringing an unforced , rapid-fire delivery to Toback 's Heidegger - and Nietzsche-referencing dialogue 866 | + bringing an unforced , rapid-fire delivery 867 | 0 bringing 868 | + an unforced , rapid-fire delivery 869 | 0 an 870 | + unforced , rapid-fire delivery 871 | + unforced 872 | + , rapid-fire delivery 873 | 0 , 874 | + rapid-fire delivery 875 | 0 rapid-fire 876 | 0 delivery 877 | 0 to Toback 's Heidegger - and Nietzsche-referencing dialogue 878 | 0 to 879 | 0 Toback 's Heidegger - and Nietzsche-referencing dialogue 880 | 0 Toback 's Heidegger - and 881 | 0 Toback 's Heidegger - 882 | 0 Toback 's Heidegger 883 | 0 Toback 's 884 | 0 Toback 885 | 0 's 886 | 0 Heidegger 887 | 0 - 888 | 0 and 889 | 0 Nietzsche-referencing dialogue 890 | 0 Nietzsche-referencing 891 | 0 dialogue 892 | 0 . 
893 | 0 The Sundance Film Festival has become so buzz-obsessed that fans and producers descend upon Utah each January to ferret out The Next Great Thing . 894 | 0 The Sundance Film Festival 895 | 0 The 896 | 0 Sundance Film Festival 897 | 0 Sundance 898 | + Film Festival 899 | 0 Film 900 | + Festival 901 | 0 has become so buzz-obsessed that fans and producers descend upon Utah each January to ferret out The Next Great Thing . 902 | 0 has become so buzz-obsessed that fans and producers descend upon Utah each January to ferret out The Next Great Thing 903 | 0 has 904 | - become so buzz-obsessed that fans and producers descend upon Utah each January to ferret out The Next Great Thing 905 | 0 become so buzz-obsessed 906 | 0 become 907 | 0 so buzz-obsessed 908 | 0 so 909 | - buzz-obsessed 910 | 0 that fans and producers descend upon Utah each January to ferret out The Next Great Thing 911 | 0 that 912 | 0 fans and producers descend upon Utah each January to ferret out The Next Great Thing 913 | 0 fans and producers 914 | 0 fans and 915 | + fans 916 | 0 and 917 | 0 producers 918 | 0 descend upon Utah each January to ferret out The Next Great Thing 919 | 0 descend upon Utah each January 920 | 0 descend upon Utah each 921 | 0 descend 922 | 0 upon Utah each 923 | 0 upon 924 | 0 Utah each 925 | 0 Utah 926 | 0 each 927 | 0 January 928 | 0 to ferret out The Next Great Thing 929 | 0 to 930 | 0 ferret out The Next Great Thing 931 | 0 ferret out 932 | 0 ferret 933 | - out 934 | + The Next Great Thing 935 | 0 The 936 | + Next Great Thing 937 | 0 Next 938 | + Great Thing 939 | ++ Great 940 | 0 Thing 941 | 0 . 942 | 0 ` Tadpole ' was one of the films so declared this year , but it 's really more of The Next Pretty Good Thing . 943 | 0 ` 944 | 0 Tadpole ' was one of the films so declared this year , but it 's really more of The Next Pretty Good Thing . 945 | 0 Tadpole ' was one of the films so declared this year , but it 's really more of The Next Pretty Good Thing 946 | 0 Tadpole ' was one of the films so declared this year , but 947 | 0 Tadpole ' was one of the films so declared this year , 948 | 0 Tadpole ' was one of the films so declared this year 949 | 0 Tadpole ' 950 | 0 Tadpole 951 | 0 ' 952 | 0 was one of the films so declared this year 953 | 0 was 954 | 0 one of the films so declared this year 955 | 0 one 956 | 0 of the films so declared this year 957 | 0 of 958 | 0 the films so declared this year 959 | 0 the films 960 | 0 the 961 | 0 films 962 | 0 so declared this year 963 | 0 so 964 | 0 declared this year 965 | 0 declared 966 | 0 this year 967 | 0 this 968 | 0 year 969 | 0 , 970 | 0 but 971 | 0 it 's really more of The Next Pretty Good Thing 972 | 0 it 973 | 0 's really more of The Next Pretty Good Thing 974 | 0 's 975 | 0 really more of The Next Pretty Good Thing 976 | 0 really more 977 | 0 really 978 | 0 more 979 | 0 of The Next Pretty Good Thing 980 | 0 of 981 | 0 The Next Pretty Good Thing 982 | 0 The 983 | + Next Pretty Good Thing 984 | 0 Next 985 | + Pretty Good Thing 986 | + Pretty 987 | + Good Thing 988 | + Good 989 | 0 Thing 990 | 0 . 991 | ++ The actors are fantastic . 992 | 0 The actors 993 | 0 The 994 | 0 actors 995 | + are fantastic . 996 | ++ are fantastic 997 | 0 are 998 | + fantastic 999 | 0 . 1000 | + They are what makes it worth the trip to the theatre . 
1001 | 
--------------------------------------------------------------------------------
/text_classifier/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 | import math
5 | 
6 | '''
7 | Text classification model
8 | - Input: text (words, phrases, sentences, or documents)
9 | - Output: class label
10 | '''
11 | class TextClassifier(nn.Module):
12 | 
13 |     '''
14 |     Initialize the classifier model
15 |     '''
16 |     def __init__(self, vocSize, embedDim, hiddenDim, classNum, biDirectional, repType, actType):
17 |         super(TextClassifier, self).__init__()
18 | 
19 |         self.dropout = nn.Dropout(p = 0.0) # defined but not applied anywhere below
20 | 
21 |         self.embedding = nn.Embedding(vocSize, embedDim)
22 | 
23 |         self.encoder = nn.LSTM(input_size = embedDim,
24 |                                hidden_size = hiddenDim,
25 |                                num_layers = 1,
26 |                                dropout = 0.0,
27 |                                bidirectional = biDirectional)
28 | 
29 |         classifierDim = hiddenDim
30 |         if biDirectional:
31 |             classifierDim *= 2
32 | 
33 |         assert repType in {'Sen', 'Ave', 'Max'}
34 |         self.repType = repType
35 | 
36 |         self.hiddenLayer = nn.Linear(classifierDim, hiddenDim)
37 |         assert actType in {'Tanh', 'ReLU'}
38 |         if actType == 'Tanh':
39 |             self.hiddenAct = nn.Tanh()
40 |         elif actType == 'ReLU':
41 |             self.hiddenAct = nn.ReLU()
42 | 
43 |         self.softmaxLayer = nn.Linear(hiddenDim, classNum)
44 | 
45 |         self.embedDim = embedDim
46 |         self.hiddenDim = hiddenDim
47 |         self.classifierDim = classifierDim
48 |         self.biDirectional = biDirectional
49 | 
50 |         self.initWeights()
51 | 
52 |     '''
53 |     Initialize the model parameters
54 |     '''
55 |     def initWeights(self):
56 |         initScale = math.sqrt(6.0)/math.sqrt(self.hiddenDim+(self.embedDim+self.hiddenDim))
57 |         initScale2 = math.sqrt(6.0)/math.sqrt(self.classifierDim+(self.hiddenDim))
58 | 
59 |         self.embedding.weight.data.uniform_(-initScale, initScale)
60 | 
61 |         self.encoder.weight_ih_l0.data.uniform_(-initScale, initScale)
62 |         self.encoder.weight_hh_l0.data.uniform_(-initScale, initScale)
63 |         self.encoder.bias_ih_l0.data.zero_()
64 |         self.encoder.bias_hh_l0.data.zero_()
65 |         self.encoder.bias_hh_l0.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1
66 | 
67 |         if self.biDirectional:
68 |             self.encoder.weight_ih_l0_reverse.data.uniform_(-initScale, initScale)
69 |             self.encoder.weight_hh_l0_reverse.data.uniform_(-initScale, initScale)
70 |             self.encoder.bias_ih_l0_reverse.data.zero_()
71 |             self.encoder.bias_hh_l0_reverse.data.zero_()
72 |             self.encoder.bias_hh_l0_reverse.data[self.hiddenDim:2*self.hiddenDim].fill_(1.0) # forget bias = 1
73 | 
74 |         self.hiddenLayer.weight.data.uniform_(-initScale2, initScale2)
75 |         self.hiddenLayer.bias.data.zero_()
76 | 
77 |         self.softmaxLayer.weight.data.zero_()
78 |         self.softmaxLayer.bias.data.zero_()
79 | 
80 |     '''
81 |     Compute sentence representations
82 |     '''
83 |     def encode(self, batchInput, lengths, hidden0):
84 |         batchInput = torch.t(batchInput) # (maxLen, batchSize) -> (batchSize, maxLen)
85 |         embedded = self.embedding(Variable(batchInput))
86 |         packedInput = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first = True)
87 | 
88 |         h, (hn, cn) = self.encoder(packedInput, hidden0)
89 |         h, _ = nn.utils.rnn.pad_packed_sequence(h, batch_first = True)
90 | 
91 |         if self.repType == 'Sen': # use the final hidden state(s)
92 |             if self.biDirectional:
93 |                 a = self.hiddenLayer(torch.cat((hn[0], hn[1]), 1))
94 |             else:
95 |                 a = self.hiddenLayer(hn.view(hn.size(1), hn.size(2)))
96 |         elif self.repType == 'Ave':
97 |             assert False # mean pooling over time steps: not implemented yet (see the sketch at the end of this document)
98 |         elif self.repType == 'Max':
99 |             assert False # max pooling over time steps: not implemented yet (see the sketch at the end of this document)
100 | 
101 |         return self.hiddenAct(a), h
102 | 
103 | ''' 104 | Compute class scores 105 | ''' 106 | def forward(self, batchInput, lengths, hidden0): 107 | encoded, _ = self.encode(batchInput, lengths, hidden0) 108 | output = self.softmaxLayer(encoded.view(len(lengths), self.hiddenDim)) 109 | return output 110 | 111 | -------------------------------------------------------------------------------- /text_classifier/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | from torch.autograd import Variable 5 | from model import TextClassifier 6 | from data import Corpus 7 | import utils 8 | import random 9 | 10 | embedDim = 50 11 | hiddenDim = embedDim 12 | biDirectional = True 13 | 14 | batchSize = 8 15 | initialLearningRate = 1.0 16 | lrDecay = 0.0 17 | gradClip = 1.0 18 | weightDecay = 1.0e-06 19 | maxEpoch = 100 20 | 21 | minFreq = 1 22 | 23 | useGpu = True 24 | 25 | trainFile = './dataset/stanford_sentiment_sample.train' 26 | devFile = './dataset/stanford_sentiment_sample.dev' 27 | 28 | seed = 1 29 | 30 | torch.manual_seed(seed) 31 | random.seed(seed) 32 | 33 | corpus = Corpus(trainFile, devFile, minFreq) 34 | 35 | print('Vocabulary size: '+str(corpus.voc.size())) 36 | print('# of classes: '+str(corpus.classVoc.size())) 37 | print() 38 | print('# of training samples: '+str(len(corpus.trainData))) 39 | print('# of dev samples: '+str(len(corpus.devData))) 40 | 41 | classifier = TextClassifier(corpus.voc.size(), 42 | embedDim, hiddenDim, 43 | corpus.classVoc.size(), 44 | biDirectional, 45 | repType = 'Sen', 46 | actType = 'Tanh') 47 | 48 | if useGpu: 49 | if torch.cuda.is_available(): 50 | torch.cuda.manual_seed(seed) 51 | classifier.cuda() 52 | print('**** Running with GPU ****\n') 53 | else: 54 | useGpu = False 55 | print('**** Warning: GPU is not available ****\n') 56 | 57 | 58 | criterionClassifier = nn.CrossEntropyLoss(size_average = True) 59 | 60 | epoch = 0 61 | 62 | batchListTrain = utils.buildBatchList(len(corpus.trainData), batchSize) 63 | batchListDev = utils.buildBatchList(len(corpus.devData), batchSize) 64 | 65 | while epoch < maxEpoch: 66 | aveLoss = 0.0 67 | trainAcc = 0.0 68 | 69 | epoch += 1 70 | print('--- Epoch '+str(epoch)) 71 | 72 | random.shuffle(corpus.trainData) 73 | 74 | opt = optim.SGD(classifier.parameters(), 75 | lr = initialLearningRate/(1.0+lrDecay*epoch), 76 | weight_decay = weightDecay) 77 | classifier.train() 78 | 79 | for batch in batchListTrain: 80 | opt.zero_grad() 81 | 82 | # build input for the batch 83 | curBatchSize = batch[1]-batch[0]+1 84 | batchInput, batchTarget, lengths = utils.buildBatchInOutForClassifier(corpus.voc.getTokenIndex(corpus.voc.PAD), batch, corpus.trainData) 85 | target = Variable(batchTarget) 86 | 87 | if biDirectional: 88 | shape = 2, curBatchSize, hiddenDim 89 | else: 90 | shape = 1, curBatchSize, hiddenDim 91 | h0 = c0 = Variable(torch.zeros(*shape), requires_grad = False) 92 | 93 | if useGpu: 94 | batchInput = batchInput.cuda() 95 | target = target.cuda() 96 | h0 = h0.cuda() 97 | c0 = c0.cuda() 98 | 99 | output = classifier(batchInput, lengths, (h0, c0)) 100 | 101 | loss = criterionClassifier(output, target) 102 | loss.backward() 103 | nn.utils.clip_grad_norm(classifier.parameters(), gradClip) 104 | opt.step() 105 | 106 | _, prediction = torch.max(output, 1) 107 | trainAcc += torch.sum(torch.eq(prediction, target)).data[0] 108 | aveLoss += loss.data[0] 109 | 110 | print('Train loss: '+str(aveLoss/len(batchListTrain))) 111 | print('Train acc.: 
'+str(100.0*trainAcc/len(corpus.trainData))) 112 | 113 | classifier.eval() 114 | devAcc = 0.0 115 | for batch in batchListDev: 116 | # build input for the batch 117 | curBatchSize = batch[1]-batch[0]+1 118 | batchInput, batchTarget, lengths = utils.buildBatchInOutForClassifier(corpus.voc.getTokenIndex(corpus.voc.PAD), batch, corpus.devData) 119 | target = Variable(batchTarget) 120 | 121 | if biDirectional: 122 | shape = 2, curBatchSize, hiddenDim 123 | else: 124 | shape = 1, curBatchSize, hiddenDim 125 | h0 = c0 = Variable(torch.zeros(*shape), requires_grad = False) 126 | 127 | if useGpu: 128 | batchInput = batchInput.cuda() 129 | target = target.cuda() 130 | h0 = h0.cuda() 131 | c0 = c0.cuda() 132 | 133 | output = classifier(batchInput, lengths, (h0, c0)) 134 | _, prediction = torch.max(output, 1) 135 | devAcc += torch.sum(torch.eq(prediction, target)).data[0] 136 | 137 | print('Dev acc.: '+str(100.0*devAcc/len(corpus.devData))) 138 | -------------------------------------------------------------------------------- /text_classifier/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def buildBatchList(dataSize, batchSize): 4 | batchList = [] 5 | numBatch = int(dataSize/batchSize) 6 | 7 | for i in range(numBatch): 8 | batch = [] 9 | batch.append(i*batchSize) 10 | if i == numBatch-1: 11 | batch.append(dataSize-1) 12 | else: 13 | batch.append((i+1)*batchSize-1) 14 | batchList.append(batch) 15 | 16 | return batchList 17 | 18 | def buildBatchInOutForClassifier(paddingIndex, batch, trainData): 19 | begin = batch[0] 20 | end = batch[1] 21 | batchSize = end-begin+1 22 | data = sorted(trainData[begin:end+1], key = lambda x: -len(x.text)) 23 | maxLen = len(data[0].text) 24 | batchInput = torch.LongTensor(maxLen, batchSize) 25 | batchInput.fill_(paddingIndex) 26 | batchTarget = torch.LongTensor(batchSize) 27 | lengths = [] 28 | 29 | for i in range(batchSize): 30 | batchTarget[i] = data[i].label[0] 31 | l = len(data[i].text) 32 | lengths.append(l) 33 | for j in range(l): 34 | batchInput[j][i] = data[i].text[j] 35 | 36 | return batchInput, batchTarget, lengths 37 | --------------------------------------------------------------------------------
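
A closing note on the two unimplemented branches in /text_classifier/model.py: the 'Ave' and 'Max' representation types currently hit `assert False` in `encode`. Below is a minimal sketch of what those pooling branches could look like, written against the same PyTorch 0.2-era `Variable` API as the rest of this repository; the helper name `poolHiddenStates`, the `useGpu` argument, the masking logic, and the `-1.0e4` padding constant are illustrative assumptions, not code from the repository.

```python
import torch
from torch.autograd import Variable

def poolHiddenStates(h, lengths, repType, useGpu = False):
    # h: (batchSize, maxLen, dims) Variable as returned by
    #    pad_packed_sequence(..., batch_first = True) in TextClassifier.encode
    # lengths: true sequence lengths, sorted longest-first (as in utils.py)
    batchSize, maxLen, dims = h.size()

    # 1 for valid time steps, 0 for padding
    mask = torch.zeros(batchSize, maxLen, 1)
    for i, l in enumerate(lengths):
        mask[i, :l, :] = 1.0
    mask = Variable(mask, requires_grad = False)
    lens = Variable(torch.FloatTensor(lengths).view(batchSize, 1), requires_grad = False)
    if useGpu:
        mask, lens = mask.cuda(), lens.cuda()

    if repType == 'Ave':
        # sum the valid steps, then divide by each true length (not by maxLen)
        return (h*mask).sum(1)/lens
    elif repType == 'Max':
        # push padded steps to a large negative value so they never win the max
        return (h*mask + (mask-1.0)*1.0e4).max(1)[0]
```

Inside `encode`, the result would feed the hidden layer exactly like the 'Sen' branch does, e.g. `a = self.hiddenLayer(poolHiddenStates(h, lengths, self.repType, useGpu))`. The masking matters because `pad_packed_sequence` zero-fills the padded steps: an unmasked mean would effectively divide by `maxLen` instead of each true length, and an unmasked max could return a spurious zero for sequences whose valid states are all negative.

One more small behavioral note, this time on /text_classifier/utils.py: `buildBatchList` folds any remainder into the final batch, so with `dataSize = 10` and `batchSize = 4` it returns `[[0, 3], [4, 9]]` and the last batch holds six samples rather than splitting into two batches of four plus one of two.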