├── .gitignore ├── Bi LSTM CRF.ipynb ├── BiLSTMTaggerWordCRFModel_CONLL2000 ├── BiLSTMTaggerWordCRFModel_CONLL2000.pdf ├── Elastic weight consolidation-Uncertainity-v1.6.ipynb ├── Elastic weight consolidation-Uncertainity.ipynb ├── Elastic weight consolidation-old.ipynb ├── Elastic weight consolidation.ipynb ├── Getting started.ipynb ├── IID_EWC_losses.pdf ├── IID_EWC_predictions.pdf ├── Iterated Dilated convolution.ipynb ├── LICENSE ├── PyTorch CONLL 2000 Chunking.ipynb ├── PyTorch CONLL 2000 Chunking.py ├── PyTorch RNN.ipynb ├── Pytorch - MMD VAE.ipynb ├── Pytorch Active Learning.ipynb ├── Pytorch Gradient reversal.ipynb ├── Pytorch RNN sequence tagging.ipynb ├── Pytorch Uncertainity-animated.ipynb ├── Pytorch Uncertainity-yaringal.ipynb ├── Pytorch Uncertainity.ipynb ├── Pytorch example.ipynb ├── README.md ├── Scratchpad.ipynb ├── Seq_EWC_losses.pdf ├── Seq_EWC_predictions.pdf ├── Viterbi decoding and CRF.ipynb ├── chunking_bilstm_crf_char_concat.py ├── conll2000.glove.100.npy ├── conlleval.py ├── data └── conll2000 │ └── get_data.sh ├── pytorch_models.py ├── pytorch_utils.py ├── utils.py └── wnut_bilstm_crf_char_concat.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Data folders 92 | tmp/ 93 | -------------------------------------------------------------------------------- /Bi LSTM CRF.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "data": { 12 | "text/plain": [ 13 | "" 14 | ] 15 | }, 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "output_type": "execute_result" 19 | } 20 | ], 21 | "source": [ 22 | "import torch\n", 23 | "import torch.autograd as autograd\n", 24 | "import torch.nn as nn\n", 25 | "import torch.nn.functional as F\n", 26 | "import torch.optim as optim\n", 27 | "\n", 28 | "torch.manual_seed(1)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 4, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def prepare_sequence(seq, to_ix):\n", 40 | " idxs = [to_ix[w] for w in seq]\n", 41 | " tensor = torch.LongTensor(idxs)\n", 42 | " return 
autograd.Variable(tensor)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 5, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "# Helper functions to make the code more readable.\n", 54 | "def to_scalar(var):\n", 55 | " # returns a python float\n", 56 | " return var.view(-1).data.tolist()[0]\n", 57 | "\n", 58 | "\n", 59 | "def argmax(vec):\n", 60 | " # return the argmax as a python int\n", 61 | " _, idx = torch.max(vec, 1)\n", 62 | " return to_scalar(idx)\n", 63 | "\n", 64 | "# Compute log sum exp in a numerically stable way for the forward algorithm\n", 65 | "\n", 66 | "\n", 67 | "def log_sum_exp(vec):\n", 68 | " max_score = vec[0, argmax(vec)]\n", 69 | " max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])\n", 70 | " return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))\n", 71 | "\n", 72 | "\n", 73 | "class BiLSTM_CRF(nn.Module):\n", 74 | "\n", 75 | " def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):\n", 76 | " super(BiLSTM_CRF, self).__init__()\n", 77 | " self.embedding_dim = embedding_dim\n", 78 | " self.hidden_dim = hidden_dim\n", 79 | " self.vocab_size = vocab_size\n", 80 | " self.tag_to_ix = tag_to_ix\n", 81 | " self.tagset_size = len(tag_to_ix)\n", 82 | "\n", 83 | " self.word_embeds = nn.Embedding(vocab_size, embedding_dim)\n", 84 | " self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,\n", 85 | " num_layers=1, bidirectional=True)\n", 86 | "\n", 87 | " # Maps the output of the LSTM into tag space.\n", 88 | " self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)\n", 89 | "\n", 90 | " # Matrix of transition parameters. 
Entry i,j is the score of\n", 91 | " # transitioning *to* i *from* j.\n", 92 | " self.transitions = nn.Parameter(\n", 93 | " torch.randn(self.tagset_size, self.tagset_size))\n", 94 | "\n", 95 | " self.hidden = self.init_hidden()\n", 96 | "\n", 97 | " def init_hidden(self):\n", 98 | " return (autograd.Variable(torch.randn(2, 1, self.hidden_dim)),\n", 99 | " autograd.Variable(torch.randn(2, 1, self.hidden_dim)))\n", 100 | "\n", 101 | " def _forward_alg(self, feats):\n", 102 | " # Do the forward algorithm to compute the partition function\n", 103 | " init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)\n", 104 | " # START_TAG has all of the score.\n", 105 | " init_alphas[0][self.tag_to_ix[START_TAG]] = 0.\n", 106 | "\n", 107 | " # Wrap in a variable so that we will get automatic backprop\n", 108 | " forward_var = autograd.Variable(init_alphas)\n", 109 | "\n", 110 | " # Iterate through the sentence\n", 111 | " for feat in feats:\n", 112 | " alphas_t = [] # The forward variables at this timestep\n", 113 | " for next_tag in range(self.tagset_size):\n", 114 | " # broadcast the emission score: it is the same regardless of\n", 115 | " # the previous tag\n", 116 | " emit_score = feat[next_tag].view(\n", 117 | " 1, -1).expand(1, self.tagset_size)\n", 118 | " # the ith entry of trans_score is the score of transitioning to\n", 119 | " # next_tag from i\n", 120 | " trans_score = self.transitions[next_tag].view(1, -1)\n", 121 | " # The ith entry of next_tag_var is the value for the edge (i -> next_tag)\n", 122 | " # before we do log-sum-exp\n", 123 | " next_tag_var = forward_var + trans_score + emit_score\n", 124 | " # The forward variable for this tag is log-sum-exp of all the\n", 125 | " # scores.\n", 126 | " alphas_t.append(log_sum_exp(next_tag_var))\n", 127 | " forward_var = torch.cat(alphas_t).view(1, -1)\n", 128 | " terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]\n", 129 | " alpha = log_sum_exp(terminal_var)\n", 130 | " return alpha\n", 131 
| "\n", 132 | " def _get_lstm_features(self, sentence):\n", 133 | " self.hidden = self.init_hidden()\n", 134 | " embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)\n", 135 | " lstm_out, self.hidden = self.lstm(embeds)\n", 136 | " lstm_out = lstm_out.view(len(sentence), self.hidden_dim)\n", 137 | " lstm_feats = self.hidden2tag(lstm_out)\n", 138 | " return lstm_feats\n", 139 | "\n", 140 | " def _score_sentence(self, feats, tags):\n", 141 | " # Gives the score of a provided tag sequence\n", 142 | " score = autograd.Variable(torch.Tensor([0]))\n", 143 | " tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])\n", 144 | " for i, feat in enumerate(feats):\n", 145 | " score = score + \\\n", 146 | " self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]\n", 147 | " score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]\n", 148 | " return score\n", 149 | "\n", 150 | " def _viterbi_decode(self, feats):\n", 151 | " backpointers = []\n", 152 | "\n", 153 | " # Initialize the viterbi variables in log space\n", 154 | " init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)\n", 155 | " init_vvars[0][self.tag_to_ix[START_TAG]] = 0\n", 156 | "\n", 157 | " # forward_var at step i holds the viterbi variables for step i-1\n", 158 | " forward_var = autograd.Variable(init_vvars)\n", 159 | " for feat in feats:\n", 160 | " bptrs_t = [] # holds the backpointers for this step\n", 161 | " viterbivars_t = [] # holds the viterbi variables for this step\n", 162 | "\n", 163 | " for next_tag in range(self.tagset_size):\n", 164 | " # next_tag_var[i] holds the viterbi variable for tag i at the previous step,\n", 165 | " # plus the score of transitioning from tag i to next_tag.\n", 166 | " # We don't include the emission scores here because the max\n", 167 | " # does not depend on them (we add them in below)\n", 168 | " next_tag_var = forward_var + self.transitions[next_tag]\n", 169 | " best_tag_id = argmax(next_tag_var)\n", 170 | " 
bptrs_t.append(best_tag_id)\n", 171 | " viterbivars_t.append(next_tag_var[0][best_tag_id])\n", 172 | " # Now add in the emission scores, and assign forward_var to the set\n", 173 | " # of viterbi variables we just computed\n", 174 | " forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)\n", 175 | " backpointers.append(bptrs_t)\n", 176 | "\n", 177 | " # Transition to STOP_TAG\n", 178 | " terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]\n", 179 | " best_tag_id = argmax(terminal_var)\n", 180 | " path_score = terminal_var[0][best_tag_id]\n", 181 | "\n", 182 | " # Follow the back pointers to decode the best path.\n", 183 | " best_path = [best_tag_id]\n", 184 | " for bptrs_t in reversed(backpointers):\n", 185 | " best_tag_id = bptrs_t[best_tag_id]\n", 186 | " best_path.append(best_tag_id)\n", 187 | " # Pop off the start tag (we dont want to return that to the caller)\n", 188 | " start = best_path.pop()\n", 189 | " assert start == self.tag_to_ix[START_TAG] # Sanity check\n", 190 | " best_path.reverse()\n", 191 | " return path_score, best_path\n", 192 | "\n", 193 | " def neg_log_likelihood(self, sentence, tags):\n", 194 | " self.hidden = self.init_hidden()\n", 195 | " feats = self._get_lstm_features(sentence)\n", 196 | " forward_score = self._forward_alg(feats)\n", 197 | " gold_score = self._score_sentence(feats, tags)\n", 198 | " return forward_score - gold_score\n", 199 | "\n", 200 | " def forward(self, sentence): # dont confuse this with _forward_alg above.\n", 201 | " self.hidden = self.init_hidden()\n", 202 | " # Get the emission scores from the BiLSTM\n", 203 | " lstm_feats = self._get_lstm_features(sentence)\n", 204 | "\n", 205 | " # Find the best path, given the features.\n", 206 | " score, tag_seq = self._viterbi_decode(lstm_feats)\n", 207 | " return score, tag_seq" 208 | ] 209 | }, 210 | { 211 | "cell_type": "raw", 212 | "metadata": { 213 | "hide_egal": false, 214 | "is_egal": true 215 | }, 216 | "source": [ 217 | "" 218 | ] 219 | }, 
220 | { 221 | "cell_type": "raw", 222 | "metadata": { 223 | "is_egal": true 224 | }, 225 | "source": [ 226 | "" 227 | ] 228 | }, 229 | { 230 | "cell_type": "raw", 231 | "metadata": { 232 | "is_egal": true 233 | }, 234 | "source": [ 235 | "" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 6, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "(Variable containing:\n", 250 | " 9.7379\n", 251 | "[torch.FloatTensor of size 1]\n", 252 | ", [2, 3, 1, 2, 3, 1, 2, 3, 2, 3, 1])\n", 253 | "(Variable containing:\n", 254 | " 39.4279\n", 255 | "[torch.FloatTensor of size 1]\n", 256 | ", [0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2])\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "START_TAG = \"<START>\"\n", 262 | "STOP_TAG = \"<STOP>\"\n", 263 | "EMBEDDING_DIM = 5\n", 264 | "HIDDEN_DIM = 4\n", 265 | "\n", 266 | "# Make up some training data\n", 267 | "training_data = [(\n", 268 | " \"the wall street journal reported today that apple corporation made money\".split(),\n", 269 | " \"B I I I O O O B I O O\".split()\n", 270 | "), (\n", 271 | " \"georgia tech is a university in georgia\".split(),\n", 272 | " \"B I O O O O B\".split()\n", 273 | ")]\n", 274 | "\n", 275 | "word_to_ix = {}\n", 276 | "for sentence, tags in training_data:\n", 277 | " for word in sentence:\n", 278 | " if word not in word_to_ix:\n", 279 | " word_to_ix[word] = len(word_to_ix)\n", 280 | "\n", 281 | "tag_to_ix = {\"B\": 0, \"I\": 1, \"O\": 2, START_TAG: 3, STOP_TAG: 4}\n", 282 | "\n", 283 | "model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)\n", 284 | "optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)\n", 285 | "\n", 286 | "# Check predictions before training\n", 287 | "precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)\n", 288 | "precheck_tags = torch.LongTensor([tag_to_ix[t] for t in training_data[0][1]])\n", 289 | 
"print(model(precheck_sent))\n", 290 | "\n", 291 | "# Make sure prepare_sequence from earlier in the LSTM section is loaded\n", 292 | "for epoch in range(300): # again, normally you would NOT do 300 epochs, it is toy data\n", 293 | " for sentence, tags in training_data:\n", 294 | " # Step 1. Remember that Pytorch accumulates gradients. We need to clear them out\n", 295 | " # before each instance\n", 296 | " model.zero_grad()\n", 297 | "\n", 298 | " # Step 2. Get our inputs ready for the network, that is, turn them into Variables\n", 299 | " # of word indices.\n", 300 | " sentence_in = prepare_sequence(sentence, word_to_ix)\n", 301 | " targets = torch.LongTensor([tag_to_ix[t] for t in tags])\n", 302 | "\n", 303 | " # Step 3. Run our forward pass.\n", 304 | " neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)\n", 305 | "\n", 306 | " # Step 4. Compute the loss, gradients, and update the parameters by calling\n", 307 | " # optimizer.step()\n", 308 | " neg_log_likelihood.backward()\n", 309 | " optimizer.step()\n", 310 | "\n", 311 | "# Check predictions after training\n", 312 | "precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)\n", 313 | "print(model(precheck_sent))\n", 314 | "# We got it!" 
315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "collapsed": true 322 | }, 323 | "outputs": [], 324 | "source": [] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": "Python [conda root]", 330 | "language": "python", 331 | "name": "conda-root-py" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 3 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | "nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython3", 343 | "version": "3.5.2" 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 2 348 | } 349 | -------------------------------------------------------------------------------- /BiLSTMTaggerWordCRFModel_CONLL2000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/BiLSTMTaggerWordCRFModel_CONLL2000 -------------------------------------------------------------------------------- /BiLSTMTaggerWordCRFModel_CONLL2000.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/BiLSTMTaggerWordCRFModel_CONLL2000.pdf -------------------------------------------------------------------------------- /IID_EWC_losses.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/IID_EWC_losses.pdf -------------------------------------------------------------------------------- /IID_EWC_predictions.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/IID_EWC_predictions.pdf -------------------------------------------------------------------------------- /Iterated Dilated convolution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "\n", 13 | "import torch\n", 14 | "from torch.autograd import Variable\n", 15 | "from torch.utils.data import Dataset, DataLoader\n", 16 | "\n", 17 | "import unittest" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "def runTests(test_class):\n", 29 | " unittest.TextTestRunner().run(\n", 30 | " unittest.TestLoader().loadTestsFromModule(\n", 31 | " test_class()\n", 32 | " )\n", 33 | " )" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "VOCAB = {\n", 45 | " \"__pad__\": 0,\n", 46 | " \"__bos__\": 1,\n", 47 | " \"__eos__\": 2,\n", 48 | " \"__unk__\": 3,\n", 49 | " \"dog\": 4,\n", 50 | " \"cat\": 5,\n", 51 | " \"puppy\": 6\n", 52 | "}\n", 53 | "\n", 54 | "CHAR_VOCAB = {\n", 55 | " \"__c_pad__\": 0,\n", 56 | " \"__bot__\": 1,\n", 57 | " \"__eot__\": 2,\n", 58 | " \"__c_unk__\": 3,\n", 59 | " \"__pad__\": 4,\n", 60 | " \"__bos__\": 5,\n", 61 | " \"__eos__\": 6,\n", 62 | " \"a\": 7,\n", 63 | " \"c\": 8,\n", 64 | " \"d\": 9,\n", 65 | " \"g\": 10,\n", 66 | " \"o\": 11,\n", 67 | " \"p\": 12,\n", 68 | " \"t\": 13,\n", 69 | " \"u\": 14,\n", 70 | " \"y\": 15\n", 71 | "}\n", 72 | "\n", 73 | "TAG_VOCAB = {\n", 74 | " \"__pad__\": 0,\n", 75 | " \"__bos__\": 1,\n", 76 | " \"__eos__\": 2,\n", 77 | " \"animal_class\": 3,\n", 78 | " 
\"offspring\": 4\n", 79 | "}\n", 80 | "\n", 81 | "maxlen=10\n", 82 | "max_tokenlen=15\n", 83 | "\n", 84 | "def seq2idx(items, vocab, begin=\"__bos__\", end=\"__eos__\"):\n", 85 | " seq = (\n", 86 | " tuple([vocab[begin]]) \n", 87 | " + tuple([\n", 88 | " vocab[item]\n", 89 | " for item in items\n", 90 | " ]) \n", 91 | " + tuple([vocab[end]]))\n", 92 | " #print(seq)\n", 93 | " return seq\n", 94 | " \n", 95 | "def padded_seq(seq, maxlen, pad_value):\n", 96 | " seqlen = min(maxlen, len(seq))\n", 97 | " seq = tuple(seq[:seqlen]) + tuple([pad_value]*(maxlen - seqlen))\n", 98 | " return seq, seqlen\n", 99 | "\n", 100 | "def get_chars_seq(sentence, char_vocab):\n", 101 | " char_seq = tuple([[\"__bos__\"]]) + tuple([\n", 102 | " tuple(w) for w in sentence\n", 103 | " ]) + tuple([[\"__eos__\"]])\n", 104 | " char_seq = tuple([\n", 105 | " padded_seq(\n", 106 | " seq2idx(\n", 107 | " chars,\n", 108 | " char_vocab,\n", 109 | " begin=\"__bot__\",\n", 110 | " end=\"__eot__\"\n", 111 | " ),\n", 112 | " max_tokenlen,\n", 113 | " char_vocab[\"__c_pad__\"]\n", 114 | " )[0]\n", 115 | " for chars in char_seq\n", 116 | " ])\n", 117 | " padded_char_value = padded_seq(\n", 118 | " seq2idx(\n", 119 | " [\"__pad__\"],\n", 120 | " char_vocab,\n", 121 | " begin=\"__bot__\",\n", 122 | " end=\"__eot__\"\n", 123 | " ),\n", 124 | " max_tokenlen,\n", 125 | " char_vocab[\"__c_pad__\"]\n", 126 | " )[0]\n", 127 | " \n", 128 | " return char_seq, padded_char_value\n", 129 | " \n", 130 | "\n", 131 | "def transform(sentence_tags_item, vocab, char_vocab, tag_vocab):\n", 132 | " sentence, tags = sentence_tags_item\n", 133 | " word_tensor, word_len = padded_seq(\n", 134 | " seq2idx(sentence, VOCAB),\n", 135 | " maxlen,\n", 136 | " vocab[\"__pad__\"]\n", 137 | " )\n", 138 | " tag_tensor, tags_len = padded_seq(\n", 139 | " seq2idx(tags, TAG_VOCAB),\n", 140 | " maxlen,\n", 141 | " tag_vocab[\"__pad__\"]\n", 142 | " )\n", 143 | " assert word_len == tags_len, (\n", 144 | " \"Mismatch between padded word seq 
[{}]\"\n", 145 | " \" and padded tag seq [{}]\"\n", 146 | " ).format(word_len, tags_len)\n", 147 | " \n", 148 | " \n", 149 | " char_seq, padded_char_value = get_chars_seq(sentence, char_vocab)\n", 150 | " char_tensor, char_word_len = padded_seq(char_seq, maxlen, padded_char_value)\n", 151 | " assert word_len == char_word_len, (\n", 152 | " \"Mismatch between padded word seq [{}]\"\n", 153 | " \" and padded char based seq [{}]\"\n", 154 | " ).format(word_len, char_word_len)\n", 155 | " \n", 156 | " seq_len = word_len\n", 157 | " \n", 158 | " return word_tensor, char_tensor, tag_tensor, seq_len\n", 159 | " \n", 160 | " " 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 4, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "((10,), (10, 15), (10,), 6)" 172 | ] 173 | }, 174 | "execution_count": 4, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "char_seq, padded_char_value = get_chars_seq([\"dog\", \"cat\", \"dog\", \"puppy\"], CHAR_VOCAB)\n", 181 | "char_tensor, char_word_len = padded_seq(char_seq, maxlen, padded_char_value)\n", 182 | "\n", 183 | "\n", 184 | "word_tensor, char_tensor, tag_tensor, seq_len = transform((\n", 185 | " [\"dog\", \"cat\", \"dog\", \"puppy\"],\n", 186 | " [\"animal_class\", \"animal_class\", \"animal_class\", \"offspring\"]\n", 187 | "), VOCAB, CHAR_VOCAB, TAG_VOCAB)\n", 188 | "\n", 189 | "np.array(word_tensor).shape, np.array(char_tensor).shape, np.array(tag_tensor).shape, seq_len" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "class TestTransforms(unittest.TestCase):\n", 199 | " def test_seq2idx(self):\n", 200 | " self.assertEqual(\n", 201 | " seq2idx([\"dog\", \"cat\", \"dog\", \"puppy\"], VOCAB),\n", 202 | " (1, 4, 5, 4, 6, 2)\n", 203 | " )\n", 204 | " \n", 205 | " def test_padded_seq(self):\n", 206 | " 
self.assertEqual(\n", 207 | " padded_seq(\n", 208 | " seq2idx(\n", 209 | " [\"dog\", \"cat\", \"dog\", \"puppy\"],\n", 210 | " VOCAB\n", 211 | " ),\n", 212 | " maxlen,\n", 213 | " VOCAB[\"__pad__\"]\n", 214 | " ),\n", 215 | " ((1, 4, 5, 4, 6, 2, 0, 0, 0, 0), 6)\n", 216 | " )\n", 217 | " \n", 218 | " def test_padded_char_seq(self):\n", 219 | " char_seq, padded_char_value = get_chars_seq([\"dog\", \"cat\", \"dog\", \"puppy\"], CHAR_VOCAB)\n", 220 | " char_tensor, char_word_len = padded_seq(char_seq, maxlen, padded_char_value)\n", 221 | " self.assertEqual(\n", 222 | " np.array(char_tensor).shape,\n", 223 | " (maxlen, max_tokenlen)\n", 224 | " )\n", 225 | " \n", 226 | " \n", 227 | " def test_transform(self):\n", 228 | " word_tensor, char_tensor, tag_tensor, seq_len = transform(\n", 229 | " (\n", 230 | " [\"dog\", \"cat\", \"dog\", \"puppy\"],\n", 231 | " [\"animal_class\", \"animal_class\", \"animal_class\", \"offspring\"]\n", 232 | " ),\n", 233 | " VOCAB,\n", 234 | " CHAR_VOCAB,\n", 235 | " TAG_VOCAB\n", 236 | " )\n", 237 | "\n", 238 | " self.assertEqual(\n", 239 | " (\n", 240 | " np.array(word_tensor).shape,\n", 241 | " np.array(char_tensor).shape,\n", 242 | " np.array(tag_tensor).shape,\n", 243 | " seq_len\n", 244 | " ), ((10,), (10, 15), (10,), 6)\n", 245 | " )\n", 246 | " \n", 247 | " \n" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 6, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "name": "stderr", 257 | "output_type": "stream", 258 | "text": [ 259 | "....\n", 260 | "----------------------------------------------------------------------\n", 261 | "Ran 4 tests in 0.004s\n", 262 | "\n", 263 | "OK\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "runTests(TestTransforms)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "## Make dataset" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 7, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 
| "outputs": [], 285 | "source": [ 286 | "class SentenceDataset(Dataset):\n", 287 | " def __init__(\n", 288 | " self,\n", 289 | " sentence_tags_items,\n", 290 | " transform,\n", 291 | " vocab,\n", 292 | " char_vocab,\n", 293 | " tag_vocab\n", 294 | " ):\n", 295 | " self.sentence_tags_items = sentence_tags_items\n", 296 | " self.transform = transform\n", 297 | " self.vocab = vocab\n", 298 | " self.char_vocab = char_vocab\n", 299 | " self.tag_vocab = tag_vocab\n", 300 | " \n", 301 | " def __getitem__(self, idx):\n", 302 | " word_tensor, char_tensor, tag_tensor, seq_len = self.transform(\n", 303 | " self.sentence_tags_items[idx],\n", 304 | " self.vocab,\n", 305 | " self.char_vocab,\n", 306 | " self.tag_vocab\n", 307 | " )\n", 308 | " \n", 309 | " word_tensor = torch.from_numpy(np.asarray(word_tensor))#.view(-1, 1)\n", 310 | " char_tensor = torch.from_numpy(np.asarray(char_tensor))\n", 311 | " tag_tensor = torch.from_numpy(np.asarray(tag_tensor))#.view(-1, 1)\n", 312 | " seq_len = torch.from_numpy(np.asarray([seq_len]))\n", 313 | " \n", 314 | " return word_tensor, char_tensor, tag_tensor, seq_len\n", 315 | " \n", 316 | " def __len__(self):\n", 317 | " return len(self.sentence_tags_items)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 8, 323 | "metadata": { 324 | "collapsed": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "sentence_tag_items = [\n", 329 | " (\n", 330 | " [\"dog\", \"cat\", \"dog\", \"puppy\"],\n", 331 | " [\"animal_class\", \"animal_class\", \"animal_class\", \"offspring\"]\n", 332 | " ),\n", 333 | " (\n", 334 | " [\"dog\", \"cat\", \"cat\", \"puppy\"],\n", 335 | " [\"animal_class\", \"animal_class\", \"animal_class\", \"offspring\"]\n", 336 | " ),\n", 337 | " (\n", 338 | " [\"dog\", \"puppy\", \"dog\", \"puppy\"],\n", 339 | " [\"animal_class\", \"offspring\", \"animal_class\", \"offspring\"]\n", 340 | " ),\n", 341 | " \n", 342 | "]" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 
9, 348 | "metadata": { 349 | "collapsed": true 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "sent_dataset = SentenceDataset(\n", 354 | " sentence_tag_items,\n", 355 | " transform,\n", 356 | " VOCAB,\n", 357 | " CHAR_VOCAB,\n", 358 | " TAG_VOCAB\n", 359 | ")\n", 360 | "train_loader = DataLoader(sent_dataset, batch_size=10, shuffle=True, num_workers=1)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 10, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "(torch.Size([3, 10]),\n", 372 | " torch.Size([3, 10, 15]),\n", 373 | " torch.Size([3, 10]),\n", 374 | " torch.Size([3, 1]))" 375 | ] 376 | }, 377 | "execution_count": 10, 378 | "metadata": {}, 379 | "output_type": "execute_result" 380 | } 381 | ], 382 | "source": [ 383 | "word_tensors, char_tensors, tag_tensors, seq_len = next(iter(train_loader))\n", 384 | "word_tensors.size(), char_tensors.size(), tag_tensors.size(), seq_len.size()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 11, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "data": { 394 | "text/plain": [ 395 | "torch.Size([3, 1])" 396 | ] 397 | }, 398 | "execution_count": 11, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "seq_len.size()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "## Train model" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 12, 417 | "metadata": { 418 | "collapsed": true 419 | }, 420 | "outputs": [], 421 | "source": [ 422 | "conv1d = torch.nn.Conv1d(5, 10, 1, dilation=2)" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 13, 428 | "metadata": {}, 429 | "outputs": [ 430 | { 431 | "data": { 432 | "text/plain": [ 433 | "torch.Size([2, 5, 4])" 434 | ] 435 | }, 436 | "execution_count": 13, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | 
], 441 | "source": [ 442 | "torch.rand(2,5,4).size()" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 14, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "torch.Size([2, 10, 4])" 454 | ] 455 | }, 456 | "execution_count": 14, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "conv1d(Variable(torch.rand(2,5,4), requires_grad=False)).size()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 15, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "emb = torch.nn.Embedding(10, 5)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 16, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "data": { 483 | "text/plain": [ 484 | "torch.Size([2, 4, 5])" 485 | ] 486 | }, 487 | "execution_count": 16, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "embeddings = emb(Variable(torch.LongTensor([[1,2,4,5],[4,3,2,9]]), requires_grad=False))\n", 494 | "embeddings.size()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 17, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "torch.Size([2, 5, 4])" 506 | ] 507 | }, 508 | "execution_count": 17, 509 | "metadata": {}, 510 | "output_type": "execute_result" 511 | } 512 | ], 513 | "source": [ 514 | "embeddings.permute(0, 2, 1).size()" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 18, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "Variable containing:\n", 526 | "(0 ,.,.) 
= \n", 527 | " 0.3917 0.8784 0.5268 0.4315\n", 528 | " -0.1406 0.2500 1.4438 0.0828\n", 529 | " 0.1396 -0.2760 -0.3761 0.1704\n", 530 | " -0.3965 -0.4440 0.2955 -0.3060\n", 531 | " 0.2451 -0.4238 0.3279 0.2239\n", 532 | " -0.5347 -1.1390 1.0406 -0.3362\n", 533 | " 0.0030 -0.7008 0.5324 0.1248\n", 534 | " -0.1148 0.7700 -0.3185 -0.1458\n", 535 | " -0.3496 -0.2052 -0.5736 -0.2478\n", 536 | " -0.1141 0.1016 -0.8129 -0.2597\n", 537 | "\n", 538 | "(1 ,.,.) = \n", 539 | " 0.5268 0.8617 0.8784 0.6424\n", 540 | " 1.4438 -0.5622 0.2500 1.4265\n", 541 | " -0.3761 -0.3985 -0.2760 -0.2698\n", 542 | " 0.2955 -0.7914 -0.4440 0.2630\n", 543 | " 0.3279 1.0187 -0.4238 -0.3041\n", 544 | " 1.0406 -0.9597 -1.1390 0.2380\n", 545 | " 0.5324 0.4073 -0.7008 -0.1318\n", 546 | " -0.3185 0.5722 0.7700 0.3617\n", 547 | " -0.5736 -0.5782 -0.2052 0.4115\n", 548 | " -0.8129 -0.2299 0.1016 -0.6984\n", 549 | "[torch.FloatTensor of size 2x10x4]" 550 | ] 551 | }, 552 | "execution_count": 18, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "conv1d(embeddings.permute(0, 2, 1))" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 19, 564 | "metadata": {}, 565 | "outputs": [ 566 | { 567 | "data": { 568 | "text/plain": [ 569 | "torch.Size([2, 10])" 570 | ] 571 | }, 572 | "execution_count": 19, 573 | "metadata": {}, 574 | "output_type": "execute_result" 575 | } 576 | ], 577 | "source": [ 578 | "conv1d(embeddings.permute(0, 2, 1)).max(2)[1].size()\n" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 20, 584 | "metadata": {}, 585 | "outputs": [ 586 | { 587 | "data": { 588 | "text/plain": [ 589 | "torch.Size([2, 1, 4, 5])" 590 | ] 591 | }, 592 | "execution_count": 20, 593 | "metadata": {}, 594 | "output_type": "execute_result" 595 | } 596 | ], 597 | "source": [ 598 | "embeddings.unsqueeze(1).size()" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 21, 604 | "metadata": {}, 605 
| "outputs": [ 606 | { 607 | "data": { 608 | "text/plain": [ 609 | "torch.Size([3, 10, 15])" 610 | ] 611 | }, 612 | "execution_count": 21, 613 | "metadata": {}, 614 | "output_type": "execute_result" 615 | } 616 | ], 617 | "source": [ 618 | "char_tensors.size()" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": 22, 624 | "metadata": {}, 625 | "outputs": [ 626 | { 627 | "data": { 628 | "text/plain": [ 629 | "torch.Size([3, 10, 15])" 630 | ] 631 | }, 632 | "execution_count": 22, 633 | "metadata": {}, 634 | "output_type": "execute_result" 635 | } 636 | ], 637 | "source": [ 638 | "char_tensors.view(-1, 15).view(3, 10, -1).shape" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 23, 644 | "metadata": { 645 | "collapsed": true 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "class CharCNN(torch.nn.Module):\n", 650 | " def __init__(self):\n", 651 | " super(CharCNN, self).__init__()\n", 652 | " self.char_embedding=4\n", 653 | " self.char_conv_features=5\n", 654 | " self.char_conv_kernel=1\n", 655 | " \n", 656 | " self.char_emb = torch.nn.Embedding(\n", 657 | " len(CHAR_VOCAB),\n", 658 | " self.char_embedding\n", 659 | " )\n", 660 | " \n", 661 | " self.char_conv1d = torch.nn.Conv1d(\n", 662 | " self.char_embedding,\n", 663 | " self.char_conv_features,\n", 664 | " self.char_conv_kernel\n", 665 | " )\n", 666 | " \n", 667 | " self.output_size = self.char_conv_features\n", 668 | " \n", 669 | " def forward(self, char_tensors):\n", 670 | " batch_size, seqlen, char_seqlen = char_tensors.size()\n", 671 | " char_tensors = char_tensors.view(-1, char_seqlen)\n", 672 | " char_tensors = self.char_emb(char_tensors)\n", 673 | " char_tensors = char_tensors.permute(0, 2, 1)\n", 674 | " char_tensors = self.char_conv1d(char_tensors)\n", 675 | " char_tensors = char_tensors.max(2)[0] # Get the global max\n", 676 | " char_tensors = char_tensors.view(batch_size, seqlen, -1)\n", 677 | " return char_tensors" 678 | ] 679 | }, 680 | { 681 | 
"cell_type": "code", 682 | "execution_count": 24, 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "data": { 687 | "text/plain": [ 688 | "torch.Size([3, 10, 15])" 689 | ] 690 | }, 691 | "execution_count": 24, 692 | "metadata": {}, 693 | "output_type": "execute_result" 694 | } 695 | ], 696 | "source": [ 697 | "char_tensors.shape" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 25, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "char_model = CharCNN()" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": 26, 712 | "metadata": {}, 713 | "outputs": [ 714 | { 715 | "data": { 716 | "text/plain": [ 717 | "torch.Size([3, 10, 15])" 718 | ] 719 | }, 720 | "execution_count": 26, 721 | "metadata": {}, 722 | "output_type": "execute_result" 723 | } 724 | ], 725 | "source": [ 726 | "char_tensors.size()" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": 27, 732 | "metadata": {}, 733 | "outputs": [ 734 | { 735 | "data": { 736 | "text/plain": [ 737 | "torch.Size([3, 10, 5])" 738 | ] 739 | }, 740 | "execution_count": 27, 741 | "metadata": {}, 742 | "output_type": "execute_result" 743 | } 744 | ], 745 | "source": [ 746 | "char_model(Variable(char_tensors, requires_grad=False)).size()" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 28, 752 | "metadata": {}, 753 | "outputs": [ 754 | { 755 | "data": { 756 | "text/plain": [ 757 | "torch.Size([3, 10, 30])" 758 | ] 759 | }, 760 | "execution_count": 28, 761 | "metadata": {}, 762 | "output_type": "execute_result" 763 | } 764 | ], 765 | "source": [ 766 | "torch.cat((char_tensors, char_tensors), -1).size()" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 29, 772 | "metadata": {}, 773 | "outputs": [ 774 | { 775 | "data": { 776 | "text/plain": [ 777 | "(Variable containing:\n", 778 | " 0.8364 -0.1794 2.4606 0.3041 -0.3007\n", 779 | " 2.0133 1.1859 0.9896 1.6575 1.4240\n", 780 | " 
-0.3331 1.1859 2.4606 1.6575 1.4240\n", 781 | " 0.7453 0.0274 0.7354 0.1239 1.8854\n", 782 | " [torch.FloatTensor of size 4x5], Variable containing:\n", 783 | " 0 0 1 0 1\n", 784 | " 1 0 1 0 0\n", 785 | " 1 1 0 1 1\n", 786 | " 0 1 1 0 1\n", 787 | " [torch.LongTensor of size 4x5])" 788 | ] 789 | }, 790 | "execution_count": 29, 791 | "metadata": {}, 792 | "output_type": "execute_result" 793 | } 794 | ], 795 | "source": [ 796 | "embeddings.max(0)" 797 | ] 798 | }, 799 | { 800 | "cell_type": "markdown", 801 | "metadata": {}, 802 | "source": [ 803 | "## Word model" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 30, 809 | "metadata": { 810 | "collapsed": true 811 | }, 812 | "outputs": [], 813 | "source": [ 814 | "class WordEmbeddings(torch.nn.Module):\n", 815 | " def __init__(\n", 816 | " self,\n", 817 | " char_model,\n", 818 | " ):\n", 819 | " super(WordEmbeddings, self).__init__()\n", 820 | " self.char_model = char_model\n", 821 | " self.word_embedding = 10\n", 822 | " self.word_emb = torch.nn.Embedding(\n", 823 | " len(VOCAB),\n", 824 | " self.word_embedding\n", 825 | " )\n", 826 | " \n", 827 | " self.output_size = (\n", 828 | " self.word_embedding\n", 829 | " + self.char_model.output_size\n", 830 | " )\n", 831 | " \n", 832 | " \n", 833 | " def forward(self, word_tensors, char_tensors):\n", 834 | " char_based_embs = self.char_model(char_tensors)\n", 835 | " #print(char_based_embs.size(), type(char_based_embs.data))\n", 836 | " word_embs = self.word_emb(word_tensors)\n", 837 | " #print(word_embs.size(), type(word_embs.data))\n", 838 | " word_embs = torch.cat(\n", 839 | " [word_embs, char_based_embs],\n", 840 | " -1\n", 841 | " ) # Concat word and char based embeddings\n", 842 | " return word_embs\n", 843 | " \n", 844 | " " 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 31, 850 | "metadata": { 851 | "collapsed": true 852 | }, 853 | "outputs": [], 854 | "source": [ 855 | "word_model = WordEmbeddings(char_model)" 
856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 32, 861 | "metadata": {}, 862 | "outputs": [ 863 | { 864 | "data": { 865 | "text/plain": [ 866 | "(torch.Size([3, 10]), torch.Size([3, 10, 15]))" 867 | ] 868 | }, 869 | "execution_count": 32, 870 | "metadata": {}, 871 | "output_type": "execute_result" 872 | } 873 | ], 874 | "source": [ 875 | "word_tensors.size(), char_tensors.size()" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 33, 881 | "metadata": {}, 882 | "outputs": [ 883 | { 884 | "data": { 885 | "text/plain": [ 886 | "torch.Size([3, 10, 15])" 887 | ] 888 | }, 889 | "execution_count": 33, 890 | "metadata": {}, 891 | "output_type": "execute_result" 892 | } 893 | ], 894 | "source": [ 895 | "word_model(\n", 896 | " Variable(word_tensors, requires_grad=False),\n", 897 | " Variable(char_tensors, requires_grad=False)\n", 898 | ").size()" 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": {}, 904 | "source": [ 905 | "## ID CNN model\n", 906 | "\n", 907 | "https://arxiv.org/pdf/1702.02098.pdf" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": 34, 913 | "metadata": {}, 914 | "outputs": [], 915 | "source": [ 916 | "class ID_CNN(torch.nn.Module):\n", 917 | " \"\"\"ID CNN Encoder\n", 918 | " \n", 919 | " Input: (batch, input_dims, seqlen)\n", 920 | " Output: (batch, input_dims, seqlen)\n", 921 | " \"\"\"\n", 922 | " def __init__(\n", 923 | " self,\n", 924 | " input_dims,\n", 925 | " dialation_block_depth=5,\n", 926 | " field_of_view=2,\n", 927 | " block_stacks=2\n", 928 | " ):\n", 929 | " super(ID_CNN, self).__init__()\n", 930 | " \n", 931 | " # We want to make the input emb same as output emb\n", 932 | " # This allows us to recursively stack the layers. 
\n", 933 | " \n", 934 | " \n", 935 | " self.conv_features = input_dims\n", 936 | " self.conv_kernel = 3\n", 937 | " self.block_stacks = block_stacks\n", 938 | " \n", 939 | " self.word_char_conv1d = torch.nn.Sequential(\n", 940 | " *[\n", 941 | " torch.nn.Sequential(\n", 942 | " torch.nn.Conv1d(\n", 943 | " input_dims,\n", 944 | " self.conv_features,\n", 945 | " kernel_size=self.conv_kernel,\n", 946 | " padding=field_of_view**i,\n", 947 | " dilation=field_of_view**i\n", 948 | " ),\n", 949 | " torch.nn.ReLU()\n", 950 | " )\n", 951 | " for i in range(dialation_block_depth)\n", 952 | " ]\n", 953 | " )\n", 954 | " \n", 955 | " def forward(self, seq_scores):\n", 956 | " for block_idx in range(self.block_stacks):\n", 957 | " seq_scores = self.word_char_conv1d(seq_scores)\n", 958 | " return seq_scores\n", 959 | " \n", 960 | " \n", 961 | "class IDCNNEncoder(torch.nn.Module):\n", 962 | " \"\"\"IDCNNEncoder - Encodes word and char based sentence\n", 963 | " \n", 964 | " Input: \n", 965 | " word_tensors - (batch, seqlen), \n", 966 | " char_tensors - (batch, seqlen, char_seqlen)\n", 967 | " \"\"\"\n", 968 | " def __init__(\n", 969 | " self,\n", 970 | " word_model,\n", 971 | " ):\n", 972 | " super(IDCNNEncoder, self).__init__()\n", 973 | " self.word_model = word_model\n", 974 | " self.id_cnn = ID_CNN(self.word_model.output_size)\n", 975 | " \n", 976 | " def forward(self, word_tensors, char_tensors):\n", 977 | " word_embs = self.word_model(word_tensors, char_tensors)\n", 978 | " word_embs = word_embs.permute(0, 2, 1)\n", 979 | " seq_scores = self.id_cnn(word_embs)\n", 980 | " return seq_scores\n", 981 | " \n", 982 | "class IDCNNDecoder(torch.nn.Module):\n", 983 | " def __init__(\n", 984 | " self,\n", 985 | " input_dims,\n", 986 | " num_classes,\n", 987 | " decoder_layers=3\n", 988 | " ):\n", 989 | " super(IDCNNDecoder, self).__init__()\n", 990 | " self.input_dims = input_dims\n", 991 | " self.num_classes = num_classes\n", 992 | " self.decoder_layers = decoder_layers\n", 993 | " 
self.transform_layer = torch.nn.Sequential(\n", 994 | " torch.nn.Linear(self.input_dims, self.num_classes),\n", 995 | " torch.nn.ReLU()\n", 996 | " )\n", 997 | " self.create_decoder_layers()\n", 998 | " \n", 999 | " def create_decoder_layers(self):\n", 1000 | " self.id_cnn = torch.nn.ModuleList(\n", 1001 | " [\n", 1002 | " ID_CNN(self.num_classes, self.num_classes, block_stacks=1)\n", 1003 | " for i in range(self.decoder_layers)\n", 1004 | " ]\n", 1005 | " )\n", 1006 | " \n", 1007 | " def forward(self, seq_scores):\n", 1008 | " outputs = []\n", 1009 | " batch, input_dims, seqlen = seq_scores.size()\n", 1010 | " seq_scores = seq_scores.permute(0, 2, 1).contiguous()\n", 1011 | " seq_scores = seq_scores.view(batch*seqlen, input_dims)\n", 1012 | " seq_scores = self.transform_layer(seq_scores)\n", 1013 | " seq_scores = seq_scores.view(batch, seqlen, self.num_classes)\n", 1014 | " seq_scores = seq_scores.permute(0, 2, 1)\n", 1015 | " for id_cnn in self.id_cnn:\n", 1016 | " seq_scores = id_cnn(seq_scores)\n", 1017 | " outputs.append(seq_scores)\n", 1018 | " return outputs" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": 35, 1024 | "metadata": {}, 1025 | "outputs": [ 1026 | { 1027 | "data": { 1028 | "text/plain": [ 1029 | "(torch.Size([3, 10]), torch.Size([3, 10, 15]))" 1030 | ] 1031 | }, 1032 | "execution_count": 35, 1033 | "metadata": {}, 1034 | "output_type": "execute_result" 1035 | } 1036 | ], 1037 | "source": [ 1038 | "id_cnn = IDCNNEncoder(word_model)\n", 1039 | "word_tensors.size(), char_tensors.size()" 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": 36, 1045 | "metadata": {}, 1046 | "outputs": [ 1047 | { 1048 | "data": { 1049 | "text/plain": [ 1050 | "torch.Size([3, 15, 10])" 1051 | ] 1052 | }, 1053 | "execution_count": 36, 1054 | "metadata": {}, 1055 | "output_type": "execute_result" 1056 | } 1057 | ], 1058 | "source": [ 1059 | "id_cnn(\n", 1060 | " Variable(word_tensors, requires_grad=False),\n", 
1061 | " Variable(char_tensors, requires_grad=False)\n", 1062 | ").size()" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "execution_count": 37, 1068 | "metadata": {}, 1069 | "outputs": [], 1070 | "source": [ 1071 | "id_cnn_decoder = IDCNNDecoder(15, len(TAG_VOCAB))" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": 38, 1077 | "metadata": {}, 1078 | "outputs": [ 1079 | { 1080 | "data": { 1081 | "text/plain": [ 1082 | "[torch.Size([3, 5, 10]), torch.Size([3, 5, 10]), torch.Size([3, 5, 10])]" 1083 | ] 1084 | }, 1085 | "execution_count": 38, 1086 | "metadata": {}, 1087 | "output_type": "execute_result" 1088 | } 1089 | ], 1090 | "source": [ 1091 | "decoder_outputs = id_cnn_decoder(id_cnn(\n", 1092 | " Variable(word_tensors, requires_grad=False),\n", 1093 | " Variable(char_tensors, requires_grad=False)\n", 1094 | "))\n", 1095 | "[output.size() for output in decoder_outputs]" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "code", 1100 | "execution_count": 39, 1101 | "metadata": {}, 1102 | "outputs": [], 1103 | "source": [ 1104 | "def get_loss(decoder_outputs, target, loss_fn):\n", 1105 | " batch, seqlen = target.size()[:2]\n", 1106 | " #target = target.unsqueeze(2).permute(0,2,1).contiguous().view(-1, 1).squeeze()\n", 1107 | " target = target.view(-1)\n", 1108 | " #print(target.size())\n", 1109 | " loss = None\n", 1110 | " for output in decoder_outputs:\n", 1111 | " output = output.permute(0,2,1).contiguous().view(-1, output.size()[1])\n", 1112 | " #print(output.size())\n", 1113 | " if loss is None:\n", 1114 | " loss = loss_fn(output, target)\n", 1115 | " else: \n", 1116 | " loss += loss_fn(output, target)\n", 1117 | " return loss\n", 1118 | "\n", 1119 | "loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)" 1120 | ] 1121 | }, 1122 | { 1123 | "cell_type": "code", 1124 | "execution_count": 40, 1125 | "metadata": {}, 1126 | "outputs": [ 1127 | { 1128 | "data": { 1129 | "text/plain": [ 1130 | "torch.Size([30, 5])" 1131 | ] 
1132 | }, 1133 | "execution_count": 40, 1134 | "metadata": {}, 1135 | "output_type": "execute_result" 1136 | } 1137 | ], 1138 | "source": [ 1139 | "decoder_outputs[0].permute(0,2,1).contiguous().view(-1, decoder_outputs[0].size()[1]).size()" 1140 | ] 1141 | }, 1142 | { 1143 | "cell_type": "code", 1144 | "execution_count": 41, 1145 | "metadata": {}, 1146 | "outputs": [ 1147 | { 1148 | "data": { 1149 | "text/plain": [ 1150 | "Variable containing:\n", 1151 | " 4.7655\n", 1152 | "[torch.FloatTensor of size 1]" 1153 | ] 1154 | }, 1155 | "execution_count": 41, 1156 | "metadata": {}, 1157 | "output_type": "execute_result" 1158 | } 1159 | ], 1160 | "source": [ 1161 | "get_loss(decoder_outputs, Variable(tag_tensors, requires_grad=False), loss_fn)" 1162 | ] 1163 | }, 1164 | { 1165 | "cell_type": "markdown", 1166 | "metadata": {}, 1167 | "source": [ 1168 | "## Train model" 1169 | ] 1170 | }, 1171 | { 1172 | "cell_type": "code", 1173 | "execution_count": 42, 1174 | "metadata": { 1175 | "collapsed": true 1176 | }, 1177 | "outputs": [], 1178 | "source": [ 1179 | "def train(encoder, decoder, dataloader, num_epochs, history=None):\n", 1180 | " if history is None:\n", 1181 | " history = []\n", 1182 | " cuda = torch.cuda.is_available()\n", 1183 | " if cuda:\n", 1184 | " encoder.cuda()\n", 1185 | " decoder.cuda()\n", 1186 | " optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()))\n", 1187 | " loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)\n", 1188 | " for i in range(num_epochs):\n", 1189 | " per_epoch_losses = []\n", 1190 | " for batch in dataloader:\n", 1191 | " word_tensors = Variable(batch[0], requires_grad=False)\n", 1192 | " char_tensors = Variable(batch[1], requires_grad=False)\n", 1193 | " tag_tensors = Variable(batch[2], requires_grad=False)\n", 1194 | " seq_len = Variable(batch[3], requires_grad=False)\n", 1195 | " if cuda:\n", 1196 | " word_tensors = word_tensors.cuda()\n", 1197 | " char_tensors = char_tensors.cuda()\n", 1198 | " 
tag_tensors = tag_tensors.cuda()\n", 1199 | " optimizer.zero_grad()\n", 1200 | " encoding = encoder(word_tensors, char_tensors)\n", 1201 | " outputs = decoder(encoding)\n", 1202 | " loss = get_loss(outputs, tag_tensors, loss_fn)\n", 1203 | " loss.backward()\n", 1204 | " optimizer.step()\n", 1205 | " per_epoch_losses.append(loss.data[0])\n", 1206 | " history.append(np.mean(per_epoch_losses))\n", 1207 | " print('epoch[%d] loss: %.4f' % (i, loss.data[0]))\n", 1208 | " return history " 1209 | ] 1210 | }, 1211 | { 1212 | "cell_type": "code", 1213 | "execution_count": 43, 1214 | "metadata": { 1215 | "collapsed": true 1216 | }, 1217 | "outputs": [], 1218 | "source": [ 1219 | "char_model = CharCNN()\n", 1220 | "word_model = WordEmbeddings(char_model)\n", 1221 | "id_cnn = IDCNNEncoder(word_model)\n", 1222 | "id_cnn_decoder = IDCNNDecoder(15, len(TAG_VOCAB))\n", 1223 | "history = None" 1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "code", 1228 | "execution_count": 44, 1229 | "metadata": {}, 1230 | "outputs": [ 1231 | { 1232 | "name": "stdout", 1233 | "output_type": "stream", 1234 | "text": [ 1235 | "epoch[0] loss: 4.8157\n", 1236 | "epoch[1] loss: 4.8139\n", 1237 | "epoch[2] loss: 4.8121\n", 1238 | "epoch[3] loss: 4.8102\n", 1239 | "epoch[4] loss: 4.8084\n", 1240 | "epoch[5] loss: 4.8066\n", 1241 | "epoch[6] loss: 4.8048\n", 1242 | "epoch[7] loss: 4.8030\n", 1243 | "epoch[8] loss: 4.8011\n", 1244 | "epoch[9] loss: 4.7993\n" 1245 | ] 1246 | } 1247 | ], 1248 | "source": [ 1249 | "history = train(id_cnn, id_cnn_decoder, train_loader, 10, history=history)" 1250 | ] 1251 | }, 1252 | { 1253 | "cell_type": "code", 1254 | "execution_count": null, 1255 | "metadata": { 1256 | "collapsed": true 1257 | }, 1258 | "outputs": [], 1259 | "source": [] 1260 | }, 1261 | { 1262 | "cell_type": "code", 1263 | "execution_count": null, 1264 | "metadata": { 1265 | "collapsed": true 1266 | }, 1267 | "outputs": [], 1268 | "source": [] 1269 | } 1270 | ], 1271 | "metadata": { 1272 | "kernelspec": { 
1273 | "display_name": "Python [default]", 1274 | "language": "python", 1275 | "name": "python3" 1276 | }, 1277 | "language_info": { 1278 | "codemirror_mode": { 1279 | "name": "ipython", 1280 | "version": 3 1281 | }, 1282 | "file_extension": ".py", 1283 | "mimetype": "text/x-python", 1284 | "name": "python", 1285 | "nbconvert_exporter": "python", 1286 | "pygments_lexer": "ipython3", 1287 | "version": "3.5.3" 1288 | }, 1289 | "toc": { 1290 | "nav_menu": { 1291 | "height": "156px", 1292 | "width": "160px" 1293 | }, 1294 | "number_sections": true, 1295 | "sideBar": true, 1296 | "skip_h1_title": false, 1297 | "toc_cell": false, 1298 | "toc_position": {}, 1299 | "toc_section_display": "block", 1300 | "toc_window_display": false 1301 | } 1302 | }, 1303 | "nbformat": 4, 1304 | "nbformat_minor": 2 1305 | } 1306 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /PyTorch CONLL 2000 Chunking.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import matplotlib 7 | matplotlib.use("Agg") 8 | import torch 9 | from torch.autograd import Variable 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | 14 | torch.manual_seed(1) 15 | 16 | import numpy as np 17 | 18 | from tqdm import tqdm 19 | 20 | import pandas as pd 21 | 22 | import matplotlib.pyplot as plt 23 | import seaborn as sns 24 | 25 | from pytorch_utils import * 26 | from pytorch_models import * 27 | from utils import load_sequences, conll_classification_report_to_df 28 | from conlleval import main as conll_eval 29 | import re 30 | 31 | import io 32 | from pathlib import Path 33 | 34 | sns.set_context("poster") 35 | sns.set_style("ticks") 36 | 37 | 38 | # In[2]: 39 | 40 | 
TRAIN_CORPUS="data/conll2000/train.txt" 41 | TEST_CORPUS="data/conll2000/test.txt" 42 | 43 | 44 | # In[3]: 45 | 46 | train_corpus = load_sequences(TRAIN_CORPUS, sep=" ", col_ids=(0, -1)) 47 | train_corpus, dev_corpus = train_corpus[100:], train_corpus[:100] 48 | print("Total items in train corpus: %s" % len(train_corpus)) 49 | print("Total items in dev corpus: %s" % len(dev_corpus)) 50 | test_corpus = load_sequences(TEST_CORPUS, sep=" ", col_ids=(0, -1)) 51 | print("Total items in test corpus: %s" % len(test_corpus)) 52 | 53 | 54 | # In[4]: 55 | 56 | train_corpus[0] 57 | 58 | 59 | # In[5]: 60 | 61 | def create_vocab(data, vocabs, char_vocab, word_idx=0): 62 | n_vocabs = len(vocabs) 63 | for sent in data: 64 | for token_tags in sent: 65 | for vocab_id in range(n_vocabs): 66 | vocabs[vocab_id].add(token_tags[vocab_id]) 67 | char_vocab.batch_add(token_tags[word_idx]) 68 | print("Created vocabs: %s, chars[%s]" % (", ".join( 69 | "{}[{}]".format(vocab.name, vocab.size) 70 | for vocab in vocabs 71 | ), char_vocab.size)) 72 | 73 | 74 | # In[6]: 75 | 76 | word_vocab = Vocab("words", UNK="UNK", lower=True) 77 | char_vocab = Vocab("chars", UNK="", lower=False) 78 | chunk_vocab = Vocab("chunk_tags", lower=False) 79 | 80 | create_vocab(train_corpus+dev_corpus+test_corpus, [word_vocab, chunk_vocab], char_vocab) 81 | 82 | 83 | # In[7]: 84 | 85 | def data2tensors(data, vocabs, char_vocab, word_idx=0, column_ids=(0, -1)): 86 | vocabs = [vocabs[idx] for idx in column_ids] 87 | n_vocabs = len(vocabs) 88 | tensors = [] 89 | char_tensors = [] 90 | for sent in data: 91 | sent_vecs = [[] for i in range(n_vocabs+1)] # Last is for char vecs 92 | char_vecs = [] 93 | for token_tags in sent: 94 | vocab_id = 0 # First column is the word 95 | # lowercase the word 96 | sent_vecs[vocab_id].append( 97 | vocabs[vocab_id].getidx(token_tags[vocab_id].lower()) 98 | ) 99 | for vocab_id in range(1, n_vocabs): 100 | sent_vecs[vocab_id].append( 101 | vocabs[vocab_id].getidx(token_tags[vocab_id]) 102 | ) 
def load_word_vectors(vector_file, ndims, vocab, cache_file, override_cache=False):
    """Return an L2-normalised embedding matrix for ``vocab``, using a cache.

    Args:
        vector_file: path to a whitespace-separated text file with one entry
            per line: the word followed by ``ndims`` numbers.
        ndims: expected number of dimensions of every vector.
        vocab: object exposing ``size``, ``offset`` and ``getidx(word)``.
            Rows whose index is below ``vocab.offset`` are never filled
            (presumably reserved entries such as UNK -- confirm against Vocab).
        cache_file: path of the ``.npy`` cache. If it exists and
            ``override_cache`` is false its content is returned as-is.
        override_cache: when true, ignore any existing cache and rebuild it.

    Returns:
        A ``float32`` array of shape ``(vocab.size, ndims)`` when built from
        ``vector_file``; whatever array the cache holds otherwise.

    Raises:
        ValueError: if a non-empty line does not have ``ndims + 1`` fields.
    """
    cache_file = Path(cache_file)
    # Check the cache first so no matrix is allocated when it is not needed.
    if cache_file.is_file() and not override_cache:
        return np.load(cache_file)

    W = np.zeros((vocab.size, ndims), dtype="float32")
    total, found = 0, 0
    # Explicit encoding: the platform default may not be able to decode
    # non-ASCII tokens in the vector file.
    with open(vector_file, encoding="utf-8") as fp:
        for line in fp:
            fields = line.strip().split()
            if not fields:
                continue
            total += 1
            if len(fields) != ndims + 1:
                # Raised explicitly (not asserted) so the check survives -O.
                raise ValueError(
                    "{} vector dims {} doesn't match ndims={}".format(
                        fields[0], len(fields) - 1, ndims)
                )
            idx = vocab.getidx(fields[0])
            if idx >= vocab.offset:
                found += 1
                # Accumulate: several file entries may map to the same index.
                W[idx, :] += np.array(fields[1:], dtype="float64")

    coverage = found * 100 / vocab.size if vocab.size else 0.0
    print("Found {} [{:.2f}%] vectors from {} vectors in {} with ndims={}".format(
        found, coverage, total, vector_file, ndims))

    # Normalise every non-zero row to unit length; all-zero rows stay zero.
    norms = np.sqrt((W * W).sum(axis=1))
    nonzero = norms != 0
    W[nonzero] /= norms[nonzero, np.newaxis]

    print("Caching embedding with shape {} to {}".format(W.shape, cache_file.as_posix()))
    # Save through a file handle: np.save(path, ...) appends ".npy" to names
    # lacking it, which would make the cache check above miss forever.
    with cache_file.open("wb") as cache_fp:
        np.save(cache_fp, W)
    return W
def plot_losses(train_losses, eval_losses=None, plot_std=False, ax=None):
    """Plot per-epoch mean losses, optionally with a +/- one S.D. band.

    Args:
        train_losses: sequence of ``(mean, std)`` pairs, one per epoch.
        eval_losses: same structure for the evaluation set. ``None`` or an
            empty sequence means there is no evaluation curve to draw.
        plot_std: when true, shade ``mean - std`` to ``mean + std``.
        ax: matplotlib axes to draw on; defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()
    series = (
        (train_losses, "0.5", "Train"),
        (eval_losses, "r", "Eval"),
    )
    for losses, color, label in series:
        # Skip missing curves instead of crashing on zip(*None) or on
        # unpacking an empty zip. len() is used rather than truthiness
        # because the losses may be passed as a numpy array.
        if losses is None or len(losses) == 0:
            continue
        mean_loss, std_loss = (np.array(column) for column in zip(*losses))
        ax.plot(
            mean_loss, color=color, label=label,
            linestyle="-",
        )
        if plot_std:
            ax.fill_between(
                np.arange(mean_loss.shape[0]),
                mean_loss - std_loss,
                mean_loss + std_loss,
                color=color,
                alpha=0.3,
            )
    ax.set_xlabel("Epochs")
    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r"Mean Loss ($\pm$ S.D.)")
class ModelWrapper(object):
    """Base class tying a model to its loss and to train/predict helpers.

    Subclasses must implement ``_process_instance_tensors``,
    ``get_instance_loss`` and ``predict``.
    """

    def __init__(self, model,
                 loss_function,
                 use_cuda=False
                 ):
        """Store the model and loss; move the model to the GPU if requested."""
        self.model = model
        self.loss_function = loss_function

        self.use_cuda = use_cuda
        if self.use_cuda:
            self.model.cuda()

    def _process_instance_tensors(self, instance_tensors):
        """Convert one raw instance into model inputs (subclass hook)."""
        raise NotImplementedError("Please define this function explicitly")

    def zero_grad(self):
        """Clear the gradients accumulated on the wrapped model."""
        self.model.zero_grad()

    def get_parameters(self):
        """Return the wrapped model's parameters."""
        # Fixed: previously called the misspelled ``paramerters``.
        return self.model.parameters()

    def set_model_mode(self, training_mode=True):
        """Put the model in training mode, or in eval mode when false."""
        if training_mode:
            self.model.train()
        else:
            self.model.eval()

    def save(self, filename):
        """Serialise the whole model object to ``filename``."""
        torch.save(self.model, filename)
        print("{} model saved to {}".format(self.model.__class__, filename))

    def load(self, filename):
        """Replace the wrapped model with the one stored in ``filename``."""
        # NOTE(review): torch.load unpickles the file; only load model files
        # from trusted sources.
        self.model = torch.load(filename)
        if self.use_cuda:
            self.model.cuda()

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return the loss for one instance (subclass hook).

        The base implementation only clears gradients when ``zero_grad`` is
        true and then raises ``NotImplementedError``.
        """
        # Fixed: previously tested the undefined name ``zero_grads``.
        if zero_grad:
            ## Clear gradients before every update else memory runs out
            self.zero_grad()
        raise NotImplementedError("Please define this function explicitly")

    def predict(self, instance_tensors):
        """Return the prediction for one instance (subclass hook)."""
        raise NotImplementedError("Please define this function explicitly")

    def predict_batch(self, batch_tensors):
        """Return ``predict(instance)`` for every instance, in order."""
        return [
            self.predict(instance_tensors)
            for instance_tensors in batch_tensors
        ]
def write_losses(losses, fp, title="train", epoch=0):
    """Write one line per batch loss, then a summary line, to ``fp``.

    Args:
        losses: sequence of per-batch loss values for a single epoch.
        fp: writable text stream that receives the log lines.
        title: label placed at the start of every line (e.g. "train").
        epoch: epoch number reported on every line.
    """
    batch_template = "{:<10} epoch={:<3} batch={:<5} loss={:<10}"
    summary_template = "{:<10} epoch={:<3} {:<11} mean={:<10.3f} std={:<10.3f}"
    for batch_idx, batch_loss in enumerate(losses):
        batch_line = batch_template.format(title, epoch, batch_idx, batch_loss)
        print(batch_line, file=fp)
    # Summary statistics over all batches of the epoch.
    mean_loss = np.mean(losses)
    std_loss = np.std(losses)
    summary_line = summary_template.format(
        title, epoch, "overall", mean_loss, std_loss
    )
    print(summary_line, file=fp)
class LSTMTaggerModel(ModelWrapper):
    """ModelWrapper for a tagger scored with an external loss function.

    Each instance is unpacked as ``(X, Y, X_char)``; the model is called
    with ``(X, X_char)`` and its output is compared against ``Y``.
    """

    def __init__(self, model,
                 loss_function,
                 use_cuda=False):
        """Store the model and loss; move the model to the GPU if requested."""
        self.model = model
        self.loss_function = loss_function
        self.use_cuda = use_cuda
        if use_cuda:
            self.model.cuda()

    def _process_instance_tensors(self, instance_tensors):
        """Turn one raw instance into ``(X, X_char, Y)`` model inputs."""
        word_ids, tag_ids, char_ids = instance_tensors
        # The word ids are wrapped in an extra list; the tag ids are not.
        words = Variable(torch.LongTensor([word_ids]), requires_grad=False)
        tags = Variable(torch.LongTensor(tag_ids), requires_grad=False)
        chars = charseq2varlist(char_ids)
        if self.use_cuda:
            words = words.cuda()
            tags = tags.cuda()
            chars = [char_var.cuda() for char_var in chars]
        return words, chars, tags

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return ``loss_function(model(X, X_char), Y)`` for one instance."""
        if zero_grad:
            ## Clear gradients before every update else memory runs out
            self.model.zero_grad()
        words, chars, tags = self._process_instance_tensors(instance_tensors)
        scores = self.model.forward(words, chars)
        return self.loss_function(scores, tags)

    def predict(self, instance_tensors):
        """Return the arg-max index along dim 1 of the model output."""
        words, chars, _ = self._process_instance_tensors(instance_tensors)
        scores = self.model.forward(words, chars)
        # max(1) returns (values, indices); keep the indices as a flat array.
        best_indices = scores.data.cpu().max(1)[1]
        return best_indices.numpy().ravel()
class BiLSTMTaggerWordCRFModel(ModelWrapper):
    """ModelWrapper for a tagger whose model computes its own loss.

    The loss comes from ``model.loss`` and predictions from ``model.crf``,
    so the ``loss_function`` argument is accepted only for signature
    compatibility and is ignored.
    """

    def __init__(self, model,
                 loss_function,
                 use_cuda=False):
        """Store the model; ``loss_function`` is discarded (set to None)."""
        self.model = model
        self.loss_function = None
        self.use_cuda = use_cuda
        if use_cuda:
            self.model.cuda()

    def _process_instance_tensors(self, instance_tensors):
        """Turn one raw instance into ``(X, X_char, Y)`` model inputs."""
        word_ids, tag_ids, char_ids = instance_tensors
        words = Variable(torch.LongTensor([word_ids]), requires_grad=False)
        # The tags stay a plain LongTensor here (not wrapped in a Variable).
        tags = torch.LongTensor(tag_ids)
        chars = charseq2varlist(char_ids)
        if self.use_cuda:
            words = words.cuda()
            tags = tags.cuda()
            chars = [char_var.cuda() for char_var in chars]
        return words, chars, tags

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return ``model.loss(X, X_char, Y)`` for one instance."""
        if zero_grad:
            ## Clear gradients before every update else memory runs out
            self.model.zero_grad()
        words, chars, tags = self._process_instance_tensors(instance_tensors)
        return self.model.loss(words, chars, tags)

    def predict(self, instance_tensors):
        """Return element 1 of ``model.crf.forward`` on the model output."""
        words, chars, _ = self._process_instance_tensors(instance_tensors)
        emissions = self.model.forward(words, chars)
        crf_output = self.model.crf.forward(emissions)
        return crf_output[1]
610 | training_history = training_wrapper( 611 | model_wrapper, train_tensors, 612 | eval_tensors=dev_tensors, 613 | optimizer=optim.Adam, 614 | optimizer_kwargs={ 615 | #"lr": 0.01, 616 | "weight_decay": 0 617 | }, 618 | n_epochs=n_epochs, 619 | batch_size=batch_size, 620 | use_cuda=use_cuda, 621 | log_file="BiLSTMTaggerWordCRFModel_CONLL2000.log" 622 | ) 623 | model_wrapper.save("BiLSTMTaggerWordCRFModel_CONLL2000") 624 | 625 | 626 | # In[34]: 627 | 628 | fig, ax = plt.subplots(1,1) 629 | plot_losses(training_history["training_loss"], 630 | training_history["evaluation_loss"], 631 | plot_std=True, 632 | ax=ax) 633 | ax.legend() 634 | sns.despine(offset=5) 635 | plt.savefig("BiLSTMTaggerWordCRFModel_CONLL2000.pdf") 636 | 637 | # Performance may improve by creating all the torch tensors upfront and then pinning them to memory 638 | 639 | 640 | # In[35]: 641 | 642 | for title, tensors, corpus in zip( 643 | ["train", "dev", "test"], 644 | [train_tensors, dev_tensors, test_tensors], 645 | [train_corpus, dev_corpus, test_corpus], 646 | ): 647 | get_ipython().magic(u'time predictions = model_wrapper.predict_batch(tensors)') 648 | print_predictions(corpus, predictions, "%s.chunking.conll" % title, chunk_vocab) 649 | conll_eval(["conlleval", "%s.chunking.conll" % title]) 650 | 651 | 652 | -------------------------------------------------------------------------------- /Pytorch Gradient reversal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gradient reversal pytorch\n", 8 | "\n", 9 | "Inspired from the following tweets:\n", 10 | "\n", 11 | "* https://twitter.com/mat_kelcey/status/932149793765261313\n", 12 | "* https://twitter.com/ericjang11/status/932073259721359363\n", 13 | "\n", 14 | "Basic idea:\n", 15 | "\n", 16 | "```python\n", 17 | "# Add something to gradient\n", 18 | "f(x) + g(x) - tf.stop_gradients(g(x))\n", 19 | "\n", 20 | 
"# Reverse gradient\n", 21 | "tf.stop_gradient(f(x)*2) - f(x)\n", 22 | "```" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import torch\n", 32 | "import tensorflow as tf\n", 33 | "from torch.autograd import Variable\n", 34 | "\n", 35 | "import numpy as np" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "def f(X):\n", 47 | " return X*X\n", 48 | "\n", 49 | "def g(X):\n", 50 | " return X**3" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "array([ 0.01995021, -0.32892969, 0.75804777, 0.172995 , 0.69747771,\n", 62 | " 1.11414039, -0.69194092, 2.43364877, 0.92732815, -0.91409348])" 63 | ] 64 | }, 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "X = np.random.randn(10)\n", 72 | "X" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Tensorflow implementation" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "sess = tf.InteractiveSession()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "tf_X = tf.Variable(X)\n", 100 | "init_op = tf.global_variables_initializer()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 6, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "array([ 0.01995021, -0.32892969, 0.75804777, 0.172995 , 0.69747771,\n", 112 | " 1.11414039, -0.69194092, 2.43364877, 0.92732815, -0.91409348])" 113 | ] 114 | }, 115 | "execution_count": 6, 116 | 
"metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "sess.run(init_op)\n", 122 | "sess.run(tf_X)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 7, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "forward_op = f(tf_X)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 8, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "array([ 3.98010770e-04, 1.08194738e-01, 5.74636421e-01,\n", 145 | " 2.99272683e-02, 4.86475162e-01, 1.24130881e+00,\n", 146 | " 4.78782241e-01, 5.92264633e+00, 8.59937506e-01,\n", 147 | " 8.35566890e-01])" 148 | ] 149 | }, 150 | "execution_count": 8, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "sess.run(forward_op)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 9, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "gradient_op = tf.gradients(forward_op, tf_X)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 10, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "[array([ 0.03990041, -0.65785937, 1.51609554, 0.34598999, 1.39495543,\n", 179 | " 2.22828078, -1.38388185, 4.86729754, 1.85465631, -1.82818696])]" 180 | ] 181 | }, 182 | "execution_count": 10, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "sess.run(gradient_op)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 11, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "array([ 0.03990041, -0.65785937, 1.51609554, 0.34598999, 1.39495543,\n", 200 | " 2.22828078, -1.38388185, 4.86729754, 1.85465631, -1.82818696])" 201 | ] 202 | }, 203 | "execution_count": 11, 204 | "metadata": {}, 
205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "X*2 # This should match the gradient above" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "### Modify the gradients\n", 217 | "Keep forward pass the same. \n", 218 | "The trick is to add $g(x)$, such that $g'(x)$ is the gradient modifier, during the forward pass and subtract it as well. But stop gradients from flowing through the subtraction part. \n", 219 | "\n", 220 | "$f(x) + g(x) - g(x)$ would normally lead to gradients $f'(x) + g'(x) - g'(x)$. Since gradients don't flow through the subtracted term, the $-g'(x)$ contribution vanishes and we get new gradients $f'(x) + g'(x)$" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 12, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "gradient_modifier_op = g(tf_X)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 13, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "array([ 7.94039737e-06, -3.55884610e-02, 4.35601858e-01,\n", 243 | " 5.17726764e-03, 3.39305584e-01, 1.38299228e+00,\n", 244 | " -3.31289026e-01, 1.44136410e+01, 7.97444260e-01,\n", 245 | " -7.63786246e-01])" 246 | ] 247 | }, 248 | "execution_count": 13, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "sess.run(gradient_modifier_op)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 14, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "modified_forward_op = (f(tf_X) + g(tf_X) - tf.stop_gradient(g(tf_X)))\n", 266 | "modified_backward_op = tf.gradients(modified_forward_op, tf_X)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 15, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "array([ 3.98010770e-04, 1.08194738e-01, 
5.74636421e-01,\n", 278 | " 2.99272683e-02, 4.86475162e-01, 1.24130881e+00,\n", 279 | " 4.78782241e-01, 5.92264633e+00, 8.59937506e-01,\n", 280 | " 8.35566890e-01])" 281 | ] 282 | }, 283 | "execution_count": 15, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "sess.run(modified_forward_op)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 16, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "[array([ 0.04109445, -0.33327516, 3.2400048 , 0.4357718 ,\n", 301 | " 2.85438092, 5.95220721, 0.05246488, 22.63523654,\n", 302 | " 4.43446883, 0.67851371])]" 303 | ] 304 | }, 305 | "execution_count": 16, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "sess.run(modified_backward_op)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 17, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "array([ 0.04109445, -0.33327516, 3.2400048 , 0.4357718 ,\n", 323 | " 2.85438092, 5.95220721, 0.05246488, 22.63523654,\n", 324 | " 4.43446883, 0.67851371])" 325 | ] 326 | }, 327 | "execution_count": 17, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "2*X + 3*(X**2) # This should match the gradients above" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### Gradient reversal\n", 341 | "\n", 342 | "Here the modifying function $g(x)$ is simply the $-2*f(x)$, this will make the gradients $-f'(x)$." 
343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 18, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "gradient_reversal_op = (tf.stop_gradient(2*f(tf_X)) - f(tf_X))\n", 352 | "gradient_reversal_grad_op = tf.gradients(gradient_reversal_op, tf_X)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 19, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": [ 363 | "array([ 3.98010770e-04, 1.08194738e-01, 5.74636421e-01,\n", 364 | " 2.99272683e-02, 4.86475162e-01, 1.24130881e+00,\n", 365 | " 4.78782241e-01, 5.92264633e+00, 8.59937506e-01,\n", 366 | " 8.35566890e-01])" 367 | ] 368 | }, 369 | "execution_count": 19, 370 | "metadata": {}, 371 | "output_type": "execute_result" 372 | } 373 | ], 374 | "source": [ 375 | "sess.run(gradient_reversal_op)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 20, 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "text/plain": [ 386 | "[array([-0.03990041, 0.65785937, -1.51609554, -0.34598999, -1.39495543,\n", 387 | " -2.22828078, 1.38388185, -4.86729754, -1.85465631, 1.82818696])]" 388 | ] 389 | }, 390 | "execution_count": 20, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "sess.run(gradient_reversal_grad_op)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 21, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/plain": [ 407 | "array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])" 408 | ] 409 | }, 410 | "execution_count": 21, 411 | "metadata": {}, 412 | "output_type": "execute_result" 413 | } 414 | ], 415 | "source": [ 416 | "sess.run((gradient_op[0] + gradient_reversal_grad_op[0])) # This should be zero. Signifying grad is reversed. 
" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "## PyTorch case" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 22, 429 | "metadata": { 430 | "collapsed": true 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "def zero_grad(X):\n", 435 | " if X.grad is not None:\n", 436 | " X.grad.data.zero_()" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 23, 442 | "metadata": { 443 | "collapsed": true 444 | }, 445 | "outputs": [], 446 | "source": [ 447 | "torch_X = Variable(torch.FloatTensor(X), requires_grad=True)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 24, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "array([ 0.01995021, -0.32892969, 0.75804776, 0.172995 , 0.6974777 ,\n", 459 | " 1.11414039, -0.6919409 , 2.43364882, 0.92732817, -0.91409349], dtype=float32)" 460 | ] 461 | }, 462 | "execution_count": 24, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "torch_X.data.numpy()" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 25, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "data": { 478 | "text/plain": [ 479 | "array([ 3.98010772e-04, 1.08194746e-01, 5.74636400e-01,\n", 480 | " 2.99272705e-02, 4.86475140e-01, 1.24130881e+00,\n", 481 | " 4.78782207e-01, 5.92264652e+00, 8.59937549e-01,\n", 482 | " 8.35566938e-01], dtype=float32)" 483 | ] 484 | }, 485 | "execution_count": 25, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "f(torch_X).data.numpy()" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 26, 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "data": { 501 | "text/plain": [ 502 | "array([ 7.94039715e-06, -3.55884619e-02, 4.35601830e-01,\n", 503 | " 5.17726783e-03, 3.39305550e-01, 1.38299227e+00,\n", 
504 | " -3.31288993e-01, 1.44136410e+01, 7.97444284e-01,\n", 505 | " -7.63786316e-01], dtype=float32)" 506 | ] 507 | }, 508 | "execution_count": 26, 509 | "metadata": {}, 510 | "output_type": "execute_result" 511 | } 512 | ], 513 | "source": [ 514 | "g(torch_X).data.numpy()" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 27, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "array([ 0.03990041, -0.65785939, 1.51609552, 0.34599 , 1.3949554 ,\n", 526 | " 2.22828078, -1.38388181, 4.86729765, 1.85465634, -1.82818699], dtype=float32)" 527 | ] 528 | }, 529 | "execution_count": 27, 530 | "metadata": {}, 531 | "output_type": "execute_result" 532 | } 533 | ], 534 | "source": [ 535 | "zero_grad(torch_X)\n", 536 | "f_X = f(torch_X)\n", 537 | "f_X.backward(torch.ones(f_X.size()))\n", 538 | "torch_X.grad.data.numpy()" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 28, 544 | "metadata": {}, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": [ 549 | "array([ 0.03990041, -0.65785937, 1.51609554, 0.34598999, 1.39495543,\n", 550 | " 2.22828078, -1.38388185, 4.86729754, 1.85465631, -1.82818696])" 551 | ] 552 | }, 553 | "execution_count": 28, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "2*X" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "### Modify gradients" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 29, 572 | "metadata": { 573 | "collapsed": true 574 | }, 575 | "outputs": [], 576 | "source": [ 577 | "modified_gradients_forward = lambda x: f(x) + g(x) - g(x).detach()" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 30, 583 | "metadata": {}, 584 | "outputs": [ 585 | { 586 | "data": { 587 | "text/plain": [ 588 | "array([ 0.04109445, -0.33327514, 3.24000454, 0.43577182,\n", 589 | " 2.85438085, 
5.95220757, 0.05246484, 22.63523865,\n", 590 | " 4.43446875, 0.67851377], dtype=float32)" 591 | ] 592 | }, 593 | "execution_count": 30, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "zero_grad(torch_X)\n", 600 | "modified_grad = modified_gradients_forward(torch_X)\n", 601 | "modified_grad.backward(torch.ones(modified_grad.size()))\n", 602 | "torch_X.grad.data.numpy()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 31, 608 | "metadata": {}, 609 | "outputs": [ 610 | { 611 | "data": { 612 | "text/plain": [ 613 | "array([ 0.04109445, -0.33327516, 3.2400048 , 0.4357718 ,\n", 614 | " 2.85438092, 5.95220721, 0.05246488, 22.63523654,\n", 615 | " 4.43446883, 0.67851371])" 616 | ] 617 | }, 618 | "execution_count": 31, 619 | "metadata": {}, 620 | "output_type": "execute_result" 621 | } 622 | ], 623 | "source": [ 624 | "2*X + 3*(X*X) # It should be same as above" 625 | ] 626 | }, 627 | { 628 | "cell_type": "markdown", 629 | "metadata": {}, 630 | "source": [ 631 | "### Gradient reversal" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 32, 637 | "metadata": { 638 | "collapsed": true 639 | }, 640 | "outputs": [], 641 | "source": [ 642 | "gradient_reversal = lambda x: (2*f(x)).detach() - f(x)" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 33, 648 | "metadata": {}, 649 | "outputs": [ 650 | { 651 | "data": { 652 | "text/plain": [ 653 | "array([-0.03990041, 0.65785939, -1.51609552, -0.34599 , -1.3949554 ,\n", 654 | " -2.22828078, 1.38388181, -4.86729765, -1.85465634, 1.82818699], dtype=float32)" 655 | ] 656 | }, 657 | "execution_count": 33, 658 | "metadata": {}, 659 | "output_type": "execute_result" 660 | } 661 | ], 662 | "source": [ 663 | "zero_grad(torch_X)\n", 664 | "grad_reverse = gradient_reversal(torch_X)\n", 665 | "grad_reverse.backward(torch.ones(grad_reverse.size()))\n", 666 | "torch_X.grad.data.numpy()" 667 | ] 668 | }, 669 | { 670 
| "cell_type": "code", 671 | "execution_count": 34, 672 | "metadata": {}, 673 | "outputs": [ 674 | { 675 | "data": { 676 | "text/plain": [ 677 | "array([-0.03990041, 0.65785937, -1.51609554, -0.34598999, -1.39495543,\n", 678 | " -2.22828078, 1.38388185, -4.86729754, -1.85465631, 1.82818696])" 679 | ] 680 | }, 681 | "execution_count": 34, 682 | "metadata": {}, 683 | "output_type": "execute_result" 684 | } 685 | ], 686 | "source": [ 687 | "-2*X # It should be same as above" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "### Pytorch backward hooks" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 35, 700 | "metadata": {}, 701 | "outputs": [ 702 | { 703 | "data": { 704 | "text/plain": [ 705 | "array([-0.03990041, 0.65785939, -1.51609552, -0.34599 , -1.3949554 ,\n", 706 | " -2.22828078, 1.38388181, -4.86729765, -1.85465634, 1.82818699], dtype=float32)" 707 | ] 708 | }, 709 | "execution_count": 35, 710 | "metadata": {}, 711 | "output_type": "execute_result" 712 | } 713 | ], 714 | "source": [ 715 | "# Gradient reversal\n", 716 | "zero_grad(torch_X)\n", 717 | "f_X = f(torch_X)\n", 718 | "f_X.register_hook(lambda grad: -grad)\n", 719 | "f_X.backward(torch.ones(f_X.size()))\n", 720 | "torch_X.grad.data.numpy()" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 36, 726 | "metadata": {}, 727 | "outputs": [ 728 | { 729 | "data": { 730 | "text/plain": [ 731 | "array([-0.03990041, 0.65785937, -1.51609554, -0.34598999, -1.39495543,\n", 732 | " -2.22828078, 1.38388185, -4.86729754, -1.85465631, 1.82818696])" 733 | ] 734 | }, 735 | "execution_count": 36, 736 | "metadata": {}, 737 | "output_type": "execute_result" 738 | } 739 | ], 740 | "source": [ 741 | "-2*X" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": 37, 747 | "metadata": {}, 748 | "outputs": [ 749 | { 750 | "data": { 751 | "text/plain": [ 752 | "array([ 0.04109445, -0.33327514, 
3.24000454, 0.43577182,\n", 753 | " 2.85438085, 5.95220757, 0.05246484, 22.63523865,\n", 754 | " 4.43446875, 0.67851377], dtype=float32)" 755 | ] 756 | }, 757 | "execution_count": 37, 758 | "metadata": {}, 759 | "output_type": "execute_result" 760 | } 761 | ], 762 | "source": [ 763 | "# Modified grad example\n", 764 | "zero_grad(torch_X)\n", 765 | "h = torch_X.register_hook(lambda grad: grad + 3*(torch_X*torch_X))\n", 766 | "f_X = f(torch_X)\n", 767 | "f_X.backward(torch.ones(f_X.size()))\n", 768 | "h.remove()\n", 769 | "torch_X.grad.data.numpy()" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": 38, 775 | "metadata": {}, 776 | "outputs": [ 777 | { 778 | "data": { 779 | "text/plain": [ 780 | "array([ 0.04109445, -0.33327516, 3.2400048 , 0.4357718 ,\n", 781 | " 2.85438092, 5.95220721, 0.05246488, 22.63523654,\n", 782 | " 4.43446883, 0.67851371])" 783 | ] 784 | }, 785 | "execution_count": 38, 786 | "metadata": {}, 787 | "output_type": "execute_result" 788 | } 789 | ], 790 | "source": [ 791 | "2*X + 3*(X*X) # It should be same as above" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": { 798 | "collapsed": true 799 | }, 800 | "outputs": [], 801 | "source": [] 802 | } 803 | ], 804 | "metadata": { 805 | "kernelspec": { 806 | "display_name": "Python [default]", 807 | "language": "python", 808 | "name": "python3" 809 | }, 810 | "language_info": { 811 | "codemirror_mode": { 812 | "name": "ipython", 813 | "version": 3 814 | }, 815 | "file_extension": ".py", 816 | "mimetype": "text/x-python", 817 | "name": "python", 818 | "nbconvert_exporter": "python", 819 | "pygments_lexer": "ipython3", 820 | "version": "3.6.1" 821 | } 822 | }, 823 | "nbformat": 4, 824 | "nbformat_minor": 2 825 | } 826 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pytorch-practice 2 | Some 
example scripts on pytorch 3 | 4 | ## CONLL 2000 Chunking task 5 | 6 | Uses BiLSTM CRF loss with char CNN embeddings. To run use: 7 | 8 | ``` 9 | cd data/conll2000 10 | bash get_data.sh 11 | cd ../.. 12 | python chunking_bilstm_crf_char_concat.py # Takes around 8 hours on a Tesla K80 GPU 13 | ``` 14 | 15 | 92.82% mean F1 on test data. 16 | 17 | -------------------------------------------------------------------------------- /Seq_EWC_losses.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/Seq_EWC_losses.pdf -------------------------------------------------------------------------------- /Seq_EWC_predictions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/Seq_EWC_predictions.pdf -------------------------------------------------------------------------------- /Viterbi decoding and CRF.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "np.random.seed(2017)\n", 13 | "\n", 14 | "import torch\n", 15 | "torch.manual_seed(2017)\n", 16 | "\n", 17 | "from scipy.misc import logsumexp # Use it for reference checking implementation" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Emissions:\n", 30 | "[[ 9. 6.]\n", 31 | " [ 13. 10.]\n", 32 | " [ 8. 18.]\n", 33 | " [ 3. 15.]]\n", 34 | "Transitions:\n", 35 | "[[ 7. 8.]\n", 36 | " [ 0.
8.]]\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "seq_length, num_states=4, 2\n", 42 | "emissions = np.random.randint(20, size=(seq_length,num_states))*1.\n", 43 | "transitions = np.random.randint(10, size=(num_states, num_states))*1.\n", 44 | "print(\"Emissions:\", emissions, sep=\"\\n\")\n", 45 | "print(\"Transitions:\", transitions, sep=\"\\n\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "def viterbi_decoding(emissions, transitions):\n", 57 | " # Use help from: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/crf/python/ops/crf.py\n", 58 | " scores = np.zeros_like(emissions)\n", 59 | " back_pointers = np.zeros_like(emissions, dtype=\"int\")\n", 60 | " scores = emissions[0]\n", 61 | " # Generate most likely scores and paths for each step in sequence\n", 62 | " for i in range(1, emissions.shape[0]):\n", 63 | " score_with_transition = np.expand_dims(scores, 1) + transitions\n", 64 | " scores = emissions[i] + score_with_transition.max(axis=0)\n", 65 | " back_pointers[i] = np.argmax(score_with_transition, 0)\n", 66 | " # Generate the most likely path\n", 67 | " viterbi = [np.argmax(scores)]\n", 68 | " for bp in reversed(back_pointers[1:]):\n", 69 | " viterbi.append(bp[viterbi[-1]])\n", 70 | " viterbi.reverse()\n", 71 | " viterbi_score = np.max(scores)\n", 72 | " return viterbi_score, viterbi" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "(78.0, [0, 0, 1, 1])" 84 | ] 85 | }, 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "viterbi_decoding(emissions, transitions)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | 
"def viterbi_decoding_torch(emissions, transitions):\n", 104 | " scores = torch.zeros(emissions.size(1))\n", 105 | " back_pointers = torch.zeros(emissions.size()).int()\n", 106 | " scores = scores + emissions[0]\n", 107 | " # Generate most likely scores and paths for each step in sequence\n", 108 | " for i in range(1, emissions.size(0)):\n", 109 | " scores_with_transitions = scores.unsqueeze(1).expand_as(transitions) + transitions\n", 110 | " max_scores, back_pointers[i] = torch.max(scores_with_transitions, 0)\n", 111 | " scores = emissions[i] + max_scores\n", 112 | " # Generate the most likely path\n", 113 | " viterbi = [scores.numpy().argmax()]\n", 114 | " back_pointers = back_pointers.numpy()\n", 115 | " for bp in reversed(back_pointers[1:]):\n", 116 | " viterbi.append(bp[viterbi[-1]])\n", 117 | " viterbi.reverse()\n", 118 | " viterbi_score = scores.numpy().max()\n", 119 | " return viterbi_score, viterbi\n", 120 | " " 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "(78.0, [0, 0, 1, 1])" 132 | ] 133 | }, 134 | "execution_count": 6, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "viterbi_decoding_torch(torch.Tensor(emissions), torch.Tensor(transitions))" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 7, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "(78.0, [0, 0, 1, 1])" 152 | ] 153 | }, 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "viterbi_decoding(emissions, transitions)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 8, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "def log_sum_exp(vecs, axis=None, keepdims=False):\n", 172 | " ## Use help from: 
https://github.com/scipy/scipy/blob/v0.18.1/scipy/misc/common.py#L20-L140\n", 173 | " max_val = vecs.max(axis=axis, keepdims=True)\n", 174 | " vecs = vecs - max_val\n", 175 | " if not keepdims:\n", 176 | " max_val = max_val.squeeze(axis=axis)\n", 177 | " out_val = np.log(np.exp(vecs).sum(axis=axis, keepdims=keepdims))\n", 178 | " return max_val + out_val" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 9, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "def score_sequence(emissions, transitions, tags):\n", 190 | " # Use help from: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/crf/python/ops/crf.py\n", 191 | " score = emissions[0][tags[0]]\n", 192 | " for i, emission in enumerate(emissions[1:]):\n", 193 | " score = score + transitions[tags[i], tags[i+1]] + emission[tags[i+1]]\n", 194 | " return score" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 10, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "42.0" 206 | ] 207 | }, 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "score_sequence(emissions, transitions, [1,1,0,0])" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 11, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "[7.0, 8.0, 8.0]" 226 | ] 227 | }, 228 | "execution_count": 11, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "correct_seq = [0, 0, 1, 1]\n", 235 | "[transitions[correct_seq[i],correct_seq[i+1]] for i in range(len(correct_seq) -1)]" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 12, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "23.0" 247 | ] 248 | }, 249 | "execution_count": 12, 250 | 
"metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "sum([transitions[correct_seq[i], correct_seq[i+1]] for i in range(len(correct_seq) -1)])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 13, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/plain": [ 266 | "(78.0, [0, 0, 1, 1])" 267 | ] 268 | }, 269 | "execution_count": 13, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "viterbi_decoding(emissions, transitions)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 14, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "78.0" 287 | ] 288 | }, 289 | "execution_count": 14, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "score_sequence(emissions, transitions, [0, 0, 1, 1])" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 15, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "def score_sequence_torch(emissions, transitions, tags):\n", 307 | " score = emissions[0][tags[0]]\n", 308 | " for i, emission in enumerate(emissions[1:]):\n", 309 | " score = score + transitions[tags[i], tags[i+1]] + emission[tags[i+1]]\n", 310 | " return score" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 16, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "78.0" 322 | ] 323 | }, 324 | "execution_count": 16, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "score_sequence_torch(torch.Tensor(emissions), torch.Tensor(transitions), [0, 0, 1, 1])" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 17, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "[[0, 0, 
0, 0],\n", 342 | " [1, 0, 0, 0],\n", 343 | " [0, 1, 0, 0],\n", 344 | " [1, 1, 0, 0],\n", 345 | " [0, 0, 1, 0],\n", 346 | " [1, 0, 1, 0],\n", 347 | " [0, 1, 1, 0],\n", 348 | " [1, 1, 1, 0],\n", 349 | " [0, 0, 0, 1],\n", 350 | " [1, 0, 0, 1],\n", 351 | " [0, 1, 0, 1],\n", 352 | " [1, 1, 0, 1],\n", 353 | " [0, 0, 1, 1],\n", 354 | " [1, 0, 1, 1],\n", 355 | " [0, 1, 1, 1],\n", 356 | " [1, 1, 1, 1]]" 357 | ] 358 | }, 359 | "execution_count": 17, 360 | "metadata": {}, 361 | "output_type": "execute_result" 362 | } 363 | ], 364 | "source": [ 365 | "def get_all_tags(seq_length, num_labels):\n", 366 | " if seq_length == 0:\n", 367 | " yield []\n", 368 | " return\n", 369 | " for sequence in get_all_tags(seq_length-1, num_labels):\n", 370 | " #print(sequence, seq_length)\n", 371 | " for label in range(num_labels):\n", 372 | " yield [label] + sequence \n", 373 | "list(get_all_tags(4,2))" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 18, 379 | "metadata": {}, 380 | "outputs": [ 381 | { 382 | "data": { 383 | "text/plain": [ 384 | "[[0, 0], [0, 1], [1, 0], [1, 1]]" 385 | ] 386 | }, 387 | "execution_count": 18, 388 | "metadata": {}, 389 | "output_type": "execute_result" 390 | } 391 | ], 392 | "source": [ 393 | "def get_all_tags_dp(seq_length, num_labels):\n", 394 | " prior_tags = [[]]\n", 395 | " for i in range(1, seq_length+1):\n", 396 | " new_tags = []\n", 397 | " for label in range(num_labels):\n", 398 | " for tags in prior_tags:\n", 399 | " new_tags.append([label] + tags)\n", 400 | " prior_tags = new_tags\n", 401 | " return new_tags\n", 402 | "list(get_all_tags_dp(2,2))" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 19, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "name": "stdout", 412 | "output_type": "stream", 413 | "text": [ 414 | "[54.0, 67.0, 58.0, 78.0, 45.0, 58.0, 56.0, 76.0, 44.0, 57.0, 48.0, 68.0, 42.0, 55.0, 53.0, 73.0]\n" 415 | ] 416 | } 417 | ], 418 | "source": [ 419 | "def 
brute_force_score(emissions, transitions):\n", 420 | " # This is for ensuring the correctness of the dynamic programming method.\n", 421 | " # DO NOT run with very high values of number of labels or sequence lengths\n", 422 | " for tags in get_all_tags_dp(*emissions.shape):\n", 423 | " yield score_sequence(emissions, transitions, tags)\n", 424 | "\n", 425 | " \n", 426 | "brute_force_sequence_scores = list(brute_force_score(emissions, transitions))\n", 427 | "print(brute_force_sequence_scores)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 20, 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/plain": [ 438 | "78.0" 439 | ] 440 | }, 441 | "execution_count": 20, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "max(brute_force_sequence_scores) # Best score calcuated using brute force" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 21, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "78.132899613126483" 459 | ] 460 | }, 461 | "execution_count": 21, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "log_sum_exp(np.array(brute_force_sequence_scores)) # Partition function" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 22, 473 | "metadata": { 474 | "collapsed": true 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "def forward_algorithm_naive(emissions, transitions):\n", 479 | " scores = emissions[0]\n", 480 | " # Get the log sum exp score\n", 481 | " for i in range(1,emissions.shape[0]):\n", 482 | " print(scores)\n", 483 | " alphas_t = np.zeros_like(scores) # Forward vars at timestep t\n", 484 | " for j in range(emissions.shape[1]):\n", 485 | " emit_score = emissions[i,j]\n", 486 | " trans_score = transitions.T[j]\n", 487 | " next_tag_var = scores + trans_score\n", 488 | " alphas_t[j] = 
log_sum_exp(next_tag_var) + emit_score\n", 489 | " scores = alphas_t\n", 490 | " return log_sum_exp(scores)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 23, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "[ 9. 6.]\n", 503 | "[ 29.0000454 27.04858735]\n", 504 | "[ 44.00017494 55.13288499]\n" 505 | ] 506 | }, 507 | { 508 | "data": { 509 | "text/plain": [ 510 | "78.132899613126483" 511 | ] 512 | }, 513 | "execution_count": 23, 514 | "metadata": {}, 515 | "output_type": "execute_result" 516 | } 517 | ], 518 | "source": [ 519 | "forward_algorithm_naive(emissions, transitions)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 24, 525 | "metadata": { 526 | "collapsed": true 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "def forward_algorithm_vec_check(emissions, transitions):\n", 531 | " # This is for checking the correctedness of log_sum_exp function compared to scipy\n", 532 | " scores = emissions[0]\n", 533 | " scores_naive = emissions[0]\n", 534 | " # Get the log sum exp score\n", 535 | " for i in range(1, emissions.shape[0]):\n", 536 | " print(scores, scores_naive)\n", 537 | " scores = emissions[i] + logsumexp(\n", 538 | " scores_naive + transitions.T,\n", 539 | " axis=1)\n", 540 | " scores_naive = emissions[i] + np.array([log_sum_exp(\n", 541 | " scores_naive + transitions.T[j]) for j in range(emissions.shape[1])])\n", 542 | " print(scores, scores_naive)\n", 543 | " return logsumexp(scores), log_sum_exp(scores_naive)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 25, 549 | "metadata": {}, 550 | "outputs": [ 551 | { 552 | "name": "stdout", 553 | "output_type": "stream", 554 | "text": [ 555 | "[ 9. 6.] [ 9. 
6.]\n", 556 | "[ 29.0000454 27.04858735] [ 29.0000454 27.04858735]\n", 557 | "[ 44.00017494 55.13288499] [ 44.00017494 55.13288499]\n", 558 | "[ 58.14879707 78.13289961] [ 58.14879707 78.13289961]\n" 559 | ] 560 | }, 561 | { 562 | "data": { 563 | "text/plain": [ 564 | "(78.132899613126483, 78.132899613126483)" 565 | ] 566 | }, 567 | "execution_count": 25, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "forward_algorithm_vec_check(emissions, transitions)" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 26, 579 | "metadata": { 580 | "collapsed": true 581 | }, 582 | "outputs": [], 583 | "source": [ 584 | "def forward_algorithm(emissions, transitions):\n", 585 | " scores = emissions[0]\n", 586 | " # Get the log sum exp score\n", 587 | " for i in range(1, emissions.shape[0]):\n", 588 | " scores = emissions[i] + log_sum_exp(\n", 589 | " scores + transitions.T,\n", 590 | " axis=1)\n", 591 | " return log_sum_exp(scores)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 27, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "data": { 601 | "text/plain": [ 602 | "78.132899613126483" 603 | ] 604 | }, 605 | "execution_count": 27, 606 | "metadata": {}, 607 | "output_type": "execute_result" 608 | } 609 | ], 610 | "source": [ 611 | "forward_algorithm(emissions, transitions)" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 28, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "tt = torch.Tensor(emissions)\n", 621 | "tt_max, _ = tt.max(1)" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 29, 627 | "metadata": {}, 628 | "outputs": [ 629 | { 630 | "data": { 631 | "text/plain": [ 632 | "\n", 633 | " 9 9\n", 634 | " 13 13\n", 635 | " 18 18\n", 636 | " 15 15\n", 637 | "[torch.FloatTensor of size 4x2]" 638 | ] 639 | }, 640 | "execution_count": 29, 641 | "metadata": {}, 642 | "output_type": 
"execute_result" 643 | } 644 | ], 645 | "source": [ 646 | "tt_max.expand_as(tt)" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": 30, 652 | "metadata": {}, 653 | "outputs": [ 654 | { 655 | "data": { 656 | "text/plain": [ 657 | "\n", 658 | " 33 49\n", 659 | "[torch.FloatTensor of size 1x2]" 660 | ] 661 | }, 662 | "execution_count": 30, 663 | "metadata": {}, 664 | "output_type": "execute_result" 665 | } 666 | ], 667 | "source": [ 668 | "tt.sum(0)" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 31, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "data": { 678 | "text/plain": [ 679 | "\n", 680 | " 9 6\n", 681 | " 13 10\n", 682 | " 8 18\n", 683 | " 3 15\n", 684 | "[torch.FloatTensor of size 4x2]" 685 | ] 686 | }, 687 | "execution_count": 31, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "tt.squeeze(0)" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": 32, 699 | "metadata": {}, 700 | "outputs": [ 701 | { 702 | "data": { 703 | "text/plain": [ 704 | "\n", 705 | " 9 13 8 3\n", 706 | " 6 10 18 15\n", 707 | "[torch.FloatTensor of size 2x4]" 708 | ] 709 | }, 710 | "execution_count": 32, 711 | "metadata": {}, 712 | "output_type": "execute_result" 713 | } 714 | ], 715 | "source": [ 716 | "tt.transpose(-1,-2)" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 33, 722 | "metadata": {}, 723 | "outputs": [ 724 | { 725 | "data": { 726 | "text/plain": [ 727 | "2" 728 | ] 729 | }, 730 | "execution_count": 33, 731 | "metadata": {}, 732 | "output_type": "execute_result" 733 | } 734 | ], 735 | "source": [ 736 | "tt.ndimension()" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": 34, 742 | "metadata": { 743 | "collapsed": true 744 | }, 745 | "outputs": [], 746 | "source": [ 747 | "def log_sum_exp_torch(vecs, axis=None):\n", 748 | " ## Use help from: 
http://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html#sphx-glr-beginner-nlp-advanced-tutorial-py\n", 749 | " if axis < 0:\n", 750 | " axis = vecs.ndimension()+axis\n", 751 | " max_val, _ = vecs.max(axis)\n", 752 | " vecs = vecs - max_val.expand_as(vecs)\n", 753 | " out_val = torch.log(torch.exp(vecs).sum(axis))\n", 754 | " #print(max_val, out_val)\n", 755 | " return max_val + out_val" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": 35, 761 | "metadata": { 762 | "collapsed": true 763 | }, 764 | "outputs": [], 765 | "source": [ 766 | "def forward_algorithm_torch(emissions, transitions):\n", 767 | " scores = emissions[0]\n", 768 | " # Get the log sum exp score\n", 769 | " transitions = transitions.transpose(-1,-2)\n", 770 | " for i in range(1, emissions.size(0)):\n", 771 | " scores = emissions[i] + log_sum_exp_torch(\n", 772 | " scores.expand_as(transitions) + transitions,\n", 773 | " axis=1)\n", 774 | " return log_sum_exp_torch(scores, axis=-1)" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 36, 780 | "metadata": {}, 781 | "outputs": [ 782 | { 783 | "data": { 784 | "text/plain": [ 785 | "\n", 786 | " 78.1329\n", 787 | "[torch.FloatTensor of size 1]" 788 | ] 789 | }, 790 | "execution_count": 36, 791 | "metadata": {}, 792 | "output_type": "execute_result" 793 | } 794 | ], 795 | "source": [ 796 | "forward_algorithm_torch(torch.Tensor(emissions), torch.Tensor(transitions))" 797 | ] 798 | }, 799 | { 800 | "cell_type": "markdown", 801 | "metadata": {}, 802 | "source": [ 803 | "The core idea is to find the sequence of states $y = \\{y_0, y_1, ..., y_N\\}$ which has the highest probability given the input $X = \\{X_0, X_1, ..., X_N\\}$ as follows:\n", 804 | "\n", 805 | "$$\n", 806 | "\\begin{equation}\n", 807 | "p(y\\mid X) = \\prod_{i=0}^{N}{p(y_i\\mid X_i)p(y_i \\mid y_{i-1})}\\\\\n", 808 | "\\log{p(y\\mid X)} = \\sum_{i=0}^{N}{\\log{p(y_i\\mid X_i)} + \\log{p(y_i \\mid y_{i-1})}}\\\\\n", 809 |
"\\end{equation}\n", 810 | "$$\n", 811 | "\n", 812 | "Now $\\log{p(y_i\\mid X_i)}$ and $\\log{p(y_i \\mid y_{i-1})}$ can be parameterized as follows:\n", 813 | "\n", 814 | "$$\n", 815 | "\\begin{equation}\n", 816 | "\\log{p(y_i\\mid X_i)} = \\sum_{l=0}^{L}{\\sum_{k=0}^{K}{w_{k}^{l}*\\phi_{k}^{l}(X_i, y_i)}}\\\\\n", 817 | "\\log{p(y_i\\mid y_{i-1})} = \\sum_{l=0}^{L}{\\sum_{l'=0}^{L}{w_{l'}^{l}*\\psi_{l'}^{l}(y_i, y_{i-1})}}\\\\\n", 818 | "\\implies \\log{p(y\\mid X)} = \\sum_{i=0}^{N}{(\\sum_{l=0}^{L}{\\sum_{k=0}^{K}{w_{k}^{l}*\\phi_{k}^{l}(X_i, y_i)}}\n", 819 | "+ \\sum_{l=0}^{L}{\\sum_{l'=0}^{L}{w_{l'}^{l}*\\psi_{l'}^{l}(y_i, y_{i-1})}})}\\\\\n", 820 | "\\implies \\log{p(y\\mid X)} = \\sum_{i=0}^{N}{(\\Phi(X_i)W_{emission} + \\log{p(y_{i-1} \\mid X_{i-1})}W_{transition})}\n", 821 | "\\end{equation}\n", 822 | "$$\n", 823 | "\n", 824 | "Where, \n", 825 | "\n", 826 | "* $N$ is the sequence length\n", 827 | "* $K$ is number of feature functions,\n", 828 | "* $L$ is number of states\n", 829 | "* $W_{emission}$ is $K*L$ matrix\n", 830 | "* $W_{transition}$ is $L*L$ matrix\n", 831 | "* $\\Phi(X_i)$ is a feature vector of shape $1*K$\n", 832 | "* $(\\Phi(X_i)W_{emission} + \\log{p(y_{i-1} \\mid X_{i-1})}W_{transition})$ gives the score for each label\n", 833 | "\n" 834 | ] 835 | }, 836 | { 837 | "cell_type": "code", 838 | "execution_count": null, 839 | "metadata": { 840 | "collapsed": true 841 | }, 842 | "outputs": [], 843 | "source": [] 844 | } 845 | ], 846 | "metadata": { 847 | "kernelspec": { 848 | "display_name": "Python [default]", 849 | "language": "python", 850 | "name": "python3" 851 | }, 852 | "language_info": { 853 | "codemirror_mode": { 854 | "name": "ipython", 855 | "version": 3 856 | }, 857 | "file_extension": ".py", 858 | "mimetype": "text/x-python", 859 | "name": "python", 860 | "nbconvert_exporter": "python", 861 | "pygments_lexer": "ipython3", 862 | "version": "3.5.2" 863 | } 864 | }, 865 | "nbformat": 4, 866 | "nbformat_minor": 2 867 | } 868 |
-------------------------------------------------------------------------------- /chunking_bilstm_crf_char_concat.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import matplotlib 7 | matplotlib.use("Agg") 8 | import torch 9 | from torch.autograd import Variable 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | 14 | torch.manual_seed(1) 15 | 16 | import numpy as np 17 | 18 | import matplotlib.pyplot as plt 19 | import seaborn as sns 20 | 21 | from pytorch_utils import * 22 | from pytorch_models import * 23 | from utils import load_sequences, conll_classification_report_to_df 24 | from conlleval import main as conll_eval 25 | import re 26 | 27 | sns.set_context("poster") 28 | sns.set_style("ticks") 29 | 30 | 31 | # In[2]: 32 | 33 | TRAIN_CORPUS="data/conll2000/train.txt" 34 | TEST_CORPUS="data/conll2000/test.txt" 35 | 36 | 37 | # In[3]: 38 | 39 | train_corpus = load_sequences(TRAIN_CORPUS, sep=" ", col_ids=(0, -1)) 40 | train_corpus, dev_corpus = train_corpus[100:], train_corpus[:100] 41 | print("Total items in train corpus: %s" % len(train_corpus)) 42 | print("Total items in dev corpus: %s" % len(dev_corpus)) 43 | test_corpus = load_sequences(TEST_CORPUS, sep=" ", col_ids=(0, -1)) 44 | print("Total items in test corpus: %s" % len(test_corpus)) 45 | 46 | 47 | # In[5]: 48 | 49 | def create_vocab(data, vocabs, char_vocab, word_idx=0): 50 | n_vocabs = len(vocabs) 51 | for sent in data: 52 | for token_tags in sent: 53 | for vocab_id in range(n_vocabs): 54 | vocabs[vocab_id].add(token_tags[vocab_id]) 55 | char_vocab.batch_add(token_tags[word_idx]) 56 | print("Created vocabs: %s, chars[%s]" % (", ".join( 57 | "{}[{}]".format(vocab.name, vocab.size) 58 | for vocab in vocabs 59 | ), char_vocab.size)) 60 | 61 | 62 | # In[6]: 63 | 64 | word_vocab = Vocab("words", UNK="UNK", lower=True) 65 | char_vocab = Vocab("chars", UNK="", lower=False) 
chunk_vocab = Vocab("chunk_tags", lower=False)

create_vocab(train_corpus+dev_corpus+test_corpus, [word_vocab, chunk_vocab], char_vocab)


# In[7]:

def data2tensors(data, vocabs, char_vocab, word_idx=0, column_ids=(0, -1)):
    """Convert tokenised sentences into nested lists of vocabulary indices.

    Args:
        data: iterable of sentences; each sentence is an iterable of token
            tuples (``token_tags``), indexed positionally below.
        vocabs: sequence of vocabularies; ``column_ids`` selects (and orders)
            the ones that are used. The first selected vocabulary is treated
            as the word vocabulary.
        char_vocab: vocabulary used to index the characters of each word.
        word_idx: position of the word inside a token tuple, used for the
            character indices.
        column_ids: indices into ``vocabs``.

    Returns:
        One entry per sentence. Each entry is a list of ``n_vocabs + 1``
        lists: one index list per selected vocabulary (words first), followed
        by a list holding, for every token, the list of its character indices.
    """
    vocabs = [vocabs[idx] for idx in column_ids]
    n_vocabs = len(vocabs)
    tensors = []
    for sent in data:
        # One slot per vocabulary plus a trailing slot for the char indices.
        sent_vecs = [[] for _ in range(n_vocabs + 1)]
        for token_tags in sent:
            # First column is the word; it is looked up lower-cased.
            sent_vecs[0].append(vocabs[0].getidx(token_tags[0].lower()))
            for vocab_id in range(1, n_vocabs):
                sent_vecs[vocab_id].append(
                    vocabs[vocab_id].getidx(token_tags[vocab_id])
                )
            # Characters keep their original case.
            sent_vecs[-1].append(
                [char_vocab.getidx(c) for c in token_tags[word_idx]]
            )
        tensors.append(sent_vecs)
    return tensors


# In[8]:

train_tensors = data2tensors(train_corpus, [word_vocab, chunk_vocab], char_vocab)
dev_tensors = data2tensors(dev_corpus, [word_vocab, chunk_vocab], char_vocab)
test_tensors = data2tensors(test_corpus, [word_vocab, chunk_vocab], char_vocab)
print("Train: {}, Dev: {}, Test: {}".format(
    len(train_tensors),
    len(dev_tensors),
    len(test_tensors),
))


# In[9]:

embedding_file = "/home/napsternxg/datadrive/Downloads/Glove/glove.6B.100d.txt"
cache_file = "conll2000.glove.100.npy"
ndims = 100
pretrained_embeddings = load_word_vectors(embedding_file, ndims, word_vocab, cache_file)


# In[10]:

def plot_losses(train_losses, eval_losses=None, plot_std=False, ax=None):
    """Plot per-epoch mean losses, optionally with a +/- one S.D. band.

    Args:
        train_losses: sequence of ``(mean, std)`` pairs, one per epoch.
        eval_losses: same layout as ``train_losses``; may be ``None`` (the
            default) or empty, in which case no evaluation curve is drawn.
        plot_std: when true, shade ``mean - std`` .. ``mean + std``.
        ax: matplotlib axes to draw on; defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()
    for losses, color, label in zip(
        [train_losses, eval_losses],
        ["0.5", "r"],
        ["Train", "Eval"],
    ):
        # The original unpacked ``zip(*losses)`` unconditionally, so the
        # documented default ``eval_losses=None`` raised a TypeError.
        if losses is None or len(losses) == 0:
            continue
        mean_loss, std_loss = zip(*losses)
        mean_loss = np.array(mean_loss)
        std_loss = np.array(std_loss)
        ax.plot(
            mean_loss, color=color, label=label,
            linestyle="-",
        )
        if plot_std:
            ax.fill_between(
                np.arange(mean_loss.shape[0]),
                mean_loss-std_loss,
                mean_loss+std_loss,
                color=color,
                alpha=0.3
            )
    ax.set_xlabel("Epochs")
    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r"Mean Loss ($\pm$ S.D.)")


def print_predictions(corpus, predictions, filename, label_vocab):
    """Write ``token<TAB>true_label<TAB>predicted_label`` lines to *filename*.

    ``corpus`` and ``predictions`` are iterated in lockstep; every corpus
    token is a ``(token, true_label)`` pair and every prediction is an index
    into ``label_vocab.idx2item``. Sequences are separated by a blank line.
    """
    with open(filename, "w+") as fp:
        for seq, pred in zip(corpus, predictions):
            for (token, true_label), pred_idx in zip(seq, pred):
                pred_label = label_vocab.idx2item[pred_idx]
                print("{}\t{}\t{}".format(token, true_label, pred_label), file=fp)
            print(file=fp)  # Add new line after each sequence


# In[11]:

# ## Class based

# In[19]:

def _to_cuda(tensor):
    """Move *tensor* to the GPU, requesting an asynchronous copy.

    The original code called ``tensor.cuda(async=True)``. ``async`` is a
    reserved word from Python 3.7 on, so that spelling makes the whole module
    a SyntaxError. Try the ``non_blocking`` keyword first and fall back to the
    legacy keyword, passed through ``**`` so this file still compiles.
    """
    try:
        return tensor.cuda(non_blocking=True)
    except TypeError:
        # NOTE(review): assumes PyTorch releases that predate ``non_blocking``
        # reject the unknown keyword with TypeError -- confirm on such a build.
        return tensor.cuda(**{"async": True})


class BiLSTMTaggerWordCRFModel(ModelWrapper):
    """``ModelWrapper`` around a word+char BiLSTM tagger with a CRF layer."""

    def __init__(self, model,
                 loss_function,
                 use_cuda=False):
        """Store *model* and optionally move it to the GPU.

        ``loss_function`` is accepted for interface compatibility but is not
        used: the loss is obtained from ``model.loss`` (see
        ``get_instance_loss``), so the attribute is always set to ``None``.
        """
        self.model = model
        self.loss_function = None

        self.use_cuda = use_cuda
        if self.use_cuda:
            self.model.cuda()

    def _process_instance_tensors(self, instance_tensors, volatile=False):
        """Turn one ``(X, Y, X_char)`` index triple into model inputs.

        Returns ``(X, X_char, Y)``: ``X`` wrapped as a batch of one sequence,
        ``X_char`` converted by ``charseq2varlist`` and ``Y`` as a
        ``LongTensor``.
        """
        X, Y, X_char = instance_tensors
        X = Variable(torch.LongTensor([X]), requires_grad=False, volatile=volatile)
        Y = torch.LongTensor(Y)
        X_char = charseq2varlist(X_char, volatile=volatile)
        return X, X_char, Y

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return ``model.loss`` for one already-processed instance.

        ``instance_tensors`` is unpacked as ``(X, X_char, Y)``, i.e. the
        order returned by ``_process_instance_tensors``.
        """
        if zero_grad:
            ## Clear gradients before every update else memory runs out
            self.model.zero_grad()
        X, X_char, Y = instance_tensors
        if self.use_cuda:
            X = _to_cuda(X)
            Y = _to_cuda(Y)
            X_char = [_to_cuda(t) for t in X_char]
        return self.model.loss(X, X_char, Y)

    def predict(self, instance_tensors):
        """Return element ``[1]`` of the CRF forward pass for one raw instance.

        NOTE(review): element ``[1]`` is presumably the decoded tag sequence
        (``print_predictions`` maps the values through ``idx2item``) -- confirm
        against the CRF implementation.
        """
        # The gold labels are not needed for decoding, so they are neither
        # kept nor transferred to the GPU.
        X, X_char, _ = self._process_instance_tensors(instance_tensors, volatile=True)
        if self.use_cuda:
            X = _to_cuda(X)
            X_char = [_to_cuda(t) for t in X_char]
        emissions = self.model.forward(X, X_char)
        return self.model.crf.forward(emissions)[1]


use_cuda = True
hidden_size = 128
batch_size = 64

char_emb_size = 50
output_channels = 25
kernel_sizes = [2, 3]

word_emb_size = 100
# NOTE(review): presumably word_emb_size + output_channels * len(kernel_sizes)
# (100 + 25 * 2) -- confirm against WordCharEmbedding before changing any of them.
n_embed = 150  # Get this using char embedding and word embed
char_embed_kwargs = dict(
    vocab_size=char_vocab.size,
    embedding_size=char_emb_size,
    out_channels=output_channels,
    kernel_sizes=kernel_sizes
)

word_char_embedding = WordCharEmbedding(
    word_vocab.size, word_emb_size,
    char_embed_kwargs, dropout=0, concat=True)
# Assign glove embeddings
assign_embeddings(word_char_embedding.word_embeddings, pretrained_embeddings, fix_embedding=True)

model_wrapper = BiLSTMTaggerWordCRFModel(
    LSTMTaggerWordCharCRF(word_char_embedding, n_embed, hidden_size, chunk_vocab.size),
    None, use_cuda=use_cuda)


# In[33]:
model_prefix = "BiLSTMCharConcatCRF_CONLL2000"
n_epochs = 50
training_history = training_wrapper(
    model_wrapper, train_tensors,
    eval_tensors=dev_tensors,
    optimizer=optim.Adam,
    optimizer_kwargs={
        #"lr": 0.01,
        "weight_decay": 0
    },
    n_epochs=n_epochs,
    batch_size=batch_size,
    use_cuda=use_cuda,
    log_file="{}.log".format(model_prefix)
)
model_wrapper.save("{}.pth".format(model_prefix))


# In[34]:

fig, ax = plt.subplots(1, 1)
plot_losses(training_history["training_loss"],
            training_history["evaluation_loss"],
            plot_std=True,
            ax=ax)
ax.legend()
sns.despine(offset=5)
259 | plt.savefig("{}.pdf".format(model_prefix)) 260 | 261 | for title, tensors, corpus in zip( 262 | ["train", "dev", "test"], 263 | [train_tensors, dev_tensors, test_tensors], 264 | [train_corpus, dev_corpus, test_corpus], 265 | ): 266 | predictions = model_wrapper.predict_batch(tensors, title=title) 267 | print_predictions(corpus, predictions, "%s.chunking.conll" % title, chunk_vocab) 268 | conll_eval(["conlleval", "%s.chunking.conll" % title]) 269 | 270 | 271 | -------------------------------------------------------------------------------- /conll2000.glove.100.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/conll2000.glove.100.npy -------------------------------------------------------------------------------- /conlleval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ## Original script taken from https://github.com/spyysalo/conlleval.py 4 | ## Modifications made by Shubhanshu Mishra to support notypes argument and functional api 5 | 6 | # Python version of the evaluation script from CoNLL'00- 7 | 8 | # Intentional differences: 9 | # - accept any space as delimiter by default 10 | # - optional file argument (default STDIN) 11 | # - option to set boundary (-b argument) 12 | # - LaTeX output (-l argument) not supported 13 | # - raw tags (-r argument) not supported 14 | 15 | import sys 16 | import re 17 | 18 | from collections import defaultdict, namedtuple 19 | 20 | ANY_SPACE = '' 21 | 22 | class FormatError(Exception): 23 | pass 24 | 25 | Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore') 26 | 27 | class EvalCounts(object): 28 | def __init__(self): 29 | self.correct_chunk = 0 # number of correctly identified chunks 30 | self.correct_tags = 0 # number of correct chunk tags 31 | self.found_correct = 0 # number of chunks in corpus 32 | 
def parse_tag(t):
    """Split a chunk tag such as ``B-NP`` into ``(prefix, type)``.

    The split happens at the first hyphen. A tag without a hyphen is returned
    unchanged as the prefix, paired with an empty type.
    """
    matched = re.match(r'^([^-]*)-(.*)$', t)
    if matched is None:
        return (t, '')
    return matched.groups()
None: 85 | num_features = len(features) 86 | elif num_features != len(features) and len(features) != 0: 87 | raise FormatError('unexpected number of features: %d (%d)' % 88 | (len(features), num_features)) 89 | 90 | if len(features) == 0 or features[0] == options.boundary: 91 | features = ['O', 'O'] 92 | new_sent=True 93 | else: 94 | new_sent=False 95 | if len(features) < 2: 96 | raise FormatError('unexpected number of features in line %s' % line) 97 | 98 | guessed, guessed_type = parse_tag(features.pop()) 99 | correct, correct_type = parse_tag(features.pop()) 100 | if options.no_types: 101 | guessed_type = '' 102 | correct_type = '' 103 | 104 | if new_sent: 105 | guessed = 'O' 106 | 107 | end_correct = end_of_chunk(last_correct, correct, 108 | last_correct_type, correct_type) 109 | end_guessed = end_of_chunk(last_guessed, guessed, 110 | last_guessed_type, guessed_type) 111 | start_correct = start_of_chunk(last_correct, correct, 112 | last_correct_type, correct_type) 113 | start_guessed = start_of_chunk(last_guessed, guessed, 114 | last_guessed_type, guessed_type) 115 | 116 | if in_correct: 117 | if (end_correct and end_guessed and 118 | last_guessed_type == last_correct_type): 119 | in_correct = False 120 | counts.correct_chunk += 1 121 | counts.t_correct_chunk[last_correct_type] += 1 122 | elif (end_correct != end_guessed or guessed_type != correct_type): 123 | in_correct = False 124 | 125 | if start_correct and start_guessed and guessed_type == correct_type: 126 | in_correct = True 127 | 128 | if start_correct: 129 | counts.found_correct += 1 130 | counts.t_found_correct[correct_type] += 1 131 | if start_guessed: 132 | counts.found_guessed += 1 133 | counts.t_found_guessed[guessed_type] += 1 134 | if not new_sent: 135 | if correct == guessed and guessed_type == correct_type: 136 | counts.correct_tags += 1 137 | counts.token_counter += 1 138 | 139 | last_guessed = guessed 140 | last_correct = correct 141 | last_guessed_type = guessed_type 142 | last_correct_type 
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk ended between the previous and the current word.

    Arguments are the previous and current chunk tags followed by the previous
    and current chunk types.
    """
    # 'E' and 'U' always close a chunk; bracket chunks are assumed to have
    # length 1, so they close immediately as well.
    if prev_tag in ('E', 'U', ']', '['):
        return True
    # A chunk that was open ('B' or 'I') ends when the next tag starts a new
    # chunk or falls outside any chunk.
    if prev_tag in ('B', 'I') and tag in ('B', 'U', 'O'):
        return True
    # Otherwise only a change of type closes a chunk, provided the previous
    # word was inside one.
    return prev_tag not in ('O', '.') and prev_type != type_
def log_sum_exp_torch(vecs, axis=None):
    """Compute ``log(sum(exp(vecs)))`` along ``axis`` in a numerically stable way.

    Args:
        vecs: tensor (or Variable) of scores.
        axis: dimension to reduce; negative values count from the last
            dimension. ``None`` reduces over all elements.

    Returns:
        A tensor with ``axis`` removed (a single value when ``axis`` is None).
    """
    ## Use help from: http://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html#sphx-glr-beginner-nlp-advanced-tutorial-py
    if axis is None:
        # Reduce over everything by flattening to one dimension first.
        vecs = vecs.contiguous().view(-1)
        axis = 0
    elif axis < 0:
        axis = vecs.ndimension() + axis
    # Keep the reduced dimension so the maxima broadcast along ``axis``.
    # Without it a 1-D ``max_val`` is aligned with the LAST dimension, which
    # subtracts the wrong maximum whenever ``axis`` is not the last one.
    max_val, _ = vecs.max(axis, keepdim=True)
    shifted = vecs - max_val
    out_val = torch.log(torch.exp(shifted).sum(axis))
    return max_val.squeeze(axis) + out_val
self.use_cuda: 86 | self.model.cuda() 87 | 88 | def get_instance_loss(self, instance_tensors, zero_grad=True): 89 | if zero_grad: 90 | ## Clear gradients before every update else memory runs out 91 | self.zero_grad() 92 | raise NotImplementedError("Please define this function explicitly") 93 | 94 | def predict(self, instance_tensors): 95 | raise NotImplementedError("Please define this function explicitly") 96 | 97 | def predict_batch(self, batch_tensors, title="train"): 98 | self.model.eval() # Set model to eval mode 99 | predictions = [] 100 | for instance_tensors in tqdm(batch_tensors, 101 | desc="%s predict" % title, unit="instance"): 102 | predictions.append(self.predict(instance_tensors)) 103 | return predictions 104 | 105 | 106 | def get_epoch_function(model_wrapper, optimizer, 107 | use_cuda=False): 108 | def perform_epoch(data_tensors, training_mode=True, batch_size=1, pbar=None): 109 | model_wrapper.set_model_mode(training_mode) 110 | step_losses = [] 111 | len_data_tensors = len(data_tensors) 112 | data_tensor_idxs = np.random.permutation(np.arange(len_data_tensors, dtype="int")) 113 | n_splits = data_tensor_idxs.shape[0]//batch_size 114 | title = "train" if training_mode else "eval" 115 | for batch_tensors_idxs in np.array_split(data_tensor_idxs, n_splits): 116 | #from IPython.core.debugger import Tracer; Tracer()() 117 | optimizer.zero_grad() 118 | #loss = Variable(torch.FloatTensor([0.])) 119 | losses = [] 120 | for instance_tensors_idx in batch_tensors_idxs: 121 | instance_tensors = data_tensors[instance_tensors_idx] 122 | loss = model_wrapper.get_instance_loss(instance_tensors, zero_grad=False) 123 | losses.append(loss) 124 | if pbar is not None: 125 | pbar.update(1) 126 | loss = torch.mean(torch.cat(losses)) 127 | #loss = loss/batch_tensors_idxs.shape[0] # Mean loss 128 | step_losses.append(loss.data[0]) 129 | if training_mode: 130 | ## Get gradients of model params wrt. 
def training_wrapper(
    model_wrapper, data_tensors,
    eval_tensors=None,
    optimizer=optim.SGD,
    optimizer_kwargs=None,
    n_epochs=10,
    batch_size=1,
    use_cuda=False,
    log_file="training_output.log",
    early_stopping=None,
    save_best=False,
    save_path="best_model.pth",
    reduce_lr_every=5,
    lr_reduce_factor=0.5
    ):
    """Train the wrapped model and log per-batch losses to ``log_file``.

    Args:
        model_wrapper: object exposing ``model``, ``batch_process_tensors`` and
            ``save``, plus whatever ``get_epoch_function`` requires.
        data_tensors: raw training instances.
        eval_tensors: optional raw evaluation instances, evaluated after every
            training epoch.
        optimizer: optimizer class; it is instantiated here with the model
            parameters that require a gradient.
        optimizer_kwargs: keyword arguments for the optimizer constructor.
        n_epochs: maximum number of epochs.
        batch_size: number of instances per training step.
        use_cuda: forwarded to ``get_epoch_function``.
        log_file: path of the loss log; it is overwritten.
        early_stopping: stop once the absolute change of the mean evaluation
            loss between two consecutive epochs drops below this value
            (checked from the third epoch on). ``None`` disables it.
        save_best: save the model whenever the mean evaluation loss improves.
        save_path: target of the best-model checkpoint; ``None`` disables saving.
        reduce_lr_every: multiply the learning rate by ``lr_reduce_factor``
            every this many epochs; values <= 0 disable the schedule.
        lr_reduce_factor: learning-rate multiplier; values <= 0 disable the
            schedule.

    Returns:
        Dict with ``"training_loss"`` and ``"evaluation_loss"``, each a list of
        ``(mean, std)`` tuples with one entry per completed epoch.

    Raises:
        TypeError: if ``early_stopping`` is neither ``None`` nor a float.
        ValueError: if early stopping or best-model saving is requested
            without ``eval_tensors``; both rely on the evaluation loss.
    """
    if optimizer_kwargs is None:
        optimizer_kwargs = {}
    track_best = save_best and save_path is not None
    # Validate up front instead of failing in the middle of training.
    if early_stopping is not None and not isinstance(early_stopping, float):
        raise TypeError(
            "early_stopping should be either None or float value. Got {}".format(early_stopping))
    if eval_tensors is None and (early_stopping is not None or track_best):
        raise ValueError("early_stopping and save_best require eval_tensors")
    # Filter out parameters which don't require a gradient
    parameters = filter(lambda p: p.requires_grad, model_wrapper.model.parameters())
    optimizer = optimizer(parameters, **optimizer_kwargs)
    # Start training
    losses = []
    eval_losses = []
    ## Convert data tensors to torch tensors
    data_tensors = list(
        tqdm(
            model_wrapper.batch_process_tensors(data_tensors),
            total=len(data_tensors),
            desc="Proc. train tensors",
        )
    )
    if eval_tensors is not None:
        eval_tensors = list(
            tqdm(
                model_wrapper.batch_process_tensors(eval_tensors),
                total=len(eval_tensors),
                desc="Proc. eval tensors",
            )
        )
    perform_epoch = get_epoch_function(
        model_wrapper,
        optimizer,
        use_cuda=use_cuda)
    best_eval_loss = None
    best_epoch = None
    with open(log_file, "w+") as fp:
        with tqdm(total=n_epochs, desc="Epochs", unit="epochs") as epoch_progress_bar:
            for epoch in range(n_epochs):
                with tqdm(
                    total=len(data_tensors),
                    desc="Train", unit="instance", leave=False
                ) as train_progress_bar:
                    step_losses = perform_epoch(data_tensors, batch_size=batch_size, pbar=train_progress_bar)
                mean_loss, std_loss = np.mean(step_losses), np.std(step_losses)
                losses.append((mean_loss, std_loss))
                write_losses(step_losses, fp, title="train", epoch=epoch)
                if eval_tensors is not None:
                    with tqdm(
                        total=len(eval_tensors),
                        desc="Eval", unit="instance", leave=False) as eval_progress_bar:
                        step_losses = perform_epoch(eval_tensors, training_mode=False, pbar=eval_progress_bar)
                    mean_loss, std_loss = np.mean(step_losses), np.std(step_losses)
                    eval_losses.append((mean_loss, std_loss))
                    write_losses(step_losses, fp, title="eval", epoch=epoch)
                epoch_progress_bar.update(1)
                if early_stopping is not None and epoch > 1:
                    eval_loss_diff = np.abs(eval_losses[-2][0] - eval_losses[-1][0])
                    if eval_loss_diff < early_stopping:
                        epoch_progress_bar.write("Evaluation loss stopped decreased less than {}. Early stopping at epoch {}.".format(early_stopping, epoch))
                        break
                if track_best:
                    # Save the best model. The first epoch always counts as an
                    # improvement; no ``continue`` here, so the learning-rate
                    # schedule below also runs for epoch 0.
                    current_eval_loss = eval_losses[-1][0]
                    if best_eval_loss is None or current_eval_loss < best_eval_loss:
                        best_eval_loss = current_eval_loss
                        best_epoch = epoch
                        model_wrapper.save(save_path, verbose=False)
                    if epoch == n_epochs - 1:
                        epoch_progress_bar.write("Best model from {} epoch with {:3f} loss".format(best_epoch, best_eval_loss))

                if reduce_lr_every > 0 and lr_reduce_factor > 0 and ((epoch + 1) % reduce_lr_every) == 0:
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = param_group['lr']*lr_reduce_factor

    return {
        "training_loss": losses,
        "evaluation_loss": eval_losses
    }
class CharEmbedding(nn.Module):
    """Character-level CNN that encodes a batch of character-id sequences.

    Every kernel size gets its own convolution over the embedded characters;
    the feature maps are max-pooled over the sequence and concatenated, so the
    output has ``out_channels * len(kernel_sizes)`` features per sequence.
    """

    def __init__(self, vocab_size, embedding_size,
                 out_channels, kernel_sizes, dropout=0.5):
        super(CharEmbedding, self).__init__()
        self.char_embeddings = nn.Embedding(vocab_size, embedding_size)
        # The convolutions must live in an nn.ModuleList (not a plain list) so
        # that their parameters are registered with this module.
        ## See: https://discuss.pytorch.org/t/list-of-nn-module-in-a-nn-module/219/6
        conv_layers = []
        for width in kernel_sizes:
            conv_layers.append(
                nn.Conv2d(1, out_channels, (width, embedding_size),
                          padding=(width - 1, 0))
            )
        self.convs1 = nn.ModuleList(conv_layers)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        """Return the pooled convolution features for the id tensor ``X``."""
        # Ref: https://github.com/Shawn1993/cnn-text-classification-pytorch/blob/master/model.py
        embedded = self.dropout(self.char_embeddings(X)).unsqueeze(1)  # (N,Ci,W,D)
        pooled = []
        for conv in self.convs1:
            feature_map = F.relu(conv(embedded)).squeeze(3)  # (N,Co,W)
            # Max over the whole sequence axis leaves one value per channel.
            pooled.append(
                F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2)  # (N,Co)
            )
        return self.dropout(torch.cat(pooled, 1))
class WordCharEmbedding_tuple(nn.Module):
    """Word embedding optionally combined with a character-CNN embedding.

    Unlike ``WordCharEmbedding`` the inputs arrive as a single argument: either
    the word-id tensor alone or a ``(word_ids, char_ids)`` tuple.
    """

    def __init__(self,
                 vocab_size, embedding_size,
                 char_embed_kwargs, dropout=0.5,
                 aux_embedding_size=None,
                 concat=False
                ):
        """Build the sub-modules.

        Args:
            vocab_size: number of word types.
            embedding_size: size of the word embedding.
            char_embed_kwargs: keyword arguments for ``CharEmbedding``.
            dropout: dropout probability applied to the final embedding.
            aux_embedding_size: size of an extra word embedding; it is only
                created in concat mode.
            concat: concatenate char and word vectors instead of adding them.
        """
        super(WordCharEmbedding_tuple, self).__init__()
        self.char_embeddings = CharEmbedding(**char_embed_kwargs)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.concat = concat
        if concat and aux_embedding_size is not None:
            ## Only allow aux embedding in concat mode
            self.aux_word_embeddings = nn.Embedding(vocab_size, aux_embedding_size)

    def forward(self, X):
        """Embed ``X``, which is either word ids or ``(word_ids, char_ids)``.

        When only word ids are given (or the char ids are ``None``) the plain
        word embedding is returned after dropout.
        """
        # Default to "no characters" so a bare tensor input no longer fails
        # with an unbound ``X_char``.
        X_char = None
        if isinstance(X, tuple):
            X, X_char = X
        # Ref: https://github.com/Shawn1993/cnn-text-classification-pytorch/blob/master/model.py
        word_vecs = self.word_embeddings(X)
        if X_char is not None:
            # One char-CNN vector per word, joined along the sequence axis.
            char_vecs = torch.cat([
                self.char_embeddings(x).unsqueeze(0)
                for x in X_char
            ], 1)
            if self.concat:
                embedding_list = [char_vecs, word_vecs]
                if hasattr(self, "aux_word_embeddings"):
                    aux_vecs = self.aux_word_embeddings(X)
                    embedding_list.append(aux_vecs)
                word_vecs = torch.cat(embedding_list, 2)
            else:
                word_vecs = char_vecs + word_vecs
        return self.dropout(word_vecs)
self.transitions = nn.Parameter(torch.randn(self.num_labels, self.num_labels)) 431 | 432 | def _forward_alg(self, emissions): 433 | scores = emissions[0] 434 | # Get the log sum exp score 435 | transitions = self.transitions.transpose(-1,-2) 436 | for i in range(1, emissions.size(0)): 437 | scores = emissions[i] + log_sum_exp_torch( 438 | scores.expand_as(transitions) + transitions, 439 | axis=1) 440 | return log_sum_exp_torch(scores, axis=-1) 441 | 442 | def _score_sentence(self, emissions, tags): 443 | score = emissions[0][tags[0]] 444 | if emissions.size()[0] < 2: 445 | return score 446 | for i, emission in enumerate(emissions[1:]): 447 | score = score + self.transitions[tags[i], tags[i+1]] + emission[tags[i+1]] 448 | return score 449 | 450 | def _viterbi_decode(self, emissions): 451 | emissions = emissions.data.cpu() 452 | scores = torch.zeros(emissions.size(1)) 453 | back_pointers = torch.zeros(emissions.size()).int() 454 | scores = scores + emissions[0] 455 | transitions = self.transitions.data.cpu() 456 | # Generate most likely scores and paths for each step in sequence 457 | for i in range(1, emissions.size(0)): 458 | scores_with_transitions = scores.unsqueeze(1).expand_as(transitions) + transitions 459 | max_scores, back_pointers[i] = torch.max(scores_with_transitions, 0) 460 | scores = emissions[i] + max_scores 461 | # Generate the most likely path 462 | viterbi = [scores.numpy().argmax()] 463 | back_pointers = back_pointers.numpy() 464 | for bp in reversed(back_pointers[1:]): 465 | viterbi.append(bp[viterbi[-1]]) 466 | viterbi.reverse() 467 | viterbi_score = scores.numpy().max() 468 | return viterbi_score, viterbi 469 | 470 | def neg_log_likelihood(self, feats, tags): 471 | forward_score = self._forward_alg(feats) 472 | gold_score = self._score_sentence(feats, tags) 473 | return forward_score - gold_score 474 | 475 | def forward(self, feats): 476 | # Find the best path, given the features. 
class LSTMTaggerWordCharCRF(nn.Module):
    """Bidirectional LSTM tagger over word+char embeddings with a CRF on top."""

    def __init__(self, word_char_embedding, embedding_size, hidden_size, output_size):
        super(LSTMTaggerWordCharCRF, self).__init__()
        self.word_embeddings = word_char_embedding
        # Each direction gets half of ``hidden_size`` so that the concatenated
        # LSTM output matches the input size of the linear layer.
        per_direction = hidden_size//2
        self.lstm = nn.LSTM(embedding_size, per_direction, bidirectional=True)
        self.output = nn.Linear(hidden_size, output_size)
        self.crf = CRFLayer(output_size)

    def forward(self, X, X_char):
        """Return the per-token label scores passed on to the CRF."""
        embedded = self.word_embeddings(X, X_char)
        lstm_out, _ = self.lstm(embedded.permute(1, 0, 2))
        # Reshape the LSTM output to one row per token before projecting
        seq_len = X.data.size(1)
        return self.output(lstm_out.view(seq_len, -1))

    def loss(self, X, X_char, Y):
        """Return the CRF negative log-likelihood of the tags ``Y``."""
        emissions = self.forward(X, X_char)
        return self.crf.neg_log_likelihood(emissions, Y)
self.lstm = nn.LSTM(embedding_size, hidden_size//2, bidirectional=True) 525 | self.output = nn.Linear(hidden_size, output_size) 526 | self.crf = CRFLayer(output_size) 527 | 528 | def forward(self, X): 529 | seq_embed = self.input_embedding(X).permute(1, 0, 2) 530 | out, hidden = self.lstm(seq_embed) 531 | # Reshape the output to be a tensor of shape seq_len*label_size 532 | output = self.output(out.view(out.data.size(0), -1)) 533 | return output 534 | 535 | def loss(self, X, Y): 536 | feats = self.forward(X) 537 | return self.crf.neg_log_likelihood(feats, Y) 538 | 539 | -------------------------------------------------------------------------------- /pytorch_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tqdm import tqdm 4 | 5 | import torch 6 | from torch.autograd import Variable 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | from pathlib import Path 12 | 13 | 14 | class Vocab(object): 15 | def __init__(self, name="vocab", 16 | offset_items=tuple([]), 17 | UNK=None, lower=True): 18 | self.name = name 19 | self.item2idx = {} 20 | self.idx2item = [] 21 | self.size = 0 22 | self.UNK = UNK 23 | self.lower=lower 24 | 25 | self.batch_add(offset_items, lower=False) 26 | if UNK is not None: 27 | self.add(UNK, lower=False) 28 | self.UNK_ID = self.item2idx[self.UNK] 29 | self.offset = self.size 30 | 31 | def add(self, item, lower=True): 32 | if self.lower and lower: 33 | item = item.lower() 34 | if item not in self.item2idx: 35 | self.item2idx[item] = self.size 36 | self.size += 1 37 | self.idx2item.append(item) 38 | 39 | def batch_add(self, items, lower=True): 40 | for item in items: 41 | self.add(item, lower=lower) 42 | 43 | def in_vocab(self, item, lower=True): 44 | if self.lower and lower: 45 | item = item.lower() 46 | return item in self.item2idx 47 | 48 | def getidx(self, item, lower=True): 49 | if self.lower and 
def load_word_vectors(vector_file, ndims, vocab, cache_file, override_cache=False):
    """Build (or load from cache) a row-normalised embedding matrix for ``vocab``.

    Args:
        vector_file: text file with one ``word v1 ... v_ndims`` entry per line.
        ndims: expected vector dimensionality; lines of any other width are
            reported and skipped.
        vocab: object exposing ``size``, ``offset`` and ``getidx(word)``.
        cache_file: path of the numpy cache.  It is read and written at
            exactly this path.
        override_cache: when true, ignore an existing cache and rebuild it.

    Returns:
        ``float32`` array of shape ``(vocab.size, ndims)``.  Rows with no
        vector in ``vector_file`` stay all-zero; every other row has unit
        L2 norm.
    """
    cache_file = Path(cache_file)
    if cache_file.is_file() and not override_cache:
        return np.load(cache_file)

    W = np.zeros((vocab.size, ndims), dtype="float32")
    total, found = 0, 0
    with open(vector_file, encoding="utf-8") as fp:
        for i, line in enumerate(fp):
            fields = line.rstrip().split()
            if not fields:
                continue
            total += 1
            # Explicit check instead of ``assert`` so malformed lines are
            # still skipped when Python runs with -O.
            if len(fields) != ndims + 1:
                print("Line[{}] {} vector dims {} doesn't match ndims={}".format(
                    i, fields[0], len(fields) - 1, ndims))
                continue
            idx = vocab.getidx(fields[0])
            # Indices below vocab.offset are the reserved entries (offset
            # items / UNK); they never receive a pretrained vector.
            if idx >= vocab.offset:
                found += 1
                # += so that several surface forms mapping to one index
                # (e.g. after lower-casing) are summed before normalising.
                W[idx, :] += np.array(list(map(float, fields[1:])))

    print("Found {} [{:.2f}%] vectors from {} vectors in {} with ndims={}".format(
        found, found * 100 / max(vocab.size, 1), total, vector_file, ndims))

    norm_W = np.sqrt((W * W).sum(axis=1, keepdims=True))
    # Index the single norm column directly; squeeze() would collapse to a
    # 0-d array for a one-row vocabulary.
    valid_idx = norm_W[:, 0] != 0
    W[valid_idx, :] /= norm_W[valid_idx]

    print("Caching embedding with shape {} to {}".format(W.shape, cache_file.as_posix()))
    # Write through a file handle: np.save(path, ...) appends ".npy" to names
    # lacking it, which would make the is_file() check above miss the cache.
    with cache_file.open("wb") as fp:
        np.save(fp, W)
    return W
def read_glove(filename, ndims=50):
    """Read a GloVe-style text file of ``word v1 v2 ...`` lines.

    Args:
        filename: path of the UTF-8 encoded vector file.
        ndims: accepted for interface compatibility; the vector width is
            taken from the file itself and is not checked against it.

    Returns:
        ``(vocab, char_vocab, W)``: the words in file order, a ``Counter``
        of the characters occurring in those words, and a numpy array with
        one embedding row per word.
    """
    vocab = []
    char_vocab = Counter()
    W = []
    with open(filename, encoding="utf-8") as fp:
        for line in fp:
            fields = line.rstrip().split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue
            word = fields[0]
            vocab.append(word)
            W.append(list(map(float, fields[1:])))
            char_vocab.update(word)
    return vocab, char_vocab, np.array(W)
def classification_report_to_df(report):
    """Parse a text classification report into a DataFrame.

    The first line of ``report`` is treated as the header and replaced by
    the fixed columns ``class, precision, recall, f1-score, support``.
    Every other non-blank line is read from the right: the last token is
    the integer support, up to three numeric tokens before it are the
    metrics, and whatever remains on the left is the row label.  This keeps
    multi-word labels ("macro avg", "weighted avg", class names containing
    spaces) intact.  Rows with fewer than three metrics (such as an
    "accuracy" row) are right-aligned, so the value lands in ``f1-score``
    and the missing metrics are NaN.  The legacy ``avg / total`` label is
    still reported as ``avg/total``.
    """
    columns = ["class", "precision", "recall", "f1-score", "support"]
    n_metrics = len(columns) - 2
    rows = []
    for i, line in enumerate(report.split("\n")):
        if i == 0:
            continue  # header line
        line = line.strip()
        if not line:
            continue
        line = line.replace("avg / total", "avg/total")
        tokens = re.split(r"\s+", line)
        support = int(tokens[-1])

        # Collect numeric metric tokens right-to-left, always leaving at
        # least one token (index 0) for the label.
        metrics = []
        pos = len(tokens) - 2
        while pos >= 1 and len(metrics) < n_metrics:
            try:
                value = float(tokens[pos])
            except ValueError:
                break
            metrics.insert(0, value)
            pos -= 1

        label = " ".join(tokens[:pos + 1])
        padding = [float("nan")] * (n_metrics - len(metrics))
        rows.append(tuple([label] + padding + metrics + [support]))
    return pd.DataFrame(rows, columns=columns)
def get_labels(y_arr, max_len=None):
    """Stack label sequences into an int array with a trailing unit axis.

    Args:
        y_arr: iterable of equal-length label sequences; an entry may be
            ``None``, in which case it is replaced by a row of zeros.
        max_len: length of the zero row used for ``None`` entries.  When
            omitted it is taken from the first non-``None`` entry (0 if
            there is none).

    Returns:
        Integer numpy array of shape ``(len(y_arr), seq_len, 1)``.
    """
    y_arr = list(y_arr)
    if max_len is None:
        # Previously this name was never defined, so any None entry raised
        # NameError; infer the row length from the data instead.
        max_len = next((len(y) for y in y_arr if y is not None), 0)
    rows = [np.zeros(max_len) if y is None else y for y in y_arr]
    return np.expand_dims(np.array(rows, dtype="int"), -1)
sklearn.metrics import classification_report, accuracy_score 163 | y_true, y_pred = [], [] 164 | with open(filename, "w+") as fp: 165 | for seq, pred in zip(tagged_seq, predictions[label_id]): 166 | for tag, label in zip(seq, pred): 167 | true_label = tag[task_id+1] 168 | print(u"%s\t%s\t%s" % (tag[0], true_label, label), file=fp) 169 | y_true.append(true_label) 170 | y_pred.append(label) 171 | print(u"", file=fp) 172 | 173 | report = classification_report(y_true, y_pred) 174 | print(report) 175 | print("Accuracy: %s" % accuracy_score(y_true, y_pred)) 176 | return classification_report_to_df(report) 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /wnut_bilstm_crf_char_concat.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import matplotlib 7 | matplotlib.use("Agg") 8 | import torch 9 | from torch.autograd import Variable 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | 14 | torch.manual_seed(1) 15 | 16 | import numpy as np 17 | 18 | import matplotlib.pyplot as plt 19 | import seaborn as sns 20 | 21 | from pytorch_utils import * 22 | from pytorch_models import * 23 | from utils import load_sequences, conll_classification_report_to_df 24 | from conlleval import main as conll_eval 25 | import re 26 | 27 | sns.set_context("poster") 28 | sns.set_style("ticks") 29 | 30 | 31 | # In[2]: 32 | 33 | TRAIN_CORPUS="data/WNUT_NER/train.tsv" 34 | DEV_CORPUS="data/WNUT_NER/dev.tsv" 35 | TEST_CORPUS="data/WNUT_NER/test.tsv" 36 | 37 | 38 | # In[3]: 39 | 40 | train_corpus = load_sequences(TRAIN_CORPUS, sep="\t", col_ids=(0, -1)) 41 | print("Total items in train corpus: %s" % len(train_corpus)) 42 | dev_corpus = load_sequences(DEV_CORPUS, sep="\t", col_ids=(0, -1)) 43 | print("Total items in dev corpus: %s" % len(dev_corpus)) 44 | test_corpus = load_sequences(TEST_CORPUS, sep="\t", 
# Character classes used to build the orthographic "shape" of a token.
CAP_LETTERS = re.compile(r'[A-Z]')
SMALL_LETTERS = re.compile(r'[a-z]')
NUMBERS = re.compile(r'[0-9]')
PUNCT = re.compile(r'[\.,\"\'!\?;:]')
OTHERS = re.compile(r'[^A-Za-z0-9\.,\"\'!\?;:]')


def get_ortho_feature(word):
    """Map ``word`` to its orthographic shape, one symbol per character.

    Upper-case ASCII letters become ``A``, lower-case ones ``a``, digits
    ``0``, the punctuation in ``PUNCT`` becomes ``.`` and every remaining
    character becomes ``%``.
    """
    # Order matters: OTHERS runs last and leaves the replacement symbols
    # produced by the earlier passes untouched.
    substitutions = (
        (CAP_LETTERS, "A"),
        (SMALL_LETTERS, "a"),
        (NUMBERS, "0"),
        (PUNCT, "."),
        (OTHERS, "%"),
    )
    for pattern, symbol in substitutions:
        word = pattern.sub(symbol, word)
    return word
ortho_word and ortho_char 99 | char_vecs = [] 100 | for token_tags in sent: 101 | vocab_id = 0 # First column is the word 102 | ortho_word = get_ortho_feature(token_tags[vocab_id]) 103 | # lowercase the word 104 | sent_vecs[vocab_id].append( 105 | vocabs[vocab_id].getidx(token_tags[vocab_id].lower()) 106 | ) 107 | for vocab_id in range(1, n_vocabs): 108 | sent_vecs[vocab_id].append( 109 | vocabs[vocab_id].getidx(token_tags[vocab_id]) 110 | ) 111 | sent_vecs[-3].append( 112 | [char_vocab.getidx(c) for c in token_tags[word_idx]] 113 | ) 114 | sent_vecs[-2].append( 115 | ortho_word_vocab.getidx(ortho_word) 116 | ) 117 | sent_vecs[-1].append( 118 | [ortho_char_vocab.getidx(c) for c in ortho_word] 119 | ) 120 | tensors.append(sent_vecs) 121 | return tensors 122 | 123 | 124 | # In[8]: 125 | 126 | train_tensors = data2tensors(train_corpus, [word_vocab, ner_vocab], char_vocab, ortho_word_vocab, ortho_char_vocab) 127 | dev_tensors = data2tensors(dev_corpus, [word_vocab, ner_vocab], char_vocab, ortho_word_vocab, ortho_char_vocab) 128 | test_tensors = data2tensors(test_corpus, [word_vocab, ner_vocab], char_vocab, ortho_word_vocab, ortho_char_vocab) 129 | print("Train: ({}, {}), Dev: ({}, {}), Test: ({}, {})".format( 130 | len(train_tensors), len(train_tensors[0]), 131 | len(dev_tensors), len(dev_tensors[0]), 132 | len(test_tensors), len(test_tensors[0]) 133 | )) 134 | 135 | 136 | # In[9]: 137 | 138 | embedding_file="data/WNUT_NER/wnut_vecs.txt" 139 | cache_file="wnut_ner.twitter.400.npy" 140 | ndims=400 141 | pretrained_embeddings = load_word_vectors(embedding_file, ndims, word_vocab, cache_file) 142 | 143 | 144 | # In[10]: 145 | 146 | def plot_losses(train_losses, eval_losses=None, plot_std=False, ax=None): 147 | if ax is None: 148 | ax = plt.gca() 149 | for losses, color, label in zip( 150 | [train_losses, eval_losses], 151 | ["0.5", "r"], 152 | ["Train", "Eval"], 153 | ): 154 | mean_loss, std_loss = zip(*losses) 155 | mean_loss = np.array(mean_loss) 156 | std_loss = 
class BiLSTMTaggerWordCRFModel(ModelWrapper):
    """Training/prediction wrapper for a word+char (+ortho) BiLSTM-CRF tagger.

    The wrapped ``model`` is expected to expose ``loss(inputs, Y)``,
    ``forward(inputs)`` and a ``crf`` sub-module, where ``inputs`` is
    ``[(X, X_char), (X_ortho, X_char_ortho)]``.
    """

    def __init__(self, model,
                 loss_function,
                 use_cuda=False, grad_max_norm=5):
        self.model = model
        # ``loss_function`` is accepted for signature compatibility only;
        # the loss comes from ``model.loss``.
        self.loss_function = None
        self.grad_max_norm = grad_max_norm

        self.use_cuda = use_cuda
        if self.use_cuda:
            self.model.cuda()

    def post_backward(self):
        """Clip the gradient norm of all model parameters in place."""
        # clip_grad_norm_ is the in-place successor of the deprecated
        # clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_max_norm)

    def _to_device(self, tensors):
        """Return the five instance tensors, moved to the GPU if enabled."""
        if not self.use_cuda:
            return tensors
        X, X_char, X_ortho, X_char_ortho, Y = tensors
        # ``async`` became a reserved word in Python 3.7; ``non_blocking``
        # is the replacement keyword for Tensor.cuda().
        return (
            X.cuda(non_blocking=True),
            [t.cuda(non_blocking=True) for t in X_char],
            X_ortho.cuda(non_blocking=True),
            [t.cuda(non_blocking=True) for t in X_char_ortho],
            Y.cuda(non_blocking=True),
        )

    def _process_instance_tensors(self, instance_tensors, volatile=False):
        """Convert one encoded sentence into model-ready tensors.

        ``instance_tensors`` is unpacked as
        ``(X, Y, X_char, X_ortho, X_char_ortho)``; the result is reordered
        to ``(X, X_char, X_ortho, X_char_ortho, Y)``.  ``volatile`` is only
        forwarded to ``charseq2varlist``.
        """
        X, Y, X_char, X_ortho, X_char_ortho = instance_tensors
        # Wrap in a list to add a leading axis of size 1.
        X = torch.LongTensor([X])
        X_char = charseq2varlist(X_char, volatile=volatile)
        X_ortho = torch.LongTensor([X_ortho])
        X_char_ortho = charseq2varlist(X_char_ortho, volatile=volatile)
        Y = torch.LongTensor(Y)
        return X, X_char, X_ortho, X_char_ortho, Y

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return the model loss for one already-processed instance."""
        if zero_grad:
            # Clear gradients before every update, else memory runs out.
            self.model.zero_grad()
        X, X_char, X_ortho, X_char_ortho, Y = self._to_device(instance_tensors)
        return self.model.loss([(X, X_char), (X_ortho, X_char_ortho)], Y)

    def predict(self, instance_tensors):
        """Return the decoded tag sequence for one raw encoded instance."""
        # The ``volatile`` flag no longer disables autograd, so inference
        # is wrapped in no_grad() explicitly.
        with torch.no_grad():
            tensors = self._process_instance_tensors(instance_tensors, volatile=True)
            X, X_char, X_ortho, X_char_ortho, _ = self._to_device(tensors)
            emissions = self.model.forward([(X, X_char), (X_ortho, X_char_ortho)])
            return self.model.crf.forward(emissions)[1]
ortho_word_vocab.size, ortho_word_emb_size, 279 | ortho_char_embed_kwargs, dropout=0.5, concat=True) 280 | 281 | 282 | concat_embeddings = ConcatInputs([word_char_embedding, ortho_word_char_embedding]) 283 | 284 | # Assign glove embeddings 285 | assign_embeddings(word_char_embedding.word_embeddings, pretrained_embeddings, fix_embedding=True) 286 | 287 | n_embed=main_total_emb_dims + ortho_total_emb_dims # Get this using char embedding and word embed and ortho embeddings 288 | model_wrapper = BiLSTMTaggerWordCRFModel( 289 | BiLSTMTaggerWordCharCRF(concat_embeddings, n_embed, hidden_size, ner_vocab.size), 290 | None, use_cuda=use_cuda, grad_max_norm=5) 291 | 292 | 293 | # In[33]: 294 | model_prefix="BiLSTMCharConcatCRF_WNUT_NER_ortho" 295 | n_epochs=50 296 | 297 | load_model = True 298 | 299 | if load_model: 300 | model_wrapper.load("{}.pth".format(model_prefix)) 301 | print("Loaded model from {}.pth".format(model_prefix)) 302 | 303 | training_history = training_wrapper( 304 | model_wrapper, train_tensors, 305 | eval_tensors=dev_tensors, 306 | optimizer=optim.Adam, 307 | optimizer_kwargs={ 308 | "lr": 0.1, 309 | "weight_decay": 1e-2 310 | }, 311 | n_epochs=n_epochs, 312 | batch_size=batch_size, 313 | use_cuda=use_cuda, 314 | log_file="{}.log".format(model_prefix), 315 | #early_stopping=0.001, 316 | save_best=True, 317 | save_path="{}.pth".format(model_prefix) 318 | ) 319 | #model_wrapper.save("{}.pth".format(model_prefix)) 320 | model_wrapper.load("{}.pth".format(model_prefix)) 321 | 322 | # In[34]: 323 | 324 | fig, ax = plt.subplots(1,1) 325 | plot_losses(training_history["training_loss"], 326 | training_history["evaluation_loss"], 327 | plot_std=True, 328 | ax=ax) 329 | ax.legend() 330 | sns.despine(offset=5) 331 | plt.savefig("{}.pdf".format(model_prefix)) 332 | 333 | for title, tensors, corpus in zip( 334 | ["train", "dev", "test"], 335 | [train_tensors, dev_tensors, test_tensors], 336 | [train_corpus, dev_corpus, test_corpus], 337 | ): 338 | predictions = 
model_wrapper.predict_batch(tensors, title=title) 339 | print_predictions(corpus, predictions, "%s.wnut.conll" % title, ner_vocab) 340 | conll_eval(["conlleval", "%s.wnut.conll" % title]) 341 | 342 | 343 | --------------------------------------------------------------------------------