├── .gitignore
├── Bi LSTM CRF.ipynb
├── BiLSTMTaggerWordCRFModel_CONLL2000
├── BiLSTMTaggerWordCRFModel_CONLL2000.pdf
├── Elastic weight consolidation-Uncertainity-v1.6.ipynb
├── Elastic weight consolidation-Uncertainity.ipynb
├── Elastic weight consolidation-old.ipynb
├── Elastic weight consolidation.ipynb
├── Getting started.ipynb
├── IID_EWC_losses.pdf
├── IID_EWC_predictions.pdf
├── Iterated Dilated convolution.ipynb
├── LICENSE
├── PyTorch CONLL 2000 Chunking.ipynb
├── PyTorch CONLL 2000 Chunking.py
├── PyTorch RNN.ipynb
├── Pytorch - MMD VAE.ipynb
├── Pytorch Active Learning.ipynb
├── Pytorch Gradient reversal.ipynb
├── Pytorch RNN sequence tagging.ipynb
├── Pytorch Uncertainity-animated.ipynb
├── Pytorch Uncertainity-yaringal.ipynb
├── Pytorch Uncertainity.ipynb
├── Pytorch example.ipynb
├── README.md
├── Scratchpad.ipynb
├── Seq_EWC_losses.pdf
├── Seq_EWC_predictions.pdf
├── Viterbi decoding and CRF.ipynb
├── chunking_bilstm_crf_char_concat.py
├── conll2000.glove.100.npy
├── conlleval.py
├── data
    └── conll2000
    │   └── get_data.sh
├── pytorch_models.py
├── pytorch_utils.py
├── utils.py
└── wnut_bilstm_crf_char_concat.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | # Data folders
92 | tmp/
93 | 


--------------------------------------------------------------------------------
/Bi LSTM CRF.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [
 10 |     {
 11 |      "data": {
 12 |       "text/plain": [
 13 |        "<torch._C.Generator at 0x7f83d40f3558>"
 14 |       ]
 15 |      },
 16 |      "execution_count": 1,
 17 |      "metadata": {},
 18 |      "output_type": "execute_result"
 19 |     }
 20 |    ],
 21 |    "source": [
 22 |     "import torch\n",
 23 |     "import torch.autograd as autograd\n",
 24 |     "import torch.nn as nn\n",
 25 |     "import torch.nn.functional as F\n",
 26 |     "import torch.optim as optim\n",
 27 |     "\n",
 28 |     "torch.manual_seed(1)"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 4,
 34 |    "metadata": {
 35 |     "collapsed": true
 36 |    },
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "def prepare_sequence(seq, to_ix):\n",
 40 |     "    idxs = [to_ix[w] for w in seq]\n",
 41 |     "    tensor = torch.LongTensor(idxs)\n",
 42 |     "    return autograd.Variable(tensor)"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 5,
 48 |    "metadata": {
 49 |     "collapsed": true
 50 |    },
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "# Helper functions to make the code more readable.\n",
 54 |     "def to_scalar(var):\n",
 55 |     "    # returns the first element as a python number (float or int)\n",
 56 |     "    return var.view(-1).data.tolist()[0]\n",
 57 |     "\n",
 58 |     "\n",
 59 |     "def argmax(vec):\n",
 60 |     "    # return the argmax as a python int\n",
 61 |     "    _, idx = torch.max(vec, 1)\n",
 62 |     "    return to_scalar(idx)\n",
 63 |     "\n",
 64 |     "# Compute log sum exp in a numerically stable way for the forward algorithm\n",
 65 |     "\n",
 66 |     "\n",
 67 |     "def log_sum_exp(vec):\n",
 68 |     "    max_score = vec[0, argmax(vec)]\n",
 69 |     "    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])\n",
 70 |     "    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))\n",
 71 |     "\n",
 72 |     "\n",
 73 |     "class BiLSTM_CRF(nn.Module):\n",
 74 |     "\n",
 75 |     "    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):\n",
 76 |     "        super(BiLSTM_CRF, self).__init__()\n",
 77 |     "        self.embedding_dim = embedding_dim\n",
 78 |     "        self.hidden_dim = hidden_dim\n",
 79 |     "        self.vocab_size = vocab_size\n",
 80 |     "        self.tag_to_ix = tag_to_ix\n",
 81 |     "        self.tagset_size = len(tag_to_ix)\n",
 82 |     "\n",
 83 |     "        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)\n",
 84 |     "        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,\n",
 85 |     "                            num_layers=1, bidirectional=True)\n",
 86 |     "\n",
 87 |     "        # Maps the output of the LSTM into tag space.\n",
 88 |     "        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)\n",
 89 |     "\n",
 90 |     "        # Matrix of transition parameters.  Entry i,j is the score of\n",
 91 |     "        # transitioning *to* i *from* j.\n",
 92 |     "        self.transitions = nn.Parameter(\n",
 93 |     "            torch.randn(self.tagset_size, self.tagset_size))\n",
 94 |     "\n",
 95 |     "        self.hidden = self.init_hidden()\n",
 96 |     "\n",
 97 |     "    def init_hidden(self):\n",
 98 |     "        return (autograd.Variable(torch.randn(2, 1, self.hidden_dim)),\n",
 99 |     "                autograd.Variable(torch.randn(2, 1, self.hidden_dim)))\n",
100 |     "\n",
101 |     "    def _forward_alg(self, feats):\n",
102 |     "        # Do the forward algorithm to compute the partition function\n",
103 |     "        init_alphas = torch.Tensor(1, self.tagset_size).fill_(-10000.)\n",
104 |     "        # START_TAG has all of the score.\n",
105 |     "        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.\n",
106 |     "\n",
107 |     "        # Wrap in a variable so that we will get automatic backprop\n",
108 |     "        forward_var = autograd.Variable(init_alphas)\n",
109 |     "\n",
110 |     "        # Iterate through the sentence\n",
111 |     "        for feat in feats:\n",
112 |     "            alphas_t = []  # The forward variables at this timestep\n",
113 |     "            for next_tag in range(self.tagset_size):\n",
114 |     "                # broadcast the emission score: it is the same regardless of\n",
115 |     "                # the previous tag\n",
116 |     "                emit_score = feat[next_tag].view(\n",
117 |     "                    1, -1).expand(1, self.tagset_size)\n",
118 |     "                # the ith entry of trans_score is the score of transitioning to\n",
119 |     "                # next_tag from i\n",
120 |     "                trans_score = self.transitions[next_tag].view(1, -1)\n",
121 |     "                # The ith entry of next_tag_var is the value for the edge (i -> next_tag)\n",
122 |     "                # before we do log-sum-exp\n",
123 |     "                next_tag_var = forward_var + trans_score + emit_score\n",
124 |     "                # The forward variable for this tag is log-sum-exp of all the\n",
125 |     "                # scores.\n",
126 |     "                alphas_t.append(log_sum_exp(next_tag_var))\n",
127 |     "            forward_var = torch.cat(alphas_t).view(1, -1)\n",
128 |     "        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]\n",
129 |     "        alpha = log_sum_exp(terminal_var)\n",
130 |     "        return alpha\n",
131 |     "\n",
132 |     "    def _get_lstm_features(self, sentence):\n",
133 |     "        self.hidden = self.init_hidden()\n",
134 |     "        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)\n",
135 |     "        lstm_out, self.hidden = self.lstm(embeds)\n",
136 |     "        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)\n",
137 |     "        lstm_feats = self.hidden2tag(lstm_out)\n",
138 |     "        return lstm_feats\n",
139 |     "\n",
140 |     "    def _score_sentence(self, feats, tags):\n",
141 |     "        # Gives the score of a provided tag sequence\n",
142 |     "        score = autograd.Variable(torch.Tensor([0]))\n",
143 |     "        tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])\n",
144 |     "        for i, feat in enumerate(feats):\n",
145 |     "            score = score + \\\n",
146 |     "                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]\n",
147 |     "        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]\n",
148 |     "        return score\n",
149 |     "\n",
150 |     "    def _viterbi_decode(self, feats):\n",
151 |     "        backpointers = []\n",
152 |     "\n",
153 |     "        # Initialize the viterbi variables in log space\n",
154 |     "        init_vvars = torch.Tensor(1, self.tagset_size).fill_(-10000.)\n",
155 |     "        init_vvars[0][self.tag_to_ix[START_TAG]] = 0\n",
156 |     "\n",
157 |     "        # forward_var at step i holds the viterbi variables for step i-1\n",
158 |     "        forward_var = autograd.Variable(init_vvars)\n",
159 |     "        for feat in feats:\n",
160 |     "            bptrs_t = []  # holds the backpointers for this step\n",
161 |     "            viterbivars_t = []  # holds the viterbi variables for this step\n",
162 |     "\n",
163 |     "            for next_tag in range(self.tagset_size):\n",
164 |     "                # next_tag_var[i] holds the viterbi variable for tag i at the previous step,\n",
165 |     "                # plus the score of transitioning from tag i to next_tag.\n",
166 |     "                # We don't include the emission scores here because the max\n",
167 |     "                # does not depend on them (we add them in below)\n",
168 |     "                next_tag_var = forward_var + self.transitions[next_tag]\n",
169 |     "                best_tag_id = argmax(next_tag_var)\n",
170 |     "                bptrs_t.append(best_tag_id)\n",
171 |     "                viterbivars_t.append(next_tag_var[0][best_tag_id])\n",
172 |     "            # Now add in the emission scores, and assign forward_var to the set\n",
173 |     "            # of viterbi variables we just computed\n",
174 |     "            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)\n",
175 |     "            backpointers.append(bptrs_t)\n",
176 |     "\n",
177 |     "        # Transition to STOP_TAG\n",
178 |     "        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]\n",
179 |     "        best_tag_id = argmax(terminal_var)\n",
180 |     "        path_score = terminal_var[0][best_tag_id]\n",
181 |     "\n",
182 |     "        # Follow the back pointers to decode the best path.\n",
183 |     "        best_path = [best_tag_id]\n",
184 |     "        for bptrs_t in reversed(backpointers):\n",
185 |     "            best_tag_id = bptrs_t[best_tag_id]\n",
186 |     "            best_path.append(best_tag_id)\n",
187 |     "        # Pop off the start tag (we don't want to return that to the caller)\n",
188 |     "        start = best_path.pop()\n",
189 |     "        assert start == self.tag_to_ix[START_TAG]  # Sanity check\n",
190 |     "        best_path.reverse()\n",
191 |     "        return path_score, best_path\n",
192 |     "\n",
193 |     "    def neg_log_likelihood(self, sentence, tags):\n",
194 |     "        self.hidden = self.init_hidden()\n",
195 |     "        feats = self._get_lstm_features(sentence)\n",
196 |     "        forward_score = self._forward_alg(feats)\n",
197 |     "        gold_score = self._score_sentence(feats, tags)\n",
198 |     "        return forward_score - gold_score\n",
199 |     "\n",
200 |     "    def forward(self, sentence):  # don't confuse this with _forward_alg above.\n",
201 |     "        self.hidden = self.init_hidden()\n",
202 |     "        # Get the emission scores from the BiLSTM\n",
203 |     "        lstm_feats = self._get_lstm_features(sentence)\n",
204 |     "\n",
205 |     "        # Find the best path, given the features.\n",
206 |     "        score, tag_seq = self._viterbi_decode(lstm_feats)\n",
207 |     "        return score, tag_seq"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "raw",
212 |    "metadata": {
213 |     "hide_egal": false,
214 |     "is_egal": true
215 |    },
216 |    "source": [
217 |     "<svg height='250'></svg>"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "raw",
222 |    "metadata": {
223 |     "is_egal": true
224 |    },
225 |    "source": [
226 |     "<svg height='250'></svg>"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "raw",
231 |    "metadata": {
232 |     "is_egal": true
233 |    },
234 |    "source": [
235 |     "<svg height='250'></svg>"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 6,
241 |    "metadata": {
242 |     "collapsed": false
243 |    },
244 |    "outputs": [
245 |     {
246 |      "name": "stdout",
247 |      "output_type": "stream",
248 |      "text": [
249 |       "(Variable containing:\n",
250 |       " 9.7379\n",
251 |       "[torch.FloatTensor of size 1]\n",
252 |       ", [2, 3, 1, 2, 3, 1, 2, 3, 2, 3, 1])\n",
253 |       "(Variable containing:\n",
254 |       " 39.4279\n",
255 |       "[torch.FloatTensor of size 1]\n",
256 |       ", [0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2])\n"
257 |      ]
258 |     }
259 |    ],
260 |    "source": [
261 |     "START_TAG = \"<START>\"\n",
262 |     "STOP_TAG = \"<STOP>\"\n",
263 |     "EMBEDDING_DIM = 5\n",
264 |     "HIDDEN_DIM = 4\n",
265 |     "\n",
266 |     "# Make up some training data\n",
267 |     "training_data = [(\n",
268 |     "    \"the wall street journal reported today that apple corporation made money\".split(),\n",
269 |     "    \"B I I I O O O B I O O\".split()\n",
270 |     "), (\n",
271 |     "    \"georgia tech is a university in georgia\".split(),\n",
272 |     "    \"B I O O O O B\".split()\n",
273 |     ")]\n",
274 |     "\n",
275 |     "word_to_ix = {}\n",
276 |     "for sentence, tags in training_data:\n",
277 |     "    for word in sentence:\n",
278 |     "        if word not in word_to_ix:\n",
279 |     "            word_to_ix[word] = len(word_to_ix)\n",
280 |     "\n",
281 |     "tag_to_ix = {\"B\": 0, \"I\": 1, \"O\": 2, START_TAG: 3, STOP_TAG: 4}\n",
282 |     "\n",
283 |     "model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)\n",
284 |     "optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)\n",
285 |     "\n",
286 |     "# Check predictions before training\n",
287 |     "precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)\n",
288 |     "precheck_tags = torch.LongTensor([tag_to_ix[t] for t in training_data[0][1]])\n",
289 |     "print(model(precheck_sent))\n",
290 |     "\n",
291 |     "# Make sure prepare_sequence from earlier in the LSTM section is loaded\n",
292 |     "for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data\n",
293 |     "    for sentence, tags in training_data:\n",
294 |     "        # Step 1. Remember that PyTorch accumulates gradients.  We need to clear them out\n",
295 |     "        # before each instance\n",
296 |     "        model.zero_grad()\n",
297 |     "\n",
298 |     "        # Step 2. Get our inputs ready for the network, that is, turn them into Variables\n",
299 |     "        # of word indices.\n",
300 |     "        sentence_in = prepare_sequence(sentence, word_to_ix)\n",
301 |     "        targets = torch.LongTensor([tag_to_ix[t] for t in tags])\n",
302 |     "\n",
303 |     "        # Step 3. Run our forward pass.\n",
304 |     "        neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)\n",
305 |     "\n",
306 |     "        # Step 4. Compute the loss, gradients, and update the parameters by calling\n",
307 |     "        # optimizer.step()\n",
308 |     "        neg_log_likelihood.backward()\n",
309 |     "        optimizer.step()\n",
310 |     "\n",
311 |     "# Check predictions after training\n",
312 |     "precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)\n",
313 |     "print(model(precheck_sent))\n",
314 |     "# We got it!"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": null,
320 |    "metadata": {
321 |     "collapsed": true
322 |    },
323 |    "outputs": [],
324 |    "source": []
325 |   }
326 |  ],
327 |  "metadata": {
328 |   "kernelspec": {
329 |    "display_name": "Python [conda root]",
330 |    "language": "python",
331 |    "name": "conda-root-py"
332 |   },
333 |   "language_info": {
334 |    "codemirror_mode": {
335 |     "name": "ipython",
336 |     "version": 3
337 |    },
338 |    "file_extension": ".py",
339 |    "mimetype": "text/x-python",
340 |    "name": "python",
341 |    "nbconvert_exporter": "python",
342 |    "pygments_lexer": "ipython3",
343 |    "version": "3.5.2"
344 |   }
345 |  },
346 |  "nbformat": 4,
347 |  "nbformat_minor": 2
348 | }
349 | 


--------------------------------------------------------------------------------
/BiLSTMTaggerWordCRFModel_CONLL2000:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/BiLSTMTaggerWordCRFModel_CONLL2000


--------------------------------------------------------------------------------
/BiLSTMTaggerWordCRFModel_CONLL2000.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/BiLSTMTaggerWordCRFModel_CONLL2000.pdf


--------------------------------------------------------------------------------
/IID_EWC_losses.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/IID_EWC_losses.pdf


--------------------------------------------------------------------------------
/IID_EWC_predictions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/IID_EWC_predictions.pdf


--------------------------------------------------------------------------------
/Iterated Dilated convolution.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 1,
   6 |    "metadata": {
   7 |     "collapsed": true
   8 |    },
   9 |    "outputs": [],
  10 |    "source": [
  11 |     "import numpy as np\n",
  12 |     "\n",
  13 |     "import torch\n",
  14 |     "from torch.autograd import Variable\n",
  15 |     "from torch.utils.data import Dataset, DataLoader\n",
  16 |     "\n",
  17 |     "import unittest"
  18 |    ]
  19 |   },
  20 |   {
  21 |    "cell_type": "code",
  22 |    "execution_count": 2,
  23 |    "metadata": {
  24 |     "collapsed": true
  25 |    },
  26 |    "outputs": [],
  27 |    "source": [
  28 |     "def runTests(test_class):\n",
  29 |     "    unittest.TextTestRunner().run(\n",
  30 |     "        unittest.TestLoader().loadTestsFromModule(\n",
  31 |     "            test_class()\n",
  32 |     "        )\n",
  33 |     "    )"
  34 |    ]
  35 |   },
  36 |   {
  37 |    "cell_type": "code",
  38 |    "execution_count": 3,
  39 |    "metadata": {
  40 |     "collapsed": true
  41 |    },
  42 |    "outputs": [],
  43 |    "source": [
  44 |     "VOCAB = {\n",
  45 |     "    \"__pad__\": 0,\n",
  46 |     "    \"__bos__\": 1,\n",
  47 |     "    \"__eos__\": 2,\n",
  48 |     "    \"__unk__\": 3,\n",
  49 |     "    \"dog\": 4,\n",
  50 |     "    \"cat\": 5,\n",
  51 |     "    \"puppy\": 6\n",
  52 |     "}\n",
  53 |     "\n",
  54 |     "CHAR_VOCAB = {\n",
  55 |     "    \"__c_pad__\": 0,\n",
  56 |     "    \"__bot__\": 1,\n",
  57 |     "    \"__eot__\": 2,\n",
  58 |     "    \"__c_unk__\": 3,\n",
  59 |     "    \"__pad__\": 4,\n",
  60 |     "    \"__bos__\": 5,\n",
  61 |     "    \"__eos__\": 6,\n",
  62 |     "    \"a\": 7,\n",
  63 |     "    \"c\": 8,\n",
  64 |     "    \"d\": 9,\n",
  65 |     "    \"g\": 10,\n",
  66 |     "    \"o\": 11,\n",
  67 |     "    \"p\": 12,\n",
  68 |     "    \"t\": 13,\n",
  69 |     "    \"u\": 14,\n",
  70 |     "    \"y\": 15\n",
  71 |     "}\n",
  72 |     "\n",
  73 |     "TAG_VOCAB = {\n",
  74 |     "    \"__pad__\": 0,\n",
  75 |     "    \"__bos__\": 1,\n",
  76 |     "    \"__eos__\": 2,\n",
  77 |     "    \"animal_class\": 3,\n",
  78 |     "    \"offspring\": 4\n",
  79 |     "}\n",
  80 |     "\n",
  81 |     "maxlen=10\n",
  82 |     "max_tokenlen=15\n",
  83 |     "\n",
  84 |     "def seq2idx(items, vocab, begin=\"__bos__\", end=\"__eos__\"):\n",
  85 |     "    seq = (\n",
  86 |     "        tuple([vocab[begin]]) \n",
  87 |     "        + tuple([\n",
  88 |     "            vocab[item]\n",
  89 |     "            for item in items\n",
  90 |     "        ]) \n",
  91 |     "        + tuple([vocab[end]]))\n",
  92 |     "    #print(seq)\n",
  93 |     "    return seq\n",
  94 |     "    \n",
  95 |     "def padded_seq(seq, maxlen, pad_value):\n",
  96 |     "    seqlen = min(maxlen, len(seq))\n",
  97 |     "    seq = tuple(seq[:seqlen]) + tuple([pad_value]*(maxlen - seqlen))\n",
  98 |     "    return seq, seqlen\n",
  99 |     "\n",
 100 |     "def get_chars_seq(sentence, char_vocab):\n",
 101 |     "    char_seq = tuple([[\"__bos__\"]]) + tuple([\n",
 102 |     "        tuple(w) for w in sentence\n",
 103 |     "    ]) + tuple([[\"__eos__\"]])\n",
 104 |     "    char_seq = tuple([\n",
 105 |     "        padded_seq(\n",
 106 |     "            seq2idx(\n",
 107 |     "                chars,\n",
 108 |     "                char_vocab,\n",
 109 |     "                begin=\"__bot__\",\n",
 110 |     "                end=\"__eot__\"\n",
 111 |     "            ),\n",
 112 |     "            max_tokenlen,\n",
 113 |     "            char_vocab[\"__c_pad__\"]\n",
 114 |     "        )[0]\n",
 115 |     "        for chars in char_seq\n",
 116 |     "    ])\n",
 117 |     "    padded_char_value = padded_seq(\n",
 118 |     "        seq2idx(\n",
 119 |     "            [\"__pad__\"],\n",
 120 |     "            char_vocab,\n",
 121 |     "            begin=\"__bot__\",\n",
 122 |     "            end=\"__eot__\"\n",
 123 |     "        ),\n",
 124 |     "        max_tokenlen,\n",
 125 |     "        char_vocab[\"__c_pad__\"]\n",
 126 |     "    )[0]\n",
 127 |     "    \n",
 128 |     "    return char_seq, padded_char_value\n",
 129 |     "    \n",
 130 |     "\n",
 131 |     "def transform(sentence_tags_item, vocab, char_vocab, tag_vocab):\n",
 132 |     "    sentence, tags = sentence_tags_item\n",
 133 |     "    word_tensor, word_len = padded_seq(\n",
 134 |     "        seq2idx(sentence, VOCAB),\n",
 135 |     "        maxlen,\n",
 136 |     "        vocab[\"__pad__\"]\n",
 137 |     "    )\n",
 138 |     "    tag_tensor, tags_len = padded_seq(\n",
 139 |     "        seq2idx(tags, TAG_VOCAB),\n",
 140 |     "        maxlen,\n",
 141 |     "        tag_vocab[\"__pad__\"]\n",
 142 |     "    )\n",
 143 |     "    assert word_len == tags_len, (\n",
 144 |     "        \"Mismatch between padded word seq [{}]\"\n",
 145 |     "        \" and padded tag seq [{}]\"\n",
 146 |     "    ).format(word_len, tags_len)\n",
 147 |     "    \n",
 148 |     "    \n",
 149 |     "    char_seq, padded_char_value = get_chars_seq(sentence, char_vocab)\n",
 150 |     "    char_tensor, char_word_len = padded_seq(char_seq, maxlen, padded_char_value)\n",
 151 |     "    assert word_len == char_word_len, (\n",
 152 |     "        \"Mismatch between padded word seq [{}]\"\n",
 153 |     "        \" and padded char based seq [{}]\"\n",
 154 |     "    ).format(word_len, char_word_len)\n",
 155 |     "    \n",
 156 |     "    seq_len = word_len\n",
 157 |     "    \n",
 158 |     "    return word_tensor, char_tensor, tag_tensor, seq_len\n",
 159 |     "    \n",
 160 |     "    "
 161 |    ]
 162 |   },
 163 |   {
 164 |    "cell_type": "code",
 165 |    "execution_count": 4,
 166 |    "metadata": {},
 167 |    "outputs": [
 168 |     {
 169 |      "data": {
 170 |       "text/plain": [
 171 |        "((10,), (10, 15), (10,), 6)"
 172 |       ]
 173 |      },
 174 |      "execution_count": 4,
 175 |      "metadata": {},
 176 |      "output_type": "execute_result"
 177 |     }
 178 |    ],
 179 |    "source": [
 180 |     "char_seq, padded_char_value = get_chars_seq([\"dog\", \"cat\", \"dog\", \"puppy\"], CHAR_VOCAB)\n",
 181 |     "char_tensor, char_word_len = padded_seq(char_seq, maxlen, padded_char_value)\n",
 182 |     "\n",
 183 |     "\n",
 184 |     "word_tensor, char_tensor, tag_tensor, seq_len = transform((\n",
 185 |     "    [\"dog\", \"cat\", \"dog\", \"puppy\"],\n",
 186 |     "    [\"animal_class\", \"animal_class\", \"animal_class\", \"offspring\"]\n",
 187 |     "), VOCAB, CHAR_VOCAB, TAG_VOCAB)\n",
 188 |     "\n",
 189 |     "np.array(word_tensor).shape, np.array(char_tensor).shape, np.array(tag_tensor).shape, seq_len"
 190 |    ]
 191 |   },
 192 |   {
 193 |    "cell_type": "code",
 194 |    "execution_count": 5,
 195 |    "metadata": {},
 196 |    "outputs": [],
 197 |    "source": [
 198 |     "class TestTransforms(unittest.TestCase):\n",
 199 |     "    def test_seq2idx(self):\n",
 200 |     "        self.assertEqual(\n",
 201 |     "            seq2idx([\"dog\", \"cat\", \"dog\", \"puppy\"], VOCAB),\n",
 202 |     "            (1, 4, 5, 4, 6, 2)\n",
 203 |     "        )\n",
 204 |     "    \n",
 205 |     "    def test_padded_seq(self):\n",
 206 |     "        self.assertEqual(\n",
 207 |     "            padded_seq(\n",
 208 |     "                seq2idx(\n",
 209 |     "                    [\"dog\", \"cat\", \"dog\", \"puppy\"],\n",
 210 |     "                    VOCAB\n",
 211 |     "                ),\n",
 212 |     "                maxlen,\n",
 213 |     "                VOCAB[\"__pad__\"]\n",
 214 |     "            ),\n",
 215 |     "            ((1, 4, 5, 4, 6, 2, 0, 0, 0, 0), 6)\n",
 216 |     "        )\n",
 217 |     "        \n",
 218 |     "    def test_padded_char_seq(self):\n",
 219 |     "        char_seq, padded_char_value = get_chars_seq([\"dog\", \"cat\", \"dog\", \"puppy\"], CHAR_VOCAB)\n",
 220 |     "        char_tensor, char_word_len = padded_seq(char_seq, maxlen, padded_char_value)\n",
 221 |     "        self.assertEqual(\n",
 222 |     "            np.array(char_tensor).shape,\n",
 223 |     "            (maxlen, max_tokenlen)\n",
 224 |     "        )\n",
 225 |     "        \n",
 226 |     "        \n",
 227 |     "    def test_transform(self):\n",
 228 |     "        word_tensor, char_tensor, tag_tensor, seq_len = transform(\n",
 229 |     "            (\n",
 230 |     "                [\"dog\", \"cat\", \"dog\", \"puppy\"],\n",
 231 |     "                [\"animal_class\", \"animal_class\", \"animal_class\", \"offspring\"]\n",
 232 |     "            ),\n",
 233 |     "            VOCAB,\n",
 234 |     "            CHAR_VOCAB,\n",
 235 |     "            TAG_VOCAB\n",
 236 |     "        )\n",
 237 |     "\n",
 238 |     "        self.assertEqual(\n",
 239 |     "            (\n",
 240 |     "                np.array(word_tensor).shape,\n",
 241 |     "                np.array(char_tensor).shape,\n",
 242 |     "                np.array(tag_tensor).shape,\n",
 243 |     "                seq_len\n",
 244 |     "            ), ((10,), (10, 15), (10,), 6)\n",
 245 |     "        )\n",
 246 |     "        \n",
 247 |     "    \n"
 248 |    ]
 249 |   },
 250 |   {
 251 |    "cell_type": "code",
 252 |    "execution_count": 6,
 253 |    "metadata": {},
 254 |    "outputs": [
 255 |     {
 256 |      "name": "stderr",
 257 |      "output_type": "stream",
 258 |      "text": [
 259 |       "....\n",
 260 |       "----------------------------------------------------------------------\n",
 261 |       "Ran 4 tests in 0.004s\n",
 262 |       "\n",
 263 |       "OK\n"
 264 |      ]
 265 |     }
 266 |    ],
 267 |    "source": [
 268 |     "runTests(TestTransforms)"
 269 |    ]
 270 |   },
 271 |   {
 272 |    "cell_type": "markdown",
 273 |    "metadata": {},
 274 |    "source": [
 275 |     "## Make dataset"
 276 |    ]
 277 |   },
 278 |   {
 279 |    "cell_type": "code",
 280 |    "execution_count": 7,
 281 |    "metadata": {
 282 |     "collapsed": true
 283 |    },
 284 |    "outputs": [],
 285 |    "source": [
 286 |     "class SentenceDataset(Dataset):\n",
 287 |     "    def __init__(\n",
 288 |     "        self,\n",
 289 |     "        sentence_tags_items,\n",
 290 |     "        transform,\n",
 291 |     "        vocab,\n",
 292 |     "        char_vocab,\n",
 293 |     "        tag_vocab\n",
 294 |     "    ):\n",
 295 |     "        self.sentence_tags_items = sentence_tags_items\n",
 296 |     "        self.transform = transform\n",
 297 |     "        self.vocab = vocab\n",
 298 |     "        self.char_vocab = char_vocab\n",
 299 |     "        self.tag_vocab = tag_vocab\n",
 300 |     "        \n",
 301 |     "    def __getitem__(self, idx):\n",
 302 |     "        word_tensor, char_tensor, tag_tensor, seq_len = self.transform(\n",
 303 |     "            self.sentence_tags_items[idx],\n",
 304 |     "            self.vocab,\n",
 305 |     "            self.char_vocab,\n",
 306 |     "            self.tag_vocab\n",
 307 |     "        )\n",
 308 |     "        \n",
 309 |     "        word_tensor = torch.from_numpy(np.asarray(word_tensor))#.view(-1, 1)\n",
 310 |     "        char_tensor = torch.from_numpy(np.asarray(char_tensor))\n",
 311 |     "        tag_tensor = torch.from_numpy(np.asarray(tag_tensor))#.view(-1, 1)\n",
 312 |     "        seq_len = torch.from_numpy(np.asarray([seq_len]))\n",
 313 |     "        \n",
 314 |     "        return word_tensor, char_tensor, tag_tensor, seq_len\n",
 315 |     "    \n",
 316 |     "    def __len__(self):\n",
 317 |     "        return len(self.sentence_tags_items)"
 318 |    ]
 319 |   },
 320 |   {
 321 |    "cell_type": "code",
 322 |    "execution_count": 8,
 323 |    "metadata": {
 324 |     "collapsed": true
 325 |    },
 326 |    "outputs": [],
 327 |    "source": [
 328 |     "sentence_tag_items = [\n",
 329 |     "    (\n",
 330 |     "            [\"dog\", \"cat\", \"dog\", \"puppy\"],\n",
 331 |     "            [\"animal_class\", \"animal_class\", \"animal_class\", \"offspring\"]\n",
 332 |     "    ),\n",
 333 |     "    (\n",
 334 |     "            [\"dog\", \"cat\", \"cat\", \"puppy\"],\n",
 335 |     "            [\"animal_class\", \"animal_class\", \"animal_class\", \"offspring\"]\n",
 336 |     "    ),\n",
 337 |     "    (\n",
 338 |     "            [\"dog\", \"puppy\", \"dog\", \"puppy\"],\n",
 339 |     "            [\"animal_class\", \"offspring\", \"animal_class\", \"offspring\"]\n",
 340 |     "    ),\n",
 341 |     "    \n",
 342 |     "]"
 343 |    ]
 344 |   },
 345 |   {
 346 |    "cell_type": "code",
 347 |    "execution_count": 9,
 348 |    "metadata": {
 349 |     "collapsed": true
 350 |    },
 351 |    "outputs": [],
 352 |    "source": [
 353 |     "sent_dataset = SentenceDataset(\n",
 354 |     "    sentence_tag_items,\n",
 355 |     "    transform,\n",
 356 |     "    VOCAB,\n",
 357 |     "    CHAR_VOCAB,\n",
 358 |     "    TAG_VOCAB\n",
 359 |     ")\n",
 360 |     "train_loader = DataLoader(sent_dataset, batch_size=10, shuffle=True, num_workers=1)"
 361 |    ]
 362 |   },
 363 |   {
 364 |    "cell_type": "code",
 365 |    "execution_count": 10,
 366 |    "metadata": {},
 367 |    "outputs": [
 368 |     {
 369 |      "data": {
 370 |       "text/plain": [
 371 |        "(torch.Size([3, 10]),\n",
 372 |        " torch.Size([3, 10, 15]),\n",
 373 |        " torch.Size([3, 10]),\n",
 374 |        " torch.Size([3, 1]))"
 375 |       ]
 376 |      },
 377 |      "execution_count": 10,
 378 |      "metadata": {},
 379 |      "output_type": "execute_result"
 380 |     }
 381 |    ],
 382 |    "source": [
 383 |     "word_tensors, char_tensors, tag_tensors, seq_len = next(iter(train_loader))\n",
 384 |     "word_tensors.size(), char_tensors.size(), tag_tensors.size(), seq_len.size()"
 385 |    ]
 386 |   },
 387 |   {
 388 |    "cell_type": "code",
 389 |    "execution_count": 11,
 390 |    "metadata": {},
 391 |    "outputs": [
 392 |     {
 393 |      "data": {
 394 |       "text/plain": [
 395 |        "torch.Size([3, 1])"
 396 |       ]
 397 |      },
 398 |      "execution_count": 11,
 399 |      "metadata": {},
 400 |      "output_type": "execute_result"
 401 |     }
 402 |    ],
 403 |    "source": [
 404 |     "seq_len.size()"
 405 |    ]
 406 |   },
 407 |   {
 408 |    "cell_type": "markdown",
 409 |    "metadata": {},
 410 |    "source": [
 411 |     "## Char CNN model"
 412 |    ]
 413 |   },
 414 |   {
 415 |    "cell_type": "code",
 416 |    "execution_count": 12,
 417 |    "metadata": {
 418 |     "collapsed": true
 419 |    },
 420 |    "outputs": [],
 421 |    "source": [
 422 |     "conv1d = torch.nn.Conv1d(5, 10, 1, dilation=2)"
 423 |    ]
 424 |   },
 425 |   {
 426 |    "cell_type": "code",
 427 |    "execution_count": 13,
 428 |    "metadata": {},
 429 |    "outputs": [
 430 |     {
 431 |      "data": {
 432 |       "text/plain": [
 433 |        "torch.Size([2, 5, 4])"
 434 |       ]
 435 |      },
 436 |      "execution_count": 13,
 437 |      "metadata": {},
 438 |      "output_type": "execute_result"
 439 |     }
 440 |    ],
 441 |    "source": [
 442 |     "torch.rand(2,5,4).size()"
 443 |    ]
 444 |   },
 445 |   {
 446 |    "cell_type": "code",
 447 |    "execution_count": 14,
 448 |    "metadata": {},
 449 |    "outputs": [
 450 |     {
 451 |      "data": {
 452 |       "text/plain": [
 453 |        "torch.Size([2, 10, 4])"
 454 |       ]
 455 |      },
 456 |      "execution_count": 14,
 457 |      "metadata": {},
 458 |      "output_type": "execute_result"
 459 |     }
 460 |    ],
 461 |    "source": [
 462 |     "conv1d(Variable(torch.rand(2,5,4), requires_grad=False)).size()"
 463 |    ]
 464 |   },
 465 |   {
 466 |    "cell_type": "code",
 467 |    "execution_count": 15,
 468 |    "metadata": {
 469 |     "collapsed": true
 470 |    },
 471 |    "outputs": [],
 472 |    "source": [
 473 |     "emb = torch.nn.Embedding(10, 5)"
 474 |    ]
 475 |   },
 476 |   {
 477 |    "cell_type": "code",
 478 |    "execution_count": 16,
 479 |    "metadata": {},
 480 |    "outputs": [
 481 |     {
 482 |      "data": {
 483 |       "text/plain": [
 484 |        "torch.Size([2, 4, 5])"
 485 |       ]
 486 |      },
 487 |      "execution_count": 16,
 488 |      "metadata": {},
 489 |      "output_type": "execute_result"
 490 |     }
 491 |    ],
 492 |    "source": [
 493 |     "embeddings = emb(Variable(torch.LongTensor([[1,2,4,5],[4,3,2,9]]), requires_grad=False))\n",
 494 |     "embeddings.size()"
 495 |    ]
 496 |   },
 497 |   {
 498 |    "cell_type": "code",
 499 |    "execution_count": 17,
 500 |    "metadata": {},
 501 |    "outputs": [
 502 |     {
 503 |      "data": {
 504 |       "text/plain": [
 505 |        "torch.Size([2, 5, 4])"
 506 |       ]
 507 |      },
 508 |      "execution_count": 17,
 509 |      "metadata": {},
 510 |      "output_type": "execute_result"
 511 |     }
 512 |    ],
 513 |    "source": [
 514 |     "embeddings.permute(0, 2, 1).size()"
 515 |    ]
 516 |   },
 517 |   {
 518 |    "cell_type": "code",
 519 |    "execution_count": 18,
 520 |    "metadata": {},
 521 |    "outputs": [
 522 |     {
 523 |      "data": {
 524 |       "text/plain": [
 525 |        "Variable containing:\n",
 526 |        "(0 ,.,.) = \n",
 527 |        "  0.3917  0.8784  0.5268  0.4315\n",
 528 |        " -0.1406  0.2500  1.4438  0.0828\n",
 529 |        "  0.1396 -0.2760 -0.3761  0.1704\n",
 530 |        " -0.3965 -0.4440  0.2955 -0.3060\n",
 531 |        "  0.2451 -0.4238  0.3279  0.2239\n",
 532 |        " -0.5347 -1.1390  1.0406 -0.3362\n",
 533 |        "  0.0030 -0.7008  0.5324  0.1248\n",
 534 |        " -0.1148  0.7700 -0.3185 -0.1458\n",
 535 |        " -0.3496 -0.2052 -0.5736 -0.2478\n",
 536 |        " -0.1141  0.1016 -0.8129 -0.2597\n",
 537 |        "\n",
 538 |        "(1 ,.,.) = \n",
 539 |        "  0.5268  0.8617  0.8784  0.6424\n",
 540 |        "  1.4438 -0.5622  0.2500  1.4265\n",
 541 |        " -0.3761 -0.3985 -0.2760 -0.2698\n",
 542 |        "  0.2955 -0.7914 -0.4440  0.2630\n",
 543 |        "  0.3279  1.0187 -0.4238 -0.3041\n",
 544 |        "  1.0406 -0.9597 -1.1390  0.2380\n",
 545 |        "  0.5324  0.4073 -0.7008 -0.1318\n",
 546 |        " -0.3185  0.5722  0.7700  0.3617\n",
 547 |        " -0.5736 -0.5782 -0.2052  0.4115\n",
 548 |        " -0.8129 -0.2299  0.1016 -0.6984\n",
 549 |        "[torch.FloatTensor of size 2x10x4]"
 550 |       ]
 551 |      },
 552 |      "execution_count": 18,
 553 |      "metadata": {},
 554 |      "output_type": "execute_result"
 555 |     }
 556 |    ],
 557 |    "source": [
 558 |     "conv1d(embeddings.permute(0, 2, 1))"
 559 |    ]
 560 |   },
 561 |   {
 562 |    "cell_type": "code",
 563 |    "execution_count": 19,
 564 |    "metadata": {},
 565 |    "outputs": [
 566 |     {
 567 |      "data": {
 568 |       "text/plain": [
 569 |        "torch.Size([2, 10])"
 570 |       ]
 571 |      },
 572 |      "execution_count": 19,
 573 |      "metadata": {},
 574 |      "output_type": "execute_result"
 575 |     }
 576 |    ],
 577 |    "source": [
 578 |     "conv1d(embeddings.permute(0, 2, 1)).max(2)[1].size()\n"
 579 |    ]
 580 |   },
 581 |   {
 582 |    "cell_type": "code",
 583 |    "execution_count": 20,
 584 |    "metadata": {},
 585 |    "outputs": [
 586 |     {
 587 |      "data": {
 588 |       "text/plain": [
 589 |        "torch.Size([2, 1, 4, 5])"
 590 |       ]
 591 |      },
 592 |      "execution_count": 20,
 593 |      "metadata": {},
 594 |      "output_type": "execute_result"
 595 |     }
 596 |    ],
 597 |    "source": [
 598 |     "embeddings.unsqueeze(1).size()"
 599 |    ]
 600 |   },
 601 |   {
 602 |    "cell_type": "code",
 603 |    "execution_count": 21,
 604 |    "metadata": {},
 605 |    "outputs": [
 606 |     {
 607 |      "data": {
 608 |       "text/plain": [
 609 |        "torch.Size([3, 10, 15])"
 610 |       ]
 611 |      },
 612 |      "execution_count": 21,
 613 |      "metadata": {},
 614 |      "output_type": "execute_result"
 615 |     }
 616 |    ],
 617 |    "source": [
 618 |     "char_tensors.size()"
 619 |    ]
 620 |   },
 621 |   {
 622 |    "cell_type": "code",
 623 |    "execution_count": 22,
 624 |    "metadata": {},
 625 |    "outputs": [
 626 |     {
 627 |      "data": {
 628 |       "text/plain": [
 629 |        "torch.Size([3, 10, 15])"
 630 |       ]
 631 |      },
 632 |      "execution_count": 22,
 633 |      "metadata": {},
 634 |      "output_type": "execute_result"
 635 |     }
 636 |    ],
 637 |    "source": [
 638 |     "char_tensors.view(-1, 15).view(3, 10, -1).shape"
 639 |    ]
 640 |   },
 641 |   {
 642 |    "cell_type": "code",
 643 |    "execution_count": 23,
 644 |    "metadata": {
 645 |     "collapsed": true
 646 |    },
 647 |    "outputs": [],
 648 |    "source": [
 649 |     "class CharCNN(torch.nn.Module):\n",
 650 |     "    def __init__(self):\n",
 651 |     "        super(CharCNN, self).__init__()\n",
 652 |     "        self.char_embedding=4\n",
 653 |     "        self.char_conv_features=5\n",
 654 |     "        self.char_conv_kernel=1\n",
 655 |     "        \n",
 656 |     "        self.char_emb = torch.nn.Embedding(\n",
 657 |     "            len(CHAR_VOCAB),\n",
 658 |     "            self.char_embedding\n",
 659 |     "        )\n",
 660 |     "        \n",
 661 |     "        self.char_conv1d = torch.nn.Conv1d(\n",
 662 |     "            self.char_embedding,\n",
 663 |     "            self.char_conv_features,\n",
 664 |     "            self.char_conv_kernel\n",
 665 |     "        )\n",
 666 |     "        \n",
 667 |     "        self.output_size = self.char_conv_features\n",
 668 |     "        \n",
 669 |     "    def forward(self, char_tensors):\n",
 670 |     "        batch_size, seqlen, char_seqlen = char_tensors.size()\n",
 671 |     "        char_tensors = char_tensors.view(-1, char_seqlen)\n",
 672 |     "        char_tensors = self.char_emb(char_tensors)\n",
 673 |     "        char_tensors = char_tensors.permute(0, 2, 1)\n",
 674 |     "        char_tensors = self.char_conv1d(char_tensors)\n",
 675 |     "        char_tensors = char_tensors.max(2)[0] # Get the global max\n",
 676 |     "        char_tensors = char_tensors.view(batch_size, seqlen, -1)\n",
 677 |     "        return char_tensors"
 678 |    ]
 679 |   },
 680 |   {
 681 |    "cell_type": "code",
 682 |    "execution_count": 24,
 683 |    "metadata": {},
 684 |    "outputs": [
 685 |     {
 686 |      "data": {
 687 |       "text/plain": [
 688 |        "torch.Size([3, 10, 15])"
 689 |       ]
 690 |      },
 691 |      "execution_count": 24,
 692 |      "metadata": {},
 693 |      "output_type": "execute_result"
 694 |     }
 695 |    ],
 696 |    "source": [
 697 |     "char_tensors.shape"
 698 |    ]
 699 |   },
 700 |   {
 701 |    "cell_type": "code",
 702 |    "execution_count": 25,
 703 |    "metadata": {},
 704 |    "outputs": [],
 705 |    "source": [
 706 |     "char_model = CharCNN()"
 707 |    ]
 708 |   },
 709 |   {
 710 |    "cell_type": "code",
 711 |    "execution_count": 26,
 712 |    "metadata": {},
 713 |    "outputs": [
 714 |     {
 715 |      "data": {
 716 |       "text/plain": [
 717 |        "torch.Size([3, 10, 15])"
 718 |       ]
 719 |      },
 720 |      "execution_count": 26,
 721 |      "metadata": {},
 722 |      "output_type": "execute_result"
 723 |     }
 724 |    ],
 725 |    "source": [
 726 |     "char_tensors.size()"
 727 |    ]
 728 |   },
 729 |   {
 730 |    "cell_type": "code",
 731 |    "execution_count": 27,
 732 |    "metadata": {},
 733 |    "outputs": [
 734 |     {
 735 |      "data": {
 736 |       "text/plain": [
 737 |        "torch.Size([3, 10, 5])"
 738 |       ]
 739 |      },
 740 |      "execution_count": 27,
 741 |      "metadata": {},
 742 |      "output_type": "execute_result"
 743 |     }
 744 |    ],
 745 |    "source": [
 746 |     "char_model(Variable(char_tensors, requires_grad=False)).size()"
 747 |    ]
 748 |   },
 749 |   {
 750 |    "cell_type": "code",
 751 |    "execution_count": 28,
 752 |    "metadata": {},
 753 |    "outputs": [
 754 |     {
 755 |      "data": {
 756 |       "text/plain": [
 757 |        "torch.Size([3, 10, 30])"
 758 |       ]
 759 |      },
 760 |      "execution_count": 28,
 761 |      "metadata": {},
 762 |      "output_type": "execute_result"
 763 |     }
 764 |    ],
 765 |    "source": [
 766 |     "torch.cat((char_tensors, char_tensors), -1).size()"
 767 |    ]
 768 |   },
 769 |   {
 770 |    "cell_type": "code",
 771 |    "execution_count": 29,
 772 |    "metadata": {},
 773 |    "outputs": [
 774 |     {
 775 |      "data": {
 776 |       "text/plain": [
 777 |        "(Variable containing:\n",
 778 |        "  0.8364 -0.1794  2.4606  0.3041 -0.3007\n",
 779 |        "  2.0133  1.1859  0.9896  1.6575  1.4240\n",
 780 |        " -0.3331  1.1859  2.4606  1.6575  1.4240\n",
 781 |        "  0.7453  0.0274  0.7354  0.1239  1.8854\n",
 782 |        " [torch.FloatTensor of size 4x5], Variable containing:\n",
 783 |        "  0  0  1  0  1\n",
 784 |        "  1  0  1  0  0\n",
 785 |        "  1  1  0  1  1\n",
 786 |        "  0  1  1  0  1\n",
 787 |        " [torch.LongTensor of size 4x5])"
 788 |       ]
 789 |      },
 790 |      "execution_count": 29,
 791 |      "metadata": {},
 792 |      "output_type": "execute_result"
 793 |     }
 794 |    ],
 795 |    "source": [
 796 |     "embeddings.max(0)"
 797 |    ]
 798 |   },
 799 |   {
 800 |    "cell_type": "markdown",
 801 |    "metadata": {},
 802 |    "source": [
 803 |     "## Word model"
 804 |    ]
 805 |   },
 806 |   {
 807 |    "cell_type": "code",
 808 |    "execution_count": 30,
 809 |    "metadata": {
 810 |     "collapsed": true
 811 |    },
 812 |    "outputs": [],
 813 |    "source": [
 814 |     "class WordEmbeddings(torch.nn.Module):\n",
 815 |     "    def __init__(\n",
 816 |     "        self,\n",
 817 |     "        char_model,\n",
 818 |     "    ):\n",
 819 |     "        super(WordEmbeddings, self).__init__()\n",
 820 |     "        self.char_model = char_model\n",
 821 |     "        self.word_embedding = 10\n",
 822 |     "        self.word_emb = torch.nn.Embedding(\n",
 823 |     "            len(VOCAB),\n",
 824 |     "            self.word_embedding\n",
 825 |     "        )\n",
 826 |     "        \n",
 827 |     "        self.output_size = (\n",
 828 |     "            self.word_embedding\n",
 829 |     "            + self.char_model.output_size\n",
 830 |     "        )\n",
 831 |     "        \n",
 832 |     "        \n",
 833 |     "    def forward(self, word_tensors, char_tensors):\n",
 834 |     "        char_based_embs = self.char_model(char_tensors)\n",
 835 |     "        #print(char_based_embs.size(), type(char_based_embs.data))\n",
 836 |     "        word_embs = self.word_emb(word_tensors)\n",
 837 |     "        #print(word_embs.size(), type(word_embs.data))\n",
 838 |     "        word_embs = torch.cat(\n",
 839 |     "            [word_embs, char_based_embs],\n",
 840 |     "            -1\n",
 841 |     "        ) # Concat word and char based embeddings\n",
 842 |     "        return word_embs\n",
 843 |     "        \n",
 844 |     "        "
 845 |    ]
 846 |   },
 847 |   {
 848 |    "cell_type": "code",
 849 |    "execution_count": 31,
 850 |    "metadata": {
 851 |     "collapsed": true
 852 |    },
 853 |    "outputs": [],
 854 |    "source": [
 855 |     "word_model = WordEmbeddings(char_model)"
 856 |    ]
 857 |   },
 858 |   {
 859 |    "cell_type": "code",
 860 |    "execution_count": 32,
 861 |    "metadata": {},
 862 |    "outputs": [
 863 |     {
 864 |      "data": {
 865 |       "text/plain": [
 866 |        "(torch.Size([3, 10]), torch.Size([3, 10, 15]))"
 867 |       ]
 868 |      },
 869 |      "execution_count": 32,
 870 |      "metadata": {},
 871 |      "output_type": "execute_result"
 872 |     }
 873 |    ],
 874 |    "source": [
 875 |     "word_tensors.size(), char_tensors.size()"
 876 |    ]
 877 |   },
 878 |   {
 879 |    "cell_type": "code",
 880 |    "execution_count": 33,
 881 |    "metadata": {},
 882 |    "outputs": [
 883 |     {
 884 |      "data": {
 885 |       "text/plain": [
 886 |        "torch.Size([3, 10, 15])"
 887 |       ]
 888 |      },
 889 |      "execution_count": 33,
 890 |      "metadata": {},
 891 |      "output_type": "execute_result"
 892 |     }
 893 |    ],
 894 |    "source": [
 895 |     "word_model(\n",
 896 |     "    Variable(word_tensors, requires_grad=False),\n",
 897 |     "    Variable(char_tensors, requires_grad=False)\n",
 898 |     ").size()"
 899 |    ]
 900 |   },
 901 |   {
 902 |    "cell_type": "markdown",
 903 |    "metadata": {},
 904 |    "source": [
 905 |     "## ID CNN model\n",
 906 |     "\n",
 907 |     "https://arxiv.org/pdf/1702.02098.pdf"
 908 |    ]
 909 |   },
 910 |   {
 911 |    "cell_type": "code",
 912 |    "execution_count": 34,
 913 |    "metadata": {},
 914 |    "outputs": [],
 915 |    "source": [
 916 |     "class ID_CNN(torch.nn.Module):\n",
 917 |     "    \"\"\"ID CNN Encoder\n",
 918 |     "    \n",
 919 |     "    Input: (batch, input_dims, seqlen)\n",
 920 |     "    Outputs: (batch, input_dims, seqlen)\n",
 921 |     "    \"\"\"\n",
 922 |     "    def __init__(\n",
 923 |     "        self,\n",
 924 |     "        input_dims,\n",
 925 |     "        dialation_block_depth=5,\n",
 926 |     "        field_of_view=2,\n",
 927 |     "        block_stacks=2\n",
 928 |     "    ):\n",
 929 |     "        super(ID_CNN, self).__init__()\n",
 930 |     "        \n",
 931 |     "        # Keep the output feature size equal to the input feature size,\n",
 932 |     "        # so the same block can be applied repeatedly (see block_stacks).\n",
 933 |     "        \n",
 934 |     "        \n",
 935 |     "        self.conv_features = input_dims\n",
 936 |     "        self.conv_kernel = 3\n",
 937 |     "        self.block_stacks = block_stacks\n",
 938 |     "        \n",
 939 |     "        self.word_char_conv1d = torch.nn.Sequential(\n",
 940 |     "            *[\n",
 941 |     "                torch.nn.Sequential(\n",
 942 |     "                    torch.nn.Conv1d(\n",
 943 |     "                        input_dims,\n",
 944 |     "                        self.conv_features,\n",
 945 |     "                        kernel_size=self.conv_kernel,\n",
 946 |     "                        padding=field_of_view**i,\n",
 947 |     "                        dilation=field_of_view**i\n",
 948 |     "                    ),\n",
 949 |     "                    torch.nn.ReLU()\n",
 950 |     "                )\n",
 951 |     "                for i in range(dialation_block_depth)\n",
 952 |     "            ]\n",
 953 |     "        )\n",
 954 |     "        \n",
 955 |     "    def forward(self, seq_scores):\n",
 956 |     "        for block_idx in range(self.block_stacks):\n",
 957 |     "            seq_scores = self.word_char_conv1d(seq_scores)\n",
 958 |     "        return seq_scores\n",
 959 |     "        \n",
 960 |     "        \n",
 961 |     "class IDCNNEncoder(torch.nn.Module):\n",
 962 |     "    \"\"\"IDCNNEncoder - Encodes a sentence from its word and character tensors\n",
 963 |     "    \n",
 964 |     "    Input: \n",
 965 |     "        word_tensors - (batch, seqlen), \n",
 966 |     "        char_tensors - (batch, seqlen, char_seqlen)\n",
 967 |     "    \"\"\"\n",
 968 |     "    def __init__(\n",
 969 |     "        self,\n",
 970 |     "        word_model,\n",
 971 |     "    ):\n",
 972 |     "        super(IDCNNEncoder, self).__init__()\n",
 973 |     "        self.word_model = word_model\n",
 974 |     "        self.id_cnn = ID_CNN(self.word_model.output_size)\n",
 975 |     "        \n",
 976 |     "    def forward(self, word_tensors, char_tensors):\n",
 977 |     "        word_embs = self.word_model(word_tensors, char_tensors)\n",
 978 |     "        word_embs = word_embs.permute(0, 2, 1)\n",
 979 |     "        seq_scores = self.id_cnn(word_embs)\n",
 980 |     "        return seq_scores\n",
 981 |     "    \n",
 982 |     "class IDCNNDecoder(torch.nn.Module):\n",
 983 |     "    def __init__(\n",
 984 |     "        self,\n",
 985 |     "        input_dims,\n",
 986 |     "        num_classes,\n",
 987 |     "        decoder_layers=3\n",
 988 |     "    ):\n",
 989 |     "        super(IDCNNDecoder, self).__init__()\n",
 990 |     "        self.input_dims = input_dims\n",
 991 |     "        self.num_classes = num_classes\n",
 992 |     "        self.decoder_layers = decoder_layers\n",
 993 |     "        self.transform_layer = torch.nn.Sequential(\n",
 994 |     "                torch.nn.Linear(self.input_dims, self.num_classes),\n",
 995 |     "                torch.nn.ReLU()\n",
 996 |     "            )\n",
 997 |     "        self.create_decoder_layers()\n",
 998 |     "        \n",
 999 |     "    def create_decoder_layers(self):\n",
1000 |     "        self.id_cnn = torch.nn.ModuleList(\n",
1001 |     "            [\n",
1002 |     "                ID_CNN(self.num_classes, self.num_classes, block_stacks=1)\n",
1003 |     "                for i in range(self.decoder_layers)\n",
1004 |     "            ]\n",
1005 |     "        )\n",
1006 |     "        \n",
1007 |     "    def forward(self, seq_scores):\n",
1008 |     "        outputs = []\n",
1009 |     "        batch, input_dims, seqlen = seq_scores.size()\n",
1010 |     "        seq_scores = seq_scores.permute(0, 2, 1).contiguous()\n",
1011 |     "        seq_scores = seq_scores.view(batch*seqlen, input_dims)\n",
1012 |     "        seq_scores = self.transform_layer(seq_scores)\n",
1013 |     "        seq_scores = seq_scores.view(batch, seqlen, self.num_classes)\n",
1014 |     "        seq_scores = seq_scores.permute(0, 2, 1)\n",
1015 |     "        for id_cnn in self.id_cnn:\n",
1016 |     "            seq_scores = id_cnn(seq_scores)\n",
1017 |     "            outputs.append(seq_scores)\n",
1018 |     "        return outputs"
1019 |    ]
1020 |   },
1021 |   {
1022 |    "cell_type": "code",
1023 |    "execution_count": 35,
1024 |    "metadata": {},
1025 |    "outputs": [
1026 |     {
1027 |      "data": {
1028 |       "text/plain": [
1029 |        "(torch.Size([3, 10]), torch.Size([3, 10, 15]))"
1030 |       ]
1031 |      },
1032 |      "execution_count": 35,
1033 |      "metadata": {},
1034 |      "output_type": "execute_result"
1035 |     }
1036 |    ],
1037 |    "source": [
1038 |     "id_cnn = IDCNNEncoder(word_model)\n",
1039 |     "word_tensors.size(), char_tensors.size()"
1040 |    ]
1041 |   },
1042 |   {
1043 |    "cell_type": "code",
1044 |    "execution_count": 36,
1045 |    "metadata": {},
1046 |    "outputs": [
1047 |     {
1048 |      "data": {
1049 |       "text/plain": [
1050 |        "torch.Size([3, 15, 10])"
1051 |       ]
1052 |      },
1053 |      "execution_count": 36,
1054 |      "metadata": {},
1055 |      "output_type": "execute_result"
1056 |     }
1057 |    ],
1058 |    "source": [
1059 |     "id_cnn(\n",
1060 |     "    Variable(word_tensors, requires_grad=False),\n",
1061 |     "    Variable(char_tensors, requires_grad=False)\n",
1062 |     ").size()"
1063 |    ]
1064 |   },
1065 |   {
1066 |    "cell_type": "code",
1067 |    "execution_count": 37,
1068 |    "metadata": {},
1069 |    "outputs": [],
1070 |    "source": [
1071 |     "id_cnn_decoder = IDCNNDecoder(15, len(TAG_VOCAB))"
1072 |    ]
1073 |   },
1074 |   {
1075 |    "cell_type": "code",
1076 |    "execution_count": 38,
1077 |    "metadata": {},
1078 |    "outputs": [
1079 |     {
1080 |      "data": {
1081 |       "text/plain": [
1082 |        "[torch.Size([3, 5, 10]), torch.Size([3, 5, 10]), torch.Size([3, 5, 10])]"
1083 |       ]
1084 |      },
1085 |      "execution_count": 38,
1086 |      "metadata": {},
1087 |      "output_type": "execute_result"
1088 |     }
1089 |    ],
1090 |    "source": [
1091 |     "decoder_outputs = id_cnn_decoder(id_cnn(\n",
1092 |     "    Variable(word_tensors, requires_grad=False),\n",
1093 |     "    Variable(char_tensors, requires_grad=False)\n",
1094 |     "))\n",
1095 |     "[output.size() for output in decoder_outputs]"
1096 |    ]
1097 |   },
1098 |   {
1099 |    "cell_type": "code",
1100 |    "execution_count": 39,
1101 |    "metadata": {},
1102 |    "outputs": [],
1103 |    "source": [
1104 |     "def get_loss(decoder_outputs, target, loss_fn):\n",
1105 |     "    batch, seqlen = target.size()[:2]\n",
1106 |     "    #target = target.unsqueeze(2).permute(0,2,1).contiguous().view(-1, 1).squeeze()\n",
1107 |     "    target = target.view(-1)\n",
1108 |     "    #print(target.size())\n",
1109 |     "    loss = None\n",
1110 |     "    for output in decoder_outputs:\n",
1111 |     "        output = output.permute(0,2,1).contiguous().view(-1, output.size()[1])\n",
1112 |     "        #print(output.size())\n",
1113 |     "        if loss is None:\n",
1114 |     "            loss = loss_fn(output, target)\n",
1115 |     "        else: \n",
1116 |     "            loss += loss_fn(output, target)\n",
1117 |     "    return loss\n",
1118 |     "\n",
1119 |     "loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)"
1120 |    ]
1121 |   },
1122 |   {
1123 |    "cell_type": "code",
1124 |    "execution_count": 40,
1125 |    "metadata": {},
1126 |    "outputs": [
1127 |     {
1128 |      "data": {
1129 |       "text/plain": [
1130 |        "torch.Size([30, 5])"
1131 |       ]
1132 |      },
1133 |      "execution_count": 40,
1134 |      "metadata": {},
1135 |      "output_type": "execute_result"
1136 |     }
1137 |    ],
1138 |    "source": [
1139 |     "decoder_outputs[0].permute(0,2,1).contiguous().view(-1, decoder_outputs[0].size()[1]).size()"
1140 |    ]
1141 |   },
1142 |   {
1143 |    "cell_type": "code",
1144 |    "execution_count": 41,
1145 |    "metadata": {},
1146 |    "outputs": [
1147 |     {
1148 |      "data": {
1149 |       "text/plain": [
1150 |        "Variable containing:\n",
1151 |        " 4.7655\n",
1152 |        "[torch.FloatTensor of size 1]"
1153 |       ]
1154 |      },
1155 |      "execution_count": 41,
1156 |      "metadata": {},
1157 |      "output_type": "execute_result"
1158 |     }
1159 |    ],
1160 |    "source": [
1161 |     "get_loss(decoder_outputs, Variable(tag_tensors, requires_grad=False), loss_fn)"
1162 |    ]
1163 |   },
1164 |   {
1165 |    "cell_type": "markdown",
1166 |    "metadata": {},
1167 |    "source": [
1168 |     "## Train model"
1169 |    ]
1170 |   },
1171 |   {
1172 |    "cell_type": "code",
1173 |    "execution_count": 42,
1174 |    "metadata": {
1175 |     "collapsed": true
1176 |    },
1177 |    "outputs": [],
1178 |    "source": [
1179 |     "def train(encoder, decoder, dataloader, num_epochs, history=None):\n",
1180 |     "    if history is None:\n",
1181 |     "        history = []\n",
1182 |     "    cuda = torch.cuda.is_available()\n",
1183 |     "    if cuda:\n",
1184 |     "        encoder.cuda()\n",
1185 |     "        decoder.cuda()\n",
1186 |     "    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()))\n",
1187 |     "    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)\n",
1188 |     "    for i in range(num_epochs):\n",
1189 |     "        per_epoch_losses = []\n",
1190 |     "        for batch in dataloader:\n",
1191 |     "            word_tensors = Variable(batch[0], requires_grad=False)\n",
1192 |     "            char_tensors = Variable(batch[1], requires_grad=False)\n",
1193 |     "            tag_tensors = Variable(batch[2], requires_grad=False)\n",
1194 |     "            seq_len = Variable(batch[3], requires_grad=False)\n",
1195 |     "            if cuda:\n",
1196 |     "                word_tensors = word_tensors.cuda()\n",
1197 |     "                char_tensors = char_tensors.cuda()\n",
1198 |     "                tag_tensors = tag_tensors.cuda()\n",
1199 |     "            optimizer.zero_grad()\n",
1200 |     "            encoding = encoder(word_tensors, char_tensors)\n",
1201 |     "            outputs = decoder(encoding)\n",
1202 |     "            loss = get_loss(outputs, tag_tensors, loss_fn)\n",
1203 |     "            loss.backward()\n",
1204 |     "            optimizer.step()\n",
1205 |     "            per_epoch_losses.append(loss.data[0])\n",
1206 |     "        history.append(np.mean(per_epoch_losses))\n",
1207 |     "        print('epoch[%d] loss: %.4f' % (i, loss.data[0]))\n",
1208 |     "    return history "
1209 |    ]
1210 |   },
1211 |   {
1212 |    "cell_type": "code",
1213 |    "execution_count": 43,
1214 |    "metadata": {
1215 |     "collapsed": true
1216 |    },
1217 |    "outputs": [],
1218 |    "source": [
1219 |     "char_model = CharCNN()\n",
1220 |     "word_model = WordEmbeddings(char_model)\n",
1221 |     "id_cnn = IDCNNEncoder(word_model)\n",
1222 |     "id_cnn_decoder = IDCNNDecoder(15, len(TAG_VOCAB))\n",
1223 |     "history = None"
1224 |    ]
1225 |   },
1226 |   {
1227 |    "cell_type": "code",
1228 |    "execution_count": 44,
1229 |    "metadata": {},
1230 |    "outputs": [
1231 |     {
1232 |      "name": "stdout",
1233 |      "output_type": "stream",
1234 |      "text": [
1235 |       "epoch[0] loss: 4.8157\n",
1236 |       "epoch[1] loss: 4.8139\n",
1237 |       "epoch[2] loss: 4.8121\n",
1238 |       "epoch[3] loss: 4.8102\n",
1239 |       "epoch[4] loss: 4.8084\n",
1240 |       "epoch[5] loss: 4.8066\n",
1241 |       "epoch[6] loss: 4.8048\n",
1242 |       "epoch[7] loss: 4.8030\n",
1243 |       "epoch[8] loss: 4.8011\n",
1244 |       "epoch[9] loss: 4.7993\n"
1245 |      ]
1246 |     }
1247 |    ],
1248 |    "source": [
1249 |     "history = train(id_cnn, id_cnn_decoder, train_loader, 10, history=history)"
1250 |    ]
1251 |   },
1252 |   {
1253 |    "cell_type": "code",
1254 |    "execution_count": null,
1255 |    "metadata": {
1256 |     "collapsed": true
1257 |    },
1258 |    "outputs": [],
1259 |    "source": []
1260 |   },
1261 |   {
1262 |    "cell_type": "code",
1263 |    "execution_count": null,
1264 |    "metadata": {
1265 |     "collapsed": true
1266 |    },
1267 |    "outputs": [],
1268 |    "source": []
1269 |   }
1270 |  ],
1271 |  "metadata": {
1272 |   "kernelspec": {
1273 |    "display_name": "Python [default]",
1274 |    "language": "python",
1275 |    "name": "python3"
1276 |   },
1277 |   "language_info": {
1278 |    "codemirror_mode": {
1279 |     "name": "ipython",
1280 |     "version": 3
1281 |    },
1282 |    "file_extension": ".py",
1283 |    "mimetype": "text/x-python",
1284 |    "name": "python",
1285 |    "nbconvert_exporter": "python",
1286 |    "pygments_lexer": "ipython3",
1287 |    "version": "3.5.3"
1288 |   },
1289 |   "toc": {
1290 |    "nav_menu": {
1291 |     "height": "156px",
1292 |     "width": "160px"
1293 |    },
1294 |    "number_sections": true,
1295 |    "sideBar": true,
1296 |    "skip_h1_title": false,
1297 |    "toc_cell": false,
1298 |    "toc_position": {},
1299 |    "toc_section_display": "block",
1300 |    "toc_window_display": false
1301 |   }
1302 |  },
1303 |  "nbformat": 4,
1304 |  "nbformat_minor": 2
1305 | }
1306 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/PyTorch CONLL 2000 Chunking.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # coding: utf-8
  3 | 
  4 | # In[1]:
  5 | 
  6 | import matplotlib
  7 | matplotlib.use("Agg")
  8 | import torch
  9 | from torch.autograd import Variable
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | import torch.optim as optim
 13 | 
 14 | torch.manual_seed(1)
 15 | 
 16 | import numpy as np
 17 | 
 18 | from tqdm import tqdm
 19 | 
 20 | import pandas as pd
 21 | 
 22 | import matplotlib.pyplot as plt
 23 | import seaborn as sns
 24 | 
 25 | from pytorch_utils import *
 26 | from pytorch_models import *
 27 | from utils import load_sequences, conll_classification_report_to_df
 28 | from conlleval import main as conll_eval
 29 | import re
 30 | 
 31 | import io
 32 | from pathlib import Path
 33 | 
 34 | sns.set_context("poster")
 35 | sns.set_style("ticks")
 36 | 
 37 | 
 38 | # In[2]:
 39 | 
# Locations of the CoNLL-2000 chunking files, relative to the working directory.
TRAIN_CORPUS="data/conll2000/train.txt"
TEST_CORPUS="data/conll2000/test.txt"


# In[3]:

# col_ids=(0, -1) presumably keeps the first and last column of every row
# (word and chunk tag) -- confirm against utils.load_sequences.
train_corpus = load_sequences(TRAIN_CORPUS, sep=" ", col_ids=(0, -1))
# Hold out the first 100 training sequences as the development set.
train_corpus, dev_corpus = train_corpus[100:], train_corpus[:100]
print("Total items in train corpus: %s" % len(train_corpus))
print("Total items in dev corpus: %s" % len(dev_corpus))
test_corpus = load_sequences(TEST_CORPUS, sep=" ", col_ids=(0, -1))
print("Total items in test corpus: %s" % len(test_corpus))


# In[4]:

# Notebook display of the first training sequence; a no-op when run as a script.
train_corpus[0]
 57 | 
 58 | 
 59 | # In[5]:
 60 | 
 61 | def create_vocab(data, vocabs, char_vocab, word_idx=0):
 62 |     n_vocabs = len(vocabs)
 63 |     for sent in data:
 64 |         for token_tags in sent:
 65 |             for vocab_id in range(n_vocabs):
 66 |                 vocabs[vocab_id].add(token_tags[vocab_id])
 67 |             char_vocab.batch_add(token_tags[word_idx])
 68 |     print("Created vocabs: %s, chars[%s]" % (", ".join(
 69 |         "{}[{}]".format(vocab.name, vocab.size)
 70 |         for vocab in vocabs
 71 |     ), char_vocab.size))
 72 | 
 73 | 
 74 | # In[6]:
 75 | 
# One vocabulary per input/output stream: lower-cased words, raw characters
# and chunk tags. Only the word and character vocabularies get an UNK symbol.
word_vocab = Vocab("words", UNK="UNK", lower=True)
char_vocab = Vocab("chars", UNK="<U>", lower=False)
chunk_vocab = Vocab("chunk_tags", lower=False)

# NOTE(review): the vocabularies are built from train + dev + test together,
# so test-set words are never mapped to UNK -- confirm this is intended.
create_vocab(train_corpus+dev_corpus+test_corpus, [word_vocab, chunk_vocab], char_vocab)
 81 | 
 82 | 
 83 | # In[7]:
 84 | 
 85 | def data2tensors(data, vocabs, char_vocab, word_idx=0, column_ids=(0, -1)):
 86 |     vocabs = [vocabs[idx] for idx in column_ids]
 87 |     n_vocabs = len(vocabs)
 88 |     tensors = []
 89 |     char_tensors = []
 90 |     for sent in data:
 91 |         sent_vecs = [[] for i in range(n_vocabs+1)] # Last is for char vecs
 92 |         char_vecs = []
 93 |         for token_tags in sent:
 94 |             vocab_id = 0 # First column is the word
 95 |             # lowercase the word
 96 |             sent_vecs[vocab_id].append(
 97 |                     vocabs[vocab_id].getidx(token_tags[vocab_id].lower())
 98 |                 )
 99 |             for vocab_id in range(1, n_vocabs):
100 |                 sent_vecs[vocab_id].append(
101 |                     vocabs[vocab_id].getidx(token_tags[vocab_id])
102 |                 )
103 |             sent_vecs[-1].append(
104 |                 [char_vocab.getidx(c) for c in token_tags[word_idx]]
105 |             )
106 |         tensors.append(sent_vecs)
107 |     return tensors
108 | 
109 | 
110 | # In[8]:
111 | 
# Index every split with the shared vocabularies; each item holds the word
# indices, the chunk-tag indices and the per-word character indices.
train_tensors = data2tensors(train_corpus, [word_vocab, chunk_vocab], char_vocab)
dev_tensors = data2tensors(dev_corpus, [word_vocab, chunk_vocab], char_vocab)
test_tensors = data2tensors(test_corpus, [word_vocab, chunk_vocab], char_vocab)
print("Train: {}, Dev: {}, Test: {}".format(
    len(train_tensors),
    len(dev_tensors),
    len(test_tensors),
))
120 | 
121 | 
122 | # In[9]:
123 | 
def load_word_vectors(vector_file, ndims, vocab, cache_file, override_cache=False):
    """Build a row-normalised embedding matrix for `vocab` from a text vector file.

    Args:
        vector_file: path to a whitespace-separated text file with one
            ``word v1 ... v_ndims`` entry per line; read as UTF-8.
        ndims: expected dimensionality of every vector.
        vocab: vocabulary exposing ``size``, ``offset`` and ``getidx(word)``.
        cache_file: ``.npy`` file used to cache the resulting matrix.
        override_cache: when true, ignore an existing cache and rebuild it.

    Returns:
        A ``float32`` array of shape ``(vocab.size, ndims)``. Rows of words
        missing from `vector_file` stay all-zero; every other row is scaled
        to unit L2 norm. When the cache file exists and `override_cache` is
        false, the cached array is returned as-is without reading
        `vector_file`.

    Raises:
        AssertionError: if a line does not contain exactly ``ndims`` values.
    """
    # Check for a cached file first so no work is done on a cache hit.
    cache_file = Path(cache_file)
    if cache_file.is_file() and not override_cache:
        return np.load(cache_file)

    # Else load vectors from the vector file.
    W = np.zeros((vocab.size, ndims), dtype="float32")
    total, found = 0, 0
    # Explicit encoding: do not depend on the platform's default locale.
    with open(vector_file, encoding="utf-8") as fp:
        for line in fp:
            fields = line.strip().split()
            if not fields:
                continue
            total += 1
            assert len(fields) == ndims+1, (
                "{} vector dims {} doesn't match ndims={}".format(fields[0], len(fields)-1, ndims)
            )
            idx = vocab.getidx(fields[0])
            # Indices below vocab.offset are skipped (presumably reserved
            # entries such as UNK -- confirm against Vocab).
            if idx >= vocab.offset:
                found += 1
                # Accumulate: several file entries may map to the same index.
                W[idx, :] += np.array([float(value) for value in fields[1:]])

    # Guard the percentage against an empty vocabulary.
    coverage = found * 100/vocab.size if vocab.size else 0.0
    print("Found {} [{:.2f}%] vectors from {} vectors in {} with ndims={}".format(
        found, coverage, total, vector_file, ndims))

    # Scale every non-zero row to unit length; all-zero rows are left alone.
    norm_W = np.sqrt((W*W).sum(axis=1, keepdims=True))
    valid_idx = norm_W[:, 0] != 0
    W[valid_idx, :] /= norm_W[valid_idx]

    # Write to cache file.
    print("Caching embedding with shape {} to {}".format(W.shape, cache_file.as_posix()))
    np.save(cache_file, W)
    return W
156 |         
157 |                     
158 |                 
159 | 
160 | 
161 | # In[10]:
162 | 
# Plain-Python equivalent of the notebook's `%%time` cell, so this export also
# runs outside IPython (where get_ipython() is undefined).
# NOTE(review): embedding_file is a machine-specific absolute path.
import time

embedding_file = "/home/napsternxg/datadrive/Downloads/Glove/glove.6B.100d.txt"
cache_file = "conll2000.glove.100.npy"
ndims = 100
_embedding_start = time.perf_counter()
pretrained_embeddings = load_word_vectors(embedding_file, ndims, word_vocab, cache_file)
print("Wall time: {:.2f} s".format(time.perf_counter() - _embedding_start))
164 | 
165 | 
166 | # In[11]:
167 | 
def plot_losses(train_losses, eval_losses=None, plot_std=False, ax=None):
    """Plot per-epoch mean loss curves for training and evaluation.

    Args:
        train_losses: sequence of ``(mean, std)`` pairs, one per epoch.
        eval_losses: same layout as `train_losses`. ``None`` or an empty
            sequence means there is no evaluation curve to draw.
        plot_std: when true, shade the ``mean +/- std`` band of each curve.
        ax: axes to draw on; defaults to the current matplotlib axes.
    """
    if ax is None:
        ax = plt.gca()
    series = (
        (train_losses, "0.5", "Train"),
        (eval_losses, "r", "Eval"),
    )
    for losses, color, label in series:
        # A missing or empty series has nothing to unpack; skip it instead
        # of failing in zip(*losses).
        if losses is None or len(losses) == 0:
            continue
        mean_loss, std_loss = zip(*losses)
        mean_loss = np.array(mean_loss)
        std_loss = np.array(std_loss)
        ax.plot(
            mean_loss, color=color, label=label,
            linestyle="-",
        )
        if plot_std:
            ax.fill_between(
                np.arange(mean_loss.shape[0]),
                mean_loss-std_loss,
                mean_loss+std_loss,
                color=color,
                alpha=0.3
            )
    ax.set_xlabel("Epochs")
    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r"Mean Loss ($\pm$ S.D.)")
193 |     
194 |     
def print_predictions(corpus, predictions, filename, label_vocab):
    """Write gold and predicted labels to `filename`, one token per line.

    Every line is ``token<TAB>gold_label<TAB>predicted_label`` and each
    sequence is followed by an empty line. `predictions` holds label indices
    that are turned back into labels through ``label_vocab.idx2item``.
    Sequences and tokens are paired with ``zip``, so extra items on either
    side are ignored.
    """
    row_template = "{}\t{}\t{}"
    with open(filename, "w+") as out:
        for sentence, predicted_ids in zip(corpus, predictions):
            for (word, gold), predicted_id in zip(sentence, predicted_ids):
                guess = label_vocab.idx2item[predicted_id]
                print(row_template.format(word, gold, guess), file=out)
            # Blank line terminates the sequence.
            print(file=out)
202 | 
203 | 
204 | # In[12]:
205 | 
# Small character-level embedding used only to sanity-check the modules below.
char_emb_size=10
output_channels=50
kernel_sizes=[2, 3]
char_embedding = CharEmbedding(char_vocab.size, char_emb_size, output_channels, kernel_sizes)


# In[13]:

# Smoke test: feed one 4-character index sequence and inspect the output size
# (the value is only displayed in a notebook).
char_embedding(Variable(torch.LongTensor([[1,1,2,3]]), requires_grad=False)).size()


# In[14]:

# Combined word + character embedding, configured with the same character
# settings as above and dropout 0.2.
word_emb_size=100
char_embed_kwargs=dict(
    vocab_size=char_vocab.size,
    embedding_size=char_emb_size,
    out_channels=output_channels,
    kernel_sizes=kernel_sizes
)
word_char_embedding = WordCharEmbedding(
    word_vocab.size, word_emb_size, char_embed_kwargs, dropout=0.2)
228 | 
229 | 
230 | # In[15]:
231 | 
def charseq2varlist(X_chars):
    """Wrap every character-index list in its own ``1 x len`` LongTensor Variable.

    Returns a list with one non-differentiable Variable per entry of
    `X_chars`, in the same order.
    """
    char_variables = []
    for char_ids in X_chars:
        # The extra list level adds a leading dimension of size 1.
        as_tensor = torch.LongTensor([char_ids])
        char_variables.append(Variable(as_tensor, requires_grad=False))
    return char_variables
234 | 
235 | 
236 | # In[16]:
237 | 
# The word-index list and the per-word character list of the first training
# instance; both lengths are printed for comparison.
print(len(train_tensors[0][0]))
print(len(train_tensors[0][-1]))


# In[17]:

# Notebook display only.
train_corpus[0]


# In[18]:

# Notebook display of the per-word character Variables for the first instance.
charseq2varlist(train_tensors[0][-1])


# In[19]:

# Smoke test of the combined embedding on the first training instance: the
# word indices are wrapped in an extra list level, the characters are passed
# as one Variable per word.
word_char_embedding(
    Variable(torch.LongTensor([train_tensors[0][0]]), requires_grad=False),
    charseq2varlist(train_tensors[0][-1])
).size()
258 | 
259 | 
260 | # In[20]:
261 | 
def assign_embeddings(embedding_module, pretrained_embeddings, fix_embedding=False):
    """Copy a NumPy embedding matrix into `embedding_module`'s weight in place.

    When `fix_embedding` is true the weight is additionally marked as not
    requiring gradients.
    """
    pretrained = torch.from_numpy(pretrained_embeddings)
    embedding_module.weight.data.copy_(pretrained)
    if not fix_embedding:
        return
    embedding_module.weight.requires_grad = False
266 | 
267 | 
268 | # In[21]:
269 | 
270 | assign_embeddings(word_char_embedding.word_embeddings, pretrained_embeddings, fix_embedding=True)
271 | 
272 | 
273 | # ## Class based
274 | 
275 | # In[22]:
276 | 
class ModelWrapper(object):
    """Base class pairing a torch model with its loss and device placement.

    Subclasses are expected to implement `_process_instance_tensors`,
    `get_instance_loss` and `predict`; the base versions raise
    ``NotImplementedError``.
    """

    def __init__(self, model,
                 loss_function,
                 use_cuda=False
                ):
        self.model = model
        self.loss_function = loss_function

        self.use_cuda = use_cuda
        if self.use_cuda:
            self.model.cuda()

    def _process_instance_tensors(self, instance_tensors):
        """Turn one raw instance into model inputs; defined by subclasses."""
        raise NotImplementedError("Please define this function explicitly")

    def zero_grad(self):
        """Reset the gradients of the wrapped model."""
        self.model.zero_grad()

    def get_parameters(self):
        """Return the parameters of the wrapped model."""
        return self.model.parameters()

    def set_model_mode(self, training_mode=True):
        """Put the model in training mode, or in evaluation mode otherwise."""
        if training_mode:
            self.model.train()
        else:
            self.model.eval()

    def save(self, filename):
        """Serialise the whole model object to `filename` with ``torch.save``."""
        torch.save(self.model, filename)
        print("{} model saved to {}".format(self.model.__class__, filename))

    def load(self, filename):
        """Replace the wrapped model with the object stored in `filename`."""
        # NOTE(review): torch.load unpickles the file; only load files from a
        # trusted source.
        self.model = torch.load(filename)
        if self.use_cuda:
            self.model.cuda()

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return the loss of a single instance; defined by subclasses."""
        if zero_grad:
            ## Clear gradients before every update else memory runs out
            self.zero_grad()
        raise NotImplementedError("Please define this function explicitly")

    def predict(self, instance_tensors):
        """Return the prediction for a single instance; defined by subclasses."""
        raise NotImplementedError("Please define this function explicitly")

    def predict_batch(self, batch_tensors):
        """Return a list with `predict` applied to every instance, in order."""
        return [
            self.predict(instance_tensors)
            for instance_tensors in batch_tensors
        ]
327 |         
328 |         
def get_epoch_function(model_wrapper, optimizer,
                       use_cuda=False):
    """Build a function that runs one pass (epoch) over a dataset.

    Args:
        model_wrapper: object providing ``set_model_mode``, ``zero_grad`` and
            ``get_instance_loss(instance, zero_grad=...)``.
        optimizer: object providing ``step()``; only used in training mode.
        use_cuda: move the per-batch loss accumulator to the GPU.

    Returns:
        ``perform_epoch(data_tensors, training_mode=True, batch_size=1)``,
        which shuffles `data_tensors` (an array of instances), splits it into
        ``len // batch_size`` batches (at least one), and returns the list of
        mean batch losses as Python floats. An empty dataset yields ``[]``.
    """
    def perform_epoch(data_tensors, training_mode=True, batch_size=1):
        model_wrapper.set_model_mode(training_mode)
        step_losses = []
        data_tensors = np.random.permutation(data_tensors)
        n_instances = data_tensors.shape[0]
        if n_instances == 0:
            # Nothing to iterate over; avoids a division by zero below.
            return step_losses
        # array_split needs at least one section, even when there are fewer
        # instances than batch_size.
        n_splits = max(1, n_instances // batch_size)
        for batch_tensors in np.array_split(data_tensors, n_splits):
            model_wrapper.zero_grad()
            loss = Variable(torch.FloatTensor([0.]))
            if use_cuda:
                loss = loss.cuda()
            for instance_tensors in batch_tensors:
                loss += model_wrapper.get_instance_loss(instance_tensors, zero_grad=False)
            loss = loss/batch_tensors.shape[0] # Mean loss
            # float() yields a plain number whether indexing returns a Python
            # float or a 0-d tensor.
            step_losses.append(float(loss.data[0]))
            if training_mode:
                ## Get gradients of model params wrt. loss
                loss.backward()
                ## Optimize the loss by one step
                optimizer.step()
        return step_losses
    return perform_epoch
353 | 
def write_losses(losses, fp, title="train", epoch=0):
    """Log every batch loss of one epoch to `fp`, then a summary line.

    One left-aligned line is written per entry of `losses`, followed by an
    "overall" line carrying the mean and standard deviation of the epoch.
    """
    for batch_no, batch_loss in enumerate(losses):
        batch_line = "{:<10} epoch={:<3} batch={:<5} loss={:<10}".format(
            title, epoch, batch_no, batch_loss
        )
        print(batch_line, file=fp)
    epoch_mean = np.mean(losses)
    epoch_std = np.std(losses)
    summary_line = "{:<10} epoch={:<3} {:<11} mean={:<10.3f} std={:<10.3f}".format(
        title, epoch, "overall", epoch_mean, epoch_std
    )
    print(summary_line, file=fp)
362 | 
363 | 
def _as_object_array(instances):
    """Pack `instances` into a 1-D object array, one element per instance."""
    if isinstance(instances, np.ndarray):
        return instances
    instances = list(instances)
    packed = np.empty(len(instances), dtype=object)
    # Assign element by element: np.array() on ragged nested lists raises
    # ValueError on current NumPy instead of building an object array.
    for position, instance in enumerate(instances):
        packed[position] = instance
    return packed


def training_wrapper(
    model_wrapper, data_tensors,
    eval_tensors=None,
    optimizer=optim.SGD,
    optimizer_kwargs=None,
    n_epochs=10,
    batch_size=1,
    use_cuda=False,
    log_file="training_output.log"
):
    """Train the wrapped model and log the per-batch losses.

    Args:
        model_wrapper: wrapper whose ``model`` attribute is trained.
        data_tensors: training instances (sequence or array).
        eval_tensors: optional instances evaluated after every epoch.
        optimizer: optimizer class, instantiated on the parameters that
            require gradients.
        optimizer_kwargs: extra keyword arguments for the optimizer.
        n_epochs: number of passes over `data_tensors`.
        batch_size: batch size of the training pass.
        use_cuda: forwarded to the epoch function.
        log_file: path of the loss log; overwritten on every call.

    Returns:
        Dict with ``"training_loss"`` and ``"evaluation_loss"``, each a list
        of ``(mean, std)`` tuples per epoch. The evaluation list is empty
        when `eval_tensors` is ``None``.
    """
    if optimizer_kwargs is None:
        optimizer_kwargs = {}
    # Filter out parameters which don't require a gradient
    parameters = [
        param for param in model_wrapper.model.parameters()
        if param.requires_grad
    ]
    optimizer = optimizer(parameters, **optimizer_kwargs)
    # Start training
    losses = []
    eval_losses = []
    data_tensors = _as_object_array(data_tensors)
    if eval_tensors is not None:
        eval_tensors = _as_object_array(eval_tensors)
    perform_epoch = get_epoch_function(
        model_wrapper,
        optimizer,
        use_cuda=use_cuda)
    with open(log_file, "w+") as fp:
        for epoch in tqdm(range(n_epochs)):
            step_losses = perform_epoch(data_tensors, batch_size=batch_size)
            mean_loss, std_loss = np.mean(step_losses), np.std(step_losses)
            losses.append((mean_loss, std_loss))
            write_losses(step_losses, fp, title="train", epoch=epoch)
            if eval_tensors is not None:
                # Evaluation uses perform_epoch's default batch_size of 1.
                step_losses = perform_epoch(eval_tensors, training_mode=False)
                mean_loss, std_loss = np.mean(step_losses), np.std(step_losses)
                eval_losses.append((mean_loss, std_loss))
                write_losses(step_losses, fp, title="eval", epoch=epoch)
    return {
        "training_loss": losses,
        "evaluation_loss": eval_losses
    }
407 | 
408 | 
409 | # In[23]:
410 | 
class LSTMTaggerModel(ModelWrapper):
    """Wrapper for a tagger that takes word and character inputs.

    An instance is a ``(word_ids, tag_ids, char_ids)`` triple; the wrapped
    model is called as ``model(X, X_char)`` and its output is scored against
    the tag ids with `loss_function`.
    """

    def __init__(self, model,
                 loss_function,
                 use_cuda=False):
        # Same setup as the base class: store the model and loss, and move
        # the model to the GPU when requested.
        super().__init__(model, loss_function, use_cuda=use_cuda)

    def _process_instance_tensors(self, instance_tensors):
        """Return ``(X, X_char, Y)`` Variables for one instance.

        ``X`` gets an extra leading dimension of size 1, ``Y`` is a flat
        LongTensor Variable and ``X_char`` is a list with one Variable per
        word. Everything is moved to the GPU when ``use_cuda`` is set.
        """
        X, Y, X_char = instance_tensors
        X = Variable(torch.LongTensor([X]), requires_grad=False)
        Y = Variable(torch.LongTensor(Y), requires_grad=False)
        X_char = charseq2varlist(X_char)
        if self.use_cuda:
            X = X.cuda()
            Y = Y.cuda()
            X_char = [t.cuda() for t in X_char]
        return X, X_char, Y

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return the loss of the model output against the gold tags."""
        if zero_grad:
            ## Clear gradients before every update else memory runs out
            self.model.zero_grad()
        X, X_char, Y = self._process_instance_tensors(instance_tensors)
        # Call the module itself rather than .forward() so registered hooks run.
        return self.loss_function(self.model(X, X_char), Y)

    def predict(self, instance_tensors):
        """Return the arg-max label index per token as a flat NumPy array."""
        X, X_char, Y = self._process_instance_tensors(instance_tensors)
        prediction = self.model(X, X_char)
        # max(1) returns (values, indices); keep the indices.
        return prediction.data.cpu().max(1)[1].numpy().ravel()
446 | 
447 | 
448 | # In[24]:
449 | 
# Hyper-parameters of the LSTM tagger experiment.
# NOTE(review): use_cuda is hard-coded; presumably this fails on a machine
# without CUDA -- confirm before running elsewhere.
use_cuda=True
n_embed=100
hidden_size=100
batch_size=10

char_emb_size=50
output_channels=50
kernel_sizes=[2, 3]

word_emb_size=100
char_embed_kwargs=dict(
    vocab_size=char_vocab.size,
    embedding_size=char_emb_size,
    out_channels=output_channels,
    kernel_sizes=kernel_sizes
)

# Fresh embedding module for this experiment, without dropout.
word_char_embedding = WordCharEmbedding(
        word_vocab.size, word_emb_size,
        char_embed_kwargs, dropout=0)
# Assign glove embeddings
assign_embeddings(word_char_embedding.word_embeddings, pretrained_embeddings, fix_embedding=True)

model_wrapper = LSTMTaggerModel(
    LSTMTaggerWordChar(word_char_embedding, n_embed, hidden_size, chunk_vocab.size),
    nn.NLLLoss(), use_cuda=use_cuda)


# In[25]:

# Smoke test: loss of the untrained model on the first training instance.
model_wrapper.get_instance_loss(train_tensors[0])


# In[26]:

# Number of parameter tensors in the model (notebook display only).
len(list(model_wrapper.model.parameters()))


# In[27]:

# Train with Adam at its default learning rate and no weight decay, then
# save the whole model object.
n_epochs=5
training_history = training_wrapper(
    model_wrapper, train_tensors,
    eval_tensors=dev_tensors,
    optimizer=optim.Adam,
    optimizer_kwargs={
        #"lr": 0.01,
        "weight_decay": 0.0
    },
    n_epochs=n_epochs,
    batch_size=batch_size,
    use_cuda=use_cuda,
    log_file="LSTMTaggerModel_CONLL2000.log"
)
model_wrapper.save("LSTMTaggerModel_CONLL2000")


# In[28]:

# Predicted label indices for the first training instance.
preds = model_wrapper.predict(train_tensors[0])
preds


# In[29]:

# Plot the train/dev loss curves with their std bands and save them as a PDF.
fig, ax = plt.subplots(1,1)
plot_losses(training_history["training_loss"],
            training_history["evaluation_loss"],
            plot_std=True,
            ax=ax)
ax.legend()
sns.despine(offset=5)
plt.savefig("LSTMTaggerModel_CONLL2000.pdf")
523 | 
524 | # In[30]:
525 | 
# Predict every split, write the predictions to "<split>.chunking.conll" and
# score the file with conlleval. The notebook's `%time` magic is replaced by
# explicit timing so the script also runs outside IPython.
import time

for title, tensors, corpus in zip(
    ["train", "dev", "test"],
    [train_tensors, dev_tensors, test_tensors],
    [train_corpus, dev_corpus, test_corpus],
):
    _predict_start = time.perf_counter()
    predictions = model_wrapper.predict_batch(tensors)
    print("Wall time: {:.2f} s".format(time.perf_counter() - _predict_start))
    print_predictions(corpus, predictions, "%s.chunking.conll" % title, chunk_vocab)
    conll_eval(["conlleval", "%s.chunking.conll" % title])
534 | 
535 | 
536 | # ## CRF model
537 | 
538 | # In[31]:
539 | 
class BiLSTMTaggerWordCRFModel(ModelWrapper):
    """Wrapper for a word+character tagger that has its own CRF loss.

    The loss comes from ``model.loss(X, X_char, Y)``, so the
    `loss_function` argument is accepted for signature compatibility but
    ignored (``self.loss_function`` is always ``None``).
    """

    def __init__(self, model,
                 loss_function,
                 use_cuda=False):
        # The base class stores the model and handles GPU placement; the
        # external loss function is deliberately discarded.
        super().__init__(model, None, use_cuda=use_cuda)

    def _process_instance_tensors(self, instance_tensors):
        """Return ``(X, X_char, Y)`` for one instance.

        ``X`` is a Variable with an extra leading dimension of size 1,
        ``X_char`` a list with one Variable per word, and ``Y`` a plain
        LongTensor (not wrapped in a Variable). Everything is moved to the
        GPU when ``use_cuda`` is set.
        """
        X, Y, X_char = instance_tensors
        X = Variable(torch.LongTensor([X]), requires_grad=False)
        Y = torch.LongTensor(Y)
        X_char = charseq2varlist(X_char)
        if self.use_cuda:
            X = X.cuda()
            Y = Y.cuda()
            X_char = [t.cuda() for t in X_char]
        return X, X_char, Y

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return ``model.loss`` for one instance."""
        if zero_grad:
            ## Clear gradients before every update else memory runs out
            self.model.zero_grad()
        X, X_char, Y = self._process_instance_tensors(instance_tensors)
        return self.model.loss(X, X_char, Y)

    def predict(self, instance_tensors):
        """Return the second element of the CRF output for the emissions.

        NOTE(review): presumably the decoded tag sequence -- confirm against
        the CRF implementation in pytorch_models.
        """
        X, X_char, Y = self._process_instance_tensors(instance_tensors)
        # Call the module itself rather than .forward() so registered hooks run.
        emissions = self.model(X, X_char)
        return self.model.crf.forward(emissions)[1]
575 | 
576 | 
577 | # In[32]:
578 | 
# Hyper-parameters shared by the model and the training call.
use_cuda = True
n_embed = 100
hidden_size = 128
batch_size = 64

# Settings forwarded to the character-level embedding.
char_emb_size = 50
output_channels = 50
kernel_sizes = [2, 3]

word_emb_size = 100
char_embed_kwargs = {
    "vocab_size": char_vocab.size,
    "embedding_size": char_emb_size,
    "out_channels": output_channels,
    "kernel_sizes": kernel_sizes,
}

word_char_embedding = WordCharEmbedding(
    word_vocab.size,
    word_emb_size,
    char_embed_kwargs,
    dropout=0,
)
# Load the pretrained (GloVe) vectors into the word embedding table.
# NOTE(review): fix_embedding=True presumably freezes them -- confirm in
# assign_embeddings.
assign_embeddings(
    word_char_embedding.word_embeddings,
    pretrained_embeddings,
    fix_embedding=True,
)

# The wrapper ignores its loss_function argument, hence the explicit None.
model_wrapper = BiLSTMTaggerWordCRFModel(
    LSTMTaggerWordCharCRF(
        word_char_embedding, n_embed, hidden_size, chunk_vocab.size
    ),
    None,
    use_cuda=use_cuda,
)
605 | 
606 | 
607 | # In[33]:
608 | 
# Train the CRF tagger with Adam, using the dev split as evaluation data,
# then save the wrapper under a fixed name.
n_epochs = 50
training_history = training_wrapper(
    model_wrapper,
    train_tensors,
    eval_tensors=dev_tensors,
    optimizer=optim.Adam,
    # No "lr" entry is passed (0.01 was tried and left disabled), so the
    # optimizer's own default learning rate presumably applies.
    optimizer_kwargs={"weight_decay": 0},
    n_epochs=n_epochs,
    batch_size=batch_size,
    use_cuda=use_cuda,
    log_file="BiLSTMTaggerWordCRFModel_CONLL2000.log",
)
model_wrapper.save("BiLSTMTaggerWordCRFModel_CONLL2000")
624 | 
625 | 
626 | # In[34]:
627 | 
# Draw the training and evaluation loss histories on a single axis and save
# the current figure as a PDF.
fig, ax = plt.subplots(nrows=1, ncols=1)
plot_losses(
    training_history["training_loss"],
    training_history["evaluation_loss"],
    plot_std=True,
    ax=ax,
)
ax.legend()
sns.despine(offset=5)
plt.savefig("BiLSTMTaggerWordCRFModel_CONLL2000.pdf")
636 | 
637 | # Performance may improve by creating all the torch tensors upfront and then pinning them to memory
638 | 
639 | 
640 | # In[35]:
641 | 
import time

# Run the trained CRF model over every data split, hand the predictions to
# print_predictions together with a per-split "<title>.chunking.conll" file
# name, and pass that same file name to conll_eval.
# NOTE(review): print_predictions presumably writes that file and conll_eval
# presumably scores it -- confirm against their definitions.
for title, tensors, corpus in zip(
        ["train", "dev", "test"],
        [train_tensors, dev_tensors, test_tensors],
        [train_corpus, dev_corpus, test_corpus]):
    # Plain wall-clock timing replaces the IPython-only ``%time`` line magic:
    # ``get_ipython().magic`` is deprecated and is undefined when this file is
    # run as a regular Python script. It also evaluated its statement from a
    # string, silently depending on the loop variable being named ``tensors``.
    start = time.perf_counter()
    predictions = model_wrapper.predict_batch(tensors)
    print("Wall time: %.2f s" % (time.perf_counter() - start))

    output_file = "%s.chunking.conll" % title
    print_predictions(corpus, predictions, output_file, chunk_vocab)
    conll_eval(["conlleval", output_file])
650 | 
651 | 
652 | 


--------------------------------------------------------------------------------
/Pytorch Gradient reversal.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Gradient reversal pytorch\n",
  8 |     "\n",
  9 |     "Inspired by the following tweets:\n",
 10 |     "\n",
 11 |     "* https://twitter.com/mat_kelcey/status/932149793765261313\n",
 12 |     "* https://twitter.com/ericjang11/status/932073259721359363\n",
 13 |     "\n",
 14 |     "Basic idea:\n",
 15 |     "\n",
 16 |     "```python\n",
 17 |     "# Add something to gradient\n",
 18 |     "f(x) + g(x) - tf.stop_gradient(g(x))\n",
 19 |     "\n",
 20 |     "# Reverse gradient\n",
 21 |     "tf.stop_gradient(f(x)*2) - f(x)\n",
 22 |     "```"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 1,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "import torch\n",
 32 |     "import tensorflow as tf\n",
 33 |     "from torch.autograd import Variable\n",
 34 |     "\n",
 35 |     "import numpy as np"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 2,
 41 |    "metadata": {
 42 |     "collapsed": true
 43 |    },
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "def f(X):\n",
 47 |     "    return X*X\n",
 48 |     "\n",
 49 |     "def g(X):\n",
 50 |     "    return X**3"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 3,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "data": {
 60 |       "text/plain": [
 61 |        "array([ 0.01995021, -0.32892969,  0.75804777,  0.172995  ,  0.69747771,\n",
 62 |        "        1.11414039, -0.69194092,  2.43364877,  0.92732815, -0.91409348])"
 63 |       ]
 64 |      },
 65 |      "execution_count": 3,
 66 |      "metadata": {},
 67 |      "output_type": "execute_result"
 68 |     }
 69 |    ],
 70 |    "source": [
 71 |     "X = np.random.randn(10)\n",
 72 |     "X"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "## Tensorflow implementation"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 4,
 85 |    "metadata": {
 86 |     "collapsed": true
 87 |    },
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "sess = tf.InteractiveSession()"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 5,
 96 |    "metadata": {},
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "tf_X = tf.Variable(X)\n",
100 |     "init_op = tf.global_variables_initializer()"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 6,
106 |    "metadata": {},
107 |    "outputs": [
108 |     {
109 |      "data": {
110 |       "text/plain": [
111 |        "array([ 0.01995021, -0.32892969,  0.75804777,  0.172995  ,  0.69747771,\n",
112 |        "        1.11414039, -0.69194092,  2.43364877,  0.92732815, -0.91409348])"
113 |       ]
114 |      },
115 |      "execution_count": 6,
116 |      "metadata": {},
117 |      "output_type": "execute_result"
118 |     }
119 |    ],
120 |    "source": [
121 |     "sess.run(init_op)\n",
122 |     "sess.run(tf_X)"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 7,
128 |    "metadata": {
129 |     "collapsed": true
130 |    },
131 |    "outputs": [],
132 |    "source": [
133 |     "forward_op = f(tf_X)"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 8,
139 |    "metadata": {},
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "array([  3.98010770e-04,   1.08194738e-01,   5.74636421e-01,\n",
145 |        "         2.99272683e-02,   4.86475162e-01,   1.24130881e+00,\n",
146 |        "         4.78782241e-01,   5.92264633e+00,   8.59937506e-01,\n",
147 |        "         8.35566890e-01])"
148 |       ]
149 |      },
150 |      "execution_count": 8,
151 |      "metadata": {},
152 |      "output_type": "execute_result"
153 |     }
154 |    ],
155 |    "source": [
156 |     "sess.run(forward_op)"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 9,
162 |    "metadata": {
163 |     "collapsed": true
164 |    },
165 |    "outputs": [],
166 |    "source": [
167 |     "gradient_op = tf.gradients(forward_op, tf_X)"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 10,
173 |    "metadata": {},
174 |    "outputs": [
175 |     {
176 |      "data": {
177 |       "text/plain": [
178 |        "[array([ 0.03990041, -0.65785937,  1.51609554,  0.34598999,  1.39495543,\n",
179 |        "         2.22828078, -1.38388185,  4.86729754,  1.85465631, -1.82818696])]"
180 |       ]
181 |      },
182 |      "execution_count": 10,
183 |      "metadata": {},
184 |      "output_type": "execute_result"
185 |     }
186 |    ],
187 |    "source": [
188 |     "sess.run(gradient_op)"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 11,
194 |    "metadata": {},
195 |    "outputs": [
196 |     {
197 |      "data": {
198 |       "text/plain": [
199 |        "array([ 0.03990041, -0.65785937,  1.51609554,  0.34598999,  1.39495543,\n",
200 |        "        2.22828078, -1.38388185,  4.86729754,  1.85465631, -1.82818696])"
201 |       ]
202 |      },
203 |      "execution_count": 11,
204 |      "metadata": {},
205 |      "output_type": "execute_result"
206 |     }
207 |    ],
208 |    "source": [
209 |     "X*2 # This should match the gradient above"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "markdown",
214 |    "metadata": {},
215 |    "source": [
216 |     "### Modify the gradients\n",
217 |     "Keep forward pass the same. \n",
218 |     "The trick is to add $g(x)$, chosen so that $g'(x)$ is the desired gradient modifier, during the forward pass and then subtract it again, while stopping gradients from flowing through the subtracted term.\n",
219 |     "\n",
220 |     "Without the stop, $f(x) + g(x) - g(x)$ would have gradient $f'(x) + g'(x) - g'(x) = f'(x)$. Because no gradient flows through the subtracted $g(x)$, the $-g'(x)$ term vanishes and the new gradient is $f'(x) + g'(x)$, while the forward value is still $f(x)$."
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 12,
226 |    "metadata": {
227 |     "collapsed": true
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "gradient_modifier_op = g(tf_X)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 13,
237 |    "metadata": {},
238 |    "outputs": [
239 |     {
240 |      "data": {
241 |       "text/plain": [
242 |        "array([  7.94039737e-06,  -3.55884610e-02,   4.35601858e-01,\n",
243 |        "         5.17726764e-03,   3.39305584e-01,   1.38299228e+00,\n",
244 |        "        -3.31289026e-01,   1.44136410e+01,   7.97444260e-01,\n",
245 |        "        -7.63786246e-01])"
246 |       ]
247 |      },
248 |      "execution_count": 13,
249 |      "metadata": {},
250 |      "output_type": "execute_result"
251 |     }
252 |    ],
253 |    "source": [
254 |     "sess.run(gradient_modifier_op)"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 14,
260 |    "metadata": {
261 |     "collapsed": true
262 |    },
263 |    "outputs": [],
264 |    "source": [
265 |     "modified_forward_op = (f(tf_X) + g(tf_X) - tf.stop_gradient(g(tf_X)))\n",
266 |     "modified_backward_op = tf.gradients(modified_forward_op, tf_X)"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 15,
272 |    "metadata": {},
273 |    "outputs": [
274 |     {
275 |      "data": {
276 |       "text/plain": [
277 |        "array([  3.98010770e-04,   1.08194738e-01,   5.74636421e-01,\n",
278 |        "         2.99272683e-02,   4.86475162e-01,   1.24130881e+00,\n",
279 |        "         4.78782241e-01,   5.92264633e+00,   8.59937506e-01,\n",
280 |        "         8.35566890e-01])"
281 |       ]
282 |      },
283 |      "execution_count": 15,
284 |      "metadata": {},
285 |      "output_type": "execute_result"
286 |     }
287 |    ],
288 |    "source": [
289 |     "sess.run(modified_forward_op)"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 16,
295 |    "metadata": {},
296 |    "outputs": [
297 |     {
298 |      "data": {
299 |       "text/plain": [
300 |        "[array([  0.04109445,  -0.33327516,   3.2400048 ,   0.4357718 ,\n",
301 |        "          2.85438092,   5.95220721,   0.05246488,  22.63523654,\n",
302 |        "          4.43446883,   0.67851371])]"
303 |       ]
304 |      },
305 |      "execution_count": 16,
306 |      "metadata": {},
307 |      "output_type": "execute_result"
308 |     }
309 |    ],
310 |    "source": [
311 |     "sess.run(modified_backward_op)"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": 17,
317 |    "metadata": {},
318 |    "outputs": [
319 |     {
320 |      "data": {
321 |       "text/plain": [
322 |        "array([  0.04109445,  -0.33327516,   3.2400048 ,   0.4357718 ,\n",
323 |        "         2.85438092,   5.95220721,   0.05246488,  22.63523654,\n",
324 |        "         4.43446883,   0.67851371])"
325 |       ]
326 |      },
327 |      "execution_count": 17,
328 |      "metadata": {},
329 |      "output_type": "execute_result"
330 |     }
331 |    ],
332 |    "source": [
333 |     "2*X + 3*(X**2) # This should match the gradients above"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "markdown",
338 |    "metadata": {},
339 |    "source": [
340 |     "### Gradient reversal\n",
341 |     "\n",
342 |     "Here the modifying function is simply $g(x) = -2f(x)$, which turns the gradient into $f'(x) - 2f'(x) = -f'(x)$ while leaving the forward value $f(x)$ unchanged."
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "code",
347 |    "execution_count": 18,
348 |    "metadata": {},
349 |    "outputs": [],
350 |    "source": [
351 |     "gradient_reversal_op = (tf.stop_gradient(2*f(tf_X)) - f(tf_X))\n",
352 |     "gradient_reversal_grad_op = tf.gradients(gradient_reversal_op, tf_X)"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": 19,
358 |    "metadata": {},
359 |    "outputs": [
360 |     {
361 |      "data": {
362 |       "text/plain": [
363 |        "array([  3.98010770e-04,   1.08194738e-01,   5.74636421e-01,\n",
364 |        "         2.99272683e-02,   4.86475162e-01,   1.24130881e+00,\n",
365 |        "         4.78782241e-01,   5.92264633e+00,   8.59937506e-01,\n",
366 |        "         8.35566890e-01])"
367 |       ]
368 |      },
369 |      "execution_count": 19,
370 |      "metadata": {},
371 |      "output_type": "execute_result"
372 |     }
373 |    ],
374 |    "source": [
375 |     "sess.run(gradient_reversal_op)"
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "code",
380 |    "execution_count": 20,
381 |    "metadata": {},
382 |    "outputs": [
383 |     {
384 |      "data": {
385 |       "text/plain": [
386 |        "[array([-0.03990041,  0.65785937, -1.51609554, -0.34598999, -1.39495543,\n",
387 |        "        -2.22828078,  1.38388185, -4.86729754, -1.85465631,  1.82818696])]"
388 |       ]
389 |      },
390 |      "execution_count": 20,
391 |      "metadata": {},
392 |      "output_type": "execute_result"
393 |     }
394 |    ],
395 |    "source": [
396 |     "sess.run(gradient_reversal_grad_op)"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": 21,
402 |    "metadata": {},
403 |    "outputs": [
404 |     {
405 |      "data": {
406 |       "text/plain": [
407 |        "array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])"
408 |       ]
409 |      },
410 |      "execution_count": 21,
411 |      "metadata": {},
412 |      "output_type": "execute_result"
413 |     }
414 |    ],
415 |    "source": [
416 |     "sess.run((gradient_op[0] + gradient_reversal_grad_op[0])) # This should be zero. Signifying grad is reversed. "
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "markdown",
421 |    "metadata": {},
422 |    "source": [
423 |     "## PyTorch case"
424 |    ]
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": 22,
429 |    "metadata": {
430 |     "collapsed": true
431 |    },
432 |    "outputs": [],
433 |    "source": [
434 |     "def zero_grad(X):\n",
435 |     "    if X.grad is not None:\n",
436 |     "        X.grad.data.zero_()"
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": 23,
442 |    "metadata": {
443 |     "collapsed": true
444 |    },
445 |    "outputs": [],
446 |    "source": [
447 |     "torch_X = Variable(torch.FloatTensor(X), requires_grad=True)"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": 24,
453 |    "metadata": {},
454 |    "outputs": [
455 |     {
456 |      "data": {
457 |       "text/plain": [
458 |        "array([ 0.01995021, -0.32892969,  0.75804776,  0.172995  ,  0.6974777 ,\n",
459 |        "        1.11414039, -0.6919409 ,  2.43364882,  0.92732817, -0.91409349], dtype=float32)"
460 |       ]
461 |      },
462 |      "execution_count": 24,
463 |      "metadata": {},
464 |      "output_type": "execute_result"
465 |     }
466 |    ],
467 |    "source": [
468 |     "torch_X.data.numpy()"
469 |    ]
470 |   },
471 |   {
472 |    "cell_type": "code",
473 |    "execution_count": 25,
474 |    "metadata": {},
475 |    "outputs": [
476 |     {
477 |      "data": {
478 |       "text/plain": [
479 |        "array([  3.98010772e-04,   1.08194746e-01,   5.74636400e-01,\n",
480 |        "         2.99272705e-02,   4.86475140e-01,   1.24130881e+00,\n",
481 |        "         4.78782207e-01,   5.92264652e+00,   8.59937549e-01,\n",
482 |        "         8.35566938e-01], dtype=float32)"
483 |       ]
484 |      },
485 |      "execution_count": 25,
486 |      "metadata": {},
487 |      "output_type": "execute_result"
488 |     }
489 |    ],
490 |    "source": [
491 |     "f(torch_X).data.numpy()"
492 |    ]
493 |   },
494 |   {
495 |    "cell_type": "code",
496 |    "execution_count": 26,
497 |    "metadata": {},
498 |    "outputs": [
499 |     {
500 |      "data": {
501 |       "text/plain": [
502 |        "array([  7.94039715e-06,  -3.55884619e-02,   4.35601830e-01,\n",
503 |        "         5.17726783e-03,   3.39305550e-01,   1.38299227e+00,\n",
504 |        "        -3.31288993e-01,   1.44136410e+01,   7.97444284e-01,\n",
505 |        "        -7.63786316e-01], dtype=float32)"
506 |       ]
507 |      },
508 |      "execution_count": 26,
509 |      "metadata": {},
510 |      "output_type": "execute_result"
511 |     }
512 |    ],
513 |    "source": [
514 |     "g(torch_X).data.numpy()"
515 |    ]
516 |   },
517 |   {
518 |    "cell_type": "code",
519 |    "execution_count": 27,
520 |    "metadata": {},
521 |    "outputs": [
522 |     {
523 |      "data": {
524 |       "text/plain": [
525 |        "array([ 0.03990041, -0.65785939,  1.51609552,  0.34599   ,  1.3949554 ,\n",
526 |        "        2.22828078, -1.38388181,  4.86729765,  1.85465634, -1.82818699], dtype=float32)"
527 |       ]
528 |      },
529 |      "execution_count": 27,
530 |      "metadata": {},
531 |      "output_type": "execute_result"
532 |     }
533 |    ],
534 |    "source": [
535 |     "zero_grad(torch_X)\n",
536 |     "f_X = f(torch_X)\n",
537 |     "f_X.backward(torch.ones(f_X.size()))\n",
538 |     "torch_X.grad.data.numpy()"
539 |    ]
540 |   },
541 |   {
542 |    "cell_type": "code",
543 |    "execution_count": 28,
544 |    "metadata": {},
545 |    "outputs": [
546 |     {
547 |      "data": {
548 |       "text/plain": [
549 |        "array([ 0.03990041, -0.65785937,  1.51609554,  0.34598999,  1.39495543,\n",
550 |        "        2.22828078, -1.38388185,  4.86729754,  1.85465631, -1.82818696])"
551 |       ]
552 |      },
553 |      "execution_count": 28,
554 |      "metadata": {},
555 |      "output_type": "execute_result"
556 |     }
557 |    ],
558 |    "source": [
559 |     "2*X"
560 |    ]
561 |   },
562 |   {
563 |    "cell_type": "markdown",
564 |    "metadata": {},
565 |    "source": [
566 |     "### Modify gradients"
567 |    ]
568 |   },
569 |   {
570 |    "cell_type": "code",
571 |    "execution_count": 29,
572 |    "metadata": {
573 |     "collapsed": true
574 |    },
575 |    "outputs": [],
576 |    "source": [
577 |     "modified_gradients_forward = lambda x: f(x) + g(x) - g(x).detach()"
578 |    ]
579 |   },
580 |   {
581 |    "cell_type": "code",
582 |    "execution_count": 30,
583 |    "metadata": {},
584 |    "outputs": [
585 |     {
586 |      "data": {
587 |       "text/plain": [
588 |        "array([  0.04109445,  -0.33327514,   3.24000454,   0.43577182,\n",
589 |        "         2.85438085,   5.95220757,   0.05246484,  22.63523865,\n",
590 |        "         4.43446875,   0.67851377], dtype=float32)"
591 |       ]
592 |      },
593 |      "execution_count": 30,
594 |      "metadata": {},
595 |      "output_type": "execute_result"
596 |     }
597 |    ],
598 |    "source": [
599 |     "zero_grad(torch_X)\n",
600 |     "modified_grad = modified_gradients_forward(torch_X)\n",
601 |     "modified_grad.backward(torch.ones(modified_grad.size()))\n",
602 |     "torch_X.grad.data.numpy()"
603 |    ]
604 |   },
605 |   {
606 |    "cell_type": "code",
607 |    "execution_count": 31,
608 |    "metadata": {},
609 |    "outputs": [
610 |     {
611 |      "data": {
612 |       "text/plain": [
613 |        "array([  0.04109445,  -0.33327516,   3.2400048 ,   0.4357718 ,\n",
614 |        "         2.85438092,   5.95220721,   0.05246488,  22.63523654,\n",
615 |        "         4.43446883,   0.67851371])"
616 |       ]
617 |      },
618 |      "execution_count": 31,
619 |      "metadata": {},
620 |      "output_type": "execute_result"
621 |     }
622 |    ],
623 |    "source": [
624 |     "2*X + 3*(X*X) # It should be same as above"
625 |    ]
626 |   },
627 |   {
628 |    "cell_type": "markdown",
629 |    "metadata": {},
630 |    "source": [
631 |     "### Gradient reversal"
632 |    ]
633 |   },
634 |   {
635 |    "cell_type": "code",
636 |    "execution_count": 32,
637 |    "metadata": {
638 |     "collapsed": true
639 |    },
640 |    "outputs": [],
641 |    "source": [
642 |     "gradient_reversal = lambda x: (2*f(x)).detach() - f(x)"
643 |    ]
644 |   },
645 |   {
646 |    "cell_type": "code",
647 |    "execution_count": 33,
648 |    "metadata": {},
649 |    "outputs": [
650 |     {
651 |      "data": {
652 |       "text/plain": [
653 |        "array([-0.03990041,  0.65785939, -1.51609552, -0.34599   , -1.3949554 ,\n",
654 |        "       -2.22828078,  1.38388181, -4.86729765, -1.85465634,  1.82818699], dtype=float32)"
655 |       ]
656 |      },
657 |      "execution_count": 33,
658 |      "metadata": {},
659 |      "output_type": "execute_result"
660 |     }
661 |    ],
662 |    "source": [
663 |     "zero_grad(torch_X)\n",
664 |     "grad_reverse = gradient_reversal(torch_X)\n",
665 |     "grad_reverse.backward(torch.ones(grad_reverse.size()))\n",
666 |     "torch_X.grad.data.numpy()"
667 |    ]
668 |   },
669 |   {
670 |    "cell_type": "code",
671 |    "execution_count": 34,
672 |    "metadata": {},
673 |    "outputs": [
674 |     {
675 |      "data": {
676 |       "text/plain": [
677 |        "array([-0.03990041,  0.65785937, -1.51609554, -0.34598999, -1.39495543,\n",
678 |        "       -2.22828078,  1.38388185, -4.86729754, -1.85465631,  1.82818696])"
679 |       ]
680 |      },
681 |      "execution_count": 34,
682 |      "metadata": {},
683 |      "output_type": "execute_result"
684 |     }
685 |    ],
686 |    "source": [
687 |     "-2*X # It should be same as above"
688 |    ]
689 |   },
690 |   {
691 |    "cell_type": "markdown",
692 |    "metadata": {},
693 |    "source": [
694 |     "### Pytorch backward hooks"
695 |    ]
696 |   },
697 |   {
698 |    "cell_type": "code",
699 |    "execution_count": 35,
700 |    "metadata": {},
701 |    "outputs": [
702 |     {
703 |      "data": {
704 |       "text/plain": [
705 |        "array([-0.03990041,  0.65785939, -1.51609552, -0.34599   , -1.3949554 ,\n",
706 |        "       -2.22828078,  1.38388181, -4.86729765, -1.85465634,  1.82818699], dtype=float32)"
707 |       ]
708 |      },
709 |      "execution_count": 35,
710 |      "metadata": {},
711 |      "output_type": "execute_result"
712 |     }
713 |    ],
714 |    "source": [
715 |     "# Gradient reversal\n",
716 |     "zero_grad(torch_X)\n",
717 |     "f_X = f(torch_X)\n",
718 |     "f_X.register_hook(lambda grad: -grad)\n",
719 |     "f_X.backward(torch.ones(f_X.size()))\n",
720 |     "torch_X.grad.data.numpy()"
721 |    ]
722 |   },
723 |   {
724 |    "cell_type": "code",
725 |    "execution_count": 36,
726 |    "metadata": {},
727 |    "outputs": [
728 |     {
729 |      "data": {
730 |       "text/plain": [
731 |        "array([-0.03990041,  0.65785937, -1.51609554, -0.34598999, -1.39495543,\n",
732 |        "       -2.22828078,  1.38388185, -4.86729754, -1.85465631,  1.82818696])"
733 |       ]
734 |      },
735 |      "execution_count": 36,
736 |      "metadata": {},
737 |      "output_type": "execute_result"
738 |     }
739 |    ],
740 |    "source": [
741 |     "-2*X"
742 |    ]
743 |   },
744 |   {
745 |    "cell_type": "code",
746 |    "execution_count": 37,
747 |    "metadata": {},
748 |    "outputs": [
749 |     {
750 |      "data": {
751 |       "text/plain": [
752 |        "array([  0.04109445,  -0.33327514,   3.24000454,   0.43577182,\n",
753 |        "         2.85438085,   5.95220757,   0.05246484,  22.63523865,\n",
754 |        "         4.43446875,   0.67851377], dtype=float32)"
755 |       ]
756 |      },
757 |      "execution_count": 37,
758 |      "metadata": {},
759 |      "output_type": "execute_result"
760 |     }
761 |    ],
762 |    "source": [
763 |     "# Modified grad example\n",
764 |     "zero_grad(torch_X)\n",
765 |     "h = torch_X.register_hook(lambda grad: grad + 3*(torch_X*torch_X))\n",
766 |     "f_X = f(torch_X)\n",
767 |     "f_X.backward(torch.ones(f_X.size()))\n",
768 |     "h.remove()\n",
769 |     "torch_X.grad.data.numpy()"
770 |    ]
771 |   },
772 |   {
773 |    "cell_type": "code",
774 |    "execution_count": 38,
775 |    "metadata": {},
776 |    "outputs": [
777 |     {
778 |      "data": {
779 |       "text/plain": [
780 |        "array([  0.04109445,  -0.33327516,   3.2400048 ,   0.4357718 ,\n",
781 |        "         2.85438092,   5.95220721,   0.05246488,  22.63523654,\n",
782 |        "         4.43446883,   0.67851371])"
783 |       ]
784 |      },
785 |      "execution_count": 38,
786 |      "metadata": {},
787 |      "output_type": "execute_result"
788 |     }
789 |    ],
790 |    "source": [
791 |     "2*X + 3*(X*X) # It should be same as above"
792 |    ]
793 |   },
794 |   {
795 |    "cell_type": "code",
796 |    "execution_count": null,
797 |    "metadata": {
798 |     "collapsed": true
799 |    },
800 |    "outputs": [],
801 |    "source": []
802 |   }
803 |  ],
804 |  "metadata": {
805 |   "kernelspec": {
806 |    "display_name": "Python [default]",
807 |    "language": "python",
808 |    "name": "python3"
809 |   },
810 |   "language_info": {
811 |    "codemirror_mode": {
812 |     "name": "ipython",
813 |     "version": 3
814 |    },
815 |    "file_extension": ".py",
816 |    "mimetype": "text/x-python",
817 |    "name": "python",
818 |    "nbconvert_exporter": "python",
819 |    "pygments_lexer": "ipython3",
820 |    "version": "3.6.1"
821 |   }
822 |  },
823 |  "nbformat": 4,
824 |  "nbformat_minor": 2
825 | }
826 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pytorch-practice
 2 | Some example scripts on pytorch
 3 | 
 4 | ## CONLL 2000 Chunking task
 5 | 
 6 | Uses BiLSTM CRF loss with char CNN embeddings. To run use:
 7 | 
 8 | ```
 9 | cd data/conll2000
10 | bash get_data.sh
11 | cd ../..
12 | python chunking_bilstm_crf_char_concat.py # Takes around 8 hours on a Tesla K80 GPU
13 | ```
14 | 
15 | 92.82% mean F1 on test data. 
16 | 
17 | 


--------------------------------------------------------------------------------
/Seq_EWC_losses.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/Seq_EWC_losses.pdf


--------------------------------------------------------------------------------
/Seq_EWC_predictions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/Seq_EWC_predictions.pdf


--------------------------------------------------------------------------------
/Viterbi decoding and CRF.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "np.random.seed(2017)\n",
 13 |     "\n",
 14 |     "import torch\n",
 15 |     "torch.manual_seed(2017)\n",
 16 |     "\n",
 17 |     "from scipy.misc import logsumexp # Use it for reference checking implementation"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 2,
 23 |    "metadata": {},
 24 |    "outputs": [
 25 |     {
 26 |      "name": "stdout",
 27 |      "output_type": "stream",
 28 |      "text": [
 29 |       "Emissions:\n",
 30 |       "[[  9.   6.]\n",
 31 |       " [ 13.  10.]\n",
 32 |       " [  8.  18.]\n",
 33 |       " [  3.  15.]]\n",
 34 |       "Transitions:\n",
 35 |       "[[ 7.  8.]\n",
 36 |       " [ 0.  8.]]\n"
 37 |      ]
 38 |     }
 39 |    ],
 40 |    "source": [
 41 |     "seq_length, num_states=4, 2\n",
 42 |     "emissions = np.random.randint(20, size=(seq_length,num_states))*1.\n",
 43 |     "transitions = np.random.randint(10, size=(num_states, num_states))*1.\n",
 44 |     "print(\"Emissions:\", emissions, sep=\"\\n\")\n",
 45 |     "print(\"Transitions:\", transitions, sep=\"\\n\")"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 3,
 51 |    "metadata": {
 52 |     "collapsed": true
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "def viterbi_decoding(emissions, transitions):\n",
 57 |     "    # Use help from: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/crf/python/ops/crf.py\n",
 58 |     "    scores = np.zeros_like(emissions)\n",
 59 |     "    back_pointers = np.zeros_like(emissions, dtype=\"int\")\n",
 60 |     "    scores = emissions[0]\n",
 61 |     "    # Generate most likely scores and paths for each step in sequence\n",
 62 |     "    for i in range(1, emissions.shape[0]):\n",
 63 |     "        score_with_transition = np.expand_dims(scores, 1) + transitions\n",
 64 |     "        scores = emissions[i] + score_with_transition.max(axis=0)\n",
 65 |     "        back_pointers[i] = np.argmax(score_with_transition, 0)\n",
 66 |     "    # Generate the most likely path\n",
 67 |     "    viterbi = [np.argmax(scores)]\n",
 68 |     "    for bp in reversed(back_pointers[1:]):\n",
 69 |     "        viterbi.append(bp[viterbi[-1]])\n",
 70 |     "    viterbi.reverse()\n",
 71 |     "    viterbi_score = np.max(scores)\n",
 72 |     "    return viterbi_score, viterbi"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 4,
 78 |    "metadata": {},
 79 |    "outputs": [
 80 |     {
 81 |      "data": {
 82 |       "text/plain": [
 83 |        "(78.0, [0, 0, 1, 1])"
 84 |       ]
 85 |      },
 86 |      "execution_count": 4,
 87 |      "metadata": {},
 88 |      "output_type": "execute_result"
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "viterbi_decoding(emissions, transitions)"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 5,
 98 |    "metadata": {
 99 |     "collapsed": true
100 |    },
101 |    "outputs": [],
102 |    "source": [
103 |     "def viterbi_decoding_torch(emissions, transitions):\n",
104 |     "    scores = torch.zeros(emissions.size(1))\n",
105 |     "    back_pointers = torch.zeros(emissions.size()).int()\n",
106 |     "    scores = scores + emissions[0]\n",
107 |     "    # Generate most likely scores and paths for each step in sequence\n",
108 |     "    for i in range(1, emissions.size(0)):\n",
109 |     "        scores_with_transitions = scores.unsqueeze(1).expand_as(transitions) + transitions\n",
110 |     "        max_scores, back_pointers[i] = torch.max(scores_with_transitions, 0)\n",
111 |     "        scores = emissions[i] + max_scores\n",
112 |     "    # Generate the most likely path\n",
113 |     "    viterbi = [scores.numpy().argmax()]\n",
114 |     "    back_pointers = back_pointers.numpy()\n",
115 |     "    for bp in reversed(back_pointers[1:]):\n",
116 |     "        viterbi.append(bp[viterbi[-1]])\n",
117 |     "    viterbi.reverse()\n",
118 |     "    viterbi_score = scores.numpy().max()\n",
119 |     "    return viterbi_score, viterbi\n",
120 |     "    "
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": 6,
126 |    "metadata": {},
127 |    "outputs": [
128 |     {
129 |      "data": {
130 |       "text/plain": [
131 |        "(78.0, [0, 0, 1, 1])"
132 |       ]
133 |      },
134 |      "execution_count": 6,
135 |      "metadata": {},
136 |      "output_type": "execute_result"
137 |     }
138 |    ],
139 |    "source": [
140 |     "viterbi_decoding_torch(torch.Tensor(emissions), torch.Tensor(transitions))"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 7,
146 |    "metadata": {},
147 |    "outputs": [
148 |     {
149 |      "data": {
150 |       "text/plain": [
151 |        "(78.0, [0, 0, 1, 1])"
152 |       ]
153 |      },
154 |      "execution_count": 7,
155 |      "metadata": {},
156 |      "output_type": "execute_result"
157 |     }
158 |    ],
159 |    "source": [
160 |     "viterbi_decoding(emissions, transitions)"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 8,
166 |    "metadata": {
167 |     "collapsed": true
168 |    },
169 |    "outputs": [],
170 |    "source": [
171 |     "def log_sum_exp(vecs, axis=None, keepdims=False):\n",
172 |     "    ## Use help from: https://github.com/scipy/scipy/blob/v0.18.1/scipy/misc/common.py#L20-L140\n",
173 |     "    max_val = vecs.max(axis=axis, keepdims=True)\n",
174 |     "    vecs = vecs - max_val\n",
175 |     "    if not keepdims:\n",
176 |     "        max_val = max_val.squeeze(axis=axis)\n",
177 |     "    out_val = np.log(np.exp(vecs).sum(axis=axis, keepdims=keepdims))\n",
178 |     "    return max_val + out_val"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": 9,
184 |    "metadata": {
185 |     "collapsed": true
186 |    },
187 |    "outputs": [],
188 |    "source": [
189 |     "def score_sequence(emissions, transitions, tags):\n",
190 |     "    # Use help from: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/crf/python/ops/crf.py\n",
191 |     "    score = emissions[0][tags[0]]\n",
192 |     "    for i, emission in enumerate(emissions[1:]):\n",
193 |     "        score = score + transitions[tags[i], tags[i+1]] + emission[tags[i+1]]\n",
194 |     "    return score"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 10,
200 |    "metadata": {},
201 |    "outputs": [
202 |     {
203 |      "data": {
204 |       "text/plain": [
205 |        "42.0"
206 |       ]
207 |      },
208 |      "execution_count": 10,
209 |      "metadata": {},
210 |      "output_type": "execute_result"
211 |     }
212 |    ],
213 |    "source": [
214 |     "score_sequence(emissions, transitions, [1,1,0,0])"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 11,
220 |    "metadata": {},
221 |    "outputs": [
222 |     {
223 |      "data": {
224 |       "text/plain": [
225 |        "[7.0, 8.0, 8.0]"
226 |       ]
227 |      },
228 |      "execution_count": 11,
229 |      "metadata": {},
230 |      "output_type": "execute_result"
231 |     }
232 |    ],
233 |    "source": [
234 |     "correct_seq = [0, 0, 1, 1]\n",
235 |     "[transitions[correct_seq[i],correct_seq[i+1]] for i in range(len(correct_seq) -1)]"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 12,
241 |    "metadata": {},
242 |    "outputs": [
243 |     {
244 |      "data": {
245 |       "text/plain": [
246 |        "23.0"
247 |       ]
248 |      },
249 |      "execution_count": 12,
250 |      "metadata": {},
251 |      "output_type": "execute_result"
252 |     }
253 |    ],
254 |    "source": [
255 |     "sum([transitions[correct_seq[i], correct_seq[i+1]] for i in range(len(correct_seq) -1)])"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 13,
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "data": {
265 |       "text/plain": [
266 |        "(78.0, [0, 0, 1, 1])"
267 |       ]
268 |      },
269 |      "execution_count": 13,
270 |      "metadata": {},
271 |      "output_type": "execute_result"
272 |     }
273 |    ],
274 |    "source": [
275 |     "viterbi_decoding(emissions, transitions)"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": 14,
281 |    "metadata": {},
282 |    "outputs": [
283 |     {
284 |      "data": {
285 |       "text/plain": [
286 |        "78.0"
287 |       ]
288 |      },
289 |      "execution_count": 14,
290 |      "metadata": {},
291 |      "output_type": "execute_result"
292 |     }
293 |    ],
294 |    "source": [
295 |     "score_sequence(emissions, transitions, [0, 0, 1, 1])"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "code",
300 |    "execution_count": 15,
301 |    "metadata": {
302 |     "collapsed": true
303 |    },
304 |    "outputs": [],
305 |    "source": [
306 |     "def score_sequence_torch(emissions, transitions, tags):\n",
307 |     "    score = emissions[0][tags[0]]\n",
308 |     "    for i, emission in enumerate(emissions[1:]):\n",
309 |     "        score = score + transitions[tags[i], tags[i+1]] + emission[tags[i+1]]\n",
310 |     "    return score"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 16,
316 |    "metadata": {},
317 |    "outputs": [
318 |     {
319 |      "data": {
320 |       "text/plain": [
321 |        "78.0"
322 |       ]
323 |      },
324 |      "execution_count": 16,
325 |      "metadata": {},
326 |      "output_type": "execute_result"
327 |     }
328 |    ],
329 |    "source": [
330 |     "score_sequence_torch(torch.Tensor(emissions), torch.Tensor(transitions), [0, 0, 1, 1])"
331 |    ]
332 |   },
333 |   {
334 |    "cell_type": "code",
335 |    "execution_count": 17,
336 |    "metadata": {},
337 |    "outputs": [
338 |     {
339 |      "data": {
340 |       "text/plain": [
341 |        "[[0, 0, 0, 0],\n",
342 |        " [1, 0, 0, 0],\n",
343 |        " [0, 1, 0, 0],\n",
344 |        " [1, 1, 0, 0],\n",
345 |        " [0, 0, 1, 0],\n",
346 |        " [1, 0, 1, 0],\n",
347 |        " [0, 1, 1, 0],\n",
348 |        " [1, 1, 1, 0],\n",
349 |        " [0, 0, 0, 1],\n",
350 |        " [1, 0, 0, 1],\n",
351 |        " [0, 1, 0, 1],\n",
352 |        " [1, 1, 0, 1],\n",
353 |        " [0, 0, 1, 1],\n",
354 |        " [1, 0, 1, 1],\n",
355 |        " [0, 1, 1, 1],\n",
356 |        " [1, 1, 1, 1]]"
357 |       ]
358 |      },
359 |      "execution_count": 17,
360 |      "metadata": {},
361 |      "output_type": "execute_result"
362 |     }
363 |    ],
364 |    "source": [
365 |     "def get_all_tags(seq_length, num_labels):\n",
366 |     "    if seq_length == 0:\n",
367 |     "        yield []\n",
368 |     "        return\n",
369 |     "    for sequence in get_all_tags(seq_length-1, num_labels):\n",
370 |     "        #print(sequence, seq_length)\n",
371 |     "        for label in range(num_labels):\n",
372 |     "            yield [label] + sequence        \n",
373 |     "list(get_all_tags(4,2))"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": 18,
379 |    "metadata": {},
380 |    "outputs": [
381 |     {
382 |      "data": {
383 |       "text/plain": [
384 |        "[[0, 0], [0, 1], [1, 0], [1, 1]]"
385 |       ]
386 |      },
387 |      "execution_count": 18,
388 |      "metadata": {},
389 |      "output_type": "execute_result"
390 |     }
391 |    ],
392 |    "source": [
393 |     "def get_all_tags_dp(seq_length, num_labels):\n",
394 |     "    prior_tags = [[]]\n",
395 |     "    for i in range(1, seq_length+1):\n",
396 |     "        new_tags = []\n",
397 |     "        for label in range(num_labels):\n",
398 |     "            for tags in prior_tags:\n",
399 |     "                new_tags.append([label] + tags)\n",
400 |     "        prior_tags = new_tags\n",
401 |     "    return new_tags\n",
402 |     "list(get_all_tags_dp(2,2))"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 19,
408 |    "metadata": {},
409 |    "outputs": [
410 |     {
411 |      "name": "stdout",
412 |      "output_type": "stream",
413 |      "text": [
414 |       "[54.0, 67.0, 58.0, 78.0, 45.0, 58.0, 56.0, 76.0, 44.0, 57.0, 48.0, 68.0, 42.0, 55.0, 53.0, 73.0]\n"
415 |      ]
416 |     }
417 |    ],
418 |    "source": [
419 |     "def brute_force_score(emissions, transitions):\n",
420 |     "    # This is for ensuring the correctness of the dynamic programming method.\n",
421 |     "    # DO NOT run with very high values of number of labels or sequence lengths\n",
422 |     "    for tags in get_all_tags_dp(*emissions.shape):\n",
423 |     "        yield score_sequence(emissions, transitions, tags)\n",
424 |     "\n",
425 |     "        \n",
426 |     "brute_force_sequence_scores = list(brute_force_score(emissions, transitions))\n",
427 |     "print(brute_force_sequence_scores)"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": 20,
433 |    "metadata": {},
434 |    "outputs": [
435 |     {
436 |      "data": {
437 |       "text/plain": [
438 |        "78.0"
439 |       ]
440 |      },
441 |      "execution_count": 20,
442 |      "metadata": {},
443 |      "output_type": "execute_result"
444 |     }
445 |    ],
446 |    "source": [
447 |     "max(brute_force_sequence_scores) # Best score calculated using brute force"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": 21,
453 |    "metadata": {},
454 |    "outputs": [
455 |     {
456 |      "data": {
457 |       "text/plain": [
458 |        "78.132899613126483"
459 |       ]
460 |      },
461 |      "execution_count": 21,
462 |      "metadata": {},
463 |      "output_type": "execute_result"
464 |     }
465 |    ],
466 |    "source": [
467 |     "log_sum_exp(np.array(brute_force_sequence_scores)) # Partition function"
468 |    ]
469 |   },
470 |   {
471 |    "cell_type": "code",
472 |    "execution_count": 22,
473 |    "metadata": {
474 |     "collapsed": true
475 |    },
476 |    "outputs": [],
477 |    "source": [
478 |     "def forward_algorithm_naive(emissions, transitions):\n",
479 |     "    scores = emissions[0]\n",
480 |     "    # Get the log sum exp score\n",
481 |     "    for i in range(1,emissions.shape[0]):\n",
482 |     "        print(scores)\n",
483 |     "        alphas_t = np.zeros_like(scores) # Forward vars at timestep t\n",
484 |     "        for j in range(emissions.shape[1]):\n",
485 |     "            emit_score = emissions[i,j]\n",
486 |     "            trans_score = transitions.T[j]\n",
487 |     "            next_tag_var = scores + trans_score\n",
488 |     "            alphas_t[j] = log_sum_exp(next_tag_var) + emit_score\n",
489 |     "        scores = alphas_t\n",
490 |     "    return log_sum_exp(scores)"
491 |    ]
492 |   },
493 |   {
494 |    "cell_type": "code",
495 |    "execution_count": 23,
496 |    "metadata": {},
497 |    "outputs": [
498 |     {
499 |      "name": "stdout",
500 |      "output_type": "stream",
501 |      "text": [
502 |       "[ 9.  6.]\n",
503 |       "[ 29.0000454   27.04858735]\n",
504 |       "[ 44.00017494  55.13288499]\n"
505 |      ]
506 |     },
507 |     {
508 |      "data": {
509 |       "text/plain": [
510 |        "78.132899613126483"
511 |       ]
512 |      },
513 |      "execution_count": 23,
514 |      "metadata": {},
515 |      "output_type": "execute_result"
516 |     }
517 |    ],
518 |    "source": [
519 |     "forward_algorithm_naive(emissions, transitions)"
520 |    ]
521 |   },
522 |   {
523 |    "cell_type": "code",
524 |    "execution_count": 24,
525 |    "metadata": {
526 |     "collapsed": true
527 |    },
528 |    "outputs": [],
529 |    "source": [
530 |     "def forward_algorithm_vec_check(emissions, transitions):\n",
531 |     "    # This is for checking the correctness of log_sum_exp function compared to scipy\n",
532 |     "    scores = emissions[0]\n",
533 |     "    scores_naive = emissions[0]\n",
534 |     "    # Get the log sum exp score\n",
535 |     "    for i in range(1, emissions.shape[0]):\n",
536 |     "        print(scores, scores_naive)\n",
537 |     "        scores = emissions[i] + logsumexp(\n",
538 |     "            scores_naive + transitions.T,\n",
539 |     "            axis=1)\n",
540 |     "        scores_naive = emissions[i] + np.array([log_sum_exp(\n",
541 |     "            scores_naive + transitions.T[j]) for j in range(emissions.shape[1])])\n",
542 |     "    print(scores, scores_naive)\n",
543 |     "    return logsumexp(scores), log_sum_exp(scores_naive)"
544 |    ]
545 |   },
546 |   {
547 |    "cell_type": "code",
548 |    "execution_count": 25,
549 |    "metadata": {},
550 |    "outputs": [
551 |     {
552 |      "name": "stdout",
553 |      "output_type": "stream",
554 |      "text": [
555 |       "[ 9.  6.] [ 9.  6.]\n",
556 |       "[ 29.0000454   27.04858735] [ 29.0000454   27.04858735]\n",
557 |       "[ 44.00017494  55.13288499] [ 44.00017494  55.13288499]\n",
558 |       "[ 58.14879707  78.13289961] [ 58.14879707  78.13289961]\n"
559 |      ]
560 |     },
561 |     {
562 |      "data": {
563 |       "text/plain": [
564 |        "(78.132899613126483, 78.132899613126483)"
565 |       ]
566 |      },
567 |      "execution_count": 25,
568 |      "metadata": {},
569 |      "output_type": "execute_result"
570 |     }
571 |    ],
572 |    "source": [
573 |     "forward_algorithm_vec_check(emissions, transitions)"
574 |    ]
575 |   },
576 |   {
577 |    "cell_type": "code",
578 |    "execution_count": 26,
579 |    "metadata": {
580 |     "collapsed": true
581 |    },
582 |    "outputs": [],
583 |    "source": [
584 |     "def forward_algorithm(emissions, transitions):\n",
585 |     "    scores = emissions[0]\n",
586 |     "    # Get the log sum exp score\n",
587 |     "    for i in range(1, emissions.shape[0]):\n",
588 |     "        scores = emissions[i] + log_sum_exp(\n",
589 |     "            scores + transitions.T,\n",
590 |     "            axis=1)\n",
591 |     "    return log_sum_exp(scores)"
592 |    ]
593 |   },
594 |   {
595 |    "cell_type": "code",
596 |    "execution_count": 27,
597 |    "metadata": {},
598 |    "outputs": [
599 |     {
600 |      "data": {
601 |       "text/plain": [
602 |        "78.132899613126483"
603 |       ]
604 |      },
605 |      "execution_count": 27,
606 |      "metadata": {},
607 |      "output_type": "execute_result"
608 |     }
609 |    ],
610 |    "source": [
611 |     "forward_algorithm(emissions, transitions)"
612 |    ]
613 |   },
614 |   {
615 |    "cell_type": "code",
616 |    "execution_count": 28,
617 |    "metadata": {},
618 |    "outputs": [],
619 |    "source": [
620 |     "tt = torch.Tensor(emissions)\n",
621 |     "tt_max, _ = tt.max(1)"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "code",
626 |    "execution_count": 29,
627 |    "metadata": {},
628 |    "outputs": [
629 |     {
630 |      "data": {
631 |       "text/plain": [
632 |        "\n",
633 |        "  9   9\n",
634 |        " 13  13\n",
635 |        " 18  18\n",
636 |        " 15  15\n",
637 |        "[torch.FloatTensor of size 4x2]"
638 |       ]
639 |      },
640 |      "execution_count": 29,
641 |      "metadata": {},
642 |      "output_type": "execute_result"
643 |     }
644 |    ],
645 |    "source": [
646 |     "tt_max.expand_as(tt)"
647 |    ]
648 |   },
649 |   {
650 |    "cell_type": "code",
651 |    "execution_count": 30,
652 |    "metadata": {},
653 |    "outputs": [
654 |     {
655 |      "data": {
656 |       "text/plain": [
657 |        "\n",
658 |        " 33  49\n",
659 |        "[torch.FloatTensor of size 1x2]"
660 |       ]
661 |      },
662 |      "execution_count": 30,
663 |      "metadata": {},
664 |      "output_type": "execute_result"
665 |     }
666 |    ],
667 |    "source": [
668 |     "tt.sum(0)"
669 |    ]
670 |   },
671 |   {
672 |    "cell_type": "code",
673 |    "execution_count": 31,
674 |    "metadata": {},
675 |    "outputs": [
676 |     {
677 |      "data": {
678 |       "text/plain": [
679 |        "\n",
680 |        "  9   6\n",
681 |        " 13  10\n",
682 |        "  8  18\n",
683 |        "  3  15\n",
684 |        "[torch.FloatTensor of size 4x2]"
685 |       ]
686 |      },
687 |      "execution_count": 31,
688 |      "metadata": {},
689 |      "output_type": "execute_result"
690 |     }
691 |    ],
692 |    "source": [
693 |     "tt.squeeze(0)"
694 |    ]
695 |   },
696 |   {
697 |    "cell_type": "code",
698 |    "execution_count": 32,
699 |    "metadata": {},
700 |    "outputs": [
701 |     {
702 |      "data": {
703 |       "text/plain": [
704 |        "\n",
705 |        "  9  13   8   3\n",
706 |        "  6  10  18  15\n",
707 |        "[torch.FloatTensor of size 2x4]"
708 |       ]
709 |      },
710 |      "execution_count": 32,
711 |      "metadata": {},
712 |      "output_type": "execute_result"
713 |     }
714 |    ],
715 |    "source": [
716 |     "tt.transpose(-1,-2)"
717 |    ]
718 |   },
719 |   {
720 |    "cell_type": "code",
721 |    "execution_count": 33,
722 |    "metadata": {},
723 |    "outputs": [
724 |     {
725 |      "data": {
726 |       "text/plain": [
727 |        "2"
728 |       ]
729 |      },
730 |      "execution_count": 33,
731 |      "metadata": {},
732 |      "output_type": "execute_result"
733 |     }
734 |    ],
735 |    "source": [
736 |     "tt.ndimension()"
737 |    ]
738 |   },
739 |   {
740 |    "cell_type": "code",
741 |    "execution_count": 34,
742 |    "metadata": {
743 |     "collapsed": true
744 |    },
745 |    "outputs": [],
746 |    "source": [
747 |     "def log_sum_exp_torch(vecs, axis=None):\n",
748 |     "    ## Use help from: http://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html#sphx-glr-beginner-nlp-advanced-tutorial-py\n",
749 |     "    if axis < 0:\n",
750 |     "        axis = vecs.ndimension()+axis\n",
751 |     "    max_val, _ = vecs.max(axis)\n",
752 |     "    vecs = vecs - max_val.expand_as(vecs)\n",
753 |     "    out_val = torch.log(torch.exp(vecs).sum(axis))\n",
754 |     "    #print(max_val, out_val)\n",
755 |     "    return max_val + out_val"
756 |    ]
757 |   },
758 |   {
759 |    "cell_type": "code",
760 |    "execution_count": 35,
761 |    "metadata": {
762 |     "collapsed": true
763 |    },
764 |    "outputs": [],
765 |    "source": [
766 |     "def forward_algorithm_torch(emissions, transitions):\n",
767 |     "    scores = emissions[0]\n",
768 |     "    # Get the log sum exp score\n",
769 |     "    transitions = transitions.transpose(-1,-2)\n",
770 |     "    for i in range(1, emissions.size(0)):\n",
771 |     "        scores = emissions[i] + log_sum_exp_torch(\n",
772 |     "            scores.expand_as(transitions) + transitions,\n",
773 |     "            axis=1)\n",
774 |     "    return log_sum_exp_torch(scores, axis=-1)"
775 |    ]
776 |   },
777 |   {
778 |    "cell_type": "code",
779 |    "execution_count": 36,
780 |    "metadata": {},
781 |    "outputs": [
782 |     {
783 |      "data": {
784 |       "text/plain": [
785 |        "\n",
786 |        " 78.1329\n",
787 |        "[torch.FloatTensor of size 1]"
788 |       ]
789 |      },
790 |      "execution_count": 36,
791 |      "metadata": {},
792 |      "output_type": "execute_result"
793 |     }
794 |    ],
795 |    "source": [
796 |     "forward_algorithm_torch(torch.Tensor(emissions), torch.Tensor(transitions))"
797 |    ]
798 |   },
799 |   {
800 |    "cell_type": "markdown",
801 |    "metadata": {},
802 |    "source": [
803 |     "The core idea is to find the sequence of states $y = \\{y_0, y_1, ..., y_N\\}$ which has the highest probability given the input $X = \\{X_0, X_1, ..., X_N\\}$ as follows:\n",
804 |     "\n",
805 |     "$$\n",
806 |     "\\begin{equation}\n",
807 |     "p(y\\mid X) = \\prod_{i=0}^{N}{p(y_i\\mid X_i)p(y_i \\mid y_{i-1})}\\\\\n",
808 |     "\\log{p(y\\mid X)} = \\sum_{i=0}^{N}{\\log{p(y_i\\mid X_i)} + \\log{p(y_i \\mid y_{i-1})}}\\\\\n",
809 |     "\\end{equation}\n",
810 |     "$$\n",
811 |     "\n",
812 |     "Now $\\log{p(y_i\\mid X_i)}$ and $\\log{p(y_i \\mid y_{i-1})}$ can be parameterized as follows:\n",
813 |     "\n",
814 |     "$$\n",
815 |     "\\begin{equation}\n",
816 |     "\\log{p(y_i\\mid X_i)} = \\sum_{l=0}^{L}{\\sum_{k=0}^{K}{w_{k}^{l}*\\phi_{k}^{l}(X_i, y_i)}}\\\\\n",
817 |     "\\log{p(y_i\\mid y_{i-1})} = \\sum_{l=0}^{L}{\\sum_{l'=0}^{L}{w_{l'}^{l}*\\psi_{l'}^{l}(y_i, y_{i-1})}}\\\\\n",
818 |     "\\implies \\log{p(y\\mid X)} = \\sum_{i=0}^{N}{(\\sum_{l=0}^{L}{\\sum_{k=0}^{K}{w_{k}^{l}*\\phi_{k}^{l}(X_i, y_i)}}\n",
819 |     "+ \\sum_{l=0}^{L}{\\sum_{l'=0}^{L}{w_{l'}^{l}*\\psi_{l'}^{l}(y_i, y_{i-1})}})}\\\\\n",
820 |     "\\implies \\log{p(y\\mid X)} = \\sum_{i=0}^{N}{(\\Phi(X_i)W_{emission} + \\log{p(y_{i-1} \\mid X_{i-1})}W_{transition})}\n",
821 |     "\\end{equation}\n",
822 |     "$$\n",
823 |     "\n",
824 |     "Where, \n",
825 |     "\n",
826 |     "* $N$ is the sequence length\n",
827 |     "* $K$ is number of feature functions,\n",
828 |     "* $L$ is number of states\n",
829 |     "* $W_{emission}$ is $K*L$ matrix\n",
830 |     "* $W_{transition}$ is $L*L$ matrix\n",
831 |     "* $\\Phi(X_i)$ is a feature vector of shape $1*K$\n",
832 |     "* $(\\Phi(X_i)W_{emission} + \\log{p(y_{i-1} \\mid X_{i-1})}W_{transition})$ gives the score for each label\n",
833 |     "\n"
834 |    ]
835 |   },
836 |   {
837 |    "cell_type": "code",
838 |    "execution_count": null,
839 |    "metadata": {
840 |     "collapsed": true
841 |    },
842 |    "outputs": [],
843 |    "source": []
844 |   }
845 |  ],
846 |  "metadata": {
847 |   "kernelspec": {
848 |    "display_name": "Python [default]",
849 |    "language": "python",
850 |    "name": "python3"
851 |   },
852 |   "language_info": {
853 |    "codemirror_mode": {
854 |     "name": "ipython",
855 |     "version": 3
856 |    },
857 |    "file_extension": ".py",
858 |    "mimetype": "text/x-python",
859 |    "name": "python",
860 |    "nbconvert_exporter": "python",
861 |    "pygments_lexer": "ipython3",
862 |    "version": "3.5.2"
863 |   }
864 |  },
865 |  "nbformat": 4,
866 |  "nbformat_minor": 2
867 | }
868 | 


--------------------------------------------------------------------------------
/chunking_bilstm_crf_char_concat.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # coding: utf-8
  3 | 
  4 | # In[1]:
  5 | 
  6 | import matplotlib
  7 | matplotlib.use("Agg")
  8 | import torch
  9 | from torch.autograd import Variable
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | import torch.optim as optim
 13 | 
 14 | torch.manual_seed(1)
 15 | 
 16 | import numpy as np
 17 | 
 18 | import matplotlib.pyplot as plt
 19 | import seaborn as sns
 20 | 
 21 | from pytorch_utils import *
 22 | from pytorch_models import *
 23 | from utils import load_sequences, conll_classification_report_to_df
 24 | from conlleval import main as conll_eval
 25 | import re
 26 | 
 27 | sns.set_context("poster")
 28 | sns.set_style("ticks")
 29 | 
 30 | 
 31 | # In[2]:
 32 | 
 33 | TRAIN_CORPUS="data/conll2000/train.txt"
 34 | TEST_CORPUS="data/conll2000/test.txt"
 35 | 
 36 | 
 37 | # In[3]:
 38 | 
 39 | train_corpus = load_sequences(TRAIN_CORPUS, sep=" ", col_ids=(0, -1))
 40 | train_corpus, dev_corpus = train_corpus[100:], train_corpus[:100]
 41 | print("Total items in train corpus: %s" % len(train_corpus))
 42 | print("Total items in dev corpus: %s" % len(dev_corpus))
 43 | test_corpus = load_sequences(TEST_CORPUS, sep=" ", col_ids=(0, -1))
 44 | print("Total items in test corpus: %s" % len(test_corpus))
 45 | 
 46 | 
 47 | # In[5]:
 48 | 
 49 | def create_vocab(data, vocabs, char_vocab, word_idx=0):
 50 |     n_vocabs = len(vocabs)
 51 |     for sent in data:
 52 |         for token_tags in sent:
 53 |             for vocab_id in range(n_vocabs):
 54 |                 vocabs[vocab_id].add(token_tags[vocab_id])
 55 |             char_vocab.batch_add(token_tags[word_idx])
 56 |     print("Created vocabs: %s, chars[%s]" % (", ".join(
 57 |         "{}[{}]".format(vocab.name, vocab.size)
 58 |         for vocab in vocabs
 59 |     ), char_vocab.size))
 60 | 
 61 | 
 62 | # In[6]:
 63 | 
 64 | word_vocab = Vocab("words", UNK="UNK", lower=True)
 65 | char_vocab = Vocab("chars", UNK="<U>", lower=False)
 66 | chunk_vocab = Vocab("chunk_tags", lower=False)
 67 | 
 68 | create_vocab(train_corpus+dev_corpus+test_corpus, [word_vocab, chunk_vocab], char_vocab)
 69 | 
 70 | 
 71 | # In[7]:
 72 | 
 73 | def data2tensors(data, vocabs, char_vocab, word_idx=0, column_ids=(0, -1)):
 74 |     vocabs = [vocabs[idx] for idx in column_ids]
 75 |     n_vocabs = len(vocabs)
 76 |     tensors = []
 77 |     char_tensors = []
 78 |     for sent in data:
 79 |         sent_vecs = [[] for i in range(n_vocabs+1)] # Last is for char vecs
 80 |         char_vecs = []
 81 |         for token_tags in sent:
 82 |             vocab_id = 0 # First column is the word
 83 |             # lowercase the word
 84 |             sent_vecs[vocab_id].append(
 85 |                     vocabs[vocab_id].getidx(token_tags[vocab_id].lower())
 86 |                 )
 87 |             for vocab_id in range(1, n_vocabs):
 88 |                 sent_vecs[vocab_id].append(
 89 |                     vocabs[vocab_id].getidx(token_tags[vocab_id])
 90 |                 )
 91 |             sent_vecs[-1].append(
 92 |                 [char_vocab.getidx(c) for c in token_tags[word_idx]]
 93 |             )
 94 |         tensors.append(sent_vecs)
 95 |     return tensors
 96 | 
 97 | 
 98 | # In[8]:
 99 | 
# Index every split with the same vocabularies, in train/dev/test order.
train_tensors, dev_tensors, test_tensors = (
    data2tensors(split, [word_vocab, chunk_vocab], char_vocab)
    for split in (train_corpus, dev_corpus, test_corpus)
)
print("Train: {}, Dev: {}, Test: {}".format(
    *map(len, (train_tensors, dev_tensors, test_tensors))
))


# In[9]:

# NOTE(review): absolute, machine-specific path to the embedding file.
embedding_file = "/home/napsternxg/datadrive/Downloads/Glove/glove.6B.100d.txt"
cache_file = "conll2000.glove.100.npy"
ndims = 100
pretrained_embeddings = load_word_vectors(
    embedding_file, ndims, word_vocab, cache_file
)
116 | 
117 | 
118 | # In[10]:
119 | 
def plot_losses(train_losses, eval_losses=None, plot_std=False, ax=None):
    """Plot mean training/evaluation loss curves, optionally with a std band.

    Args:
        train_losses: iterable of ``(mean, std)`` pairs, one per point on the
            x-axis (labelled "Epochs").
        eval_losses: optional iterable of ``(mean, std)`` pairs; the curve is
            skipped when this is ``None`` or empty.
        plot_std: when true, shade the region between ``mean - std`` and
            ``mean + std`` for every plotted curve.
        ax: axes-like object to draw on; ``plt.gca()`` is used when omitted.
    """
    if ax is None:
        ax = plt.gca()
    series = (
        (train_losses, "0.5", "Train"),
        (eval_losses, "r", "Eval"),
    )
    for losses, color, label in series:
        # eval_losses defaults to None and zip(*losses) cannot be unpacked
        # for an empty history, so skip any series with nothing to draw.
        if losses is None:
            continue
        losses = list(losses)
        if not losses:
            continue
        mean_loss, std_loss = zip(*losses)
        mean_loss = np.array(mean_loss)
        std_loss = np.array(std_loss)
        ax.plot(
            mean_loss, color=color, label=label,
            linestyle="-",
        )
        if plot_std:
            ax.fill_between(
                np.arange(mean_loss.shape[0]),
                mean_loss - std_loss,
                mean_loss + std_loss,
                color=color,
                alpha=0.3
            )
    ax.set_xlabel("Epochs")
    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r"Mean Loss ($\pm$ S.D.)")
145 |     
146 |     
def print_predictions(corpus, predictions, filename, label_vocab):
    """Write tokens with their true and predicted labels to ``filename``.

    Each token becomes one tab-separated row ``token<TAB>true<TAB>predicted``
    and every sequence is followed by an empty line.

    Args:
        corpus: iterable of sequences of ``(token, true_label)`` pairs.
        predictions: iterable of sequences of label indices, aligned with
            ``corpus``.
        filename: path of the file to (over)write.
        label_vocab: object whose ``idx2item`` maps a label index to its label.
    """
    row_template = "{}\t{}\t{}"
    with open(filename, "w+") as out:
        for sequence, predicted_ids in zip(corpus, predictions):
            for (token, true_label), label_id in zip(sequence, predicted_ids):
                predicted = label_vocab.idx2item[label_id]
                out.write(row_template.format(token, true_label, predicted) + "\n")
            out.write("\n")  # blank line closes the sequence
154 | 
155 | 
156 | # In[11]:
157 | 
158 | # ## Class based
159 | 
160 | # In[19]:
161 | 
class BiLSTMTaggerWordCRFModel(ModelWrapper):
    """ModelWrapper around a tagger that consumes word and character inputs.

    The loss is obtained from ``model.loss`` and predictions from
    ``model.crf.forward``; no separate loss function is used.
    """

    def __init__(self, model,
                 loss_function,
                 use_cuda=False):
        """Store the model and, when ``use_cuda`` is true, move it to the GPU.

        Args:
            model: object exposing ``loss``, ``forward``, ``crf``, ``cuda``
                and ``zero_grad``.
            loss_function: accepted for signature compatibility but not
                stored; ``self.loss_function`` is always ``None``.
            use_cuda: whether the model and inputs are moved to CUDA.
        """
        self.model = model
        self.loss_function = None

        self.use_cuda = use_cuda
        if self.use_cuda:
            self.model.cuda()

    @staticmethod
    def _to_cuda(tensor):
        """Return ``tensor`` on the GPU, requesting an asynchronous copy.

        ``async`` is a reserved word from Python 3.7 on, so it can no longer
        be written as a keyword argument. Newer PyTorch releases call the flag
        ``non_blocking``; older ones only know the legacy name, which is
        passed through a dict so this file still parses.
        """
        try:
            return tensor.cuda(non_blocking=True)
        except TypeError:
            return tensor.cuda(**{"async": True})

    def _process_instance_tensors(self, instance_tensors, volatile=False):
        """Turn one raw ``(X, Y, X_char)`` instance into model inputs.

        Returns ``(X, X_char, Y)`` -- note the changed order. ``X`` is wrapped
        in an extra leading dimension as a ``LongTensor`` Variable, ``Y``
        becomes a plain ``LongTensor`` and ``X_char`` is converted by
        ``charseq2varlist``.
        """
        X, Y, X_char = instance_tensors
        X = Variable(torch.LongTensor([X]), requires_grad=False, volatile=volatile)
        Y = torch.LongTensor(Y)
        X_char = charseq2varlist(X_char, volatile=volatile)
        return X, X_char, Y

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return ``model.loss`` for one already-processed instance.

        Args:
            instance_tensors: ``(X, X_char, Y)`` in the order produced by
                ``_process_instance_tensors``.
            zero_grad: clear the model gradients before computing the loss.
        """
        if zero_grad:
            ## Clear gradients before every update else memory runs out
            self.model.zero_grad()
        X, X_char, Y = instance_tensors
        if self.use_cuda:
            X = self._to_cuda(X)
            Y = self._to_cuda(Y)
            X_char = [self._to_cuda(t) for t in X_char]
        return self.model.loss(X, X_char, Y)

    def predict(self, instance_tensors):
        """Run the model on one raw ``(X, Y, X_char)`` instance.

        Returns element ``[1]`` of ``model.crf.forward(emissions)``
        (presumably the decoded tag sequence -- confirm against the CRF layer).
        """
        X, X_char, Y = self._process_instance_tensors(instance_tensors, volatile=True)
        if self.use_cuda:
            X = self._to_cuda(X)
            Y = self._to_cuda(Y)
            X_char = [self._to_cuda(t) for t in X_char]
        emissions = self.model.forward(X, X_char)
        return self.model.crf.forward(emissions)[1]
201 | 
202 | 
# Use the GPU only when one is present; a hard-coded True fails on hosts
# without CUDA as soon as the model is moved to the device.
use_cuda = torch.cuda.is_available()
hidden_size = 128
batch_size = 64

# Character-level embedding settings.
char_emb_size = 50
output_channels = 25
kernel_sizes = [2, 3]

# Word-level embedding settings.
word_emb_size = 100
# Get this using char embedding and word embed
# (presumably word_emb_size + output_channels * len(kernel_sizes) -- confirm
# against WordCharEmbedding before changing any of the sizes above).
n_embed = 150
char_embed_kwargs = dict(
    vocab_size=char_vocab.size,
    embedding_size=char_emb_size,
    out_channels=output_channels,
    kernel_sizes=kernel_sizes
)

word_char_embedding = WordCharEmbedding(
        word_vocab.size, word_emb_size,
        char_embed_kwargs, dropout=0, concat=True)
# Assign glove embeddings
assign_embeddings(word_char_embedding.word_embeddings, pretrained_embeddings, fix_embedding=True)

# The wrapper ignores its loss_function argument, hence the explicit None.
model_wrapper = BiLSTMTaggerWordCRFModel(
    LSTMTaggerWordCharCRF(word_char_embedding, n_embed, hidden_size, chunk_vocab.size),
    None, use_cuda=use_cuda)
229 | 
230 | 
231 | # In[33]:
# Prefix used for the output file names derived from this run.
model_prefix = "BiLSTMCharConcatCRF_CONLL2000"
n_epochs = 50
training_history = training_wrapper(
    model_wrapper,
    train_tensors,
    eval_tensors=dev_tensors,
    optimizer=optim.Adam,
    # Only weight_decay is passed; no "lr" entry is supplied.
    optimizer_kwargs={"weight_decay": 0},
    n_epochs=n_epochs,
    batch_size=batch_size,
    use_cuda=use_cuda,
    log_file="{}.log".format(model_prefix),
)
model_wrapper.save("{}.pth".format(model_prefix))
248 | 
249 | 
250 | # In[34]:
251 | 
# Loss curves for the finished run, saved next to the other run outputs.
fig, ax = plt.subplots(1, 1)
plot_losses(
    training_history["training_loss"],
    training_history["evaluation_loss"],
    plot_std=True,
    ax=ax,
)
ax.legend()
sns.despine(offset=5)
plt.savefig("{}.pdf".format(model_prefix))

# For every split: predict, dump token/true/predicted rows, then score the
# dump with conlleval.
for title, tensors, corpus in (
    ("train", train_tensors, train_corpus),
    ("dev", dev_tensors, dev_corpus),
    ("test", test_tensors, test_corpus),
):
    predictions = model_wrapper.predict_batch(tensors, title=title)
    print_predictions(corpus, predictions, "%s.chunking.conll" % title, chunk_vocab)
    conll_eval(["conlleval", "%s.chunking.conll" % title])
269 | 
270 | 
271 | 


--------------------------------------------------------------------------------
/conll2000.glove.100.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/napsternxg/pytorch-practice/a48c6aae19c57458e94492e637a38363154f3374/conll2000.glove.100.npy


--------------------------------------------------------------------------------
/conlleval.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | ## Original script taken from https://github.com/spyysalo/conlleval.py
  4 | ## Modifications made by Shubhanshu Mishra to support notypes argument and functional api
  5 | 
  6 | # Python version of the evaluation script from CoNLL'00-
  7 | 
  8 | # Intentional differences:
  9 | # - accept any space as delimiter by default
 10 | # - optional file argument (default STDIN)
 11 | # - option to set boundary (-b argument)
 12 | # - LaTeX output (-l argument) not supported
 13 | # - raw tags (-r argument) not supported
 14 | 
 15 | import sys
 16 | import re
 17 | 
 18 | from collections import defaultdict, namedtuple
 19 | 
# Sentinel used as the default --delimiter: when the delimiter equals this
# value, evaluate() splits each line on any run of whitespace.
ANY_SPACE = '<SPACE>'
 21 | 
 22 | class FormatError(Exception):
 23 |     pass
 24 | 
# Result record built by calculate_metrics(): true positives, false positives,
# false negatives, precision, recall and F-score.
Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
 26 | 
 27 | class EvalCounts(object):
 28 |     def __init__(self):
 29 |         self.correct_chunk = 0    # number of correctly identified chunks
 30 |         self.correct_tags = 0     # number of correct chunk tags
 31 |         self.found_correct = 0    # number of chunks in corpus
 32 |         self.found_guessed = 0    # number of identified chunks
 33 |         self.token_counter = 0    # token counter (ignores sentence breaks)
 34 | 
 35 |         # counts by type
 36 |         self.t_correct_chunk = defaultdict(int)
 37 |         self.t_found_correct = defaultdict(int)
 38 |         self.t_found_guessed = defaultdict(int)
 39 | 
 40 | def parse_args(argv):
 41 |     import argparse
 42 |     parser = argparse.ArgumentParser(
 43 |         description='evaluate tagging results using CoNLL criteria',
 44 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter
 45 |     )
 46 |     arg = parser.add_argument
 47 |     arg('-b', '--boundary', metavar='STR', default='-X-',
 48 |         help='sentence boundary')
 49 |     arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
 50 |         help='character delimiting items in input')
 51 |     arg('-o', '--otag', metavar='CHAR', default='O',
 52 |         help='alternative outside tag')
 53 |     arg('-t', '--no-types', action='store_const', const=True, default=False,
 54 |         help='evaluate without entity types')
 55 |     arg('file', nargs='?', default=None)
 56 |     arg('--outstream', default=None,
 57 |         help='output file for storing report')
 58 |     return parser.parse_args(argv)
 59 | 
 60 | def parse_tag(t):
 61 |     m = re.match(r'^([^-]*)-(.*)$', t)
 62 |     return m.groups() if m else (t, '')
 63 | 
 64 | def evaluate(iterable, options=None):
 65 |     if options is None:
 66 |         options = parse_args([])    # use defaults
 67 |     counts = EvalCounts()
 68 |     num_features = None       # number of features per line
 69 |     in_correct = False        # currently processed chunks is correct until now
 70 |     last_correct = 'O'        # previous chunk tag in corpus
 71 |     last_correct_type = ''    # type of previously identified chunk tag
 72 |     last_guessed = 'O'        # previously identified chunk tag
 73 |     last_guessed_type = ''    # type of previous chunk tag in corpus
 74 |     new_sent=True
 75 | 
 76 |     for line in iterable:
 77 |         line = line.rstrip('\r\n')
 78 | 
 79 |         if options.delimiter == ANY_SPACE:
 80 |             features = line.split()
 81 |         else:
 82 |             features = line.split(options.delimiter)[-2:]
 83 | 
 84 |         if num_features is None:
 85 |             num_features = len(features)
 86 |         elif num_features != len(features) and len(features) != 0:
 87 |             raise FormatError('unexpected number of features: %d (%d)' %
 88 |                               (len(features), num_features))
 89 | 
 90 |         if len(features) == 0 or features[0] == options.boundary:
 91 |             features = ['O', 'O']
 92 |             new_sent=True
 93 |         else:
 94 |             new_sent=False
 95 |         if len(features) < 2:
 96 |             raise FormatError('unexpected number of features in line %s' % line)
 97 | 
 98 |         guessed, guessed_type = parse_tag(features.pop())
 99 |         correct, correct_type = parse_tag(features.pop())
100 |         if options.no_types:
101 |             guessed_type = ''
102 |             correct_type = ''
103 | 
104 |         if new_sent:
105 |             guessed = 'O'
106 | 
107 |         end_correct = end_of_chunk(last_correct, correct,
108 |                                    last_correct_type, correct_type)
109 |         end_guessed = end_of_chunk(last_guessed, guessed,
110 |                                    last_guessed_type, guessed_type)
111 |         start_correct = start_of_chunk(last_correct, correct,
112 |                                        last_correct_type, correct_type)
113 |         start_guessed = start_of_chunk(last_guessed, guessed,
114 |                                        last_guessed_type, guessed_type)
115 | 
116 |         if in_correct:
117 |             if (end_correct and end_guessed and
118 |                 last_guessed_type == last_correct_type):
119 |                 in_correct = False
120 |                 counts.correct_chunk += 1
121 |                 counts.t_correct_chunk[last_correct_type] += 1
122 |             elif (end_correct != end_guessed or guessed_type != correct_type):
123 |                 in_correct = False
124 | 
125 |         if start_correct and start_guessed and guessed_type == correct_type:
126 |             in_correct = True
127 | 
128 |         if start_correct:
129 |             counts.found_correct += 1
130 |             counts.t_found_correct[correct_type] += 1
131 |         if start_guessed:
132 |             counts.found_guessed += 1
133 |             counts.t_found_guessed[guessed_type] += 1
134 |         if not new_sent:
135 |             if correct == guessed and guessed_type == correct_type:
136 |                 counts.correct_tags += 1
137 |             counts.token_counter += 1
138 | 
139 |         last_guessed = guessed
140 |         last_correct = correct
141 |         last_guessed_type = guessed_type
142 |         last_correct_type = correct_type
143 | 
144 |     if in_correct:
145 |         counts.correct_chunk += 1
146 |         counts.t_correct_chunk[last_correct_type] += 1
147 | 
148 |     return counts
149 | 
def uniq(iterable):
    """Return the items of *iterable* as a list with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    already_seen = set()
    ordered = []
    for item in iterable:
        if item in already_seen:
            continue
        already_seen.add(item)
        ordered.append(item)
    return ordered
153 | 
def calculate_metrics(correct, guessed, total):
    """Build a ``Metrics`` record from raw chunk counts.

    Args:
        correct: number of correctly identified chunks (true positives).
        guessed: number of chunks the tagger produced.
        total: number of chunks in the reference annotation.

    Returns:
        ``Metrics(tp, fp, fn, prec, rec, fscore)``; precision, recall and
        F-score are 0 whenever their denominator is 0.
    """
    true_pos = correct
    false_pos = guessed - correct
    false_neg = total - correct

    predicted = true_pos + false_pos
    precision = 1.*true_pos / predicted if predicted != 0 else 0

    actual = true_pos + false_neg
    recall = 1.*true_pos / actual if actual != 0 else 0

    if precision + recall == 0:
        fscore = 0
    else:
        fscore = 2 * precision * recall / (precision + recall)
    return Metrics(true_pos, false_pos, false_neg, precision, recall, fscore)
160 | 
def metrics(counts):
    """Turn an ``EvalCounts`` object into overall and per-type ``Metrics``.

    Returns:
        ``(overall, by_type)`` where ``by_type`` maps every chunk type seen
        in either the reference or the guessed chunks to its ``Metrics``.
    """
    overall = calculate_metrics(
        counts.correct_chunk, counts.found_guessed, counts.found_correct
    )
    # Types from the reference come first, then types only the tagger produced.
    reference_types = list(counts.t_found_correct.keys())
    guessed_types = list(counts.t_found_guessed.keys())
    by_type = {}
    for chunk_type in uniq(reference_types + guessed_types):
        by_type[chunk_type] = calculate_metrics(
            counts.t_correct_chunk[chunk_type],
            counts.t_found_guessed[chunk_type],
            counts.t_found_correct[chunk_type],
        )
    return overall, by_type
172 | 
def report(counts, out=None):
    """Write the conlleval text report for *counts*.

    Args:
        counts: ``EvalCounts`` produced by ``evaluate``.
        out: object with a ``write`` method; ``sys.stdout`` when None.
    """
    stream = sys.stdout if out is None else out

    overall, by_type = metrics(counts)
    emit = stream.write

    emit('processed %d tokens with %d phrases; ' %
         (counts.token_counter, counts.found_correct))
    emit('found: %d phrases; correct: %d.\n' %
         (counts.found_guessed, counts.correct_chunk))

    # The overall line is omitted when no tokens were processed.
    if counts.token_counter > 0:
        token_accuracy = 100.*counts.correct_tags/counts.token_counter
        emit('accuracy: %6.2f%%; ' % token_accuracy)
        emit('precision: %6.2f%%; ' % (100.*overall.prec))
        emit('recall: %6.2f%%; ' % (100.*overall.rec))
        emit('FB1: %6.2f\n' % (100.*overall.fscore))

    # One line per chunk type, sorted by type name.
    for type_name, type_metrics in sorted(by_type.items()):
        emit('%17s: ' % type_name)
        emit('precision: %6.2f%%; ' % (100.*type_metrics.prec))
        emit('recall: %6.2f%%; ' % (100.*type_metrics.rec))
        emit('FB1: %6.2f  %d\n' %
             (100.*type_metrics.fscore, counts.t_found_guessed[type_name]))
197 | 
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk ended between the previous and the current word.

    Arguments are the previous and current chunk tags followed by the
    previous and current chunk types. Returns a bool.
    """
    # 'E' and 'U' always close a chunk; bracket tags are assumed to mark
    # chunks of length 1.
    if prev_tag in ('E', 'U', ']', '['):
        return True

    # A 'B' or 'I' chunk is closed by a following 'B', 'U' or 'O'.
    if prev_tag in ('B', 'I') and tag in ('B', 'U', 'O'):
        return True

    # Otherwise a chunk ends only when the type changes while inside one.
    return prev_tag not in ('O', '.') and prev_type != type_
221 | 
def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk started between the previous and the current word.

    Arguments are the previous and current chunk tags followed by the
    previous and current chunk types. Returns a bool.
    """
    # 'B' and 'U' always open a chunk; bracket tags are assumed to mark
    # chunks of length 1.
    if tag in ('B', 'U', '[', ']'):
        return True

    # An 'E' or 'I' opens a chunk when no chunk can still be open before it.
    if prev_tag in ('E', 'U', 'O') and tag in ('E', 'I'):
        return True

    # Otherwise a chunk starts only when the type changes on a chunk tag.
    return tag not in ('O', '.') and prev_type != type_
245 | 
def evaluate_from_file(filename, argv, outstream=None):
    """Evaluate the tagged file *filename* and write the report.

    Args:
        filename: path of the file to evaluate.
        argv: argv-style list; element 0 (the program name) is skipped and
            the rest is parsed as command-line options.
        outstream: object with a ``write`` method for the report; passed
            straight to ``report``.
    """
    options = parse_args(argv[1:])
    with open(filename) as tagged_file:
        tallies = evaluate(tagged_file, options)
    report(tallies, outstream)
251 |     
def main(argv, outstream=None):
    """Command-line entry point: evaluate a file (or stdin) and print the report.

    Args:
        argv: argv-style list; element 0 (the program name) is skipped.
        outstream: optional object with a ``write`` method. When given it
            takes precedence over the ``--outstream`` command-line option.
    """
    args = parse_args(argv[1:])

    if args.file is None:
        counts = evaluate(sys.stdin, args)
    else:
        with open(args.file) as f:
            counts = evaluate(f, args)
    if outstream is not None:
        args.outstream = outstream

    destination = args.outstream
    if isinstance(destination, str):
        # --outstream arrives from the command line as a path, but report()
        # needs something with a write() method, so open the file here.
        with open(destination, 'w') as report_file:
            report(counts, report_file)
    else:
        report(counts, destination)
263 | 
# Run as a script: main() returns None, so the process exits with status 0
# unless an exception propagates.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
266 | 


--------------------------------------------------------------------------------
/data/conll2000/get_data.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Download the CoNLL-2000 chunking train/test files into the current
# directory and decompress them to train.txt and test.txt.

# Stop at the first failing command so a failed download is not followed by
# gunzip errors on missing files.
set -e

wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
# Name the archives explicitly instead of relying on gunzip's suffix guessing.
gunzip train.txt.gz
gunzip test.txt.gz
5 | 


--------------------------------------------------------------------------------
/pytorch_models.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch.autograd import Variable
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | import torch.optim as optim
  6 | 
  7 | import numpy as np
  8 | from tqdm import tqdm
  9 | 
 10 | 
 11 | def to_scalar(var):
 12 |     # returns a python float
 13 |     return var.view(-1).data.tolist()[0]
 14 | 
 15 | 
 16 | def argmax(vec):
 17 |     # return the argmax as a python int
 18 |     _, idx = torch.max(vec, 1)
 19 |     return to_scalar(idx)
 20 | 
 21 | 
 22 | def log_sum_exp_torch(vecs, axis=None):
 23 |     ## Use help from: http://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html#sphx-glr-beginner-nlp-advanced-tutorial-py
 24 |     if axis < 0:
 25 |         axis = vecs.ndimension()+axis
 26 |     max_val, _ = vecs.max(axis)
 27 |     vecs = vecs - max_val.expand_as(vecs)
 28 |     out_val = torch.log(torch.exp(vecs).sum(axis))
 29 |     #print(max_val, out_val)
 30 |     return max_val + out_val
 31 | 
 32 | 
 33 | def charseq2varlist(X_chars, volatile=False):
 34 |     return [Variable(torch.LongTensor([x]).pin_memory(), requires_grad=False, volatile=volatile) for x in X_chars]
 35 | 
 36 | 
 37 | def assign_embeddings(embedding_module, pretrained_embeddings, fix_embedding=False):
 38 |     embedding_module.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
 39 |     if fix_embedding:
 40 |         embedding_module.weight.requires_grad = False
 41 | 
 42 | 
 43 | class ModelWrapper(object):
 44 |     def __init__(self, model,
 45 |                  loss_function,
 46 |                  use_cuda=False
 47 |                 ):
 48 |         self.model = model
 49 |         self.loss_function = loss_function
 50 | 
 51 |         self.use_cuda = use_cuda
 52 |         if self.use_cuda:
 53 |             self.model.cuda()
 54 |             
 55 |     def batch_process_tensors(self, data_tensors):
 56 |         for instance_tensors in data_tensors:
 57 |             yield self._process_instance_tensors(instance_tensors)
 58 |         
 59 |     def _process_instance_tensors(self, instance_tensors, volatile=False):
 60 |         raise NotImplementedError("Please define this function explicitly")
 61 |         
 62 |     def zero_grad(self):
 63 |         self.model.zero_grad()
 64 | 
 65 |     def post_backward(self):
 66 |         ## Implement things like grad clipping or grad norm
 67 |         pass
 68 |         
 69 |     def get_parameters(self):
 70 |         return self.model.paramerters()
 71 |     
 72 |     def set_model_mode(self, training_mode=True):
 73 |         if training_mode:
 74 |             self.model.train()
 75 |         else:
 76 |             self.model.eval()
 77 |             
 78 |     def save(self, filename, verbose=True):
 79 |         torch.save(self.model, filename)
 80 |         if verbose:
 81 |             print("{} model saved to {}".format(self.model.__class__, filename))
 82 |         
 83 |     def load(self, filename):
 84 |         self.model = torch.load(filename)
 85 |         if self.use_cuda:
 86 |             self.model.cuda()
 87 | 
 88 |     def get_instance_loss(self, instance_tensors, zero_grad=True):
 89 |         if zero_grad:
 90 |         ## Clear gradients before every update else memory runs out
 91 |             self.zero_grad()
 92 |         raise NotImplementedError("Please define this function explicitly")
 93 |         
 94 |     def predict(self, instance_tensors):
 95 |         raise NotImplementedError("Please define this function explicitly")
 96 |         
 97 |     def predict_batch(self, batch_tensors, title="train"):
 98 |         self.model.eval() # Set model to eval mode
 99 |         predictions = []
100 |         for instance_tensors in tqdm(batch_tensors,
101 |                 desc="%s predict" % title, unit="instance"):
102 |             predictions.append(self.predict(instance_tensors))
103 |         return predictions
104 |         
105 |         
def get_epoch_function(model_wrapper, optimizer,
                       use_cuda=False):
    """Build the function that runs one pass over a dataset.

    Args:
        model_wrapper: object providing ``set_model_mode``,
            ``get_instance_loss`` and ``post_backward``.
        optimizer: object providing ``zero_grad`` and ``step``.
        use_cuda: accepted for interface compatibility; not used here.

    Returns:
        ``perform_epoch(data_tensors, training_mode=True, batch_size=1,
        pbar=None)``, which returns the list of per-batch mean losses as
        Python floats.
    """
    def perform_epoch(data_tensors, training_mode=True, batch_size=1, pbar=None):
        model_wrapper.set_model_mode(training_mode)
        step_losses = []
        len_data_tensors = len(data_tensors)
        # Visit the instances in a fresh random order on every call.
        data_tensor_idxs = np.random.permutation(np.arange(len_data_tensors, dtype="int"))
        # At least one split: array_split rejects 0 sections, which happened
        # whenever batch_size exceeded the number of instances.
        n_splits = max(1, len_data_tensors // batch_size)
        for batch_tensors_idxs in np.array_split(data_tensor_idxs, n_splits):
            if batch_tensors_idxs.shape[0] == 0:
                # Only reachable with an empty dataset; nothing to average.
                continue
            optimizer.zero_grad()
            losses = []
            for instance_tensors_idx in batch_tensors_idxs:
                instance_tensors = data_tensors[instance_tensors_idx]
                loss = model_wrapper.get_instance_loss(instance_tensors, zero_grad=False)
                losses.append(loss)
                if pbar is not None:
                    pbar.update(1)
            # Flatten each loss so both 0-dim and 1-element losses concatenate.
            loss = torch.mean(torch.cat([item.view(-1) for item in losses]))
            # Extract a Python float whether the mean is 0-dim or 1-element.
            step_losses.append(loss.data.view(-1).tolist()[0])
            if training_mode:
                ## Get gradients of model params wrt. loss
                loss.backward()
                ## Model grad specific steps like clipping or norm
                model_wrapper.post_backward()
                ## Optimize the loss by one step
                optimizer.step()
        return step_losses
    return perform_epoch
138 | 
def write_losses(losses, fp, title="train", epoch=0):
    """Log per-batch losses and their summary statistics to *fp*.

    One line is printed per batch, followed by an "overall" line holding the
    mean and standard deviation of *losses*.
    """
    batch_template = "{:<10} epoch={:<3} batch={:<5} loss={:<10}"
    summary_template = "{:<10} epoch={:<3} {:<11} mean={:<10.3f} std={:<10.3f}"

    batch_index = 0
    for batch_loss in losses:
        print(batch_template.format(title, epoch, batch_index, batch_loss), file=fp)
        batch_index += 1

    mean_loss = np.mean(losses)
    std_loss = np.std(losses)
    print(summary_template.format(title, epoch, "overall", mean_loss, std_loss), file=fp)
147 | 
148 | 
def training_wrapper(
    model_wrapper, data_tensors,
    eval_tensors=None,
    optimizer=optim.SGD,
    optimizer_kwargs=None,
    n_epochs=10,
    batch_size=1,
    use_cuda=False,
    log_file="training_output.log",
    early_stopping=None,
    save_best=False,
    save_path="best_model.pth",
    reduce_lr_every=5,
    lr_reduce_factor=0.5
):
    """Train the wrapped model and return the per-epoch loss history.

    Args:
        model_wrapper: wrapper exposing ``model``, ``batch_process_tensors``,
            ``save`` and the hooks used by ``get_epoch_function``.
        data_tensors: raw training instances; they are converted once with
            ``model_wrapper.batch_process_tensors``.
        eval_tensors: optional raw evaluation instances, evaluated after
            every epoch (always with batch size 1).
        optimizer: optimizer class, instantiated on the parameters that
            require a gradient.
        optimizer_kwargs: keyword arguments for the optimizer class.
        n_epochs: maximum number of epochs.
        batch_size: number of training instances per optimizer step.
        use_cuda: forwarded to ``get_epoch_function``.
        log_file: path of the per-batch loss log (overwritten).
        early_stopping: None, or a float; training stops once the absolute
            change in mean evaluation loss between two consecutive epochs
            falls below it (checked from the third epoch on).
        save_best: when true, save the model to ``save_path`` whenever the
            mean evaluation loss improves.
        save_path: destination used by ``save_best``.
        reduce_lr_every: multiply every param group's learning rate by
            ``lr_reduce_factor`` after each this many epochs; values <= 0
            disable the schedule.
        lr_reduce_factor: learning-rate multiplier; values <= 0 disable the
            schedule.

    Returns:
        ``{"training_loss": [...], "evaluation_loss": [...]}`` where each
        list holds one ``(mean, std)`` tuple per completed epoch.

    Raises:
        ValueError: if ``early_stopping`` or ``save_best`` is requested
            without ``eval_tensors``.
    """
    if optimizer_kwargs is None:
        optimizer_kwargs = {}
    # Both features read the evaluation loss; fail before any training
    # instead of with an IndexError after the first epochs.
    needs_eval = early_stopping is not None or (save_best and save_path is not None)
    if needs_eval and eval_tensors is None:
        raise ValueError("early_stopping and save_best require eval_tensors")
    # Filter out parameters which don't require a gradient
    parameters = filter(lambda p: p.requires_grad, model_wrapper.model.parameters())
    optimizer=optimizer(parameters, **optimizer_kwargs)
    # Start training
    losses = []
    eval_losses = []
    ## Convert data tensors to torch tensors
    data_tensors = list(
        tqdm(
            model_wrapper.batch_process_tensors(data_tensors),
            total=len(data_tensors),
            desc="Proc. train tensors",
            #leave=False,
        )
    )
    if eval_tensors is not None:
        eval_tensors = list(
            tqdm(
                model_wrapper.batch_process_tensors(eval_tensors),
                total=len(eval_tensors),
                desc="Proc. eval tensors",
                #leave=False,
            )
        )
    perform_epoch = get_epoch_function(
        model_wrapper,
        optimizer,
        use_cuda=use_cuda)
    best_eval_loss = None
    best_epoch = None
    with open(log_file, "w+") as fp:
        with tqdm(total=n_epochs, desc="Epochs", unit="epochs") as epoch_progress_bar:
            for epoch in range(n_epochs):
                with tqdm(
                    total=len(data_tensors),
                    desc="Train", unit="instance", leave=False
                    ) as train_progress_bar:
                    step_losses = perform_epoch(data_tensors, batch_size=batch_size, pbar=train_progress_bar)
                    mean_loss, std_loss = np.mean(step_losses), np.std(step_losses)
                    losses.append((mean_loss, std_loss))
                    write_losses(step_losses, fp, title="train", epoch=epoch)
                if eval_tensors is not None:
                    with tqdm(
                        total=len(eval_tensors),
                        desc="Eval", unit="instance", leave=False) as eval_progress_bar:
                        step_losses = perform_epoch(eval_tensors, training_mode=False, pbar=eval_progress_bar)
                        mean_loss, std_loss = np.mean(step_losses), np.std(step_losses)
                        eval_losses.append((mean_loss, std_loss))
                        write_losses(step_losses, fp, title="eval", epoch=epoch)
                epoch_progress_bar.update(1)
                if early_stopping is not None and epoch > 1:
                    assert isinstance(early_stopping, float), "early_stopping should be either None or float value. Got {}".format(early_stopping)
                    eval_loss_diff = np.abs(eval_losses[-2][0] - eval_losses[-1][0])
                    if eval_loss_diff < early_stopping:
                        epoch_progress_bar.write("Evaluation loss stopped decreased less than {}. Early stopping at epoch {}.".format(early_stopping, epoch))
                        break
                if save_best and save_path is not None:
                    # Save the best model. The first epoch always counts as
                    # an improvement; no `continue` here, so the learning
                    # rate schedule below still runs on that epoch.
                    current_eval_loss = eval_losses[-1][0]
                    if best_eval_loss is None or current_eval_loss < best_eval_loss:
                        best_eval_loss = current_eval_loss
                        best_epoch = epoch
                        model_wrapper.save(save_path, verbose=False)
                    if epoch == n_epochs -1:
                        epoch_progress_bar.write("Best model from {} epoch with {:3f} loss".format(best_epoch, best_eval_loss))

                if reduce_lr_every > 0 and lr_reduce_factor > 0 and ((epoch + 1) % reduce_lr_every) == 0:
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = param_group['lr']*lr_reduce_factor


    return {
        "training_loss": losses,
        "evaluation_loss": eval_losses
    }
249 | 
250 | 
251 | 
252 | 
class BoWModule(nn.Module):
    """A single linear layer whose output is passed through ``log_softmax``."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.W = nn.Linear(input_size, output_size)

    def forward(self, X):
        """Return ``log_softmax`` of the linear projection of *X*."""
        projected = self.W(X)
        return F.log_softmax(projected)
260 | 
261 | 
class BoEmbeddingsModule(nn.Module):
    """Average the embeddings of the input ids, then linear + ``log_softmax``."""

    def __init__(self, vocab_size, embedding_size, output_size):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.W = nn.Linear(embedding_size, output_size)

    def forward(self, X):
        """Embed *X*, average over dimension 1 and score the averaged vector."""
        embedded = self.word_embeddings(X)
        averaged = embedded.mean(1)
        # Force a 2-D (rows, embedding_dim) layout before the linear layer.
        averaged = averaged.view(-1, self.word_embeddings.embedding_dim)
        return F.log_softmax(self.W(averaged))
271 |     
272 | 
273 |     
class LSTMPredictor(nn.Module):
    """Embedding -> LSTM -> linear layer applied to the last time step."""

    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """Return ``log_softmax`` scores computed from the final LSTM output."""
        embedded = self.word_embeddings(X)
        # Swap the first two dimensions before feeding the LSTM.
        lstm_out, _ = self.lstm(embedded.permute(1, 0, 2))
        last_step = lstm_out[-1, :, :]
        return F.log_softmax(self.output(last_step))
286 | 
287 |     
class LSTMTagger(nn.Module):
    """Embedding -> LSTM -> linear layer applied to every time step."""

    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        """Return ``log_softmax`` scores with one row per position of *X*."""
        embedded = self.word_embeddings(X).permute(1, 0, 2)
        lstm_out, _ = self.lstm(embedded)
        # Reshape the output to be a tensor of shape seq_len*label_size
        seq_len = X.data.size(1)
        per_token = lstm_out.view(seq_len, -1)
        return F.log_softmax(self.output(per_token))
301 |     
302 |     
class CharEmbedding(nn.Module):
    """Character CNN: embed ids, convolve with several kernel heights, max-pool.

    The output has ``out_channels * len(kernel_sizes)`` features per input row.
    """

    def __init__(self, vocab_size, embedding_size,
                 out_channels, kernel_sizes, dropout=0.5):
        super().__init__()
        self.char_embeddings = nn.Embedding(vocab_size, embedding_size)
        # Usage of nn.ModuleList is important
        ## See: https://discuss.pytorch.org/t/list-of-nn-module-in-a-nn-module/219/6
        conv_layers = []
        for K in kernel_sizes:
            # Each kernel spans the full embedding width; the input is padded
            # by K-1 on both ends of the character axis.
            conv_layers.append(
                nn.Conv2d(1, out_channels, (K, embedding_size), padding=(K-1, 0)))
        self.convs1 = nn.ModuleList(conv_layers)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        """Return the pooled convolution features of *X* (dropout applied)."""
        embedded = self.dropout(self.char_embeddings(X))
        # Ref: https://github.com/Shawn1993/cnn-text-classification-pytorch/blob/master/model.py
        embedded = embedded.unsqueeze(1) # (N,Ci,W,D)
        pooled = []
        for conv in self.convs1:
            feature_map = F.relu(conv(embedded)).squeeze(3) # (N,Co,W)
            # Max over the whole character axis.
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2)) # (N,Co)
        return self.dropout(torch.cat(pooled, 1))
323 |     
324 |     
class WordCharEmbedding(nn.Module):
    """Word embedding optionally combined with a character-CNN embedding.

    With ``concat=True`` the character and word vectors (plus an optional
    auxiliary word embedding) are concatenated along dimension 2; otherwise
    the character and word vectors are added.
    """

    def __init__(self,
            vocab_size, embedding_size,
            char_embed_kwargs, dropout=0.5,
            aux_embedding_size=None,
            concat=False
            ):
        super().__init__()
        self.char_embeddings = CharEmbedding(**char_embed_kwargs)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        ## Only allow aux embedding in concat mode
        wants_aux = concat and aux_embedding_size is not None
        if wants_aux:
            self.aux_word_embeddings = nn.Embedding(vocab_size, aux_embedding_size)
        self.concat = concat

    def forward(self, X, X_char=None):
        """Embed word ids *X*; mix in character features when *X_char* is given.

        *X_char* is iterated and each element is passed to the character CNN.
        """
        # Ref: https://github.com/Shawn1993/cnn-text-classification-pytorch/blob/master/model.py
        word_vecs = self.word_embeddings(X)
        if X_char is None:
            return self.dropout(word_vecs)

        # One character vector per element of X_char, joined along dimension 1.
        per_word = [self.char_embeddings(x).unsqueeze(0) for x in X_char]
        char_vecs = torch.cat(per_word, 1)
        if not self.concat:
            return self.dropout(char_vecs + word_vecs)

        pieces = [char_vecs, word_vecs]
        if hasattr(self, "aux_word_embeddings"):
            pieces.append(self.aux_word_embeddings(X))
        return self.dropout(torch.cat(pieces, 2))
358 | 
class WordCharEmbedding_tuple(nn.Module):
    """Variant of the word+character embedding taking one (possibly tuple) input.

    ``forward`` accepts either the word ids alone or a ``(X, X_char)`` tuple.
    With ``concat=True`` the character and word vectors (plus an optional
    auxiliary word embedding) are concatenated along dimension 2; otherwise
    the character and word vectors are added.
    """

    def __init__(self,
            vocab_size, embedding_size,
            char_embed_kwargs, dropout=0.5,
            aux_embedding_size=None,
            concat=False
            ):
        super(WordCharEmbedding_tuple, self).__init__()
        self.char_embeddings = CharEmbedding(**char_embed_kwargs)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.concat = concat
        if concat and aux_embedding_size is not None:
            ## Only allow aux embedding in concat mode
            self.aux_word_embeddings = nn.Embedding(vocab_size, aux_embedding_size)

    def forward(self, X):
        """Embed *X*; when it is a ``(X, X_char)`` tuple, mix in character features."""
        # Default to "no character input" so a plain word-id tensor works;
        # previously X_char was unbound in that case.
        X_char = None
        if isinstance(X, tuple):
            X, X_char = X
        # Ref: https://github.com/Shawn1993/cnn-text-classification-pytorch/blob/master/model.py
        word_vecs = self.word_embeddings(X)
        if X_char is not None:
            # One character vector per element of X_char, joined along dimension 1.
            char_vecs = torch.cat([
                self.char_embeddings(x).unsqueeze(0)
                for x in X_char
            ], 1)
            if self.concat:
                embedding_list = [char_vecs, word_vecs]
                if hasattr(self, "aux_word_embeddings"):
                    aux_vecs = self.aux_word_embeddings(X)
                    embedding_list.append(aux_vecs)
                word_vecs = torch.cat(embedding_list, 2)
            else:
                word_vecs = char_vecs + word_vecs
        return self.dropout(word_vecs)
394 | 
class ConcatInputs(nn.Module):
    """Apply the i-th module to the i-th input and concatenate the results."""

    def __init__(self, input_modules, dim=2):
        super().__init__()
        assert isinstance(input_modules, list), "Modules should be a list of input modules"
        self.input_modules = nn.ModuleList(input_modules)
        self.dim = dim

    def forward(self, X):
        """Return the outputs for the list *X*, concatenated along ``self.dim``."""
        assert isinstance(X, list), "X should be a list of input variables"
        outputs = []
        for position, inputs in enumerate(X):
            module = self.input_modules[position]
            outputs.append(module(inputs))
        return torch.cat(outputs, self.dim)
406 | 
407 | 
408 |     
class LSTMTaggerWordChar(nn.Module):
    """Bidirectional LSTM tagger on top of a word+character embedding module.

    Each direction of the LSTM has ``hidden_size // 2`` units.
    """

    def __init__(self, word_char_embedding, embedding_size, hidden_size, output_size):
        super().__init__()
        self.word_embeddings = word_char_embedding
        self.lstm = nn.LSTM(embedding_size, hidden_size//2, bidirectional=True)
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, X, X_char):
        """Return ``log_softmax`` scores with one row per position of *X*."""
        embedded = self.word_embeddings(X, X_char)
        lstm_out, _ = self.lstm(embedded.permute(1, 0, 2))
        # Reshape the output to be a tensor of shape seq_len*label_size
        seq_len = X.data.size(1)
        per_token = lstm_out.view(seq_len, -1)
        return F.log_softmax(self.output(per_token))
422 |     
423 |     
424 |     
425 |     
class CRFLayer(nn.Module):
    """Linear-chain CRF scoring and decoding for one sequence at a time.

    ``transitions[i, j]`` is the learned score of moving from label ``i`` to
    label ``j``.  Emissions are indexed as ``emissions[step][label]``.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.num_labels = num_labels
        self.transitions = nn.Parameter(torch.randn(self.num_labels, self.num_labels))

    def _forward_alg(self, emissions):
        """Return the log partition function (log-sum-exp over all paths)."""
        # Transposed so that rows are the destination label and columns the
        # source label; the reduction below then runs over the source axis.
        to_from = self.transitions.transpose(-1, -2)
        alpha = emissions[0]
        for step in range(1, emissions.size(0)):
            path_scores = alpha.expand_as(to_from) + to_from
            alpha = emissions[step] + log_sum_exp_torch(path_scores, axis=1)
        return log_sum_exp_torch(alpha, axis=-1)

    def _score_sentence(self, emissions, tags):
        """Return the unnormalised score of the label path ``tags``."""
        total = emissions[0][tags[0]]
        for step in range(1, emissions.size()[0]):
            previous, current = tags[step - 1], tags[step]
            total = total + self.transitions[previous, current] + emissions[step][current]
        return total

    def _viterbi_decode(self, emissions):
        """Return ``(best_score, best_path)`` computed on CPU copies of the data."""
        emissions = emissions.data.cpu()
        transitions = self.transitions.data.cpu()
        n_steps = emissions.size(0)
        back_pointers = torch.zeros(emissions.size()).int()
        scores = torch.zeros(emissions.size(1)) + emissions[0]
        # Forward pass: best score reaching every label, plus the argmax
        # predecessor for the backward trace.
        for step in range(1, n_steps):
            candidates = scores.unsqueeze(1).expand_as(transitions) + transitions
            best_scores, best_prev = torch.max(candidates, 0)
            back_pointers[step] = best_prev
            scores = emissions[step] + best_scores
        # Backward pass: follow the pointers from the best final label.
        final_scores = scores.numpy()
        pointer_table = back_pointers.numpy()
        path = [final_scores.argmax()]
        for step in range(n_steps - 1, 0, -1):
            path.append(pointer_table[step][path[-1]])
        path.reverse()
        return final_scores.max(), path

    def neg_log_likelihood(self, feats, tags):
        """Return log-partition minus gold-path score for ``feats``/``tags``."""
        partition = self._forward_alg(feats)
        gold = self._score_sentence(feats, tags)
        return partition - gold

    def forward(self, feats):
        """Return the Viterbi ``(score, tag_seq)`` for the given features."""
        # Find the best path, given the features.
        return self._viterbi_decode(feats)
479 |     
480 |     
class BiLSTMTaggerWordCRF(nn.Module):
    """Word-embedding BiLSTM that produces emission scores for a CRF layer."""

    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        # Each direction gets half of hidden_size so the concatenated LSTM
        # output matches the input width of the linear layer.
        self.lstm = nn.LSTM(embedding_size, hidden_size//2, bidirectional=True)
        self.output = nn.Linear(hidden_size, output_size)
        self.crf = CRFLayer(output_size)

    def forward(self, X):
        """Return emission scores with ``X.size(1)`` rows, one per token."""
        embedded = self.word_embeddings(X)
        lstm_out, _ = self.lstm(embedded.permute(1, 0, 2))
        # Flatten to one row per position before the label projection.
        n_tokens = X.data.size(1)
        flat = lstm_out.view(n_tokens, -1)
        return self.output(flat)

    def loss(self, X, Y):
        """Return the CRF negative log-likelihood of labels ``Y`` for ``X``."""
        return self.crf.neg_log_likelihood(self.forward(X), Y)
499 |     
500 |     
class LSTMTaggerWordCharCRF(nn.Module):
    """BiLSTM-CRF tagger fed by a word+char embedding called as ``emb(X, X_char)``."""

    def __init__(self, word_char_embedding, embedding_size, hidden_size, output_size):
        super().__init__()
        self.word_embeddings = word_char_embedding
        # Each direction gets half of hidden_size so the concatenated LSTM
        # output matches the input width of the linear layer.
        self.lstm = nn.LSTM(embedding_size, hidden_size//2, bidirectional=True)
        self.output = nn.Linear(hidden_size, output_size)
        self.crf = CRFLayer(output_size)

    def forward(self, X, X_char):
        """Return emission scores with ``X.size(1)`` rows, one per token."""
        embedded = self.word_embeddings(X, X_char)
        lstm_out, _ = self.lstm(embedded.permute(1, 0, 2))
        # Flatten to one row per position before the label projection.
        n_tokens = X.data.size(1)
        flat = lstm_out.view(n_tokens, -1)
        return self.output(flat)

    def loss(self, X, X_char, Y):
        """Return the CRF negative log-likelihood of labels ``Y``."""
        return self.crf.neg_log_likelihood(self.forward(X, X_char), Y)
519 |     
class BiLSTMTaggerWordCharCRF(nn.Module):
    """BiLSTM-CRF tagger whose whole input pipeline is one embedding module.

    ``input_embedding`` is called with the single argument passed to
    ``forward`` and its output is permuted with ``permute(1, 0, 2)``.
    """

    def __init__(self, input_embedding, embedding_size, hidden_size, output_size):
        super().__init__()
        self.input_embedding = input_embedding
        # Each direction gets half of hidden_size so the concatenated LSTM
        # output matches the input width of the linear layer.
        self.lstm = nn.LSTM(embedding_size, hidden_size//2, bidirectional=True)
        self.output = nn.Linear(hidden_size, output_size)
        self.crf = CRFLayer(output_size)

    def forward(self, X):
        """Return emission scores with one row per LSTM time step."""
        embedded = self.input_embedding(X)
        lstm_out, _ = self.lstm(embedded.permute(1, 0, 2))
        # The row count comes from the LSTM output itself because X is not
        # required to be a tensor here.
        n_steps = lstm_out.data.size(0)
        flat = lstm_out.view(n_steps, -1)
        return self.output(flat)

    def loss(self, X, Y):
        """Return the CRF negative log-likelihood of labels ``Y`` for ``X``."""
        return self.crf.neg_log_likelihood(self.forward(X), Y)
538 |     
539 | 


--------------------------------------------------------------------------------
/pytorch_utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from tqdm import tqdm
  4 | 
  5 | import torch
  6 | from torch.autograd import Variable
  7 | import torch.nn as nn
  8 | import torch.nn.functional as F
  9 | import torch.optim as optim
 10 | 
 11 | from pathlib import Path
 12 | 
 13 | 
 14 | class Vocab(object):
 15 |     def __init__(self, name="vocab",
 16 |                  offset_items=tuple([]),
 17 |                  UNK=None, lower=True):
 18 |         self.name = name
 19 |         self.item2idx = {}
 20 |         self.idx2item = []
 21 |         self.size = 0
 22 |         self.UNK = UNK
 23 |         self.lower=lower
 24 |         
 25 |         self.batch_add(offset_items, lower=False)
 26 |         if UNK is not None:
 27 |             self.add(UNK, lower=False)
 28 |             self.UNK_ID = self.item2idx[self.UNK]
 29 |         self.offset = self.size
 30 |         
 31 |     def add(self, item, lower=True):
 32 |         if self.lower and lower:
 33 |             item = item.lower()
 34 |         if item not in self.item2idx:
 35 |             self.item2idx[item] = self.size
 36 |             self.size += 1
 37 |             self.idx2item.append(item)
 38 |             
 39 |     def batch_add(self, items, lower=True):
 40 |         for item in items:
 41 |             self.add(item, lower=lower)
 42 |             
 43 |     def in_vocab(self, item, lower=True):
 44 |         if self.lower and lower:
 45 |             item = item.lower()
 46 |         return item in self.item2idx
 47 |         
 48 |     def getidx(self, item, lower=True):
 49 |         if self.lower and lower:
 50 |             item = item.lower()
 51 |         if item not in self.item2idx:
 52 |             if self.UNK is None:
 53 |                 raise RuntimeError("UNK is not defined. %s not in vocab." % item)
 54 |             return self.UNK_ID
 55 |         return self.item2idx[item]
 56 |             
 57 |     def __repr__(self):
 58 |         return "Vocab(name={}, size={:d}, UNK={}, offset={:d}, lower={})".format(
 59 |             self.name, self.size,
 60 |             self.UNK, self.offset,
 61 |             self.lower
 62 |         )
 63 |     
 64 |     
 65 | def load_word_vectors(vector_file, ndims, vocab, cache_file, override_cache=False):
 66 |     W = np.zeros((vocab.size, ndims), dtype="float32")
 67 |     # Check for cached file and return vectors
 68 |     cache_file = Path(cache_file)
 69 |     if cache_file.is_file() and not override_cache:
 70 |         W = np.load(cache_file)
 71 |         return W
 72 |     # Else load vectors from the vector file
 73 |     total, found = 0, 0
 74 |     with open(vector_file) as fp:
 75 |         for i, line in enumerate(fp):
 76 |             line = line.rstrip().split()
 77 |             if line:
 78 |                 total += 1
 79 |                 try:
 80 |                     assert len(line) == ndims+1,(
 81 |                         "Line[{}] {} vector dims {} doesn't match ndims={}".format(i, line[0], len(line)-1, ndims)
 82 |                     )
 83 |                 except AssertionError as e:
 84 |                     print(e)
 85 |                     continue
 86 |                 word = line[0]
 87 |                 idx = vocab.getidx(word) 
 88 |                 if idx >= vocab.offset:
 89 |                     found += 1
 90 |                     vecs = np.array(list(map(float, line[1:])))
 91 |                     W[idx, :] += vecs
 92 |     # Write to cache file
 93 |     print("Found {} [{:.2f}%] vectors from {} vectors in {} with ndims={}".format(
 94 |         found, found * 100/vocab.size, total, vector_file, ndims))
 95 |     norm_W = np.sqrt((W*W).sum(axis=1, keepdims=True))
 96 |     valid_idx = norm_W.squeeze() != 0
 97 |     W[valid_idx, :] /= norm_W[valid_idx]
 98 |     print("Caching embedding with shape {} to {}".format(W.shape, cache_file.as_posix()))
 99 |     np.save(cache_file, W)
100 |     return W    
101 |     
class Seq2Vec(object):
    """Translate sequences of items into lists of vocabulary indices."""

    def __init__(self, vocab):
        self.vocab = vocab

    def encode(self, seq):
        """Return ``[vocab.getidx(item) for item in seq]``."""
        return [self.vocab.getidx(element) for element in seq]

    def batch_encode(self, seq_batch):
        """Encode every sequence of ``seq_batch`` and return the list of results."""
        return list(map(self.encode, seq_batch))
115 |         
116 |         
class Seq2OneHot(object):
    """Encode a sequence of indices as a single ``1 x size`` count vector.

    Repeated indices accumulate, so the result is a bag-of-indices count
    rather than a strict 0/1 vector.
    """

    def __init__(self, size):
        self.size = size

    def encode(self, x, as_variable=False):
        """Return the count vector for ``x``, wrapped in ``Variable`` on request."""
        counts = torch.zeros(self.size)
        for index in x:
            counts[index] += 1
        row = counts.view(1, -1)
        return Variable(row) if as_variable else row
129 |     
130 |     
def print_log_probs(log_probs, label_vocab, label_true=None):
    """Print one line per row of ``log_probs`` with named label scores.

    Each value is printed as ``label: value`` using ``label_vocab.idx2item``;
    the line ends with the true label name, or ``?`` when ``label_true`` is
    not given.
    """
    rows = log_probs.data.tolist()
    for row_idx, row in enumerate(rows):
        pieces = []
        for col_idx, value in enumerate(row):
            pieces.append("{}: {:.3f}".format(label_vocab.idx2item[col_idx], value))
        if label_true is None:
            gold_name = "?"
        else:
            gold_name = label_vocab.idx2item[label_true[row_idx]]
        print(", ".join(pieces), "True label: ", gold_name)
142 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from collections import Counter
  3 | import pandas as pd
  4 | import re
  5 | 
  6 | #import tensorflow as tf
  7 | 
  8 | from sklearn.cluster import KMeans
  9 | 
 10 | 
 11 | def get_clusters(W_word, n_clusters=10, **kwargs):
 12 |     clusterer = KMeans(n_clusters=n_clusters,
 13 |             n_jobs=-1, **kwargs)
 14 |     cluster_labels = clusterer.fit_predict(W_word)
 15 |     return cluster_labels
 16 | 
 17 | 
 18 | def read_glove(filename,
 19 |                ndims=50):
 20 |     vocab = []
 21 |     char_vocab = Counter()
 22 |     W = []
 23 |     with open(filename) as fp:
 24 |         for line in fp:
 25 |             line = line.rstrip().split()
 26 |             word = line[0]
 27 |             embed = list(map(float, line[1:]))
 28 |             vocab.append(word)
 29 |             W.append(embed)
 30 |             char_vocab.update(list(word))
 31 |     return vocab, char_vocab, np.array(W)
 32 | 
 33 | 
 34 | def crf_loss(y_true, y_pred):
 35 |     y_true = tf.cast(tf.squeeze(y_true), tf.int32)
 36 |     seq_lengths_t = tf.reduce_sum(
 37 |             tf.cast(tf.not_equal(y_true, 0),
 38 |                 tf.int32), axis=-1)
 39 |     log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
 40 |             y_pred, y_true, seq_lengths_t)
 41 |     return tf.reduce_mean(-log_likelihood, axis=-1)
 42 | 
 43 | 
 44 | def load_sequences(filenames, sep=" ", col_ids=None):
 45 |     sequences = []
 46 |     if isinstance(filenames, str):
 47 |         filenames = [filenames]
 48 |     for filename in filenames:
 49 |         with open(filename, encoding='utf-8') as fp:
 50 |             seq = []
 51 |             for line in fp:
 52 |                 line = line.rstrip()
 53 |                 if line:
 54 |                     line = line.split(sep)
 55 |                     if col_ids is not None:
 56 |                         line = [line[idx] for idx in col_ids]
 57 |                     seq.append(tuple(line))
 58 |                 else:
 59 |                     if seq:
 60 |                         sequences.append(seq)
 61 |                     seq = []
 62 |             if seq:
 63 |                 sequences.append(seq)
 64 |     return sequences
 65 | 
 66 | 
 67 | def classification_report_to_df(report):
 68 |     report_list = []
 69 |     for i, line in enumerate(report.split("\n")):
 70 |         if i == 0:
 71 |             report_list.append(["class", "precision", "recall", "f1-score", "support"])
 72 |         else:
 73 |             line = line.strip()
 74 |             if line:
 75 |                 if line.startswith("avg"):
 76 |                     line = line.replace("avg / total", "avg/total")
 77 |                 line = re.split(r'\s+', line)
 78 |                 line = [line[0]] + list(map(float, line[1:-1])) + [int(line[-1])]
 79 |                 report_list.append(tuple(line))
 80 |     return pd.DataFrame(report_list[1:], columns=report_list[0])  
 81 | 
 82 | 
 83 | def conll_classification_report_to_df(report):
 84 |     report_list = []
 85 |     report_list.append(["class", "accuracy", "precision", "recall", "f1-score", "support"])
 86 |     for i, line in enumerate(report.split("\n")):
 87 |         line = line.strip()
 88 |         if not line:
 89 |             continue
 90 |         if i == 0:
 91 |             continue
 92 |         if i == 1:
 93 |             line = re.findall(
 94 |                 'accuracy:\s*([0-9\.]{4,5})%; precision:\s+([0-9\.]{4,5})%; recall:\s+([0-9\.]{4,5})%; FB1:\s+([0-9\.]{4,5})',
 95 |                 line)[0]
 96 |             line = ("overall",) + tuple(map(float, line)) + (0,)
 97 |         else:
 98 |             line = re.findall(
 99 |                 '\s*(.+?): precision:\s+([0-9\.]{4,5})%; recall:\s+([0-9\.]{4,5})%; FB1:\s+([0-9\.]{4,5})\s+([0-9]+)',
100 |                 line)[0]
101 |             line = (line[0], 0.0) + tuple(map(float, line[1:-1])) + (int(line[-1]),)
102 |         report_list.append(line)
103 |     return pd.DataFrame(report_list[1:], columns=report_list[0])
104 | 
105 | 
def get_labels(y_arr, max_len=None):
    """Stack label sequences into an int array with a trailing unit axis.

    ``None`` entries are replaced by an all-zero sequence.  The length of
    that filler is ``max_len`` when given, otherwise the length of the first
    non-``None`` entry (previously an undefined name was used here, so any
    ``None`` entry raised ``NameError``).

    Args:
        y_arr: iterable of equal-length label sequences or ``None``.
        max_len: optional length of the zero filler.

    Returns:
        ``np.ndarray`` of dtype int with shape ``(len(y_arr), length, 1)``.

    Raises:
        ValueError: a filler is needed but its length cannot be determined.
    """
    y_arr = list(y_arr)
    if max_len is None:
        max_len = next((len(y) for y in y_arr if y is not None), None)
    rows = []
    for y in y_arr:
        if y is None:
            if max_len is None:
                raise ValueError(
                    "max_len is required when every entry of y_arr is None")
            rows.append(np.zeros(max_len))
        else:
            rows.append(y)
    return np.expand_dims(np.array(rows, dtype='int'), -1)
114 | 
115 | 
116 | 
def create_tagged_sequence(seq, task2col, default_tag):
    """Convert raw rows into tag records derived from ``default_tag``.

    For every row ``t`` of ``seq`` a copy of ``default_tag`` is made with
    ``token=t[0]`` and, for each ``task -> column`` pair of ``task2col``, the
    field ``task`` set to ``t[column]``.  On any failure the offending row
    and the whole sequence are printed before the error is re-raised.
    """
    tagged = []
    for row in seq:
        try:
            token = row[0]
            task_values = {task: row[col] for task, col in task2col.items()}
            record = default_tag._replace(token=token, **task_values)
        except BaseException:
            print("Error processing tag:", row)
            print("Error in sequence: ", seq)
            raise
        tagged.append(record)
    return tagged
128 | 
129 | 
def get_tagged_corpus(corpus, *args):
    """Lazily yield the tagged form of every non-empty sequence in ``corpus``.

    ``args`` are forwarded to ``create_tagged_sequence``.  Once the corpus is
    exhausted the longest sequence length seen is printed.
    """
    longest = 0
    for seq in corpus:
        if not seq:
            continue
        longest = max(len(seq), longest)
        yield create_tagged_sequence(seq, *args)
    print("Max sequence length in the corpus is: %s" % longest)
137 | 
def gen_vocab_counts(corpus, tasks, include_chars=False, token_counts=None):
    """Count tokens, task labels and (optionally) characters in a corpus.

    Args:
        corpus: iterable of sequences of records with a ``token`` attribute
            and one attribute per task name.
        tasks: task names; a ``None`` label is not counted.
        include_chars: also count characters and track the longest token.
        token_counts: existing ``Counter`` to update in place; a new one is
            created when omitted.

    Returns:
        ``(token_counts, task_counts, max_seq_len)`` and, when
        ``include_chars`` is set, additionally ``char_counts`` and
        ``max_word_len``.
    """
    per_task = {task: Counter() for task in tasks}
    token_counts = Counter() if token_counts is None else token_counts
    if include_chars:
        char_counts = Counter()
    longest_seq = 0
    longest_word = 0
    for seq in corpus:
        longest_seq = max(len(seq), longest_seq)
        for record in seq:
            word = record.token
            token_counts[word] += 1
            if include_chars:
                char_counts.update(list(word))
                longest_word = max(len(word), longest_word)
            for task, counter in per_task.items():
                label = getattr(record, task)
                if label is not None:
                    counter[label] += 1
    if not include_chars:
        return token_counts, per_task, longest_seq
    return token_counts, per_task, longest_seq, char_counts, longest_word
160 | 
def print_predictions(tagged_seq, predictions, filename, label_id=0, task_id=0):
    """Write token/true/predicted triples to ``filename`` and report metrics.

    ``predictions[label_id]`` is paired with ``tagged_seq`` sequence by
    sequence.  The true label of a tag is read from position ``task_id + 1``
    (position 0 is the token).  One tab-separated line is written per token
    and an empty line after every sequence.  The classification report and
    the accuracy are printed, and the report is returned as a DataFrame.
    """
    from sklearn.metrics import classification_report, accuracy_score
    gold_labels = []
    predicted_labels = []
    label_column = task_id + 1
    with open(filename, "w+") as fp:
        for seq, pred in zip(tagged_seq, predictions[label_id]):
            for tag, label in zip(seq, pred):
                gold = tag[label_column]
                print(u"%s\t%s\t%s" % (tag[0], gold, label), file=fp)
                gold_labels.append(gold)
                predicted_labels.append(label)
            # Blank line marks the end of a sequence.
            print(u"", file=fp)

    report = classification_report(gold_labels, predicted_labels)
    print(report)
    print("Accuracy: %s" % accuracy_score(gold_labels, predicted_labels))
    return classification_report_to_df(report)
177 | 
178 | 
179 | 
180 | 


--------------------------------------------------------------------------------
/wnut_bilstm_crf_char_concat.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # coding: utf-8
  3 | 
  4 | # In[1]:
  5 | 
  6 | import matplotlib
  7 | matplotlib.use("Agg")
  8 | import torch
  9 | from torch.autograd import Variable
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | import torch.optim as optim
 13 | 
 14 | torch.manual_seed(1)
 15 | 
 16 | import numpy as np
 17 | 
 18 | import matplotlib.pyplot as plt
 19 | import seaborn as sns
 20 | 
 21 | from pytorch_utils import *
 22 | from pytorch_models import *
 23 | from utils import load_sequences, conll_classification_report_to_df
 24 | from conlleval import main as conll_eval
 25 | import re
 26 | 
 27 | sns.set_context("poster")
 28 | sns.set_style("ticks")
 29 | 
 30 | 
 31 | # In[2]:
 32 | 
# Locations of the tab-separated WNUT NER splits, relative to the working
# directory the script is started from.
TRAIN_CORPUS="data/WNUT_NER/train.tsv"
DEV_CORPUS="data/WNUT_NER/dev.tsv"
TEST_CORPUS="data/WNUT_NER/test.tsv"


# In[3]:

# Keep only the first and the last column of every line, so each token
# becomes a 2-tuple (presumably word and NER tag -- confirm against the data).
train_corpus = load_sequences(TRAIN_CORPUS, sep="\t", col_ids=(0, -1))
print("Total items in train corpus: %s" % len(train_corpus))
dev_corpus = load_sequences(DEV_CORPUS, sep="\t", col_ids=(0, -1))
print("Total items in dev corpus: %s" % len(dev_corpus))
test_corpus = load_sequences(TEST_CORPUS, sep="\t", col_ids=(0, -1))
print("Total items in test corpus: %s" % len(test_corpus))
 46 | 
 47 | 
 48 | # In[5]:
 49 | CAP_LETTERS=re.compile(r'[A-Z]')
 50 | SMALL_LETTERS=re.compile(r'[a-z]')
 51 | NUMBERS=re.compile(r'[0-9]')
 52 | PUNCT=re.compile(r'[\.,\"\'!\?;:]')
 53 | OTHERS=re.compile(r'[^A-Za-z0-9\.,\"\'!\?;:]')
 54 | 
 55 | def get_ortho_feature(word):
 56 |     word = CAP_LETTERS.sub("A", word)
 57 |     word = SMALL_LETTERS.sub("a", word)
 58 |     word = NUMBERS.sub("0", word)
 59 |     word = PUNCT.sub(".", word)
 60 |     word = OTHERS.sub("%", word)
 61 |     return word
 62 | 
 63 | def create_vocab(data, vocabs, char_vocab, ortho_word_vocab, ortho_char_vocab, word_idx=0):
 64 |     n_vocabs = len(vocabs)
 65 |     for sent in data:
 66 |         for token_tags in sent:
 67 |             for vocab_id in range(n_vocabs):
 68 |                 vocabs[vocab_id].add(token_tags[vocab_id])
 69 |             char_vocab.batch_add(token_tags[word_idx])
 70 |             ortho_word = get_ortho_feature(token_tags[word_idx])
 71 |             ortho_word_vocab.add(ortho_word)
 72 |             ortho_char_vocab.batch_add(ortho_word)
 73 |     print("Created vocabs: %s" % (", ".join(
 74 |         "{}[{}]".format(vocab.name, vocab.size)
 75 |         for vocab in vocabs + [char_vocab, ortho_word_vocab, ortho_char_vocab]
 76 |     )))
 77 | 
 78 | 
 79 | # In[6]:
 80 | 
# One vocabulary per input/output stream.  The word and character streams get
# an unknown symbol; the tag vocabulary has none, so looking up an unseen tag
# raises instead of silently mapping to UNK.
word_vocab = Vocab("words", UNK="UNK", lower=True)
char_vocab = Vocab("chars", UNK="<U>", lower=False)
# NOTE(review): lower=True also lower-cases the orthographic shapes, so
# shapes such as "Aaa" and "aaa" share one entry here -- confirm this is
# intended.
ortho_word_vocab = Vocab("ortho_words", UNK="UNK", lower=True)
ortho_char_vocab = Vocab("ortho_chars", UNK="<U>", lower=False)
ner_vocab = Vocab("ner_tags", lower=False)

# The vocabularies are built from train, dev and test together.
create_vocab(train_corpus+dev_corpus+test_corpus, [word_vocab, ner_vocab], char_vocab, ortho_word_vocab, ortho_char_vocab)
 88 | 
 89 | 
 90 | # In[7]:
 91 | 
 92 | def data2tensors(data, vocabs, char_vocab, ortho_word_vocab, ortho_char_vocab, word_idx=0, column_ids=(0, -1)):
 93 |     vocabs = [vocabs[idx] for idx in column_ids]
 94 |     n_vocabs = len(vocabs)
 95 |     tensors = []
 96 |     char_tensors = []
 97 |     for sent in data:
 98 |         sent_vecs = [[] for i in range(n_vocabs+3)] # Last 3 are for char vecs, ortho_word and ortho_char
 99 |         char_vecs = []
100 |         for token_tags in sent:
101 |             vocab_id = 0 # First column is the word
102 |             ortho_word = get_ortho_feature(token_tags[vocab_id])
103 |             # lowercase the word
104 |             sent_vecs[vocab_id].append(
105 |                     vocabs[vocab_id].getidx(token_tags[vocab_id].lower())
106 |                 )
107 |             for vocab_id in range(1, n_vocabs):
108 |                 sent_vecs[vocab_id].append(
109 |                     vocabs[vocab_id].getidx(token_tags[vocab_id])
110 |                 )
111 |             sent_vecs[-3].append(
112 |                 [char_vocab.getidx(c) for c in token_tags[word_idx]]
113 |             )
114 |             sent_vecs[-2].append(
115 |                     ortho_word_vocab.getidx(ortho_word)
116 |                 )
117 |             sent_vecs[-1].append(
118 |                 [ortho_char_vocab.getidx(c) for c in ortho_word]
119 |             )
120 |         tensors.append(sent_vecs)
121 |     return tensors
122 | 
123 | 
124 | # In[8]:
125 | 
# Index every split with the shared vocabularies.  Each sentence becomes a
# list of five parallel lists: word ids, tag ids, per-word character ids,
# ortho-shape ids and per-word ortho-character ids.
train_tensors = data2tensors(train_corpus, [word_vocab, ner_vocab], char_vocab, ortho_word_vocab, ortho_char_vocab)
dev_tensors = data2tensors(dev_corpus, [word_vocab, ner_vocab], char_vocab, ortho_word_vocab, ortho_char_vocab)
test_tensors = data2tensors(test_corpus, [word_vocab, ner_vocab], char_vocab, ortho_word_vocab, ortho_char_vocab)
# Prints (number of sentences, number of parallel lists) for each split.
print("Train: ({}, {}), Dev: ({}, {}), Test: ({}, {})".format(
    len(train_tensors), len(train_tensors[0]),
    len(dev_tensors), len(dev_tensors[0]),
    len(test_tensors), len(test_tensors[0])
))


# In[9]:

# Pretrained word vectors; the matrix built for word_vocab is cached in
# cache_file and reused on later runs while that file exists.
embedding_file="data/WNUT_NER/wnut_vecs.txt"
cache_file="wnut_ner.twitter.400.npy"
ndims=400
pretrained_embeddings = load_word_vectors(embedding_file, ndims, word_vocab, cache_file)
142 | 
143 | 
144 | # In[10]:
145 | 
def plot_losses(train_losses, eval_losses=None, plot_std=False, ax=None):
    """Plot per-epoch mean losses, optionally with a +/- one S.D. band.

    Args:
        train_losses: sequence of ``(mean, std)`` pairs, one per epoch.
        eval_losses: same for the evaluation set; ``None`` (the default) or
            an empty sequence leaves that curve out.
        plot_std: shade ``mean - std .. mean + std`` around each curve.
        ax: axes to draw on; defaults to ``plt.gca()``.
    """
    if ax is None:
        ax = plt.gca()
    curves = (
        (train_losses, "0.5", "Train"),
        (eval_losses, "r", "Eval"),
    )
    for losses, color, label in curves:
        # The signature allows eval_losses to be omitted, and an empty
        # history has nothing to unzip, so skip those series.
        if losses is None or len(losses) == 0:
            continue
        mean_loss, std_loss = zip(*losses)
        mean_loss = np.array(mean_loss)
        std_loss = np.array(std_loss)
        ax.plot(
            mean_loss, color=color, label=label,
            linestyle="-",
        )
        if plot_std:
            ax.fill_between(
                np.arange(mean_loss.shape[0]),
                mean_loss-std_loss,
                mean_loss+std_loss,
                color=color,
                alpha=0.3
            )
    ax.set_xlabel("Epochs")
    # Raw string: "\p" is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r"Mean Loss ($\pm$ S.D.)")
171 |     
172 |     
def print_predictions(corpus, predictions, filename, label_vocab):
    """Write ``token<TAB>true<TAB>predicted`` lines to ``filename``.

    ``corpus`` holds sequences of ``(token, true_label)`` pairs and
    ``predictions`` the matching sequences of label ids, which are turned
    into names through ``label_vocab.idx2item``.  Every sequence is followed
    by an empty line.
    """
    row_format = "{}\t{}\t{}"
    with open(filename, "w+") as fp:
        for seq, pred in zip(corpus, predictions):
            for (token, true_label), label_id in zip(seq, pred):
                predicted = label_vocab.idx2item[label_id]
                fp.write(row_format.format(token, true_label, predicted) + "\n")
            # Add new line after each sequence
            fp.write("\n")
180 | 
181 | 
182 | # In[11]:
183 | 
184 | # ## Class based
185 | 
186 | # In[19]:
187 | 
class BiLSTMTaggerWordCRFModel(ModelWrapper):
    """Training/prediction wrapper for a CRF tagger with word+ortho inputs.

    The wrapped ``model`` is called with a list of two ``(ids, char_ids)``
    pairs: one for the words and one for their orthographic shapes.
    """

    def __init__(self, model,
                 loss_function,
                 use_cuda=False, grad_max_norm=5):
        self.model = model
        # The CRF model computes its own loss, so ``loss_function`` is
        # accepted for interface compatibility and ignored.
        self.loss_function = None
        self.grad_max_norm=grad_max_norm

        self.use_cuda = use_cuda
        if self.use_cuda:
            self.model.cuda()

    @staticmethod
    def _to_cuda(tensor):
        """Copy ``tensor`` to the GPU, requesting a non-blocking transfer.

        ``async`` is a reserved word since Python 3.7, so the old keyword can
        no longer be written literally; it is only passed (via a dict) when
        the installed PyTorch rejects ``non_blocking``.
        """
        try:
            return tensor.cuda(non_blocking=True)
        except TypeError:
            return tensor.cuda(**{"async": True})

    def _inputs_to_cuda(self, X, X_char, X_ortho, X_char_ortho):
        """Move the four model inputs to the GPU and return them."""
        X = self._to_cuda(X)
        X_char = [self._to_cuda(t) for t in X_char]
        X_ortho = self._to_cuda(X_ortho)
        X_char_ortho = [self._to_cuda(t) for t in X_char_ortho]
        return X, X_char, X_ortho, X_char_ortho

    def post_backward(self):
        """Clip the gradient norm of all model parameters to ``grad_max_norm``."""
        # Prefer the in-place spelling where it exists; fall back to the old
        # name on PyTorch versions that only provide that one.
        clip = getattr(torch.nn.utils, "clip_grad_norm_", None)
        if clip is None:
            clip = torch.nn.utils.clip_grad_norm
        clip(self.model.parameters(), self.grad_max_norm)

    def _process_instance_tensors(self, instance_tensors, volatile=False):
        """Wrap one instance's index lists into tensors/Variables.

        Returns ``(X, X_char, X_ortho, X_char_ortho, Y)``; note that the
        incoming order is ``(X, Y, X_char, X_ortho, X_char_ortho)``.
        """
        X, Y, X_char, X_ortho, X_char_ortho = instance_tensors
        X = Variable(torch.LongTensor([X]), requires_grad=False, volatile=volatile)
        X_char = charseq2varlist(X_char, volatile=volatile)
        X_ortho = Variable(torch.LongTensor([X_ortho]), requires_grad=False, volatile=volatile)
        X_char_ortho = charseq2varlist(X_char_ortho, volatile=volatile)
        Y = torch.LongTensor(Y)
        return X, X_char, X_ortho, X_char_ortho, Y

    def get_instance_loss(self, instance_tensors, zero_grad=True):
        """Return the model loss for one already-processed instance."""
        if zero_grad:
            ## Clear gradients before every update else memory runs out
            self.model.zero_grad()
        X, X_char, X_ortho, X_char_ortho, Y = instance_tensors
        if self.use_cuda:
            X, X_char, X_ortho, X_char_ortho = self._inputs_to_cuda(
                X, X_char, X_ortho, X_char_ortho)
            Y = self._to_cuda(Y)
        return self.model.loss([(X, X_char), (X_ortho, X_char_ortho)], Y)

    def predict(self, instance_tensors):
        """Return the Viterbi label sequence for one raw instance."""
        X, X_char, X_ortho, X_char_ortho, Y = self._process_instance_tensors(instance_tensors, volatile=True)
        if self.use_cuda:
            # The gold labels are not needed for decoding, so only the
            # inputs are transferred.
            X, X_char, X_ortho, X_char_ortho = self._inputs_to_cuda(
                X, X_char, X_ortho, X_char_ortho)
        emissions = self.model.forward([(X, X_char), (X_ortho, X_char_ortho)])
        return self.model.crf.forward(emissions)[1]
236 | 
237 | 
# ---- Run configuration ----
use_cuda=True
hidden_size=128
batch_size=64

# ---- Word stream: character CNN settings ----
char_emb_size=30
output_channels=200
kernel_sizes=[3]

word_emb_size=400
aux_emb_size=100

# Hard-coded width of the word-stream embedding; presumably
# word_emb_size + aux_emb_size + output_channels (400 + 100 + 200) --
# TODO confirm against WordCharEmbedding_tuple, which is defined elsewhere.
main_total_emb_dims=700
char_embed_kwargs=dict(
    vocab_size=char_vocab.size,
    embedding_size=char_emb_size,
    out_channels=output_channels,
    kernel_sizes=kernel_sizes
)

word_char_embedding = WordCharEmbedding_tuple(
        word_vocab.size, word_emb_size,
        char_embed_kwargs, dropout=0.5,
        aux_embedding_size=aux_emb_size,
        concat=True)


# ---- Orthographic stream: same character CNN settings, smaller word embedding ----
ortho_char_emb_size=30
output_channels=200
kernel_sizes=[3]
ortho_word_emb_size=200
# Hard-coded width of the ortho-stream embedding; presumably
# ortho_word_emb_size + output_channels (200 + 200) -- TODO confirm.
ortho_total_emb_dims=400

ortho_char_embed_kwargs=dict(
    vocab_size=ortho_char_vocab.size,
    embedding_size=ortho_char_emb_size,
    out_channels=output_channels,
    kernel_sizes=kernel_sizes
)

ortho_word_char_embedding = WordCharEmbedding_tuple(
        ortho_word_vocab.size, ortho_word_emb_size,
        ortho_char_embed_kwargs, dropout=0.5, concat=True)


# Both streams are concatenated (ConcatInputs default dim=2) to form the
# LSTM input.
concat_embeddings = ConcatInputs([word_char_embedding, ortho_word_char_embedding])

# Assign the pretrained word vectors loaded from embedding_file to the word
# embedding; fix_embedding=True presumably freezes them -- confirm in
# assign_embeddings.
assign_embeddings(word_char_embedding.word_embeddings, pretrained_embeddings, fix_embedding=True)

n_embed=main_total_emb_dims + ortho_total_emb_dims # Get this using char embedding and word embed and ortho embeddings
# The wrapper ignores its loss_function argument, hence None.
model_wrapper = BiLSTMTaggerWordCRFModel(
    BiLSTMTaggerWordCharCRF(concat_embeddings, n_embed, hidden_size, ner_vocab.size),
    None, use_cuda=use_cuda, grad_max_norm=5)
291 | 
292 | 
293 | # In[33]:
# model_prefix names every artefact of this run: <prefix>.pth (checkpoint),
# <prefix>.log (training log) and <prefix>.pdf (loss plot).
model_prefix="BiLSTMCharConcatCRF_WNUT_NER_ortho"
n_epochs=50

# With load_model set, training resumes from an existing checkpoint, so
# <prefix>.pth must already exist; set it to False for a fresh run.
load_model = True

if load_model:
    model_wrapper.load("{}.pth".format(model_prefix))
    print("Loaded model from {}.pth".format(model_prefix))

training_history = training_wrapper(
    model_wrapper, train_tensors, 
    eval_tensors=dev_tensors,
    optimizer=optim.Adam,
    optimizer_kwargs={
        "lr": 0.1,
        "weight_decay": 1e-2
    },
    n_epochs=n_epochs,
    batch_size=batch_size,
    use_cuda=use_cuda,
    log_file="{}.log".format(model_prefix),
    #early_stopping=0.001,
    save_best=True,
    save_path="{}.pth".format(model_prefix)
)
#model_wrapper.save("{}.pth".format(model_prefix))
# Reload the checkpoint written to save_path (with save_best=True this is
# presumably the best model on the dev set -- confirm in training_wrapper).
model_wrapper.load("{}.pth".format(model_prefix))
321 | 
322 | # In[34]:
323 | 
# Save the train/eval loss curves of this run to <prefix>.pdf.
fig, ax = plt.subplots(1,1)
plot_losses(training_history["training_loss"],
            training_history["evaluation_loss"],
            plot_std=True,
            ax=ax)
ax.legend()
sns.despine(offset=5)
plt.savefig("{}.pdf".format(model_prefix))

# For every split: predict, write <split>.wnut.conll and score that file
# with conlleval.
for title, tensors, corpus in zip(
    ["train", "dev", "test"],
    [train_tensors, dev_tensors, test_tensors],
    [train_corpus, dev_corpus, test_corpus],
                         ):
    predictions = model_wrapper.predict_batch(tensors, title=title)
    print_predictions(corpus, predictions, "%s.wnut.conll" % title, ner_vocab)
    conll_eval(["conlleval", "%s.wnut.conll" % title]) 
341 | 
342 | 
343 | 


--------------------------------------------------------------------------------