├── .gitignore
├── 01-intro
├── bow-pytorch.py
├── bow-simple-pytorch.py
├── bow.ipynb
├── bow.py
├── cbow-pytorch.py
├── cbow.ipynb
├── cbow.py
├── deep-cbow-pytorch-minibatch.py
├── deep-cbow-pytorch.py
├── deep-cbow.ipynb
└── deep-cbow.py
├── 02-lm
├── loglin-lm.py
├── nn-lm-batch.py
├── nn-lm-optim.py
└── nn-lm.py
├── 03-wordemb
├── kwic.py
├── tsne.py
├── wordemb-cbow.py
├── wordemb-skip.py
└── wordemb-vis-tsne.py
├── 04-efficiency
├── slow-impl.py
├── wordemb-skip-binary.py
└── wordemb-skip-ns.py
├── 05-cnn
├── cnn-activation.py
└── cnn-class.py
├── 06-rnn
├── lm-lstm.py
├── lm-minibatch.py
├── sentiment-lstm.py
└── sentiment-rnn.py
├── 07-sentrep
└── text-retrieval.py
├── 08-condlm
├── batched_enc_dec.py
├── bleu.py
└── enc_dec.py
├── 09-attention
├── batched_attention.py
└── plot_attention.py
├── 10-structured
├── bilstm-tagger.py
└── bilstm-variant-tagger.py
├── 12-transitionparsing
├── feed_forward.py
├── oracle.py
├── stack_lstm.py
└── tree_parser.py
├── 13-graphparsing
├── biaffine.py
├── biaffine_parser.py
└── mst.py
├── 14-semparsing
└── ucca
│ ├── .appveyor.yml
│ ├── .gitignore
│ ├── .travis.yml
│ ├── LICENSE.txt
│ ├── README.md
│ ├── actions.py
│ ├── ci
│ ├── deploy.sh
│ └── test.sh
│ ├── doc
│ ├── README
│ ├── short_defs.pdf
│ └── toy.xml
│ ├── oracle.py
│ ├── runner.py
│ ├── scripts
│ ├── __init__.py
│ ├── annotate.py
│ ├── convert_and_evaluate.py
│ ├── count_parents_children.py
│ ├── distances
│ │ └── align.py
│ ├── evaluate_db.py
│ ├── evaluate_standard.py
│ ├── find_constructions.py
│ ├── join_passages.py
│ ├── join_sdp.py
│ ├── pickle_to_standard.py
│ ├── site_to_standard.py
│ ├── split_corpus.py
│ ├── standard_to_pickle.py
│ ├── standard_to_sentences.py
│ ├── statistics.py
│ ├── ucca_db.py
│ ├── unique_roles.py
│ └── visualize.py
│ ├── setup.cfg
│ ├── setup.py
│ ├── test_files
│ ├── site1.xml
│ ├── site2.xml
│ ├── site3.xml
│ ├── standard3.conll
│ ├── standard3.conll.xml
│ ├── standard3.export
│ ├── standard3.export.xml
│ ├── standard3.sdp
│ ├── standard3.sdp.xml
│ └── standard3.xml
│ ├── ucca
│ ├── README.md
│ ├── __init__.py
│ ├── constructions.py
│ ├── convert.py
│ ├── core.py
│ ├── diffutil.py
│ ├── evaluation.py
│ ├── ioutil.py
│ ├── layer0.py
│ ├── layer1.py
│ ├── tests
│ │ ├── __init__.py
│ │ └── test_ucca.py
│ ├── textutil.py
│ └── visualization.py
│ └── uccaapp
│ ├── __init__.py
│ ├── api.py
│ ├── convert_and_evaluate.py
│ ├── download_task.py
│ └── upload_task.py
├── 15-vae
└── vae-lm.py
├── 16-reinforce
└── bilstm-tagger.py
├── COPYING
├── README.md
└── data
├── README.md
├── classes
├── dev.txt
├── test.txt
└── train.txt
├── parallel
├── dev.en
├── dev.ja
├── test.en
├── test.ja
├── train.en
└── train.ja
├── parsing
├── graph
│ ├── ptb_dev.txt
│ └── ptb_train.txt
├── shift_reduce
│ ├── small-dev.txt
│ ├── small-dev.unk.txt
│ ├── small-test.txt
│ ├── small-test.unk.txt
│ ├── small-train.txt
│ ├── small-train.unk.txt
│ └── vocab.txt
└── trees
│ ├── dev.txt
│ ├── test.txt
│ └── train.txt
├── ptb
├── test.txt
├── train.txt
└── valid.txt
├── tags
├── dev.txt
└── train.txt
└── trees
├── dev.txt
├── test.txt
└── train.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | *.log
3 | *.swp
4 | .DS_Store
5 | __MACOSX
6 | __pycache__
7 | 03-wordemb/*.txt
8 | 03-wordemb/*.png
9 | 04-efficiency/*.txt
10 | 04-efficiency/*.png
11 | 09-attention/*.png
12 |
13 | *.pyc
14 |
15 | .idea/
16 | *.iml
17 |
--------------------------------------------------------------------------------
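Every classifier under 01-intro below reads ../data/classes/{train,test}.txt with the same idiom: each line holds a tag and a lowercased sentence separated by " ||| ", words and tags are mapped to integer ids through a defaultdict, and after the training set is read the word vocabulary is frozen so that unseen dev/test words fall back to an unknown-word id. A minimal, self-contained sketch of that idiom (the input line and the "<unk>" placeholder name are illustrative, not taken from the repository data):

from collections import defaultdict

w2i = defaultdict(lambda: len(w2i))   # every new word gets the next free id
t2i = defaultdict(lambda: len(t2i))   # same for tags
UNK = w2i["<unk>"]                    # reserve an id for unknown words

def read_line(line):
    tag, words = line.lower().strip().split(" ||| ")
    return [w2i[x] for x in words.split(" ")], t2i[tag]

word_ids, tag_id = read_line("3 ||| a cheerful example sentence")  # hypothetical input line
w2i = defaultdict(lambda: UNK, w2i)   # freeze the vocabulary: unseen words now map to UNK
print(word_ids, tag_id, w2i["never-seen-word"])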
/01-intro/bow-pytorch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | BOW 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | import time 14 | import random 15 | import torch 16 | from torch.autograd import Variable 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | torch.manual_seed(1) 20 | 21 | 22 | # Functions to read in the corpus 23 | w2i = defaultdict(lambda: len(w2i)) 24 | t2i = defaultdict(lambda: len(t2i)) 25 | UNK = w2i[""] 26 | 27 | 28 | def read_dataset(filename): 29 | with open(filename, "r") as f: 30 | for line in f: 31 | tag, words = line.lower().strip().split(" ||| ") 32 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 33 | 34 | 35 | # Read in the data 36 | train = list(read_dataset("../data/classes/train.txt")) 37 | w2i = defaultdict(lambda: UNK, w2i) 38 | dev = list(read_dataset("../data/classes/test.txt")) 39 | nwords = len(w2i) 40 | ntags = len(t2i) 41 | 42 | 43 | class BOW(nn.Module): 44 | 45 | def __init__(self, vocab_size, embedding_dim): 46 | super(BOW, self).__init__() 47 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 48 | self.bias = nn.Parameter(torch.zeros(embedding_dim), requires_grad=True) 49 | 50 | def forward(self, inputs): 51 | embeds = self.embeddings(inputs) 52 | logits = torch.sum(embeds, 1) + self.bias 53 | return logits 54 | 55 | 56 | model = BOW(nwords, ntags) 57 | print(model) 58 | 59 | 60 | def evaluate(model, data): 61 | """Evaluate a model on a data set.""" 62 | correct = 0.0 63 | 64 | for words, tag in data: 65 | lookup_tensor = Variable(torch.LongTensor([words])) 66 | scores = model(lookup_tensor) 67 | predict = scores.data.numpy().argmax(axis=1)[0] 68 | 69 | if predict == tag: 70 | correct += 1 71 | 72 | return correct, len(data), correct/len(data) 73 | 74 | 75 | optimizer = optim.SGD(model.parameters(), lr=0.01) 76 | 77 | for ITER in range(100): 78 | 79 | random.shuffle(train) 80 | train_loss = 0.0 81 | start = time.time() 82 | 83 | for words, tag in train: 84 | 85 | # forward pass 86 | lookup_tensor = Variable(torch.LongTensor([words])) 87 | scores = model(lookup_tensor) 88 | loss = nn.CrossEntropyLoss() 89 | target = Variable(torch.LongTensor([tag])) 90 | output = loss(scores, target) 91 | train_loss += output.data[0] 92 | 93 | # backward pass 94 | model.zero_grad() 95 | output.backward() 96 | 97 | # update weights 98 | optimizer.step() 99 | 100 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % 101 | (ITER, train_loss/len(train), time.time()-start)) 102 | 103 | # evaluate 104 | _, _, acc = evaluate(model, dev) 105 | print("iter %r: test acc=%.4f" % (ITER, acc)) 106 | 107 | -------------------------------------------------------------------------------- /01-intro/bow-simple-pytorch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | BOW (simple version) 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | import time 14 | import random 15 | import torch 16 | from torch.autograd import Variable 17 | import torch.nn as nn 18 | 19 | torch.manual_seed(1) 20 | 21 | 22 | # Functions to read in the corpus 23 | w2i = defaultdict(lambda: len(w2i)) 24 | t2i = 
defaultdict(lambda: len(t2i)) 25 | UNK = w2i[""] 26 | 27 | 28 | def read_dataset(filename): 29 | with open(filename, "r") as f: 30 | for line in f: 31 | tag, words = line.lower().strip().split(" ||| ") 32 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 33 | 34 | 35 | # Read in the data 36 | train = list(read_dataset("../data/classes/train.txt")) 37 | w2i = defaultdict(lambda: UNK, w2i) 38 | dev = list(read_dataset("../data/classes/test.txt")) 39 | nwords = len(w2i) 40 | ntags = len(t2i) 41 | 42 | 43 | # The parameters for our BoW-model 44 | dtype = torch.FloatTensor # enable CUDA here if you like 45 | w = Variable(torch.randn(nwords, ntags).type(dtype), requires_grad=True) 46 | b = Variable(torch.randn(ntags).type(dtype), requires_grad=True) 47 | 48 | 49 | # A function to calculate scores for one sentence 50 | def calc_scores(words): 51 | lookup_tensor = Variable(torch.LongTensor(words)) 52 | embed = w[lookup_tensor] 53 | score = torch.sum(embed, 0) + b 54 | return score.view((1, -1)) 55 | 56 | 57 | for ITER in range(100): 58 | 59 | # train 60 | random.shuffle(train) 61 | train_loss = 0.0 62 | start = time.time() 63 | 64 | for words, tag in train: 65 | 66 | # forward pass 67 | scores = calc_scores(words) 68 | target = Variable(torch.LongTensor([tag])) 69 | loss = nn.CrossEntropyLoss() 70 | output = loss(scores, target) 71 | train_loss += output.data[0] 72 | 73 | # backward pass (compute gradients) 74 | output.backward() 75 | 76 | # update weights with SGD 77 | lr = 0.01 78 | w.data -= lr * w.grad.data 79 | b.data -= lr * b.grad.data 80 | 81 | # clear gradients for next step 82 | w.grad.data.zero_() 83 | b.grad.data.zero_() 84 | 85 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % 86 | (ITER, train_loss/len(train), time.time()-start)) 87 | 88 | # evaluate 89 | correct = 0.0 90 | for words, tag in dev: 91 | scores = calc_scores(words) 92 | predict = scores.data.numpy().argmax(axis=1) 93 | if predict == tag: 94 | correct += 1 95 | 96 | print("iter %r: test acc=%.4f" % 97 | (ITER, correct/len(dev))) 98 | -------------------------------------------------------------------------------- /01-intro/bow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from collections import defaultdict\n", 12 | "import time\n", 13 | "import random\n", 14 | "import dynet as dy\n", 15 | "import numpy as np" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# Functions to read in the corpus\n", 27 | "w2i = defaultdict(lambda: len(w2i))\n", 28 | "t2i = defaultdict(lambda: len(t2i))\n", 29 | "UNK = w2i[\"\"]\n", 30 | "def read_dataset(filename):\n", 31 | " with open(filename, \"r\") as f:\n", 32 | " for line in f:\n", 33 | " tag, words = line.lower().strip().split(\" ||| \")\n", 34 | " yield ([w2i[x] for x in words.split(\" \")], t2i[tag])" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "# Read in the data\n", 46 | "train = list(read_dataset(\"../data/classes/train.txt\"))\n", 47 | "w2i = defaultdict(lambda: UNK, w2i)\n", 48 | "dev = list(read_dataset(\"../data/classes/test.txt\"))\n", 49 | "nwords = len(w2i)\n", 50 | "ntags = len(t2i)" 51 | ] 52 | }, 53 | { 54 | 
"cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "train[0][1]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "# Start DyNet and define trainer\n", 73 | "model = dy.Model()\n", 74 | "trainer = dy.AdamTrainer(model)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "# Define the model\n", 86 | "W_sm = model.add_lookup_parameters((nwords, ntags)) # Word weights\n", 87 | "b_sm = model.add_parameters((ntags)) # Softmax bias" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# A function to calculate scores for one value\n", 99 | "def calc_scores(words):\n", 100 | " dy.renew_cg()\n", 101 | " score = dy.esum([dy.lookup(W_sm, x) for x in words])\n", 102 | " b_sm_exp = dy.parameter(b_sm)\n", 103 | " return score + b_sm_exp" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false, 111 | "scrolled": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "for ITER in range(100):\n", 116 | " # Perform training\n", 117 | " random.shuffle(train)\n", 118 | " train_loss = 0.0\n", 119 | " start = time.time()\n", 120 | " for words, tag in train:\n", 121 | " my_loss = dy.pickneglogsoftmax(calc_scores(words), tag)\n", 122 | " train_loss += my_loss.value()\n", 123 | " my_loss.backward()\n", 124 | " trainer.update()\n", 125 | " print(\"iter %r: train loss/sent=%.4f, time=%.2fs\" % (ITER, train_loss/len(train), time.time()-start))\n", 126 | " # Perform testing\n", 127 | " test_correct = 0.0\n", 128 | " for words, tag in dev:\n", 129 | " scores = calc_scores(words).npvalue()\n", 130 | " predict = np.argmax(scores)\n", 131 | " if predict == tag:\n", 132 | " test_correct += 1\n", 133 | " print(\"iter %r: test acc=%.4f\" % (ITER, test_correct/len(dev)))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "anaconda-cloud": {}, 148 | "kernelspec": { 149 | "display_name": "Python 3", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.6.0" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 2 168 | } 169 | -------------------------------------------------------------------------------- /01-intro/bow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | def read_dataset(filename): 12 | with open(filename, "r") as f: 13 | for line in f: 14 | tag, words = line.lower().strip().split(" ||| ") 15 | yield ([w2i[x] for x in words.split(" ")], 
t2i[tag]) 16 | 17 | # Read in the data 18 | train = list(read_dataset("../data/classes/train.txt")) 19 | w2i = defaultdict(lambda: UNK, w2i) 20 | dev = list(read_dataset("../data/classes/test.txt")) 21 | nwords = len(w2i) 22 | ntags = len(t2i) 23 | 24 | # Start DyNet and define trainer 25 | model = dy.Model() 26 | trainer = dy.AdamTrainer(model) 27 | 28 | # Define the model 29 | W_sm = model.add_lookup_parameters((nwords, ntags)) # Word weights 30 | b_sm = model.add_parameters((ntags)) # Softmax bias 31 | 32 | # A function to calculate scores for one value 33 | def calc_scores(words): 34 | # Create a computation graph, and add parameters 35 | dy.renew_cg() 36 | b_sm_exp = dy.parameter(b_sm) 37 | # Take the sum of all the embedding vectors for each word 38 | score = dy.esum([dy.lookup(W_sm, x) for x in words]) 39 | # Add the bias vector and return 40 | return score + b_sm_exp 41 | 42 | for ITER in range(100): 43 | # Perform training 44 | random.shuffle(train) 45 | train_loss = 0.0 46 | start = time.time() 47 | for words, tag in train: 48 | my_loss = dy.pickneglogsoftmax(calc_scores(words), tag) 49 | train_loss += my_loss.value() 50 | my_loss.backward() 51 | trainer.update() 52 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 53 | # Perform testing 54 | test_correct = 0.0 55 | for words, tag in dev: 56 | scores = calc_scores(words).npvalue() 57 | predict = np.argmax(scores) 58 | if predict == tag: 59 | test_correct += 1 60 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 61 | -------------------------------------------------------------------------------- /01-intro/cbow-pytorch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | CBOW 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | import time 14 | import random 15 | import torch 16 | from torch.autograd import Variable 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | 20 | torch.manual_seed(1) 21 | 22 | 23 | # Functions to read in the corpus 24 | w2i = defaultdict(lambda: len(w2i)) 25 | t2i = defaultdict(lambda: len(t2i)) 26 | UNK = w2i[""] 27 | 28 | 29 | def read_dataset(filename): 30 | with open(filename, "r") as f: 31 | for line in f: 32 | tag, words = line.lower().strip().split(" ||| ") 33 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 34 | 35 | 36 | # Read in the data 37 | train = list(read_dataset("../data/classes/train.txt")) 38 | w2i = defaultdict(lambda: UNK, w2i) 39 | dev = list(read_dataset("../data/classes/test.txt")) 40 | nwords = len(w2i) 41 | ntags = len(t2i) 42 | 43 | 44 | class CBOW(nn.Module): 45 | 46 | def __init__(self, vocab_size, embedding_dim, output_dim): 47 | super(CBOW, self).__init__() 48 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 49 | self.linear = nn.Linear(embedding_dim, output_dim) 50 | 51 | def forward(self, inputs): 52 | embeds = self.embeddings(inputs) 53 | bow = torch.sum(embeds, 1) 54 | logits = self.linear(bow) 55 | return logits 56 | 57 | 58 | model = CBOW(nwords, 64, ntags) 59 | print(model) 60 | 61 | 62 | def evaluate(model, data): 63 | """Evaluate a model on a data set.""" 64 | correct = 0.0 65 | 66 | for words, tag in data: 67 | lookup_tensor = Variable(torch.LongTensor([words])) 68 | scores = model(lookup_tensor) 69 | predict = scores.data.numpy().argmax(axis=1)[0] 
70 | 71 | if predict == tag: 72 | correct += 1 73 | 74 | return correct, len(data), correct/len(data) 75 | 76 | 77 | optimizer = optim.SGD(model.parameters(), lr=0.001) 78 | 79 | for ITER in range(100): 80 | 81 | random.shuffle(train) 82 | train_loss = 0.0 83 | start = time.time() 84 | 85 | for words, tag in train: 86 | 87 | # forward pass 88 | lookup_tensor = Variable(torch.LongTensor([words])) 89 | scores = model(lookup_tensor) 90 | loss = nn.CrossEntropyLoss() 91 | target = Variable(torch.LongTensor([tag])) 92 | output = loss(scores, target) 93 | train_loss += output.data[0] 94 | 95 | # backward pass 96 | model.zero_grad() 97 | output.backward() 98 | 99 | # update weights 100 | optimizer.step() 101 | 102 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % 103 | (ITER, train_loss/len(train), time.time()-start)) 104 | 105 | # evaluate 106 | _, _, acc = evaluate(model, dev) 107 | print("iter %r: test acc=%.4f" % (ITER, acc)) 108 | -------------------------------------------------------------------------------- /01-intro/cbow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from collections import defaultdict\n", 12 | "import time\n", 13 | "import random\n", 14 | "import dynet as dy\n", 15 | "import numpy as np\n", 16 | "\n", 17 | "# Functions to read in the corpus\n", 18 | "w2i = defaultdict(lambda: len(w2i))\n", 19 | "t2i = defaultdict(lambda: len(t2i))\n", 20 | "UNK = w2i[\"\"]\n", 21 | "def read_dataset(filename):\n", 22 | " with open(filename, \"r\") as f:\n", 23 | " for line in f:\n", 24 | " tag, words = line.lower().strip().split(\" ||| \")\n", 25 | " yield ([w2i[x] for x in words.split(\" \")], t2i[tag])\n", 26 | "\n", 27 | "# Read in the data\n", 28 | "train = list(read_dataset(\"../data/classes/train.txt\"))\n", 29 | "w2i = defaultdict(lambda: UNK, w2i)\n", 30 | "dev = list(read_dataset(\"../data/classes/test.txt\"))\n", 31 | "nwords = len(w2i)\n", 32 | "ntags = len(t2i)\n", 33 | "\n", 34 | "# Start DyNet and define trainer\n", 35 | "model = dy.Model()\n", 36 | "trainer = dy.AdamTrainer(model)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# Define the model\n", 48 | "EMB_SIZE = 64\n", 49 | "W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings\n", 50 | "W_sm = model.add_parameters((ntags, EMB_SIZE)) # Softmax weights\n", 51 | "b_sm = model.add_parameters((ntags)) # Softmax bias" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# A function to calculate scores for one value\n", 63 | "def calc_scores(words):\n", 64 | " dy.renew_cg()\n", 65 | " cbow = dy.esum([dy.lookup(W_emb, x) for x in words])\n", 66 | " W_sm_exp = dy.parameter(W_sm)\n", 67 | " b_sm_exp = dy.parameter(b_sm)\n", 68 | " return W_sm_exp * cbow + b_sm_exp" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "for ITER in range(100):\n", 80 | " # Perform training\n", 81 | " random.shuffle(train)\n", 82 | " train_loss = 0.0\n", 83 | " start = time.time()\n", 84 | " for words, tag in train:\n", 85 | " my_loss = 
dy.pickneglogsoftmax(calc_scores(words), tag)\n", 86 | " train_loss += my_loss.value()\n", 87 | " my_loss.backward()\n", 88 | " trainer.update()\n", 89 | " print(\"iter %r: train loss/sent=%.4f, time=%.2fs\" % (ITER, train_loss/len(train), time.time()-start))\n", 90 | " # Perform testing\n", 91 | " test_correct = 0.0\n", 92 | " for words, tag in dev:\n", 93 | " scores = calc_scores(words).npvalue()\n", 94 | " predict = np.argmax(scores)\n", 95 | " if predict == tag:\n", 96 | " test_correct += 1\n", 97 | " print(\"iter %r: test acc=%.4f\" % (ITER, test_correct/len(dev)))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [] 108 | } 109 | ], 110 | "metadata": { 111 | "anaconda-cloud": {}, 112 | "kernelspec": { 113 | "display_name": "Python 3", 114 | "language": "python", 115 | "name": "python3" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.6.0" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 2 132 | } 133 | -------------------------------------------------------------------------------- /01-intro/cbow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | def read_dataset(filename): 12 | with open(filename, "r") as f: 13 | for line in f: 14 | tag, words = line.lower().strip().split(" ||| ") 15 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 16 | 17 | # Read in the data 18 | train = list(read_dataset("../data/classes/train.txt")) 19 | w2i = defaultdict(lambda: UNK, w2i) 20 | dev = list(read_dataset("../data/classes/test.txt")) 21 | nwords = len(w2i) 22 | ntags = len(t2i) 23 | 24 | # Start DyNet and define trainer 25 | model = dy.Model() 26 | trainer = dy.AdamTrainer(model) 27 | 28 | # Define the model 29 | EMB_SIZE = 64 30 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 31 | W_sm = model.add_parameters((ntags, EMB_SIZE)) # Softmax weights 32 | b_sm = model.add_parameters((ntags)) # Softmax bias 33 | 34 | # A function to calculate scores for one value 35 | def calc_scores(words): 36 | dy.renew_cg() 37 | cbow = dy.esum([dy.lookup(W_emb, x) for x in words]) 38 | W_sm_exp = dy.parameter(W_sm) 39 | b_sm_exp = dy.parameter(b_sm) 40 | return W_sm_exp * cbow + b_sm_exp 41 | 42 | for ITER in range(100): 43 | # Perform training 44 | random.shuffle(train) 45 | train_loss = 0.0 46 | start = time.time() 47 | for words, tag in train: 48 | my_loss = dy.pickneglogsoftmax(calc_scores(words), tag) 49 | train_loss += my_loss.value() 50 | my_loss.backward() 51 | trainer.update() 52 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 53 | # Perform testing 54 | test_correct = 0.0 55 | for words, tag in dev: 56 | scores = calc_scores(words).npvalue() 57 | predict = np.argmax(scores) 58 | if predict == tag: 59 | test_correct += 1 60 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 61 | 
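The *-pytorch.py variants above are written against the old 0.3-era PyTorch API, wrapping tensors in torch.autograd.Variable and reading losses with loss.data[0]. On current PyTorch the same CBOW training step can be written more directly; a minimal sketch, assuming PyTorch >= 1.0, with toy sizes chosen only for illustration (this is not part of the repository):

import torch
import torch.nn as nn

class CBOW(nn.Module):
    """Sum word embeddings, then project to tag scores (same shape of model as cbow-pytorch.py)."""
    def __init__(self, vocab_size, emb_dim, ntags):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.out = nn.Linear(emb_dim, ntags)

    def forward(self, word_ids):              # word_ids: (batch, seq_len)
        return self.out(self.emb(word_ids).sum(dim=1))

model = CBOW(vocab_size=1000, emb_dim=64, ntags=5)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

words = torch.tensor([[1, 2, 3, 4]])          # one sentence of word ids; no Variable wrapper needed
tag = torch.tensor([2])                       # its gold label
loss = criterion(model(words), tag)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(loss.item())                            # .item() replaces loss.data[0]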
-------------------------------------------------------------------------------- /01-intro/deep-cbow-pytorch-minibatch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | Deep CBOW (with minibatching) 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | from collections import namedtuple 14 | import time 15 | import random 16 | import torch 17 | from torch.autograd import Variable 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | import torch.optim as optim 21 | 22 | torch.manual_seed(1) 23 | random.seed(1) 24 | 25 | 26 | CUDA = torch.cuda.is_available() 27 | print("CUDA: %s" % CUDA) 28 | 29 | 30 | # Functions to read in the corpus 31 | w2i = defaultdict(lambda: len(w2i)) 32 | t2i = defaultdict(lambda: len(t2i)) 33 | UNK = w2i[""] 34 | PAD = w2i[""] 35 | 36 | # One data point 37 | Example = namedtuple("Example", ["words", "tag"]) 38 | 39 | 40 | def read_dataset(filename): 41 | with open(filename, "r") as f: 42 | for line in f: 43 | tag, words = line.lower().strip().split(" ||| ") 44 | yield Example(words=[w2i[x] for x in words.split(" ")], 45 | tag=t2i[tag]) 46 | 47 | 48 | # Read in the data 49 | train = list(read_dataset("../data/classes/train.txt")) 50 | w2i = defaultdict(lambda: UNK, w2i) 51 | dev = list(read_dataset("../data/classes/test.txt")) 52 | nwords = len(w2i) 53 | ntags = len(t2i) 54 | 55 | 56 | class DeepCBOW(nn.Module): 57 | """ 58 | Deep CBOW model 59 | """ 60 | 61 | def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim): 62 | super(DeepCBOW, self).__init__() 63 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 64 | self.linear1 = nn.Linear(embedding_dim, hidden_dim) 65 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 66 | self.linear3 = nn.Linear(hidden_dim, output_dim) 67 | 68 | def forward(self, inputs): 69 | embeds = self.embeddings(inputs) 70 | h = torch.sum(embeds, 1) 71 | h = F.relu(self.linear1(h)) 72 | h = F.relu(self.linear2(h)) 73 | h = self.linear3(h) 74 | return h 75 | 76 | 77 | model = DeepCBOW(nwords, 64, 64, ntags) 78 | 79 | if CUDA: 80 | model.cuda() 81 | 82 | print(model) 83 | 84 | 85 | def minibatch(data, batch_size=32): 86 | for i in range(0, len(data), batch_size): 87 | yield data[i:i+batch_size] 88 | 89 | 90 | def evaluate(model, data): 91 | """Evaluate a model on a data set.""" 92 | correct = 0.0 93 | 94 | for batch in minibatch(data): 95 | 96 | seqs, tags = preprocess(batch) 97 | scores = model(get_variable(seqs)) 98 | _, predictions = torch.max(scores.data, 1) 99 | targets = get_variable(tags) 100 | 101 | correct += torch.eq(predictions, targets).sum().data[0] 102 | 103 | return correct, len(data), correct/len(data) 104 | 105 | 106 | def get_variable(x): 107 | """Get a Variable given indices x""" 108 | tensor = torch.cuda.LongTensor(x) if CUDA else torch.LongTensor(x) 109 | return Variable(tensor) 110 | 111 | 112 | def preprocess(batch): 113 | """ Add zero-padding to a batch. 
""" 114 | 115 | tags = [example.tag for example in batch] 116 | 117 | # add zero-padding to make all sequences equally long 118 | seqs = [example.words for example in batch] 119 | max_length = max(map(len, seqs)) 120 | seqs = [seq + [PAD] * (max_length - len(seq)) for seq in seqs] 121 | 122 | return seqs, tags 123 | 124 | 125 | optimizer = optim.Adam(model.parameters(), lr=0.001) 126 | 127 | for ITER in range(100): 128 | 129 | random.shuffle(train) 130 | train_loss = 0.0 131 | start = time.time() 132 | updates = 0 133 | 134 | for batch in minibatch(train): 135 | 136 | updates += 1 137 | 138 | # pad data with zeros 139 | seqs, tags = preprocess(batch) 140 | 141 | # forward pass 142 | scores = model(get_variable(seqs)) 143 | targets = get_variable(tags) 144 | loss = nn.CrossEntropyLoss() 145 | output = loss(scores, targets) 146 | train_loss += output.data[0] 147 | 148 | # backward pass 149 | model.zero_grad() 150 | output.backward() 151 | 152 | # update weights 153 | optimizer.step() 154 | 155 | print("iter %r: avg train loss=%.4f, time=%.2fs" % 156 | (ITER, train_loss/updates, time.time()-start)) 157 | 158 | # evaluate 159 | _, _, acc_train = evaluate(model, train) 160 | _, _, acc_dev = evaluate(model, dev) 161 | print("iter %r: train acc=%.4f test acc=%.4f" % (ITER, acc_train, acc_dev)) 162 | -------------------------------------------------------------------------------- /01-intro/deep-cbow-pytorch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | Deep CBOW 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | import time 14 | import random 15 | import torch 16 | from torch.autograd import Variable 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | import torch.optim as optim 20 | 21 | torch.manual_seed(1) 22 | 23 | 24 | # Functions to read in the corpus 25 | w2i = defaultdict(lambda: len(w2i)) 26 | t2i = defaultdict(lambda: len(t2i)) 27 | UNK = w2i[""] 28 | 29 | 30 | def read_dataset(filename): 31 | with open(filename, "r") as f: 32 | for line in f: 33 | tag, words = line.lower().strip().split(" ||| ") 34 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 35 | 36 | 37 | # Read in the data 38 | train = list(read_dataset("../data/classes/train.txt")) 39 | w2i = defaultdict(lambda: UNK, w2i) 40 | dev = list(read_dataset("../data/classes/test.txt")) 41 | nwords = len(w2i) 42 | ntags = len(t2i) 43 | 44 | 45 | class DeepCBOW(nn.Module): 46 | 47 | def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim): 48 | super(DeepCBOW, self).__init__() 49 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 50 | self.linear1 = nn.Linear(embedding_dim, hidden_dim) 51 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 52 | self.linear3 = nn.Linear(hidden_dim, output_dim) 53 | 54 | def forward(self, inputs): 55 | embeds = self.embeddings(inputs) 56 | h = torch.sum(embeds, 1) 57 | h = F.tanh(self.linear1(h)) 58 | h = F.tanh(self.linear2(h)) 59 | h = self.linear3(h) 60 | return h 61 | 62 | 63 | model = DeepCBOW(nwords, 64, 64, ntags) 64 | 65 | 66 | print(model) 67 | 68 | 69 | def evaluate(model, data): 70 | """Evaluate a model on a data set.""" 71 | correct = 0.0 72 | 73 | for words, tag in data: 74 | scores = model(get_tensor([words])) 75 | predict = scores.data.numpy().argmax(axis=1)[0] 76 | 77 | if predict == tag: 78 | correct += 1 79 | 80 | 
return correct, len(data), correct/len(data) 81 | 82 | 83 | def get_tensor(x): 84 | """Get a Variable given indices x""" 85 | return Variable(torch.LongTensor(x)) 86 | 87 | 88 | optimizer = optim.Adam(model.parameters(), lr=0.001) 89 | 90 | for ITER in range(100): 91 | 92 | random.shuffle(train) 93 | train_loss = 0.0 94 | start = time.time() 95 | 96 | for words, tag in train: 97 | 98 | # forward pass 99 | scores = model(get_tensor([words])) 100 | 101 | loss = nn.CrossEntropyLoss() 102 | target = get_tensor([tag]) 103 | output = loss(scores, target) 104 | train_loss += output.data[0] 105 | 106 | # backward pass 107 | model.zero_grad() 108 | output.backward() 109 | 110 | # update weights 111 | optimizer.step() 112 | 113 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % 114 | (ITER, train_loss/len(train), time.time()-start)) 115 | 116 | # evaluate 117 | _, _, acc = evaluate(model, dev) 118 | print("iter %r: test acc=%.4f" % (ITER, acc)) 119 | -------------------------------------------------------------------------------- /01-intro/deep-cbow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from collections import defaultdict\n", 12 | "import time\n", 13 | "import random\n", 14 | "import dynet as dy\n", 15 | "import numpy as np\n", 16 | "\n", 17 | "# Functions to read in the corpus\n", 18 | "w2i = defaultdict(lambda: len(w2i))\n", 19 | "t2i = defaultdict(lambda: len(t2i))\n", 20 | "UNK = w2i[\"\"]\n", 21 | "def read_dataset(filename):\n", 22 | " with open(filename, \"r\") as f:\n", 23 | " for line in f:\n", 24 | " tag, words = line.lower().strip().split(\" ||| \")\n", 25 | " yield ([w2i[x] for x in words.split(\" \")], t2i[tag])\n", 26 | "\n", 27 | "# Read in the data\n", 28 | "train = list(read_dataset(\"../data/classes/train.txt\"))\n", 29 | "w2i = defaultdict(lambda: UNK, w2i)\n", 30 | "dev = list(read_dataset(\"../data/classes/test.txt\"))\n", 31 | "nwords = len(w2i)\n", 32 | "ntags = len(t2i)\n", 33 | "\n", 34 | "# Start DyNet and define trainer\n", 35 | "model = dy.Model()\n", 36 | "trainer = dy.AdamTrainer(model)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# Define the model\n", 48 | "EMB_SIZE = 64\n", 49 | "HID_SIZE = 64\n", 50 | "HID_LAY = 2\n", 51 | "W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings\n", 52 | "W_h = [model.add_parameters((HID_SIZE, EMB_SIZE if lay == 0 else HID_SIZE)) for lay in range(HID_LAY)]\n", 53 | "b_h = [model.add_parameters((HID_SIZE)) for lay in range(HID_LAY)]\n", 54 | "W_sm = model.add_parameters((ntags, HID_SIZE)) # Softmax weights\n", 55 | "b_sm = model.add_parameters((ntags)) # Softmax bias" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "# A function to calculate scores for one value\n", 67 | "def calc_scores(words):\n", 68 | " dy.renew_cg()\n", 69 | " h = dy.esum([dy.lookup(W_emb, x) for x in words])\n", 70 | " for W_h_i, b_h_i in zip(W_h, b_h):\n", 71 | " h = dy.tanh( dy.parameter(W_h_i) * h + dy.parameter(b_h_i) )\n", 72 | " return dy.parameter(W_sm) * h + dy.parameter(b_sm)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 
79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "for ITER in range(100):\n", 84 | " # Perform training\n", 85 | " random.shuffle(train)\n", 86 | " train_loss = 0.0\n", 87 | " start = time.time()\n", 88 | " for words, tag in train:\n", 89 | " my_loss = dy.pickneglogsoftmax(calc_scores(words), tag)\n", 90 | " train_loss += my_loss.value()\n", 91 | " my_loss.backward()\n", 92 | " trainer.update()\n", 93 | " print(\"iter %r: train loss/sent=%.4f, time=%.2fs\" % (ITER, train_loss/len(train), time.time()-start))\n", 94 | " # Perform testing\n", 95 | " test_correct = 0.0\n", 96 | " for words, tag in dev:\n", 97 | " scores = calc_scores(words).npvalue()\n", 98 | " predict = np.argmax(scores)\n", 99 | " if predict == tag:\n", 100 | " test_correct += 1\n", 101 | " print(\"iter %r: test acc=%.4f\" % (ITER, test_correct/len(dev)))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.6.0" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /01-intro/deep-cbow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | def read_dataset(filename): 12 | with open(filename, "r") as f: 13 | for line in f: 14 | tag, words = line.lower().strip().split(" ||| ") 15 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 16 | 17 | # Read in the data 18 | train = list(read_dataset("../data/classes/train.txt")) 19 | w2i = defaultdict(lambda: UNK, w2i) 20 | dev = list(read_dataset("../data/classes/test.txt")) 21 | nwords = len(w2i) 22 | ntags = len(t2i) 23 | 24 | # Start DyNet and define trainer 25 | model = dy.Model() 26 | trainer = dy.AdamTrainer(model) 27 | 28 | # Define the model 29 | EMB_SIZE = 64 30 | HID_SIZE = 64 31 | HID_LAY = 2 32 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 33 | W_h = [model.add_parameters((HID_SIZE, EMB_SIZE if lay == 0 else HID_SIZE)) for lay in range(HID_LAY)] 34 | b_h = [model.add_parameters((HID_SIZE)) for lay in range(HID_LAY)] 35 | W_sm = model.add_parameters((ntags, HID_SIZE)) # Softmax weights 36 | b_sm = model.add_parameters((ntags)) # Softmax bias 37 | 38 | # A function to calculate scores for one value 39 | def calc_scores(words): 40 | dy.renew_cg() 41 | h = dy.esum([dy.lookup(W_emb, x) for x in words]) 42 | for W_h_i, b_h_i in zip(W_h, b_h): 43 | h = dy.tanh( dy.parameter(W_h_i) * h + dy.parameter(b_h_i) ) 44 | return dy.parameter(W_sm) * h + dy.parameter(b_sm) 45 | 46 | for ITER in range(100): 47 | # Perform training 48 | random.shuffle(train) 49 | train_loss = 0.0 50 | start = time.time() 51 | for words, tag in train: 52 | my_loss = 
dy.pickneglogsoftmax(calc_scores(words), tag) 53 | train_loss += my_loss.value() 54 | my_loss.backward() 55 | trainer.update() 56 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 57 | # Perform training 58 | test_correct = 0.0 59 | for words, tag in dev: 60 | scores = calc_scores(words).npvalue() 61 | predict = np.argmax(scores) 62 | if predict == tag: 63 | test_correct += 1 64 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 65 | -------------------------------------------------------------------------------- /02-lm/loglin-lm.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | # The length of the n-gram 9 | N = 2 10 | 11 | # Functions to read in the corpus 12 | # NOTE: We are using data from the Penn Treebank, which is already converted 13 | # into an easy-to-use format with "" symbols. If we were using other 14 | # data we would have to do pre-processing and consider how to choose 15 | # unknown words, etc. 16 | w2i = defaultdict(lambda: len(w2i)) 17 | S = w2i[""] 18 | UNK = w2i[""] 19 | def read_dataset(filename): 20 | with open(filename, "r") as f: 21 | for line in f: 22 | yield [w2i[x] for x in line.strip().split(" ")] 23 | 24 | # Read in the data 25 | train = list(read_dataset("../data/ptb/train.txt")) 26 | w2i = defaultdict(lambda: UNK, w2i) 27 | dev = list(read_dataset("../data/ptb/valid.txt")) 28 | i2w = {v: k for k, v in w2i.items()} 29 | nwords = len(w2i) 30 | 31 | # Start DyNet and define trainer 32 | model = dy.Model() 33 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 34 | 35 | # Define the model 36 | W_sm = [model.add_lookup_parameters((nwords, nwords)) for _ in range(N)] # Word weights at each position 37 | b_sm = model.add_parameters((nwords)) # Softmax bias 38 | 39 | # A function to calculate scores for one value 40 | def calc_score_of_history(words): 41 | # Create a list of things to sum up with only the bias vector at first 42 | score_vecs = [dy.parameter(b_sm)] 43 | for word_id, lookup_param in zip(words, W_sm): 44 | score_vecs.append(lookup_param[word_id]) 45 | return dy.esum(score_vecs) 46 | 47 | # Calculate the loss value for the entire sentence 48 | def calc_sent_loss(sent): 49 | # Create a computation graph 50 | dy.renew_cg() 51 | # The initial history is equal to end of sentence symbols 52 | hist = [S] * N 53 | # Step through the sentence, including the end of sentence token 54 | all_losses = [] 55 | for next_word in sent + [S]: 56 | s = calc_score_of_history(hist) 57 | all_losses.append(dy.pickneglogsoftmax(s, next_word)) 58 | hist = hist[1:] + [next_word] 59 | return dy.esum(all_losses) 60 | 61 | MAX_LEN = 100 62 | # Generate a sentence 63 | def generate_sent(): 64 | dy.renew_cg() 65 | hist = [S] * N 66 | sent = [] 67 | while True: 68 | p = dy.softmax(calc_score_of_history(hist)).npvalue() 69 | next_word = np.random.choice(nwords, p=p/p.sum()) 70 | if next_word == S or len(sent) == MAX_LEN: 71 | break 72 | sent.append(next_word) 73 | hist = hist[1:] + [next_word] 74 | return sent 75 | 76 | for ITER in range(100): 77 | # Perform training 78 | random.shuffle(train) 79 | train_words, train_loss = 0, 0.0 80 | start = time.time() 81 | for sent_id, sent in enumerate(train): 82 | my_loss = calc_sent_loss(sent) 83 | train_loss += my_loss.value() 84 | train_words += len(sent) 85 | my_loss.backward() 86 | 
trainer.update() 87 | if (sent_id+1) % 5000 == 0: 88 | print("--finished %r sentences" % (sent_id+1)) 89 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 90 | # Evaluate on dev set 91 | dev_words, dev_loss = 0, 0.0 92 | start = time.time() 93 | for sent_id, sent in enumerate(dev): 94 | my_loss = calc_sent_loss(sent) 95 | dev_loss += my_loss.value() 96 | dev_words += len(sent) 97 | trainer.update() 98 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 99 | # Generate a few sentences 100 | for _ in range(5): 101 | sent = generate_sent() 102 | print(" ".join([i2w[x] for x in sent])) 103 | -------------------------------------------------------------------------------- /02-lm/nn-lm-batch.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N = 2 # The length of the n-gram 9 | EMB_SIZE = 128 # The size of the embedding 10 | HID_SIZE = 128 # The size of the hidden layer 11 | 12 | # Functions to read in the corpus 13 | # NOTE: We are using data from the Penn Treebank, which is already converted 14 | # into an easy-to-use format with "" symbols. If we were using other 15 | # data we would have to do pre-processing and consider how to choose 16 | # unknown words, etc. 17 | w2i = defaultdict(lambda: len(w2i)) 18 | S = w2i[""] 19 | UNK = w2i[""] 20 | def read_dataset(filename): 21 | with open(filename, "r") as f: 22 | for line in f: 23 | yield [w2i[x] for x in line.strip().split(" ")] 24 | 25 | # Read in the data 26 | train = list(read_dataset("../data/ptb/train.txt")) 27 | w2i = defaultdict(lambda: UNK, w2i) 28 | dev = list(read_dataset("../data/ptb/valid.txt")) 29 | i2w = {v: k for k, v in w2i.items()} 30 | nwords = len(w2i) 31 | 32 | # Start DyNet and define trainer 33 | model = dy.Model() 34 | trainer = dy.AdamTrainer(model, alpha=0.001) 35 | 36 | # Define the model 37 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 38 | W_h_p = model.add_parameters((HID_SIZE, EMB_SIZE * N)) # Weights of the softmax 39 | b_h_p = model.add_parameters((HID_SIZE)) # Weights of the softmax 40 | W_sm_p = model.add_parameters((nwords, HID_SIZE)) # Weights of the softmax 41 | b_sm_p = model.add_parameters((nwords)) # Softmax bias 42 | 43 | # A function to calculate scores for one value 44 | def calc_score_of_histories(words, dropout=0.0): 45 | # This will change from a list of histories, to a list of words in each history position 46 | words = np.transpose(words) 47 | # Lookup the embeddings and concatenate them 48 | emb = dy.concatenate([dy.lookup_batch(W_emb, x) for x in words]) 49 | # Create the hidden layer 50 | W_h = dy.parameter(W_h_p) 51 | b_h = dy.parameter(b_h_p) 52 | h = dy.tanh(dy.affine_transform([b_h, W_h, emb])) 53 | # Perform dropout 54 | if dropout != 0.0: 55 | h = dy.dropout(h, dropout) 56 | # Calculate the score and return 57 | W_sm = dy.parameter(W_sm_p) 58 | b_sm = dy.parameter(b_sm_p) 59 | return dy.affine_transform([b_sm, W_sm, h]) 60 | 61 | # Calculate the loss value for the entire sentence 62 | def calc_sent_loss(sent, dropout=0.0): 63 | # Create a computation graph 64 | dy.renew_cg() 65 | # The initial history is equal to end of sentence symbols 66 | hist = [S] * N 67 | # Step through the 
sentence, including the end of sentence token 68 | all_histories = [] 69 | all_targets = [] 70 | for next_word in sent + [S]: 71 | all_histories.append(list(hist)) 72 | all_targets.append(next_word) 73 | hist = hist[1:] + [next_word] 74 | s = calc_score_of_histories(all_histories, dropout=dropout) 75 | return dy.sum_batches(dy.pickneglogsoftmax_batch(s, all_targets)) 76 | 77 | MAX_LEN = 100 78 | # Generate a sentence 79 | def generate_sent(): 80 | dy.renew_cg() 81 | hist = [S] * N 82 | sent = [] 83 | while True: 84 | p = dy.softmax(calc_score_of_histories([hist])).npvalue() 85 | next_word = np.random.choice(nwords, p=p/p.sum()) 86 | if next_word == S or len(sent) == MAX_LEN: 87 | break 88 | sent.append(next_word) 89 | hist = hist[1:] + [next_word] 90 | return sent 91 | 92 | last_dev = 1e20 93 | best_dev = 1e20 94 | 95 | for ITER in range(100): 96 | # Perform training 97 | random.shuffle(train) 98 | train_words, train_loss = 0, 0.0 99 | start = time.time() 100 | for sent_id, sent in enumerate(train): 101 | my_loss = calc_sent_loss(sent, dropout=0.2) 102 | train_loss += my_loss.value() 103 | train_words += len(sent) 104 | my_loss.backward() 105 | trainer.update() 106 | if (sent_id+1) % 5000 == 0: 107 | print("--finished %r sentences" % (sent_id+1)) 108 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 109 | # Evaluate on dev set 110 | dev_words, dev_loss = 0, 0.0 111 | start = time.time() 112 | for sent_id, sent in enumerate(dev): 113 | my_loss = calc_sent_loss(sent) 114 | dev_loss += my_loss.value() 115 | dev_words += len(sent) 116 | trainer.update() 117 | # Keep track of the development accuracy and reduce the learning rate if it got worse 118 | if last_dev < dev_loss: 119 | trainer.learning_rate /= 2 120 | last_dev = dev_loss 121 | # Keep track of the best development accuracy, and save the model only if it's the best one 122 | if best_dev > dev_loss: 123 | model.save("model.txt") 124 | best_dev = dev_loss 125 | # Save the model 126 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 127 | # Generate a few sentences 128 | for _ in range(5): 129 | sent = generate_sent() 130 | print(" ".join([i2w[x] for x in sent])) 131 | -------------------------------------------------------------------------------- /02-lm/nn-lm-optim.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N = 2 # The length of the n-gram 9 | EMB_SIZE = 128 # The size of the embedding 10 | HID_SIZE = 128 # The size of the hidden layer 11 | 12 | # Functions to read in the corpus 13 | # NOTE: We are using data from the Penn Treebank, which is already converted 14 | # into an easy-to-use format with "" symbols. If we were using other 15 | # data we would have to do pre-processing and consider how to choose 16 | # unknown words, etc. 
17 | w2i = defaultdict(lambda: len(w2i)) 18 | S = w2i[""] 19 | UNK = w2i[""] 20 | def read_dataset(filename): 21 | with open(filename, "r") as f: 22 | for line in f: 23 | yield [w2i[x] for x in line.strip().split(" ")] 24 | 25 | # Read in the data 26 | train = list(read_dataset("../data/ptb/train.txt")) 27 | w2i = defaultdict(lambda: UNK, w2i) 28 | dev = list(read_dataset("../data/ptb/valid.txt")) 29 | i2w = {v: k for k, v in w2i.items()} 30 | nwords = len(w2i) 31 | 32 | # Start DyNet and define trainer 33 | model = dy.Model() 34 | 35 | # CHANGE 1: Use Adam instead of Simple SGD 36 | trainer = dy.AdamTrainer(model, alpha=0.001) 37 | 38 | # Define the model 39 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 40 | W_h_p = model.add_parameters((HID_SIZE, EMB_SIZE * N)) # Weights of the softmax 41 | b_h_p = model.add_parameters((HID_SIZE)) # Weights of the softmax 42 | W_sm_p = model.add_parameters((nwords, HID_SIZE)) # Weights of the softmax 43 | b_sm_p = model.add_parameters((nwords)) # Softmax bias 44 | 45 | # A function to calculate scores for one value 46 | def calc_score_of_history(words, dropout=0.0): 47 | # Lookup the embeddings and concatenate them 48 | emb = dy.concatenate([W_emb[x] for x in words]) 49 | # Create the hidden layer 50 | W_h = dy.parameter(W_h_p) 51 | b_h = dy.parameter(b_h_p) 52 | h = dy.tanh(dy.affine_transform([b_h, W_h, emb])) 53 | # CHANGE 2: perform dropout 54 | if dropout != 0.0: 55 | h = dy.dropout(h, dropout) 56 | # Calculate the score and return 57 | W_sm = dy.parameter(W_sm_p) 58 | b_sm = dy.parameter(b_sm_p) 59 | return dy.affine_transform([b_sm, W_sm, h]) 60 | 61 | # Calculate the loss value for the entire sentence 62 | def calc_sent_loss(sent, dropout=0.0): 63 | # Create a computation graph 64 | dy.renew_cg() 65 | # The initial history is equal to end of sentence symbols 66 | hist = [S] * N 67 | # Step through the sentence, including the end of sentence token 68 | all_losses = [] 69 | for next_word in sent + [S]: 70 | s = calc_score_of_history(hist, dropout=dropout) 71 | all_losses.append(dy.pickneglogsoftmax(s, next_word)) 72 | hist = hist[1:] + [next_word] 73 | return dy.esum(all_losses) 74 | 75 | MAX_LEN = 100 76 | # Generate a sentence 77 | def generate_sent(): 78 | dy.renew_cg() 79 | hist = [S] * N 80 | sent = [] 81 | while True: 82 | p = dy.softmax(calc_score_of_history(hist)).npvalue() 83 | next_word = np.random.choice(nwords, p=p/p.sum()) 84 | if next_word == S or len(sent) == MAX_LEN: 85 | break 86 | sent.append(next_word) 87 | hist = hist[1:] + [next_word] 88 | return sent 89 | 90 | last_dev = 1e20 91 | best_dev = 1e20 92 | 93 | for ITER in range(100): 94 | # Perform training 95 | random.shuffle(train) 96 | train_words, train_loss = 0, 0.0 97 | start = time.time() 98 | for sent_id, sent in enumerate(train): 99 | my_loss = calc_sent_loss(sent, dropout=0.2) 100 | train_loss += my_loss.value() 101 | train_words += len(sent) 102 | my_loss.backward() 103 | trainer.update() 104 | if (sent_id+1) % 5000 == 0: 105 | print("--finished %r sentences" % (sent_id+1)) 106 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 107 | # Evaluate on dev set 108 | dev_words, dev_loss = 0, 0.0 109 | start = time.time() 110 | for sent_id, sent in enumerate(dev): 111 | my_loss = calc_sent_loss(sent) 112 | dev_loss += my_loss.value() 113 | dev_words += len(sent) 114 | trainer.update() 115 | # CHANGE 3: Keep track of the development 
accuracy and reduce the learning rate if it got worse 116 | if last_dev < dev_loss: 117 | trainer.learning_rate /= 2 118 | last_dev = dev_loss 119 | # CHANGE 4: Keep track of the best development accuracy, and save the model only if it's the best one 120 | if best_dev > dev_loss: 121 | model.save("model.txt") 122 | best_dev = dev_loss 123 | # Save the model 124 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 125 | # Generate a few sentences 126 | for _ in range(5): 127 | sent = generate_sent() 128 | print(" ".join([i2w[x] for x in sent])) 129 | -------------------------------------------------------------------------------- /02-lm/nn-lm.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N = 2 # The length of the n-gram 9 | EMB_SIZE = 128 # The size of the embedding 10 | HID_SIZE = 128 # The size of the hidden layer 11 | 12 | # Functions to read in the corpus 13 | # NOTE: We are using data from the Penn Treebank, which is already converted 14 | # into an easy-to-use format with "" symbols. If we were using other 15 | # data we would have to do pre-processing and consider how to choose 16 | # unknown words, etc. 17 | w2i = defaultdict(lambda: len(w2i)) 18 | S = w2i[""] 19 | UNK = w2i[""] 20 | def read_dataset(filename): 21 | with open(filename, "r") as f: 22 | for line in f: 23 | yield [w2i[x] for x in line.strip().split(" ")] 24 | 25 | # Read in the data 26 | train = list(read_dataset("../data/ptb/train.txt")) 27 | w2i = defaultdict(lambda: UNK, w2i) 28 | dev = list(read_dataset("../data/ptb/valid.txt")) 29 | i2w = {v: k for k, v in w2i.items()} 30 | nwords = len(w2i) 31 | 32 | # Start DyNet and define trainer 33 | model = dy.Model() 34 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 35 | 36 | # Define the model 37 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 38 | W_h_p = model.add_parameters((HID_SIZE, EMB_SIZE * N)) # Weights of the softmax 39 | b_h_p = model.add_parameters((HID_SIZE)) # Weights of the softmax 40 | W_sm_p = model.add_parameters((nwords, HID_SIZE)) # Weights of the softmax 41 | b_sm_p = model.add_parameters((nwords)) # Softmax bias 42 | 43 | # A function to calculate scores for one value 44 | def calc_score_of_history(words): 45 | # Lookup the embeddings and concatenate them 46 | emb = dy.concatenate([W_emb[x] for x in words]) 47 | # Create the hidden layer 48 | W_h = dy.parameter(W_h_p) 49 | b_h = dy.parameter(b_h_p) 50 | h = dy.tanh(dy.affine_transform([b_h, W_h, emb])) 51 | # Calculate the score and return 52 | W_sm = dy.parameter(W_sm_p) 53 | b_sm = dy.parameter(b_sm_p) 54 | return dy.affine_transform([b_sm, W_sm, h]) 55 | 56 | # Calculate the loss value for the entire sentence 57 | def calc_sent_loss(sent): 58 | # Create a computation graph 59 | dy.renew_cg() 60 | # The initial history is equal to end of sentence symbols 61 | hist = [S] * N 62 | # Step through the sentence, including the end of sentence token 63 | all_losses = [] 64 | for next_word in sent + [S]: 65 | s = calc_score_of_history(hist) 66 | all_losses.append(dy.pickneglogsoftmax(s, next_word)) 67 | hist = hist[1:] + [next_word] 68 | return dy.esum(all_losses) 69 | 70 | MAX_LEN = 100 71 | # Generate a sentence 72 | def generate_sent(): 73 | dy.renew_cg() 74 | hist = [S] * N 75 | sent = 
[] 76 | while True: 77 | p = dy.softmax(calc_score_of_history(hist)).npvalue() 78 | next_word = np.random.choice(nwords, p=p/p.sum()) 79 | if next_word == S or len(sent) == MAX_LEN: 80 | break 81 | sent.append(next_word) 82 | hist = hist[1:] + [next_word] 83 | return sent 84 | 85 | for ITER in range(100): 86 | # Perform training 87 | random.shuffle(train) 88 | train_words, train_loss = 0, 0.0 89 | start = time.time() 90 | for sent_id, sent in enumerate(train): 91 | my_loss = calc_sent_loss(sent) 92 | train_loss += my_loss.value() 93 | train_words += len(sent) 94 | my_loss.backward() 95 | trainer.update() 96 | if (sent_id+1) % 5000 == 0: 97 | print("--finished %r sentences" % (sent_id+1)) 98 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 99 | # Evaluate on dev set 100 | dev_words, dev_loss = 0, 0.0 101 | start = time.time() 102 | for sent_id, sent in enumerate(dev): 103 | my_loss = calc_sent_loss(sent) 104 | dev_loss += my_loss.value() 105 | dev_words += len(sent) 106 | trainer.update() 107 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 108 | # Generate a few sentences 109 | for _ in range(5): 110 | sent = generate_sent() 111 | print(" ".join([i2w[x] for x in sent])) 112 | -------------------------------------------------------------------------------- /03-wordemb/kwic.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # Usage: 4 | # kwic.py word < corpus.txt > output.tsv 5 | 6 | N = 4 7 | 8 | for line in sys.stdin: 9 | arr = [""] * N + line.strip().split() + [""] * N 10 | for i, w in enumerate(arr): 11 | if w == sys.argv[1]: 12 | print("\t".join(arr[i-N:i+N+1])) 13 | -------------------------------------------------------------------------------- /03-wordemb/tsne.py: -------------------------------------------------------------------------------- 1 | # 2 | # tsne.py 3 | # 4 | # Implementation of t-SNE in Python. The implementation was tested on Python 2.7.10, and it requires a working 5 | # installation of NumPy. The implementation comes with an example on the MNIST dataset. In order to plot the 6 | # results of this example, a working installation of matplotlib is required. 7 | # 8 | # The example can be run by executing: `ipython tsne.py` 9 | # 10 | # 11 | # Created by Laurens van der Maaten on 20-12-08. 12 | # Copyright (c) 2008 Tilburg University. All rights reserved. 
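#
# A minimal usage sketch of the tsne() function defined below, e.g. for plotting
# word embeddings (the array shapes are illustrative, not taken from this repository):
#
#     import numpy as np
#     X = np.random.randn(500, 128)            # 500 embedding vectors of dimension 128
#     Y = tsne(X, no_dims=2, initial_dims=50, perplexity=30.0)
#     # Y has shape (500, 2) and can be passed to pylab's scatter for visualization
#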
13 | 14 | import numpy as Math 15 | import pylab as Plot 16 | 17 | def Hbeta(D = Math.array([]), beta = 1.0): 18 | """Compute the perplexity and the P-row for a specific value of the precision of a Gaussian distribution.""" 19 | 20 | # Compute P-row and corresponding perplexity 21 | P = Math.exp(-D.copy() * beta) 22 | sumP = sum(P) 23 | H = Math.log(sumP) + beta * Math.sum(D * P) / sumP 24 | P = P / sumP 25 | return H, P 26 | 27 | 28 | def x2p(X = Math.array([]), tol = 1e-5, perplexity = 30.0): 29 | """Performs a binary search to get P-values in such a way that each conditional Gaussian has the same perplexity.""" 30 | 31 | # Initialize some variables 32 | print("Computing pairwise distances...") 33 | (n, d) = X.shape 34 | sum_X = Math.sum(Math.square(X), 1) 35 | D = Math.add(Math.add(-2 * Math.dot(X, X.T), sum_X).T, sum_X) 36 | P = Math.zeros((n, n)) 37 | beta = Math.ones((n, 1)) 38 | logU = Math.log(perplexity) 39 | 40 | # Loop over all datapoints 41 | for i in range(n): 42 | 43 | # Print progress 44 | if i % 500 == 0: 45 | print("Computing P-values for point ", i, " of ", n, "...") 46 | 47 | # Compute the Gaussian kernel and entropy for the current precision 48 | betamin = -Math.inf 49 | betamax = Math.inf 50 | Di = D[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))] 51 | (H, thisP) = Hbeta(Di, beta[i]) 52 | 53 | # Evaluate whether the perplexity is within tolerance 54 | Hdiff = H - logU 55 | tries = 0 56 | while Math.abs(Hdiff) > tol and tries < 50: 57 | 58 | # If not, increase or decrease precision 59 | if Hdiff > 0: 60 | betamin = beta[i].copy() 61 | if betamax == Math.inf or betamax == -Math.inf: 62 | beta[i] = beta[i] * 2 63 | else: 64 | beta[i] = (beta[i] + betamax) / 2 65 | else: 66 | betamax = beta[i].copy() 67 | if betamin == Math.inf or betamin == -Math.inf: 68 | beta[i] = beta[i] / 2 69 | else: 70 | beta[i] = (beta[i] + betamin) / 2 71 | 72 | # Recompute the values 73 | (H, thisP) = Hbeta(Di, beta[i]) 74 | Hdiff = H - logU 75 | tries = tries + 1 76 | 77 | # Set the final row of P 78 | P[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))] = thisP 79 | 80 | # Return final P-matrix 81 | print("Mean value of sigma: ", Math.mean(Math.sqrt(1 / beta))) 82 | return P 83 | 84 | 85 | def pca(X = Math.array([]), no_dims = 50): 86 | """Runs PCA on the NxD array X in order to reduce its dimensionality to no_dims dimensions.""" 87 | 88 | print("Preprocessing the data using PCA...") 89 | (n, d) = X.shape 90 | X = X - Math.tile(Math.mean(X, 0), (n, 1)) 91 | (l, M) = Math.linalg.eig(Math.dot(X.T, X)) 92 | Y = Math.dot(X, M[:,0:no_dims]) 93 | return Y 94 | 95 | 96 | def tsne(X = Math.array([]), no_dims = 2, initial_dims = 50, perplexity = 30.0): 97 | """Runs t-SNE on the dataset in the NxD array X to reduce its dimensionality to no_dims dimensions. 
98 | The syntaxis of the function is Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array.""" 99 | 100 | # Check inputs 101 | if isinstance(no_dims, float): 102 | print("Error: array X should have type float.") 103 | return -1 104 | if round(no_dims) != no_dims: 105 | print("Error: number of dimensions should be an integer.") 106 | return -1 107 | 108 | # Initialize variables 109 | X = pca(X, initial_dims).real 110 | (n, d) = X.shape 111 | max_iter = 1000 112 | initial_momentum = 0.5 113 | final_momentum = 0.8 114 | eta = 500 115 | min_gain = 0.01 116 | Y = Math.random.randn(n, no_dims) 117 | dY = Math.zeros((n, no_dims)) 118 | iY = Math.zeros((n, no_dims)) 119 | gains = Math.ones((n, no_dims)) 120 | 121 | # Compute P-values 122 | P = x2p(X, 1e-5, perplexity) 123 | P = P + Math.transpose(P) 124 | P = P / Math.sum(P) 125 | P = P * 4; # early exaggeration 126 | P = Math.maximum(P, 1e-12) 127 | 128 | # Run iterations 129 | for iter in range(max_iter): 130 | 131 | # Compute pairwise affinities 132 | sum_Y = Math.sum(Math.square(Y), 1) 133 | num = 1 / (1 + Math.add(Math.add(-2 * Math.dot(Y, Y.T), sum_Y).T, sum_Y)) 134 | num[range(n), range(n)] = 0 135 | Q = num / Math.sum(num) 136 | Q = Math.maximum(Q, 1e-12) 137 | 138 | # Compute gradient 139 | PQ = P - Q 140 | for i in range(n): 141 | dY[i,:] = Math.sum(Math.tile(PQ[:,i] * num[:,i], (no_dims, 1)).T * (Y[i,:] - Y), 0) 142 | 143 | # Perform the update 144 | if iter < 20: 145 | momentum = initial_momentum 146 | else: 147 | momentum = final_momentum 148 | gains = (gains + 0.2) * ((dY > 0) != (iY > 0)) + (gains * 0.8) * ((dY > 0) == (iY > 0)) 149 | gains[gains < min_gain] = min_gain 150 | iY = momentum * iY - eta * (gains * dY) 151 | Y = Y + iY 152 | Y = Y - Math.tile(Math.mean(Y, 0), (n, 1)) 153 | 154 | # Compute current value of cost function 155 | if (iter + 1) % 10 == 0: 156 | C = Math.sum(P * Math.log(P / Q)) 157 | print("Iteration ", (iter + 1), ": error is ", C) 158 | 159 | # Stop lying about P-values 160 | if iter == 100: 161 | P = P / 4 162 | 163 | # Return solution 164 | return Y 165 | 166 | 167 | if __name__ == "__main__": 168 | print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.") 169 | print("Running example on 2,500 MNIST digits...") 170 | X = Math.loadtxt("mnist2500_X.txt") 171 | labels = Math.loadtxt("mnist2500_labels.txt") 172 | Y = tsne(X, 2, 50, 20.0) 173 | Plot.scatter(Y[:,0], Y[:,1], 20, labels) 174 | Plot.show() 175 | -------------------------------------------------------------------------------- /03-wordemb/wordemb-cbow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N=2 #length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2) 9 | EMB_SIZE = 128 # The size of the embedding 10 | 11 | embeddings_location = "embeddings.txt" #the file to write the word embeddings to 12 | labels_location = "labels.txt" #the file to write the labels to 13 | 14 | # We reuse the data reading from the language modeling class 15 | w2i = defaultdict(lambda: len(w2i)) 16 | S = w2i[""] 17 | UNK = w2i[""] 18 | def read_dataset(filename): 19 | with open(filename, "r") as f: 20 | for line in f: 21 | yield [w2i[x] for x in line.strip().split(" ")] 22 | 23 | # Read in the data 24 | train = list(read_dataset("../data/ptb/train.txt")) 25 | w2i = defaultdict(lambda: UNK, w2i) 26 | dev = 
list(read_dataset("../data/ptb/valid.txt")) 27 | i2w = {v: k for k, v in w2i.items()} 28 | nwords = len(w2i) 29 | 30 | with open(labels_location, 'w') as labels_file: 31 | for i in range(nwords): 32 | labels_file.write(i2w[i] + '\n') 33 | 34 | # Start DyNet and define trainer 35 | model = dy.Model() 36 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 37 | 38 | # Define the model 39 | W_c_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 40 | W_w_p = model.add_parameters((nwords, EMB_SIZE)) # Weights of the softmax 41 | 42 | # Calculate the loss value for the entire sentence 43 | def calc_sent_loss(sent): 44 | # Create a computation graph 45 | dy.renew_cg() 46 | 47 | #add padding to the sentence equal to the size of the window 48 | #as we need to predict the eos as well, the future window at that point is N past it 49 | padded_sent = [S] * N + sent + [S] * N 50 | padded_emb = [W_c_p[x] for x in padded_sent] 51 | 52 | W_w = dy.parameter(W_w_p) 53 | 54 | # Step through the sentence 55 | all_losses = [] 56 | for i in range(N,len(sent)+N): 57 | c = dy.esum(padded_emb[i-N:i] + padded_emb[i+1:i+N+1]) 58 | s = W_w * c 59 | all_losses.append(dy.pickneglogsoftmax(s, padded_sent[i])) 60 | return dy.esum(all_losses) 61 | 62 | MAX_LEN = 100 63 | 64 | for ITER in range(100): 65 | print("started iter %r" % ITER) 66 | # Perform training 67 | random.shuffle(train) 68 | train_words, train_loss = 0, 0.0 69 | start = time.time() 70 | for sent_id, sent in enumerate(train): 71 | my_loss = calc_sent_loss(sent) 72 | train_loss += my_loss.value() 73 | train_words += len(sent) 74 | my_loss.backward() 75 | trainer.update() 76 | if (sent_id+1) % 5000 == 0: 77 | print("--finished %r sentences" % (sent_id+1)) 78 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 79 | # Evaluate on dev set 80 | dev_words, dev_loss = 0, 0.0 81 | start = time.time() 82 | for sent_id, sent in enumerate(dev): 83 | my_loss = calc_sent_loss(sent) 84 | dev_loss += my_loss.value() 85 | dev_words += len(sent) 86 | trainer.update() 87 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 88 | 89 | print("saving embedding files") 90 | with open(embeddings_location, 'w') as embeddings_file: 91 | W_w_np = W_w_p.as_array() 92 | for i in range(nwords): 93 | ith_embedding = '\t'.join(map(str, W_w_np[i])) 94 | embeddings_file.write(ith_embedding + '\n') 95 | -------------------------------------------------------------------------------- /03-wordemb/wordemb-skip.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N=2 #length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2) 9 | EMB_SIZE = 128 # The size of the embedding 10 | 11 | embeddings_location = "embeddings.txt" #the file to write the word embeddings to 12 | labels_location = "labels.txt" #the file to write the labels to 13 | 14 | # We reuse the data reading from the language modeling class 15 | w2i = defaultdict(lambda: len(w2i)) 16 | S = w2i[""] 17 | UNK = w2i[""] 18 | def read_dataset(filename): 19 | with open(filename, "r") as f: 20 | for line in f: 21 | yield [w2i[x] for x in line.strip().split(" ")] 22 | 23 | # Read in the data 24 | train = 
list(read_dataset("../data/ptb/train.txt")) 25 | w2i = defaultdict(lambda: UNK, w2i) 26 | dev = list(read_dataset("../data/ptb/valid.txt")) 27 | i2w = {v: k for k, v in w2i.items()} 28 | nwords = len(w2i) 29 | 30 | with open(labels_location, 'w') as labels_file: 31 | for i in range(nwords): 32 | labels_file.write(i2w[i] + '\n') 33 | 34 | # Start DyNet and define trainer 35 | model = dy.Model() 36 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 37 | 38 | # Define the model 39 | W_c_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 40 | W_w_p = model.add_parameters((nwords, EMB_SIZE)) # Weights of the softmax 41 | 42 | # Calculate the loss value for the entire sentence 43 | def calc_sent_loss(sent): 44 | # Create a computation graph 45 | dy.renew_cg() 46 | 47 | #add padding to the sentence equal to the size of the window 48 | #as we need to predict the eos as well, the future window at that point is N past it 49 | emb = [W_c_p[x] for x in sent] 50 | 51 | W_w = dy.parameter(W_w_p) 52 | 53 | # Step through the sentence 54 | all_losses = [] 55 | for i, my_emb in enumerate(emb): 56 | s = W_w * my_emb 57 | lp = dy.log_softmax(s) 58 | for j in range(1,N+1): 59 | all_losses.append(dy.pick(lp, sent[i-j] if i-j >= 0 else S)) 60 | all_losses.append(dy.pick(lp, sent[i+j] if i+j < len(sent) else S)) 61 | return dy.esum(all_losses) 62 | 63 | MAX_LEN = 100 64 | 65 | for ITER in range(100): 66 | print("started iter %r" % ITER) 67 | # Perform training 68 | random.shuffle(train) 69 | train_words, train_loss = 0, 0.0 70 | start = time.time() 71 | for sent_id, sent in enumerate(train): 72 | my_loss = calc_sent_loss(sent) 73 | train_loss += my_loss.value() 74 | train_words += len(sent) 75 | my_loss.backward() 76 | trainer.update() 77 | if (sent_id+1) % 5000 == 0: 78 | print("--finished %r sentences" % (sent_id+1)) 79 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 80 | # Evaluate on dev set 81 | dev_words, dev_loss = 0, 0.0 82 | start = time.time() 83 | for sent_id, sent in enumerate(dev): 84 | my_loss = calc_sent_loss(sent) 85 | dev_loss += my_loss.value() 86 | dev_words += len(sent) 87 | trainer.update() 88 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 89 | 90 | print("saving embedding files") 91 | with open(embeddings_location, 'w') as embeddings_file: 92 | W_w_np = W_w_p.as_array() 93 | for i in range(nwords): 94 | ith_embedding = '\t'.join(map(str, W_w_np[i])) 95 | embeddings_file.write(ith_embedding + '\n') 96 | -------------------------------------------------------------------------------- /03-wordemb/wordemb-vis-tsne.py: -------------------------------------------------------------------------------- 1 | # This visualizer is based off of 2 | # http://nlp.yvespeirsman.be/blog/visualizing-word-embeddings-with-tsne/ 3 | 4 | import pylab as Plot 5 | import numpy as np 6 | import argparse 7 | from tsne import tsne # from http://lvdmaaten.github.io/tsne/ 8 | import pdb 9 | 10 | parser = argparse.ArgumentParser(description='Visualize word embeddings using TSNE') 11 | parser.add_argument('vector_file', type=str, help='location of the word vector file') 12 | parser.add_argument('label_file', type=str, help='location of the word vector file') 13 | parser.add_argument('--target_words', dest='target_words', type=str, default=None, help='a list of words to display (if none, shows 
1000 random words') 14 | 15 | args = parser.parse_args() 16 | 17 | #read the datafile, with the option for a seperate labels file 18 | def read_data(vector_file_path, labels_file_path=None): 19 | vocab = [] 20 | word_vectors = [] 21 | 22 | with open(labels_file_path) as sample_file: 23 | for line in sample_file: 24 | vocab.append(line.strip()) 25 | with open(vector_file_path) as vector_file: 26 | for line in vector_file: 27 | line = line.strip() 28 | word_vector = line.split() 29 | word_vectors.append([float(i) for i in word_vector]) 30 | return np.array(word_vectors), vocab 31 | 32 | def display_data(word_vectors, words, target_words=None): 33 | target_matrix = word_vectors.copy() 34 | if target_words: 35 | target_words = [line.strip().lower() for line in open(target_words)][:2000] 36 | rows = [words.index(word) for word in target_words if word in words] 37 | target_matrix = target_matrix[rows,:] 38 | else: 39 | rows = np.random.choice(len(word_vectors), size=1000, replace=False) 40 | target_matrix = target_matrix[rows,:] 41 | reduced_matrix = tsne(target_matrix, 2); 42 | 43 | Plot.figure(figsize=(200, 200), dpi=100) 44 | max_x = np.amax(reduced_matrix, axis=0)[0] 45 | max_y = np.amax(reduced_matrix, axis=0)[1] 46 | Plot.xlim((-max_x,max_x)) 47 | Plot.ylim((-max_y,max_y)) 48 | 49 | Plot.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], 20); 50 | 51 | for row_id in range(0, len(rows)): 52 | target_word = words[rows[row_id]] 53 | x = reduced_matrix[row_id, 0] 54 | y = reduced_matrix[row_id, 1] 55 | Plot.annotate(target_word, (x,y)) 56 | Plot.savefig("word_vectors.png"); 57 | 58 | if __name__ == "__main__": 59 | X, labels = read_data(args.vector_file, args.label_file) 60 | display_data(X, labels, args.target_words) 61 | 62 | -------------------------------------------------------------------------------- /04-efficiency/slow-impl.py: -------------------------------------------------------------------------------- 1 | import dynet as dy 2 | import numpy as np 3 | 4 | # This implementation will be unnecessarily slow, especially on the GPU. 5 | # It can be improved by following the speed tricks covered in class: 6 | # 1) Don't repeat operations. 7 | # 2) Minimize the number of operations. 8 | # 3) Minimize the number of CPU-GPU memory copies, make them earlier. 
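#
# A rough sketch of what tricks 1-3 buy in this particular file (a hypothetical fast_total
# helper, assuming the same W_p, x_vecs and y_vecs defined below): copy the data to the
# device once, outside the loops, and replace the 100 separate dot products with a single
# matrix product, since sum_ij y_j^T (W x_i) is just the sum of all entries of Y^T (W X).
#
# def fast_total(W, x_vecs, y_vecs):
#     X = dy.inputTensor(np.stack(x_vecs, axis=1))    # (100, 10): one host-to-device copy
#     Y = dy.inputTensor(np.stack(y_vecs, axis=1))    # instead of one copy per pair
#     return dy.sum_elems(dy.transpose(Y) * (W * X))  # one matmul instead of 100 dot products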
9 | 10 | # Create the model 11 | model = dy.ParameterCollection() 12 | trainer = dy.SimpleSGDTrainer(model) 13 | W_p = model.add_parameters((100,100)) 14 | 15 | # Create the "training data" 16 | x_vecs = [] 17 | y_vecs = [] 18 | for i in range(10): 19 | x_vecs.append(np.random.rand(100)) 20 | y_vecs.append(np.random.rand(100)) 21 | 22 | # Do the processing 23 | for my_iter in range(1000): 24 | dy.renew_cg() 25 | W = dy.parameter(W_p) 26 | total = 0 27 | for x in x_vecs: 28 | for y in y_vecs: 29 | x_exp = dy.inputTensor(x) 30 | y_exp = dy.inputTensor(y) 31 | total = total + dy.dot_product(W * x_exp, y_exp) 32 | total.forward() 33 | total.backward() 34 | trainer.update() 35 | 36 | -------------------------------------------------------------------------------- /04-efficiency/wordemb-skip-binary.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | import pdb 8 | 9 | N=2 #length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2) 10 | EMB_SIZE = 128 # The size of the embedding 11 | 12 | embeddings_location = "embeddings.txt" #the file to write the word embeddings to 13 | labels_location = "labels.txt" #the file to write the labels to 14 | 15 | # We reuse the data reading from the language modeling class 16 | w2i = defaultdict(lambda: len(w2i)) 17 | 18 | #word counts for negative sampling 19 | word_counts = defaultdict(int) 20 | 21 | S = w2i[""] 22 | UNK = w2i[""] 23 | def read_dataset(filename): 24 | with open(filename, "r") as f: 25 | for line in f: 26 | line = line.strip().split(" ") 27 | for word in line: 28 | word_counts[w2i[word]] += 1 29 | yield [w2i[x] for x in line] 30 | 31 | # Read in the data 32 | train = list(read_dataset("../data/ptb/train.txt")) 33 | w2i = defaultdict(lambda: UNK, w2i) 34 | dev = list(read_dataset("../data/ptb/valid.txt")) 35 | i2w = {v: k for k, v in w2i.items()} 36 | nwords = len(w2i) 37 | nbits = len(np.binary_repr(nwords-1)) 38 | 39 | with open(labels_location, 'w') as labels_file: 40 | for i in range(nwords): 41 | labels_file.write(i2w[i] + '\n') 42 | 43 | # Start DyNet and define trainer 44 | model = dy.Model() 45 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 46 | 47 | # Define the model 48 | W_w_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights 49 | W_c_p = model.add_parameters((nbits, EMB_SIZE)) # Binary prediction weights 50 | 51 | # Calculate the loss value for the entire sentence 52 | def calc_sent_loss(sent): 53 | # Create a computation graph 54 | dy.renew_cg() 55 | 56 | W_c = dy.parameter(W_c_p) 57 | 58 | # Get embeddings for the sentence 59 | emb = [W_w_p[x] for x in sent] 60 | 61 | # Step through the sentence and calculate binary prediction losses 62 | all_losses = [] 63 | for i, my_emb in enumerate(emb): 64 | scores = dy.logistic(W_c * my_emb) 65 | pos_words = ([sent[x] if x >= 0 else S for x in range(i-N,i)] + 66 | [sent[x] if x < len(sent) else S for x in range(i+1,i+N+1)]) 67 | word_repr = [[float(y) for y in np.binary_repr(x).zfill(nbits)] for x in pos_words] 68 | word_repr = [dy.inputVector(x) for x in word_repr] 69 | all_losses.extend([dy.binary_log_loss(scores, x) for x in word_repr]) 70 | return dy.esum(all_losses) 71 | 72 | MAX_LEN = 100 73 | 74 | for ITER in range(100): 75 | print("started iter %r" % ITER) 76 | # Perform training 77 | random.shuffle(train) 78 | train_words, train_loss = 0, 0.0 79 | start = 
time.time() 80 | for sent_id, sent in enumerate(train): 81 | my_loss = calc_sent_loss(sent) 82 | train_loss += my_loss.value() 83 | train_words += len(sent) 84 | my_loss.backward() 85 | trainer.update() 86 | if (sent_id+1) % 5000 == 0: 87 | print("--finished %r sentences" % (sent_id+1)) 88 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 89 | # Evaluate on dev set 90 | dev_words, dev_loss = 0, 0.0 91 | start = time.time() 92 | for sent_id, sent in enumerate(dev): 93 | my_loss = calc_sent_loss(sent) 94 | dev_loss += my_loss.value() 95 | dev_words += len(sent) 96 | trainer.update() 97 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 98 | 99 | print("saving embedding files") 100 | with open(embeddings_location, 'w') as embeddings_file: 101 | W_w_np = W_w_p.as_array() 102 | for i in range(nwords): 103 | ith_embedding = '\t'.join(map(str, W_w_np[i])) 104 | embeddings_file.write(ith_embedding + '\n') 105 | -------------------------------------------------------------------------------- /04-efficiency/wordemb-skip-ns.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | import pdb 8 | 9 | K=3 #number of negative samples 10 | N=2 #length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2) 11 | EMB_SIZE = 128 # The size of the embedding 12 | 13 | embeddings_location = "embeddings.txt" #the file to write the word embeddings to 14 | labels_location = "labels.txt" #the file to write the labels to 15 | 16 | # We reuse the data reading from the language modeling class 17 | w2i = defaultdict(lambda: len(w2i)) 18 | 19 | #word counts for negative sampling 20 | word_counts = defaultdict(int) 21 | 22 | S = w2i[""] 23 | UNK = w2i[""] 24 | def read_dataset(filename): 25 | with open(filename, "r") as f: 26 | for line in f: 27 | line = line.strip().split(" ") 28 | for word in line: 29 | word_counts[w2i[word]] += 1 30 | yield [w2i[x] for x in line] 31 | 32 | 33 | # Read in the data 34 | train = list(read_dataset("../data/ptb/train.txt")) 35 | w2i = defaultdict(lambda: UNK, w2i) 36 | dev = list(read_dataset("../data/ptb/valid.txt")) 37 | i2w = {v: k for k, v in w2i.items()} 38 | nwords = len(w2i) 39 | 40 | 41 | # take the word counts to the 3/4, normalize 42 | counts = np.array([list(x) for x in word_counts.items()])[:,1]**.75 43 | normalizing_constant = sum(counts) 44 | word_probabilities = np.zeros(nwords) 45 | for word_id in word_counts: 46 | word_probabilities[word_id] = word_counts[word_id]**.75/normalizing_constant 47 | 48 | with open(labels_location, 'w') as labels_file: 49 | for i in range(nwords): 50 | labels_file.write(i2w[i] + '\n') 51 | 52 | # Start DyNet and define trainer 53 | model = dy.Model() 54 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 55 | 56 | # Define the model 57 | W_c_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Context word weights 58 | W_w_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights 59 | 60 | # Calculate the loss value for the entire sentence 61 | def calc_sent_loss(sent): 62 | # Create a computation graph 63 | dy.renew_cg() 64 | 65 | # Get embeddings for the sentence 66 | emb = [W_w_p[x] for x in sent] 67 | 68 | # Sample K negative words for each 
predicted word at each position 69 | all_neg_words = np.random.choice(nwords, size=2*N*K*len(emb), replace=True, p=word_probabilities) 70 | 71 | # W_w = dy.parameter(W_w_p) 72 | # Step through the sentence and calculate the negative and positive losses 73 | all_losses = [] 74 | for i, my_emb in enumerate(emb): 75 | neg_words = all_neg_words[i*K*2*N:(i+1)*K*2*N] 76 | pos_words = ([sent[x] if x >= 0 else S for x in range(i-N,i)] + 77 | [sent[x] if x < len(sent) else S for x in range(i+1,i+N+1)]) 78 | neg_loss = -dy.log(dy.logistic(-dy.dot_product(my_emb, dy.lookup_batch(W_c_p, neg_words)))) 79 | pos_loss = -dy.log(dy.logistic(dy.dot_product(my_emb, dy.lookup_batch(W_c_p, pos_words)))) 80 | all_losses.append(dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss)) 81 | return dy.esum(all_losses) 82 | 83 | MAX_LEN = 100 84 | 85 | for ITER in range(100): 86 | print("started iter %r" % ITER) 87 | # Perform training 88 | random.shuffle(train) 89 | train_words, train_loss = 0, 0.0 90 | start = time.time() 91 | for sent_id, sent in enumerate(train): 92 | my_loss = calc_sent_loss(sent) 93 | train_loss += my_loss.value() 94 | train_words += len(sent) 95 | my_loss.backward() 96 | trainer.update() 97 | if (sent_id+1) % 5000 == 0: 98 | print("--finished %r sentences" % (sent_id+1)) 99 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 100 | # Evaluate on dev set 101 | dev_words, dev_loss = 0, 0.0 102 | start = time.time() 103 | for sent_id, sent in enumerate(dev): 104 | my_loss = calc_sent_loss(sent) 105 | dev_loss += my_loss.value() 106 | dev_words += len(sent) 107 | trainer.update() 108 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 109 | 110 | print("saving embedding files") 111 | with open(embeddings_location, 'w') as embeddings_file: 112 | W_w_np = W_w_p.as_array() 113 | for i in range(nwords): 114 | ith_embedding = '\t'.join(map(str, W_w_np[i])) 115 | embeddings_file.write(ith_embedding + '\n') 116 | -------------------------------------------------------------------------------- /05-cnn/cnn-activation.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | np.set_printoptions(linewidth=np.nan, threshold=np.nan) 8 | 9 | # Functions to read in the corpus 10 | w2i = defaultdict(lambda: len(w2i)) 11 | UNK = w2i[""] 12 | def read_dataset(filename): 13 | with open(filename, "r") as f: 14 | for line in f: 15 | tag, words = line.lower().strip().split(" ||| ") 16 | words = words.split(" ") 17 | yield (words, [w2i[x] for x in words], int(tag)) 18 | 19 | # Read in the data 20 | train = list(read_dataset("../data/classes/train.txt")) 21 | w2i = defaultdict(lambda: UNK, w2i) 22 | dev = list(read_dataset("../data/classes/test.txt")) 23 | nwords = len(w2i) 24 | ntags = 5 25 | 26 | # Start DyNet and define trainer 27 | model = dy.Model() 28 | trainer = dy.AdamTrainer(model) 29 | 30 | # Define the model 31 | EMB_SIZE = 10 32 | W_emb = model.add_lookup_parameters((nwords, 1, 1, EMB_SIZE)) # Word embeddings 33 | WIN_SIZE = 3 34 | FILTER_SIZE = 8 35 | W_cnn = model.add_parameters((1, WIN_SIZE, EMB_SIZE, FILTER_SIZE)) # cnn weights 36 | b_cnn = model.add_parameters((FILTER_SIZE)) # cnn bias 37 | 38 | W_sm = model.add_parameters((ntags, FILTER_SIZE)) # Softmax weights 39 | b_sm = 
model.add_parameters((ntags)) # Softmax bias 40 | 41 | def calc_scores(wids): 42 | dy.renew_cg() 43 | W_cnn_express = dy.parameter(W_cnn) 44 | b_cnn_express = dy.parameter(b_cnn) 45 | W_sm_express = dy.parameter(W_sm) 46 | b_sm_express = dy.parameter(b_sm) 47 | if len(wids) < WIN_SIZE: 48 | wids += [0] * (WIN_SIZE-len(wids)) 49 | 50 | cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1) 51 | cnn_out = dy.conv2d_bias(cnn_in, W_cnn_express, b_cnn_express, stride=(1, 1), is_valid=False) 52 | pool_out = dy.max_dim(cnn_out, d=1) 53 | pool_out = dy.reshape(pool_out, (FILTER_SIZE,)) 54 | pool_out = dy.rectify(pool_out) 55 | return W_sm_express * pool_out + b_sm_express 56 | 57 | def calc_predict_and_activations(wids, tag, words): 58 | dy.renew_cg() 59 | W_cnn_express = dy.parameter(W_cnn) 60 | b_cnn_express = dy.parameter(b_cnn) 61 | W_sm_express = dy.parameter(W_sm) 62 | b_sm_express = dy.parameter(b_sm) 63 | if len(wids) < WIN_SIZE: 64 | wids += [0] * (WIN_SIZE-len(wids)) 65 | 66 | cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1) 67 | cnn_out = dy.conv2d_bias(cnn_in, W_cnn_express, b_cnn_express, stride=(1, 1), is_valid=False) 68 | filters = (dy.reshape(cnn_out, (len(wids), FILTER_SIZE))).npvalue() 69 | activations = filters.argmax(axis=0) 70 | 71 | pool_out = dy.max_dim(cnn_out, d=1) 72 | pool_out = dy.reshape(pool_out, (FILTER_SIZE,)) 73 | pool_out = dy.rectify(pool_out) 74 | 75 | scores = (W_sm_express * pool_out + b_sm_express).npvalue() 76 | print '%d ||| %s' % (tag, ' '.join(words)) 77 | predict = np.argmax(scores) 78 | print display_activations(words, activations) 79 | print 'scores=%s, predict: %d' % (scores, predict) 80 | features = pool_out.npvalue() 81 | W = W_sm_express.npvalue() 82 | bias = b_sm_express.npvalue() 83 | print ' bias=%s' % bias 84 | contributions = W * features 85 | print ' very bad (%.4f): %s' % (scores[0], contributions[0]) 86 | print ' bad (%.4f): %s' % (scores[1], contributions[1]) 87 | print ' neutral (%.4f): %s' % (scores[2], contributions[2]) 88 | print ' good (%.4f): %s' % (scores[3], contributions[3]) 89 | print 'very good (%.4f): %s' % (scores[4], contributions[4]) 90 | 91 | 92 | def display_activations(words, activations): 93 | pad_begin = (WIN_SIZE - 1) / 2 94 | pad_end = WIN_SIZE - 1 - pad_begin 95 | words_padded = ['pad' for i in range(pad_begin)] + words + ['pad' for i in range(pad_end)] 96 | 97 | ngrams = [] 98 | for act in activations: 99 | ngrams.append('[' + ', '.join(words_padded[act:act+WIN_SIZE]) + ']') 100 | 101 | return ngrams 102 | 103 | for ITER in range(10): 104 | # Perform training 105 | random.shuffle(train) 106 | train_loss = 0.0 107 | train_correct = 0.0 108 | start = time.time() 109 | for _, wids, tag in train: 110 | scores = calc_scores(wids) 111 | predict = np.argmax(scores.npvalue()) 112 | if predict == tag: 113 | train_correct += 1 114 | 115 | my_loss = dy.pickneglogsoftmax(scores, tag) 116 | train_loss += my_loss.value() 117 | my_loss.backward() 118 | trainer.update() 119 | print("iter %r: train loss/sent=%.4f, acc=%.4f, time=%.2fs" % (ITER, train_loss/len(train), train_correct/len(train), time.time()-start)) 120 | # Perform testing 121 | test_correct = 0.0 122 | for _, wids, tag in dev: 123 | scores = calc_scores(wids).npvalue() 124 | predict = np.argmax(scores) 125 | if predict == tag: 126 | test_correct += 1 127 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 128 | 129 | 130 | for words, wids, tag in dev: 131 | calc_predict_and_activations(wids, tag, words) 132 | raw_input() 
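The per-class breakdown printed by calc_predict_and_activations above falls directly out of the final affine layer: each class score is the softmax bias plus the row-sum of the contributions array, i.e. scores[c] = bias[c] + sum_k W[c][k] * features[k], so each filter's share of each class score can be read off one row at a time. A minimal numpy check of that identity, using hypothetical values with the same shapes (FILTER_SIZE=8, ntags=5):

import numpy as np

W = np.random.rand(5, 8)       # softmax weights (ntags x FILTER_SIZE)
bias = np.random.rand(5)       # softmax bias
features = np.random.rand(8)   # max-pooled, rectified filter activations
contributions = W * features   # contributions[c, k] = W[c, k] * features[k]
assert np.allclose(W @ features + bias, bias + contributions.sum(axis=1))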
-------------------------------------------------------------------------------- /05-cnn/cnn-class.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | def read_dataset(filename): 12 | with open(filename, "r") as f: 13 | for line in f: 14 | tag, words = line.lower().strip().split(" ||| ") 15 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 16 | 17 | # Read in the data 18 | train = list(read_dataset("../data/classes/train.txt")) 19 | w2i = defaultdict(lambda: UNK, w2i) 20 | dev = list(read_dataset("../data/classes/test.txt")) 21 | nwords = len(w2i) 22 | ntags = len(t2i) 23 | 24 | # Start DyNet and define trainer 25 | model = dy.Model() 26 | trainer = dy.AdamTrainer(model) 27 | 28 | # Define the model 29 | EMB_SIZE = 64 30 | W_emb = model.add_lookup_parameters((nwords, 1, 1, EMB_SIZE)) # Word embeddings 31 | WIN_SIZE = 3 32 | FILTER_SIZE = 64 33 | W_cnn = model.add_parameters((1, WIN_SIZE, EMB_SIZE, FILTER_SIZE)) # cnn weights 34 | b_cnn = model.add_parameters((FILTER_SIZE)) # cnn bias 35 | 36 | W_sm = model.add_parameters((ntags, FILTER_SIZE)) # Softmax weights 37 | b_sm = model.add_parameters((ntags)) # Softmax bias 38 | 39 | def calc_scores(words): 40 | dy.renew_cg() 41 | W_cnn_express = dy.parameter(W_cnn) 42 | b_cnn_express = dy.parameter(b_cnn) 43 | W_sm_express = dy.parameter(W_sm) 44 | b_sm_express = dy.parameter(b_sm) 45 | if len(words) < WIN_SIZE: 46 | words += [0] * (WIN_SIZE-len(words)) 47 | 48 | cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in words], d=1) 49 | cnn_out = dy.conv2d_bias(cnn_in, W_cnn_express, b_cnn_express, stride=(1, 1), is_valid=False) 50 | pool_out = dy.max_dim(cnn_out, d=1) 51 | pool_out = dy.reshape(pool_out, (FILTER_SIZE,)) 52 | pool_out = dy.rectify(pool_out) 53 | return W_sm_express * pool_out + b_sm_express 54 | 55 | for ITER in range(100): 56 | # Perform training 57 | random.shuffle(train) 58 | train_loss = 0.0 59 | train_correct = 0.0 60 | start = time.time() 61 | for words, tag in train: 62 | scores = calc_scores(words) 63 | predict = np.argmax(scores.npvalue()) 64 | if predict == tag: 65 | train_correct += 1 66 | 67 | my_loss = dy.pickneglogsoftmax(scores, tag) 68 | train_loss += my_loss.value() 69 | my_loss.backward() 70 | trainer.update() 71 | print("iter %r: train loss/sent=%.4f, acc=%.4f, time=%.2fs" % (ITER, train_loss/len(train), train_correct/len(train), time.time()-start)) 72 | # Perform testing 73 | test_correct = 0.0 74 | for words, tag in dev: 75 | scores = calc_scores(words).npvalue() 76 | predict = np.argmax(scores) 77 | if predict == tag: 78 | test_correct += 1 79 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 80 | 81 | -------------------------------------------------------------------------------- /06-rnn/lm-lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | 4 | start = time.time() 5 | 6 | from collections import Counter, defaultdict 7 | import random 8 | import math 9 | import sys 10 | import argparse 11 | 12 | import dynet as dy 13 | import numpy as np 14 | 15 | # format of files: each line is "word1 word2 ..." 
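#
# Overview: a single-layer word-level LSTM language model. Each sentence is fed through the
# LSTM starting from the start symbol S; at every step the current hidden state is projected
# through an affine softmax over the whole vocabulary, and the loss is the summed per-word
# negative log-likelihood (reported below as nll/perplexity on the validation data).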
16 | train_file = "../data/ptb/train.txt" 17 | test_file = "../data/ptb/valid.txt" 18 | 19 | w2i = defaultdict(lambda: len(w2i)) 20 | 21 | 22 | def read(fname): 23 | """ 24 | Read a file where each line is of the form "word1 word2 ..." 25 | Yields lists of the form [word1, word2, ...] 26 | """ 27 | with open(fname, "r") as fh: 28 | for line in fh: 29 | sent = [w2i[x] for x in line.strip().split()] 30 | sent.append(w2i[""]) 31 | yield sent 32 | 33 | 34 | train = list(read(train_file)) 35 | nwords = len(w2i) 36 | test = list(read(test_file)) 37 | S = w2i[""] 38 | assert (nwords == len(w2i)) 39 | 40 | # DyNet Starts 41 | model = dy.Model() 42 | trainer = dy.AdamTrainer(model) 43 | 44 | # Lookup parameters for word embeddings 45 | EMBED_SIZE = 64 46 | HIDDEN_SIZE = 128 47 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, EMBED_SIZE)) 48 | 49 | # Word-level LSTM (layers=1, input=64, output=128, model) 50 | RNN = dy.LSTMBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model) 51 | 52 | # Softmax weights/biases on top of LSTM outputs 53 | W_sm = model.add_parameters((nwords, HIDDEN_SIZE)) 54 | b_sm = model.add_parameters(nwords) 55 | 56 | 57 | # Build the language model graph 58 | def calc_lm_loss(sent): 59 | dy.renew_cg() 60 | # parameters -> expressions 61 | W_exp = dy.parameter(W_sm) 62 | b_exp = dy.parameter(b_sm) 63 | 64 | # initialize the RNN 65 | f_init = RNN.initial_state() 66 | 67 | # get the wids and masks for each step 68 | tot_words = len(sent) 69 | 70 | # start the rnn by inputting "" 71 | s = f_init.add_input(WORDS_LOOKUP[S]) 72 | 73 | # feed word vectors into the RNN and predict the next word 74 | losses = [] 75 | for wid in sent: 76 | # calculate the softmax and loss 77 | score = W_exp * s.output() + b_exp 78 | loss = dy.pickneglogsoftmax(score, wid) 79 | losses.append(loss) 80 | # update the state of the RNN 81 | wemb = WORDS_LOOKUP[wid] 82 | s = s.add_input(wemb) 83 | 84 | return dy.esum(losses), tot_words 85 | 86 | 87 | # Sort training sentences in descending order and count minibatches 88 | train_order = range(len(train)) 89 | 90 | print("startup time: %r" % (time.time() - start)) 91 | # Perform training 92 | start = time.time() 93 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 94 | for ITER in range(100): 95 | random.shuffle(train_order) 96 | for sid in train_order: 97 | i += 1 98 | if i % int(500) == 0: 99 | trainer.status() 100 | print(this_loss / this_words, file=sys.stderr) 101 | all_tagged += this_words 102 | this_loss = this_words = 0 103 | all_time = time.time() - start 104 | if i % int(10000) == 0: 105 | dev_start = time.time() 106 | dev_loss = dev_words = 0 107 | for sent in test: 108 | loss_exp, mb_words = calc_lm_loss(sent) 109 | dev_loss += loss_exp.scalar_value() 110 | dev_words += mb_words 111 | dev_time += time.time() - dev_start 112 | train_time = time.time() - start - dev_time 113 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % ( 114 | dev_loss / dev_words, math.exp(dev_loss / dev_words), dev_words, train_time, all_tagged / train_time)) 115 | # train on the minibatch 116 | loss_exp, mb_words = calc_lm_loss(train[sid]) 117 | this_loss += loss_exp.scalar_value() 118 | this_words += mb_words 119 | loss_exp.backward() 120 | trainer.update() 121 | print("epoch %r finished" % ITER) 122 | trainer.update_epoch(1.0) 123 | -------------------------------------------------------------------------------- /06-rnn/lm-minibatch.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
print_function 2 | import time 3 | 4 | start = time.time() 5 | 6 | from collections import Counter, defaultdict 7 | import random 8 | import math 9 | import sys 10 | import argparse 11 | 12 | import dynet as dy 13 | import numpy as np 14 | 15 | # format of files: each line is "word1/tag2 word2/tag2 ..." 16 | train_file = "../data/ptb/train.txt" 17 | test_file = "../data/ptb/valid.txt" 18 | 19 | w2i = defaultdict(lambda: len(w2i)) 20 | 21 | 22 | def read(fname): 23 | """ 24 | Read a file where each line is of the form "word1 word2 ..." 25 | Yields lists of the form [word1, word2, ...] 26 | """ 27 | with open(fname, "r") as fh: 28 | for line in fh: 29 | sent = [w2i[x] for x in line.strip().split()] 30 | sent.append(w2i[""]) 31 | yield sent 32 | 33 | 34 | train = list(read(train_file)) 35 | nwords = len(w2i) 36 | test = list(read(test_file)) 37 | S = w2i[""] 38 | assert (nwords == len(w2i)) 39 | 40 | # DyNet Starts 41 | 42 | model = dy.Model() 43 | trainer = dy.AdamTrainer(model) 44 | 45 | # Lookup parameters for word embeddings 46 | EMBED_SIZE = 64 47 | HIDDEN_SIZE = 128 48 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, EMBED_SIZE)) 49 | 50 | # Word-level LSTM (layers=1, input=64, output=128, model) 51 | RNN = dy.VanillaLSTMBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model) 52 | 53 | # Softmax weights/biases on top of LSTM outputs 54 | W_sm = model.add_parameters((nwords, HIDDEN_SIZE)) 55 | b_sm = model.add_parameters(nwords) 56 | 57 | 58 | # Build the language model graph 59 | def calc_lm_loss(sents): 60 | dy.renew_cg() 61 | # parameters -> expressions 62 | W_exp = dy.parameter(W_sm) 63 | b_exp = dy.parameter(b_sm) 64 | 65 | # initialize the RNN 66 | f_init = RNN.initial_state() 67 | 68 | # get the wids and masks for each step 69 | tot_words = 0 70 | wids = [] 71 | masks = [] 72 | for i in range(len(sents[0])): 73 | wids.append([(sent[i] if len(sent) > i else S) for sent in sents]) 74 | mask = [(1 if len(sent) > i else 0) for sent in sents] 75 | masks.append(mask) 76 | tot_words += sum(mask) 77 | 78 | # start the rnn by inputting "" 79 | init_ids = [S] * len(sents) 80 | s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids)) 81 | 82 | # feed word vectors into the RNN and predict the next word 83 | losses = [] 84 | for wid, mask in zip(wids, masks): 85 | # calculate the softmax and loss 86 | score = dy.affine_transform([b_exp, W_exp, s.output()]) 87 | loss = dy.pickneglogsoftmax_batch(score, wid) 88 | # mask the loss if at least one sentence is shorter 89 | if mask[-1] != 1: 90 | mask_expr = dy.inputVector(mask) 91 | mask_expr = dy.reshape(mask_expr, (1,), len(sents)) 92 | loss = loss * mask_expr 93 | losses.append(loss) 94 | # update the state of the RNN 95 | wemb = dy.lookup_batch(WORDS_LOOKUP, wid) 96 | s = s.add_input(wemb) 97 | 98 | return dy.sum_batches(dy.esum(losses)), tot_words 99 | 100 | 101 | # Sort training sentences in descending order and count minibatches 102 | MB_SIZE = 16 103 | train.sort(key=lambda x: -len(x)) 104 | test.sort(key=lambda x: -len(x)) 105 | train_order = [x * MB_SIZE for x in range(int((len(train) - 1) / MB_SIZE + 1))] 106 | test_order = [x * MB_SIZE for x in range(int((len(test) - 1) / MB_SIZE + 1))] 107 | 108 | print("startup time: %r" % (time.time() - start)) 109 | # Perform training 110 | start = time.time() 111 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 112 | for ITER in range(100): 113 | random.shuffle(train_order) 114 | for sid in train_order: 115 | i += 1 116 | if i % int(500 / MB_SIZE) == 0: 117 | trainer.status() 118 | 
print(this_loss / this_words, file=sys.stderr) 119 | all_tagged += this_words 120 | this_loss = this_words = 0 121 | all_time = time.time() - start 122 | if i % int(10000 / MB_SIZE) == 0: 123 | dev_start = time.time() 124 | dev_loss = dev_words = 0 125 | for sid in test_order: 126 | loss_exp, mb_words = calc_lm_loss(test[sid:sid + MB_SIZE]) 127 | dev_loss += loss_exp.scalar_value() 128 | dev_words += mb_words 129 | dev_time += time.time() - dev_start 130 | train_time = time.time() - start - dev_time 131 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % ( 132 | dev_loss / dev_words, math.exp(dev_loss / dev_words), dev_words, train_time, all_tagged / train_time)) 133 | # train on the minibatch 134 | loss_exp, mb_words = calc_lm_loss(train[sid:sid + MB_SIZE]) 135 | this_loss += loss_exp.scalar_value() 136 | # print("loss @ %r: %r" % (i, this_loss)) 137 | this_words += mb_words 138 | loss_exp.backward() 139 | trainer.update() 140 | print("epoch %r finished" % ITER) 141 | trainer.update_epoch(1.0) 142 | -------------------------------------------------------------------------------- /06-rnn/sentiment-lstm.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | 12 | 13 | def read_dataset(filename): 14 | with open(filename, "r") as f: 15 | for line in f: 16 | tag, words = line.lower().strip().split(" ||| ") 17 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 18 | 19 | 20 | # Read in the data 21 | train = list(read_dataset("../data/classes/train.txt")) 22 | w2i = defaultdict(lambda: UNK, w2i) 23 | dev = list(read_dataset("../data/classes/test.txt")) 24 | nwords = len(w2i) 25 | ntags = len(t2i) 26 | 27 | # Start DyNet and defin trainer 28 | model = dy.Model() 29 | trainer = dy.AdamTrainer(model) 30 | 31 | # Define the model 32 | EMB_SIZE = 64 33 | HID_SIZE = 64 34 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 35 | fwdLSTM = dy.VanillaLSTMBuilder(1, EMB_SIZE, HID_SIZE, model) # Forward RNN 36 | bwdLSTM = dy.VanillaLSTMBuilder(1, EMB_SIZE, HID_SIZE, model) # Backward RNN 37 | W_sm = model.add_parameters((ntags, 2 * HID_SIZE)) # Softmax weights 38 | b_sm = model.add_parameters((ntags)) # Softmax bias 39 | 40 | 41 | # A function to calculate scores for one value 42 | def calc_scores(words): 43 | dy.renew_cg() 44 | word_embs = [dy.lookup(W_emb, x) for x in words] 45 | fwd_init = fwdLSTM.initial_state() 46 | fwd_embs = fwd_init.transduce(word_embs) 47 | bwd_init = bwdLSTM.initial_state() 48 | bwd_embs = bwd_init.transduce(reversed(word_embs)) 49 | W_sm_exp = dy.parameter(W_sm) 50 | b_sm_exp = dy.parameter(b_sm) 51 | return W_sm_exp * dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) + b_sm_exp 52 | 53 | 54 | for ITER in range(100): 55 | # Perform training 56 | random.shuffle(train) 57 | train_loss = 0.0 58 | start = time.time() 59 | for words, tag in train: 60 | my_loss = dy.pickneglogsoftmax(calc_scores(words), tag) 61 | train_loss += my_loss.value() 62 | my_loss.backward() 63 | trainer.update() 64 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start)) 65 | # Perform training 66 | test_correct = 0.0 67 | for words, tag in dev: 68 | scores = calc_scores(words).npvalue() 69 | predict = np.argmax(scores) 70 | if 
predict == tag: 71 | test_correct += 1 72 | print("iter %r: test acc=%.4f" % (ITER, test_correct / len(dev))) 73 | -------------------------------------------------------------------------------- /06-rnn/sentiment-rnn.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | 12 | 13 | def read_dataset(filename): 14 | with open(filename, "r") as f: 15 | for line in f: 16 | tag, words = line.lower().strip().split(" ||| ") 17 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 18 | 19 | 20 | # Read in the data 21 | train = list(read_dataset("../data/classes/train.txt")) 22 | w2i = defaultdict(lambda: UNK, w2i) 23 | dev = list(read_dataset("../data/classes/test.txt")) 24 | nwords = len(w2i) 25 | ntags = len(t2i) 26 | 27 | # Start DyNet and defin trainer 28 | model = dy.Model() 29 | trainer = dy.AdamTrainer(model) 30 | 31 | # Define the model 32 | EMB_SIZE = 64 33 | HID_SIZE = 64 34 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 35 | fwdLSTM = dy.SimpleRNNBuilder(1, EMB_SIZE, HID_SIZE, model) # Forward LSTM 36 | bwdLSTM = dy.SimpleRNNBuilder(1, EMB_SIZE, HID_SIZE, model) # Backward LSTM 37 | W_sm = model.add_parameters((ntags, 2 * HID_SIZE)) # Softmax weights 38 | b_sm = model.add_parameters((ntags)) # Softmax bias 39 | 40 | 41 | # A function to calculate scores for one value 42 | def calc_scores(words): 43 | dy.renew_cg() 44 | word_embs = [dy.lookup(W_emb, x) for x in words] 45 | fwd_init = fwdLSTM.initial_state() 46 | fwd_embs = fwd_init.transduce(word_embs) 47 | bwd_init = bwdLSTM.initial_state() 48 | bwd_embs = bwd_init.transduce(reversed(word_embs)) 49 | W_sm_exp = dy.parameter(W_sm) 50 | b_sm_exp = dy.parameter(b_sm) 51 | return W_sm_exp * dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) + b_sm_exp 52 | 53 | 54 | for ITER in range(100): 55 | # Perform training 56 | random.shuffle(train) 57 | train_loss = 0.0 58 | start = time.time() 59 | for words, tag in train: 60 | my_loss = dy.pickneglogsoftmax(calc_scores(words), tag) 61 | train_loss += my_loss.value() 62 | my_loss.backward() 63 | trainer.update() 64 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start)) 65 | # Perform training 66 | test_correct = 0.0 67 | for words, tag in dev: 68 | scores = calc_scores(words).npvalue() 69 | predict = np.argmax(scores) 70 | if predict == tag: 71 | test_correct += 1 72 | print("iter %r: test acc=%.4f" % (ITER, test_correct / len(dev))) 73 | -------------------------------------------------------------------------------- /07-sentrep/text-retrieval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | 4 | from collections import defaultdict 5 | import random 6 | import math 7 | import sys 8 | import argparse 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | # format of files: each line is "word1 word2 ..." 
aligned line-by-line 14 | train_src_file = "../data/parallel/train.ja" 15 | train_trg_file = "../data/parallel/train.en" 16 | dev_src_file = "../data/parallel/dev.ja" 17 | dev_trg_file = "../data/parallel/dev.en" 18 | 19 | w2i_src = defaultdict(lambda: len(w2i_src)) 20 | w2i_trg = defaultdict(lambda: len(w2i_trg)) 21 | 22 | def read(fname_src, fname_trg): 23 | """ 24 | Read parallel files where each line lines up 25 | """ 26 | with open(fname_src, "r") as f_src, open(fname_trg, "r") as f_trg: 27 | for line_src, line_trg in zip(f_src, f_trg): 28 | sent_src = [w2i_src[x] for x in line_src.strip().split()] 29 | sent_trg = [w2i_trg[x] for x in line_trg.strip().split()] 30 | yield (sent_src, sent_trg) 31 | 32 | # Read the data 33 | train = list(read(train_src_file, train_trg_file)) 34 | unk_src = w2i_src[""] 35 | w2i_src = defaultdict(lambda: unk_src, w2i_src) 36 | unk_trg = w2i_trg[""] 37 | w2i_trg = defaultdict(lambda: unk_trg, w2i_trg) 38 | nwords_src = len(w2i_src) 39 | nwords_trg = len(w2i_trg) 40 | dev = list(read(dev_src_file, dev_trg_file)) 41 | 42 | # DyNet Starts 43 | model = dy.Model() 44 | trainer = dy.AdamTrainer(model) 45 | 46 | # Model parameters 47 | EMBED_SIZE = 64 48 | HIDDEN_SIZE = 128 49 | BATCH_SIZE = 16 50 | 51 | # Lookup parameters for word embeddings 52 | LOOKUP_SRC = model.add_lookup_parameters((nwords_src, EMBED_SIZE)) 53 | LOOKUP_TRG = model.add_lookup_parameters((nwords_trg, EMBED_SIZE)) 54 | 55 | # Word-level BiLSTMs 56 | LSTM_SRC = dy.BiRNNBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model, dy.LSTMBuilder) 57 | LSTM_TRG = dy.BiRNNBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model, dy.LSTMBuilder) 58 | 59 | # Calculate loss for one mini-batch 60 | def calc_loss(sents): 61 | dy.renew_cg() 62 | 63 | # Transduce all batch elements with an LSTM 64 | sent_reps = [(LSTM_SRC.transduce([LOOKUP_SRC[x] for x in src])[-1], 65 | LSTM_TRG.transduce([LOOKUP_TRG[y] for y in trg])[-1]) for src, trg in sents] 66 | 67 | # Concatenate the sentence representations to a single matrix 68 | mtx_src = dy.concatenate_cols([src for src, trg in sent_reps]) 69 | mtx_trg = dy.concatenate_cols([trg for src, trg in sent_reps]) 70 | 71 | # Do matrix multiplication to get a matrix of dot product similarity scores 72 | sim_mtx = dy.transpose(mtx_src) * mtx_trg 73 | 74 | # Calculate the hinge loss over all dimensions 75 | loss = dy.hinge_dim(sim_mtx, list(range(len(sents))), d=1) 76 | 77 | return dy.sum_elems(loss) 78 | 79 | # Calculate representations for one corpus 80 | def index_corpus(sents): 81 | 82 | # To take advantage of auto-batching, do several at a time 83 | for sid in range(0, len(sents), BATCH_SIZE): 84 | dy.renew_cg() 85 | 86 | # Set up the computation graph 87 | exprs = [] 88 | for src, trg in sents[sid:min(sid+BATCH_SIZE,len(sents))]: 89 | exprs.append((LSTM_SRC.transduce([LOOKUP_SRC[x] for x in src])[-1], 90 | LSTM_TRG.transduce([LOOKUP_TRG[y] for y in trg])[-1])) 91 | 92 | # Perform the forward pass to calculate everything at once 93 | exprs[-1][1].forward() 94 | 95 | for src_expr, trg_expr in exprs: 96 | yield (src_expr.npvalue(), trg_expr.npvalue()) 97 | 98 | # Perform retrieval, and return both scores and ranked order of candidates 99 | def retrieve(src, db_mtx): 100 | scores = np.dot(db_mtx,src) 101 | ranks = np.argsort(-scores) 102 | return ranks, scores 103 | 104 | # Perform training 105 | start = time.time() 106 | train_mbs = all_time = dev_time = all_tagged = this_sents = this_loss = 0 107 | for ITER in range(100): 108 | random.shuffle(train) 109 | for sid in range(0, len(train), BATCH_SIZE): 
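        # Each minibatch below encodes up to BATCH_SIZE source/target pairs with the two
        # BiLSTMs, scores every source against every target in the batch by dot product,
        # and applies a hinge loss that ranks each true pair above the other candidates
        # (see calc_loss above).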
110 | my_size = min(BATCH_SIZE, len(train)-sid) 111 | train_mbs += 1 112 | if train_mbs % int(1000/BATCH_SIZE) == 0: 113 | trainer.status() 114 | print("loss/sent=%.4f, sent/sec=%.4f" % (this_loss / this_sents, (train_mbs * BATCH_SIZE) / (time.time() - start - dev_time)), file=sys.stderr) 115 | this_loss = this_sents = 0 116 | # train on the minibatch 117 | loss_exp = calc_loss(train[sid:sid+BATCH_SIZE]) 118 | this_loss += loss_exp.scalar_value() 119 | this_sents += BATCH_SIZE 120 | loss_exp.backward() 121 | trainer.update() 122 | # Perform evaluation 123 | dev_start = time.time() 124 | rec_at_1, rec_at_5, rec_at_10 = 0, 0, 0 125 | reps = list(index_corpus(dev)) 126 | trg_mtx = np.stack([trg for src, trg in reps]) 127 | for i, (src, trg) in enumerate(reps): 128 | ranks, scores = retrieve(src, trg_mtx) 129 | if ranks[0] == i: rec_at_1 += 1 130 | if i in ranks[:5]: rec_at_5 += 1 131 | if i in ranks[:10]: rec_at_10 += 1 132 | dev_time += time.time()-dev_start 133 | print("epoch %r: dev recall@1=%.2f%% recall@5=%.2f%% recall@10=%.2f%%" % (ITER, rec_at_1/len(dev)*100, rec_at_5/len(dev)*100, rec_at_10/len(dev)*100)) 134 | -------------------------------------------------------------------------------- /08-condlm/bleu.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import Counter 3 | import numpy 4 | import sys 5 | 6 | # written by Adam Lopez 7 | 8 | # Collect BLEU-relevant statistics for a single hypothesis/reference pair. 9 | # Return value is a generator yielding: 10 | # (c, r, numerator1, denominator1, ... numerator4, denominator4) 11 | # Summing the columns across calls to this function on an entire corpus will 12 | # produce a vector of statistics that can be used to compute BLEU (below) 13 | def bleu_stats(hypothesis, reference): 14 | stats = [] 15 | stats.append(len(hypothesis)) 16 | stats.append(len(reference)) 17 | for n in xrange(1,5): 18 | s_ngrams = Counter([tuple(hypothesis[i:i+n]) for i in xrange(len(hypothesis)+1-n)]) 19 | r_ngrams = Counter([tuple(reference[i:i+n]) for i in xrange(len(reference)+1-n)]) 20 | stats.append(max([sum((s_ngrams & r_ngrams).values()), 0])) 21 | stats.append(max([len(hypothesis)+1-n, 0])) 22 | return stats 23 | 24 | # Compute BLEU from collected statistics obtained by call(s) to bleu_stats 25 | def bleu(stats): 26 | if len(filter(lambda x: x==0, stats)) > 0: 27 | return 0 28 | (c, r) = stats[:2] 29 | log_bleu_prec = sum([math.log(float(x)/y) for x,y in zip(stats[2::2],stats[3::2])]) / 4. 30 | return math.exp(min([0, 1-float(r)/c]) + log_bleu_prec) 31 | 32 | if __name__=='__main__': 33 | stats = numpy.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) 34 | for hyp, ref in zip(open(sys.argv[1], 'r'), open(sys.argv[2], 'r')): 35 | hyp, ref = (hyp.strip().split(), ref.strip().split()) 36 | stats += numpy.array(bleu_stats(hyp, ref)) 37 | print "%.2f" % (100*bleu(stats)) -------------------------------------------------------------------------------- /09-attention/plot_attention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | 5 | from matplotlib.font_manager import FontProperties 6 | from matplotlib import rcParams 7 | import pdb as pdb 8 | import matplotlib.pyplot as plt 9 | import six 10 | 11 | 12 | # if you are outputting cjk, matplotlib needs to first load a cjk font. 
13 | # you can figure out how to find a non-latin font on your system here: 14 | # > https://matplotlib.org/users/text_props.html#text-with-non-latin-glyphs 15 | # for example 16 | # 17 | # 1. run in terminal 18 | # $ fc-list :lang=ja family 19 | # -> displays "MS Gothic" as one of the options 20 | # 21 | # 2. add to code here: 22 | # matplotlib.rcParams['font.family'].insert(0, 'MS Gothic') 23 | 24 | def plot_attention(src_words, trg_words, attention_matrix, file_name=None): 25 | """This takes in source and target words and an attention matrix (in numpy format) 26 | and prints a visualization of this to a file. 27 | :param src_words: a list of words in the source 28 | :param trg_words: a list of target words 29 | :param attention_matrix: a two-dimensional numpy array of values between zero and one, 30 | where rows correspond to source words, and columns correspond to target words 31 | :param file_name: the name of the file to which we write the attention 32 | """ 33 | fig, ax = plt.subplots() 34 | #a lazy, rough, approximate way of making the image large enough 35 | fig.set_figwidth(int(len(trg_words)*.6)) 36 | 37 | # put the major ticks at the middle of each cell 38 | ax.set_xticks(np.arange(attention_matrix.shape[1]) + 0.5, minor=False) 39 | ax.set_yticks(np.arange(attention_matrix.shape[0]) + 0.5, minor=False) 40 | ax.invert_yaxis() 41 | 42 | # label axes by words 43 | ax.set_xticklabels(trg_words, minor=False) 44 | ax.set_yticklabels(src_words, minor=False) 45 | ax.xaxis.tick_top() 46 | plt.setp(ax.get_xticklabels(), rotation=50, horizontalalignment='right') 47 | # draw the heatmap 48 | plt.pcolor(attention_matrix, cmap=plt.cm.Blues, vmin=0, vmax=1) 49 | plt.colorbar() 50 | 51 | if file_name != None: 52 | plt.savefig(file_name, dpi=100) 53 | else: 54 | plt.show() 55 | plt.close() 56 | 57 | -------------------------------------------------------------------------------- /10-structured/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | 4 | from collections import defaultdict 5 | import random 6 | import math 7 | import sys 8 | import argparse 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | # format of files: each line is "word1|tag1 word2|tag2 ..." 
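#
# Overview: a simple BiLSTM tagger. Each word is embedded, a one-layer BiLSTM produces a
# contextual vector per token, and a per-token affine + softmax layer scores the tags;
# training minimizes the summed per-token negative log-likelihood.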
14 | train_file = "../data/tags/train.txt" 15 | dev_file = "../data/tags/dev.txt" 16 | 17 | w2i = defaultdict(lambda: len(w2i)) 18 | t2i = defaultdict(lambda: len(t2i)) 19 | 20 | 21 | def read(fname): 22 | """ 23 | Read tagged file 24 | """ 25 | with open(fname, "r") as f: 26 | for line in f: 27 | words, tags = [], [] 28 | for wt in line.strip().split(): 29 | w, t = wt.split('|') 30 | words.append(w2i[w]) 31 | tags.append(t2i[t]) 32 | yield (words, tags) 33 | 34 | 35 | # Read the data 36 | train = list(read(train_file)) 37 | unk_word = w2i[""] 38 | w2i = defaultdict(lambda: unk_word, w2i) 39 | unk_tag = t2i[""] 40 | t2i = defaultdict(lambda: unk_tag, t2i) 41 | nwords = len(w2i) 42 | ntags = len(t2i) 43 | dev = list(read(dev_file)) 44 | 45 | # DyNet Starts 46 | model = dy.Model() 47 | trainer = dy.AdamTrainer(model) 48 | 49 | # Model parameters 50 | EMBED_SIZE = 64 51 | HIDDEN_SIZE = 128 52 | 53 | # Lookup parameters for word embeddings 54 | LOOKUP = model.add_lookup_parameters((nwords, EMBED_SIZE)) 55 | 56 | # Word-level BiLSTM 57 | LSTM = dy.BiRNNBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model, dy.LSTMBuilder) 58 | 59 | # Word-level softmax 60 | W_sm = model.add_parameters((ntags, HIDDEN_SIZE)) 61 | b_sm = model.add_parameters(ntags) 62 | 63 | 64 | # Calculate the scores for one example 65 | def calc_scores(words): 66 | dy.renew_cg() 67 | 68 | # Transduce all batch elements with an LSTM 69 | word_reps = LSTM.transduce([LOOKUP[x] for x in words]) 70 | 71 | # Softmax scores 72 | W = dy.parameter(W_sm) 73 | b = dy.parameter(b_sm) 74 | scores = [dy.affine_transform([b, W, x]) for x in word_reps] 75 | 76 | return scores 77 | 78 | 79 | # Calculate MLE loss for one example 80 | def calc_loss(scores, tags): 81 | losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)] 82 | return dy.esum(losses) 83 | 84 | 85 | # Calculate number of tags correct for one example 86 | def calc_correct(scores, tags): 87 | correct = [np.argmax(score.npvalue()) == tag for score, tag in zip(scores, tags)] 88 | return sum(correct) 89 | 90 | 91 | # Perform training 92 | for ITER in range(100): 93 | random.shuffle(train) 94 | start = time.time() 95 | this_sents = this_words = this_loss = this_correct = 0 96 | for sid in range(0, len(train)): 97 | this_sents += 1 98 | if this_sents % int(1000) == 0: 99 | print("train loss/word=%.4f, acc=%.2f%%, word/sec=%.4f" % ( 100 | this_loss / this_words, 100 * this_correct / this_words, this_words / (time.time() - start)), 101 | file=sys.stderr) 102 | # train on the example 103 | words, tags = train[sid] 104 | scores = calc_scores(words) 105 | loss_exp = calc_loss(scores, tags) 106 | this_correct += calc_correct(scores, tags) 107 | this_loss += loss_exp.scalar_value() 108 | this_words += len(words) 109 | loss_exp.backward() 110 | trainer.update() 111 | # Perform evaluation 112 | start = time.time() 113 | this_sents = this_words = this_loss = this_correct = 0 114 | for words, tags in dev: 115 | this_sents += 1 116 | scores = calc_scores(words) 117 | loss_exp = calc_loss(scores, tags) 118 | this_correct += calc_correct(scores, tags) 119 | this_loss += loss_exp.scalar_value() 120 | this_words += len(words) 121 | print("dev loss/word=%.4f, acc=%.2f%%, word/sec=%.4f" % ( 122 | this_loss / this_words, 100 * this_correct / this_words, this_words / (time.time() - start)), file=sys.stderr) 123 | -------------------------------------------------------------------------------- /12-transitionparsing/oracle.py: 
-------------------------------------------------------------------------------- 1 | import pdb 2 | class Word: 3 | def __init__(self, word, location, head = -1): 4 | self.word = word 5 | self.location = location 6 | self.head = head 7 | self.rightmost_child = -1 8 | 9 | #abreviated conll file like gold.dev.txt 10 | def read_abrv(file_name): 11 | f = [l.strip() for l in open(file_name)] 12 | sents = [] 13 | sent = [] 14 | loc = 0 15 | for line in f: 16 | if line != '': 17 | line = line.split() 18 | word = line[0] 19 | head = int(line[1]) 20 | sent.append(Word(word, loc, head)) 21 | loc += 1 22 | else: 23 | for word in sent: 24 | if word.location > sent[word.head].rightmost_child: 25 | sent[word.head].rightmost_child = word.location 26 | sents.append(sent) 27 | sent = [] 28 | loc = 0 29 | return sents 30 | 31 | sentences = read_abrv('../data/parsing/gold.txt') 32 | actions_for_sents = [] 33 | f = open('../data/parsing/output.txt', 'w') 34 | for sent in sentences: 35 | stack, buffer = [], [] 36 | acts = [] 37 | for word in sent: 38 | buffer.append(word) 39 | buffer = list(reversed(buffer)) 40 | while len(buffer) > 0 or len(stack) > 1: 41 | if len(stack) < 2: 42 | stack.append(buffer.pop()) 43 | acts.append('SHIFT') 44 | elif stack[-1].head == stack[-2].location and (len(buffer) == 0 or stack[-1].rightmost_child < buffer[-1].location or stack[-2].rightmost_child == -1): 45 | acts.append('REDUCE_R') 46 | stack.pop() 47 | elif stack[-2].head == stack[-1].location and (len(buffer) == 0 or stack[-2].rightmost_child < buffer[-1].location or stack[-2].rightmost_child == -1): 48 | acts.append('REDUCE_L') 49 | temp = stack.pop() 50 | stack.pop() 51 | stack.append(temp) 52 | elif len(buffer) > 0: 53 | stack.append(buffer.pop()) 54 | acts.append('SHIFT') 55 | else: 56 | break 57 | actions_for_sents.append(acts) 58 | f.write(' '.join([s.word for s in sent]) + ' ||| ' + ' '.join(acts) + '\n') 59 | 60 | f.close() 61 | 62 | -------------------------------------------------------------------------------- /12-transitionparsing/tree_parser.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | import codecs 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | from tree import Tree 9 | 10 | def read_dataset(filename): 11 | return [Tree.from_sexpr(line.strip()) for line in codecs.open(filename,"r")] 12 | 13 | def get_vocabs(trees): 14 | label_vocab = Counter() 15 | word_vocab = Counter() 16 | for tree in trees: 17 | label_vocab.update([n.label for n in tree.nonterms()]) 18 | word_vocab.update([l.label for l in tree.leaves()]) 19 | labels = [x for x,c in label_vocab.iteritems() if c > 0] 20 | words = ["_UNK_"] + [x for x,c in word_vocab.iteritems() if c > 0] 21 | l2i = {l:i for i,l in enumerate(labels)} 22 | w2i = {w:i for i,w in enumerate(words)} 23 | return l2i, w2i, labels, words 24 | 25 | train = read_dataset("../data/parsing/trees/train.txt") 26 | dev = read_dataset("../data/parsing/trees/dev.txt") 27 | 28 | l2i, w2i, i2l, i2w = get_vocabs(train) 29 | ntags = len(l2i) 30 | nwords = len(w2i) 31 | 32 | # Socher-style Tree RNN 33 | class TreeRNNBuilder(object): 34 | def __init__(self, model, word_vocab, hdim): 35 | self.W = model.add_parameters((hdim, 2*hdim)) 36 | self.E = model.add_lookup_parameters((len(word_vocab),hdim)) 37 | self.w2i = word_vocab 38 | 39 | def expr_for_tree(self, tree): 40 | if tree.isleaf(): 41 | return self.E[self.w2i.get(tree.label,0)] 42 | if len(tree.children) == 1: 
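# unary node: its single child must be a leaf (a pre-terminal over one word), so pass the child's embedding up without composition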
43 | assert(tree.children[0].isleaf()) 44 | expr = self.expr_for_tree(tree.children[0]) 45 | return expr 46 | assert(len(tree.children) == 2),tree.children[0] 47 | e1 = self.expr_for_tree(tree.children[0]) 48 | e2 = self.expr_for_tree(tree.children[1]) 49 | W = dy.parameter(self.W) 50 | expr = dy.tanh(W*dy.concatenate([e1,e2])) 51 | return expr 52 | 53 | # Tai-style Tree LSTM 54 | class TreeLSTMBuilder(object): 55 | def __init__(self, model, word_vocab, wdim, hdim): 56 | self.WS = [model.add_parameters((hdim, wdim)) for _ in "iou"] 57 | self.US = [model.add_parameters((hdim, 2*hdim)) for _ in "iou"] 58 | self.UFS =[model.add_parameters((hdim, hdim)) for _ in "ff"] 59 | self.BS = [model.add_parameters(hdim) for _ in "iouf"] 60 | self.E = model.add_lookup_parameters((len(word_vocab),wdim)) 61 | self.w2i = word_vocab 62 | 63 | def expr_for_tree(self, tree): 64 | if tree.isleaf(): 65 | return self.E[self.w2i.get(tree.label,0)] 66 | if len(tree.children) == 1: 67 | assert(tree.children[0].isleaf()) 68 | emb = self.expr_for_tree(tree.children[0]) 69 | Wi,Wo,Wu = [dy.parameter(w) for w in self.WS] 70 | bi,bo,bu,_ = [dy.parameter(b) for b in self.BS] 71 | i = dy.logistic(Wi*emb + bi) 72 | o = dy.logistic(Wo*emb + bo) 73 | u = dy.tanh( Wu*emb + bu) 74 | c = dy.cmult(i,u) 75 | expr = dy.cmult(o,dy.tanh(c)) 76 | return expr 77 | assert(len(tree.children) == 2),tree.children[0] 78 | e1 = self.expr_for_tree(tree.children[0]) 79 | e2 = self.expr_for_tree(tree.children[1]) 80 | Ui,Uo,Uu = [dy.parameter(u) for u in self.US] 81 | Uf1,Uf2 = [dy.parameter(u) for u in self.UFS] 82 | bi,bo,bu,bf = [dy.parameter(b) for b in self.BS] 83 | e = dy.concatenate([e1,e2]) 84 | i = dy.logistic(Ui*e + bi) 85 | o = dy.logistic(Uo*e + bo) 86 | f1 = dy.logistic(Uf1*e1 + bf) 87 | f2 = dy.logistic(Uf2*e2 + bf) 88 | u = dy.tanh( Uu*e + bu) 89 | c = dy.cmult(i,u) + dy.cmult(f1,e1) + dy.cmult(f2,e2) 90 | h = dy.cmult(o,dy.tanh(c)) 91 | expr = h 92 | return expr 93 | 94 | # Start DyNet and define trainer 95 | model = dy.Model() 96 | trainer = dy.AdamTrainer(model) 97 | 98 | # Define the model 99 | EMB_SIZE = 64 100 | HID_SIZE = 64 101 | # builder = TreeRNNBuilder(model, w2i, HID_SIZE) 102 | builder = TreeLSTMBuilder(model, w2i, HID_SIZE, EMB_SIZE) 103 | W_sm = model.add_parameters((ntags, HID_SIZE)) # Softmax weights 104 | b_sm = model.add_parameters((ntags)) # Softmax bias 105 | 106 | # A function to calculate scores for one value 107 | def calc_scores(tree): 108 | dy.renew_cg() 109 | emb = builder.expr_for_tree(tree) 110 | W_sm_exp = dy.parameter(W_sm) 111 | b_sm_exp = dy.parameter(b_sm) 112 | return W_sm_exp * emb + b_sm_exp 113 | 114 | for ITER in range(100): 115 | # Perform training 116 | random.shuffle(train) 117 | train_loss = 0.0 118 | start = time.time() 119 | for tree in train: 120 | my_loss = dy.hinge(calc_scores(tree), l2i[tree.label]) 121 | # my_loss = dy.pickneglogsoftmax(calc_scores(tree), l2i[tree.label]) 122 | train_loss += my_loss.value() 123 | my_loss.backward() 124 | trainer.update() 125 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 126 | # Perform testing 127 | test_correct = 0.0 128 | for tree in dev: 129 | scores = calc_scores(tree).npvalue() 130 | predict = np.argmax(scores) 131 | if predict == l2i[tree.label]: 132 | test_correct += 1 133 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 134 | -------------------------------------------------------------------------------- /13-graphparsing/biaffine_parser.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | import random 4 | 5 | start = time.time() 6 | 7 | from collections import Counter, defaultdict 8 | from biaffine import DeepBiaffineAttentionDecoder 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | # format of files: each line is "word1/tag2 word2/tag2 ..." 14 | train_file = "../data/parsing/graph/ptb_train.txt" 15 | test_file = "../data/parsing//graph/ptb_dev.txt" 16 | 17 | w2i = defaultdict(lambda: len(w2i)) 18 | t2i = defaultdict(lambda: len(t2i)) 19 | UNK = w2i[""] 20 | 21 | def read(fname): 22 | with open(fname, "r") as fh: 23 | for line in fh: 24 | tokens = line.strip().split() 25 | num_tokens = len(tokens) 26 | assert num_tokens % 3 == 0 27 | sent = [] 28 | labels = [] 29 | heads = [] 30 | for i in range(num_tokens / 3): 31 | sent.append(w2i[tokens[3 * i]]) 32 | labels.append(t2i[tokens[3 * i + 1]]) 33 | heads.append(int(tokens[3 * i + 2])) 34 | yield (sent, labels, heads) 35 | 36 | 37 | train = list(read(train_file)) 38 | w2i = defaultdict(lambda: UNK, w2i) 39 | dev = list(read(test_file)) 40 | nwords = len(w2i) 41 | ntags = len(t2i) 42 | 43 | # DyNet Starts 44 | 45 | model = dy.Model() 46 | trainer = dy.AdamTrainer(model) 47 | 48 | # Lookup parameters for word embeddings 49 | EMB_SIZE = 32 50 | HID_SIZE = 64 51 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 52 | fwdLSTM = dy.SimpleRNNBuilder(1, EMB_SIZE, HID_SIZE, model) # Forward LSTM 53 | bwdLSTM = dy.SimpleRNNBuilder(1, EMB_SIZE, HID_SIZE, model) # Backward LSTM 54 | 55 | biaffineParser = DeepBiaffineAttentionDecoder(model, ntags, src_ctx_dim=HID_SIZE * 2, 56 | n_arc_mlp_units=64, n_label_mlp_units=32) 57 | 58 | def calc_loss(words, labels, heads): 59 | dy.renew_cg() 60 | word_embs = [dy.lookup(W_emb, x) for x in words] 61 | fwd_init = fwdLSTM.initial_state() 62 | fwd_embs = fwd_init.transduce(word_embs) 63 | bwd_init = bwdLSTM.initial_state() 64 | bwd_embs = bwd_init.transduce(reversed(word_embs)) 65 | src_encodings = [dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1)) for f, b in zip(fwd_embs, reversed(bwd_embs))] 66 | return biaffineParser.decode_loss(src_encodings, ([heads], [labels])) 67 | 68 | 69 | def calc_acc(words, labels, heads): 70 | dy.renew_cg() 71 | word_embs = [dy.lookup(W_emb, x) for x in words] 72 | fwd_init = fwdLSTM.initial_state() 73 | fwd_embs = fwd_init.transduce(word_embs) 74 | bwd_init = bwdLSTM.initial_state() 75 | bwd_embs = bwd_init.transduce(reversed(word_embs)) 76 | src_encodings = [dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1)) for f, b in zip(fwd_embs, reversed(bwd_embs))] 77 | pred_heads, pred_labels = biaffineParser.decoding(src_encodings) 78 | return biaffineParser.cal_accuracy(pred_heads, pred_labels, heads, labels) 79 | 80 | for ITER in range(100): 81 | # Perform training 82 | random.shuffle(train) 83 | train_loss = 0.0 84 | start = time.time() 85 | for words, labels, heads in train: 86 | loss = calc_loss(words, labels, heads) 87 | train_loss += loss.value() 88 | loss.backward() 89 | trainer.update() 90 | 91 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start)) 92 | 93 | correct_heads = 0. 94 | correct_labels = 0. 95 | total = 0. 
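# calc_acc returns per-sentence head/label accuracies; weighting each by sentence length and dividing by the total token count below yields corpus-level (token-weighted) accuracy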
96 | for words, labels, heads in dev: 97 | head_acc, label_acc = calc_acc(words, labels, heads) 98 | correct_heads += head_acc * len(words) 99 | correct_labels += label_acc * len(words) 100 | total += len(words) 101 | print("iter %r: test head_acc=%.4f, label_acc=%.4f" % (ITER, correct_heads * 100 / total, 102 | correct_labels * 100 / total)) 103 | 104 | -------------------------------------------------------------------------------- /13-graphparsing/mst.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | 4 | 5 | def mst(scores): 6 | """ 7 | Chu-Liu-Edmonds' algorithm for finding minimum spanning arborescence in graphs. 8 | Calculates the arborescence with node 0 as root. 9 | Source: https://github.com/chantera/biaffineparser/blob/master/utils.py 10 | 11 | :param scores: `scores[i][j]` is the weight of edge from node `i` to node `j` 12 | :returns an array containing the head node (node with edge pointing to current node) for each node, 13 | with head[0] fixed as 0 14 | """ 15 | length = scores.shape[0] 16 | scores = scores * (1 - np.eye(length)) 17 | heads = np.argmax(scores, axis=1) 18 | heads[0] = 0 19 | tokens = np.arange(1, length) 20 | roots = np.where(heads[tokens] == 0)[0] + 1 21 | if len(roots) < 1: 22 | root_scores = scores[tokens, 0] 23 | head_scores = scores[tokens, heads[tokens]] 24 | new_root = tokens[np.argmax(root_scores / head_scores)] 25 | heads[new_root] = 0 26 | elif len(roots) > 1: 27 | root_scores = scores[roots, 0] 28 | scores[roots, 0] = 0 29 | new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1 30 | new_root = roots[np.argmin( 31 | scores[roots, new_heads] / root_scores)] 32 | heads[roots] = new_heads 33 | heads[new_root] = 0 34 | 35 | edges = defaultdict(set) 36 | vertices = set((0,)) 37 | for dep, head in enumerate(heads[tokens]): 38 | vertices.add(dep + 1) 39 | edges[head].add(dep + 1) 40 | for cycle in _find_cycle(vertices, edges): 41 | dependents = set() 42 | to_visit = set(cycle) 43 | while len(to_visit) > 0: 44 | node = to_visit.pop() 45 | if node not in dependents: 46 | dependents.add(node) 47 | to_visit.update(edges[node]) 48 | cycle = np.array(list(cycle)) 49 | old_heads = heads[cycle] 50 | old_scores = scores[cycle, old_heads] 51 | non_heads = np.array(list(dependents)) 52 | scores[np.repeat(cycle, len(non_heads)), 53 | np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0 54 | new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1 55 | new_scores = scores[cycle, new_heads] / old_scores 56 | change = np.argmax(new_scores) 57 | changed_cycle = cycle[change] 58 | old_head = old_heads[change] 59 | new_head = new_heads[change] 60 | heads[changed_cycle] = new_head 61 | edges[new_head].add(changed_cycle) 62 | edges[old_head].remove(changed_cycle) 63 | 64 | return heads 65 | 66 | 67 | def _find_cycle(vertices, edges): 68 | """ 69 | https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm # NOQA 70 | https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py # NOQA 71 | """ 72 | _index = [0] 73 | _stack = [] 74 | _indices = {} 75 | _lowlinks = {} 76 | _onstack = defaultdict(lambda: False) 77 | _SCCs = [] 78 | 79 | def _strongconnect(v): 80 | _indices[v] = _index[0] 81 | _lowlinks[v] = _index[0] 82 | _index[0] += 1 83 | _stack.append(v) 84 | _onstack[v] = True 85 | 86 | for w in edges[v]: 87 | if w not in _indices: 88 | _strongconnect(w) 89 | _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) 
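# if w is already on the stack it belongs to the current strongly connected component, so cap v's lowlink by w's index (the standard Tarjan update)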
90 | elif _onstack[w]: 91 | _lowlinks[v] = min(_lowlinks[v], _indices[w]) 92 | 93 | if _lowlinks[v] == _indices[v]: 94 | SCC = set() 95 | while True: 96 | w = _stack.pop() 97 | _onstack[w] = False 98 | SCC.add(w) 99 | if not (w != v): 100 | break 101 | _SCCs.append(SCC) 102 | 103 | for v in vertices: 104 | if v not in _indices: 105 | _strongconnect(v) 106 | 107 | return [SCC for SCC in _SCCs if len(SCC) > 1] 108 | 109 | -------------------------------------------------------------------------------- /14-semparsing/ucca/.appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - PYTHON: "C:\\Python35\\python.exe" 4 | - PYTHON: "C:\\Python36\\python.exe" 5 | 6 | install: 7 | - "%PYTHON% -m pip install -U pip wheel" 8 | - "%PYTHON% setup.py install" 9 | - "%PYTHON% -m spacy download en" 10 | 11 | build: off 12 | 13 | test_script: 14 | - "%PYTHON% -m unittest discover -v" 15 | -------------------------------------------------------------------------------- /14-semparsing/ucca/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__* 2 | *.pyc 3 | -------------------------------------------------------------------------------- /14-semparsing/ucca/.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: "3.6" 4 | addons: 5 | apt: 6 | packages: 7 | - pandoc 8 | install: 9 | - python setup.py install 10 | - python -m spacy download en 11 | env: 12 | global: 13 | - TWINE_USERNAME=danielh 14 | - secure: QrZ/47sh/8WeeTLU37yfhW94bwO2ocsbMMIRebSS9Y+FssrCi9IbSuTp6NliXlJq17rozGtEf9alu9JetE8hnivACGJm0cz2/j3oYaeCxz8sbTpXeEr8JHiDk6MCfCD9VMrpeo04RBmI76BY1mwdCvxQSJEn/NtkI9jjSaqjLCLcaFWD7mTuYefxrPplROQJPu+jcW1snnubntuux1nRxULC3Ge/IRWb4OYajLJcPXiVsdleSNV9avLE2xIPTFZf4cwHpRxZslKgHeyCLk+JoDlL0qneB4UWB/SZF8CHoYvidPJDzG5NHAEgfxSqbUq3DRvgVAPqR0YoQd/MQbPLBN6v1aY2zbqHJtTS1xidnnYIs3gJWVAurx6WjkNc9QYwdN22EPmYDVquW2tZgvi2kHRoJY+gEYylJRY0jOzqYmZUV9WOZeeb2AzgXnVjQubEm0NSYCC3BYjkiSmwpDWTcr/HvCQ+9iOI1OD56F7B6oowzXBP0Z/IClMd9Pb3vs9cRr6di/Vf+ijjUeHQxyKHiv2R2mGnPuR8d/gR538xmbc/RlEt2tycMD25SBAeFdtlUfB5Si8llTSd6YktZzZhkHiaIPBYAVEbrK3832TM7B7sGAa8R6Y8gctP6ccE/kFpSdnFHuENgRu2VZBDx6q8UmkArRLbrCvzmbn658EySkc= 15 | matrix: 16 | - TEST_SUITE=unit 17 | - TEST_SUITE=convert 18 | script: ci/test.sh 19 | deploy: 20 | provider: script 21 | script: ci/deploy.sh 22 | on: 23 | repo: huji-nlp/ucca 24 | tags: true 25 | 26 | -------------------------------------------------------------------------------- /14-semparsing/ucca/README.md: -------------------------------------------------------------------------------- 1 | Universal Conceptual Cognitive Annotation 2 | ============================ 3 | UCCA is a linguistic framework for semantic annotation, whose details 4 | are available at [the following paper](http://www.cs.huji.ac.il/~oabend/papers/ucca_acl.pdf): 5 | 6 | @inproceedings{abend2013universal, 7 | author={Abend, Omri and Rappoport, Ari}, 8 | title={{U}niversal {C}onceptual {C}ognitive {A}nnotation ({UCCA})}, 9 | booktitle={Proc. of ACL}, 10 | month={August}, 11 | year={2013}, 12 | pages={228--238}, 13 | url={http://aclweb.org/anthology/P13-1023} 14 | } 15 | 16 | This Python 3 package provides an API to the UCCA annotation and tools to 17 | manipulate and process it. 
Its main features are conversion between different 18 | representations of UCCA annotations, and rich objects for all of the linguistic 19 | relations which appear in the theoretical framework (see `core`, `layer0`, `layer1` 20 | and `convert` modules under the `ucca` package). 21 | 22 | The `scripts` package contains various utilities for processing passage files. 23 | 24 | 25 | Authors 26 | ------ 27 | * Amit Beka: amit.beka@gmail.com 28 | * Daniel Hershcovich: danielh@cs.huji.ac.il 29 | 30 | 31 | License 32 | ------- 33 | This package is licensed under the GPLv3 or later license. 34 | 35 | [![Build Status](https://travis-ci.org/danielhers/ucca.svg?branch=master)](https://travis-ci.org/danielhers/ucca) 36 | [![Build Status](https://ci.appveyor.com/api/projects/status/github/danielhers/ucca?svg=true)](https://ci.appveyor.com/project/danielh/ucca) 37 | [![PyPI version](https://badge.fury.io/py/UCCA.svg)](https://badge.fury.io/py/UCCA) 38 | -------------------------------------------------------------------------------- /14-semparsing/ucca/actions.py: -------------------------------------------------------------------------------- 1 | COMPOUND = "compound" 2 | 3 | class Labels(object): 4 | def __init__(self, size): 5 | self.size = size # Maximum number of labels, NOT enforced here but by the user 6 | 7 | @property 8 | def all(self): 9 | raise NotImplementedError() 10 | 11 | @all.setter 12 | def all(self, labels): 13 | raise NotImplementedError() 14 | 15 | def save(self, skip=False): 16 | return (None if skip else self.all), self.size 17 | 18 | def load(self, all_size): 19 | self.all, self.size = all_size 20 | 21 | 22 | class Action(dict): 23 | type_to_id = {} 24 | 25 | def __init__(self, action_type, tag=None, orig_edge=None, orig_node=None, oracle=None, id_=None): 26 | self.type = action_type # String 27 | self.tag = tag # Usually the tag of the created edge; but if COMPOUND_SWAP, the distance 28 | self.orig_node = orig_node # Node created by this action, if any (during training) 29 | self.orig_edge = orig_edge # Edge created by this action, if any (during training) 30 | self.node = None # Will be set by State when the node created by this action is known 31 | self.edge = None # Will be set by State when the edge created by this action is known 32 | self.oracle = oracle # Reference to oracle, to inform it of actually created nodes/edges 33 | self.index = None # Index of this action in history 34 | 35 | self.type_id = Action.type_to_id.get(self.type) # Allocate ID for fast comparison 36 | if self.type_id is None: 37 | self.type_id = len(Action.type_to_id) 38 | Action.type_to_id[self.type] = self.type_id 39 | self.id = id_ 40 | super().__init__(action_type=self.type, tag=self.tag) 41 | 42 | def is_type(self, *others): 43 | return self.type_id in (o.type_id for o in others) 44 | 45 | def apply(self): 46 | if self.oracle is not None: 47 | self.oracle.remove(self.orig_edge, self.orig_node) 48 | 49 | def __repr__(self): 50 | return Action.__name__ + "(" + ", ".join(map(str, filter(None, (self.type, self.tag)))) + ")" 51 | 52 | def __str__(self): 53 | s = self.type 54 | if self.tag: 55 | s += "-%s" % self.tag 56 | return s 57 | 58 | def __eq__(self, other): 59 | return self.id == other.id 60 | 61 | def __hash__(self): 62 | return hash(self.id) 63 | 64 | def __call__(self, *args, **kwargs): 65 | return Action(self.type, *args, **kwargs) 66 | 67 | @property 68 | def remote(self): 69 | return self.is_type(Actions.RemoteNode, Actions.LeftRemote, Actions.RightRemote) 70 | 71 | @property 72 | def 
is_swap(self): 73 | return self.is_type(Actions.Swap) 74 | 75 | 76 | class Actions(Labels): 77 | Shift = Action("SHIFT") 78 | Node = Action("NODE") 79 | RemoteNode = Action("REMOTE-NODE") 80 | Implicit = Action("IMPLICIT") 81 | Label = Action("LABEL") 82 | Reduce = Action("REDUCE") 83 | LeftEdge = Action("LEFT-EDGE") 84 | RightEdge = Action("RIGHT-EDGE") 85 | LeftRemote = Action("LEFT-REMOTE") 86 | RightRemote = Action("RIGHT-REMOTE") 87 | Swap = Action("SWAP") 88 | Finish = Action("FINISH") 89 | 90 | def __init__(self, actions=None, size=None): 91 | super().__init__(size=size) 92 | self._all = None 93 | self._ids = None 94 | if actions is not None: 95 | self.all = actions 96 | 97 | def init(self): 98 | # edge and node action will be created as they are returned by the oracle 99 | swap = 'regular' 100 | self.all = [Actions.Reduce, Actions.Shift, Actions.Finish] + \ 101 | (list(map(Actions.Swap, range(1, 3))) if swap == COMPOUND else 102 | [Actions.Swap] if swap else []) + \ 103 | ([Actions.Label] if False else []) 104 | 105 | @property 106 | def all(self): 107 | if self._all is None: 108 | self.init() 109 | return self._all 110 | 111 | @all.setter 112 | def all(self, actions): 113 | self._all = [Action(**a) if isinstance(a, dict) else a for a in actions] 114 | self._ids = {(action.type_id, action.tag): i for i, action in enumerate(self._all)} 115 | for action in self._all: 116 | self.generate_id(action) 117 | 118 | @property 119 | def ids(self): 120 | if self._all is None: 121 | self.init() 122 | return self._ids 123 | 124 | def generate_id(self, action, create=True): 125 | if action.id is None: 126 | key = (action.type_id, action.tag) 127 | action.id = self.ids.get(key) 128 | if create and action.id is None: # New action, add to list 129 | # noinspection PyTypeChecker 130 | action.id = len(self.all) 131 | self.all.append(action(tag=action.tag, id_=action.id)) 132 | self.ids[key] = action.id 133 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ci/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | pip install pypandoc twine 5 | python setup.py sdist 6 | python setup.py bdist_wheel 7 | twine upload --skip-existing dist/* 8 | 9 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ci/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | case "$TEST_SUITE" in 4 | unit) 5 | # unit tests 6 | python -m unittest discover -v || exit 1 7 | PASSAGES=../doc/toy.xml 8 | ;; 9 | convert) 10 | mkdir pickle 11 | curl -L http://www.cs.huji.ac.il/~danielh/ucca/ucca_corpus_pickle.tgz | tar xz -C pickle || curl -L https://www.dropbox.com/s/q4ycn45zlmhuf9k/ucca_corpus_pickle.tgz | tar xz -C pickle 12 | PASSAGES=../pickle/*.pickle 13 | ;; 14 | esac 15 | cd $(dirname $0) 16 | mkdir -p converted 17 | for FORMAT in conll sdp export "export --tree"; do 18 | echo === Evaluating $FORMAT === 19 | if [ $# -lt 1 -o "$FORMAT" = "$1" ]; then 20 | python ../scripts/convert_and_evaluate.py "$PASSAGES" -f $FORMAT | tee "$FORMAT.log" 21 | fi 22 | done -------------------------------------------------------------------------------- /14-semparsing/ucca/doc/short_defs.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/doc/short_defs.pdf -------------------------------------------------------------------------------- /14-semparsing/ucca/doc/toy.xml: -------------------------------------------------------------------------------- [toy.xml: example UCCA passage in standard XML format, 164 lines; XML markup not preserved in this plain-text rendering] -------------------------------------------------------------------------------- /14-semparsing/ucca/runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import oracle 3 | from ucca import diffutil, ioutil, textutil, layer1, evaluation 4 | from pdb import set_trace 5 | 6 | 7 | files = ['../ucca_corpus_pickle/' + f for f in os.listdir('../ucca_corpus_pickle')] 8 | passages = list(ioutil.read_files_and_dirs(files)) 9 | 10 | passage = passages[0] 11 | ora = oracle.Oracle(passage) 12 | set_trace() -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/scripts/__init__.py -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/annotate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | 7 | from ucca.ioutil import file2passage, passage2file 8 | from ucca.textutil import annotate 9 | 10 | desc = """Read UCCA standard format in XML or binary pickle, and write back with POS tags and dependency parse.""" 11 | 12 | 13 | def main(): 14 | argparser = argparse.ArgumentParser(description=desc) 15 | argparser.add_argument("filenames", nargs="+", help="passage file names to annotate") 16 | argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage") 17 | args = argparser.parse_args() 18 | 19 | for pattern in args.filenames: 20 | filenames = glob.glob(pattern) 21 | if not filenames: 22 | raise IOError("Not found: " + pattern) 23 | for filename in filenames: 24 | passage = file2passage(filename) 25 | annotate(passage, verbose=args.verbose, replace=True) 26 | sys.stderr.write("Writing '%s'...\n" % filename) 27 | passage2file(passage, filename, binary=not filename.endswith("xml")) 28 | 29 | sys.exit(0) 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | --------------------------------------------------------------------------------
/14-semparsing/ucca/scripts/convert_and_evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | 7 | from ucca import convert 8 | from ucca.evaluation import evaluate, Scores 9 | from ucca.ioutil import file2passage 10 | 11 | desc = """Parses files in CoNLL-X, SemEval 2015 SDP, NeGra export or text format, 12 | converts to UCCA standard format, converts back to the original format and evaluates. 13 | """ 14 | 15 | 16 | def main(): 17 | argparser = argparse.ArgumentParser(description=desc) 18 | argparser.add_argument("filenames", nargs="+", 19 | help="file names to convert and evaluate") 20 | argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS, 21 | help="input file format") 22 | argparser.add_argument("-T", "--tree", action="store_true", 23 | help="remove multiple parents to get a tree") 24 | argparser.add_argument("-s", "--strict", action="store_true", 25 | help="stop immediately if failed to convert or evaluate a file") 26 | argparser.add_argument("-v", "--verbose", action="store_true", 27 | help="print evaluation results for each file separately") 28 | args = argparser.parse_args() 29 | 30 | converter1 = convert.TO_FORMAT[args.format] 31 | converter2 = convert.FROM_FORMAT[args.format] 32 | scores = [] 33 | for pattern in args.filenames: 34 | filenames = glob.glob(pattern) 35 | if not filenames: 36 | raise IOError("Not found: " + pattern) 37 | for filename in filenames: 38 | sys.stdout.write("\rConverting %s" % filename) 39 | sys.stdout.flush() 40 | ref = file2passage(filename) 41 | try: 42 | guessed = next(converter2(converter1(ref, tree=args.tree), ref.ID)) 43 | scores.append(evaluate(guessed, ref, verbose=args.verbose)) 44 | except Exception as e: 45 | if args.strict: 46 | raise ValueError("Error evaluating conversion of %s" % filename) from e 47 | else: 48 | print("Error evaluating conversion of %s: %s" % (filename, e), file=sys.stderr) 49 | print() 50 | if args.verbose and len(scores) > 1: 51 | print("Aggregated scores:") 52 | Scores.aggregate(scores).print() 53 | 54 | sys.exit(0) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/count_parents_children.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | from collections import Counter, defaultdict 7 | 8 | from ucca.ioutil import file2passage 9 | from ucca import layer1 10 | 11 | desc = """Parses XML files in UCCA standard format, and creates a histogram for the number of parents per unit. 
12 | """ 13 | 14 | 15 | def plot_histogram(counter, label, plot=None): 16 | import matplotlib.pyplot as plt 17 | plt.figure() 18 | nums = list(counter.keys()) 19 | counts = list(counter.values()) 20 | indices = range(len(counts)) 21 | bars = plt.bar(indices, counts, align="center") 22 | plt.xticks(indices, nums) 23 | top = 1.06 * max(counts) 24 | plt.ylim(min(counts), top) 25 | plt.xlabel("number of %s" % label) 26 | plt.ylabel("count") 27 | for bar in bars: 28 | count = bar.get_height() 29 | plt.text(bar.get_x() + bar.get_width() / 2., count, "%.1f%%" % (100.0 * count / sum(counts)), 30 | ha="center", va="bottom") 31 | if plot: 32 | plt.savefig(plot + "histogram_" + label + ".png") 33 | else: 34 | plt.show() 35 | 36 | 37 | def plot_pie(counter, label, plot=None): 38 | import matplotlib.pyplot as plt 39 | plt.figure() 40 | nums = list(counter.keys()) 41 | counts = list(counter.values()) 42 | plt.pie(counts, labels=nums, autopct="%1.1f%%", 43 | counterclock=True, wedgeprops={"edgecolor": "white"}) 44 | plt.axis("equal") 45 | if plot: 46 | plt.savefig(plot + "pie_" + label + ".png") 47 | else: 48 | plt.show() 49 | 50 | 51 | def main(): 52 | argparser = argparse.ArgumentParser(description=desc) 53 | argparser.add_argument("filenames", nargs="+", help="file names to analyze") 54 | argparser.add_argument("-o", "--outfile", default="data/counts_", 55 | help="output file prefix for histogram") 56 | argparser.add_argument("-p", "--plot", default="data/plot_", 57 | help="output file prefix for plot image file") 58 | args = argparser.parse_args() 59 | 60 | histograms = defaultdict(Counter) 61 | for pattern in args.filenames: 62 | for filename in glob.glob(pattern): 63 | sys.stderr.write("Reading passage '%s'...\n" % filename) 64 | passage = file2passage(filename) 65 | for node in passage.layer(layer1.LAYER_ID).all: 66 | if node.ID != "1.1": # Exclude the root node 67 | histograms["parents"][clip(node.incoming, 3)] += 1 68 | histograms["children"][clip(node.outgoing, 7)] += 1 69 | 70 | for label, counter in histograms.items(): 71 | handle = open(args.outfile + label + ".txt", "w", encoding="utf-8") if args.outfile else sys.stdout 72 | handle.writelines(["%s\t%d\n" % (num, count) for num, count in counter.items()]) 73 | if handle is not sys.stdout: 74 | handle.close() 75 | # noinspection PyBroadException 76 | try: 77 | plot_histogram(counter, label, plot=args.plot) 78 | plot_pie(counter, label, plot=args.plot) 79 | except: 80 | pass 81 | 82 | sys.exit(0) 83 | 84 | 85 | def clip(l, m): 86 | return len(l) if len(l) <= m else ">%d" % m 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/evaluate_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | The evaluation software for UCCA layer 1. 4 | """ 5 | 6 | from optparse import OptionParser 7 | 8 | from scripts import ucca_db 9 | from ucca import convert 10 | from ucca.evaluation import evaluate 11 | 12 | 13 | ############################################################################## 14 | # Returns the command line parser. 
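# Hypothetical example invocation (database, host and user names are made up):
#   evaluate_db.py -d ucca.db --host myhost -p 101 -g annotator1 -r gold -f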
15 | ############################################################################## 16 | def cmd_line_parser(): 17 | usage = "usage: %prog [options]\n" 18 | opt_parser = OptionParser(usage=usage) 19 | opt_parser.add_option("--db", "-d", dest="db_filename", 20 | action="store", type="string", 21 | help="the db file name") 22 | opt_parser.add_option("--host", "--hst", dest="host", 23 | action="store", type="string", 24 | help="the host name") 25 | opt_parser.add_option("--pid", "-p", dest="pid", action="store", 26 | type="int", help="the passage ID") 27 | opt_parser.add_option("--from_xids", "-x", dest="from_xids", 28 | action="store_true", help="interpret the ref \ 29 | and the guessed parameters as Xids in the db") 30 | opt_parser.add_option("--guessed", "-g", dest="guessed", action="store", 31 | type="string", help="if a db is defined - \ 32 | the username for the guessed annotation; \ 33 | else - the xml file name for the guessed annotation") 34 | opt_parser.add_option("--ref", "-r", dest="ref", action="store", 35 | type="string", help="if a db is defined - \ 36 | the username for the reference annotation; else - \ 37 | the xml file name for the reference annotation") 38 | opt_parser.add_option("--units", "-u", dest="units", action="store_true", 39 | help="the units the annotations have in common, \ 40 | and those each has separately") 41 | opt_parser.add_option("--fscore", "-f", dest="fscore", action="store_true", 42 | help="outputs the traditional P,R,F \ 43 | instead of the scene structure evaluation") 44 | opt_parser.add_option("--debug", dest="debug", action="store_true", 45 | help="run in debug mode") 46 | opt_parser.add_option("--errors", "-e", dest="errors", action="store_true", 47 | help="prints the error distribution\ 48 | according to its frequency") 49 | return opt_parser 50 | 51 | 52 | def main(): 53 | opt_parser = cmd_line_parser() 54 | (options, args) = opt_parser.parse_args() 55 | if len(args) > 0: 56 | opt_parser.error("all arguments must be flagged") 57 | 58 | if (options.guessed is None) or (options.ref is None) or (options.db_filename is None): 59 | opt_parser.error("missing arguments. type --help for help.") 60 | if options.pid is not None and options.from_xids is not None: 61 | opt_parser.error("inconsistent parameters. \ 62 | you can't have both a pid and from_xids paramters.") 63 | 64 | keys = [options.guessed, options.ref] 65 | if options.from_xids: 66 | xmls = ucca_db.get_by_xids(options.db_filename, options.host, keys) 67 | else: 68 | xmls = ucca_db.get_xml_trees(options.db_filename, options.host, 69 | options.pid, keys) 70 | 71 | guessed, ref = [convert.from_site(x) for x in xmls] 72 | if options.units or options.fscore or options.errors: 73 | evaluate(guessed, ref, 74 | units=options.units, fscore=options.fscore, errors=options.errors, verbose=True) 75 | 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/evaluate_standard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | The evaluation script for UCCA layer 1. 
4 | """ 5 | import sys 6 | from argparse import ArgumentParser 7 | 8 | from ucca import evaluation, constructions, ioutil 9 | 10 | 11 | if __name__ == "__main__": 12 | argparser = ArgumentParser(description="Compare two UCCA passages or two directories containing passage files.") 13 | argparser.add_argument("guessed", help="xml/pickle file name for the guessed annotation, or directory of files") 14 | argparser.add_argument("ref", help="xml/pickle file name for the reference annotation, or directory of files") 15 | argparser.add_argument("-u", "--units", action="store_true", 16 | help="the units the annotations have in common, and those each has separately") 17 | argparser.add_argument("-f", "--fscore", action="store_true", 18 | help="outputs the traditional P,R,F instead of the scene structure evaluation") 19 | argparser.add_argument("-e", "--errors", action="store_true", 20 | help="prints the error distribution according to its frequency") 21 | argparser.add_argument("--no-normalize", dest="normalize", action="store_false", 22 | help="do not normalize passages before evaluation") 23 | argparser.add_argument("--out-file", help="file to write results for each evaluated passage to, in CSV format") 24 | argparser.add_argument("--summary-file", help="file to write aggregated results to, in CSV format") 25 | group = argparser.add_mutually_exclusive_group() 26 | group.add_argument("-v", "--verbose", action="store_true", 27 | help="prints the results for every single pair (always true if there is only one pair)") 28 | group.add_argument("-q", "--quiet", action="store_true", help="do not print anything") 29 | constructions.add_argument(argparser) 30 | args = argparser.parse_args() 31 | 32 | if not (args.units or args.fscore or args.errors): 33 | argparser.error("At least one of -u, -f or -e is required.") 34 | 35 | guessed, ref = [ioutil.read_files_and_dirs((x,)) for x in (args.guessed, args.ref)] 36 | if len(guessed) != len(ref): 37 | raise ValueError("Number of passages to compare does not match: %d != %d" % (len(guessed), len(ref))) 38 | if len(guessed) > 1: 39 | guessed_by_id = {} 40 | for g in guessed: 41 | sys.stdout.write("\rReading %s..." 
% g.ID) 42 | sys.stdout.flush() 43 | guessed_by_id[g.ID] = g 44 | ids = [p.ID for p in ref] 45 | try: 46 | guessed = [guessed_by_id[i] for i in ids] 47 | except KeyError as e: 48 | raise ValueError("Passage IDs do not match") from e 49 | results = [] 50 | for g, r in zip(guessed, ref): 51 | if len(guessed) > 1: 52 | sys.stdout.write("\rEvaluating %s%s" % (g.ID, ":" if args.verbose else "...")) 53 | sys.stdout.flush() 54 | if args.verbose: 55 | print() 56 | result = evaluation.evaluate(g, r, constructions=args.constructions, units=args.units, fscore=args.fscore, 57 | errors=args.errors, verbose=args.verbose or len(guessed) == 1, 58 | normalize=args.normalize) 59 | if args.verbose: 60 | print("Average labeled F1 score: %.3f\n" % result.average_f1()) 61 | results.append(result) 62 | summary = evaluation.Scores.aggregate(results) 63 | if len(results) > 1: 64 | if args.verbose: 65 | print("Aggregated scores:") 66 | else: 67 | print(end="\r") 68 | if not args.quiet: 69 | summary.print() 70 | if not args.quiet: 71 | print("Average labeled F1 score: %.3f" % summary.average_f1()) 72 | args_constructions = summary.evaluators 73 | if args.out_file: 74 | with open(args.out_file, "w", encoding="utf-8") as f: 75 | print(",".join(summary.titles()), file=f) 76 | for result in results: 77 | print(",".join(result.fields()), file=f) 78 | if args.summary_file: 79 | with open(args.summary_file, "w", encoding="utf-8") as f: 80 | print(",".join(summary.titles()), file=f) 81 | print(",".join(summary.fields()), file=f) 82 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/find_constructions.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from ucca import constructions 4 | from ucca.ioutil import read_files_and_dirs 5 | 6 | if __name__ == "__main__": 7 | argparser = ArgumentParser(description="Extract linguistic constructions from UCCA corpus.") 8 | argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names") 9 | constructions.add_argument(argparser, False) 10 | argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage") 11 | args = argparser.parse_args() 12 | for passage in read_files_and_dirs(args.passages): 13 | if args.verbose: 14 | print("%s:" % passage.ID) 15 | extracted = constructions.extract_edges(passage, constructions=args.constructions, verbose=args.verbose) 16 | if any(extracted.values()): 17 | if not args.verbose: 18 | print("%s:" % passage.ID) 19 | for construction, edges in extracted.items(): 20 | if edges: 21 | print(" %s:" % construction.description) 22 | for edge in edges: 23 | print(" %s [%s %s]" % (edge, edge.tag, edge.child)) 24 | print() 25 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/join_passages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | 7 | from collections import defaultdict 8 | 9 | import ucca.convert 10 | from ucca.ioutil import passage2file, file2passage 11 | 12 | desc = """Parses XML/pickle files in UCCA standard format, and writes a single passage. 
13 | """ 14 | 15 | 16 | def main(): 17 | argparser = argparse.ArgumentParser(description=desc) 18 | argparser.add_argument("filenames", nargs="+", help="passage file names to join") 19 | argparser.add_argument("-o", "--outdir", default=".", help="output directory") 20 | argparser.add_argument("-p", "--prefix", default="", help="output filename prefix") 21 | argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs") 22 | argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)") 23 | argparser.add_argument("-j", "--join-by-prefix", action="store_true", 24 | help="join each set of passages whose IDs share all but the last 3 characters") 25 | args = argparser.parse_args() 26 | 27 | passages = [file2passage(filename) for pattern in args.filenames for filename in sorted(glob.glob(pattern))] 28 | if args.join_by_prefix: 29 | subsets = defaultdict(list) 30 | for passage in passages: 31 | subsets[passage.ID[:-3]].append(passage) 32 | else: 33 | subsets = {passages[0].ID: passages} 34 | for passage_id, subset in sorted(subsets.items()): 35 | sys.stderr.write("Joining passages " + ", ".join(passage.ID for passage in subset) + "\n") 36 | joined = ucca.convert.join_passages(passages, passage_id=passage_id, remarks=args.remarks) 37 | outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID, "pickle" if args.binary else "xml") 38 | sys.stderr.write("Writing joined passage file '%s'...\n" % outfile) 39 | passage2file(joined, outfile, args.binary) 40 | 41 | sys.exit(0) 42 | 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/join_sdp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import os 6 | import sys 7 | 8 | desc = """Combines several SDP parsed files to one. 
9 | """ 10 | 11 | 12 | def main(): 13 | argparser = argparse.ArgumentParser(description=desc) 14 | argparser.add_argument("filenames", nargs="+", 15 | help="SDP file names to join") 16 | argparser.add_argument("-o", "--outfile", 17 | help="output filename (standard output if unspecified)") 18 | argparser.add_argument("-H", "--header", default="SDP 2015", 19 | help="first line in the file, not including prefix") 20 | argparser.add_argument("-p", "--prefix", default="#", 21 | help="prefix for comment lines") 22 | args = argparser.parse_args() 23 | 24 | lines = [args.prefix + args.header + "\n"] 25 | for pattern in args.filenames: 26 | filenames = sorted(glob.glob(pattern)) 27 | if not filenames: 28 | raise IOError("Not found: " + pattern) 29 | for filename in filenames: 30 | base = os.path.basename(os.path.splitext(filename)[0]) 31 | lines.append(args.prefix + base + "\n") 32 | with open(filename, encoding="utf-8") as f: 33 | lines += f.readlines() 34 | f = sys.stdout if args.outfile is None else open(args.outfile, "w", encoding="utf-8") 35 | f.writelines(lines) 36 | if args.outfile is not None: 37 | f.close() 38 | 39 | sys.exit(0) 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/pickle_to_standard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import sys 5 | 6 | from ucca.ioutil import file2passage, passage2file 7 | 8 | desc = """Parses pickle files in UCCA standard format, and writes them in XML format. 9 | """ 10 | 11 | 12 | def main(): 13 | argparser = argparse.ArgumentParser(description=desc) 14 | argparser.add_argument('filenames', nargs='+', help="pickle file names to convert") 15 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 16 | args = argparser.parse_args() 17 | 18 | for filename in args.filenames: 19 | sys.stderr.write("Reading passage '%s'...\n" % filename) 20 | passage = file2passage(filename) 21 | basename = os.path.splitext(os.path.basename(filename))[0] 22 | outfile = args.outdir + os.path.sep + basename + ".xml" 23 | sys.stderr.write("Writing file '%s'...\n" % outfile) 24 | passage2file(passage, outfile) 25 | 26 | sys.exit(0) 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/site_to_standard.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python3 2 | 3 | import argparse 4 | import pickle 5 | import sqlite3 6 | import sys 7 | from xml.etree.ElementTree import ElementTree, tostring, fromstring 8 | 9 | import ucca.convert 10 | from ucca.textutil import indent_xml 11 | 12 | desc = """Parses an XML in UCCA site format. 13 | 14 | The input can be given as either an XML file or a DB file with passage ID 15 | and user name, and the output is either the standard format XML or 16 | a pickled object. 17 | Possible input methods are using a DB file with pid and user, which gets the 18 | annotation of the specified user for the specified passage from teh DB file, 19 | or using filename of a site-formatted XML file. 
20 | 21 | """ 22 | 23 | 24 | def site2passage(filename): 25 | """Opens a file and returns its parsed Passage object""" 26 | with open(filename, encoding="utf-8") as f: 27 | etree = ElementTree().parse(f) 28 | return ucca.convert.from_site(etree) 29 | 30 | 31 | def db2passage(handle, pid, user): 32 | """Gets the annotation of user to pid from the DB handle - returns a passage""" 33 | handle.execute("SELECT id FROM users WHERE username=?", (user,)) 34 | uid = handle.fetchone()[0] 35 | handle.execute("SELECT xml FROM xmls WHERE paid=? AND uid=? " + 36 | "ORDER BY ts DESC", (pid, uid)) 37 | raw_xml = handle.fetchone()[0] 38 | return ucca.convert.from_site(fromstring(raw_xml)) 39 | 40 | 41 | def main(): 42 | argparser = argparse.ArgumentParser(description=desc) 43 | argparser.add_argument("filename", nargs="?", help="XML file name to convert") 44 | argparser.add_argument("-o", "--outfile", help="output file for standard XML") 45 | argparser.add_argument("-b", "--binary", help="output file for binary pickel") 46 | argparser.add_argument("-d", "--db", help="DB file to get input from") 47 | argparser.add_argument("-p", "--pid", type=int, help="PassageID to query DB") 48 | argparser.add_argument("-u", "--user", help="Username to DB query") 49 | args = argparser.parse_args() 50 | 51 | # Checking for illegal combinations 52 | if args.db and args.filename: 53 | argparser.error("Only one source, XML or DB file, can be used") 54 | if (not args.db) and (not args.filename): 55 | argparser.error("Must specify one source, XML or DB file") 56 | if args.db and not (args.pid and args.user): 57 | argparser.error("Must specify a username and a passage ID when " + 58 | "using DB file option") 59 | if (args.pid or args.user) and not args.db: 60 | argparser.error("Cannot use user and passage ID options without DB file") 61 | 62 | if args.filename: 63 | passage = site2passage(args.filename) 64 | else: 65 | conn = sqlite3.connect(args.db) 66 | c = conn.cursor() 67 | passage = db2passage(c, args.pid, args.user) 68 | 69 | if args.binary: 70 | with open(args.binary, "wb") as binf: 71 | pickle.dump(passage, binf) 72 | else: 73 | root = ucca.convert.to_standard(passage) 74 | output = indent_xml(tostring(root).decode()) 75 | if args.outfile: 76 | with open(args.outfile, "w", encoding="utf-8") as outf: 77 | outf.write(output) 78 | else: 79 | print(output) 80 | 81 | sys.exit(0) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/split_corpus.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from posix import mkdir 3 | 4 | import os 5 | import re 6 | from shutil import copyfile 7 | 8 | desc = """Split a directory of files into "train", "dev" and "test" directories. 9 | All files not in either "train" or "dev" will go into "test". 
10 | """ 11 | TRAIN_DEFAULT = 300 12 | DEV_DEFAULT = 34 13 | # TEST on all the rest 14 | 15 | 16 | def copy(src, dest, link=False): 17 | if link: 18 | try: 19 | os.symlink(src, dest) 20 | except (NotImplementedError, OSError): 21 | copyfile(src, dest) 22 | else: 23 | copyfile(src, dest) 24 | 25 | 26 | def numeric(s): 27 | try: 28 | return int(re.findall("([0-9]+)", s)[-1]) 29 | except (ValueError, IndexError): 30 | return s 31 | 32 | 33 | def not_split_dir(filename): 34 | return filename not in ("train", "dev", "test") 35 | 36 | 37 | def split_passages(directory, train, dev, link, quiet=False): 38 | filenames = sorted(filter(not_split_dir, os.listdir(directory)), key=numeric) 39 | assert filenames, "No files to split" 40 | assert train + dev <= len(filenames), "Not enough files to split: %d+%d>%d" % (train, dev, len(filenames)) 41 | directory = os.path.abspath(directory) 42 | if not directory.endswith(os.sep): 43 | directory += os.sep 44 | for subdirectory in "train", "dev", "test": 45 | if not os.path.exists(directory + subdirectory): 46 | mkdir(directory + subdirectory) 47 | print("%d files to split: %d/%d/%d" % (len(filenames), train, dev, len(filenames) - train - dev)) 48 | print_format = "Creating link in %s to: " if link else "Copying to %s: " 49 | if not quiet: 50 | print(print_format % "train", end="", flush=True) 51 | for f in filenames[:train]: 52 | copy(directory + f, directory + "train" + os.sep + f, link) 53 | if not quiet: 54 | print(f, end=" ", flush=True) 55 | if not quiet: 56 | print() 57 | print(print_format % "dev", end="", flush=True) 58 | for f in filenames[train:train + dev]: 59 | copy(directory + f, directory + "dev" + os.sep + f, link) 60 | if not quiet: 61 | print(f, end=" ", flush=True) 62 | if not quiet: 63 | print() 64 | print(print_format % "test", end="", flush=True) 65 | for f in filenames[train + dev:]: 66 | copy(directory + f, directory + "test" + os.sep + f, link) 67 | if not quiet: 68 | print(f, end=" ", flush=True) 69 | if not quiet: 70 | print() 71 | 72 | if __name__ == "__main__": 73 | argparser = argparse.ArgumentParser(description=desc) 74 | argparser.add_argument("directory", default=".", nargs="?", help="directory to split (default: current directory)") 75 | argparser.add_argument("-t", "--train", type=int, default=TRAIN_DEFAULT, 76 | help="size of train split (default: %d)" % TRAIN_DEFAULT) 77 | argparser.add_argument("-d", "--dev", type=int, default=DEV_DEFAULT, 78 | help="size of dev split (default: %d)" % DEV_DEFAULT) 79 | argparser.add_argument("-l", "--link", action="store_true", help="create symbolic link instead of copying") 80 | argparser.add_argument("-q", "--quiet", action="store_true", help="less output") 81 | args = argparser.parse_args() 82 | 83 | split_passages(args.directory, args.train, args.dev, link=args.link, quiet=args.quiet) 84 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/standard_to_pickle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import sys 5 | 6 | from ucca.ioutil import file2passage, passage2file 7 | 8 | desc = """Parses an XML in UCCA standard format, and writes them in binary Pickle format. 
9 | """ 10 | 11 | 12 | def main(): 13 | argparser = argparse.ArgumentParser(description=desc) 14 | argparser.add_argument('filenames', nargs='+', help="XML file names to convert") 15 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 16 | args = argparser.parse_args() 17 | 18 | for filename in args.filenames: 19 | sys.stderr.write("Reading passage '%s'...\n" % filename) 20 | passage = file2passage(filename) 21 | basename = os.path.splitext(os.path.basename(filename))[0] 22 | outfile = args.outdir + os.path.sep + basename + ".pickle" 23 | sys.stderr.write("Writing file '%s'...\n" % outfile) 24 | passage2file(passage, outfile, binary=True) 25 | 26 | sys.exit(0) 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/standard_to_sentences.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | 6 | import ucca.convert 7 | from ucca.ioutil import file2passage, passage2file 8 | 9 | desc = """Parses an XML in UCCA standard format, and writes a passage per sentence. 10 | """ 11 | 12 | 13 | def main(): 14 | argparser = argparse.ArgumentParser(description=desc) 15 | argparser.add_argument('filenames', nargs='+', help="passage file names to convert") 16 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 17 | argparser.add_argument('-p', '--prefix', default='', help="output filename prefix") 18 | argparser.add_argument('-r', '--remarks', action='store_true', help="annotate original IDs") 19 | argparser.add_argument("-b", "--binary", action="store_true", 20 | help="write in pickle binary format (.pickle)") 21 | args = argparser.parse_args() 22 | 23 | for filename in args.filenames: 24 | passage = file2passage(filename) 25 | sentences = ucca.convert.split2sentences(passage, remarks=args.remarks) 26 | for i, sentence in enumerate(sentences): 27 | outfile = "%s/%s.%s" % (args.outdir, args.prefix + sentence.ID, 28 | "pickle" if args.binary else "xml") 29 | sys.stderr.write("Writing passage file for sentence '%s'...\n" % outfile) 30 | passage2file(sentence, outfile, args.binary) 31 | 32 | sys.exit(0) 33 | 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | 7 | import numpy as np 8 | 9 | from ucca import layer0, layer1 10 | from ucca.ioutil import file2passage 11 | from ucca.layer1 import NodeTags 12 | from ucca.textutil import break2sentences 13 | 14 | desc = """Prints statistics on UCCA passages 15 | """ 16 | 17 | 18 | def main(): 19 | argparser = argparse.ArgumentParser(description=desc) 20 | argparser.add_argument("filenames", nargs="+", help="files to process") 21 | argparser.add_argument("-o", "--outfile", help="output file for data") 22 | args = argparser.parse_args() 23 | 24 | print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont," 25 | "edges,primary,remote,linkage,parents,children,mult-parents") 26 | data = [] 27 | for pattern in args.filenames: 28 | for filename in glob.glob(pattern): 29 | passage = file2passage(filename) 30 | terminals = passage.layer(layer0.LAYER_ID).all 31 | non_terminals = [n for n 
in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"] 32 | non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage] 33 | linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages 34 | edges = {e for n in non_terminals for e in n} 35 | remote = [e for e in edges if e.attrib.get("remote")] 36 | linkage_edges = [e for n in linkage_nodes for e in n] 37 | fields = (int(passage.ID), 38 | 1, 39 | len({t.paragraph for t in terminals}), 40 | len(break2sentences(passage)), 41 | len(terminals) + len(non_terminals), 42 | len(terminals), 43 | len(non_terminals), 44 | len([n for n in non_linkage if n.attrib.get("implicit")]), 45 | len(linkage_nodes), 46 | len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]), 47 | len(edges), 48 | len(edges) - len(remote) - len(linkage_edges), 49 | len(remote), 50 | len(linkage_edges), 51 | sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage), 52 | sum(len(n.children) for n in non_linkage), 53 | len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]), 54 | ) 55 | print(",".join("%d" % f for f in fields)) 56 | data.append(fields) 57 | data = np.array(data, dtype=int) 58 | if args.outfile: 59 | np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t") 60 | 61 | sys.exit(0) 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/unique_roles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from collections import Counter 7 | 8 | from ucca import layer1 9 | from ucca.ioutil import file2passage 10 | 11 | desc = """Finds edge tags that are empirically always unique: occur at most once in edges per node 12 | """ 13 | 14 | 15 | def main(): 16 | argparser = argparse.ArgumentParser(description=desc) 17 | argparser.add_argument('-d', '--directory', required=True, help="directory with passage files to process") 18 | argparser.add_argument('-o', '--outfile', default="data/unique_roles.txt", help="output file for data") 19 | argparser.add_argument('-D', '--direction', default="out", help="direction of edges to check (out|in)") 20 | args = argparser.parse_args() 21 | 22 | out = args.direction == "out" 23 | if not os.path.isdir(args.directory): 24 | raise Exception("Not a directory: " + args.directory) 25 | roles = set(tag for name, tag in layer1.EdgeTags.__dict__.items() 26 | if isinstance(tag, str) and not name.startswith('__')) 27 | for filename in os.listdir(args.directory): 28 | sys.stderr.write("Reading passage '%s'...\n" % filename) 29 | passage = file2passage(args.directory + os.path.sep + filename) 30 | for node in passage.layer(layer1.LAYER_ID).all: 31 | counts = Counter(edge.tag for edge in (node if out else node.incoming)) 32 | roles.difference_update(tag for tag, count in counts.items() if count > 1) 33 | 34 | lines = "\n".join(sorted(roles)) 35 | print(lines) 36 | if args.outfile: 37 | with open(args.outfile, "w", encoding="utf-8") as f: 38 | print(lines, file=f) 39 | 40 | sys.exit(0) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/visualize.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | import matplotlib.pyplot as plt 4 | 5 | from 
ucca import visualization 6 | from ucca.ioutil import read_files_and_dirs 7 | 8 | if __name__ == "__main__": 9 | argparser = ArgumentParser(description="Visualize the given passages as graphs.") 10 | argparser.add_argument("passages", nargs="+", help="UCCA passages, given as xml/pickle file names") 11 | args = argparser.parse_args() 12 | for passage in read_files_and_dirs(args.passages): 13 | visualization.draw(passage) 14 | plt.show() 15 | -------------------------------------------------------------------------------- /14-semparsing/ucca/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /14-semparsing/ucca/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | 5 | try: 6 | import pypandoc 7 | try: 8 | pypandoc.convert_file("README.md", "rst", outputfile="README.rst") 9 | except (IOError, ImportError, RuntimeError): 10 | pass 11 | long_description = pypandoc.convert_file("README.md", "rst") 12 | except (IOError, ImportError, RuntimeError): 13 | long_description = "" 14 | 15 | 16 | setup(name="UCCA", 17 | version="1.0.11", 18 | install_requires=["spacy", "requests"], 19 | extras_require={"visualize": ["matplotlib", "networkx"]}, 20 | description="Universal Conceptual Cognitive Annotation", 21 | long_description=long_description, 22 | author="Daniel Hershcovich", 23 | author_email="danielh@cs.huji.ac.il", 24 | url="https://github.com/huji-nlp/ucca", 25 | classifiers=[ 26 | "Development Status :: 4 - Beta", 27 | "Intended Audience :: Science/Research", 28 | "Programming Language :: Python :: 3.6", 29 | "Topic :: Text Processing :: Linguistic", 30 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", 31 | ], 32 | packages=find_packages(), 33 | ) 34 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/site1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 1 9 | 10 | 11 | 2 12 | 13 | 14 | 3 15 | 16 | 17 | 4 18 | 19 | 20 | . 21 | 22 | 23 | 24 | 25 | 6 26 | 27 | 28 | 7 29 | 30 | 31 | 8 32 | 33 | 34 | 9 35 | 36 | 37 | 10 38 | 39 | 40 | . 41 | 42 | 43 | 44 | 45 | 12 46 | 47 | 48 | 13 49 | 50 | 51 | 14 52 | 53 | 54 | 15 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/site2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 1 10 | 11 | 12 | 13 | 14 | 2 15 | 16 | 17 | 18 | 19 | 20 | 3 21 | 22 | 23 | 4 24 | 25 | 26 | . 27 | 28 | 29 | 30 | 31 | 32 | 6 33 | 34 | 35 | 7 36 | 37 | 38 | 8 39 | 40 | 41 | 9 42 | 43 | 44 | 10 45 | 46 | 47 | . 48 | 49 | 50 | 51 | 52 | 12 53 | 54 | 55 | 13 56 | 57 | 58 | 14 59 | 60 | 61 | 15 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/site3.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 1 13 | 14 | 15 | 16 | 17 | 2 18 | 19 | 20 | 21 | 22 | 23 | 3 24 | 25 | 26 | 4 27 | 28 | 29 | . 
30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 6 40 | 41 | 42 | 43 | 44 | 7 45 | 46 | 47 | 48 | 49 | 50 | 8 51 | 52 | 53 | 54 | 55 | 56 | 9 57 | 58 | 59 | 60 | 61 | 62 | 63 | 10 64 | 65 | 66 | 67 | . 68 | 69 | 70 | 71 | 72 | 73 | 12 74 | 75 | 76 | 77 | 78 | 13 79 | 80 | 81 | 82 | 83 | 14 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 15 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/standard3.conll: -------------------------------------------------------------------------------- 1 | # sent_id = 120 2 | 1 1 _ Word Word _ 3 L _ _ 3 | 2 2 _ Word Word _ 1 E _ _ 4 | 3 3 _ Word Word _ 0 ROOT _ _ 5 | 4 4 _ Word Word _ 3 Terminal _ _ 6 | 5 . _ Punctuation Punctuation _ 3 U _ _ 7 | 6 6 _ Word Word _ 7 E _ _ 8 | 7 7 _ Word Word _ 8 A _ _ 9 | 8 8 _ Word Word _ 3 H _ _ 10 | 9 9 _ Word Word _ 7 Terminal _ _ 11 | 10 10 _ Word Word _ 3 F _ _ 12 | 11 . _ Punctuation Punctuation _ 3 U _ _ 13 | 12 12 _ Word Word _ 3 H _ _ 14 | 13 13 _ Word Word _ 3 H _ _ 15 | 14 14 _ Word Word _ 3 H _ _ 16 | 15 15 _ Word Word _ 3 L _ _ 17 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/standard3.export: -------------------------------------------------------------------------------- 1 | #BOS 120 2 | 1 Word -- Terminal 500 3 | 2 Word -- Terminal 501 4 | 3 Word -- Terminal 513 5 | 4 Word -- Terminal 513 6 | . Punctuation -- Terminal 502 7 | 6 Word -- Terminal 503 8 | 7 Word -- Terminal 504 9 | 8 Word -- Terminal 505 10 | 9 Word -- Terminal 504 11 | 10 Word -- Terminal 506 12 | . Punctuation -- Terminal 507 13 | 12 Word -- Terminal 508 14 | 13 Word -- Terminal 509 15 | 14 Word -- Terminal 510 16 | 15 Word -- Terminal 511 17 | #500 FN -- C 512 18 | #501 FN -- E 512 19 | #502 PNCT -- U 513 20 | #503 FN -- E 514 21 | #504 FN -- C 514 22 | #505 FN -- P 515 23 | #506 FN -- F 518 D* 515 24 | #507 PNCT -- U 518 25 | #508 FN -- H 518 LA 519 26 | #509 FN -- H 518 LA 519 27 | #510 FN -- H 518 LA 519 28 | #511 FN -- E 517 29 | #512 FN -- L 518 30 | #513 FN -- H 518 31 | #514 FN -- A 515 32 | #515 FN -- H 518 33 | #516 FN -- C 517 34 | #517 FN -- L 518 LR 519 35 | #518 FN -- -- 0 36 | #519 LKG -- -- 0 37 | #EOS 120 -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/standard3.sdp: -------------------------------------------------------------------------------- 1 | 1 1 _ Word - + _ _ L _ _ 2 | 2 2 _ Word - - _ E _ _ _ 3 | 3 3 _ Word - + _ _ _ _ _ 4 | 4 4 _ Word - - _ _ Terminal _ _ 5 | 5 . _ Punctuation - - _ _ U _ _ 6 | 6 6 _ Word - - _ _ _ E _ 7 | 7 7 _ Word - + _ _ _ _ A 8 | 8 8 _ Word - + _ _ H _ _ 9 | 9 9 _ Word - - _ _ _ Terminal _ 10 | 10 10 _ Word - - _ _ F _ D* 11 | 11 . _ Punctuation - - _ _ U _ _ 12 | 12 12 _ Word - - _ _ H _ _ 13 | 13 13 _ Word - - _ _ H _ _ 14 | 14 14 _ Word - - _ _ H _ _ 15 | 15 15 _ Word - - _ _ L _ _ 16 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/standard3.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/README.md: -------------------------------------------------------------------------------- 1 | `ucca` package 2 | ==================== 3 | 4 | List of Modules 5 | --------------- 6 | 1. 
`constructions` -- provides methods for extracting linguistic constructions from text 7 | 1. `convert` -- provides functions to convert between the UCCA objects (Pythonic) 8 | and site annotation XML, standard XML representation and text 9 | 1. `core` -- provides the basic objects of UCCA relations: `Node`, `Edge`, `Layer` 10 | and `Passage`, which are the basic items to work with 11 | 1. `evaluation` -- provides methods for comparing passages and inspecting the differences 12 | 1. `layer0` -- provides the text layer (layer 0) objects: `Layer0` and `Terminal` 13 | 1. `layer1` -- provides the foundational layer objects: `Layer1`, `FoundationalNode`, 14 | `PunctNode` and `Linkage` 15 | 1. `textutil` -- provides text processing utilities 16 | 17 | In addition, a `tests` package is present, enabling unit-testing. 18 | 19 | Authors 20 | ------ 21 | * Amit Beka: amit.beka@gmail.com 22 | * Daniel Hershcovich: danielh@cs.huji.ac.il -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/ucca/__init__.py -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/diffutil.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from ucca.ioutil import passage2file 4 | 5 | 6 | def diff_passages(true_passage, pred_passage): 7 | """ 8 | Debug method to print missing or mistaken attributes, nodes and edges 9 | """ 10 | lines = list() 11 | if not true_passage._attrib.equals(pred_passage._attrib): 12 | lines.append("Passage attributes mismatch: %s, %s" % 13 | (true_passage._attrib, pred_passage._attrib)) 14 | try: 15 | for lid, l1 in true_passage._layers.items(): 16 | l2 = pred_passage.layer(lid) # compare against the corresponding layer of the predicted passage 17 | if not l1._attrib.equals(l2._attrib): 18 | lines.append("Layer %d attributes mismatch: %s, %s" % 19 | (lid, l1._attrib, l2._attrib)) 20 | except KeyError: # no layer with same ID found 21 | lines.append("Missing layer: %s, %s" % 22 | (true_passage._layers, pred_passage._layers)) 23 | pred_ids = {node.extra["remarks"]: node 24 | for node in pred_passage.missing_nodes(true_passage)} 25 | true_ids = {node.ID: node 26 | for node in true_passage.missing_nodes(pred_passage)} 27 | for pred_id, pred_node in list(pred_ids.items()): 28 | true_node = true_ids.get(pred_id) 29 | if true_node: 30 | pred_ids.pop(pred_id) 31 | true_ids.pop(pred_id) 32 | pred_edges = {edge.tag + "->" + edge.child.ID: edge for edge in 33 | pred_node.missing_edges(true_node)} 34 | true_edges = {edge.tag + "->" + edge.child.ID: edge for edge in 35 | true_node.missing_edges(pred_node)} 36 | intersection = set(pred_edges).intersection(set(true_edges)) 37 | pred_edges = {s: edge for s, edge in pred_edges.items() if s not in intersection} 38 | true_edges = {s: edge for s, edge in true_edges.items() if s not in intersection} 39 | 40 | node_lines = [] 41 | if not pred_node._attrib.equals(true_node._attrib): 42 | node_lines.append(" Attributes mismatch: %s, %s" % 43 | (sorted(true_node._attrib.items()), sorted(pred_node._attrib.items()))) 44 | if pred_edges: 45 | node_lines.append(" Mistake edges: %s" % ", ".join(pred_edges)) 46 | if true_edges: 47 | node_lines.append(" Missing edges: %s" % ", ".join(true_edges)) 48 | if node_lines: 49 | lines.append("For node " + pred_id + ":") 50 |
lines.extend(node_lines) 51 | if pred_ids: 52 | lines.append("Mistake nodes: %s" % ", ".join(pred_ids)) 53 | if true_ids: 54 | lines.append("Missing nodes: %s" % ", ".join(true_ids)) 55 | if lines: 56 | outfile = "%s.xml" % true_passage.ID 57 | sys.stderr.write("Writing passage '%s'...\n" % outfile) 58 | passage2file(true_passage, outfile) 59 | outfile = "%s_pred.xml" % pred_passage.ID 60 | sys.stderr.write("Writing passage '%s'...\n" % outfile) 61 | passage2file(pred_passage, outfile) 62 | return "\n" + "\n".join(lines) 63 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/ioutil.py: -------------------------------------------------------------------------------- 1 | """Input/output utility functions for UCCA scripts.""" 2 | import os 3 | import sys 4 | import time 5 | from collections import defaultdict 6 | from xml.etree.ElementTree import ParseError 7 | 8 | from ucca.convert import file2passage, passage2file, from_text, to_text, split2segments 9 | from ucca.core import Passage 10 | 11 | 12 | class LazyLoadedPassages(object): 13 | """ 14 | Iterable interface to Passage objects that loads files on-the-go and can be iterated more than once 15 | """ 16 | def __init__(self, files, sentences=False, paragraphs=False, converters=None): 17 | self.files = files 18 | self.sentences = sentences 19 | self.paragraphs = paragraphs 20 | self.split = self.sentences or self.paragraphs 21 | self.converters = defaultdict(lambda: from_text) if converters is None else converters 22 | self._files_iter = None 23 | self._split_iter = None 24 | self._file_handle = None 25 | self._next_index = None 26 | 27 | def __iter__(self): 28 | self._next_index = 0 29 | self._files_iter = iter(self.files) 30 | self._split_iter = None 31 | self._file_handle = None 32 | return self 33 | 34 | def __next__(self): 35 | passage = self._next_passage() 36 | self._next_index += 1 37 | return passage 38 | 39 | def _next_passage(self): 40 | passage = None 41 | if self._split_iter is None: 42 | try: 43 | file = next(self._files_iter) 44 | except StopIteration: # Finished iteration 45 | raise 46 | if isinstance(file, Passage): # Not really a file, but a Passage 47 | passage = file 48 | else: # A file 49 | attempts = 3 50 | while not os.path.exists(file): 51 | if attempts == 0: 52 | print("File not found: %s" % file, file=sys.stderr) 53 | return next(self) 54 | print("Failed reading %s, trying %d more times..." 
% (file, attempts), file=sys.stderr) 55 | time.sleep(5) 56 | attempts -= 1 57 | try: 58 | passage = file2passage(file) # XML or binary format 59 | except (IOError, ParseError): # Failed to read as passage file 60 | base, ext = os.path.splitext(os.path.basename(file)) 61 | converter = self.converters[ext.lstrip(".")] 62 | self._file_handle = open(file, encoding="utf-8") 63 | self._split_iter = iter(converter(self._file_handle, passage_id=base)) 64 | if self.split: 65 | if self._split_iter is None: 66 | self._split_iter = (passage,) 67 | self._split_iter = iter(s for p in self._split_iter for s in 68 | split2segments(p, is_sentences=self.sentences)) 69 | if self._split_iter is not None: # Either set before or initialized now 70 | try: 71 | # noinspection PyTypeChecker 72 | passage = next(self._split_iter) 73 | except StopIteration: # Finished this converter 74 | self._split_iter = None 75 | if self._file_handle is not None: 76 | self._file_handle.close() 77 | self._file_handle = None 78 | return next(self) 79 | return passage 80 | 81 | # The following three methods are implemented to support shuffle; 82 | # note files are shuffled but there is no shuffling within files, as it would not be efficient. 83 | # Note also the inconsistency because these access the files while __iter__ accesses individual passages. 84 | def __len__(self): 85 | return len(self.files) 86 | 87 | def __getitem__(self, i): 88 | return self.files[i] 89 | 90 | def __setitem__(self, i, value): 91 | self.files[i] = value 92 | 93 | def __bool__(self): 94 | return bool(self.files) 95 | 96 | 97 | def read_files_and_dirs(files_and_dirs, sentences=False, paragraphs=False, converters=None): 98 | """ 99 | :param files_and_dirs: iterable of files and/or directories to look in 100 | :param sentences: whether to split to sentences 101 | :param paragraphs: whether to split to paragraphs 102 | :param converters: dict of input format converters to use based on the file extension 103 | :return: list of (lazy-loaded) passages from all files given, 104 | plus any files directly under any directory given 105 | """ 106 | files = list(files_and_dirs) 107 | files += [os.path.join(d, f) for d in files if os.path.isdir(d) for f in os.listdir(d)] 108 | files = [f for f in files if not os.path.isdir(f)] 109 | return LazyLoadedPassages(files, sentences, paragraphs, converters) 110 | 111 | 112 | def write_passage(passage, output_format, binary, outdir, prefix, converter=None): 113 | suffix = output_format if output_format and output_format != "ucca" else ("pickle" if binary else "xml") 114 | outfile = outdir + os.path.sep + prefix + passage.ID + "." + suffix 115 | print("Writing passage '%s'..." 
% outfile) 116 | if output_format is None or output_format in ("ucca", "pickle", "xml"): 117 | passage2file(passage, outfile, binary=binary) 118 | else: 119 | output = "\n".join(line for line in (converter or to_text)(passage)) 120 | with open(outfile, "w", encoding="utf-8") as f: 121 | f.write(output + "\n") 122 | return outfile 123 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/ucca/tests/__init__.py -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/visualization.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import warnings 3 | from collections import defaultdict 4 | 5 | import matplotlib.cbook 6 | import networkx as nx 7 | 8 | from ucca import layer0, layer1 9 | from ucca.layer1 import Linkage 10 | 11 | warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation) 12 | warnings.filterwarnings("ignore", category=UserWarning) 13 | 14 | 15 | def draw(passage): 16 | G = nx.DiGraph() 17 | terminals = sorted(passage.layer(layer0.LAYER_ID).all, key=operator.attrgetter("position")) 18 | G.add_nodes_from([(n.ID, {"label": n.text, "node_color": "white"}) for n in terminals]) 19 | G.add_nodes_from([(n.ID, {"label": "IMPLICIT" if n.attrib.get("implicit") else "", 20 | "node_color": "gray" if isinstance(n, Linkage) else ( 21 | "white" if n.attrib.get("implicit") else "black")}) 22 | for n in passage.layer(layer1.LAYER_ID).all]) 23 | G.add_edges_from([(n.ID, e.child.ID, {"label": e.tag, "style": "dashed" if e.attrib.get("remote") else "solid"}) 24 | for layer in passage.layers for n in layer.all for e in n]) 25 | pos = topological_layout(passage) 26 | nx.draw(G, pos, arrows=False, font_size=10, 27 | node_color=[d["node_color"] for _, d in G.nodes(data=True)], 28 | labels={n: d["label"] for n, d in G.nodes(data=True) if d["label"]}, 29 | style=[d["style"] for _, _, d in G.edges(data=True)]) 30 | nx.draw_networkx_edge_labels(G, pos, font_size=8, 31 | edge_labels={(u, v): d["label"] for u, v, d in G.edges(data=True)}) 32 | 33 | 34 | def topological_layout(passage): 35 | visited = defaultdict(set) 36 | pos = {} 37 | implicit_offset = 1 + max((n.position for n in passage.layer(layer0.LAYER_ID).all), default=-1) 38 | remaining = [n for layer in passage.layers for n in layer.all if not n.parents] 39 | while remaining: 40 | node = remaining.pop() 41 | if node.ID in pos: # done already 42 | continue 43 | if node.children: 44 | children = [c for c in node.children if c.ID not in pos and c not in visited[node.ID]] 45 | if children: 46 | visited[node.ID].update(children) # to avoid cycles 47 | remaining += [node] + children 48 | continue 49 | xs, ys = zip(*(pos[c.ID] for c in node.children)) 50 | pos[node.ID] = (sum(xs) / len(xs), 1 + max(ys)) # done with children 51 | elif node.layer.ID == layer0.LAYER_ID: # terminal 52 | pos[node.ID] = (int(node.position), 0) 53 | else: # implicit 54 | pos[node.ID] = (implicit_offset, 0) 55 | implicit_offset += 1 56 | return pos 57 | -------------------------------------------------------------------------------- /14-semparsing/ucca/uccaapp/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/uccaapp/__init__.py -------------------------------------------------------------------------------- /14-semparsing/ucca/uccaapp/convert_and_evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from glob import glob 4 | 5 | from requests.exceptions import HTTPError 6 | 7 | from ucca.evaluation import evaluate, Scores 8 | from ucca.ioutil import read_files_and_dirs 9 | from uccaapp.download_task import TaskDownloader 10 | from uccaapp.upload_task import TaskUploader 11 | 12 | try: 13 | from simplejson.scanner import JSONDecodeError 14 | except ImportError: 15 | from json.decoder import JSONDecodeError 16 | 17 | desc = """Convert a passage file to JSON format and upload to UCCA-App as a completed task, 18 | then download task from UCCA-App and convert to a passage in standard format again, 19 | then evaluate the result against the original""" 20 | 21 | 22 | def main(filenames, write, **kwargs): 23 | uploader = TaskUploader(**kwargs) 24 | downloader = TaskDownloader(**kwargs) 25 | scores = [] 26 | try: 27 | for pattern in filenames: 28 | filenames = glob(pattern) 29 | if not filenames: 30 | raise IOError("Not found: " + pattern) 31 | for ref in read_files_and_dirs(filenames): 32 | print("Converting passage " + ref.ID + "... ", end="") 33 | task = uploader.upload_task(ref) 34 | guessed = downloader.download_task(task["id"], write=write, **kwargs) 35 | score = evaluate(guessed, ref, **kwargs) 36 | print("F1=%.3f" % score.average_f1()) 37 | scores.append(score) 38 | except HTTPError as e: 39 | try: 40 | raise ValueError(e.response.json()) from e 41 | except JSONDecodeError: 42 | raise ValueError(e.response.text) from e 43 | print() 44 | if len(scores) > 1: 45 | print("Aggregated scores:") 46 | Scores.aggregate(scores).print() 47 | 48 | 49 | if __name__ == "__main__": 50 | argument_parser = argparse.ArgumentParser(description=desc) 51 | TaskUploader.add_arguments(argument_parser) 52 | argument_parser.add_argument("--write", action="store_true", help="Write converted passage to file") 53 | TaskDownloader.add_write_arguments(argument_parser) 54 | main(**vars(argument_parser.parse_args())) 55 | sys.exit(0) 56 | -------------------------------------------------------------------------------- /14-semparsing/ucca/uccaapp/download_task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import sys 4 | 5 | from ucca.convert import from_json, CONVERTERS, TO_FORMAT 6 | from ucca.ioutil import write_passage 7 | from uccaapp.api import ServerAccessor 8 | 9 | desc = """Download task from UCCA-App and convert to a passage in standard format""" 10 | 11 | 12 | class TaskDownloader(ServerAccessor): 13 | def download_tasks(self, task_ids, **kwargs): 14 | for task_id in task_ids: 15 | yield self.download_task(task_id, **kwargs) 16 | 17 | def download_task(self, task_id, write=True, out_format=None, binary=None, out_dir=None, prefix=None, **kwargs): 18 | del kwargs 19 | passage = from_json(self.get_user_task(task_id), all_categories=self.layer["categories"]) 20 | if write: 21 | write_passage(passage, out_format, binary, out_dir, prefix, TO_FORMAT.get(out_format)) 22 | return passage 23 | 24 | @staticmethod 25 | def add_arguments(argparser): 26 | argparser.add_argument("task_ids", nargs="+", type=int, help="IDs of tasks to download 
and convert") 27 | TaskDownloader.add_write_arguments(argparser) 28 | ServerAccessor.add_arguments(argparser) 29 | 30 | @staticmethod 31 | def add_write_arguments(argparser): 32 | argparser.add_argument("-f", "--out-format", choices=CONVERTERS, help="output file format (default: UCCA)") 33 | argparser.add_argument("-o", "--out-dir", default=".", help="output directory") 34 | argparser.add_argument("-p", "--prefix", default="", help="output filename prefix") 35 | argparser.add_argument("-b", "--binary", action="store_true", help="write in binary format (.pickle)") 36 | 37 | 38 | def main(**kwargs): 39 | list(TaskDownloader(**kwargs).download_tasks(**kwargs)) 40 | 41 | 42 | if __name__ == "__main__": 43 | argument_parser = argparse.ArgumentParser(description=desc) 44 | TaskDownloader.add_arguments(argument_parser) 45 | main(**vars(argument_parser.parse_args())) 46 | sys.exit(0) 47 | -------------------------------------------------------------------------------- /14-semparsing/ucca/uccaapp/upload_task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import sys 5 | from glob import glob 6 | 7 | from requests.exceptions import HTTPError 8 | 9 | from ucca.convert import to_json, to_text 10 | from ucca.ioutil import read_files_and_dirs 11 | from uccaapp.api import ServerAccessor 12 | 13 | try: 14 | from simplejson.scanner import JSONDecodeError 15 | except ImportError: 16 | from json.decoder import JSONDecodeError 17 | 18 | desc = """Convert a passage file to JSON format and upload to UCCA-App as a completed task""" 19 | 20 | # https://github.com/omriabnd/UCCA-App/blob/master/UCCAApp_REST_API_Reference.pdf 21 | # ucca-demo.cs.huji.ac.il or ucca.staging.cs.huji.ac.il 22 | # upload the parse as a (completed) task: 23 | # 0. decide which project and user you want to assign it to 24 | # 1. POST passage (easy format) 25 | # 2. POST task x (of type tokenization) 26 | # 3. PUT task x (submit) 27 | # 4. POST task y (of type annotation with parent x; this is the more complicated format) 28 | # 5. 
PUT task y (submit) 29 | 30 | USER_ID_ENV_VAR = "UCCA_APP_USER_ID" 31 | 32 | 33 | class TaskUploader(ServerAccessor): 34 | def __init__(self, user_id, **kwargs): 35 | super().__init__(**kwargs) 36 | self.user = dict(id=user_id or int(os.environ[USER_ID_ENV_VAR])) 37 | 38 | def upload_tasks(self, filenames, **kwargs): 39 | del kwargs 40 | try: 41 | for pattern in filenames: 42 | filenames = glob(pattern) 43 | if not filenames: 44 | raise IOError("Not found: " + pattern) 45 | for passage in read_files_and_dirs(filenames): 46 | task = self.upload_task(passage) 47 | print("Submitted task %d" % task["id"]) 48 | yield task 49 | except HTTPError as e: 50 | try: 51 | raise ValueError(e.response.json()) from e 52 | except JSONDecodeError: 53 | raise ValueError(e.response.text) from e 54 | 55 | def upload_task(self, passage): 56 | passage_out = self.create_passage(text=to_text(passage, sentences=False)[0], type="PUBLIC", source=self.source) 57 | task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project, user=self.user, 58 | passage=passage_out, manager_comment=passage.ID, user_comment=passage.ID, parent=None, 59 | is_demo=False, is_active=True) 60 | tok_task_out = self.create_tokenization_task(**task_in) 61 | tok_user_task_in = dict(tok_task_out) 62 | tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True)) 63 | tok_user_task_out = self.submit_tokenization_task(**tok_user_task_in) 64 | task_in.update(parent=tok_task_out, type="ANNOTATION") 65 | ann_user_task_in = self.create_annotation_task(**task_in) 66 | ann_user_task_in.update( 67 | to_json(passage, return_dict=True, tok_task=tok_user_task_out, all_categories=self.layer["categories"])) 68 | return self.submit_annotation_task(**ann_user_task_in) 69 | 70 | @staticmethod 71 | def add_arguments(argparser): 72 | argparser.add_argument("filenames", nargs="+", help="passage file names to convert and upload") 73 | argparser.add_argument("--user-id", type=int, help="user id, otherwise set by " + USER_ID_ENV_VAR) 74 | ServerAccessor.add_arguments(argparser) 75 | 76 | 77 | def main(**kwargs): 78 | list(TaskUploader(**kwargs).upload_tasks(**kwargs)) 79 | 80 | 81 | if __name__ == "__main__": 82 | argument_parser = argparse.ArgumentParser(description=desc) 83 | TaskUploader.add_arguments(argument_parser) 84 | main(**vars(argument_parser.parse_args())) 85 | sys.exit(0) 86 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright 2017 Graham Neubig 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural Networks for NLP Code Examples 2 | 3 | This is a repository of code examples for the 2017 edition of CMU CS 11-747 4 | [Neural Networks for NLP](http://phontron.com/class/nn4nlp2017/). 
5 | 6 | By Graham Neubig, Daniel Clothiaux, Zhengzhong Liu, and Xuezhe Ma 7 | 8 | [PyTorch](http://pytorch.org/) code by Joost Bastings 9 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This contains five example data sets: 2 | 3 | 1) **Text Data (ptb):** Data from the Penn Treebank dataset provided by Mikolov: http://www.fit.vutbr.cz/~imikolov/rnnlm/ 4 | 2) **Tree Data (trees):** The tree data from the Stanford Sentiment Treebank: http://nlp.stanford.edu/sentiment/index.html 5 | 3) **Classification Data (classes):** The data from the Stanford Sentiment Treebank with tree info removed. 6 | 4) **Parallel Data (parallel):** Data from the Tanaka corpus, reduced to only 10,000 training examples: http://www.edrdg.org/wiki/index.php/Tanaka_Corpus 7 | 5) **Tagging Data (tags):** Data from WikiNER, reduced to only 10,000 training examples: http://schwa.org/projects/resources/wiki/Wikiner 8 | --------------------------------------------------------------------------------
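The data sets listed above are read directly by the course examples. As a quick orientation, here is a minimal loading sketch (not one of the repository files). It assumes the `ptb` files contain one tokenized sentence per line, as in Mikolov's distribution, and that the `parallel` `train.en`/`train.ja` files are line-aligned translation pairs.

# Minimal sketch: loading the example data, under the assumptions stated above.
def read_tokenized(path):
    """Yield each line as a list of whitespace-separated tokens."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            yield line.strip().split()


def read_parallel(src_path, trg_path):
    """Yield (source, target) token-list pairs from line-aligned files."""
    with open(src_path, encoding="utf-8") as src, open(trg_path, encoding="utf-8") as trg:
        for s, t in zip(src, trg):
            yield s.strip().split(), t.strip().split()


lm_train = list(read_tokenized("data/ptb/train.txt"))
mt_train = list(read_parallel("data/parallel/train.en", "data/parallel/train.ja"))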
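Similarly, the `ucca` package under `14-semparsing/ucca`, whose modules are listed in its `ucca/README.md` above, is driven through a small I/O API that the bundled scripts call repeatedly. The following sketch (again not a repository file) shows the typical round trip with that API; the input path `some_passage.xml` is a hypothetical placeholder, and drawing requires the `matplotlib`/`networkx` extras declared in its `setup.py`.

import matplotlib.pyplot as plt

from ucca import visualization
from ucca.ioutil import file2passage, passage2file

# Read a passage from XML (or binary pickle), as the bundled scripts do.
passage = file2passage("some_passage.xml")
# Re-serialize it in binary Pickle format, mirroring scripts/standard_to_pickle.py.
passage2file(passage, "some_passage.pickle", binary=True)
# Render the passage graph, mirroring scripts/visualize.py.
visualization.draw(passage)
plt.show()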