├── .gitignore
├── 01-intro
├── bow-pytorch.py
├── bow-simple-pytorch.py
├── bow.ipynb
├── bow.py
├── cbow-pytorch.py
├── cbow.ipynb
├── cbow.py
├── deep-cbow-pytorch-minibatch.py
├── deep-cbow-pytorch.py
├── deep-cbow.ipynb
└── deep-cbow.py
├── 02-lm
├── loglin-lm.py
├── nn-lm-batch.py
├── nn-lm-optim.py
└── nn-lm.py
├── 03-wordemb
├── kwic.py
├── tsne.py
├── wordemb-cbow.py
├── wordemb-skip.py
└── wordemb-vis-tsne.py
├── 04-efficiency
├── slow-impl.py
├── wordemb-skip-binary.py
└── wordemb-skip-ns.py
├── 05-cnn
├── cnn-activation.py
└── cnn-class.py
├── 06-rnn
├── lm-lstm.py
├── lm-minibatch.py
├── sentiment-lstm.py
└── sentiment-rnn.py
├── 07-sentrep
└── text-retrieval.py
├── 08-condlm
├── batched_enc_dec.py
├── bleu.py
└── enc_dec.py
├── 09-attention
├── batched_attention.py
└── plot_attention.py
├── 10-structured
├── bilstm-tagger.py
└── bilstm-variant-tagger.py
├── 12-transitionparsing
├── feed_forward.py
├── oracle.py
├── stack_lstm.py
└── tree_parser.py
├── 13-graphparsing
├── biaffine.py
├── biaffine_parser.py
└── mst.py
├── 14-semparsing
└── ucca
│ ├── .appveyor.yml
│ ├── .gitignore
│ ├── .travis.yml
│ ├── LICENSE.txt
│ ├── README.md
│ ├── actions.py
│ ├── ci
│ ├── deploy.sh
│ └── test.sh
│ ├── doc
│ ├── README
│ ├── short_defs.pdf
│ └── toy.xml
│ ├── oracle.py
│ ├── runner.py
│ ├── scripts
│ ├── __init__.py
│ ├── annotate.py
│ ├── convert_and_evaluate.py
│ ├── count_parents_children.py
│ ├── distances
│ │ └── align.py
│ ├── evaluate_db.py
│ ├── evaluate_standard.py
│ ├── find_constructions.py
│ ├── join_passages.py
│ ├── join_sdp.py
│ ├── pickle_to_standard.py
│ ├── site_to_standard.py
│ ├── split_corpus.py
│ ├── standard_to_pickle.py
│ ├── standard_to_sentences.py
│ ├── statistics.py
│ ├── ucca_db.py
│ ├── unique_roles.py
│ └── visualize.py
│ ├── setup.cfg
│ ├── setup.py
│ ├── test_files
│ ├── site1.xml
│ ├── site2.xml
│ ├── site3.xml
│ ├── standard3.conll
│ ├── standard3.conll.xml
│ ├── standard3.export
│ ├── standard3.export.xml
│ ├── standard3.sdp
│ ├── standard3.sdp.xml
│ └── standard3.xml
│ ├── ucca
│ ├── README.md
│ ├── __init__.py
│ ├── constructions.py
│ ├── convert.py
│ ├── core.py
│ ├── diffutil.py
│ ├── evaluation.py
│ ├── ioutil.py
│ ├── layer0.py
│ ├── layer1.py
│ ├── tests
│ │ ├── __init__.py
│ │ └── test_ucca.py
│ ├── textutil.py
│ └── visualization.py
│ └── uccaapp
│ ├── __init__.py
│ ├── api.py
│ ├── convert_and_evaluate.py
│ ├── download_task.py
│ └── upload_task.py
├── 15-vae
└── vae-lm.py
├── 16-reinforce
└── bilstm-tagger.py
├── COPYING
├── README.md
└── data
├── README.md
├── classes
├── dev.txt
├── test.txt
└── train.txt
├── parallel
├── dev.en
├── dev.ja
├── test.en
├── test.ja
├── train.en
└── train.ja
├── parsing
├── graph
│ ├── ptb_dev.txt
│ └── ptb_train.txt
├── shift_reduce
│ ├── small-dev.txt
│ ├── small-dev.unk.txt
│ ├── small-test.txt
│ ├── small-test.unk.txt
│ ├── small-train.txt
│ ├── small-train.unk.txt
│ └── vocab.txt
└── trees
│ ├── dev.txt
│ ├── test.txt
│ └── train.txt
├── ptb
├── test.txt
├── train.txt
└── valid.txt
├── tags
├── dev.txt
└── train.txt
└── trees
├── dev.txt
├── test.txt
└── train.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | *.log
3 | *.swp
4 | .DS_Store
5 | __MACOSX
6 | __pycache__
7 | 03-wordemb/*.txt
8 | 03-wordemb/*.png
9 | 04-efficiency/*.txt
10 | 04-efficiency/*.png
11 | 09-attention/*.png
12 |
13 | *.pyc
14 |
15 | .idea/
16 | *.iml
17 |
--------------------------------------------------------------------------------
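Every classifier under 01-intro below reads ../data/classes/{train,test}.txt with the same idiom: each line holds a tag and a lowercased sentence separated by " ||| ", words and tags are mapped to integer ids through a defaultdict, and after the training set is read the word vocabulary is frozen so that unseen dev/test words fall back to an unknown-word id. A minimal, self-contained sketch of that idiom (the input line and the "<unk>" placeholder name are illustrative, not taken from the repository data):

from collections import defaultdict

w2i = defaultdict(lambda: len(w2i))   # every new word gets the next free id
t2i = defaultdict(lambda: len(t2i))   # same for tags
UNK = w2i["<unk>"]                    # reserve an id for unknown words

def read_line(line):
    tag, words = line.lower().strip().split(" ||| ")
    return [w2i[x] for x in words.split(" ")], t2i[tag]

word_ids, tag_id = read_line("3 ||| a cheerful example sentence")  # hypothetical input line
w2i = defaultdict(lambda: UNK, w2i)   # freeze the vocabulary: unseen words now map to UNK
print(word_ids, tag_id, w2i["never-seen-word"])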
/01-intro/bow-pytorch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | BOW 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | import time 14 | import random 15 | import torch 16 | from torch.autograd import Variable 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | torch.manual_seed(1) 20 | 21 | 22 | # Functions to read in the corpus 23 | w2i = defaultdict(lambda: len(w2i)) 24 | t2i = defaultdict(lambda: len(t2i)) 25 | UNK = w2i[""] 26 | 27 | 28 | def read_dataset(filename): 29 | with open(filename, "r") as f: 30 | for line in f: 31 | tag, words = line.lower().strip().split(" ||| ") 32 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 33 | 34 | 35 | # Read in the data 36 | train = list(read_dataset("../data/classes/train.txt")) 37 | w2i = defaultdict(lambda: UNK, w2i) 38 | dev = list(read_dataset("../data/classes/test.txt")) 39 | nwords = len(w2i) 40 | ntags = len(t2i) 41 | 42 | 43 | class BOW(nn.Module): 44 | 45 | def __init__(self, vocab_size, embedding_dim): 46 | super(BOW, self).__init__() 47 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 48 | self.bias = nn.Parameter(torch.zeros(embedding_dim), requires_grad=True) 49 | 50 | def forward(self, inputs): 51 | embeds = self.embeddings(inputs) 52 | logits = torch.sum(embeds, 1) + self.bias 53 | return logits 54 | 55 | 56 | model = BOW(nwords, ntags) 57 | print(model) 58 | 59 | 60 | def evaluate(model, data): 61 | """Evaluate a model on a data set.""" 62 | correct = 0.0 63 | 64 | for words, tag in data: 65 | lookup_tensor = Variable(torch.LongTensor([words])) 66 | scores = model(lookup_tensor) 67 | predict = scores.data.numpy().argmax(axis=1)[0] 68 | 69 | if predict == tag: 70 | correct += 1 71 | 72 | return correct, len(data), correct/len(data) 73 | 74 | 75 | optimizer = optim.SGD(model.parameters(), lr=0.01) 76 | 77 | for ITER in range(100): 78 | 79 | random.shuffle(train) 80 | train_loss = 0.0 81 | start = time.time() 82 | 83 | for words, tag in train: 84 | 85 | # forward pass 86 | lookup_tensor = Variable(torch.LongTensor([words])) 87 | scores = model(lookup_tensor) 88 | loss = nn.CrossEntropyLoss() 89 | target = Variable(torch.LongTensor([tag])) 90 | output = loss(scores, target) 91 | train_loss += output.data[0] 92 | 93 | # backward pass 94 | model.zero_grad() 95 | output.backward() 96 | 97 | # update weights 98 | optimizer.step() 99 | 100 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % 101 | (ITER, train_loss/len(train), time.time()-start)) 102 | 103 | # evaluate 104 | _, _, acc = evaluate(model, dev) 105 | print("iter %r: test acc=%.4f" % (ITER, acc)) 106 | 107 | -------------------------------------------------------------------------------- /01-intro/bow-simple-pytorch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | BOW (simple version) 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | import time 14 | import random 15 | import torch 16 | from torch.autograd import Variable 17 | import torch.nn as nn 18 | 19 | torch.manual_seed(1) 20 | 21 | 22 | # Functions to read in the corpus 23 | w2i = defaultdict(lambda: len(w2i)) 24 | t2i = 
defaultdict(lambda: len(t2i)) 25 | UNK = w2i[""] 26 | 27 | 28 | def read_dataset(filename): 29 | with open(filename, "r") as f: 30 | for line in f: 31 | tag, words = line.lower().strip().split(" ||| ") 32 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 33 | 34 | 35 | # Read in the data 36 | train = list(read_dataset("../data/classes/train.txt")) 37 | w2i = defaultdict(lambda: UNK, w2i) 38 | dev = list(read_dataset("../data/classes/test.txt")) 39 | nwords = len(w2i) 40 | ntags = len(t2i) 41 | 42 | 43 | # The parameters for our BoW-model 44 | dtype = torch.FloatTensor # enable CUDA here if you like 45 | w = Variable(torch.randn(nwords, ntags).type(dtype), requires_grad=True) 46 | b = Variable(torch.randn(ntags).type(dtype), requires_grad=True) 47 | 48 | 49 | # A function to calculate scores for one sentence 50 | def calc_scores(words): 51 | lookup_tensor = Variable(torch.LongTensor(words)) 52 | embed = w[lookup_tensor] 53 | score = torch.sum(embed, 0) + b 54 | return score.view((1, -1)) 55 | 56 | 57 | for ITER in range(100): 58 | 59 | # train 60 | random.shuffle(train) 61 | train_loss = 0.0 62 | start = time.time() 63 | 64 | for words, tag in train: 65 | 66 | # forward pass 67 | scores = calc_scores(words) 68 | target = Variable(torch.LongTensor([tag])) 69 | loss = nn.CrossEntropyLoss() 70 | output = loss(scores, target) 71 | train_loss += output.data[0] 72 | 73 | # backward pass (compute gradients) 74 | output.backward() 75 | 76 | # update weights with SGD 77 | lr = 0.01 78 | w.data -= lr * w.grad.data 79 | b.data -= lr * b.grad.data 80 | 81 | # clear gradients for next step 82 | w.grad.data.zero_() 83 | b.grad.data.zero_() 84 | 85 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % 86 | (ITER, train_loss/len(train), time.time()-start)) 87 | 88 | # evaluate 89 | correct = 0.0 90 | for words, tag in dev: 91 | scores = calc_scores(words) 92 | predict = scores.data.numpy().argmax(axis=1) 93 | if predict == tag: 94 | correct += 1 95 | 96 | print("iter %r: test acc=%.4f" % 97 | (ITER, correct/len(dev))) 98 | -------------------------------------------------------------------------------- /01-intro/bow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from collections import defaultdict\n", 12 | "import time\n", 13 | "import random\n", 14 | "import dynet as dy\n", 15 | "import numpy as np" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# Functions to read in the corpus\n", 27 | "w2i = defaultdict(lambda: len(w2i))\n", 28 | "t2i = defaultdict(lambda: len(t2i))\n", 29 | "UNK = w2i[\"\"]\n", 30 | "def read_dataset(filename):\n", 31 | " with open(filename, \"r\") as f:\n", 32 | " for line in f:\n", 33 | " tag, words = line.lower().strip().split(\" ||| \")\n", 34 | " yield ([w2i[x] for x in words.split(\" \")], t2i[tag])" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "# Read in the data\n", 46 | "train = list(read_dataset(\"../data/classes/train.txt\"))\n", 47 | "w2i = defaultdict(lambda: UNK, w2i)\n", 48 | "dev = list(read_dataset(\"../data/classes/test.txt\"))\n", 49 | "nwords = len(w2i)\n", 50 | "ntags = len(t2i)" 51 | ] 52 | }, 53 | { 54 | 
"cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "train[0][1]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "# Start DyNet and define trainer\n", 73 | "model = dy.Model()\n", 74 | "trainer = dy.AdamTrainer(model)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "# Define the model\n", 86 | "W_sm = model.add_lookup_parameters((nwords, ntags)) # Word weights\n", 87 | "b_sm = model.add_parameters((ntags)) # Softmax bias" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# A function to calculate scores for one value\n", 99 | "def calc_scores(words):\n", 100 | " dy.renew_cg()\n", 101 | " score = dy.esum([dy.lookup(W_sm, x) for x in words])\n", 102 | " b_sm_exp = dy.parameter(b_sm)\n", 103 | " return score + b_sm_exp" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false, 111 | "scrolled": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "for ITER in range(100):\n", 116 | " # Perform training\n", 117 | " random.shuffle(train)\n", 118 | " train_loss = 0.0\n", 119 | " start = time.time()\n", 120 | " for words, tag in train:\n", 121 | " my_loss = dy.pickneglogsoftmax(calc_scores(words), tag)\n", 122 | " train_loss += my_loss.value()\n", 123 | " my_loss.backward()\n", 124 | " trainer.update()\n", 125 | " print(\"iter %r: train loss/sent=%.4f, time=%.2fs\" % (ITER, train_loss/len(train), time.time()-start))\n", 126 | " # Perform testing\n", 127 | " test_correct = 0.0\n", 128 | " for words, tag in dev:\n", 129 | " scores = calc_scores(words).npvalue()\n", 130 | " predict = np.argmax(scores)\n", 131 | " if predict == tag:\n", 132 | " test_correct += 1\n", 133 | " print(\"iter %r: test acc=%.4f\" % (ITER, test_correct/len(dev)))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "anaconda-cloud": {}, 148 | "kernelspec": { 149 | "display_name": "Python 3", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.6.0" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 2 168 | } 169 | -------------------------------------------------------------------------------- /01-intro/bow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | def read_dataset(filename): 12 | with open(filename, "r") as f: 13 | for line in f: 14 | tag, words = line.lower().strip().split(" ||| ") 15 | yield ([w2i[x] for x in words.split(" ")], 
t2i[tag]) 16 | 17 | # Read in the data 18 | train = list(read_dataset("../data/classes/train.txt")) 19 | w2i = defaultdict(lambda: UNK, w2i) 20 | dev = list(read_dataset("../data/classes/test.txt")) 21 | nwords = len(w2i) 22 | ntags = len(t2i) 23 | 24 | # Start DyNet and define trainer 25 | model = dy.Model() 26 | trainer = dy.AdamTrainer(model) 27 | 28 | # Define the model 29 | W_sm = model.add_lookup_parameters((nwords, ntags)) # Word weights 30 | b_sm = model.add_parameters((ntags)) # Softmax bias 31 | 32 | # A function to calculate scores for one value 33 | def calc_scores(words): 34 | # Create a computation graph, and add parameters 35 | dy.renew_cg() 36 | b_sm_exp = dy.parameter(b_sm) 37 | # Take the sum of all the embedding vectors for each word 38 | score = dy.esum([dy.lookup(W_sm, x) for x in words]) 39 | # Add the bias vector and return 40 | return score + b_sm_exp 41 | 42 | for ITER in range(100): 43 | # Perform training 44 | random.shuffle(train) 45 | train_loss = 0.0 46 | start = time.time() 47 | for words, tag in train: 48 | my_loss = dy.pickneglogsoftmax(calc_scores(words), tag) 49 | train_loss += my_loss.value() 50 | my_loss.backward() 51 | trainer.update() 52 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 53 | # Perform testing 54 | test_correct = 0.0 55 | for words, tag in dev: 56 | scores = calc_scores(words).npvalue() 57 | predict = np.argmax(scores) 58 | if predict == tag: 59 | test_correct += 1 60 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 61 | -------------------------------------------------------------------------------- /01-intro/cbow-pytorch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | CBOW 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | import time 14 | import random 15 | import torch 16 | from torch.autograd import Variable 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | 20 | torch.manual_seed(1) 21 | 22 | 23 | # Functions to read in the corpus 24 | w2i = defaultdict(lambda: len(w2i)) 25 | t2i = defaultdict(lambda: len(t2i)) 26 | UNK = w2i[""] 27 | 28 | 29 | def read_dataset(filename): 30 | with open(filename, "r") as f: 31 | for line in f: 32 | tag, words = line.lower().strip().split(" ||| ") 33 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 34 | 35 | 36 | # Read in the data 37 | train = list(read_dataset("../data/classes/train.txt")) 38 | w2i = defaultdict(lambda: UNK, w2i) 39 | dev = list(read_dataset("../data/classes/test.txt")) 40 | nwords = len(w2i) 41 | ntags = len(t2i) 42 | 43 | 44 | class CBOW(nn.Module): 45 | 46 | def __init__(self, vocab_size, embedding_dim, output_dim): 47 | super(CBOW, self).__init__() 48 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 49 | self.linear = nn.Linear(embedding_dim, output_dim) 50 | 51 | def forward(self, inputs): 52 | embeds = self.embeddings(inputs) 53 | bow = torch.sum(embeds, 1) 54 | logits = self.linear(bow) 55 | return logits 56 | 57 | 58 | model = CBOW(nwords, 64, ntags) 59 | print(model) 60 | 61 | 62 | def evaluate(model, data): 63 | """Evaluate a model on a data set.""" 64 | correct = 0.0 65 | 66 | for words, tag in data: 67 | lookup_tensor = Variable(torch.LongTensor([words])) 68 | scores = model(lookup_tensor) 69 | predict = scores.data.numpy().argmax(axis=1)[0] 
70 | 71 | if predict == tag: 72 | correct += 1 73 | 74 | return correct, len(data), correct/len(data) 75 | 76 | 77 | optimizer = optim.SGD(model.parameters(), lr=0.001) 78 | 79 | for ITER in range(100): 80 | 81 | random.shuffle(train) 82 | train_loss = 0.0 83 | start = time.time() 84 | 85 | for words, tag in train: 86 | 87 | # forward pass 88 | lookup_tensor = Variable(torch.LongTensor([words])) 89 | scores = model(lookup_tensor) 90 | loss = nn.CrossEntropyLoss() 91 | target = Variable(torch.LongTensor([tag])) 92 | output = loss(scores, target) 93 | train_loss += output.data[0] 94 | 95 | # backward pass 96 | model.zero_grad() 97 | output.backward() 98 | 99 | # update weights 100 | optimizer.step() 101 | 102 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % 103 | (ITER, train_loss/len(train), time.time()-start)) 104 | 105 | # evaluate 106 | _, _, acc = evaluate(model, dev) 107 | print("iter %r: test acc=%.4f" % (ITER, acc)) 108 | -------------------------------------------------------------------------------- /01-intro/cbow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from collections import defaultdict\n", 12 | "import time\n", 13 | "import random\n", 14 | "import dynet as dy\n", 15 | "import numpy as np\n", 16 | "\n", 17 | "# Functions to read in the corpus\n", 18 | "w2i = defaultdict(lambda: len(w2i))\n", 19 | "t2i = defaultdict(lambda: len(t2i))\n", 20 | "UNK = w2i[\"\"]\n", 21 | "def read_dataset(filename):\n", 22 | " with open(filename, \"r\") as f:\n", 23 | " for line in f:\n", 24 | " tag, words = line.lower().strip().split(\" ||| \")\n", 25 | " yield ([w2i[x] for x in words.split(\" \")], t2i[tag])\n", 26 | "\n", 27 | "# Read in the data\n", 28 | "train = list(read_dataset(\"../data/classes/train.txt\"))\n", 29 | "w2i = defaultdict(lambda: UNK, w2i)\n", 30 | "dev = list(read_dataset(\"../data/classes/test.txt\"))\n", 31 | "nwords = len(w2i)\n", 32 | "ntags = len(t2i)\n", 33 | "\n", 34 | "# Start DyNet and define trainer\n", 35 | "model = dy.Model()\n", 36 | "trainer = dy.AdamTrainer(model)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# Define the model\n", 48 | "EMB_SIZE = 64\n", 49 | "W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings\n", 50 | "W_sm = model.add_parameters((ntags, EMB_SIZE)) # Softmax weights\n", 51 | "b_sm = model.add_parameters((ntags)) # Softmax bias" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# A function to calculate scores for one value\n", 63 | "def calc_scores(words):\n", 64 | " dy.renew_cg()\n", 65 | " cbow = dy.esum([dy.lookup(W_emb, x) for x in words])\n", 66 | " W_sm_exp = dy.parameter(W_sm)\n", 67 | " b_sm_exp = dy.parameter(b_sm)\n", 68 | " return W_sm_exp * cbow + b_sm_exp" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "for ITER in range(100):\n", 80 | " # Perform training\n", 81 | " random.shuffle(train)\n", 82 | " train_loss = 0.0\n", 83 | " start = time.time()\n", 84 | " for words, tag in train:\n", 85 | " my_loss = 
dy.pickneglogsoftmax(calc_scores(words), tag)\n", 86 | " train_loss += my_loss.value()\n", 87 | " my_loss.backward()\n", 88 | " trainer.update()\n", 89 | " print(\"iter %r: train loss/sent=%.4f, time=%.2fs\" % (ITER, train_loss/len(train), time.time()-start))\n", 90 | " # Perform testing\n", 91 | " test_correct = 0.0\n", 92 | " for words, tag in dev:\n", 93 | " scores = calc_scores(words).npvalue()\n", 94 | " predict = np.argmax(scores)\n", 95 | " if predict == tag:\n", 96 | " test_correct += 1\n", 97 | " print(\"iter %r: test acc=%.4f\" % (ITER, test_correct/len(dev)))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [] 108 | } 109 | ], 110 | "metadata": { 111 | "anaconda-cloud": {}, 112 | "kernelspec": { 113 | "display_name": "Python 3", 114 | "language": "python", 115 | "name": "python3" 116 | }, 117 | "language_info": { 118 | "codemirror_mode": { 119 | "name": "ipython", 120 | "version": 3 121 | }, 122 | "file_extension": ".py", 123 | "mimetype": "text/x-python", 124 | "name": "python", 125 | "nbconvert_exporter": "python", 126 | "pygments_lexer": "ipython3", 127 | "version": "3.6.0" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 2 132 | } 133 | -------------------------------------------------------------------------------- /01-intro/cbow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | def read_dataset(filename): 12 | with open(filename, "r") as f: 13 | for line in f: 14 | tag, words = line.lower().strip().split(" ||| ") 15 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 16 | 17 | # Read in the data 18 | train = list(read_dataset("../data/classes/train.txt")) 19 | w2i = defaultdict(lambda: UNK, w2i) 20 | dev = list(read_dataset("../data/classes/test.txt")) 21 | nwords = len(w2i) 22 | ntags = len(t2i) 23 | 24 | # Start DyNet and define trainer 25 | model = dy.Model() 26 | trainer = dy.AdamTrainer(model) 27 | 28 | # Define the model 29 | EMB_SIZE = 64 30 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 31 | W_sm = model.add_parameters((ntags, EMB_SIZE)) # Softmax weights 32 | b_sm = model.add_parameters((ntags)) # Softmax bias 33 | 34 | # A function to calculate scores for one value 35 | def calc_scores(words): 36 | dy.renew_cg() 37 | cbow = dy.esum([dy.lookup(W_emb, x) for x in words]) 38 | W_sm_exp = dy.parameter(W_sm) 39 | b_sm_exp = dy.parameter(b_sm) 40 | return W_sm_exp * cbow + b_sm_exp 41 | 42 | for ITER in range(100): 43 | # Perform training 44 | random.shuffle(train) 45 | train_loss = 0.0 46 | start = time.time() 47 | for words, tag in train: 48 | my_loss = dy.pickneglogsoftmax(calc_scores(words), tag) 49 | train_loss += my_loss.value() 50 | my_loss.backward() 51 | trainer.update() 52 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 53 | # Perform testing 54 | test_correct = 0.0 55 | for words, tag in dev: 56 | scores = calc_scores(words).npvalue() 57 | predict = np.argmax(scores) 58 | if predict == tag: 59 | test_correct += 1 60 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 61 | 
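The *-pytorch.py variants above are written against the old 0.3-era PyTorch API, wrapping tensors in torch.autograd.Variable and reading losses with loss.data[0]. On current PyTorch the same CBOW training step can be written more directly; a minimal sketch, assuming PyTorch >= 1.0, with toy sizes chosen only for illustration (this is not part of the repository):

import torch
import torch.nn as nn

class CBOW(nn.Module):
    """Sum word embeddings, then project to tag scores (same shape of model as cbow-pytorch.py)."""
    def __init__(self, vocab_size, emb_dim, ntags):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.out = nn.Linear(emb_dim, ntags)

    def forward(self, word_ids):              # word_ids: (batch, seq_len)
        return self.out(self.emb(word_ids).sum(dim=1))

model = CBOW(vocab_size=1000, emb_dim=64, ntags=5)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

words = torch.tensor([[1, 2, 3, 4]])          # one sentence of word ids; no Variable wrapper needed
tag = torch.tensor([2])                       # its gold label
loss = criterion(model(words), tag)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(loss.item())                            # .item() replaces loss.data[0]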
-------------------------------------------------------------------------------- /01-intro/deep-cbow-pytorch-minibatch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | Deep CBOW (with minibatching) 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | from collections import namedtuple 14 | import time 15 | import random 16 | import torch 17 | from torch.autograd import Variable 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | import torch.optim as optim 21 | 22 | torch.manual_seed(1) 23 | random.seed(1) 24 | 25 | 26 | CUDA = torch.cuda.is_available() 27 | print("CUDA: %s" % CUDA) 28 | 29 | 30 | # Functions to read in the corpus 31 | w2i = defaultdict(lambda: len(w2i)) 32 | t2i = defaultdict(lambda: len(t2i)) 33 | UNK = w2i[""] 34 | PAD = w2i[""] 35 | 36 | # One data point 37 | Example = namedtuple("Example", ["words", "tag"]) 38 | 39 | 40 | def read_dataset(filename): 41 | with open(filename, "r") as f: 42 | for line in f: 43 | tag, words = line.lower().strip().split(" ||| ") 44 | yield Example(words=[w2i[x] for x in words.split(" ")], 45 | tag=t2i[tag]) 46 | 47 | 48 | # Read in the data 49 | train = list(read_dataset("../data/classes/train.txt")) 50 | w2i = defaultdict(lambda: UNK, w2i) 51 | dev = list(read_dataset("../data/classes/test.txt")) 52 | nwords = len(w2i) 53 | ntags = len(t2i) 54 | 55 | 56 | class DeepCBOW(nn.Module): 57 | """ 58 | Deep CBOW model 59 | """ 60 | 61 | def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim): 62 | super(DeepCBOW, self).__init__() 63 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 64 | self.linear1 = nn.Linear(embedding_dim, hidden_dim) 65 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 66 | self.linear3 = nn.Linear(hidden_dim, output_dim) 67 | 68 | def forward(self, inputs): 69 | embeds = self.embeddings(inputs) 70 | h = torch.sum(embeds, 1) 71 | h = F.relu(self.linear1(h)) 72 | h = F.relu(self.linear2(h)) 73 | h = self.linear3(h) 74 | return h 75 | 76 | 77 | model = DeepCBOW(nwords, 64, 64, ntags) 78 | 79 | if CUDA: 80 | model.cuda() 81 | 82 | print(model) 83 | 84 | 85 | def minibatch(data, batch_size=32): 86 | for i in range(0, len(data), batch_size): 87 | yield data[i:i+batch_size] 88 | 89 | 90 | def evaluate(model, data): 91 | """Evaluate a model on a data set.""" 92 | correct = 0.0 93 | 94 | for batch in minibatch(data): 95 | 96 | seqs, tags = preprocess(batch) 97 | scores = model(get_variable(seqs)) 98 | _, predictions = torch.max(scores.data, 1) 99 | targets = get_variable(tags) 100 | 101 | correct += torch.eq(predictions, targets).sum().data[0] 102 | 103 | return correct, len(data), correct/len(data) 104 | 105 | 106 | def get_variable(x): 107 | """Get a Variable given indices x""" 108 | tensor = torch.cuda.LongTensor(x) if CUDA else torch.LongTensor(x) 109 | return Variable(tensor) 110 | 111 | 112 | def preprocess(batch): 113 | """ Add zero-padding to a batch. 
""" 114 | 115 | tags = [example.tag for example in batch] 116 | 117 | # add zero-padding to make all sequences equally long 118 | seqs = [example.words for example in batch] 119 | max_length = max(map(len, seqs)) 120 | seqs = [seq + [PAD] * (max_length - len(seq)) for seq in seqs] 121 | 122 | return seqs, tags 123 | 124 | 125 | optimizer = optim.Adam(model.parameters(), lr=0.001) 126 | 127 | for ITER in range(100): 128 | 129 | random.shuffle(train) 130 | train_loss = 0.0 131 | start = time.time() 132 | updates = 0 133 | 134 | for batch in minibatch(train): 135 | 136 | updates += 1 137 | 138 | # pad data with zeros 139 | seqs, tags = preprocess(batch) 140 | 141 | # forward pass 142 | scores = model(get_variable(seqs)) 143 | targets = get_variable(tags) 144 | loss = nn.CrossEntropyLoss() 145 | output = loss(scores, targets) 146 | train_loss += output.data[0] 147 | 148 | # backward pass 149 | model.zero_grad() 150 | output.backward() 151 | 152 | # update weights 153 | optimizer.step() 154 | 155 | print("iter %r: avg train loss=%.4f, time=%.2fs" % 156 | (ITER, train_loss/updates, time.time()-start)) 157 | 158 | # evaluate 159 | _, _, acc_train = evaluate(model, train) 160 | _, _, acc_dev = evaluate(model, dev) 161 | print("iter %r: train acc=%.4f test acc=%.4f" % (ITER, acc_train, acc_dev)) 162 | -------------------------------------------------------------------------------- /01-intro/deep-cbow-pytorch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | """ 4 | Deep CBOW 5 | 6 | Based on Graham Neubig's DyNet code examples: 7 | https://github.com/neubig/nn4nlp2017-code 8 | http://phontron.com/class/nn4nlp2017/ 9 | 10 | """ 11 | 12 | from collections import defaultdict 13 | import time 14 | import random 15 | import torch 16 | from torch.autograd import Variable 17 | import torch.nn as nn 18 | import torch.nn.functional as F 19 | import torch.optim as optim 20 | 21 | torch.manual_seed(1) 22 | 23 | 24 | # Functions to read in the corpus 25 | w2i = defaultdict(lambda: len(w2i)) 26 | t2i = defaultdict(lambda: len(t2i)) 27 | UNK = w2i[""] 28 | 29 | 30 | def read_dataset(filename): 31 | with open(filename, "r") as f: 32 | for line in f: 33 | tag, words = line.lower().strip().split(" ||| ") 34 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 35 | 36 | 37 | # Read in the data 38 | train = list(read_dataset("../data/classes/train.txt")) 39 | w2i = defaultdict(lambda: UNK, w2i) 40 | dev = list(read_dataset("../data/classes/test.txt")) 41 | nwords = len(w2i) 42 | ntags = len(t2i) 43 | 44 | 45 | class DeepCBOW(nn.Module): 46 | 47 | def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim): 48 | super(DeepCBOW, self).__init__() 49 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 50 | self.linear1 = nn.Linear(embedding_dim, hidden_dim) 51 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 52 | self.linear3 = nn.Linear(hidden_dim, output_dim) 53 | 54 | def forward(self, inputs): 55 | embeds = self.embeddings(inputs) 56 | h = torch.sum(embeds, 1) 57 | h = F.tanh(self.linear1(h)) 58 | h = F.tanh(self.linear2(h)) 59 | h = self.linear3(h) 60 | return h 61 | 62 | 63 | model = DeepCBOW(nwords, 64, 64, ntags) 64 | 65 | 66 | print(model) 67 | 68 | 69 | def evaluate(model, data): 70 | """Evaluate a model on a data set.""" 71 | correct = 0.0 72 | 73 | for words, tag in data: 74 | scores = model(get_tensor([words])) 75 | predict = scores.data.numpy().argmax(axis=1)[0] 76 | 77 | if predict == tag: 78 | correct += 1 79 | 80 | 
return correct, len(data), correct/len(data) 81 | 82 | 83 | def get_tensor(x): 84 | """Get a Variable given indices x""" 85 | return Variable(torch.LongTensor(x)) 86 | 87 | 88 | optimizer = optim.Adam(model.parameters(), lr=0.001) 89 | 90 | for ITER in range(100): 91 | 92 | random.shuffle(train) 93 | train_loss = 0.0 94 | start = time.time() 95 | 96 | for words, tag in train: 97 | 98 | # forward pass 99 | scores = model(get_tensor([words])) 100 | 101 | loss = nn.CrossEntropyLoss() 102 | target = get_tensor([tag]) 103 | output = loss(scores, target) 104 | train_loss += output.data[0] 105 | 106 | # backward pass 107 | model.zero_grad() 108 | output.backward() 109 | 110 | # update weights 111 | optimizer.step() 112 | 113 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % 114 | (ITER, train_loss/len(train), time.time()-start)) 115 | 116 | # evaluate 117 | _, _, acc = evaluate(model, dev) 118 | print("iter %r: test acc=%.4f" % (ITER, acc)) 119 | -------------------------------------------------------------------------------- /01-intro/deep-cbow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from collections import defaultdict\n", 12 | "import time\n", 13 | "import random\n", 14 | "import dynet as dy\n", 15 | "import numpy as np\n", 16 | "\n", 17 | "# Functions to read in the corpus\n", 18 | "w2i = defaultdict(lambda: len(w2i))\n", 19 | "t2i = defaultdict(lambda: len(t2i))\n", 20 | "UNK = w2i[\"\"]\n", 21 | "def read_dataset(filename):\n", 22 | " with open(filename, \"r\") as f:\n", 23 | " for line in f:\n", 24 | " tag, words = line.lower().strip().split(\" ||| \")\n", 25 | " yield ([w2i[x] for x in words.split(\" \")], t2i[tag])\n", 26 | "\n", 27 | "# Read in the data\n", 28 | "train = list(read_dataset(\"../data/classes/train.txt\"))\n", 29 | "w2i = defaultdict(lambda: UNK, w2i)\n", 30 | "dev = list(read_dataset(\"../data/classes/test.txt\"))\n", 31 | "nwords = len(w2i)\n", 32 | "ntags = len(t2i)\n", 33 | "\n", 34 | "# Start DyNet and define trainer\n", 35 | "model = dy.Model()\n", 36 | "trainer = dy.AdamTrainer(model)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# Define the model\n", 48 | "EMB_SIZE = 64\n", 49 | "HID_SIZE = 64\n", 50 | "HID_LAY = 2\n", 51 | "W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings\n", 52 | "W_h = [model.add_parameters((HID_SIZE, EMB_SIZE if lay == 0 else HID_SIZE)) for lay in range(HID_LAY)]\n", 53 | "b_h = [model.add_parameters((HID_SIZE)) for lay in range(HID_LAY)]\n", 54 | "W_sm = model.add_parameters((ntags, HID_SIZE)) # Softmax weights\n", 55 | "b_sm = model.add_parameters((ntags)) # Softmax bias" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "# A function to calculate scores for one value\n", 67 | "def calc_scores(words):\n", 68 | " dy.renew_cg()\n", 69 | " h = dy.esum([dy.lookup(W_emb, x) for x in words])\n", 70 | " for W_h_i, b_h_i in zip(W_h, b_h):\n", 71 | " h = dy.tanh( dy.parameter(W_h_i) * h + dy.parameter(b_h_i) )\n", 72 | " return dy.parameter(W_sm) * h + dy.parameter(b_sm)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 
79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "for ITER in range(100):\n", 84 | " # Perform training\n", 85 | " random.shuffle(train)\n", 86 | " train_loss = 0.0\n", 87 | " start = time.time()\n", 88 | " for words, tag in train:\n", 89 | " my_loss = dy.pickneglogsoftmax(calc_scores(words), tag)\n", 90 | " train_loss += my_loss.value()\n", 91 | " my_loss.backward()\n", 92 | " trainer.update()\n", 93 | " print(\"iter %r: train loss/sent=%.4f, time=%.2fs\" % (ITER, train_loss/len(train), time.time()-start))\n", 94 | " # Perform testing\n", 95 | " test_correct = 0.0\n", 96 | " for words, tag in dev:\n", 97 | " scores = calc_scores(words).npvalue()\n", 98 | " predict = np.argmax(scores)\n", 99 | " if predict == tag:\n", 100 | " test_correct += 1\n", 101 | " print(\"iter %r: test acc=%.4f\" % (ITER, test_correct/len(dev)))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.6.0" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /01-intro/deep-cbow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | def read_dataset(filename): 12 | with open(filename, "r") as f: 13 | for line in f: 14 | tag, words = line.lower().strip().split(" ||| ") 15 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 16 | 17 | # Read in the data 18 | train = list(read_dataset("../data/classes/train.txt")) 19 | w2i = defaultdict(lambda: UNK, w2i) 20 | dev = list(read_dataset("../data/classes/test.txt")) 21 | nwords = len(w2i) 22 | ntags = len(t2i) 23 | 24 | # Start DyNet and define trainer 25 | model = dy.Model() 26 | trainer = dy.AdamTrainer(model) 27 | 28 | # Define the model 29 | EMB_SIZE = 64 30 | HID_SIZE = 64 31 | HID_LAY = 2 32 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 33 | W_h = [model.add_parameters((HID_SIZE, EMB_SIZE if lay == 0 else HID_SIZE)) for lay in range(HID_LAY)] 34 | b_h = [model.add_parameters((HID_SIZE)) for lay in range(HID_LAY)] 35 | W_sm = model.add_parameters((ntags, HID_SIZE)) # Softmax weights 36 | b_sm = model.add_parameters((ntags)) # Softmax bias 37 | 38 | # A function to calculate scores for one value 39 | def calc_scores(words): 40 | dy.renew_cg() 41 | h = dy.esum([dy.lookup(W_emb, x) for x in words]) 42 | for W_h_i, b_h_i in zip(W_h, b_h): 43 | h = dy.tanh( dy.parameter(W_h_i) * h + dy.parameter(b_h_i) ) 44 | return dy.parameter(W_sm) * h + dy.parameter(b_sm) 45 | 46 | for ITER in range(100): 47 | # Perform training 48 | random.shuffle(train) 49 | train_loss = 0.0 50 | start = time.time() 51 | for words, tag in train: 52 | my_loss = 
dy.pickneglogsoftmax(calc_scores(words), tag) 53 | train_loss += my_loss.value() 54 | my_loss.backward() 55 | trainer.update() 56 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 57 | # Perform training 58 | test_correct = 0.0 59 | for words, tag in dev: 60 | scores = calc_scores(words).npvalue() 61 | predict = np.argmax(scores) 62 | if predict == tag: 63 | test_correct += 1 64 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 65 | -------------------------------------------------------------------------------- /02-lm/loglin-lm.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | # The length of the n-gram 9 | N = 2 10 | 11 | # Functions to read in the corpus 12 | # NOTE: We are using data from the Penn Treebank, which is already converted 13 | # into an easy-to-use format with "" symbols. If we were using other 14 | # data we would have to do pre-processing and consider how to choose 15 | # unknown words, etc. 16 | w2i = defaultdict(lambda: len(w2i)) 17 | S = w2i[""] 18 | UNK = w2i[""] 19 | def read_dataset(filename): 20 | with open(filename, "r") as f: 21 | for line in f: 22 | yield [w2i[x] for x in line.strip().split(" ")] 23 | 24 | # Read in the data 25 | train = list(read_dataset("../data/ptb/train.txt")) 26 | w2i = defaultdict(lambda: UNK, w2i) 27 | dev = list(read_dataset("../data/ptb/valid.txt")) 28 | i2w = {v: k for k, v in w2i.items()} 29 | nwords = len(w2i) 30 | 31 | # Start DyNet and define trainer 32 | model = dy.Model() 33 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 34 | 35 | # Define the model 36 | W_sm = [model.add_lookup_parameters((nwords, nwords)) for _ in range(N)] # Word weights at each position 37 | b_sm = model.add_parameters((nwords)) # Softmax bias 38 | 39 | # A function to calculate scores for one value 40 | def calc_score_of_history(words): 41 | # Create a list of things to sum up with only the bias vector at first 42 | score_vecs = [dy.parameter(b_sm)] 43 | for word_id, lookup_param in zip(words, W_sm): 44 | score_vecs.append(lookup_param[word_id]) 45 | return dy.esum(score_vecs) 46 | 47 | # Calculate the loss value for the entire sentence 48 | def calc_sent_loss(sent): 49 | # Create a computation graph 50 | dy.renew_cg() 51 | # The initial history is equal to end of sentence symbols 52 | hist = [S] * N 53 | # Step through the sentence, including the end of sentence token 54 | all_losses = [] 55 | for next_word in sent + [S]: 56 | s = calc_score_of_history(hist) 57 | all_losses.append(dy.pickneglogsoftmax(s, next_word)) 58 | hist = hist[1:] + [next_word] 59 | return dy.esum(all_losses) 60 | 61 | MAX_LEN = 100 62 | # Generate a sentence 63 | def generate_sent(): 64 | dy.renew_cg() 65 | hist = [S] * N 66 | sent = [] 67 | while True: 68 | p = dy.softmax(calc_score_of_history(hist)).npvalue() 69 | next_word = np.random.choice(nwords, p=p/p.sum()) 70 | if next_word == S or len(sent) == MAX_LEN: 71 | break 72 | sent.append(next_word) 73 | hist = hist[1:] + [next_word] 74 | return sent 75 | 76 | for ITER in range(100): 77 | # Perform training 78 | random.shuffle(train) 79 | train_words, train_loss = 0, 0.0 80 | start = time.time() 81 | for sent_id, sent in enumerate(train): 82 | my_loss = calc_sent_loss(sent) 83 | train_loss += my_loss.value() 84 | train_words += len(sent) 85 | my_loss.backward() 86 | 
trainer.update() 87 | if (sent_id+1) % 5000 == 0: 88 | print("--finished %r sentences" % (sent_id+1)) 89 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 90 | # Evaluate on dev set 91 | dev_words, dev_loss = 0, 0.0 92 | start = time.time() 93 | for sent_id, sent in enumerate(dev): 94 | my_loss = calc_sent_loss(sent) 95 | dev_loss += my_loss.value() 96 | dev_words += len(sent) 97 | trainer.update() 98 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 99 | # Generate a few sentences 100 | for _ in range(5): 101 | sent = generate_sent() 102 | print(" ".join([i2w[x] for x in sent])) 103 | -------------------------------------------------------------------------------- /02-lm/nn-lm-batch.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N = 2 # The length of the n-gram 9 | EMB_SIZE = 128 # The size of the embedding 10 | HID_SIZE = 128 # The size of the hidden layer 11 | 12 | # Functions to read in the corpus 13 | # NOTE: We are using data from the Penn Treebank, which is already converted 14 | # into an easy-to-use format with "" symbols. If we were using other 15 | # data we would have to do pre-processing and consider how to choose 16 | # unknown words, etc. 17 | w2i = defaultdict(lambda: len(w2i)) 18 | S = w2i[""] 19 | UNK = w2i[""] 20 | def read_dataset(filename): 21 | with open(filename, "r") as f: 22 | for line in f: 23 | yield [w2i[x] for x in line.strip().split(" ")] 24 | 25 | # Read in the data 26 | train = list(read_dataset("../data/ptb/train.txt")) 27 | w2i = defaultdict(lambda: UNK, w2i) 28 | dev = list(read_dataset("../data/ptb/valid.txt")) 29 | i2w = {v: k for k, v in w2i.items()} 30 | nwords = len(w2i) 31 | 32 | # Start DyNet and define trainer 33 | model = dy.Model() 34 | trainer = dy.AdamTrainer(model, alpha=0.001) 35 | 36 | # Define the model 37 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 38 | W_h_p = model.add_parameters((HID_SIZE, EMB_SIZE * N)) # Weights of the softmax 39 | b_h_p = model.add_parameters((HID_SIZE)) # Weights of the softmax 40 | W_sm_p = model.add_parameters((nwords, HID_SIZE)) # Weights of the softmax 41 | b_sm_p = model.add_parameters((nwords)) # Softmax bias 42 | 43 | # A function to calculate scores for one value 44 | def calc_score_of_histories(words, dropout=0.0): 45 | # This will change from a list of histories, to a list of words in each history position 46 | words = np.transpose(words) 47 | # Lookup the embeddings and concatenate them 48 | emb = dy.concatenate([dy.lookup_batch(W_emb, x) for x in words]) 49 | # Create the hidden layer 50 | W_h = dy.parameter(W_h_p) 51 | b_h = dy.parameter(b_h_p) 52 | h = dy.tanh(dy.affine_transform([b_h, W_h, emb])) 53 | # Perform dropout 54 | if dropout != 0.0: 55 | h = dy.dropout(h, dropout) 56 | # Calculate the score and return 57 | W_sm = dy.parameter(W_sm_p) 58 | b_sm = dy.parameter(b_sm_p) 59 | return dy.affine_transform([b_sm, W_sm, h]) 60 | 61 | # Calculate the loss value for the entire sentence 62 | def calc_sent_loss(sent, dropout=0.0): 63 | # Create a computation graph 64 | dy.renew_cg() 65 | # The initial history is equal to end of sentence symbols 66 | hist = [S] * N 67 | # Step through the 
sentence, including the end of sentence token 68 | all_histories = [] 69 | all_targets = [] 70 | for next_word in sent + [S]: 71 | all_histories.append(list(hist)) 72 | all_targets.append(next_word) 73 | hist = hist[1:] + [next_word] 74 | s = calc_score_of_histories(all_histories, dropout=dropout) 75 | return dy.sum_batches(dy.pickneglogsoftmax_batch(s, all_targets)) 76 | 77 | MAX_LEN = 100 78 | # Generate a sentence 79 | def generate_sent(): 80 | dy.renew_cg() 81 | hist = [S] * N 82 | sent = [] 83 | while True: 84 | p = dy.softmax(calc_score_of_histories([hist])).npvalue() 85 | next_word = np.random.choice(nwords, p=p/p.sum()) 86 | if next_word == S or len(sent) == MAX_LEN: 87 | break 88 | sent.append(next_word) 89 | hist = hist[1:] + [next_word] 90 | return sent 91 | 92 | last_dev = 1e20 93 | best_dev = 1e20 94 | 95 | for ITER in range(100): 96 | # Perform training 97 | random.shuffle(train) 98 | train_words, train_loss = 0, 0.0 99 | start = time.time() 100 | for sent_id, sent in enumerate(train): 101 | my_loss = calc_sent_loss(sent, dropout=0.2) 102 | train_loss += my_loss.value() 103 | train_words += len(sent) 104 | my_loss.backward() 105 | trainer.update() 106 | if (sent_id+1) % 5000 == 0: 107 | print("--finished %r sentences" % (sent_id+1)) 108 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 109 | # Evaluate on dev set 110 | dev_words, dev_loss = 0, 0.0 111 | start = time.time() 112 | for sent_id, sent in enumerate(dev): 113 | my_loss = calc_sent_loss(sent) 114 | dev_loss += my_loss.value() 115 | dev_words += len(sent) 116 | trainer.update() 117 | # Keep track of the development accuracy and reduce the learning rate if it got worse 118 | if last_dev < dev_loss: 119 | trainer.learning_rate /= 2 120 | last_dev = dev_loss 121 | # Keep track of the best development accuracy, and save the model only if it's the best one 122 | if best_dev > dev_loss: 123 | model.save("model.txt") 124 | best_dev = dev_loss 125 | # Save the model 126 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 127 | # Generate a few sentences 128 | for _ in range(5): 129 | sent = generate_sent() 130 | print(" ".join([i2w[x] for x in sent])) 131 | -------------------------------------------------------------------------------- /02-lm/nn-lm-optim.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N = 2 # The length of the n-gram 9 | EMB_SIZE = 128 # The size of the embedding 10 | HID_SIZE = 128 # The size of the hidden layer 11 | 12 | # Functions to read in the corpus 13 | # NOTE: We are using data from the Penn Treebank, which is already converted 14 | # into an easy-to-use format with "" symbols. If we were using other 15 | # data we would have to do pre-processing and consider how to choose 16 | # unknown words, etc. 
17 | w2i = defaultdict(lambda: len(w2i)) 18 | S = w2i[""] 19 | UNK = w2i[""] 20 | def read_dataset(filename): 21 | with open(filename, "r") as f: 22 | for line in f: 23 | yield [w2i[x] for x in line.strip().split(" ")] 24 | 25 | # Read in the data 26 | train = list(read_dataset("../data/ptb/train.txt")) 27 | w2i = defaultdict(lambda: UNK, w2i) 28 | dev = list(read_dataset("../data/ptb/valid.txt")) 29 | i2w = {v: k for k, v in w2i.items()} 30 | nwords = len(w2i) 31 | 32 | # Start DyNet and define trainer 33 | model = dy.Model() 34 | 35 | # CHANGE 1: Use Adam instead of Simple SGD 36 | trainer = dy.AdamTrainer(model, alpha=0.001) 37 | 38 | # Define the model 39 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 40 | W_h_p = model.add_parameters((HID_SIZE, EMB_SIZE * N)) # Weights of the softmax 41 | b_h_p = model.add_parameters((HID_SIZE)) # Weights of the softmax 42 | W_sm_p = model.add_parameters((nwords, HID_SIZE)) # Weights of the softmax 43 | b_sm_p = model.add_parameters((nwords)) # Softmax bias 44 | 45 | # A function to calculate scores for one value 46 | def calc_score_of_history(words, dropout=0.0): 47 | # Lookup the embeddings and concatenate them 48 | emb = dy.concatenate([W_emb[x] for x in words]) 49 | # Create the hidden layer 50 | W_h = dy.parameter(W_h_p) 51 | b_h = dy.parameter(b_h_p) 52 | h = dy.tanh(dy.affine_transform([b_h, W_h, emb])) 53 | # CHANGE 2: perform dropout 54 | if dropout != 0.0: 55 | h = dy.dropout(h, dropout) 56 | # Calculate the score and return 57 | W_sm = dy.parameter(W_sm_p) 58 | b_sm = dy.parameter(b_sm_p) 59 | return dy.affine_transform([b_sm, W_sm, h]) 60 | 61 | # Calculate the loss value for the entire sentence 62 | def calc_sent_loss(sent, dropout=0.0): 63 | # Create a computation graph 64 | dy.renew_cg() 65 | # The initial history is equal to end of sentence symbols 66 | hist = [S] * N 67 | # Step through the sentence, including the end of sentence token 68 | all_losses = [] 69 | for next_word in sent + [S]: 70 | s = calc_score_of_history(hist, dropout=dropout) 71 | all_losses.append(dy.pickneglogsoftmax(s, next_word)) 72 | hist = hist[1:] + [next_word] 73 | return dy.esum(all_losses) 74 | 75 | MAX_LEN = 100 76 | # Generate a sentence 77 | def generate_sent(): 78 | dy.renew_cg() 79 | hist = [S] * N 80 | sent = [] 81 | while True: 82 | p = dy.softmax(calc_score_of_history(hist)).npvalue() 83 | next_word = np.random.choice(nwords, p=p/p.sum()) 84 | if next_word == S or len(sent) == MAX_LEN: 85 | break 86 | sent.append(next_word) 87 | hist = hist[1:] + [next_word] 88 | return sent 89 | 90 | last_dev = 1e20 91 | best_dev = 1e20 92 | 93 | for ITER in range(100): 94 | # Perform training 95 | random.shuffle(train) 96 | train_words, train_loss = 0, 0.0 97 | start = time.time() 98 | for sent_id, sent in enumerate(train): 99 | my_loss = calc_sent_loss(sent, dropout=0.2) 100 | train_loss += my_loss.value() 101 | train_words += len(sent) 102 | my_loss.backward() 103 | trainer.update() 104 | if (sent_id+1) % 5000 == 0: 105 | print("--finished %r sentences" % (sent_id+1)) 106 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 107 | # Evaluate on dev set 108 | dev_words, dev_loss = 0, 0.0 109 | start = time.time() 110 | for sent_id, sent in enumerate(dev): 111 | my_loss = calc_sent_loss(sent) 112 | dev_loss += my_loss.value() 113 | dev_words += len(sent) 114 | trainer.update() 115 | # CHANGE 3: Keep track of the development 
accuracy and reduce the learning rate if it got worse 116 | if last_dev < dev_loss: 117 | trainer.learning_rate /= 2 118 | last_dev = dev_loss 119 | # CHANGE 4: Keep track of the best development accuracy, and save the model only if it's the best one 120 | if best_dev > dev_loss: 121 | model.save("model.txt") 122 | best_dev = dev_loss 123 | # Save the model 124 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 125 | # Generate a few sentences 126 | for _ in range(5): 127 | sent = generate_sent() 128 | print(" ".join([i2w[x] for x in sent])) 129 | -------------------------------------------------------------------------------- /02-lm/nn-lm.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N = 2 # The length of the n-gram 9 | EMB_SIZE = 128 # The size of the embedding 10 | HID_SIZE = 128 # The size of the hidden layer 11 | 12 | # Functions to read in the corpus 13 | # NOTE: We are using data from the Penn Treebank, which is already converted 14 | # into an easy-to-use format with "" symbols. If we were using other 15 | # data we would have to do pre-processing and consider how to choose 16 | # unknown words, etc. 17 | w2i = defaultdict(lambda: len(w2i)) 18 | S = w2i[""] 19 | UNK = w2i[""] 20 | def read_dataset(filename): 21 | with open(filename, "r") as f: 22 | for line in f: 23 | yield [w2i[x] for x in line.strip().split(" ")] 24 | 25 | # Read in the data 26 | train = list(read_dataset("../data/ptb/train.txt")) 27 | w2i = defaultdict(lambda: UNK, w2i) 28 | dev = list(read_dataset("../data/ptb/valid.txt")) 29 | i2w = {v: k for k, v in w2i.items()} 30 | nwords = len(w2i) 31 | 32 | # Start DyNet and define trainer 33 | model = dy.Model() 34 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 35 | 36 | # Define the model 37 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 38 | W_h_p = model.add_parameters((HID_SIZE, EMB_SIZE * N)) # Weights of the softmax 39 | b_h_p = model.add_parameters((HID_SIZE)) # Weights of the softmax 40 | W_sm_p = model.add_parameters((nwords, HID_SIZE)) # Weights of the softmax 41 | b_sm_p = model.add_parameters((nwords)) # Softmax bias 42 | 43 | # A function to calculate scores for one value 44 | def calc_score_of_history(words): 45 | # Lookup the embeddings and concatenate them 46 | emb = dy.concatenate([W_emb[x] for x in words]) 47 | # Create the hidden layer 48 | W_h = dy.parameter(W_h_p) 49 | b_h = dy.parameter(b_h_p) 50 | h = dy.tanh(dy.affine_transform([b_h, W_h, emb])) 51 | # Calculate the score and return 52 | W_sm = dy.parameter(W_sm_p) 53 | b_sm = dy.parameter(b_sm_p) 54 | return dy.affine_transform([b_sm, W_sm, h]) 55 | 56 | # Calculate the loss value for the entire sentence 57 | def calc_sent_loss(sent): 58 | # Create a computation graph 59 | dy.renew_cg() 60 | # The initial history is equal to end of sentence symbols 61 | hist = [S] * N 62 | # Step through the sentence, including the end of sentence token 63 | all_losses = [] 64 | for next_word in sent + [S]: 65 | s = calc_score_of_history(hist) 66 | all_losses.append(dy.pickneglogsoftmax(s, next_word)) 67 | hist = hist[1:] + [next_word] 68 | return dy.esum(all_losses) 69 | 70 | MAX_LEN = 100 71 | # Generate a sentence 72 | def generate_sent(): 73 | dy.renew_cg() 74 | hist = [S] * N 75 | sent = 
[] 76 | while True: 77 | p = dy.softmax(calc_score_of_history(hist)).npvalue() 78 | next_word = np.random.choice(nwords, p=p/p.sum()) 79 | if next_word == S or len(sent) == MAX_LEN: 80 | break 81 | sent.append(next_word) 82 | hist = hist[1:] + [next_word] 83 | return sent 84 | 85 | for ITER in range(100): 86 | # Perform training 87 | random.shuffle(train) 88 | train_words, train_loss = 0, 0.0 89 | start = time.time() 90 | for sent_id, sent in enumerate(train): 91 | my_loss = calc_sent_loss(sent) 92 | train_loss += my_loss.value() 93 | train_words += len(sent) 94 | my_loss.backward() 95 | trainer.update() 96 | if (sent_id+1) % 5000 == 0: 97 | print("--finished %r sentences" % (sent_id+1)) 98 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 99 | # Evaluate on dev set 100 | dev_words, dev_loss = 0, 0.0 101 | start = time.time() 102 | for sent_id, sent in enumerate(dev): 103 | my_loss = calc_sent_loss(sent) 104 | dev_loss += my_loss.value() 105 | dev_words += len(sent) 106 | trainer.update() 107 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 108 | # Generate a few sentences 109 | for _ in range(5): 110 | sent = generate_sent() 111 | print(" ".join([i2w[x] for x in sent])) 112 | -------------------------------------------------------------------------------- /03-wordemb/kwic.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # Usage: 4 | # kwic.py word < corpus.txt > output.tsv 5 | 6 | N = 4 7 | 8 | for line in sys.stdin: 9 | arr = [""] * N + line.strip().split() + [""] * N 10 | for i, w in enumerate(arr): 11 | if w == sys.argv[1]: 12 | print("\t".join(arr[i-N:i+N+1])) 13 | -------------------------------------------------------------------------------- /03-wordemb/tsne.py: -------------------------------------------------------------------------------- 1 | # 2 | # tsne.py 3 | # 4 | # Implementation of t-SNE in Python. The implementation was tested on Python 2.7.10, and it requires a working 5 | # installation of NumPy. The implementation comes with an example on the MNIST dataset. In order to plot the 6 | # results of this example, a working installation of matplotlib is required. 7 | # 8 | # The example can be run by executing: `ipython tsne.py` 9 | # 10 | # 11 | # Created by Laurens van der Maaten on 20-12-08. 12 | # Copyright (c) 2008 Tilburg University. All rights reserved. 
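#
# A minimal usage sketch of the tsne() function defined below, e.g. for plotting
# word embeddings (the array shapes are illustrative, not taken from this repository):
#
#     import numpy as np
#     X = np.random.randn(500, 128)            # 500 embedding vectors of dimension 128
#     Y = tsne(X, no_dims=2, initial_dims=50, perplexity=30.0)
#     # Y has shape (500, 2) and can be passed to pylab's scatter for visualization
#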
13 | 14 | import numpy as Math 15 | import pylab as Plot 16 | 17 | def Hbeta(D = Math.array([]), beta = 1.0): 18 | """Compute the perplexity and the P-row for a specific value of the precision of a Gaussian distribution.""" 19 | 20 | # Compute P-row and corresponding perplexity 21 | P = Math.exp(-D.copy() * beta) 22 | sumP = sum(P) 23 | H = Math.log(sumP) + beta * Math.sum(D * P) / sumP 24 | P = P / sumP 25 | return H, P 26 | 27 | 28 | def x2p(X = Math.array([]), tol = 1e-5, perplexity = 30.0): 29 | """Performs a binary search to get P-values in such a way that each conditional Gaussian has the same perplexity.""" 30 | 31 | # Initialize some variables 32 | print("Computing pairwise distances...") 33 | (n, d) = X.shape 34 | sum_X = Math.sum(Math.square(X), 1) 35 | D = Math.add(Math.add(-2 * Math.dot(X, X.T), sum_X).T, sum_X) 36 | P = Math.zeros((n, n)) 37 | beta = Math.ones((n, 1)) 38 | logU = Math.log(perplexity) 39 | 40 | # Loop over all datapoints 41 | for i in range(n): 42 | 43 | # Print progress 44 | if i % 500 == 0: 45 | print("Computing P-values for point ", i, " of ", n, "...") 46 | 47 | # Compute the Gaussian kernel and entropy for the current precision 48 | betamin = -Math.inf 49 | betamax = Math.inf 50 | Di = D[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))] 51 | (H, thisP) = Hbeta(Di, beta[i]) 52 | 53 | # Evaluate whether the perplexity is within tolerance 54 | Hdiff = H - logU 55 | tries = 0 56 | while Math.abs(Hdiff) > tol and tries < 50: 57 | 58 | # If not, increase or decrease precision 59 | if Hdiff > 0: 60 | betamin = beta[i].copy() 61 | if betamax == Math.inf or betamax == -Math.inf: 62 | beta[i] = beta[i] * 2 63 | else: 64 | beta[i] = (beta[i] + betamax) / 2 65 | else: 66 | betamax = beta[i].copy() 67 | if betamin == Math.inf or betamin == -Math.inf: 68 | beta[i] = beta[i] / 2 69 | else: 70 | beta[i] = (beta[i] + betamin) / 2 71 | 72 | # Recompute the values 73 | (H, thisP) = Hbeta(Di, beta[i]) 74 | Hdiff = H - logU 75 | tries = tries + 1 76 | 77 | # Set the final row of P 78 | P[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))] = thisP 79 | 80 | # Return final P-matrix 81 | print("Mean value of sigma: ", Math.mean(Math.sqrt(1 / beta))) 82 | return P 83 | 84 | 85 | def pca(X = Math.array([]), no_dims = 50): 86 | """Runs PCA on the NxD array X in order to reduce its dimensionality to no_dims dimensions.""" 87 | 88 | print("Preprocessing the data using PCA...") 89 | (n, d) = X.shape 90 | X = X - Math.tile(Math.mean(X, 0), (n, 1)) 91 | (l, M) = Math.linalg.eig(Math.dot(X.T, X)) 92 | Y = Math.dot(X, M[:,0:no_dims]) 93 | return Y 94 | 95 | 96 | def tsne(X = Math.array([]), no_dims = 2, initial_dims = 50, perplexity = 30.0): 97 | """Runs t-SNE on the dataset in the NxD array X to reduce its dimensionality to no_dims dimensions. 
98 | The syntaxis of the function is Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array.""" 99 | 100 | # Check inputs 101 | if isinstance(no_dims, float): 102 | print("Error: array X should have type float.") 103 | return -1 104 | if round(no_dims) != no_dims: 105 | print("Error: number of dimensions should be an integer.") 106 | return -1 107 | 108 | # Initialize variables 109 | X = pca(X, initial_dims).real 110 | (n, d) = X.shape 111 | max_iter = 1000 112 | initial_momentum = 0.5 113 | final_momentum = 0.8 114 | eta = 500 115 | min_gain = 0.01 116 | Y = Math.random.randn(n, no_dims) 117 | dY = Math.zeros((n, no_dims)) 118 | iY = Math.zeros((n, no_dims)) 119 | gains = Math.ones((n, no_dims)) 120 | 121 | # Compute P-values 122 | P = x2p(X, 1e-5, perplexity) 123 | P = P + Math.transpose(P) 124 | P = P / Math.sum(P) 125 | P = P * 4; # early exaggeration 126 | P = Math.maximum(P, 1e-12) 127 | 128 | # Run iterations 129 | for iter in range(max_iter): 130 | 131 | # Compute pairwise affinities 132 | sum_Y = Math.sum(Math.square(Y), 1) 133 | num = 1 / (1 + Math.add(Math.add(-2 * Math.dot(Y, Y.T), sum_Y).T, sum_Y)) 134 | num[range(n), range(n)] = 0 135 | Q = num / Math.sum(num) 136 | Q = Math.maximum(Q, 1e-12) 137 | 138 | # Compute gradient 139 | PQ = P - Q 140 | for i in range(n): 141 | dY[i,:] = Math.sum(Math.tile(PQ[:,i] * num[:,i], (no_dims, 1)).T * (Y[i,:] - Y), 0) 142 | 143 | # Perform the update 144 | if iter < 20: 145 | momentum = initial_momentum 146 | else: 147 | momentum = final_momentum 148 | gains = (gains + 0.2) * ((dY > 0) != (iY > 0)) + (gains * 0.8) * ((dY > 0) == (iY > 0)) 149 | gains[gains < min_gain] = min_gain 150 | iY = momentum * iY - eta * (gains * dY) 151 | Y = Y + iY 152 | Y = Y - Math.tile(Math.mean(Y, 0), (n, 1)) 153 | 154 | # Compute current value of cost function 155 | if (iter + 1) % 10 == 0: 156 | C = Math.sum(P * Math.log(P / Q)) 157 | print("Iteration ", (iter + 1), ": error is ", C) 158 | 159 | # Stop lying about P-values 160 | if iter == 100: 161 | P = P / 4 162 | 163 | # Return solution 164 | return Y 165 | 166 | 167 | if __name__ == "__main__": 168 | print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.") 169 | print("Running example on 2,500 MNIST digits...") 170 | X = Math.loadtxt("mnist2500_X.txt") 171 | labels = Math.loadtxt("mnist2500_labels.txt") 172 | Y = tsne(X, 2, 50, 20.0) 173 | Plot.scatter(Y[:,0], Y[:,1], 20, labels) 174 | Plot.show() 175 | -------------------------------------------------------------------------------- /03-wordemb/wordemb-cbow.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N=2 #length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2) 9 | EMB_SIZE = 128 # The size of the embedding 10 | 11 | embeddings_location = "embeddings.txt" #the file to write the word embeddings to 12 | labels_location = "labels.txt" #the file to write the labels to 13 | 14 | # We reuse the data reading from the language modeling class 15 | w2i = defaultdict(lambda: len(w2i)) 16 | S = w2i[""] 17 | UNK = w2i[""] 18 | def read_dataset(filename): 19 | with open(filename, "r") as f: 20 | for line in f: 21 | yield [w2i[x] for x in line.strip().split(" ")] 22 | 23 | # Read in the data 24 | train = list(read_dataset("../data/ptb/train.txt")) 25 | w2i = defaultdict(lambda: UNK, w2i) 26 | dev = 
list(read_dataset("../data/ptb/valid.txt")) 27 | i2w = {v: k for k, v in w2i.items()} 28 | nwords = len(w2i) 29 | 30 | with open(labels_location, 'w') as labels_file: 31 | for i in range(nwords): 32 | labels_file.write(i2w[i] + '\n') 33 | 34 | # Start DyNet and define trainer 35 | model = dy.Model() 36 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 37 | 38 | # Define the model 39 | W_c_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 40 | W_w_p = model.add_parameters((nwords, EMB_SIZE)) # Weights of the softmax 41 | 42 | # Calculate the loss value for the entire sentence 43 | def calc_sent_loss(sent): 44 | # Create a computation graph 45 | dy.renew_cg() 46 | 47 | #add padding to the sentence equal to the size of the window 48 | #as we need to predict the eos as well, the future window at that point is N past it 49 | padded_sent = [S] * N + sent + [S] * N 50 | padded_emb = [W_c_p[x] for x in padded_sent] 51 | 52 | W_w = dy.parameter(W_w_p) 53 | 54 | # Step through the sentence 55 | all_losses = [] 56 | for i in range(N,len(sent)+N): 57 | c = dy.esum(padded_emb[i-N:i] + padded_emb[i+1:i+N+1]) 58 | s = W_w * c 59 | all_losses.append(dy.pickneglogsoftmax(s, padded_sent[i])) 60 | return dy.esum(all_losses) 61 | 62 | MAX_LEN = 100 63 | 64 | for ITER in range(100): 65 | print("started iter %r" % ITER) 66 | # Perform training 67 | random.shuffle(train) 68 | train_words, train_loss = 0, 0.0 69 | start = time.time() 70 | for sent_id, sent in enumerate(train): 71 | my_loss = calc_sent_loss(sent) 72 | train_loss += my_loss.value() 73 | train_words += len(sent) 74 | my_loss.backward() 75 | trainer.update() 76 | if (sent_id+1) % 5000 == 0: 77 | print("--finished %r sentences" % (sent_id+1)) 78 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 79 | # Evaluate on dev set 80 | dev_words, dev_loss = 0, 0.0 81 | start = time.time() 82 | for sent_id, sent in enumerate(dev): 83 | my_loss = calc_sent_loss(sent) 84 | dev_loss += my_loss.value() 85 | dev_words += len(sent) 86 | trainer.update() 87 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 88 | 89 | print("saving embedding files") 90 | with open(embeddings_location, 'w') as embeddings_file: 91 | W_w_np = W_w_p.as_array() 92 | for i in range(nwords): 93 | ith_embedding = '\t'.join(map(str, W_w_np[i])) 94 | embeddings_file.write(ith_embedding + '\n') 95 | -------------------------------------------------------------------------------- /03-wordemb/wordemb-skip.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | N=2 #length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2) 9 | EMB_SIZE = 128 # The size of the embedding 10 | 11 | embeddings_location = "embeddings.txt" #the file to write the word embeddings to 12 | labels_location = "labels.txt" #the file to write the labels to 13 | 14 | # We reuse the data reading from the language modeling class 15 | w2i = defaultdict(lambda: len(w2i)) 16 | S = w2i[""] 17 | UNK = w2i[""] 18 | def read_dataset(filename): 19 | with open(filename, "r") as f: 20 | for line in f: 21 | yield [w2i[x] for x in line.strip().split(" ")] 22 | 23 | # Read in the data 24 | train = 
list(read_dataset("../data/ptb/train.txt")) 25 | w2i = defaultdict(lambda: UNK, w2i) 26 | dev = list(read_dataset("../data/ptb/valid.txt")) 27 | i2w = {v: k for k, v in w2i.items()} 28 | nwords = len(w2i) 29 | 30 | with open(labels_location, 'w') as labels_file: 31 | for i in range(nwords): 32 | labels_file.write(i2w[i] + '\n') 33 | 34 | # Start DyNet and define trainer 35 | model = dy.Model() 36 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 37 | 38 | # Define the model 39 | W_c_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights at each position 40 | W_w_p = model.add_parameters((nwords, EMB_SIZE)) # Weights of the softmax 41 | 42 | # Calculate the loss value for the entire sentence 43 | def calc_sent_loss(sent): 44 | # Create a computation graph 45 | dy.renew_cg() 46 | 47 | #add padding to the sentence equal to the size of the window 48 | #as we need to predict the eos as well, the future window at that point is N past it 49 | emb = [W_c_p[x] for x in sent] 50 | 51 | W_w = dy.parameter(W_w_p) 52 | 53 | # Step through the sentence 54 | all_losses = [] 55 | for i, my_emb in enumerate(emb): 56 | s = W_w * my_emb 57 | lp = dy.log_softmax(s) 58 | for j in range(1,N+1): 59 | all_losses.append(dy.pick(lp, sent[i-j] if i-j >= 0 else S)) 60 | all_losses.append(dy.pick(lp, sent[i+j] if i+j < len(sent) else S)) 61 | return dy.esum(all_losses) 62 | 63 | MAX_LEN = 100 64 | 65 | for ITER in range(100): 66 | print("started iter %r" % ITER) 67 | # Perform training 68 | random.shuffle(train) 69 | train_words, train_loss = 0, 0.0 70 | start = time.time() 71 | for sent_id, sent in enumerate(train): 72 | my_loss = calc_sent_loss(sent) 73 | train_loss += my_loss.value() 74 | train_words += len(sent) 75 | my_loss.backward() 76 | trainer.update() 77 | if (sent_id+1) % 5000 == 0: 78 | print("--finished %r sentences" % (sent_id+1)) 79 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 80 | # Evaluate on dev set 81 | dev_words, dev_loss = 0, 0.0 82 | start = time.time() 83 | for sent_id, sent in enumerate(dev): 84 | my_loss = calc_sent_loss(sent) 85 | dev_loss += my_loss.value() 86 | dev_words += len(sent) 87 | trainer.update() 88 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 89 | 90 | print("saving embedding files") 91 | with open(embeddings_location, 'w') as embeddings_file: 92 | W_w_np = W_w_p.as_array() 93 | for i in range(nwords): 94 | ith_embedding = '\t'.join(map(str, W_w_np[i])) 95 | embeddings_file.write(ith_embedding + '\n') 96 | -------------------------------------------------------------------------------- /03-wordemb/wordemb-vis-tsne.py: -------------------------------------------------------------------------------- 1 | # This visualizer is based off of 2 | # http://nlp.yvespeirsman.be/blog/visualizing-word-embeddings-with-tsne/ 3 | 4 | import pylab as Plot 5 | import numpy as np 6 | import argparse 7 | from tsne import tsne # from http://lvdmaaten.github.io/tsne/ 8 | import pdb 9 | 10 | parser = argparse.ArgumentParser(description='Visualize word embeddings using TSNE') 11 | parser.add_argument('vector_file', type=str, help='location of the word vector file') 12 | parser.add_argument('label_file', type=str, help='location of the word vector file') 13 | parser.add_argument('--target_words', dest='target_words', type=str, default=None, help='a list of words to display (if none, shows 
1000 random words') 14 | 15 | args = parser.parse_args() 16 | 17 | #read the datafile, with the option for a seperate labels file 18 | def read_data(vector_file_path, labels_file_path=None): 19 | vocab = [] 20 | word_vectors = [] 21 | 22 | with open(labels_file_path) as sample_file: 23 | for line in sample_file: 24 | vocab.append(line.strip()) 25 | with open(vector_file_path) as vector_file: 26 | for line in vector_file: 27 | line = line.strip() 28 | word_vector = line.split() 29 | word_vectors.append([float(i) for i in word_vector]) 30 | return np.array(word_vectors), vocab 31 | 32 | def display_data(word_vectors, words, target_words=None): 33 | target_matrix = word_vectors.copy() 34 | if target_words: 35 | target_words = [line.strip().lower() for line in open(target_words)][:2000] 36 | rows = [words.index(word) for word in target_words if word in words] 37 | target_matrix = target_matrix[rows,:] 38 | else: 39 | rows = np.random.choice(len(word_vectors), size=1000, replace=False) 40 | target_matrix = target_matrix[rows,:] 41 | reduced_matrix = tsne(target_matrix, 2); 42 | 43 | Plot.figure(figsize=(200, 200), dpi=100) 44 | max_x = np.amax(reduced_matrix, axis=0)[0] 45 | max_y = np.amax(reduced_matrix, axis=0)[1] 46 | Plot.xlim((-max_x,max_x)) 47 | Plot.ylim((-max_y,max_y)) 48 | 49 | Plot.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], 20); 50 | 51 | for row_id in range(0, len(rows)): 52 | target_word = words[rows[row_id]] 53 | x = reduced_matrix[row_id, 0] 54 | y = reduced_matrix[row_id, 1] 55 | Plot.annotate(target_word, (x,y)) 56 | Plot.savefig("word_vectors.png"); 57 | 58 | if __name__ == "__main__": 59 | X, labels = read_data(args.vector_file, args.label_file) 60 | display_data(X, labels, args.target_words) 61 | 62 | -------------------------------------------------------------------------------- /04-efficiency/slow-impl.py: -------------------------------------------------------------------------------- 1 | import dynet as dy 2 | import numpy as np 3 | 4 | # This implementation will be unnecessarily slow, especially on the GPU. 5 | # It can be improved by following the speed tricks covered in class: 6 | # 1) Don't repeat operations. 7 | # 2) Minimize the number of operations. 8 | # 3) Minimize the number of CPU-GPU memory copies, make them earlier. 
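#
# A rough sketch of what tricks 1-3 buy in this particular file (a hypothetical fast_total
# helper, assuming the same W_p, x_vecs and y_vecs defined below): copy the data to the
# device once, outside the loops, and replace the 100 separate dot products with a single
# matrix product, since sum_ij y_j^T (W x_i) is just the sum of all entries of Y^T (W X).
#
# def fast_total(W, x_vecs, y_vecs):
#     X = dy.inputTensor(np.stack(x_vecs, axis=1))    # (100, 10): one host-to-device copy
#     Y = dy.inputTensor(np.stack(y_vecs, axis=1))    # instead of one copy per pair
#     return dy.sum_elems(dy.transpose(Y) * (W * X))  # one matmul instead of 100 dot products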
9 | 10 | # Create the model 11 | model = dy.ParameterCollection() 12 | trainer = dy.SimpleSGDTrainer(model) 13 | W_p = model.add_parameters((100,100)) 14 | 15 | # Create the "training data" 16 | x_vecs = [] 17 | y_vecs = [] 18 | for i in range(10): 19 | x_vecs.append(np.random.rand(100)) 20 | y_vecs.append(np.random.rand(100)) 21 | 22 | # Do the processing 23 | for my_iter in range(1000): 24 | dy.renew_cg() 25 | W = dy.parameter(W_p) 26 | total = 0 27 | for x in x_vecs: 28 | for y in y_vecs: 29 | x_exp = dy.inputTensor(x) 30 | y_exp = dy.inputTensor(y) 31 | total = total + dy.dot_product(W * x_exp, y_exp) 32 | total.forward() 33 | total.backward() 34 | trainer.update() 35 | 36 | -------------------------------------------------------------------------------- /04-efficiency/wordemb-skip-binary.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | import pdb 8 | 9 | N=2 #length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2) 10 | EMB_SIZE = 128 # The size of the embedding 11 | 12 | embeddings_location = "embeddings.txt" #the file to write the word embeddings to 13 | labels_location = "labels.txt" #the file to write the labels to 14 | 15 | # We reuse the data reading from the language modeling class 16 | w2i = defaultdict(lambda: len(w2i)) 17 | 18 | #word counts for negative sampling 19 | word_counts = defaultdict(int) 20 | 21 | S = w2i[""] 22 | UNK = w2i[""] 23 | def read_dataset(filename): 24 | with open(filename, "r") as f: 25 | for line in f: 26 | line = line.strip().split(" ") 27 | for word in line: 28 | word_counts[w2i[word]] += 1 29 | yield [w2i[x] for x in line] 30 | 31 | # Read in the data 32 | train = list(read_dataset("../data/ptb/train.txt")) 33 | w2i = defaultdict(lambda: UNK, w2i) 34 | dev = list(read_dataset("../data/ptb/valid.txt")) 35 | i2w = {v: k for k, v in w2i.items()} 36 | nwords = len(w2i) 37 | nbits = len(np.binary_repr(nwords-1)) 38 | 39 | with open(labels_location, 'w') as labels_file: 40 | for i in range(nwords): 41 | labels_file.write(i2w[i] + '\n') 42 | 43 | # Start DyNet and define trainer 44 | model = dy.Model() 45 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 46 | 47 | # Define the model 48 | W_w_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights 49 | W_c_p = model.add_parameters((nbits, EMB_SIZE)) # Binary prediction weights 50 | 51 | # Calculate the loss value for the entire sentence 52 | def calc_sent_loss(sent): 53 | # Create a computation graph 54 | dy.renew_cg() 55 | 56 | W_c = dy.parameter(W_c_p) 57 | 58 | # Get embeddings for the sentence 59 | emb = [W_w_p[x] for x in sent] 60 | 61 | # Step through the sentence and calculate binary prediction losses 62 | all_losses = [] 63 | for i, my_emb in enumerate(emb): 64 | scores = dy.logistic(W_c * my_emb) 65 | pos_words = ([sent[x] if x >= 0 else S for x in range(i-N,i)] + 66 | [sent[x] if x < len(sent) else S for x in range(i+1,i+N+1)]) 67 | word_repr = [[float(y) for y in np.binary_repr(x).zfill(nbits)] for x in pos_words] 68 | word_repr = [dy.inputVector(x) for x in word_repr] 69 | all_losses.extend([dy.binary_log_loss(scores, x) for x in word_repr]) 70 | return dy.esum(all_losses) 71 | 72 | MAX_LEN = 100 73 | 74 | for ITER in range(100): 75 | print("started iter %r" % ITER) 76 | # Perform training 77 | random.shuffle(train) 78 | train_words, train_loss = 0, 0.0 79 | start = 
time.time() 80 | for sent_id, sent in enumerate(train): 81 | my_loss = calc_sent_loss(sent) 82 | train_loss += my_loss.value() 83 | train_words += len(sent) 84 | my_loss.backward() 85 | trainer.update() 86 | if (sent_id+1) % 5000 == 0: 87 | print("--finished %r sentences" % (sent_id+1)) 88 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 89 | # Evaluate on dev set 90 | dev_words, dev_loss = 0, 0.0 91 | start = time.time() 92 | for sent_id, sent in enumerate(dev): 93 | my_loss = calc_sent_loss(sent) 94 | dev_loss += my_loss.value() 95 | dev_words += len(sent) 96 | trainer.update() 97 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 98 | 99 | print("saving embedding files") 100 | with open(embeddings_location, 'w') as embeddings_file: 101 | W_w_np = W_w_p.as_array() 102 | for i in range(nwords): 103 | ith_embedding = '\t'.join(map(str, W_w_np[i])) 104 | embeddings_file.write(ith_embedding + '\n') 105 | -------------------------------------------------------------------------------- /04-efficiency/wordemb-skip-ns.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import math 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | import pdb 8 | 9 | K=3 #number of negative samples 10 | N=2 #length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2) 11 | EMB_SIZE = 128 # The size of the embedding 12 | 13 | embeddings_location = "embeddings.txt" #the file to write the word embeddings to 14 | labels_location = "labels.txt" #the file to write the labels to 15 | 16 | # We reuse the data reading from the language modeling class 17 | w2i = defaultdict(lambda: len(w2i)) 18 | 19 | #word counts for negative sampling 20 | word_counts = defaultdict(int) 21 | 22 | S = w2i[""] 23 | UNK = w2i[""] 24 | def read_dataset(filename): 25 | with open(filename, "r") as f: 26 | for line in f: 27 | line = line.strip().split(" ") 28 | for word in line: 29 | word_counts[w2i[word]] += 1 30 | yield [w2i[x] for x in line] 31 | 32 | 33 | # Read in the data 34 | train = list(read_dataset("../data/ptb/train.txt")) 35 | w2i = defaultdict(lambda: UNK, w2i) 36 | dev = list(read_dataset("../data/ptb/valid.txt")) 37 | i2w = {v: k for k, v in w2i.items()} 38 | nwords = len(w2i) 39 | 40 | 41 | # take the word counts to the 3/4, normalize 42 | counts = np.array([list(x) for x in word_counts.items()])[:,1]**.75 43 | normalizing_constant = sum(counts) 44 | word_probabilities = np.zeros(nwords) 45 | for word_id in word_counts: 46 | word_probabilities[word_id] = word_counts[word_id]**.75/normalizing_constant 47 | 48 | with open(labels_location, 'w') as labels_file: 49 | for i in range(nwords): 50 | labels_file.write(i2w[i] + '\n') 51 | 52 | # Start DyNet and define trainer 53 | model = dy.Model() 54 | trainer = dy.SimpleSGDTrainer(model, learning_rate=0.1) 55 | 56 | # Define the model 57 | W_c_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Context word weights 58 | W_w_p = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word weights 59 | 60 | # Calculate the loss value for the entire sentence 61 | def calc_sent_loss(sent): 62 | # Create a computation graph 63 | dy.renew_cg() 64 | 65 | # Get embeddings for the sentence 66 | emb = [W_w_p[x] for x in sent] 67 | 68 | # Sample K negative words for each 
predicted word at each position 69 | all_neg_words = np.random.choice(nwords, size=2*N*K*len(emb), replace=True, p=word_probabilities) 70 | 71 | # W_w = dy.parameter(W_w_p) 72 | # Step through the sentence and calculate the negative and positive losses 73 | all_losses = [] 74 | for i, my_emb in enumerate(emb): 75 | neg_words = all_neg_words[i*K*2*N:(i+1)*K*2*N] 76 | pos_words = ([sent[x] if x >= 0 else S for x in range(i-N,i)] + 77 | [sent[x] if x < len(sent) else S for x in range(i+1,i+N+1)]) 78 | neg_loss = -dy.log(dy.logistic(-dy.dot_product(my_emb, dy.lookup_batch(W_c_p, neg_words)))) 79 | pos_loss = -dy.log(dy.logistic(dy.dot_product(my_emb, dy.lookup_batch(W_c_p, pos_words)))) 80 | all_losses.append(dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss)) 81 | return dy.esum(all_losses) 82 | 83 | MAX_LEN = 100 84 | 85 | for ITER in range(100): 86 | print("started iter %r" % ITER) 87 | # Perform training 88 | random.shuffle(train) 89 | train_words, train_loss = 0, 0.0 90 | start = time.time() 91 | for sent_id, sent in enumerate(train): 92 | my_loss = calc_sent_loss(sent) 93 | train_loss += my_loss.value() 94 | train_words += len(sent) 95 | my_loss.backward() 96 | trainer.update() 97 | if (sent_id+1) % 5000 == 0: 98 | print("--finished %r sentences" % (sent_id+1)) 99 | print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start)) 100 | # Evaluate on dev set 101 | dev_words, dev_loss = 0, 0.0 102 | start = time.time() 103 | for sent_id, sent in enumerate(dev): 104 | my_loss = calc_sent_loss(sent) 105 | dev_loss += my_loss.value() 106 | dev_words += len(sent) 107 | trainer.update() 108 | print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start)) 109 | 110 | print("saving embedding files") 111 | with open(embeddings_location, 'w') as embeddings_file: 112 | W_w_np = W_w_p.as_array() 113 | for i in range(nwords): 114 | ith_embedding = '\t'.join(map(str, W_w_np[i])) 115 | embeddings_file.write(ith_embedding + '\n') 116 | -------------------------------------------------------------------------------- /05-cnn/cnn-activation.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | np.set_printoptions(linewidth=np.nan, threshold=np.nan) 8 | 9 | # Functions to read in the corpus 10 | w2i = defaultdict(lambda: len(w2i)) 11 | UNK = w2i[""] 12 | def read_dataset(filename): 13 | with open(filename, "r") as f: 14 | for line in f: 15 | tag, words = line.lower().strip().split(" ||| ") 16 | words = words.split(" ") 17 | yield (words, [w2i[x] for x in words], int(tag)) 18 | 19 | # Read in the data 20 | train = list(read_dataset("../data/classes/train.txt")) 21 | w2i = defaultdict(lambda: UNK, w2i) 22 | dev = list(read_dataset("../data/classes/test.txt")) 23 | nwords = len(w2i) 24 | ntags = 5 25 | 26 | # Start DyNet and define trainer 27 | model = dy.Model() 28 | trainer = dy.AdamTrainer(model) 29 | 30 | # Define the model 31 | EMB_SIZE = 10 32 | W_emb = model.add_lookup_parameters((nwords, 1, 1, EMB_SIZE)) # Word embeddings 33 | WIN_SIZE = 3 34 | FILTER_SIZE = 8 35 | W_cnn = model.add_parameters((1, WIN_SIZE, EMB_SIZE, FILTER_SIZE)) # cnn weights 36 | b_cnn = model.add_parameters((FILTER_SIZE)) # cnn bias 37 | 38 | W_sm = model.add_parameters((ntags, FILTER_SIZE)) # Softmax weights 39 | b_sm = 
model.add_parameters((ntags)) # Softmax bias 40 | 41 | def calc_scores(wids): 42 | dy.renew_cg() 43 | W_cnn_express = dy.parameter(W_cnn) 44 | b_cnn_express = dy.parameter(b_cnn) 45 | W_sm_express = dy.parameter(W_sm) 46 | b_sm_express = dy.parameter(b_sm) 47 | if len(wids) < WIN_SIZE: 48 | wids += [0] * (WIN_SIZE-len(wids)) 49 | 50 | cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1) 51 | cnn_out = dy.conv2d_bias(cnn_in, W_cnn_express, b_cnn_express, stride=(1, 1), is_valid=False) 52 | pool_out = dy.max_dim(cnn_out, d=1) 53 | pool_out = dy.reshape(pool_out, (FILTER_SIZE,)) 54 | pool_out = dy.rectify(pool_out) 55 | return W_sm_express * pool_out + b_sm_express 56 | 57 | def calc_predict_and_activations(wids, tag, words): 58 | dy.renew_cg() 59 | W_cnn_express = dy.parameter(W_cnn) 60 | b_cnn_express = dy.parameter(b_cnn) 61 | W_sm_express = dy.parameter(W_sm) 62 | b_sm_express = dy.parameter(b_sm) 63 | if len(wids) < WIN_SIZE: 64 | wids += [0] * (WIN_SIZE-len(wids)) 65 | 66 | cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1) 67 | cnn_out = dy.conv2d_bias(cnn_in, W_cnn_express, b_cnn_express, stride=(1, 1), is_valid=False) 68 | filters = (dy.reshape(cnn_out, (len(wids), FILTER_SIZE))).npvalue() 69 | activations = filters.argmax(axis=0) 70 | 71 | pool_out = dy.max_dim(cnn_out, d=1) 72 | pool_out = dy.reshape(pool_out, (FILTER_SIZE,)) 73 | pool_out = dy.rectify(pool_out) 74 | 75 | scores = (W_sm_express * pool_out + b_sm_express).npvalue() 76 | print '%d ||| %s' % (tag, ' '.join(words)) 77 | predict = np.argmax(scores) 78 | print display_activations(words, activations) 79 | print 'scores=%s, predict: %d' % (scores, predict) 80 | features = pool_out.npvalue() 81 | W = W_sm_express.npvalue() 82 | bias = b_sm_express.npvalue() 83 | print ' bias=%s' % bias 84 | contributions = W * features 85 | print ' very bad (%.4f): %s' % (scores[0], contributions[0]) 86 | print ' bad (%.4f): %s' % (scores[1], contributions[1]) 87 | print ' neutral (%.4f): %s' % (scores[2], contributions[2]) 88 | print ' good (%.4f): %s' % (scores[3], contributions[3]) 89 | print 'very good (%.4f): %s' % (scores[4], contributions[4]) 90 | 91 | 92 | def display_activations(words, activations): 93 | pad_begin = (WIN_SIZE - 1) / 2 94 | pad_end = WIN_SIZE - 1 - pad_begin 95 | words_padded = ['pad' for i in range(pad_begin)] + words + ['pad' for i in range(pad_end)] 96 | 97 | ngrams = [] 98 | for act in activations: 99 | ngrams.append('[' + ', '.join(words_padded[act:act+WIN_SIZE]) + ']') 100 | 101 | return ngrams 102 | 103 | for ITER in range(10): 104 | # Perform training 105 | random.shuffle(train) 106 | train_loss = 0.0 107 | train_correct = 0.0 108 | start = time.time() 109 | for _, wids, tag in train: 110 | scores = calc_scores(wids) 111 | predict = np.argmax(scores.npvalue()) 112 | if predict == tag: 113 | train_correct += 1 114 | 115 | my_loss = dy.pickneglogsoftmax(scores, tag) 116 | train_loss += my_loss.value() 117 | my_loss.backward() 118 | trainer.update() 119 | print("iter %r: train loss/sent=%.4f, acc=%.4f, time=%.2fs" % (ITER, train_loss/len(train), train_correct/len(train), time.time()-start)) 120 | # Perform testing 121 | test_correct = 0.0 122 | for _, wids, tag in dev: 123 | scores = calc_scores(wids).npvalue() 124 | predict = np.argmax(scores) 125 | if predict == tag: 126 | test_correct += 1 127 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 128 | 129 | 130 | for words, wids, tag in dev: 131 | calc_predict_and_activations(wids, tag, words) 132 | raw_input() 
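The per-class breakdown printed by calc_predict_and_activations above falls directly out of the final affine layer: each class score is the softmax bias plus the row-sum of the contributions array, i.e. scores[c] = bias[c] + sum_k W[c][k] * features[k], so each filter's share of each class score can be read off one row at a time. A minimal numpy check of that identity, using hypothetical values with the same shapes (FILTER_SIZE=8, ntags=5):

import numpy as np

W = np.random.rand(5, 8)       # softmax weights (ntags x FILTER_SIZE)
bias = np.random.rand(5)       # softmax bias
features = np.random.rand(8)   # max-pooled, rectified filter activations
contributions = W * features   # contributions[c, k] = W[c, k] * features[k]
assert np.allclose(W @ features + bias, bias + contributions.sum(axis=1))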
-------------------------------------------------------------------------------- /05-cnn/cnn-class.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | def read_dataset(filename): 12 | with open(filename, "r") as f: 13 | for line in f: 14 | tag, words = line.lower().strip().split(" ||| ") 15 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 16 | 17 | # Read in the data 18 | train = list(read_dataset("../data/classes/train.txt")) 19 | w2i = defaultdict(lambda: UNK, w2i) 20 | dev = list(read_dataset("../data/classes/test.txt")) 21 | nwords = len(w2i) 22 | ntags = len(t2i) 23 | 24 | # Start DyNet and define trainer 25 | model = dy.Model() 26 | trainer = dy.AdamTrainer(model) 27 | 28 | # Define the model 29 | EMB_SIZE = 64 30 | W_emb = model.add_lookup_parameters((nwords, 1, 1, EMB_SIZE)) # Word embeddings 31 | WIN_SIZE = 3 32 | FILTER_SIZE = 64 33 | W_cnn = model.add_parameters((1, WIN_SIZE, EMB_SIZE, FILTER_SIZE)) # cnn weights 34 | b_cnn = model.add_parameters((FILTER_SIZE)) # cnn bias 35 | 36 | W_sm = model.add_parameters((ntags, FILTER_SIZE)) # Softmax weights 37 | b_sm = model.add_parameters((ntags)) # Softmax bias 38 | 39 | def calc_scores(words): 40 | dy.renew_cg() 41 | W_cnn_express = dy.parameter(W_cnn) 42 | b_cnn_express = dy.parameter(b_cnn) 43 | W_sm_express = dy.parameter(W_sm) 44 | b_sm_express = dy.parameter(b_sm) 45 | if len(words) < WIN_SIZE: 46 | words += [0] * (WIN_SIZE-len(words)) 47 | 48 | cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in words], d=1) 49 | cnn_out = dy.conv2d_bias(cnn_in, W_cnn_express, b_cnn_express, stride=(1, 1), is_valid=False) 50 | pool_out = dy.max_dim(cnn_out, d=1) 51 | pool_out = dy.reshape(pool_out, (FILTER_SIZE,)) 52 | pool_out = dy.rectify(pool_out) 53 | return W_sm_express * pool_out + b_sm_express 54 | 55 | for ITER in range(100): 56 | # Perform training 57 | random.shuffle(train) 58 | train_loss = 0.0 59 | train_correct = 0.0 60 | start = time.time() 61 | for words, tag in train: 62 | scores = calc_scores(words) 63 | predict = np.argmax(scores.npvalue()) 64 | if predict == tag: 65 | train_correct += 1 66 | 67 | my_loss = dy.pickneglogsoftmax(scores, tag) 68 | train_loss += my_loss.value() 69 | my_loss.backward() 70 | trainer.update() 71 | print("iter %r: train loss/sent=%.4f, acc=%.4f, time=%.2fs" % (ITER, train_loss/len(train), train_correct/len(train), time.time()-start)) 72 | # Perform testing 73 | test_correct = 0.0 74 | for words, tag in dev: 75 | scores = calc_scores(words).npvalue() 76 | predict = np.argmax(scores) 77 | if predict == tag: 78 | test_correct += 1 79 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 80 | 81 | -------------------------------------------------------------------------------- /06-rnn/lm-lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | 4 | start = time.time() 5 | 6 | from collections import Counter, defaultdict 7 | import random 8 | import math 9 | import sys 10 | import argparse 11 | 12 | import dynet as dy 13 | import numpy as np 14 | 15 | # format of files: each line is "word1 word2 ..." 
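#
# Overview: a single-layer word-level LSTM language model. Each sentence is fed through the
# LSTM starting from the start symbol S; at every step the current hidden state is projected
# through an affine softmax over the whole vocabulary, and the loss is the summed per-word
# negative log-likelihood (reported below as nll/perplexity on the validation data).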
16 | train_file = "../data/ptb/train.txt" 17 | test_file = "../data/ptb/valid.txt" 18 | 19 | w2i = defaultdict(lambda: len(w2i)) 20 | 21 | 22 | def read(fname): 23 | """ 24 | Read a file where each line is of the form "word1 word2 ..." 25 | Yields lists of the form [word1, word2, ...] 26 | """ 27 | with open(fname, "r") as fh: 28 | for line in fh: 29 | sent = [w2i[x] for x in line.strip().split()] 30 | sent.append(w2i[""]) 31 | yield sent 32 | 33 | 34 | train = list(read(train_file)) 35 | nwords = len(w2i) 36 | test = list(read(test_file)) 37 | S = w2i[""] 38 | assert (nwords == len(w2i)) 39 | 40 | # DyNet Starts 41 | model = dy.Model() 42 | trainer = dy.AdamTrainer(model) 43 | 44 | # Lookup parameters for word embeddings 45 | EMBED_SIZE = 64 46 | HIDDEN_SIZE = 128 47 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, EMBED_SIZE)) 48 | 49 | # Word-level LSTM (layers=1, input=64, output=128, model) 50 | RNN = dy.LSTMBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model) 51 | 52 | # Softmax weights/biases on top of LSTM outputs 53 | W_sm = model.add_parameters((nwords, HIDDEN_SIZE)) 54 | b_sm = model.add_parameters(nwords) 55 | 56 | 57 | # Build the language model graph 58 | def calc_lm_loss(sent): 59 | dy.renew_cg() 60 | # parameters -> expressions 61 | W_exp = dy.parameter(W_sm) 62 | b_exp = dy.parameter(b_sm) 63 | 64 | # initialize the RNN 65 | f_init = RNN.initial_state() 66 | 67 | # get the wids and masks for each step 68 | tot_words = len(sent) 69 | 70 | # start the rnn by inputting "" 71 | s = f_init.add_input(WORDS_LOOKUP[S]) 72 | 73 | # feed word vectors into the RNN and predict the next word 74 | losses = [] 75 | for wid in sent: 76 | # calculate the softmax and loss 77 | score = W_exp * s.output() + b_exp 78 | loss = dy.pickneglogsoftmax(score, wid) 79 | losses.append(loss) 80 | # update the state of the RNN 81 | wemb = WORDS_LOOKUP[wid] 82 | s = s.add_input(wemb) 83 | 84 | return dy.esum(losses), tot_words 85 | 86 | 87 | # Sort training sentences in descending order and count minibatches 88 | train_order = range(len(train)) 89 | 90 | print("startup time: %r" % (time.time() - start)) 91 | # Perform training 92 | start = time.time() 93 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 94 | for ITER in range(100): 95 | random.shuffle(train_order) 96 | for sid in train_order: 97 | i += 1 98 | if i % int(500) == 0: 99 | trainer.status() 100 | print(this_loss / this_words, file=sys.stderr) 101 | all_tagged += this_words 102 | this_loss = this_words = 0 103 | all_time = time.time() - start 104 | if i % int(10000) == 0: 105 | dev_start = time.time() 106 | dev_loss = dev_words = 0 107 | for sent in test: 108 | loss_exp, mb_words = calc_lm_loss(sent) 109 | dev_loss += loss_exp.scalar_value() 110 | dev_words += mb_words 111 | dev_time += time.time() - dev_start 112 | train_time = time.time() - start - dev_time 113 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % ( 114 | dev_loss / dev_words, math.exp(dev_loss / dev_words), dev_words, train_time, all_tagged / train_time)) 115 | # train on the minibatch 116 | loss_exp, mb_words = calc_lm_loss(train[sid]) 117 | this_loss += loss_exp.scalar_value() 118 | this_words += mb_words 119 | loss_exp.backward() 120 | trainer.update() 121 | print("epoch %r finished" % ITER) 122 | trainer.update_epoch(1.0) 123 | -------------------------------------------------------------------------------- /06-rnn/lm-minibatch.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
print_function 2 | import time 3 | 4 | start = time.time() 5 | 6 | from collections import Counter, defaultdict 7 | import random 8 | import math 9 | import sys 10 | import argparse 11 | 12 | import dynet as dy 13 | import numpy as np 14 | 15 | # format of files: each line is "word1/tag2 word2/tag2 ..." 16 | train_file = "../data/ptb/train.txt" 17 | test_file = "../data/ptb/valid.txt" 18 | 19 | w2i = defaultdict(lambda: len(w2i)) 20 | 21 | 22 | def read(fname): 23 | """ 24 | Read a file where each line is of the form "word1 word2 ..." 25 | Yields lists of the form [word1, word2, ...] 26 | """ 27 | with open(fname, "r") as fh: 28 | for line in fh: 29 | sent = [w2i[x] for x in line.strip().split()] 30 | sent.append(w2i[""]) 31 | yield sent 32 | 33 | 34 | train = list(read(train_file)) 35 | nwords = len(w2i) 36 | test = list(read(test_file)) 37 | S = w2i[""] 38 | assert (nwords == len(w2i)) 39 | 40 | # DyNet Starts 41 | 42 | model = dy.Model() 43 | trainer = dy.AdamTrainer(model) 44 | 45 | # Lookup parameters for word embeddings 46 | EMBED_SIZE = 64 47 | HIDDEN_SIZE = 128 48 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, EMBED_SIZE)) 49 | 50 | # Word-level LSTM (layers=1, input=64, output=128, model) 51 | RNN = dy.VanillaLSTMBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model) 52 | 53 | # Softmax weights/biases on top of LSTM outputs 54 | W_sm = model.add_parameters((nwords, HIDDEN_SIZE)) 55 | b_sm = model.add_parameters(nwords) 56 | 57 | 58 | # Build the language model graph 59 | def calc_lm_loss(sents): 60 | dy.renew_cg() 61 | # parameters -> expressions 62 | W_exp = dy.parameter(W_sm) 63 | b_exp = dy.parameter(b_sm) 64 | 65 | # initialize the RNN 66 | f_init = RNN.initial_state() 67 | 68 | # get the wids and masks for each step 69 | tot_words = 0 70 | wids = [] 71 | masks = [] 72 | for i in range(len(sents[0])): 73 | wids.append([(sent[i] if len(sent) > i else S) for sent in sents]) 74 | mask = [(1 if len(sent) > i else 0) for sent in sents] 75 | masks.append(mask) 76 | tot_words += sum(mask) 77 | 78 | # start the rnn by inputting "" 79 | init_ids = [S] * len(sents) 80 | s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids)) 81 | 82 | # feed word vectors into the RNN and predict the next word 83 | losses = [] 84 | for wid, mask in zip(wids, masks): 85 | # calculate the softmax and loss 86 | score = dy.affine_transform([b_exp, W_exp, s.output()]) 87 | loss = dy.pickneglogsoftmax_batch(score, wid) 88 | # mask the loss if at least one sentence is shorter 89 | if mask[-1] != 1: 90 | mask_expr = dy.inputVector(mask) 91 | mask_expr = dy.reshape(mask_expr, (1,), len(sents)) 92 | loss = loss * mask_expr 93 | losses.append(loss) 94 | # update the state of the RNN 95 | wemb = dy.lookup_batch(WORDS_LOOKUP, wid) 96 | s = s.add_input(wemb) 97 | 98 | return dy.sum_batches(dy.esum(losses)), tot_words 99 | 100 | 101 | # Sort training sentences in descending order and count minibatches 102 | MB_SIZE = 16 103 | train.sort(key=lambda x: -len(x)) 104 | test.sort(key=lambda x: -len(x)) 105 | train_order = [x * MB_SIZE for x in range(int((len(train) - 1) / MB_SIZE + 1))] 106 | test_order = [x * MB_SIZE for x in range(int((len(test) - 1) / MB_SIZE + 1))] 107 | 108 | print("startup time: %r" % (time.time() - start)) 109 | # Perform training 110 | start = time.time() 111 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 112 | for ITER in range(100): 113 | random.shuffle(train_order) 114 | for sid in train_order: 115 | i += 1 116 | if i % int(500 / MB_SIZE) == 0: 117 | trainer.status() 118 | 
print(this_loss / this_words, file=sys.stderr) 119 | all_tagged += this_words 120 | this_loss = this_words = 0 121 | all_time = time.time() - start 122 | if i % int(10000 / MB_SIZE) == 0: 123 | dev_start = time.time() 124 | dev_loss = dev_words = 0 125 | for sid in test_order: 126 | loss_exp, mb_words = calc_lm_loss(test[sid:sid + MB_SIZE]) 127 | dev_loss += loss_exp.scalar_value() 128 | dev_words += mb_words 129 | dev_time += time.time() - dev_start 130 | train_time = time.time() - start - dev_time 131 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % ( 132 | dev_loss / dev_words, math.exp(dev_loss / dev_words), dev_words, train_time, all_tagged / train_time)) 133 | # train on the minibatch 134 | loss_exp, mb_words = calc_lm_loss(train[sid:sid + MB_SIZE]) 135 | this_loss += loss_exp.scalar_value() 136 | # print("loss @ %r: %r" % (i, this_loss)) 137 | this_words += mb_words 138 | loss_exp.backward() 139 | trainer.update() 140 | print("epoch %r finished" % ITER) 141 | trainer.update_epoch(1.0) 142 | -------------------------------------------------------------------------------- /06-rnn/sentiment-lstm.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | 12 | 13 | def read_dataset(filename): 14 | with open(filename, "r") as f: 15 | for line in f: 16 | tag, words = line.lower().strip().split(" ||| ") 17 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 18 | 19 | 20 | # Read in the data 21 | train = list(read_dataset("../data/classes/train.txt")) 22 | w2i = defaultdict(lambda: UNK, w2i) 23 | dev = list(read_dataset("../data/classes/test.txt")) 24 | nwords = len(w2i) 25 | ntags = len(t2i) 26 | 27 | # Start DyNet and defin trainer 28 | model = dy.Model() 29 | trainer = dy.AdamTrainer(model) 30 | 31 | # Define the model 32 | EMB_SIZE = 64 33 | HID_SIZE = 64 34 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 35 | fwdLSTM = dy.VanillaLSTMBuilder(1, EMB_SIZE, HID_SIZE, model) # Forward RNN 36 | bwdLSTM = dy.VanillaLSTMBuilder(1, EMB_SIZE, HID_SIZE, model) # Backward RNN 37 | W_sm = model.add_parameters((ntags, 2 * HID_SIZE)) # Softmax weights 38 | b_sm = model.add_parameters((ntags)) # Softmax bias 39 | 40 | 41 | # A function to calculate scores for one value 42 | def calc_scores(words): 43 | dy.renew_cg() 44 | word_embs = [dy.lookup(W_emb, x) for x in words] 45 | fwd_init = fwdLSTM.initial_state() 46 | fwd_embs = fwd_init.transduce(word_embs) 47 | bwd_init = bwdLSTM.initial_state() 48 | bwd_embs = bwd_init.transduce(reversed(word_embs)) 49 | W_sm_exp = dy.parameter(W_sm) 50 | b_sm_exp = dy.parameter(b_sm) 51 | return W_sm_exp * dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) + b_sm_exp 52 | 53 | 54 | for ITER in range(100): 55 | # Perform training 56 | random.shuffle(train) 57 | train_loss = 0.0 58 | start = time.time() 59 | for words, tag in train: 60 | my_loss = dy.pickneglogsoftmax(calc_scores(words), tag) 61 | train_loss += my_loss.value() 62 | my_loss.backward() 63 | trainer.update() 64 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start)) 65 | # Perform training 66 | test_correct = 0.0 67 | for words, tag in dev: 68 | scores = calc_scores(words).npvalue() 69 | predict = np.argmax(scores) 70 | if 
predict == tag: 71 | test_correct += 1 72 | print("iter %r: test acc=%.4f" % (ITER, test_correct / len(dev))) 73 | -------------------------------------------------------------------------------- /06-rnn/sentiment-rnn.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import time 3 | import random 4 | import dynet as dy 5 | import numpy as np 6 | 7 | # Functions to read in the corpus 8 | w2i = defaultdict(lambda: len(w2i)) 9 | t2i = defaultdict(lambda: len(t2i)) 10 | UNK = w2i[""] 11 | 12 | 13 | def read_dataset(filename): 14 | with open(filename, "r") as f: 15 | for line in f: 16 | tag, words = line.lower().strip().split(" ||| ") 17 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 18 | 19 | 20 | # Read in the data 21 | train = list(read_dataset("../data/classes/train.txt")) 22 | w2i = defaultdict(lambda: UNK, w2i) 23 | dev = list(read_dataset("../data/classes/test.txt")) 24 | nwords = len(w2i) 25 | ntags = len(t2i) 26 | 27 | # Start DyNet and defin trainer 28 | model = dy.Model() 29 | trainer = dy.AdamTrainer(model) 30 | 31 | # Define the model 32 | EMB_SIZE = 64 33 | HID_SIZE = 64 34 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 35 | fwdLSTM = dy.SimpleRNNBuilder(1, EMB_SIZE, HID_SIZE, model) # Forward LSTM 36 | bwdLSTM = dy.SimpleRNNBuilder(1, EMB_SIZE, HID_SIZE, model) # Backward LSTM 37 | W_sm = model.add_parameters((ntags, 2 * HID_SIZE)) # Softmax weights 38 | b_sm = model.add_parameters((ntags)) # Softmax bias 39 | 40 | 41 | # A function to calculate scores for one value 42 | def calc_scores(words): 43 | dy.renew_cg() 44 | word_embs = [dy.lookup(W_emb, x) for x in words] 45 | fwd_init = fwdLSTM.initial_state() 46 | fwd_embs = fwd_init.transduce(word_embs) 47 | bwd_init = bwdLSTM.initial_state() 48 | bwd_embs = bwd_init.transduce(reversed(word_embs)) 49 | W_sm_exp = dy.parameter(W_sm) 50 | b_sm_exp = dy.parameter(b_sm) 51 | return W_sm_exp * dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) + b_sm_exp 52 | 53 | 54 | for ITER in range(100): 55 | # Perform training 56 | random.shuffle(train) 57 | train_loss = 0.0 58 | start = time.time() 59 | for words, tag in train: 60 | my_loss = dy.pickneglogsoftmax(calc_scores(words), tag) 61 | train_loss += my_loss.value() 62 | my_loss.backward() 63 | trainer.update() 64 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start)) 65 | # Perform training 66 | test_correct = 0.0 67 | for words, tag in dev: 68 | scores = calc_scores(words).npvalue() 69 | predict = np.argmax(scores) 70 | if predict == tag: 71 | test_correct += 1 72 | print("iter %r: test acc=%.4f" % (ITER, test_correct / len(dev))) 73 | -------------------------------------------------------------------------------- /07-sentrep/text-retrieval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | 4 | from collections import defaultdict 5 | import random 6 | import math 7 | import sys 8 | import argparse 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | # format of files: each line is "word1 word2 ..." 
aligned line-by-line 14 | train_src_file = "../data/parallel/train.ja" 15 | train_trg_file = "../data/parallel/train.en" 16 | dev_src_file = "../data/parallel/dev.ja" 17 | dev_trg_file = "../data/parallel/dev.en" 18 | 19 | w2i_src = defaultdict(lambda: len(w2i_src)) 20 | w2i_trg = defaultdict(lambda: len(w2i_trg)) 21 | 22 | def read(fname_src, fname_trg): 23 | """ 24 | Read parallel files where each line lines up 25 | """ 26 | with open(fname_src, "r") as f_src, open(fname_trg, "r") as f_trg: 27 | for line_src, line_trg in zip(f_src, f_trg): 28 | sent_src = [w2i_src[x] for x in line_src.strip().split()] 29 | sent_trg = [w2i_trg[x] for x in line_trg.strip().split()] 30 | yield (sent_src, sent_trg) 31 | 32 | # Read the data 33 | train = list(read(train_src_file, train_trg_file)) 34 | unk_src = w2i_src[""] 35 | w2i_src = defaultdict(lambda: unk_src, w2i_src) 36 | unk_trg = w2i_trg[""] 37 | w2i_trg = defaultdict(lambda: unk_trg, w2i_trg) 38 | nwords_src = len(w2i_src) 39 | nwords_trg = len(w2i_trg) 40 | dev = list(read(dev_src_file, dev_trg_file)) 41 | 42 | # DyNet Starts 43 | model = dy.Model() 44 | trainer = dy.AdamTrainer(model) 45 | 46 | # Model parameters 47 | EMBED_SIZE = 64 48 | HIDDEN_SIZE = 128 49 | BATCH_SIZE = 16 50 | 51 | # Lookup parameters for word embeddings 52 | LOOKUP_SRC = model.add_lookup_parameters((nwords_src, EMBED_SIZE)) 53 | LOOKUP_TRG = model.add_lookup_parameters((nwords_trg, EMBED_SIZE)) 54 | 55 | # Word-level BiLSTMs 56 | LSTM_SRC = dy.BiRNNBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model, dy.LSTMBuilder) 57 | LSTM_TRG = dy.BiRNNBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model, dy.LSTMBuilder) 58 | 59 | # Calculate loss for one mini-batch 60 | def calc_loss(sents): 61 | dy.renew_cg() 62 | 63 | # Transduce all batch elements with an LSTM 64 | sent_reps = [(LSTM_SRC.transduce([LOOKUP_SRC[x] for x in src])[-1], 65 | LSTM_TRG.transduce([LOOKUP_TRG[y] for y in trg])[-1]) for src, trg in sents] 66 | 67 | # Concatenate the sentence representations to a single matrix 68 | mtx_src = dy.concatenate_cols([src for src, trg in sent_reps]) 69 | mtx_trg = dy.concatenate_cols([trg for src, trg in sent_reps]) 70 | 71 | # Do matrix multiplication to get a matrix of dot product similarity scores 72 | sim_mtx = dy.transpose(mtx_src) * mtx_trg 73 | 74 | # Calculate the hinge loss over all dimensions 75 | loss = dy.hinge_dim(sim_mtx, list(range(len(sents))), d=1) 76 | 77 | return dy.sum_elems(loss) 78 | 79 | # Calculate representations for one corpus 80 | def index_corpus(sents): 81 | 82 | # To take advantage of auto-batching, do several at a time 83 | for sid in range(0, len(sents), BATCH_SIZE): 84 | dy.renew_cg() 85 | 86 | # Set up the computation graph 87 | exprs = [] 88 | for src, trg in sents[sid:min(sid+BATCH_SIZE,len(sents))]: 89 | exprs.append((LSTM_SRC.transduce([LOOKUP_SRC[x] for x in src])[-1], 90 | LSTM_TRG.transduce([LOOKUP_TRG[y] for y in trg])[-1])) 91 | 92 | # Perform the forward pass to calculate everything at once 93 | exprs[-1][1].forward() 94 | 95 | for src_expr, trg_expr in exprs: 96 | yield (src_expr.npvalue(), trg_expr.npvalue()) 97 | 98 | # Perform retrieval, and return both scores and ranked order of candidates 99 | def retrieve(src, db_mtx): 100 | scores = np.dot(db_mtx,src) 101 | ranks = np.argsort(-scores) 102 | return ranks, scores 103 | 104 | # Perform training 105 | start = time.time() 106 | train_mbs = all_time = dev_time = all_tagged = this_sents = this_loss = 0 107 | for ITER in range(100): 108 | random.shuffle(train) 109 | for sid in range(0, len(train), BATCH_SIZE): 
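        # Each minibatch below encodes up to BATCH_SIZE source/target pairs with the two
        # BiLSTMs, scores every source against every target in the batch by dot product,
        # and applies a hinge loss that ranks each true pair above the other candidates
        # (see calc_loss above).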
110 | my_size = min(BATCH_SIZE, len(train)-sid) 111 | train_mbs += 1 112 | if train_mbs % int(1000/BATCH_SIZE) == 0: 113 | trainer.status() 114 | print("loss/sent=%.4f, sent/sec=%.4f" % (this_loss / this_sents, (train_mbs * BATCH_SIZE) / (time.time() - start - dev_time)), file=sys.stderr) 115 | this_loss = this_sents = 0 116 | # train on the minibatch 117 | loss_exp = calc_loss(train[sid:sid+BATCH_SIZE]) 118 | this_loss += loss_exp.scalar_value() 119 | this_sents += BATCH_SIZE 120 | loss_exp.backward() 121 | trainer.update() 122 | # Perform evaluation 123 | dev_start = time.time() 124 | rec_at_1, rec_at_5, rec_at_10 = 0, 0, 0 125 | reps = list(index_corpus(dev)) 126 | trg_mtx = np.stack([trg for src, trg in reps]) 127 | for i, (src, trg) in enumerate(reps): 128 | ranks, scores = retrieve(src, trg_mtx) 129 | if ranks[0] == i: rec_at_1 += 1 130 | if i in ranks[:5]: rec_at_5 += 1 131 | if i in ranks[:10]: rec_at_10 += 1 132 | dev_time += time.time()-dev_start 133 | print("epoch %r: dev recall@1=%.2f%% recall@5=%.2f%% recall@10=%.2f%%" % (ITER, rec_at_1/len(dev)*100, rec_at_5/len(dev)*100, rec_at_10/len(dev)*100)) 134 | -------------------------------------------------------------------------------- /08-condlm/bleu.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import Counter 3 | import numpy 4 | import sys 5 | 6 | # written by Adam Lopez 7 | 8 | # Collect BLEU-relevant statistics for a single hypothesis/reference pair. 9 | # Return value is a generator yielding: 10 | # (c, r, numerator1, denominator1, ... numerator4, denominator4) 11 | # Summing the columns across calls to this function on an entire corpus will 12 | # produce a vector of statistics that can be used to compute BLEU (below) 13 | def bleu_stats(hypothesis, reference): 14 | stats = [] 15 | stats.append(len(hypothesis)) 16 | stats.append(len(reference)) 17 | for n in xrange(1,5): 18 | s_ngrams = Counter([tuple(hypothesis[i:i+n]) for i in xrange(len(hypothesis)+1-n)]) 19 | r_ngrams = Counter([tuple(reference[i:i+n]) for i in xrange(len(reference)+1-n)]) 20 | stats.append(max([sum((s_ngrams & r_ngrams).values()), 0])) 21 | stats.append(max([len(hypothesis)+1-n, 0])) 22 | return stats 23 | 24 | # Compute BLEU from collected statistics obtained by call(s) to bleu_stats 25 | def bleu(stats): 26 | if len(filter(lambda x: x==0, stats)) > 0: 27 | return 0 28 | (c, r) = stats[:2] 29 | log_bleu_prec = sum([math.log(float(x)/y) for x,y in zip(stats[2::2],stats[3::2])]) / 4. 30 | return math.exp(min([0, 1-float(r)/c]) + log_bleu_prec) 31 | 32 | if __name__=='__main__': 33 | stats = numpy.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) 34 | for hyp, ref in zip(open(sys.argv[1], 'r'), open(sys.argv[2], 'r')): 35 | hyp, ref = (hyp.strip().split(), ref.strip().split()) 36 | stats += numpy.array(bleu_stats(hyp, ref)) 37 | print "%.2f" % (100*bleu(stats)) -------------------------------------------------------------------------------- /09-attention/plot_attention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | 5 | from matplotlib.font_manager import FontProperties 6 | from matplotlib import rcParams 7 | import pdb as pdb 8 | import matplotlib.pyplot as plt 9 | import six 10 | 11 | 12 | # if you are outputting cjk, matplotlib needs to first load a cjk font. 
13 | # you can figure out how to find a non-latin font on your system here: 14 | # > https://matplotlib.org/users/text_props.html#text-with-non-latin-glyphs 15 | # for example 16 | # 17 | # 1. run in terminal 18 | # $ fc-list :lang=ja family 19 | # -> displays "MS Gothic" as one of the options 20 | # 21 | # 2. add to code here: 22 | # matplotlib.rcParams['font.family'].insert(0, 'MS Gothic') 23 | 24 | def plot_attention(src_words, trg_words, attention_matrix, file_name=None): 25 | """This takes in source and target words and an attention matrix (in numpy format) 26 | and prints a visualization of this to a file. 27 | :param src_words: a list of words in the source 28 | :param trg_words: a list of target words 29 | :param attention_matrix: a two-dimensional numpy array of values between zero and one, 30 | where rows correspond to source words, and columns correspond to target words 31 | :param file_name: the name of the file to which we write the attention 32 | """ 33 | fig, ax = plt.subplots() 34 | #a lazy, rough, approximate way of making the image large enough 35 | fig.set_figwidth(int(len(trg_words)*.6)) 36 | 37 | # put the major ticks at the middle of each cell 38 | ax.set_xticks(np.arange(attention_matrix.shape[1]) + 0.5, minor=False) 39 | ax.set_yticks(np.arange(attention_matrix.shape[0]) + 0.5, minor=False) 40 | ax.invert_yaxis() 41 | 42 | # label axes by words 43 | ax.set_xticklabels(trg_words, minor=False) 44 | ax.set_yticklabels(src_words, minor=False) 45 | ax.xaxis.tick_top() 46 | plt.setp(ax.get_xticklabels(), rotation=50, horizontalalignment='right') 47 | # draw the heatmap 48 | plt.pcolor(attention_matrix, cmap=plt.cm.Blues, vmin=0, vmax=1) 49 | plt.colorbar() 50 | 51 | if file_name != None: 52 | plt.savefig(file_name, dpi=100) 53 | else: 54 | plt.show() 55 | plt.close() 56 | 57 | -------------------------------------------------------------------------------- /10-structured/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | 4 | from collections import defaultdict 5 | import random 6 | import math 7 | import sys 8 | import argparse 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | # format of files: each line is "word1|tag1 word2|tag2 ..." 
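#
# Overview: a simple BiLSTM tagger. Each word is embedded, a one-layer BiLSTM produces a
# contextual vector per token, and a per-token affine + softmax layer scores the tags;
# training minimizes the summed per-token negative log-likelihood.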
14 | train_file = "../data/tags/train.txt" 15 | dev_file = "../data/tags/dev.txt" 16 | 17 | w2i = defaultdict(lambda: len(w2i)) 18 | t2i = defaultdict(lambda: len(t2i)) 19 | 20 | 21 | def read(fname): 22 | """ 23 | Read tagged file 24 | """ 25 | with open(fname, "r") as f: 26 | for line in f: 27 | words, tags = [], [] 28 | for wt in line.strip().split(): 29 | w, t = wt.split('|') 30 | words.append(w2i[w]) 31 | tags.append(t2i[t]) 32 | yield (words, tags) 33 | 34 | 35 | # Read the data 36 | train = list(read(train_file)) 37 | unk_word = w2i[""] 38 | w2i = defaultdict(lambda: unk_word, w2i) 39 | unk_tag = t2i[""] 40 | t2i = defaultdict(lambda: unk_tag, t2i) 41 | nwords = len(w2i) 42 | ntags = len(t2i) 43 | dev = list(read(dev_file)) 44 | 45 | # DyNet Starts 46 | model = dy.Model() 47 | trainer = dy.AdamTrainer(model) 48 | 49 | # Model parameters 50 | EMBED_SIZE = 64 51 | HIDDEN_SIZE = 128 52 | 53 | # Lookup parameters for word embeddings 54 | LOOKUP = model.add_lookup_parameters((nwords, EMBED_SIZE)) 55 | 56 | # Word-level BiLSTM 57 | LSTM = dy.BiRNNBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model, dy.LSTMBuilder) 58 | 59 | # Word-level softmax 60 | W_sm = model.add_parameters((ntags, HIDDEN_SIZE)) 61 | b_sm = model.add_parameters(ntags) 62 | 63 | 64 | # Calculate the scores for one example 65 | def calc_scores(words): 66 | dy.renew_cg() 67 | 68 | # Transduce all batch elements with an LSTM 69 | word_reps = LSTM.transduce([LOOKUP[x] for x in words]) 70 | 71 | # Softmax scores 72 | W = dy.parameter(W_sm) 73 | b = dy.parameter(b_sm) 74 | scores = [dy.affine_transform([b, W, x]) for x in word_reps] 75 | 76 | return scores 77 | 78 | 79 | # Calculate MLE loss for one example 80 | def calc_loss(scores, tags): 81 | losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)] 82 | return dy.esum(losses) 83 | 84 | 85 | # Calculate number of tags correct for one example 86 | def calc_correct(scores, tags): 87 | correct = [np.argmax(score.npvalue()) == tag for score, tag in zip(scores, tags)] 88 | return sum(correct) 89 | 90 | 91 | # Perform training 92 | for ITER in range(100): 93 | random.shuffle(train) 94 | start = time.time() 95 | this_sents = this_words = this_loss = this_correct = 0 96 | for sid in range(0, len(train)): 97 | this_sents += 1 98 | if this_sents % int(1000) == 0: 99 | print("train loss/word=%.4f, acc=%.2f%%, word/sec=%.4f" % ( 100 | this_loss / this_words, 100 * this_correct / this_words, this_words / (time.time() - start)), 101 | file=sys.stderr) 102 | # train on the example 103 | words, tags = train[sid] 104 | scores = calc_scores(words) 105 | loss_exp = calc_loss(scores, tags) 106 | this_correct += calc_correct(scores, tags) 107 | this_loss += loss_exp.scalar_value() 108 | this_words += len(words) 109 | loss_exp.backward() 110 | trainer.update() 111 | # Perform evaluation 112 | start = time.time() 113 | this_sents = this_words = this_loss = this_correct = 0 114 | for words, tags in dev: 115 | this_sents += 1 116 | scores = calc_scores(words) 117 | loss_exp = calc_loss(scores, tags) 118 | this_correct += calc_correct(scores, tags) 119 | this_loss += loss_exp.scalar_value() 120 | this_words += len(words) 121 | print("dev loss/word=%.4f, acc=%.2f%%, word/sec=%.4f" % ( 122 | this_loss / this_words, 100 * this_correct / this_words, this_words / (time.time() - start)), file=sys.stderr) 123 | -------------------------------------------------------------------------------- /12-transitionparsing/oracle.py: 
-------------------------------------------------------------------------------- 1 | import pdb 2 | class Word: 3 | def __init__(self, word, location, head = -1): 4 | self.word = word 5 | self.location = location 6 | self.head = head 7 | self.rightmost_child = -1 8 | 9 | #abreviated conll file like gold.dev.txt 10 | def read_abrv(file_name): 11 | f = [l.strip() for l in open(file_name)] 12 | sents = [] 13 | sent = [] 14 | loc = 0 15 | for line in f: 16 | if line != '': 17 | line = line.split() 18 | word = line[0] 19 | head = int(line[1]) 20 | sent.append(Word(word, loc, head)) 21 | loc += 1 22 | else: 23 | for word in sent: 24 | if word.location > sent[word.head].rightmost_child: 25 | sent[word.head].rightmost_child = word.location 26 | sents.append(sent) 27 | sent = [] 28 | loc = 0 29 | return sents 30 | 31 | sentences = read_abrv('../data/parsing/gold.txt') 32 | actions_for_sents = [] 33 | f = open('../data/parsing/output.txt', 'w') 34 | for sent in sentences: 35 | stack, buffer = [], [] 36 | acts = [] 37 | for word in sent: 38 | buffer.append(word) 39 | buffer = list(reversed(buffer)) 40 | while len(buffer) > 0 or len(stack) > 1: 41 | if len(stack) < 2: 42 | stack.append(buffer.pop()) 43 | acts.append('SHIFT') 44 | elif stack[-1].head == stack[-2].location and (len(buffer) == 0 or stack[-1].rightmost_child < buffer[-1].location or stack[-2].rightmost_child == -1): 45 | acts.append('REDUCE_R') 46 | stack.pop() 47 | elif stack[-2].head == stack[-1].location and (len(buffer) == 0 or stack[-2].rightmost_child < buffer[-1].location or stack[-2].rightmost_child == -1): 48 | acts.append('REDUCE_L') 49 | temp = stack.pop() 50 | stack.pop() 51 | stack.append(temp) 52 | elif len(buffer) > 0: 53 | stack.append(buffer.pop()) 54 | acts.append('SHIFT') 55 | else: 56 | break 57 | actions_for_sents.append(acts) 58 | f.write(' '.join([s.word for s in sent]) + ' ||| ' + ' '.join(acts) + '\n') 59 | 60 | f.close() 61 | 62 | -------------------------------------------------------------------------------- /12-transitionparsing/tree_parser.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | import codecs 3 | import time 4 | import random 5 | import dynet as dy 6 | import numpy as np 7 | 8 | from tree import Tree 9 | 10 | def read_dataset(filename): 11 | return [Tree.from_sexpr(line.strip()) for line in codecs.open(filename,"r")] 12 | 13 | def get_vocabs(trees): 14 | label_vocab = Counter() 15 | word_vocab = Counter() 16 | for tree in trees: 17 | label_vocab.update([n.label for n in tree.nonterms()]) 18 | word_vocab.update([l.label for l in tree.leaves()]) 19 | labels = [x for x,c in label_vocab.iteritems() if c > 0] 20 | words = ["_UNK_"] + [x for x,c in word_vocab.iteritems() if c > 0] 21 | l2i = {l:i for i,l in enumerate(labels)} 22 | w2i = {w:i for i,w in enumerate(words)} 23 | return l2i, w2i, labels, words 24 | 25 | train = read_dataset("../data/parsing/trees/train.txt") 26 | dev = read_dataset("../data/parsing/trees/dev.txt") 27 | 28 | l2i, w2i, i2l, i2w = get_vocabs(train) 29 | ntags = len(l2i) 30 | nwords = len(w2i) 31 | 32 | # Socher-style Tree RNN 33 | class TreeRNNBuilder(object): 34 | def __init__(self, model, word_vocab, hdim): 35 | self.W = model.add_parameters((hdim, 2*hdim)) 36 | self.E = model.add_lookup_parameters((len(word_vocab),hdim)) 37 | self.w2i = word_vocab 38 | 39 | def expr_for_tree(self, tree): 40 | if tree.isleaf(): 41 | return self.E[self.w2i.get(tree.label,0)] 42 | if len(tree.children) == 1: 
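# unary node: its single child must be a leaf (a pre-terminal over one word), so pass the child's embedding up without composition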
43 | assert(tree.children[0].isleaf()) 44 | expr = self.expr_for_tree(tree.children[0]) 45 | return expr 46 | assert(len(tree.children) == 2),tree.children[0] 47 | e1 = self.expr_for_tree(tree.children[0]) 48 | e2 = self.expr_for_tree(tree.children[1]) 49 | W = dy.parameter(self.W) 50 | expr = dy.tanh(W*dy.concatenate([e1,e2])) 51 | return expr 52 | 53 | # Tai-style Tree LSTM 54 | class TreeLSTMBuilder(object): 55 | def __init__(self, model, word_vocab, wdim, hdim): 56 | self.WS = [model.add_parameters((hdim, wdim)) for _ in "iou"] 57 | self.US = [model.add_parameters((hdim, 2*hdim)) for _ in "iou"] 58 | self.UFS =[model.add_parameters((hdim, hdim)) for _ in "ff"] 59 | self.BS = [model.add_parameters(hdim) for _ in "iouf"] 60 | self.E = model.add_lookup_parameters((len(word_vocab),wdim)) 61 | self.w2i = word_vocab 62 | 63 | def expr_for_tree(self, tree): 64 | if tree.isleaf(): 65 | return self.E[self.w2i.get(tree.label,0)] 66 | if len(tree.children) == 1: 67 | assert(tree.children[0].isleaf()) 68 | emb = self.expr_for_tree(tree.children[0]) 69 | Wi,Wo,Wu = [dy.parameter(w) for w in self.WS] 70 | bi,bo,bu,_ = [dy.parameter(b) for b in self.BS] 71 | i = dy.logistic(Wi*emb + bi) 72 | o = dy.logistic(Wo*emb + bo) 73 | u = dy.tanh( Wu*emb + bu) 74 | c = dy.cmult(i,u) 75 | expr = dy.cmult(o,dy.tanh(c)) 76 | return expr 77 | assert(len(tree.children) == 2),tree.children[0] 78 | e1 = self.expr_for_tree(tree.children[0]) 79 | e2 = self.expr_for_tree(tree.children[1]) 80 | Ui,Uo,Uu = [dy.parameter(u) for u in self.US] 81 | Uf1,Uf2 = [dy.parameter(u) for u in self.UFS] 82 | bi,bo,bu,bf = [dy.parameter(b) for b in self.BS] 83 | e = dy.concatenate([e1,e2]) 84 | i = dy.logistic(Ui*e + bi) 85 | o = dy.logistic(Uo*e + bo) 86 | f1 = dy.logistic(Uf1*e1 + bf) 87 | f2 = dy.logistic(Uf2*e2 + bf) 88 | u = dy.tanh( Uu*e + bu) 89 | c = dy.cmult(i,u) + dy.cmult(f1,e1) + dy.cmult(f2,e2) 90 | h = dy.cmult(o,dy.tanh(c)) 91 | expr = h 92 | return expr 93 | 94 | # Start DyNet and define trainer 95 | model = dy.Model() 96 | trainer = dy.AdamTrainer(model) 97 | 98 | # Define the model 99 | EMB_SIZE = 64 100 | HID_SIZE = 64 101 | # builder = TreeRNNBuilder(model, w2i, HID_SIZE) 102 | builder = TreeLSTMBuilder(model, w2i, HID_SIZE, EMB_SIZE) 103 | W_sm = model.add_parameters((ntags, HID_SIZE)) # Softmax weights 104 | b_sm = model.add_parameters((ntags)) # Softmax bias 105 | 106 | # A function to calculate scores for one value 107 | def calc_scores(tree): 108 | dy.renew_cg() 109 | emb = builder.expr_for_tree(tree) 110 | W_sm_exp = dy.parameter(W_sm) 111 | b_sm_exp = dy.parameter(b_sm) 112 | return W_sm_exp * emb + b_sm_exp 113 | 114 | for ITER in range(100): 115 | # Perform training 116 | random.shuffle(train) 117 | train_loss = 0.0 118 | start = time.time() 119 | for tree in train: 120 | my_loss = dy.hinge(calc_scores(tree), l2i[tree.label]) 121 | # my_loss = dy.pickneglogsoftmax(calc_scores(tree), l2i[tree.label]) 122 | train_loss += my_loss.value() 123 | my_loss.backward() 124 | trainer.update() 125 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 126 | # Perform testing 127 | test_correct = 0.0 128 | for tree in dev: 129 | scores = calc_scores(tree).npvalue() 130 | predict = np.argmax(scores) 131 | if predict == l2i[tree.label]: 132 | test_correct += 1 133 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 134 | -------------------------------------------------------------------------------- /13-graphparsing/biaffine_parser.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | import random 4 | 5 | start = time.time() 6 | 7 | from collections import Counter, defaultdict 8 | from biaffine import DeepBiaffineAttentionDecoder 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | # format of files: each line is "word1/tag2 word2/tag2 ..." 14 | train_file = "../data/parsing/graph/ptb_train.txt" 15 | test_file = "../data/parsing//graph/ptb_dev.txt" 16 | 17 | w2i = defaultdict(lambda: len(w2i)) 18 | t2i = defaultdict(lambda: len(t2i)) 19 | UNK = w2i[""] 20 | 21 | def read(fname): 22 | with open(fname, "r") as fh: 23 | for line in fh: 24 | tokens = line.strip().split() 25 | num_tokens = len(tokens) 26 | assert num_tokens % 3 == 0 27 | sent = [] 28 | labels = [] 29 | heads = [] 30 | for i in range(num_tokens / 3): 31 | sent.append(w2i[tokens[3 * i]]) 32 | labels.append(t2i[tokens[3 * i + 1]]) 33 | heads.append(int(tokens[3 * i + 2])) 34 | yield (sent, labels, heads) 35 | 36 | 37 | train = list(read(train_file)) 38 | w2i = defaultdict(lambda: UNK, w2i) 39 | dev = list(read(test_file)) 40 | nwords = len(w2i) 41 | ntags = len(t2i) 42 | 43 | # DyNet Starts 44 | 45 | model = dy.Model() 46 | trainer = dy.AdamTrainer(model) 47 | 48 | # Lookup parameters for word embeddings 49 | EMB_SIZE = 32 50 | HID_SIZE = 64 51 | W_emb = model.add_lookup_parameters((nwords, EMB_SIZE)) # Word embeddings 52 | fwdLSTM = dy.SimpleRNNBuilder(1, EMB_SIZE, HID_SIZE, model) # Forward LSTM 53 | bwdLSTM = dy.SimpleRNNBuilder(1, EMB_SIZE, HID_SIZE, model) # Backward LSTM 54 | 55 | biaffineParser = DeepBiaffineAttentionDecoder(model, ntags, src_ctx_dim=HID_SIZE * 2, 56 | n_arc_mlp_units=64, n_label_mlp_units=32) 57 | 58 | def calc_loss(words, labels, heads): 59 | dy.renew_cg() 60 | word_embs = [dy.lookup(W_emb, x) for x in words] 61 | fwd_init = fwdLSTM.initial_state() 62 | fwd_embs = fwd_init.transduce(word_embs) 63 | bwd_init = bwdLSTM.initial_state() 64 | bwd_embs = bwd_init.transduce(reversed(word_embs)) 65 | src_encodings = [dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1)) for f, b in zip(fwd_embs, reversed(bwd_embs))] 66 | return biaffineParser.decode_loss(src_encodings, ([heads], [labels])) 67 | 68 | 69 | def calc_acc(words, labels, heads): 70 | dy.renew_cg() 71 | word_embs = [dy.lookup(W_emb, x) for x in words] 72 | fwd_init = fwdLSTM.initial_state() 73 | fwd_embs = fwd_init.transduce(word_embs) 74 | bwd_init = bwdLSTM.initial_state() 75 | bwd_embs = bwd_init.transduce(reversed(word_embs)) 76 | src_encodings = [dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1)) for f, b in zip(fwd_embs, reversed(bwd_embs))] 77 | pred_heads, pred_labels = biaffineParser.decoding(src_encodings) 78 | return biaffineParser.cal_accuracy(pred_heads, pred_labels, heads, labels) 79 | 80 | for ITER in range(100): 81 | # Perform training 82 | random.shuffle(train) 83 | train_loss = 0.0 84 | start = time.time() 85 | for words, labels, heads in train: 86 | loss = calc_loss(words, labels, heads) 87 | train_loss += loss.value() 88 | loss.backward() 89 | trainer.update() 90 | 91 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss / len(train), time.time() - start)) 92 | 93 | correct_heads = 0. 94 | correct_labels = 0. 95 | total = 0. 
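# calc_acc returns per-sentence head/label accuracies; weighting each by sentence length and dividing by the total token count below yields corpus-level (token-weighted) accuracy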
96 | for words, labels, heads in dev: 97 | head_acc, label_acc = calc_acc(words, labels, heads) 98 | correct_heads += head_acc * len(words) 99 | correct_labels += label_acc * len(words) 100 | total += len(words) 101 | print("iter %r: test head_acc=%.4f, label_acc=%.4f" % (ITER, correct_heads * 100 / total, 102 | correct_labels * 100 / total)) 103 | 104 | -------------------------------------------------------------------------------- /13-graphparsing/mst.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | 4 | 5 | def mst(scores): 6 | """ 7 | Chu-Liu-Edmonds' algorithm for finding minimum spanning arborescence in graphs. 8 | Calculates the arborescence with node 0 as root. 9 | Source: https://github.com/chantera/biaffineparser/blob/master/utils.py 10 | 11 | :param scores: `scores[i][j]` is the weight of edge from node `i` to node `j` 12 | :returns an array containing the head node (node with edge pointing to current node) for each node, 13 | with head[0] fixed as 0 14 | """ 15 | length = scores.shape[0] 16 | scores = scores * (1 - np.eye(length)) 17 | heads = np.argmax(scores, axis=1) 18 | heads[0] = 0 19 | tokens = np.arange(1, length) 20 | roots = np.where(heads[tokens] == 0)[0] + 1 21 | if len(roots) < 1: 22 | root_scores = scores[tokens, 0] 23 | head_scores = scores[tokens, heads[tokens]] 24 | new_root = tokens[np.argmax(root_scores / head_scores)] 25 | heads[new_root] = 0 26 | elif len(roots) > 1: 27 | root_scores = scores[roots, 0] 28 | scores[roots, 0] = 0 29 | new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1 30 | new_root = roots[np.argmin( 31 | scores[roots, new_heads] / root_scores)] 32 | heads[roots] = new_heads 33 | heads[new_root] = 0 34 | 35 | edges = defaultdict(set) 36 | vertices = set((0,)) 37 | for dep, head in enumerate(heads[tokens]): 38 | vertices.add(dep + 1) 39 | edges[head].add(dep + 1) 40 | for cycle in _find_cycle(vertices, edges): 41 | dependents = set() 42 | to_visit = set(cycle) 43 | while len(to_visit) > 0: 44 | node = to_visit.pop() 45 | if node not in dependents: 46 | dependents.add(node) 47 | to_visit.update(edges[node]) 48 | cycle = np.array(list(cycle)) 49 | old_heads = heads[cycle] 50 | old_scores = scores[cycle, old_heads] 51 | non_heads = np.array(list(dependents)) 52 | scores[np.repeat(cycle, len(non_heads)), 53 | np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0 54 | new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1 55 | new_scores = scores[cycle, new_heads] / old_scores 56 | change = np.argmax(new_scores) 57 | changed_cycle = cycle[change] 58 | old_head = old_heads[change] 59 | new_head = new_heads[change] 60 | heads[changed_cycle] = new_head 61 | edges[new_head].add(changed_cycle) 62 | edges[old_head].remove(changed_cycle) 63 | 64 | return heads 65 | 66 | 67 | def _find_cycle(vertices, edges): 68 | """ 69 | https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm # NOQA 70 | https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py # NOQA 71 | """ 72 | _index = [0] 73 | _stack = [] 74 | _indices = {} 75 | _lowlinks = {} 76 | _onstack = defaultdict(lambda: False) 77 | _SCCs = [] 78 | 79 | def _strongconnect(v): 80 | _indices[v] = _index[0] 81 | _lowlinks[v] = _index[0] 82 | _index[0] += 1 83 | _stack.append(v) 84 | _onstack[v] = True 85 | 86 | for w in edges[v]: 87 | if w not in _indices: 88 | _strongconnect(w) 89 | _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) 
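# if w is already on the stack it belongs to the current strongly connected component, so cap v's lowlink by w's index (the standard Tarjan update)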
90 | elif _onstack[w]: 91 | _lowlinks[v] = min(_lowlinks[v], _indices[w]) 92 | 93 | if _lowlinks[v] == _indices[v]: 94 | SCC = set() 95 | while True: 96 | w = _stack.pop() 97 | _onstack[w] = False 98 | SCC.add(w) 99 | if not (w != v): 100 | break 101 | _SCCs.append(SCC) 102 | 103 | for v in vertices: 104 | if v not in _indices: 105 | _strongconnect(v) 106 | 107 | return [SCC for SCC in _SCCs if len(SCC) > 1] 108 | 109 | -------------------------------------------------------------------------------- /14-semparsing/ucca/.appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - PYTHON: "C:\\Python35\\python.exe" 4 | - PYTHON: "C:\\Python36\\python.exe" 5 | 6 | install: 7 | - "%PYTHON% -m pip install -U pip wheel" 8 | - "%PYTHON% setup.py install" 9 | - "%PYTHON% -m spacy download en" 10 | 11 | build: off 12 | 13 | test_script: 14 | - "%PYTHON% -m unittest discover -v" 15 | -------------------------------------------------------------------------------- /14-semparsing/ucca/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__* 2 | *.pyc 3 | -------------------------------------------------------------------------------- /14-semparsing/ucca/.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: "3.6" 4 | addons: 5 | apt: 6 | packages: 7 | - pandoc 8 | install: 9 | - python setup.py install 10 | - python -m spacy download en 11 | env: 12 | global: 13 | - TWINE_USERNAME=danielh 14 | - secure: QrZ/47sh/8WeeTLU37yfhW94bwO2ocsbMMIRebSS9Y+FssrCi9IbSuTp6NliXlJq17rozGtEf9alu9JetE8hnivACGJm0cz2/j3oYaeCxz8sbTpXeEr8JHiDk6MCfCD9VMrpeo04RBmI76BY1mwdCvxQSJEn/NtkI9jjSaqjLCLcaFWD7mTuYefxrPplROQJPu+jcW1snnubntuux1nRxULC3Ge/IRWb4OYajLJcPXiVsdleSNV9avLE2xIPTFZf4cwHpRxZslKgHeyCLk+JoDlL0qneB4UWB/SZF8CHoYvidPJDzG5NHAEgfxSqbUq3DRvgVAPqR0YoQd/MQbPLBN6v1aY2zbqHJtTS1xidnnYIs3gJWVAurx6WjkNc9QYwdN22EPmYDVquW2tZgvi2kHRoJY+gEYylJRY0jOzqYmZUV9WOZeeb2AzgXnVjQubEm0NSYCC3BYjkiSmwpDWTcr/HvCQ+9iOI1OD56F7B6oowzXBP0Z/IClMd9Pb3vs9cRr6di/Vf+ijjUeHQxyKHiv2R2mGnPuR8d/gR538xmbc/RlEt2tycMD25SBAeFdtlUfB5Si8llTSd6YktZzZhkHiaIPBYAVEbrK3832TM7B7sGAa8R6Y8gctP6ccE/kFpSdnFHuENgRu2VZBDx6q8UmkArRLbrCvzmbn658EySkc= 15 | matrix: 16 | - TEST_SUITE=unit 17 | - TEST_SUITE=convert 18 | script: ci/test.sh 19 | deploy: 20 | provider: script 21 | script: ci/deploy.sh 22 | on: 23 | repo: huji-nlp/ucca 24 | tags: true 25 | 26 | -------------------------------------------------------------------------------- /14-semparsing/ucca/README.md: -------------------------------------------------------------------------------- 1 | Universal Conceptual Cognitive Annotation 2 | ============================ 3 | UCCA is a linguistic framework for semantic annotation, whose details 4 | are available at [the following paper](http://www.cs.huji.ac.il/~oabend/papers/ucca_acl.pdf): 5 | 6 | @inproceedings{abend2013universal, 7 | author={Abend, Omri and Rappoport, Ari}, 8 | title={{U}niversal {C}onceptual {C}ognitive {A}nnotation ({UCCA})}, 9 | booktitle={Proc. of ACL}, 10 | month={August}, 11 | year={2013}, 12 | pages={228--238}, 13 | url={http://aclweb.org/anthology/P13-1023} 14 | } 15 | 16 | This Python 3 package provides an API to the UCCA annotation and tools to 17 | manipulate and process it. 
Its main features are conversion between different 18 | representations of UCCA annotations, and rich objects for all of the linguistic 19 | relations which appear in the theoretical framework (see `core`, `layer0`, `layer1` 20 | and `convert` modules under the `ucca` package). 21 | 22 | The `scripts` package contains various utilities for processing passage files. 23 | 24 | 25 | Authors 26 | ------ 27 | * Amit Beka: amit.beka@gmail.com 28 | * Daniel Hershcovich: danielh@cs.huji.ac.il 29 | 30 | 31 | License 32 | ------- 33 | This package is licensed under the GPLv3 or later license. 34 | 35 | [![Build Status](https://travis-ci.org/danielhers/ucca.svg?branch=master)](https://travis-ci.org/danielhers/ucca) 36 | [![Build Status](https://ci.appveyor.com/api/projects/status/github/danielhers/ucca?svg=true)](https://ci.appveyor.com/project/danielh/ucca) 37 | [![PyPI version](https://badge.fury.io/py/UCCA.svg)](https://badge.fury.io/py/UCCA) 38 | -------------------------------------------------------------------------------- /14-semparsing/ucca/actions.py: -------------------------------------------------------------------------------- 1 | COMPOUND = "compound" 2 | 3 | class Labels(object): 4 | def __init__(self, size): 5 | self.size = size # Maximum number of labels, NOT enforced here but by the user 6 | 7 | @property 8 | def all(self): 9 | raise NotImplementedError() 10 | 11 | @all.setter 12 | def all(self, labels): 13 | raise NotImplementedError() 14 | 15 | def save(self, skip=False): 16 | return (None if skip else self.all), self.size 17 | 18 | def load(self, all_size): 19 | self.all, self.size = all_size 20 | 21 | 22 | class Action(dict): 23 | type_to_id = {} 24 | 25 | def __init__(self, action_type, tag=None, orig_edge=None, orig_node=None, oracle=None, id_=None): 26 | self.type = action_type # String 27 | self.tag = tag # Usually the tag of the created edge; but if COMPOUND_SWAP, the distance 28 | self.orig_node = orig_node # Node created by this action, if any (during training) 29 | self.orig_edge = orig_edge # Edge created by this action, if any (during training) 30 | self.node = None # Will be set by State when the node created by this action is known 31 | self.edge = None # Will be set by State when the edge created by this action is known 32 | self.oracle = oracle # Reference to oracle, to inform it of actually created nodes/edges 33 | self.index = None # Index of this action in history 34 | 35 | self.type_id = Action.type_to_id.get(self.type) # Allocate ID for fast comparison 36 | if self.type_id is None: 37 | self.type_id = len(Action.type_to_id) 38 | Action.type_to_id[self.type] = self.type_id 39 | self.id = id_ 40 | super().__init__(action_type=self.type, tag=self.tag) 41 | 42 | def is_type(self, *others): 43 | return self.type_id in (o.type_id for o in others) 44 | 45 | def apply(self): 46 | if self.oracle is not None: 47 | self.oracle.remove(self.orig_edge, self.orig_node) 48 | 49 | def __repr__(self): 50 | return Action.__name__ + "(" + ", ".join(map(str, filter(None, (self.type, self.tag)))) + ")" 51 | 52 | def __str__(self): 53 | s = self.type 54 | if self.tag: 55 | s += "-%s" % self.tag 56 | return s 57 | 58 | def __eq__(self, other): 59 | return self.id == other.id 60 | 61 | def __hash__(self): 62 | return hash(self.id) 63 | 64 | def __call__(self, *args, **kwargs): 65 | return Action(self.type, *args, **kwargs) 66 | 67 | @property 68 | def remote(self): 69 | return self.is_type(Actions.RemoteNode, Actions.LeftRemote, Actions.RightRemote) 70 | 71 | @property 72 | def 
is_swap(self): 73 | return self.is_type(Actions.Swap) 74 | 75 | 76 | class Actions(Labels): 77 | Shift = Action("SHIFT") 78 | Node = Action("NODE") 79 | RemoteNode = Action("REMOTE-NODE") 80 | Implicit = Action("IMPLICIT") 81 | Label = Action("LABEL") 82 | Reduce = Action("REDUCE") 83 | LeftEdge = Action("LEFT-EDGE") 84 | RightEdge = Action("RIGHT-EDGE") 85 | LeftRemote = Action("LEFT-REMOTE") 86 | RightRemote = Action("RIGHT-REMOTE") 87 | Swap = Action("SWAP") 88 | Finish = Action("FINISH") 89 | 90 | def __init__(self, actions=None, size=None): 91 | super().__init__(size=size) 92 | self._all = None 93 | self._ids = None 94 | if actions is not None: 95 | self.all = actions 96 | 97 | def init(self): 98 | # edge and node action will be created as they are returned by the oracle 99 | swap = 'regular' 100 | self.all = [Actions.Reduce, Actions.Shift, Actions.Finish] + \ 101 | (list(map(Actions.Swap, range(1, 3))) if swap == COMPOUND else 102 | [Actions.Swap] if swap else []) + \ 103 | ([Actions.Label] if False else []) 104 | 105 | @property 106 | def all(self): 107 | if self._all is None: 108 | self.init() 109 | return self._all 110 | 111 | @all.setter 112 | def all(self, actions): 113 | self._all = [Action(**a) if isinstance(a, dict) else a for a in actions] 114 | self._ids = {(action.type_id, action.tag): i for i, action in enumerate(self._all)} 115 | for action in self._all: 116 | self.generate_id(action) 117 | 118 | @property 119 | def ids(self): 120 | if self._all is None: 121 | self.init() 122 | return self._ids 123 | 124 | def generate_id(self, action, create=True): 125 | if action.id is None: 126 | key = (action.type_id, action.tag) 127 | action.id = self.ids.get(key) 128 | if create and action.id is None: # New action, add to list 129 | # noinspection PyTypeChecker 130 | action.id = len(self.all) 131 | self.all.append(action(tag=action.tag, id_=action.id)) 132 | self.ids[key] = action.id 133 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ci/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -xe 3 | 4 | pip install pypandoc twine 5 | python setup.py sdist 6 | python setup.py bdist_wheel 7 | twine upload --skip-existing dist/* 8 | 9 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ci/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | case "$TEST_SUITE" in 4 | unit) 5 | # unit tests 6 | python -m unittest discover -v || exit 1 7 | PASSAGES=../doc/toy.xml 8 | ;; 9 | convert) 10 | mkdir pickle 11 | curl -L http://www.cs.huji.ac.il/~danielh/ucca/ucca_corpus_pickle.tgz | tar xz -C pickle || curl -L https://www.dropbox.com/s/q4ycn45zlmhuf9k/ucca_corpus_pickle.tgz | tar xz -C pickle 12 | PASSAGES=../pickle/*.pickle 13 | ;; 14 | esac 15 | cd $(dirname $0) 16 | mkdir -p converted 17 | for FORMAT in conll sdp export "export --tree"; do 18 | echo === Evaluating $FORMAT === 19 | if [ $# -lt 1 -o "$FORMAT" = "$1" ]; then 20 | python ../scripts/convert_and_evaluate.py "$PASSAGES" -f $FORMAT | tee "$FORMAT.log" 21 | fi 22 | done -------------------------------------------------------------------------------- /14-semparsing/ucca/doc/short_defs.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/doc/short_defs.pdf -------------------------------------------------------------------------------- /14-semparsing/ucca/doc/toy.xml: -------------------------------------------------------------------------------- [toy.xml: example UCCA passage in standard XML format, 164 lines; XML markup not preserved in this plain-text rendering] -------------------------------------------------------------------------------- /14-semparsing/ucca/runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import oracle 3 | from ucca import diffutil, ioutil, textutil, layer1, evaluation 4 | from pdb import set_trace 5 | 6 | 7 | files = ['../ucca_corpus_pickle/' + f for f in os.listdir('../ucca_corpus_pickle')] 8 | passages = list(ioutil.read_files_and_dirs(files)) 9 | 10 | passage = passages[0] 11 | ora = oracle.Oracle(passage) 12 | set_trace() -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/scripts/__init__.py -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/annotate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | 7 | from ucca.ioutil import file2passage, passage2file 8 | from ucca.textutil import annotate 9 | 10 | desc = """Read UCCA standard format in XML or binary pickle, and write back with POS tags and dependency parse.""" 11 | 12 | 13 | def main(): 14 | argparser = argparse.ArgumentParser(description=desc) 15 | argparser.add_argument("filenames", nargs="+", help="passage file names to annotate") 16 | argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage") 17 | args = argparser.parse_args() 18 | 19 | for pattern in args.filenames: 20 | filenames = glob.glob(pattern) 21 | if not filenames: 22 | raise IOError("Not found: " + pattern) 23 | for filename in filenames: 24 | passage = file2passage(filename) 25 | annotate(passage, verbose=args.verbose, replace=True) 26 | sys.stderr.write("Writing '%s'...\n" % filename) 27 | passage2file(passage, filename, binary=not filename.endswith("xml")) 28 | 29 | sys.exit(0) 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | --------------------------------------------------------------------------------
/14-semparsing/ucca/scripts/convert_and_evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | 7 | from ucca import convert 8 | from ucca.evaluation import evaluate, Scores 9 | from ucca.ioutil import file2passage 10 | 11 | desc = """Parses files in CoNLL-X, SemEval 2015 SDP, NeGra export or text format, 12 | converts to UCCA standard format, converts back to the original format and evaluates. 13 | """ 14 | 15 | 16 | def main(): 17 | argparser = argparse.ArgumentParser(description=desc) 18 | argparser.add_argument("filenames", nargs="+", 19 | help="file names to convert and evaluate") 20 | argparser.add_argument("-f", "--format", required=True, choices=convert.CONVERTERS, 21 | help="input file format") 22 | argparser.add_argument("-T", "--tree", action="store_true", 23 | help="remove multiple parents to get a tree") 24 | argparser.add_argument("-s", "--strict", action="store_true", 25 | help="stop immediately if failed to convert or evaluate a file") 26 | argparser.add_argument("-v", "--verbose", action="store_true", 27 | help="print evaluation results for each file separately") 28 | args = argparser.parse_args() 29 | 30 | converter1 = convert.TO_FORMAT[args.format] 31 | converter2 = convert.FROM_FORMAT[args.format] 32 | scores = [] 33 | for pattern in args.filenames: 34 | filenames = glob.glob(pattern) 35 | if not filenames: 36 | raise IOError("Not found: " + pattern) 37 | for filename in filenames: 38 | sys.stdout.write("\rConverting %s" % filename) 39 | sys.stdout.flush() 40 | ref = file2passage(filename) 41 | try: 42 | guessed = next(converter2(converter1(ref, tree=args.tree), ref.ID)) 43 | scores.append(evaluate(guessed, ref, verbose=args.verbose)) 44 | except Exception as e: 45 | if args.strict: 46 | raise ValueError("Error evaluating conversion of %s" % filename) from e 47 | else: 48 | print("Error evaluating conversion of %s: %s" % (filename, e), file=sys.stderr) 49 | print() 50 | if args.verbose and len(scores) > 1: 51 | print("Aggregated scores:") 52 | Scores.aggregate(scores).print() 53 | 54 | sys.exit(0) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/count_parents_children.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | from collections import Counter, defaultdict 7 | 8 | from ucca.ioutil import file2passage 9 | from ucca import layer1 10 | 11 | desc = """Parses XML files in UCCA standard format, and creates a histogram for the number of parents per unit. 
12 | """ 13 | 14 | 15 | def plot_histogram(counter, label, plot=None): 16 | import matplotlib.pyplot as plt 17 | plt.figure() 18 | nums = list(counter.keys()) 19 | counts = list(counter.values()) 20 | indices = range(len(counts)) 21 | bars = plt.bar(indices, counts, align="center") 22 | plt.xticks(indices, nums) 23 | top = 1.06 * max(counts) 24 | plt.ylim(min(counts), top) 25 | plt.xlabel("number of %s" % label) 26 | plt.ylabel("count") 27 | for bar in bars: 28 | count = bar.get_height() 29 | plt.text(bar.get_x() + bar.get_width() / 2., count, "%.1f%%" % (100.0 * count / sum(counts)), 30 | ha="center", va="bottom") 31 | if plot: 32 | plt.savefig(plot + "histogram_" + label + ".png") 33 | else: 34 | plt.show() 35 | 36 | 37 | def plot_pie(counter, label, plot=None): 38 | import matplotlib.pyplot as plt 39 | plt.figure() 40 | nums = list(counter.keys()) 41 | counts = list(counter.values()) 42 | plt.pie(counts, labels=nums, autopct="%1.1f%%", 43 | counterclock=True, wedgeprops={"edgecolor": "white"}) 44 | plt.axis("equal") 45 | if plot: 46 | plt.savefig(plot + "pie_" + label + ".png") 47 | else: 48 | plt.show() 49 | 50 | 51 | def main(): 52 | argparser = argparse.ArgumentParser(description=desc) 53 | argparser.add_argument("filenames", nargs="+", help="file names to analyze") 54 | argparser.add_argument("-o", "--outfile", default="data/counts_", 55 | help="output file prefix for histogram") 56 | argparser.add_argument("-p", "--plot", default="data/plot_", 57 | help="output file prefix for plot image file") 58 | args = argparser.parse_args() 59 | 60 | histograms = defaultdict(Counter) 61 | for pattern in args.filenames: 62 | for filename in glob.glob(pattern): 63 | sys.stderr.write("Reading passage '%s'...\n" % filename) 64 | passage = file2passage(filename) 65 | for node in passage.layer(layer1.LAYER_ID).all: 66 | if node.ID != "1.1": # Exclude the root node 67 | histograms["parents"][clip(node.incoming, 3)] += 1 68 | histograms["children"][clip(node.outgoing, 7)] += 1 69 | 70 | for label, counter in histograms.items(): 71 | handle = open(args.outfile + label + ".txt", "w", encoding="utf-8") if args.outfile else sys.stdout 72 | handle.writelines(["%s\t%d\n" % (num, count) for num, count in counter.items()]) 73 | if handle is not sys.stdout: 74 | handle.close() 75 | # noinspection PyBroadException 76 | try: 77 | plot_histogram(counter, label, plot=args.plot) 78 | plot_pie(counter, label, plot=args.plot) 79 | except: 80 | pass 81 | 82 | sys.exit(0) 83 | 84 | 85 | def clip(l, m): 86 | return len(l) if len(l) <= m else ">%d" % m 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/evaluate_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | The evaluation software for UCCA layer 1. 4 | """ 5 | 6 | from optparse import OptionParser 7 | 8 | from scripts import ucca_db 9 | from ucca import convert 10 | from ucca.evaluation import evaluate 11 | 12 | 13 | ############################################################################## 14 | # Returns the command line parser. 
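# Hypothetical example invocation (database, host and user names are made up):
#   evaluate_db.py -d ucca.db --host myhost -p 101 -g annotator1 -r gold -f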
15 | ############################################################################## 16 | def cmd_line_parser(): 17 | usage = "usage: %prog [options]\n" 18 | opt_parser = OptionParser(usage=usage) 19 | opt_parser.add_option("--db", "-d", dest="db_filename", 20 | action="store", type="string", 21 | help="the db file name") 22 | opt_parser.add_option("--host", "--hst", dest="host", 23 | action="store", type="string", 24 | help="the host name") 25 | opt_parser.add_option("--pid", "-p", dest="pid", action="store", 26 | type="int", help="the passage ID") 27 | opt_parser.add_option("--from_xids", "-x", dest="from_xids", 28 | action="store_true", help="interpret the ref \ 29 | and the guessed parameters as Xids in the db") 30 | opt_parser.add_option("--guessed", "-g", dest="guessed", action="store", 31 | type="string", help="if a db is defined - \ 32 | the username for the guessed annotation; \ 33 | else - the xml file name for the guessed annotation") 34 | opt_parser.add_option("--ref", "-r", dest="ref", action="store", 35 | type="string", help="if a db is defined - \ 36 | the username for the reference annotation; else - \ 37 | the xml file name for the reference annotation") 38 | opt_parser.add_option("--units", "-u", dest="units", action="store_true", 39 | help="the units the annotations have in common, \ 40 | and those each has separately") 41 | opt_parser.add_option("--fscore", "-f", dest="fscore", action="store_true", 42 | help="outputs the traditional P,R,F \ 43 | instead of the scene structure evaluation") 44 | opt_parser.add_option("--debug", dest="debug", action="store_true", 45 | help="run in debug mode") 46 | opt_parser.add_option("--errors", "-e", dest="errors", action="store_true", 47 | help="prints the error distribution\ 48 | according to its frequency") 49 | return opt_parser 50 | 51 | 52 | def main(): 53 | opt_parser = cmd_line_parser() 54 | (options, args) = opt_parser.parse_args() 55 | if len(args) > 0: 56 | opt_parser.error("all arguments must be flagged") 57 | 58 | if (options.guessed is None) or (options.ref is None) or (options.db_filename is None): 59 | opt_parser.error("missing arguments. type --help for help.") 60 | if options.pid is not None and options.from_xids is not None: 61 | opt_parser.error("inconsistent parameters. \ 62 | you can't have both a pid and from_xids paramters.") 63 | 64 | keys = [options.guessed, options.ref] 65 | if options.from_xids: 66 | xmls = ucca_db.get_by_xids(options.db_filename, options.host, keys) 67 | else: 68 | xmls = ucca_db.get_xml_trees(options.db_filename, options.host, 69 | options.pid, keys) 70 | 71 | guessed, ref = [convert.from_site(x) for x in xmls] 72 | if options.units or options.fscore or options.errors: 73 | evaluate(guessed, ref, 74 | units=options.units, fscore=options.fscore, errors=options.errors, verbose=True) 75 | 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/evaluate_standard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | The evaluation script for UCCA layer 1. 
4 | """ 5 | import sys 6 | from argparse import ArgumentParser 7 | 8 | from ucca import evaluation, constructions, ioutil 9 | 10 | 11 | if __name__ == "__main__": 12 | argparser = ArgumentParser(description="Compare two UCCA passages or two directories containing passage files.") 13 | argparser.add_argument("guessed", help="xml/pickle file name for the guessed annotation, or directory of files") 14 | argparser.add_argument("ref", help="xml/pickle file name for the reference annotation, or directory of files") 15 | argparser.add_argument("-u", "--units", action="store_true", 16 | help="the units the annotations have in common, and those each has separately") 17 | argparser.add_argument("-f", "--fscore", action="store_true", 18 | help="outputs the traditional P,R,F instead of the scene structure evaluation") 19 | argparser.add_argument("-e", "--errors", action="store_true", 20 | help="prints the error distribution according to its frequency") 21 | argparser.add_argument("--no-normalize", dest="normalize", action="store_false", 22 | help="do not normalize passages before evaluation") 23 | argparser.add_argument("--out-file", help="file to write results for each evaluated passage to, in CSV format") 24 | argparser.add_argument("--summary-file", help="file to write aggregated results to, in CSV format") 25 | group = argparser.add_mutually_exclusive_group() 26 | group.add_argument("-v", "--verbose", action="store_true", 27 | help="prints the results for every single pair (always true if there is only one pair)") 28 | group.add_argument("-q", "--quiet", action="store_true", help="do not print anything") 29 | constructions.add_argument(argparser) 30 | args = argparser.parse_args() 31 | 32 | if not (args.units or args.fscore or args.errors): 33 | argparser.error("At least one of -u, -f or -e is required.") 34 | 35 | guessed, ref = [ioutil.read_files_and_dirs((x,)) for x in (args.guessed, args.ref)] 36 | if len(guessed) != len(ref): 37 | raise ValueError("Number of passages to compare does not match: %d != %d" % (len(guessed), len(ref))) 38 | if len(guessed) > 1: 39 | guessed_by_id = {} 40 | for g in guessed: 41 | sys.stdout.write("\rReading %s..." 
% g.ID) 42 | sys.stdout.flush() 43 | guessed_by_id[g.ID] = g 44 | ids = [p.ID for p in ref] 45 | try: 46 | guessed = [guessed_by_id[i] for i in ids] 47 | except KeyError as e: 48 | raise ValueError("Passage IDs do not match") from e 49 | results = [] 50 | for g, r in zip(guessed, ref): 51 | if len(guessed) > 1: 52 | sys.stdout.write("\rEvaluating %s%s" % (g.ID, ":" if args.verbose else "...")) 53 | sys.stdout.flush() 54 | if args.verbose: 55 | print() 56 | result = evaluation.evaluate(g, r, constructions=args.constructions, units=args.units, fscore=args.fscore, 57 | errors=args.errors, verbose=args.verbose or len(guessed) == 1, 58 | normalize=args.normalize) 59 | if args.verbose: 60 | print("Average labeled F1 score: %.3f\n" % result.average_f1()) 61 | results.append(result) 62 | summary = evaluation.Scores.aggregate(results) 63 | if len(results) > 1: 64 | if args.verbose: 65 | print("Aggregated scores:") 66 | else: 67 | print(end="\r") 68 | if not args.quiet: 69 | summary.print() 70 | if not args.quiet: 71 | print("Average labeled F1 score: %.3f" % summary.average_f1()) 72 | args_constructions = summary.evaluators 73 | if args.out_file: 74 | with open(args.out_file, "w", encoding="utf-8") as f: 75 | print(",".join(summary.titles()), file=f) 76 | for result in results: 77 | print(",".join(result.fields()), file=f) 78 | if args.summary_file: 79 | with open(args.summary_file, "w", encoding="utf-8") as f: 80 | print(",".join(summary.titles()), file=f) 81 | print(",".join(summary.fields()), file=f) 82 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/find_constructions.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from ucca import constructions 4 | from ucca.ioutil import read_files_and_dirs 5 | 6 | if __name__ == "__main__": 7 | argparser = ArgumentParser(description="Extract linguistic constructions from UCCA corpus.") 8 | argparser.add_argument("passages", nargs="+", help="the corpus, given as xml/pickle file names") 9 | constructions.add_argument(argparser, False) 10 | argparser.add_argument("-v", "--verbose", action="store_true", help="print tagged text for each passage") 11 | args = argparser.parse_args() 12 | for passage in read_files_and_dirs(args.passages): 13 | if args.verbose: 14 | print("%s:" % passage.ID) 15 | extracted = constructions.extract_edges(passage, constructions=args.constructions, verbose=args.verbose) 16 | if any(extracted.values()): 17 | if not args.verbose: 18 | print("%s:" % passage.ID) 19 | for construction, edges in extracted.items(): 20 | if edges: 21 | print(" %s:" % construction.description) 22 | for edge in edges: 23 | print(" %s [%s %s]" % (edge, edge.tag, edge.child)) 24 | print() 25 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/join_passages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | 7 | from collections import defaultdict 8 | 9 | import ucca.convert 10 | from ucca.ioutil import passage2file, file2passage 11 | 12 | desc = """Parses XML/pickle files in UCCA standard format, and writes a single passage. 
13 | """ 14 | 15 | 16 | def main(): 17 | argparser = argparse.ArgumentParser(description=desc) 18 | argparser.add_argument("filenames", nargs="+", help="passage file names to join") 19 | argparser.add_argument("-o", "--outdir", default=".", help="output directory") 20 | argparser.add_argument("-p", "--prefix", default="", help="output filename prefix") 21 | argparser.add_argument("-r", "--remarks", action="store_true", help="annotate original IDs") 22 | argparser.add_argument("-b", "--binary", action="store_true", help="write in pickle binary format (.pickle)") 23 | argparser.add_argument("-j", "--join-by-prefix", action="store_true", 24 | help="join each set of passages whose IDs share all but the last 3 characters") 25 | args = argparser.parse_args() 26 | 27 | passages = [file2passage(filename) for pattern in args.filenames for filename in sorted(glob.glob(pattern))] 28 | if args.join_by_prefix: 29 | subsets = defaultdict(list) 30 | for passage in passages: 31 | subsets[passage.ID[:-3]].append(passage) 32 | else: 33 | subsets = {passages[0].ID: passages} 34 | for passage_id, subset in sorted(subsets.items()): 35 | sys.stderr.write("Joining passages " + ", ".join(passage.ID for passage in subset) + "\n") 36 | joined = ucca.convert.join_passages(passages, passage_id=passage_id, remarks=args.remarks) 37 | outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID, "pickle" if args.binary else "xml") 38 | sys.stderr.write("Writing joined passage file '%s'...\n" % outfile) 39 | passage2file(joined, outfile, args.binary) 40 | 41 | sys.exit(0) 42 | 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/join_sdp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import os 6 | import sys 7 | 8 | desc = """Combines several SDP parsed files to one. 
9 | """ 10 | 11 | 12 | def main(): 13 | argparser = argparse.ArgumentParser(description=desc) 14 | argparser.add_argument("filenames", nargs="+", 15 | help="SDP file names to join") 16 | argparser.add_argument("-o", "--outfile", 17 | help="output filename (standard output if unspecified)") 18 | argparser.add_argument("-H", "--header", default="SDP 2015", 19 | help="first line in the file, not including prefix") 20 | argparser.add_argument("-p", "--prefix", default="#", 21 | help="prefix for comment lines") 22 | args = argparser.parse_args() 23 | 24 | lines = [args.prefix + args.header + "\n"] 25 | for pattern in args.filenames: 26 | filenames = sorted(glob.glob(pattern)) 27 | if not filenames: 28 | raise IOError("Not found: " + pattern) 29 | for filename in filenames: 30 | base = os.path.basename(os.path.splitext(filename)[0]) 31 | lines.append(args.prefix + base + "\n") 32 | with open(filename, encoding="utf-8") as f: 33 | lines += f.readlines() 34 | f = sys.stdout if args.outfile is None else open(args.outfile, "w", encoding="utf-8") 35 | f.writelines(lines) 36 | if args.outfile is not None: 37 | f.close() 38 | 39 | sys.exit(0) 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/pickle_to_standard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import sys 5 | 6 | from ucca.ioutil import file2passage, passage2file 7 | 8 | desc = """Parses pickle files in UCCA standard format, and writes them in XML format. 9 | """ 10 | 11 | 12 | def main(): 13 | argparser = argparse.ArgumentParser(description=desc) 14 | argparser.add_argument('filenames', nargs='+', help="pickle file names to convert") 15 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 16 | args = argparser.parse_args() 17 | 18 | for filename in args.filenames: 19 | sys.stderr.write("Reading passage '%s'...\n" % filename) 20 | passage = file2passage(filename) 21 | basename = os.path.splitext(os.path.basename(filename))[0] 22 | outfile = args.outdir + os.path.sep + basename + ".xml" 23 | sys.stderr.write("Writing file '%s'...\n" % outfile) 24 | passage2file(passage, outfile) 25 | 26 | sys.exit(0) 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/site_to_standard.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python3 2 | 3 | import argparse 4 | import pickle 5 | import sqlite3 6 | import sys 7 | from xml.etree.ElementTree import ElementTree, tostring, fromstring 8 | 9 | import ucca.convert 10 | from ucca.textutil import indent_xml 11 | 12 | desc = """Parses an XML in UCCA site format. 13 | 14 | The input can be given as either an XML file or a DB file with passage ID 15 | and user name, and the output is either the standard format XML or 16 | a pickled object. 17 | Possible input methods are using a DB file with pid and user, which gets the 18 | annotation of the specified user for the specified passage from teh DB file, 19 | or using filename of a site-formatted XML file. 
20 | 21 | """ 22 | 23 | 24 | def site2passage(filename): 25 | """Opens a file and returns its parsed Passage object""" 26 | with open(filename, encoding="utf-8") as f: 27 | etree = ElementTree().parse(f) 28 | return ucca.convert.from_site(etree) 29 | 30 | 31 | def db2passage(handle, pid, user): 32 | """Gets the annotation of user to pid from the DB handle - returns a passage""" 33 | handle.execute("SELECT id FROM users WHERE username=?", (user,)) 34 | uid = handle.fetchone()[0] 35 | handle.execute("SELECT xml FROM xmls WHERE paid=? AND uid=? " + 36 | "ORDER BY ts DESC", (pid, uid)) 37 | raw_xml = handle.fetchone()[0] 38 | return ucca.convert.from_site(fromstring(raw_xml)) 39 | 40 | 41 | def main(): 42 | argparser = argparse.ArgumentParser(description=desc) 43 | argparser.add_argument("filename", nargs="?", help="XML file name to convert") 44 | argparser.add_argument("-o", "--outfile", help="output file for standard XML") 45 | argparser.add_argument("-b", "--binary", help="output file for binary pickel") 46 | argparser.add_argument("-d", "--db", help="DB file to get input from") 47 | argparser.add_argument("-p", "--pid", type=int, help="PassageID to query DB") 48 | argparser.add_argument("-u", "--user", help="Username to DB query") 49 | args = argparser.parse_args() 50 | 51 | # Checking for illegal combinations 52 | if args.db and args.filename: 53 | argparser.error("Only one source, XML or DB file, can be used") 54 | if (not args.db) and (not args.filename): 55 | argparser.error("Must specify one source, XML or DB file") 56 | if args.db and not (args.pid and args.user): 57 | argparser.error("Must specify a username and a passage ID when " + 58 | "using DB file option") 59 | if (args.pid or args.user) and not args.db: 60 | argparser.error("Cannot use user and passage ID options without DB file") 61 | 62 | if args.filename: 63 | passage = site2passage(args.filename) 64 | else: 65 | conn = sqlite3.connect(args.db) 66 | c = conn.cursor() 67 | passage = db2passage(c, args.pid, args.user) 68 | 69 | if args.binary: 70 | with open(args.binary, "wb") as binf: 71 | pickle.dump(passage, binf) 72 | else: 73 | root = ucca.convert.to_standard(passage) 74 | output = indent_xml(tostring(root).decode()) 75 | if args.outfile: 76 | with open(args.outfile, "w", encoding="utf-8") as outf: 77 | outf.write(output) 78 | else: 79 | print(output) 80 | 81 | sys.exit(0) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/split_corpus.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from posix import mkdir 3 | 4 | import os 5 | import re 6 | from shutil import copyfile 7 | 8 | desc = """Split a directory of files into "train", "dev" and "test" directories. 9 | All files not in either "train" or "dev" will go into "test". 
10 | """ 11 | TRAIN_DEFAULT = 300 12 | DEV_DEFAULT = 34 13 | # TEST on all the rest 14 | 15 | 16 | def copy(src, dest, link=False): 17 | if link: 18 | try: 19 | os.symlink(src, dest) 20 | except (NotImplementedError, OSError): 21 | copyfile(src, dest) 22 | else: 23 | copyfile(src, dest) 24 | 25 | 26 | def numeric(s): 27 | try: 28 | return int(re.findall("([0-9]+)", s)[-1]) 29 | except (ValueError, IndexError): 30 | return s 31 | 32 | 33 | def not_split_dir(filename): 34 | return filename not in ("train", "dev", "test") 35 | 36 | 37 | def split_passages(directory, train, dev, link, quiet=False): 38 | filenames = sorted(filter(not_split_dir, os.listdir(directory)), key=numeric) 39 | assert filenames, "No files to split" 40 | assert train + dev <= len(filenames), "Not enough files to split: %d+%d>%d" % (train, dev, len(filenames)) 41 | directory = os.path.abspath(directory) 42 | if not directory.endswith(os.sep): 43 | directory += os.sep 44 | for subdirectory in "train", "dev", "test": 45 | if not os.path.exists(directory + subdirectory): 46 | mkdir(directory + subdirectory) 47 | print("%d files to split: %d/%d/%d" % (len(filenames), train, dev, len(filenames) - train - dev)) 48 | print_format = "Creating link in %s to: " if link else "Copying to %s: " 49 | if not quiet: 50 | print(print_format % "train", end="", flush=True) 51 | for f in filenames[:train]: 52 | copy(directory + f, directory + "train" + os.sep + f, link) 53 | if not quiet: 54 | print(f, end=" ", flush=True) 55 | if not quiet: 56 | print() 57 | print(print_format % "dev", end="", flush=True) 58 | for f in filenames[train:train + dev]: 59 | copy(directory + f, directory + "dev" + os.sep + f, link) 60 | if not quiet: 61 | print(f, end=" ", flush=True) 62 | if not quiet: 63 | print() 64 | print(print_format % "test", end="", flush=True) 65 | for f in filenames[train + dev:]: 66 | copy(directory + f, directory + "test" + os.sep + f, link) 67 | if not quiet: 68 | print(f, end=" ", flush=True) 69 | if not quiet: 70 | print() 71 | 72 | if __name__ == "__main__": 73 | argparser = argparse.ArgumentParser(description=desc) 74 | argparser.add_argument("directory", default=".", nargs="?", help="directory to split (default: current directory)") 75 | argparser.add_argument("-t", "--train", type=int, default=TRAIN_DEFAULT, 76 | help="size of train split (default: %d)" % TRAIN_DEFAULT) 77 | argparser.add_argument("-d", "--dev", type=int, default=DEV_DEFAULT, 78 | help="size of dev split (default: %d)" % DEV_DEFAULT) 79 | argparser.add_argument("-l", "--link", action="store_true", help="create symbolic link instead of copying") 80 | argparser.add_argument("-q", "--quiet", action="store_true", help="less output") 81 | args = argparser.parse_args() 82 | 83 | split_passages(args.directory, args.train, args.dev, link=args.link, quiet=args.quiet) 84 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/standard_to_pickle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import sys 5 | 6 | from ucca.ioutil import file2passage, passage2file 7 | 8 | desc = """Parses an XML in UCCA standard format, and writes them in binary Pickle format. 
9 | """ 10 | 11 | 12 | def main(): 13 | argparser = argparse.ArgumentParser(description=desc) 14 | argparser.add_argument('filenames', nargs='+', help="XML file names to convert") 15 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 16 | args = argparser.parse_args() 17 | 18 | for filename in args.filenames: 19 | sys.stderr.write("Reading passage '%s'...\n" % filename) 20 | passage = file2passage(filename) 21 | basename = os.path.splitext(os.path.basename(filename))[0] 22 | outfile = args.outdir + os.path.sep + basename + ".pickle" 23 | sys.stderr.write("Writing file '%s'...\n" % outfile) 24 | passage2file(passage, outfile, binary=True) 25 | 26 | sys.exit(0) 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/standard_to_sentences.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | 6 | import ucca.convert 7 | from ucca.ioutil import file2passage, passage2file 8 | 9 | desc = """Parses an XML in UCCA standard format, and writes a passage per sentence. 10 | """ 11 | 12 | 13 | def main(): 14 | argparser = argparse.ArgumentParser(description=desc) 15 | argparser.add_argument('filenames', nargs='+', help="passage file names to convert") 16 | argparser.add_argument('-o', '--outdir', default='.', help="output directory") 17 | argparser.add_argument('-p', '--prefix', default='', help="output filename prefix") 18 | argparser.add_argument('-r', '--remarks', action='store_true', help="annotate original IDs") 19 | argparser.add_argument("-b", "--binary", action="store_true", 20 | help="write in pickle binary format (.pickle)") 21 | args = argparser.parse_args() 22 | 23 | for filename in args.filenames: 24 | passage = file2passage(filename) 25 | sentences = ucca.convert.split2sentences(passage, remarks=args.remarks) 26 | for i, sentence in enumerate(sentences): 27 | outfile = "%s/%s.%s" % (args.outdir, args.prefix + sentence.ID, 28 | "pickle" if args.binary else "xml") 29 | sys.stderr.write("Writing passage file for sentence '%s'...\n" % outfile) 30 | passage2file(sentence, outfile, args.binary) 31 | 32 | sys.exit(0) 33 | 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import glob 5 | import sys 6 | 7 | import numpy as np 8 | 9 | from ucca import layer0, layer1 10 | from ucca.ioutil import file2passage 11 | from ucca.layer1 import NodeTags 12 | from ucca.textutil import break2sentences 13 | 14 | desc = """Prints statistics on UCCA passages 15 | """ 16 | 17 | 18 | def main(): 19 | argparser = argparse.ArgumentParser(description=desc) 20 | argparser.add_argument("filenames", nargs="+", help="files to process") 21 | argparser.add_argument("-o", "--outfile", help="output file for data") 22 | args = argparser.parse_args() 23 | 24 | print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont," 25 | "edges,primary,remote,linkage,parents,children,mult-parents") 26 | data = [] 27 | for pattern in args.filenames: 28 | for filename in glob.glob(pattern): 29 | passage = file2passage(filename) 30 | terminals = passage.layer(layer0.LAYER_ID).all 31 | non_terminals = [n for n 
in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"] 32 | non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage] 33 | linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages 34 | edges = {e for n in non_terminals for e in n} 35 | remote = [e for e in edges if e.attrib.get("remote")] 36 | linkage_edges = [e for n in linkage_nodes for e in n] 37 | fields = (int(passage.ID), 38 | 1, 39 | len({t.paragraph for t in terminals}), 40 | len(break2sentences(passage)), 41 | len(terminals) + len(non_terminals), 42 | len(terminals), 43 | len(non_terminals), 44 | len([n for n in non_linkage if n.attrib.get("implicit")]), 45 | len(linkage_nodes), 46 | len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]), 47 | len(edges), 48 | len(edges) - len(remote) - len(linkage_edges), 49 | len(remote), 50 | len(linkage_edges), 51 | sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage), 52 | sum(len(n.children) for n in non_linkage), 53 | len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]), 54 | ) 55 | print(",".join("%d" % f for f in fields)) 56 | data.append(fields) 57 | data = np.array(data, dtype=int) 58 | if args.outfile: 59 | np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t") 60 | 61 | sys.exit(0) 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/unique_roles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import sys 6 | from collections import Counter 7 | 8 | from ucca import layer1 9 | from ucca.ioutil import file2passage 10 | 11 | desc = """Finds edge tags that are empirically always unique: occur at most once in edges per node 12 | """ 13 | 14 | 15 | def main(): 16 | argparser = argparse.ArgumentParser(description=desc) 17 | argparser.add_argument('-d', '--directory', required=True, help="directory with passage files to process") 18 | argparser.add_argument('-o', '--outfile', default="data/unique_roles.txt", help="output file for data") 19 | argparser.add_argument('-D', '--direction', default="out", help="direction of edges to check (out|in)") 20 | args = argparser.parse_args() 21 | 22 | out = args.direction == "out" 23 | if not os.path.isdir(args.directory): 24 | raise Exception("Not a directory: " + args.directory) 25 | roles = set(tag for name, tag in layer1.EdgeTags.__dict__.items() 26 | if isinstance(tag, str) and not name.startswith('__')) 27 | for filename in os.listdir(args.directory): 28 | sys.stderr.write("Reading passage '%s'...\n" % filename) 29 | passage = file2passage(args.directory + os.path.sep + filename) 30 | for node in passage.layer(layer1.LAYER_ID).all: 31 | counts = Counter(edge.tag for edge in (node if out else node.incoming)) 32 | roles.difference_update(tag for tag, count in counts.items() if count > 1) 33 | 34 | lines = "\n".join(sorted(roles)) 35 | print(lines) 36 | if args.outfile: 37 | with open(args.outfile, "w", encoding="utf-8") as f: 38 | print(lines, file=f) 39 | 40 | sys.exit(0) 41 | 42 | 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /14-semparsing/ucca/scripts/visualize.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | import matplotlib.pyplot as plt 4 | 5 | from 
ucca import visualization 6 | from ucca.ioutil import read_files_and_dirs 7 | 8 | if __name__ == "__main__": 9 | argparser = ArgumentParser(description="Visualize the given passages as graphs.") 10 | argparser.add_argument("passages", nargs="+", help="UCCA passages, given as xml/pickle file names") 11 | args = argparser.parse_args() 12 | for passage in read_files_and_dirs(args.passages): 13 | visualization.draw(passage) 14 | plt.show() 15 | -------------------------------------------------------------------------------- /14-semparsing/ucca/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /14-semparsing/ucca/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | 5 | try: 6 | import pypandoc 7 | try: 8 | pypandoc.convert_file("README.md", "rst", outputfile="README.rst") 9 | except (IOError, ImportError, RuntimeError): 10 | pass 11 | long_description = pypandoc.convert_file("README.md", "rst") 12 | except (IOError, ImportError, RuntimeError): 13 | long_description = "" 14 | 15 | 16 | setup(name="UCCA", 17 | version="1.0.11", 18 | install_requires=["spacy", "requests"], 19 | extras_require={"visualize": ["matplotlib", "networkx"]}, 20 | description="Universal Conceptual Cognitive Annotation", 21 | long_description=long_description, 22 | author="Daniel Hershcovich", 23 | author_email="danielh@cs.huji.ac.il", 24 | url="https://github.com/huji-nlp/ucca", 25 | classifiers=[ 26 | "Development Status :: 4 - Beta", 27 | "Intended Audience :: Science/Research", 28 | "Programming Language :: Python :: 3.6", 29 | "Topic :: Text Processing :: Linguistic", 30 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", 31 | ], 32 | packages=find_packages(), 33 | ) 34 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/site1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 1 9 | 10 | 11 | 2 12 | 13 | 14 | 3 15 | 16 | 17 | 4 18 | 19 | 20 | . 21 | 22 | 23 | 24 | 25 | 6 26 | 27 | 28 | 7 29 | 30 | 31 | 8 32 | 33 | 34 | 9 35 | 36 | 37 | 10 38 | 39 | 40 | . 41 | 42 | 43 | 44 | 45 | 12 46 | 47 | 48 | 13 49 | 50 | 51 | 14 52 | 53 | 54 | 15 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/site2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 1 10 | 11 | 12 | 13 | 14 | 2 15 | 16 | 17 | 18 | 19 | 20 | 3 21 | 22 | 23 | 4 24 | 25 | 26 | . 27 | 28 | 29 | 30 | 31 | 32 | 6 33 | 34 | 35 | 7 36 | 37 | 38 | 8 39 | 40 | 41 | 9 42 | 43 | 44 | 10 45 | 46 | 47 | . 48 | 49 | 50 | 51 | 52 | 12 53 | 54 | 55 | 13 56 | 57 | 58 | 14 59 | 60 | 61 | 15 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/site3.xml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 1 13 | 14 | 15 | 16 | 17 | 2 18 | 19 | 20 | 21 | 22 | 23 | 3 24 | 25 | 26 | 4 27 | 28 | 29 | . 
30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 6 40 | 41 | 42 | 43 | 44 | 7 45 | 46 | 47 | 48 | 49 | 50 | 8 51 | 52 | 53 | 54 | 55 | 56 | 9 57 | 58 | 59 | 60 | 61 | 62 | 63 | 10 64 | 65 | 66 | 67 | . 68 | 69 | 70 | 71 | 72 | 73 | 12 74 | 75 | 76 | 77 | 78 | 13 79 | 80 | 81 | 82 | 83 | 14 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 15 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/standard3.conll: -------------------------------------------------------------------------------- 1 | # sent_id = 120 2 | 1 1 _ Word Word _ 3 L _ _ 3 | 2 2 _ Word Word _ 1 E _ _ 4 | 3 3 _ Word Word _ 0 ROOT _ _ 5 | 4 4 _ Word Word _ 3 Terminal _ _ 6 | 5 . _ Punctuation Punctuation _ 3 U _ _ 7 | 6 6 _ Word Word _ 7 E _ _ 8 | 7 7 _ Word Word _ 8 A _ _ 9 | 8 8 _ Word Word _ 3 H _ _ 10 | 9 9 _ Word Word _ 7 Terminal _ _ 11 | 10 10 _ Word Word _ 3 F _ _ 12 | 11 . _ Punctuation Punctuation _ 3 U _ _ 13 | 12 12 _ Word Word _ 3 H _ _ 14 | 13 13 _ Word Word _ 3 H _ _ 15 | 14 14 _ Word Word _ 3 H _ _ 16 | 15 15 _ Word Word _ 3 L _ _ 17 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/standard3.export: -------------------------------------------------------------------------------- 1 | #BOS 120 2 | 1 Word -- Terminal 500 3 | 2 Word -- Terminal 501 4 | 3 Word -- Terminal 513 5 | 4 Word -- Terminal 513 6 | . Punctuation -- Terminal 502 7 | 6 Word -- Terminal 503 8 | 7 Word -- Terminal 504 9 | 8 Word -- Terminal 505 10 | 9 Word -- Terminal 504 11 | 10 Word -- Terminal 506 12 | . Punctuation -- Terminal 507 13 | 12 Word -- Terminal 508 14 | 13 Word -- Terminal 509 15 | 14 Word -- Terminal 510 16 | 15 Word -- Terminal 511 17 | #500 FN -- C 512 18 | #501 FN -- E 512 19 | #502 PNCT -- U 513 20 | #503 FN -- E 514 21 | #504 FN -- C 514 22 | #505 FN -- P 515 23 | #506 FN -- F 518 D* 515 24 | #507 PNCT -- U 518 25 | #508 FN -- H 518 LA 519 26 | #509 FN -- H 518 LA 519 27 | #510 FN -- H 518 LA 519 28 | #511 FN -- E 517 29 | #512 FN -- L 518 30 | #513 FN -- H 518 31 | #514 FN -- A 515 32 | #515 FN -- H 518 33 | #516 FN -- C 517 34 | #517 FN -- L 518 LR 519 35 | #518 FN -- -- 0 36 | #519 LKG -- -- 0 37 | #EOS 120 -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/standard3.sdp: -------------------------------------------------------------------------------- 1 | 1 1 _ Word - + _ _ L _ _ 2 | 2 2 _ Word - - _ E _ _ _ 3 | 3 3 _ Word - + _ _ _ _ _ 4 | 4 4 _ Word - - _ _ Terminal _ _ 5 | 5 . _ Punctuation - - _ _ U _ _ 6 | 6 6 _ Word - - _ _ _ E _ 7 | 7 7 _ Word - + _ _ _ _ A 8 | 8 8 _ Word - + _ _ H _ _ 9 | 9 9 _ Word - - _ _ _ Terminal _ 10 | 10 10 _ Word - - _ _ F _ D* 11 | 11 . _ Punctuation - - _ _ U _ _ 12 | 12 12 _ Word - - _ _ H _ _ 13 | 13 13 _ Word - - _ _ H _ _ 14 | 14 14 _ Word - - _ _ H _ _ 15 | 15 15 _ Word - - _ _ L _ _ 16 | -------------------------------------------------------------------------------- /14-semparsing/ucca/test_files/standard3.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/README.md: -------------------------------------------------------------------------------- 1 | `ucca` package 2 | ==================== 3 | 4 | List of Modules 5 | --------------- 6 | 1. 
`constructions` -- provides methods for extracting linguistic constructions from text 7 | 1. `convert` -- provides functions to convert between the UCCA objects (Pythonic) 8 | and site annotation XML, standard XML representation and text 9 | 1. `core` -- provides the basic objects of UCCA relations: `Node`, `Edge`, `Layer` 10 | and `Passage`, which are the basic items to work with 11 | 1. `evaluation` -- provides methods for comparing passages and inspecting the differences 12 | 1. `layer0` -- provides the text layer (layer 0) objects: `Layer0` and `Terminal` 13 | 1. `layer1` -- provides the foundational layer objects: `Layer1`, `FoundationalNode`, 14 | `PunctNode` and `Linkage` 15 | 1. `textutil` -- provides text processing utilities 16 | 17 | In addition, a `tests` package is present, enabling unit-testing. 18 | 19 | Authors 20 | ------ 21 | * Amit Beka: amit.beka@gmail.com 22 | * Daniel Hershcovich: danielh@cs.huji.ac.il -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/ucca/__init__.py -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/diffutil.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from ucca.ioutil import passage2file 4 | 5 | 6 | def diff_passages(true_passage, pred_passage): 7 | """ 8 | Debug method to print missing or mistaken attributes, nodes and edges 9 | """ 10 | lines = list() 11 | if not true_passage._attrib.equals(pred_passage._attrib): 12 | lines.append("Passage attributes mismatch: %s, %s" % 13 | (true_passage._attrib, pred_passage._attrib)) 14 | try: 15 | for lid, l1 in true_passage._layers.items(): 16 | l2 = pred_passage.layer(lid) # compare against the corresponding layer of the predicted passage 17 | if not l1._attrib.equals(l2._attrib): 18 | lines.append("Layer %d attributes mismatch: %s, %s" % 19 | (lid, l1._attrib, l2._attrib)) 20 | except KeyError: # no layer with same ID found 21 | lines.append("Missing layer: %s, %s" % 22 | (true_passage._layers, pred_passage._layers)) 23 | pred_ids = {node.extra["remarks"]: node 24 | for node in pred_passage.missing_nodes(true_passage)} 25 | true_ids = {node.ID: node 26 | for node in true_passage.missing_nodes(pred_passage)} 27 | for pred_id, pred_node in list(pred_ids.items()): 28 | true_node = true_ids.get(pred_id) 29 | if true_node: 30 | pred_ids.pop(pred_id) 31 | true_ids.pop(pred_id) 32 | pred_edges = {edge.tag + "->" + edge.child.ID: edge for edge in 33 | pred_node.missing_edges(true_node)} 34 | true_edges = {edge.tag + "->" + edge.child.ID: edge for edge in 35 | true_node.missing_edges(pred_node)} 36 | intersection = set(pred_edges).intersection(set(true_edges)) 37 | pred_edges = {s: edge for s, edge in pred_edges.items() if s not in intersection} 38 | true_edges = {s: edge for s, edge in true_edges.items() if s not in intersection} 39 | 40 | node_lines = [] 41 | if not pred_node._attrib.equals(true_node._attrib): 42 | node_lines.append(" Attributes mismatch: %s, %s" % 43 | (sorted(true_node._attrib.items()), sorted(pred_node._attrib.items()))) 44 | if pred_edges: 45 | node_lines.append(" Mistake edges: %s" % ", ".join(pred_edges)) 46 | if true_edges: 47 | node_lines.append(" Missing edges: %s" % ", ".join(true_edges)) 48 | if node_lines: 49 | lines.append("For node " + pred_id + ":") 50 |
lines.extend(node_lines) 51 | if pred_ids: 52 | lines.append("Mistake nodes: %s" % ", ".join(pred_ids)) 53 | if true_ids: 54 | lines.append("Missing nodes: %s" % ", ".join(true_ids)) 55 | if lines: 56 | outfile = "%s.xml" % true_passage.ID 57 | sys.stderr.write("Writing passage '%s'...\n" % outfile) 58 | passage2file(true_passage, outfile) 59 | outfile = "%s_pred.xml" % pred_passage.ID 60 | sys.stderr.write("Writing passage '%s'...\n" % outfile) 61 | passage2file(pred_passage, outfile) 62 | return "\n" + "\n".join(lines) 63 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/ioutil.py: -------------------------------------------------------------------------------- 1 | """Input/output utility functions for UCCA scripts.""" 2 | import os 3 | import sys 4 | import time 5 | from collections import defaultdict 6 | from xml.etree.ElementTree import ParseError 7 | 8 | from ucca.convert import file2passage, passage2file, from_text, to_text, split2segments 9 | from ucca.core import Passage 10 | 11 | 12 | class LazyLoadedPassages(object): 13 | """ 14 | Iterable interface to Passage objects that loads files on-the-go and can be iterated more than once 15 | """ 16 | def __init__(self, files, sentences=False, paragraphs=False, converters=None): 17 | self.files = files 18 | self.sentences = sentences 19 | self.paragraphs = paragraphs 20 | self.split = self.sentences or self.paragraphs 21 | self.converters = defaultdict(lambda: from_text) if converters is None else converters 22 | self._files_iter = None 23 | self._split_iter = None 24 | self._file_handle = None 25 | self._next_index = None 26 | 27 | def __iter__(self): 28 | self._next_index = 0 29 | self._files_iter = iter(self.files) 30 | self._split_iter = None 31 | self._file_handle = None 32 | return self 33 | 34 | def __next__(self): 35 | passage = self._next_passage() 36 | self._next_index += 1 37 | return passage 38 | 39 | def _next_passage(self): 40 | passage = None 41 | if self._split_iter is None: 42 | try: 43 | file = next(self._files_iter) 44 | except StopIteration: # Finished iteration 45 | raise 46 | if isinstance(file, Passage): # Not really a file, but a Passage 47 | passage = file 48 | else: # A file 49 | attempts = 3 50 | while not os.path.exists(file): 51 | if attempts == 0: 52 | print("File not found: %s" % file, file=sys.stderr) 53 | return next(self) 54 | print("Failed reading %s, trying %d more times..." 
% (file, attempts), file=sys.stderr) 55 | time.sleep(5) 56 | attempts -= 1 57 | try: 58 | passage = file2passage(file) # XML or binary format 59 | except (IOError, ParseError): # Failed to read as passage file 60 | base, ext = os.path.splitext(os.path.basename(file)) 61 | converter = self.converters[ext.lstrip(".")] 62 | self._file_handle = open(file, encoding="utf-8") 63 | self._split_iter = iter(converter(self._file_handle, passage_id=base)) 64 | if self.split: 65 | if self._split_iter is None: 66 | self._split_iter = (passage,) 67 | self._split_iter = iter(s for p in self._split_iter for s in 68 | split2segments(p, is_sentences=self.sentences)) 69 | if self._split_iter is not None: # Either set before or initialized now 70 | try: 71 | # noinspection PyTypeChecker 72 | passage = next(self._split_iter) 73 | except StopIteration: # Finished this converter 74 | self._split_iter = None 75 | if self._file_handle is not None: 76 | self._file_handle.close() 77 | self._file_handle = None 78 | return next(self) 79 | return passage 80 | 81 | # The following three methods are implemented to support shuffle; 82 | # note files are shuffled but there is no shuffling within files, as it would not be efficient. 83 | # Note also the inconsistency because these access the files while __iter__ accesses individual passages. 84 | def __len__(self): 85 | return len(self.files) 86 | 87 | def __getitem__(self, i): 88 | return self.files[i] 89 | 90 | def __setitem__(self, i, value): 91 | self.files[i] = value 92 | 93 | def __bool__(self): 94 | return bool(self.files) 95 | 96 | 97 | def read_files_and_dirs(files_and_dirs, sentences=False, paragraphs=False, converters=None): 98 | """ 99 | :param files_and_dirs: iterable of files and/or directories to look in 100 | :param sentences: whether to split to sentences 101 | :param paragraphs: whether to split to paragraphs 102 | :param converters: dict of input format converters to use based on the file extension 103 | :return: list of (lazy-loaded) passages from all files given, 104 | plus any files directly under any directory given 105 | """ 106 | files = list(files_and_dirs) 107 | files += [os.path.join(d, f) for d in files if os.path.isdir(d) for f in os.listdir(d)] 108 | files = [f for f in files if not os.path.isdir(f)] 109 | return LazyLoadedPassages(files, sentences, paragraphs, converters) 110 | 111 | 112 | def write_passage(passage, output_format, binary, outdir, prefix, converter=None): 113 | suffix = output_format if output_format and output_format != "ucca" else ("pickle" if binary else "xml") 114 | outfile = outdir + os.path.sep + prefix + passage.ID + "." + suffix 115 | print("Writing passage '%s'..." 
% outfile) 116 | if output_format is None or output_format in ("ucca", "pickle", "xml"): 117 | passage2file(passage, outfile, binary=binary) 118 | else: 119 | output = "\n".join(line for line in (converter or to_text)(passage)) 120 | with open(outfile, "w", encoding="utf-8") as f: 121 | f.write(output + "\n") 122 | return outfile 123 | -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/ucca/tests/__init__.py -------------------------------------------------------------------------------- /14-semparsing/ucca/ucca/visualization.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import warnings 3 | from collections import defaultdict 4 | 5 | import matplotlib.cbook 6 | import networkx as nx 7 | 8 | from ucca import layer0, layer1 9 | from ucca.layer1 import Linkage 10 | 11 | warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation) 12 | warnings.filterwarnings("ignore", category=UserWarning) 13 | 14 | 15 | def draw(passage): 16 | G = nx.DiGraph() 17 | terminals = sorted(passage.layer(layer0.LAYER_ID).all, key=operator.attrgetter("position")) 18 | G.add_nodes_from([(n.ID, {"label": n.text, "node_color": "white"}) for n in terminals]) 19 | G.add_nodes_from([(n.ID, {"label": "IMPLICIT" if n.attrib.get("implicit") else "", 20 | "node_color": "gray" if isinstance(n, Linkage) else ( 21 | "white" if n.attrib.get("implicit") else "black")}) 22 | for n in passage.layer(layer1.LAYER_ID).all]) 23 | G.add_edges_from([(n.ID, e.child.ID, {"label": e.tag, "style": "dashed" if e.attrib.get("remote") else "solid"}) 24 | for layer in passage.layers for n in layer.all for e in n]) 25 | pos = topological_layout(passage) 26 | nx.draw(G, pos, arrows=False, font_size=10, 27 | node_color=[d["node_color"] for _, d in G.nodes(data=True)], 28 | labels={n: d["label"] for n, d in G.nodes(data=True) if d["label"]}, 29 | style=[d["style"] for _, _, d in G.edges(data=True)]) 30 | nx.draw_networkx_edge_labels(G, pos, font_size=8, 31 | edge_labels={(u, v): d["label"] for u, v, d in G.edges(data=True)}) 32 | 33 | 34 | def topological_layout(passage): 35 | visited = defaultdict(set) 36 | pos = {} 37 | implicit_offset = 1 + max((n.position for n in passage.layer(layer0.LAYER_ID).all), default=-1) 38 | remaining = [n for layer in passage.layers for n in layer.all if not n.parents] 39 | while remaining: 40 | node = remaining.pop() 41 | if node.ID in pos: # done already 42 | continue 43 | if node.children: 44 | children = [c for c in node.children if c.ID not in pos and c not in visited[node.ID]] 45 | if children: 46 | visited[node.ID].update(children) # to avoid cycles 47 | remaining += [node] + children 48 | continue 49 | xs, ys = zip(*(pos[c.ID] for c in node.children)) 50 | pos[node.ID] = (sum(xs) / len(xs), 1 + max(ys)) # done with children 51 | elif node.layer.ID == layer0.LAYER_ID: # terminal 52 | pos[node.ID] = (int(node.position), 0) 53 | else: # implicit 54 | pos[node.ID] = (implicit_offset, 0) 55 | implicit_offset += 1 56 | return pos 57 | -------------------------------------------------------------------------------- /14-semparsing/ucca/uccaapp/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bastings/nn4nlp2017-code-pytorch/189751e9a6a59f2ff24cbb310126ba9032079748/14-semparsing/ucca/uccaapp/__init__.py -------------------------------------------------------------------------------- /14-semparsing/ucca/uccaapp/convert_and_evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from glob import glob 4 | 5 | from requests.exceptions import HTTPError 6 | 7 | from ucca.evaluation import evaluate, Scores 8 | from ucca.ioutil import read_files_and_dirs 9 | from uccaapp.download_task import TaskDownloader 10 | from uccaapp.upload_task import TaskUploader 11 | 12 | try: 13 | from simplejson.scanner import JSONDecodeError 14 | except ImportError: 15 | from json.decoder import JSONDecodeError 16 | 17 | desc = """Convert a passage file to JSON format and upload to UCCA-App as a completed task, 18 | then download task from UCCA-App and convert to a passage in standard format again, 19 | then evaluate the result against the original""" 20 | 21 | 22 | def main(filenames, write, **kwargs): 23 | uploader = TaskUploader(**kwargs) 24 | downloader = TaskDownloader(**kwargs) 25 | scores = [] 26 | try: 27 | for pattern in filenames: 28 | filenames = glob(pattern) 29 | if not filenames: 30 | raise IOError("Not found: " + pattern) 31 | for ref in read_files_and_dirs(filenames): 32 | print("Converting passage " + ref.ID + "... ", end="") 33 | task = uploader.upload_task(ref) 34 | guessed = downloader.download_task(task["id"], write=write, **kwargs) 35 | score = evaluate(guessed, ref, **kwargs) 36 | print("F1=%.3f" % score.average_f1()) 37 | scores.append(score) 38 | except HTTPError as e: 39 | try: 40 | raise ValueError(e.response.json()) from e 41 | except JSONDecodeError: 42 | raise ValueError(e.response.text) from e 43 | print() 44 | if len(scores) > 1: 45 | print("Aggregated scores:") 46 | Scores.aggregate(scores).print() 47 | 48 | 49 | if __name__ == "__main__": 50 | argument_parser = argparse.ArgumentParser(description=desc) 51 | TaskUploader.add_arguments(argument_parser) 52 | argument_parser.add_argument("--write", action="store_true", help="Write converted passage to file") 53 | TaskDownloader.add_write_arguments(argument_parser) 54 | main(**vars(argument_parser.parse_args())) 55 | sys.exit(0) 56 | -------------------------------------------------------------------------------- /14-semparsing/ucca/uccaapp/download_task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import sys 4 | 5 | from ucca.convert import from_json, CONVERTERS, TO_FORMAT 6 | from ucca.ioutil import write_passage 7 | from uccaapp.api import ServerAccessor 8 | 9 | desc = """Download task from UCCA-App and convert to a passage in standard format""" 10 | 11 | 12 | class TaskDownloader(ServerAccessor): 13 | def download_tasks(self, task_ids, **kwargs): 14 | for task_id in task_ids: 15 | yield self.download_task(task_id, **kwargs) 16 | 17 | def download_task(self, task_id, write=True, out_format=None, binary=None, out_dir=None, prefix=None, **kwargs): 18 | del kwargs 19 | passage = from_json(self.get_user_task(task_id), all_categories=self.layer["categories"]) 20 | if write: 21 | write_passage(passage, out_format, binary, out_dir, prefix, TO_FORMAT.get(out_format)) 22 | return passage 23 | 24 | @staticmethod 25 | def add_arguments(argparser): 26 | argparser.add_argument("task_ids", nargs="+", type=int, help="IDs of tasks to download 
and convert") 27 | TaskDownloader.add_write_arguments(argparser) 28 | ServerAccessor.add_arguments(argparser) 29 | 30 | @staticmethod 31 | def add_write_arguments(argparser): 32 | argparser.add_argument("-f", "--out-format", choices=CONVERTERS, help="output file format (default: UCCA)") 33 | argparser.add_argument("-o", "--out-dir", default=".", help="output directory") 34 | argparser.add_argument("-p", "--prefix", default="", help="output filename prefix") 35 | argparser.add_argument("-b", "--binary", action="store_true", help="write in binary format (.pickle)") 36 | 37 | 38 | def main(**kwargs): 39 | list(TaskDownloader(**kwargs).download_tasks(**kwargs)) 40 | 41 | 42 | if __name__ == "__main__": 43 | argument_parser = argparse.ArgumentParser(description=desc) 44 | TaskDownloader.add_arguments(argument_parser) 45 | main(**vars(argument_parser.parse_args())) 46 | sys.exit(0) 47 | -------------------------------------------------------------------------------- /14-semparsing/ucca/uccaapp/upload_task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import sys 5 | from glob import glob 6 | 7 | from requests.exceptions import HTTPError 8 | 9 | from ucca.convert import to_json, to_text 10 | from ucca.ioutil import read_files_and_dirs 11 | from uccaapp.api import ServerAccessor 12 | 13 | try: 14 | from simplejson.scanner import JSONDecodeError 15 | except ImportError: 16 | from json.decoder import JSONDecodeError 17 | 18 | desc = """Convert a passage file to JSON format and upload to UCCA-App as a completed task""" 19 | 20 | # https://github.com/omriabnd/UCCA-App/blob/master/UCCAApp_REST_API_Reference.pdf 21 | # ucca-demo.cs.huji.ac.il or ucca.staging.cs.huji.ac.il 22 | # upload the parse as a (completed) task: 23 | # 0. decide which project and user you want to assign it to 24 | # 1. POST passage (easy format) 25 | # 2. POST task x (of type tokenization) 26 | # 3. PUT task x (submit) 27 | # 4. POST task y (of type annotation with parent x; this is the more complicated format) 28 | # 5. 
PUT task y (submit) 29 | 30 | USER_ID_ENV_VAR = "UCCA_APP_USER_ID" 31 | 32 | 33 | class TaskUploader(ServerAccessor): 34 | def __init__(self, user_id, **kwargs): 35 | super().__init__(**kwargs) 36 | self.user = dict(id=user_id or int(os.environ[USER_ID_ENV_VAR])) 37 | 38 | def upload_tasks(self, filenames, **kwargs): 39 | del kwargs 40 | try: 41 | for pattern in filenames: 42 | filenames = glob(pattern) 43 | if not filenames: 44 | raise IOError("Not found: " + pattern) 45 | for passage in read_files_and_dirs(filenames): 46 | task = self.upload_task(passage) 47 | print("Submitted task %d" % task["id"]) 48 | yield task 49 | except HTTPError as e: 50 | try: 51 | raise ValueError(e.response.json()) from e 52 | except JSONDecodeError: 53 | raise ValueError(e.response.text) from e 54 | 55 | def upload_task(self, passage): 56 | passage_out = self.create_passage(text=to_text(passage, sentences=False)[0], type="PUBLIC", source=self.source) 57 | task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project, user=self.user, 58 | passage=passage_out, manager_comment=passage.ID, user_comment=passage.ID, parent=None, 59 | is_demo=False, is_active=True) 60 | tok_task_out = self.create_tokenization_task(**task_in) 61 | tok_user_task_in = dict(tok_task_out) 62 | tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True)) 63 | tok_user_task_out = self.submit_tokenization_task(**tok_user_task_in) 64 | task_in.update(parent=tok_task_out, type="ANNOTATION") 65 | ann_user_task_in = self.create_annotation_task(**task_in) 66 | ann_user_task_in.update( 67 | to_json(passage, return_dict=True, tok_task=tok_user_task_out, all_categories=self.layer["categories"])) 68 | return self.submit_annotation_task(**ann_user_task_in) 69 | 70 | @staticmethod 71 | def add_arguments(argparser): 72 | argparser.add_argument("filenames", nargs="+", help="passage file names to convert and upload") 73 | argparser.add_argument("--user-id", type=int, help="user id, otherwise set by " + USER_ID_ENV_VAR) 74 | ServerAccessor.add_arguments(argparser) 75 | 76 | 77 | def main(**kwargs): 78 | list(TaskUploader(**kwargs).upload_tasks(**kwargs)) 79 | 80 | 81 | if __name__ == "__main__": 82 | argument_parser = argparse.ArgumentParser(description=desc) 83 | TaskUploader.add_arguments(argument_parser) 84 | main(**vars(argument_parser.parse_args())) 85 | sys.exit(0) 86 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright 2017 Graham Neubig 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural Networks for NLP Code Examples 2 | 3 | This is a repository of code examples for the 2017 edition of CMU CS 11-747 4 | [Neural Networks for NLP](http://phontron.com/class/nn4nlp2017/). 
5 | 6 | By Graham Neubig, Daniel Clothiaux, Zhengzhong Liu, and Xuezhe Ma 7 | 8 | [PyTorch](http://pytorch.org/) code by Joost Bastings 9 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This contains five example data sets: 2 | 3 | 1) **Text Data (ptb):** Data from the Penn Treebank dataset provided by Mikolov: http://www.fit.vutbr.cz/~imikolov/rnnlm/ 4 | 2) **Tree Data (trees):** The tree data from the Stanford Sentiment Treebank: http://nlp.stanford.edu/sentiment/index.html 5 | 3) **Classification Data (classes):** The data from the Stanford Sentiment Treebank with tree info removed. 6 | 4) **Parallel Data (parallel):** Data from the Tanaka corpus, reduced to only 10,000 training examples: http://www.edrdg.org/wiki/index.php/Tanaka_Corpus 7 | 5) **Tagging Data (tags):** Data from WikiNER, reduced to only 10,000 training examples: http://schwa.org/projects/resources/wiki/Wikiner 8 | --------------------------------------------------------------------------------
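The data sets listed above are read directly by the course examples. As a quick orientation, here is a minimal loading sketch (not one of the repository files). It assumes the `ptb` files contain one tokenized sentence per line, as in Mikolov's distribution, and that the `parallel` `train.en`/`train.ja` files are line-aligned translation pairs.

# Minimal sketch: loading the example data, under the assumptions stated above.
def read_tokenized(path):
    """Yield each line as a list of whitespace-separated tokens."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            yield line.strip().split()


def read_parallel(src_path, trg_path):
    """Yield (source, target) token-list pairs from line-aligned files."""
    with open(src_path, encoding="utf-8") as src, open(trg_path, encoding="utf-8") as trg:
        for s, t in zip(src, trg):
            yield s.strip().split(), t.strip().split()


lm_train = list(read_tokenized("data/ptb/train.txt"))
mt_train = list(read_parallel("data/parallel/train.en", "data/parallel/train.ja"))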
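Similarly, the `ucca` package under `14-semparsing/ucca`, whose modules are listed in its `ucca/README.md` above, is driven through a small I/O API that the bundled scripts call repeatedly. The following sketch (again not a repository file) shows the typical round trip with that API; the input path `some_passage.xml` is a hypothetical placeholder, and drawing requires the `matplotlib`/`networkx` extras declared in its `setup.py`.

import matplotlib.pyplot as plt

from ucca import visualization
from ucca.ioutil import file2passage, passage2file

# Read a passage from XML (or binary pickle), as the bundled scripts do.
passage = file2passage("some_passage.xml")
# Re-serialize it in binary Pickle format, mirroring scripts/standard_to_pickle.py.
passage2file(passage, "some_passage.pickle", binary=True)
# Render the passage graph, mirroring scripts/visualize.py.
visualization.draw(passage)
plt.show()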