├── .gitignore ├── .travis.yml ├── README.md ├── chainer ├── bilstm-tagger-withchar.py ├── bilstm-tagger.py ├── rnnlm-batch.py └── treenn.py ├── data ├── README.md ├── classes │ ├── dev.txt │ ├── test.txt │ └── train.txt ├── tags │ ├── dev.txt │ └── train.txt ├── text │ ├── dev.txt │ ├── test.txt │ └── train.txt └── trees │ ├── dev.txt │ ├── test.txt │ └── train.txt ├── dynet-cpp ├── Makefile ├── bilstm-tagger-bulk.cc ├── bilstm-tagger-withchar-bulk.cc ├── bilstm-tagger-withchar.cc ├── bilstm-tagger.cc ├── rnnlm-batch.cc ├── rnnlm-seq.cc ├── treenn-bulk.cc └── treenn.cc ├── dynet-py ├── bilstm-tagger-withchar.py ├── bilstm-tagger.py ├── bow.py ├── rnnlm-batch-batch.py ├── rnnlm-batch.py ├── treenn-bulk.py └── treenn.py ├── make-report.py ├── pytorch ├── bilstm-tagger-withchar.py ├── bilstm-tagger.py └── rnnlm.py ├── run-tests.sh ├── tensorflow ├── bilstm-tagger.py ├── bow.py └── rnnlm-batch.py └── theano ├── README.md ├── bilstm-tagger-withchar.py ├── bilstm-tagger.py ├── bow.py ├── nn ├── __init__.py ├── activations.py ├── initializations.py ├── layers │ ├── __init__.py │ ├── core.py │ ├── embeddings.py │ └── recurrent.py ├── optimizers.py └── utils │ ├── __init__.py │ ├── config_factory.py │ ├── generic_utils.py │ ├── io_utils.py │ └── theano_utils.py └── rnnlm-batch.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.dSYM 3 | *.pyc 4 | log 5 | dynet-cpp/rnnlm-batch 6 | dynet-cpp/rnnlm-seq 7 | dynet-cpp/treenn 8 | dynet-cpp/treenn-bulk 9 | dynet-cpp/bilstm-tagger 10 | dynet-cpp/bilstm-tagger-bulk 11 | dynet-cpp/bilstm-tagger-withchar 12 | dynet-cpp/bilstm-tagger-withchar-bulk 13 | dynet-cpp/rnnlm-batch-gpu 14 | dynet-cpp/rnnlm-seq-gpu 15 | dynet-cpp/treenn-gpu 16 | dynet-cpp/treenn-bulk-gpu 17 | dynet-cpp/bilstm-tagger-gpu 18 | dynet-cpp/bilstm-tagger-bulk-gpu 19 | dynet-cpp/bilstm-tagger-withchar-gpu 20 | dynet-cpp/bilstm-tagger-withchar-bulk-gpu 21 | dynet-benchmark-results*.tar.gz 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | language: python 3 | python: 4 | - 2.7 5 | env: 6 | global: 7 | - DYNET_PATH=$PWD/dynet EIGEN_PATH=$PWD/eigen DYFLAGS="--dynet-mem 2048" TIMEOUT=200 LONGTIMEOUT=300 8 | matrix: 9 | - TEST=dynet TASK=rnnlm-batch MBSIZE=64 10 | - TEST=dynet TASK=rnnlm-batch MBSIZE=16 11 | - TEST=dynet TASK=rnnlm-batch MBSIZE=04 12 | - TEST=dynet TASK=rnnlm-batch MBSIZE=01 13 | - TEST=dynet TASK=sparse-rnnlm-batch MBSIZE=16 14 | - TEST=dynet TASK=sparse-rnnlm-batch MBSIZE=01 15 | - TEST=dynet TASK=bilstm-tagger 16 | - TEST=dynet TASK=bilstm-tagger-withchar 17 | - TEST=dynet TASK=treenn 18 | - TEST=chainer TASK=rnnlm-batch MBSIZE=64 19 | - TEST=chainer TASK=rnnlm-batch MBSIZE=16 20 | - TEST=chainer TASK=rnnlm-batch MBSIZE=04 21 | - TEST=chainer TASK=rnnlm-batch MBSIZE=01 22 | - TEST=chainer TASK=bilstm-tagger 23 | - TEST=chainer TASK=bilstm-tagger-withchar 24 | - TEST=chainer TASK=treenn 25 | - TEST=theano TASK=rnnlm-batch MBSIZE=64 26 | - TEST=theano TASK=rnnlm-batch MBSIZE=16 27 | - TEST=theano TASK=rnnlm-batch MBSIZE=04 28 | - TEST=theano TASK=rnnlm-batch MBSIZE=01 29 | - TEST=theano TASK=bilstm-tagger 30 | - TEST=theano TASK=bilstm-tagger-withchar 31 | cache: 32 | directories: 33 | - dynet 34 | - eigen 35 | stages: 36 | - compile 37 | - test 38 | jobs: 39 | include: 40 | - stage: compile 41 | env: 42 | language: cpp 43 | python: 44 | addons: 45 | apt: 46 | sources: 
47 | - ubuntu-toolchain-r-test 48 | - boost-latest 49 | packages: 50 | - g++-4.8 51 | - libboost-regex1.55-dev 52 | install: skip 53 | script: 54 | - hg clone https://bitbucket.org/eigen/eigen -r 699b659 || (cd eigen && hg pull && hg update -r 699b659) 55 | - git clone https://github.com/clab/dynet || (cd dynet; git pull) 56 | - mkdir -p dynet/build 57 | - cd dynet/build 58 | - cmake .. -DEIGEN3_INCLUDE_DIR=$TRAVIS_BUILD_DIR/eigen 59 | - make -j$(nproc) 60 | - stage: test 61 | - language: cpp 62 | python: 63 | env: TEST=dynet TASK=rnnlm-batch 64 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 65 | - language: cpp 66 | python: 67 | env: TEST=dynet TASK=sparse-rnnlm-batch 68 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 69 | - language: cpp 70 | python: 71 | env: TEST=dynet TASK=bilstm-tagger 72 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 73 | - language: cpp 74 | python: 75 | env: TEST=dynet TASK=bilstm-tagger-withchar 76 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 77 | - language: cpp 78 | python: 79 | env: TEST=dynet TASK=treenn 80 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 81 | 82 | install: 83 | - pip install -q cython numpy 84 | - pip install -U $TEST 85 | 86 | script: 87 | - ./run-tests.sh 88 | - grep '\(per_sec\|startup\)' log/*/*.log 89 | 90 | after_failure: 91 | - cat $TRAVIS_BUILD_DIR/log/*/*.log 92 | 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DyNet Benchmarks 2 | ---------------- 3 | by Graham Neubig, Yoav Goldberg, Chaitanya Malaviya, Austin Matthews, Yusuke Oda, and Pengcheng Yin 4 | 5 | These are benchmarks to compare [DyNet](http://github.com/clab/dynet) against several other neural network toolkits: TensorFlow, Theano, and Chainer. It covers four different natural language processing tasks, some of which are only implemented in a subset of the toolkits as they wouldn't be straightforward to implement in the others: 6 | 7 | * rnnlm-batch: A recurrent neural network language model with mini-batched training. 8 | * bilstm-tagger: A tagger that runs a bi-directional LSTM and selects a tag for each word. 9 | * bilstm-tagger-withchar: Similar to bilstm-tagger, but uses characer-based embeddings for unknown words. 10 | * treelstm: A text tagger based on tree-structured LSTMs. 11 | 12 | The benchmarks can be run by first compiling the `dynet-cpp` examples, then running run-tests.sh. 13 | 14 | **Note:** `dynet-cpp` needs the sequence-ops branch of DyNet to compile. 
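For example, one possible way to build and run everything locally (a sketch only: the paths below are placeholders for your own DyNet and Eigen checkouts, and depending on your setup `run-tests.sh` may expect the same environment variables that `.travis.yml` sets, such as `TEST`, `TASK` and `MBSIZE`):

```bash
# Build the C++ benchmarks against an existing DyNet/Eigen build
cd dynet-cpp
make DYNET_PATH=/path/to/dynet EIGEN_PATH=/path/to/eigen
cd ..

# Run the benchmarks; timing numbers are written to logs under log/
./run-tests.sh
```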
15 | -------------------------------------------------------------------------------- /chainer/bilstm-tagger-withchar.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import sys 9 | import argparse 10 | 11 | from chainer import Chain, Variable 12 | import chainer.functions as F 13 | import chainer.links as L 14 | import chainer.optimizers as O 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--chainer_gpu', type=int, default=-1, help='GPU id') 18 | parser.add_argument('CEMBED_SIZE', type=int, help='char embedding size') 19 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 22 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 23 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 24 | args = parser.parse_args() 25 | 26 | if args.chainer_gpu >= 0: 27 | # use GPU 28 | from chainer.cuda import cupy as xp, get_device 29 | get_device(args.chainer_gpu).use() 30 | else: 31 | # use CPU 32 | import numpy as xp 33 | 34 | def makevar(x): 35 | return Variable(xp.array([x], dtype=xp.int32)) 36 | 37 | # format of files: each line is "word1|tag2 word2|tag2 ..." 38 | train_file="data/tags/train.txt" 39 | dev_file="data/tags/dev.txt" 40 | 41 | class Vocab: 42 | def __init__(self, w2i=None): 43 | if w2i is None: w2i = defaultdict(count(0).next) 44 | self.w2i = dict(w2i) 45 | self.i2w = {i:w for w,i in w2i.iteritems()} 46 | @classmethod 47 | def from_corpus(cls, corpus): 48 | w2i = defaultdict(count(0).next) 49 | for sent in corpus: 50 | [w2i[word] for word in sent] 51 | return Vocab(w2i) 52 | 53 | def size(self): return len(self.w2i.keys()) 54 | 55 | def read(fname): 56 | """ 57 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 58 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
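    For example, an input line like "the|DT dog|NN" (an illustrative line, not one
    taken from the actual data files) would be yielded as
    [("the", "DT"), ("dog", "NN")].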
59 | """ 60 | with open(fname) as fh: 61 | for line in fh: 62 | line = line.strip().split() 63 | sent = [tuple(x.rsplit("|",1)) for x in line] 64 | yield sent 65 | 66 | train=list(read(train_file)) 67 | dev=list(read(dev_file)) 68 | words=[] 69 | tags=[] 70 | chars=set() 71 | wc=Counter() 72 | for sent in train: 73 | for w,p in sent: 74 | words.append(w) 75 | tags.append(p) 76 | chars.update(w) 77 | wc[w]+=1 78 | words.append("_UNK_") 79 | chars.add("<*>") 80 | 81 | vw = Vocab.from_corpus([words]) 82 | vt = Vocab.from_corpus([tags]) 83 | vc = Vocab.from_corpus([chars]) 84 | UNK = vw.w2i["_UNK_"] 85 | 86 | nwords = vw.size() 87 | ntags = vt.size() 88 | nchars = vc.size() 89 | print("nwords=%r, ntags=%r, nchars=%r" % (nwords, ntags, nchars)) 90 | 91 | # Chainer Starts 92 | 93 | class Tagger(Chain): 94 | def __init__(self): 95 | super(Tagger, self).__init__( 96 | embedW=L.EmbedID(nwords, args.WEMBED_SIZE), 97 | embedC=L.EmbedID(nwords, args.CEMBED_SIZE), 98 | # MLP on top of biLSTM outputs 100 -> 32 -> ntags 99 | WH=L.Linear(args.HIDDEN_SIZE*2, args.MLP_SIZE, nobias=True), 100 | WO=L.Linear(args.MLP_SIZE, ntags, nobias=True), 101 | # word-level LSTMs 102 | fwdRNN=L.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE), 103 | bwdRNN=L.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE), 104 | # char-level LSTMs, 105 | cFwdRNN=L.LSTM(args.CEMBED_SIZE, args.WEMBED_SIZE/2), 106 | cBwdRNN=L.LSTM(args.CEMBED_SIZE, args.WEMBED_SIZE/2), 107 | ) 108 | 109 | def word_rep(self, w): 110 | if wc[w] > 5: 111 | return self.embedW(makevar(vw.w2i[w])) 112 | else: 113 | pad_char = vc.w2i["<*>"] 114 | char_ids = [pad_char] + [vc.w2i[c] for c in w] + [pad_char] 115 | char_embs = [self.embedC(makevar(cid)) for cid in char_ids] 116 | self.cFwdRNN.reset_state() 117 | self.cBwdRNN.reset_state() 118 | for e in char_embs: 119 | fw_exp = self.cFwdRNN(e) 120 | for e in reversed(char_embs): 121 | bw_exp = self.cBwdRNN(e) 122 | return F.concat([fw_exp, bw_exp]) 123 | 124 | def build_tagging_graph(self, words): 125 | # initialize the RNNs 126 | self.fwdRNN.reset_state() 127 | self.bwdRNN.reset_state() 128 | 129 | # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
130 | wembs = [self.word_rep(w) for w in words] 131 | 132 | # feed word vectors into biLSTM 133 | fw_exps = [] 134 | for e in wembs: 135 | fw_exps.append(self.fwdRNN(e)) 136 | bw_exps = [] 137 | for e in reversed(wembs): 138 | bw_exps.append(self.bwdRNN(e)) 139 | 140 | # biLSTM states 141 | bi_exps = [F.concat([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))] 142 | 143 | # feed each biLSTM state to an MLP 144 | exps = [self.WO(F.tanh(self.WH(x))) for x in bi_exps] 145 | return exps 146 | 147 | def sent_loss(self, words, tags): 148 | vecs = self.build_tagging_graph(words) 149 | return sum(F.softmax_cross_entropy(v, makevar(vt.w2i[t])) for v, t in zip(vecs, tags)) 150 | 151 | def tag_sent(self, words): 152 | vecs = self.build_tagging_graph(words) 153 | tags = [vt.i2w[int(v.data.argmax())] for v in vecs] 154 | return zip(words, tags) 155 | 156 | tagger = Tagger() 157 | 158 | if args.chainer_gpu >= 0: 159 | tagger.to_gpu() 160 | 161 | trainer = O.Adam() 162 | trainer.use_cleargrads() 163 | trainer.setup(tagger) 164 | 165 | print("startup time: %r" % (time.time() - start)) 166 | start = time.time() 167 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 168 | for ITER in xrange(100): 169 | random.shuffle(train) 170 | for s in train: 171 | i += 1 172 | if i % 500 == 0: # print status 173 | print(this_loss / this_tagged) 174 | all_tagged += this_tagged 175 | this_loss = this_tagged = 0 176 | all_time = time.time() - start 177 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 178 | dev_start = time.time() 179 | good_sent = bad_sent = good = bad = 0.0 180 | for sent in dev: 181 | words = [w for w, _ in sent] 182 | golds = [t for _, t in sent] 183 | tags = [t for _, t in tagger.tag_sent(words)] 184 | if tags == golds: 185 | good_sent += 1 186 | else: 187 | bad_sent += 1 188 | for go,gu in zip(golds,tags): 189 | if go == gu: 190 | good += 1 191 | else: 192 | bad += 1 193 | dev_time += time.time() - dev_start 194 | train_time = time.time() - start - dev_time 195 | print("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 196 | if all_time > args.TIMEOUT: 197 | sys.exit(0) 198 | # train on sent 199 | words = [w for w, _ in s] 200 | golds = [t for _, t in s] 201 | 202 | loss_exp = tagger.sent_loss(words, golds) 203 | this_loss += float(loss_exp.data) 204 | this_tagged += len(golds) 205 | tagger.cleargrads() 206 | loss_exp.backward() 207 | trainer.update() 208 | 209 | print("epoch %r finished" % ITER) 210 | trainer.update_epoch(1.0) 211 | -------------------------------------------------------------------------------- /chainer/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import time 9 | import sys 10 | import argparse 11 | 12 | from chainer import Chain, Variable 13 | import chainer.functions as F 14 | import chainer.links as L 15 | import chainer.optimizers as O 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--chainer_gpu', type=int, default=-1, help='GPU id') 19 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 22 | parser.add_argument('SPARSE', type=int, 
help='sparse update 0/1') 23 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 24 | args = parser.parse_args() 25 | 26 | if args.chainer_gpu >= 0: 27 | # use GPU 28 | import cupy as xp 29 | from chainer.cuda import get_device 30 | # from chainer.cuda import cupy as xp, get_device 31 | get_device(args.chainer_gpu).use() 32 | else: 33 | # use CPU 34 | import numpy as xp 35 | 36 | def makevar(x): 37 | return Variable(xp.array([x], dtype=xp.int32)) 38 | 39 | # format of files: each line is "word1|tag2 word2|tag2 ..." 40 | train_file="data/tags/train.txt" 41 | dev_file="data/tags/dev.txt" 42 | 43 | class Vocab: 44 | def __init__(self, w2i=None): 45 | if w2i is None: w2i = defaultdict(count(0).next) 46 | self.w2i = dict(w2i) 47 | self.i2w = {i:w for w,i in w2i.iteritems()} 48 | @classmethod 49 | def from_corpus(cls, corpus): 50 | w2i = defaultdict(count(0).next) 51 | for sent in corpus: 52 | [w2i[word] for word in sent] 53 | return Vocab(w2i) 54 | 55 | def size(self): return len(self.w2i.keys()) 56 | 57 | def read(fname): 58 | """ 59 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 60 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 61 | """ 62 | with open(fname) as fh: 63 | for line in fh: 64 | line = line.strip().split() 65 | sent = [tuple(x.rsplit("|",1)) for x in line] 66 | yield sent 67 | 68 | train=list(read(train_file)) 69 | dev=list(read(dev_file)) 70 | words=[] 71 | tags=[] 72 | wc=Counter() 73 | for sent in train: 74 | for w,p in sent: 75 | words.append(w) 76 | tags.append(p) 77 | wc[w]+=1 78 | words.append("_UNK_") 79 | 80 | vw = Vocab.from_corpus([words]) 81 | vt = Vocab.from_corpus([tags]) 82 | UNK = vw.w2i["_UNK_"] 83 | 84 | nwords = vw.size() 85 | ntags = vt.size() 86 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 87 | 88 | # Chainer Starts 89 | 90 | class Tagger(Chain): 91 | def __init__(self): 92 | super(Tagger, self).__init__( 93 | embed=L.EmbedID(nwords, args.WEMBED_SIZE), 94 | # MLP on top of biLSTM outputs 100 -> 32 -> ntags 95 | WH=L.Linear(args.HIDDEN_SIZE*2, args.MLP_SIZE, nobias=True), 96 | WO=L.Linear(args.MLP_SIZE, ntags, nobias=True), 97 | # word-level LSTMs 98 | fwdRNN=L.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE), 99 | bwdRNN=L.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE), 100 | ) 101 | 102 | def word_rep(self, w): 103 | val = vw.w2i[w] if wc[w] > 5 else UNK 104 | return self.embed(makevar(val)) 105 | 106 | def build_tagging_graph(self, words): 107 | #initialize the RNNs 108 | self.fwdRNN.reset_state() 109 | self.bwdRNN.reset_state() 110 | 111 | # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
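        # (How the two directions line up below: fw_exps[i] summarizes words[0..i], and
        #  because bw_exps is filled while walking the sentence in reverse,
        #  reversed(bw_exps)[i] summarizes words[i..end]; zipping the two gives each
        #  position its full left and right context before the MLP.)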
112 | wembs = [self.word_rep(w) for w in words] 113 | 114 | # feed word vectors into biLSTM 115 | fw_exps = [] 116 | for e in wembs: 117 | fw_exps.append(self.fwdRNN(e)) 118 | bw_exps = [] 119 | for e in reversed(wembs): 120 | bw_exps.append(self.bwdRNN(e)) 121 | 122 | # biLSTM states 123 | bi_exps = [F.concat([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))] 124 | 125 | # feed each biLSTM state to an MLP 126 | exps = [self.WO(F.tanh(self.WH(x))) for x in bi_exps] 127 | return exps 128 | 129 | def sent_loss(self, words, tags): 130 | vecs = self.build_tagging_graph(words) 131 | return sum(F.softmax_cross_entropy(v, makevar(vt.w2i[t])) for v, t in zip(vecs, tags)) 132 | 133 | def tag_sent(self, words): 134 | vecs = self.build_tagging_graph(words) 135 | tags = [vt.i2w[int(v.data.argmax())] for v in vecs] 136 | return zip(words, tags) 137 | 138 | tagger = Tagger() 139 | 140 | if args.chainer_gpu >= 0: 141 | tagger.to_gpu() 142 | 143 | trainer = O.Adam() 144 | trainer.use_cleargrads() 145 | trainer.setup(tagger) 146 | 147 | print ("startup time: %r" % (time.time() - start)) 148 | start = time.time() 149 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 150 | for ITER in xrange(100): 151 | random.shuffle(train) 152 | for s in train: 153 | i += 1 154 | if i % 500 == 0: # print status 155 | print (this_loss / this_tagged) 156 | all_tagged += this_tagged 157 | this_loss = this_tagged = 0 158 | all_time = time.time() - start 159 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 160 | dev_start = time.time() 161 | good_sent = bad_sent = good = bad = 0.0 162 | for sent in dev: 163 | words = [w for w, _ in sent] 164 | golds = [t for _, t in sent] 165 | tags = [t for _, t in tagger.tag_sent(words)] 166 | if tags == golds: 167 | good_sent += 1 168 | else: 169 | bad_sent += 1 170 | for go, gu in zip(golds, tags): 171 | if go == gu: 172 | good += 1 173 | else: 174 | bad += 1 175 | dev_time += time.time() - dev_start 176 | train_time = time.time() - start - dev_time 177 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 178 | if all_time > args.TIMEOUT: 179 | sys.exit(0) 180 | # train on sent 181 | words = [w for w, _ in s] 182 | golds = [t for _, t in s] 183 | 184 | loss_exp = tagger.sent_loss(words, golds) 185 | this_loss += float(loss_exp.data) 186 | this_tagged += len(golds) 187 | tagger.cleargrads() 188 | loss_exp.backward() 189 | trainer.update() 190 | 191 | print ("epoch %r finished" % ITER) 192 | trainer.update_epoch(1.0) 193 | -------------------------------------------------------------------------------- /chainer/rnnlm-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import math 9 | import sys 10 | import argparse 11 | 12 | from chainer import Chain, Variable 13 | import chainer.functions as F 14 | import chainer.links as L 15 | import chainer.optimizers as O 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--chainer_gpu', type=int, default=-1, help='GPU id') 19 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 20 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 21 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 22 | parser.add_argument('SPARSE', type=int, 
help='sparse update 0/1') 23 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 24 | args = parser.parse_args() 25 | 26 | # format of files: each line is "word1/tag2 word2/tag2 ..." 27 | train_file="data/text/train.txt" 28 | test_file="data/text/dev.txt" 29 | 30 | w2i = defaultdict(count(0).next) 31 | 32 | def read(fname): 33 | """ 34 | Read a file where each line is of the form "word1 word2 ..." 35 | Yields lists of the form [word1, word2, ...] 36 | """ 37 | with file(fname) as fh: 38 | for line in fh: 39 | sent = [w2i[x] for x in line.strip().split()] 40 | sent.append(w2i[""]) 41 | yield sent 42 | 43 | train=list(read(train_file)) 44 | nwords = len(w2i) 45 | test=list(read(test_file)) 46 | S = w2i[""] 47 | assert(nwords == len(w2i)) 48 | 49 | # Chainer Starts 50 | 51 | class RNNLM(Chain): 52 | def __init__(self): 53 | super(RNNLM, self).__init__( 54 | embed=L.EmbedID(nwords, args.EMBED_SIZE), 55 | rnn=L.LSTM(args.EMBED_SIZE, args.HIDDEN_SIZE), 56 | h2y=L.Linear(args.HIDDEN_SIZE, nwords), 57 | ) 58 | 59 | def reset(self): 60 | self.rnn.reset_state() 61 | 62 | def add_input(self, x): 63 | h = self.rnn(self.embed(x)) 64 | return self.h2y(h) 65 | 66 | lm = RNNLM() 67 | 68 | if args.chainer_gpu >= 0: 69 | # use GPU 70 | from chainer.cuda import cupy as xp, get_device 71 | get_device(args.chainer_gpu).use() 72 | lm.to_gpu() 73 | else: 74 | # use CPU 75 | import numpy as xp 76 | 77 | def makevar(arr): 78 | return Variable(xp.array(arr, dtype=xp.int32)) 79 | 80 | trainer = O.Adam() 81 | trainer.use_cleargrads() 82 | trainer.setup(lm) 83 | 84 | # Build the language model graph 85 | # 86 | # Note: Chainer could not consider masking using default cross entropy function 87 | # which returns an integrated scalar. 88 | # 89 | def calc_lm_loss(sents): 90 | # initialize the RNN 91 | lm.reset() 92 | 93 | # get the wids for each step 94 | tot_words = 0 95 | wids = [] 96 | for i in range(len(sents[0])): 97 | # Note: -1 is the default padding tag in Chainer. 98 | wids.append([ 99 | (sent[i] if len(sent)>i else -1) for sent in sents]) 100 | mask = [(1 if len(sent)>i else 0) for sent in sents] 101 | tot_words += sum(mask) 102 | 103 | # start the rnn by inputting "" 104 | init_ids = [S] * len(sents) 105 | y = lm.add_input(makevar(init_ids)) 106 | 107 | # feed word vectors into the RNN and predict the next word 108 | losses = [] 109 | for wid in wids: 110 | # calculate the softmax and loss 111 | t = makevar(wid) 112 | # Note: Chainer calculates the average. We have to multiply the batch size 113 | # to adjust dynamic range of the loss. 
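        # (Padded positions were assigned wid -1 above, which matches Chainer's default
        #  ignore_label, so they add nothing to the loss; with normalize=False the loss is
        #  averaged over the batch dimension only, and the * len(sents) below turns that
        #  mean back into a per-minibatch sum.)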
114 | loss = F.softmax_cross_entropy(y, t, normalize=False) * len(sents) 115 | losses.append(loss) 116 | # update the state of the RNN 117 | y = lm.add_input(t) 118 | 119 | return sum(losses), tot_words 120 | 121 | # Sort training sentences in descending order and count minibatches 122 | train.sort(key=lambda x: -len(x)) 123 | test.sort(key=lambda x: -len(x)) 124 | train_order = [x*args.MB_SIZE for x in range((len(train)-1)/args.MB_SIZE + 1)] 125 | test_order = [x*args.MB_SIZE for x in range((len(test)-1)/args.MB_SIZE + 1)] 126 | # Perform training 127 | print ("startup time: %r" % (time.time() - start)) 128 | start = time.time() 129 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 130 | for ITER in xrange(100): 131 | random.shuffle(train_order) 132 | for sid in train_order: 133 | i += 1 134 | if i % int(500/args.MB_SIZE) == 0: 135 | print(this_loss / this_words) 136 | all_tagged += this_words 137 | this_loss = this_words = 0 138 | all_time = time.time() - start 139 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 140 | dev_start = time.time() 141 | dev_loss = dev_words = 0 142 | for sid in test_order: 143 | loss_exp, mb_words = calc_lm_loss(test[sid:sid+args.MB_SIZE]) 144 | dev_loss += float(loss_exp.data) 145 | dev_words += mb_words 146 | dev_time += time.time() - dev_start 147 | train_time = time.time() - start - dev_time 148 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % (dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words, train_time, all_tagged/train_time)) 149 | if all_time > args.TIMEOUT: 150 | sys.exit(0) 151 | # train on the minibatch 152 | loss_exp, mb_words = calc_lm_loss(train[sid:sid+args.MB_SIZE]) 153 | this_loss += float(loss_exp.data) 154 | this_words += mb_words 155 | lm.cleargrads() 156 | loss_exp.backward() 157 | trainer.update() 158 | print ("epoch %r finished" % ITER) 159 | 160 | 161 | -------------------------------------------------------------------------------- /chainer/treenn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | import re 6 | import codecs 7 | from collections import Counter 8 | import random 9 | import sys 10 | import argparse 11 | 12 | from chainer import Chain, Variable 13 | import chainer.functions as F 14 | import chainer.links as L 15 | import chainer.optimizers as O 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--chainer_gpu', type=int, default=-1, help='GPU id') 19 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | if args.chainer_gpu >= 0: 26 | # use GPU 27 | from chainer.cuda import cupy as xp, get_device 28 | get_device(args.chainer_gpu).use() 29 | else: 30 | # use CPU 31 | import numpy as xp 32 | 33 | def makevar(x): 34 | return Variable(xp.array([x], dtype=xp.int32)) 35 | 36 | def zeros(dim): 37 | return Variable(xp.zeros(dim, dtype=xp.float32)) 38 | 39 | def _tokenize_sexpr(s): 40 | tokker = re.compile(r" +|[()]|[^ ()]+") 41 | toks = [t for t in [match.group(0) for match in tokker.finditer(s)] if t[0] != " "] 42 | return toks 43 | 44 | def _within_bracket(toks): 45 | label = next(toks) 46 | children = [] 47 | for tok in toks: 48 | if tok == "(": 49 | 
children.append(_within_bracket(toks)) 50 | elif tok == ")": 51 | return Tree(label, children) 52 | else: children.append(Tree(tok, None)) 53 | assert(False),list(toks) 54 | 55 | class Tree(object): 56 | def __init__(self, label, children=None): 57 | self.label = label 58 | self.children = children 59 | 60 | @staticmethod 61 | def from_sexpr(string): 62 | toks = iter(_tokenize_sexpr(string)) 63 | assert next(toks) == "(" 64 | return _within_bracket(toks) 65 | 66 | def __str__(self): 67 | if self.children is None: return self.label 68 | return "[%s %s]" % (self.label, " ".join([str(c) for c in self.children])) 69 | 70 | def isleaf(self): return self.children==None 71 | 72 | def leaves_iter(self): 73 | if self.isleaf(): 74 | yield self 75 | else: 76 | for c in self.children: 77 | for l in c.leaves_iter(): yield l 78 | 79 | def leaves(self): return list(self.leaves_iter()) 80 | 81 | def nonterms_iter(self): 82 | if not self.isleaf(): 83 | yield self 84 | for c in self.children: 85 | for n in c.nonterms_iter(): yield n 86 | 87 | def nonterms(self): return list(self.nonterms_iter()) 88 | 89 | def read_dataset(filename): 90 | return [Tree.from_sexpr(line.strip()) for line in codecs.open(filename,"r")] 91 | 92 | def get_vocabs(trees): 93 | label_vocab = Counter() 94 | word_vocab = Counter() 95 | for tree in trees: 96 | label_vocab.update([n.label for n in tree.nonterms()]) 97 | word_vocab.update([l.label for l in tree.leaves()]) 98 | labels = [x for x,c in label_vocab.iteritems() if c > 0] 99 | words = ["_UNK_"] + [x for x,c in word_vocab.iteritems() if c > 0] 100 | l2i = {l:i for i,l in enumerate(labels)} 101 | w2i = {w:i for i,w in enumerate(words)} 102 | return l2i, w2i, labels, words 103 | 104 | class TreeRNN(Chain): 105 | def __init__(self, word_vocab, hdim, nc): 106 | super(TreeRNN, self).__init__( 107 | embed=L.EmbedID(len(word_vocab), hdim), 108 | WR=L.Linear(2*hdim, hdim, nobias=True), 109 | WO=L.Linear(hdim, nc, nobias=True), 110 | ) 111 | self.w2i = word_vocab 112 | 113 | def expr_for_tree(self, tree, decorate=False): 114 | if tree.isleaf(): 115 | return self.embed(makevar(self.w2i.get(tree.label, 0))) 116 | if len(tree.children) == 1: 117 | assert(tree.children[0].isleaf()) 118 | expr = self.expr_for_tree(tree.children[0]) 119 | if decorate: 120 | tree._e = expr 121 | return expr 122 | assert(len(tree.children) == 2), tree.children[0] 123 | e1 = self.expr_for_tree(tree.children[0], decorate) 124 | e2 = self.expr_for_tree(tree.children[1], decorate) 125 | expr = F.tanh(self.WR(F.concat(e1, e2))) 126 | if decorate: 127 | tree._e = expr 128 | return expr 129 | 130 | def classify(self, e): 131 | return self.WO(e) 132 | 133 | class TreeLSTM(Chain): 134 | def __init__(self, word_vocab, wdim, hdim, nc): 135 | super(TreeLSTM, self).__init__( 136 | embed=L.EmbedID(len(word_vocab), wdim), 137 | WU=L.Linear(wdim, 4 * hdim), # i,f,o,u with bias (semiterminal) 138 | W1=L.Linear(hdim, 4 * hdim), # i,f,o,u with bias (left) 139 | W2=L.Linear(hdim, 4 * hdim), # i,f,o,u with bias (right) 140 | WO=L.Linear(hdim, nc, nobias=True), 141 | ) 142 | self.w2i = word_vocab 143 | self.hdim = hdim 144 | 145 | def expr_for_tree(self, tree, decorate=False): 146 | if tree.isleaf(): 147 | return zeros((1, self.hdim)), self.embed(makevar(self.w2i.get(tree.label, 0))) 148 | if len(tree.children) == 1: 149 | assert(tree.children[0].isleaf()) 150 | c0, e0 = self.expr_for_tree(tree.children[0]) 151 | c, h = F.lstm(c0, self.WU(e0)) 152 | if decorate: 153 | tree._e = (c, h) 154 | return c, h 155 | assert(len(tree.children) 
== 2), tree.children[0] 156 | c1, e1 = self.expr_for_tree(tree.children[0], decorate) 157 | c2, e2 = self.expr_for_tree(tree.children[1], decorate) 158 | c, h = F.slstm(c1, c2, self.W1(e1), self.W2(e2)) 159 | if decorate: 160 | tree._e = (c, h) 161 | return c, h 162 | 163 | def classify(self, e): 164 | return self.WO(e[1]) 165 | 166 | train = read_dataset("data/trees/train.txt") 167 | dev = read_dataset("data/trees/dev.txt") 168 | 169 | l2i, w2i, i2l, i2w = get_vocabs(train) 170 | 171 | tlm = TreeLSTM(w2i, args.WEMBED_SIZE, args.HIDDEN_SIZE, len(l2i)) 172 | if args.chainer_gpu >= 0: 173 | tlm.to_gpu() 174 | 175 | trainer = O.Adam() 176 | trainer.use_cleargrads() 177 | trainer.setup(tlm) 178 | 179 | print("startup time: %r" % (time.time() - start)) 180 | sents = 0 181 | all_time = 0 182 | for ITER in range(100): 183 | random.shuffle(train) 184 | closs = 0.0 185 | cwords = 0 186 | start = time.time() 187 | for i,tree in enumerate(train,1): 188 | sents += 1 189 | d = tlm.expr_for_tree(tree,True) 190 | nodes = tree.nonterms() 191 | losses = [F.softmax_cross_entropy(tlm.classify(nt._e), makevar(l2i[nt.label])) for nt in nodes] 192 | loss = sum(losses) 193 | closs += float(loss.data) 194 | cwords += len(nodes) 195 | tlm.cleargrads() 196 | loss.backward() 197 | trainer.update() 198 | if sents % 1000 == 0: 199 | print(closs / cwords) 200 | closs = 0.0 201 | cwords = 0 202 | all_time += time.time() - start 203 | good = bad = 0.0 204 | for tree in dev: 205 | my_data = tlm.classify(tlm.expr_for_tree(tree,False)).data 206 | if args.chainer_gpu >= 0: 207 | my_data = xp.asnumpy(my_data) 208 | pred = i2l[my_data.argmax()] 209 | if pred == tree.label: 210 | good += 1 211 | else: 212 | bad += 1 213 | print("acc=%.4f, time=%.4f, sent_per_sec=%.4f" % (good/(good+bad), all_time, sents/all_time)) 214 | if all_time > args.TIMEOUT: 215 | sys.exit(0) 216 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This contains three example data sets: 2 | 3 | 1. **Text Data (text):** Mikolov's pre-processed version of the Wall Street Journal used for language modeling: http://www.fit.vutbr.cz/~imikolov/rnnlm/ 4 | 2. **Tree Data (trees):** The tree data from the Stanford Sentiment Treebank: http://nlp.stanford.edu/sentiment/index.html 5 | 3. **Tag Data (tags):** Tagged data from WikiNER: http://schwa.org/projects/resources/wiki/Wikiner 6 | -------------------------------------------------------------------------------- /dynet-cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | CUDA_PATH=/usr/local/cuda/targets/x86_64-linux/lib 4 | EIGEN_PATH=${HOME}/usr/local/eigen 5 | DYNET_PATH=${HOME}/work/dynet 6 | 7 | DYNET_LIB=-ldynet 8 | 9 | # *** Mac or linux 10 | UNAME_S := $(shell uname -s) 11 | ifeq ($(UNAME_S),Darwin) 12 | CXX_FLAGS=-std=c++11 -I${EIGEN_PATH} -I${DYNET_PATH} -L${DYNET_PATH}/build/dynet -lc++ -Ofast 13 | else 14 | CC=g++ 15 | CXX_FLAGS=-std=c++11 -I${EIGEN_PATH} -I${DYNET_PATH} -L${DYNET_PATH}/build/dynet -DBOOST_REGEX -lboost_regex -Ofast 16 | endif 17 | 18 | # The -seq version is commented out for compatibility with master. 
If using the sequence-ops 19 | # branch you can compile this program as well 20 | all: rnnlm-batch treenn treenn-bulk bilstm-tagger bilstm-tagger-bulk bilstm-tagger-withchar bilstm-tagger-withchar-bulk 21 | 22 | gpu: rnnlm-batch-gpu treenn-gpu treenn-bulk-gpu bilstm-tagger-gpu bilstm-tagger-bulk-gpu bilstm-tagger-withchar-gpu bilstm-tagger-withchar-bulk-gpu 23 | 24 | clean: 25 | rm -f rnnlm-batch treenn treenn-bulk bilstm-tagger bilstm-tagger-bulk bilstm-tagger-withchar bilstm-tagger-withchar-bulk rnnlm-batch-gpu treenn-gpu treenn-bulk-gpu bilstm-tagger-gpu bilstm-tagger-bulk-gpu bilstm-tagger-withchar-gpu bilstm-tagger-withchar-bulk-gpu 26 | 27 | rnnlm-batch: rnnlm-batch.cc 28 | ${CC} -o rnnlm-batch rnnlm-batch.cc ${CXX_FLAGS} ${DYNET_LIB} 29 | 30 | rnnlm-seq: rnnlm-seq.cc 31 | ${CC} -o rnnlm-seq rnnlm-seq.cc ${CXX_FLAGS} ${DYNET_LIB} 32 | 33 | treenn: treenn.cc 34 | ${CC} -o treenn treenn.cc ${CXX_FLAGS} ${DYNET_LIB} 35 | 36 | treenn-bulk: treenn-bulk.cc 37 | ${CC} -o treenn-bulk treenn-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 38 | 39 | bilstm-tagger: bilstm-tagger.cc 40 | ${CC} -o bilstm-tagger bilstm-tagger.cc ${CXX_FLAGS} ${DYNET_LIB} 41 | 42 | bilstm-tagger-bulk: bilstm-tagger-bulk.cc 43 | ${CC} -o bilstm-tagger-bulk bilstm-tagger-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 44 | 45 | bilstm-tagger-withchar: bilstm-tagger-withchar.cc 46 | ${CC} -o bilstm-tagger-withchar bilstm-tagger-withchar.cc ${CXX_FLAGS} ${DYNET_LIB} 47 | 48 | bilstm-tagger-withchar-bulk: bilstm-tagger-withchar-bulk.cc 49 | ${CC} -o bilstm-tagger-withchar-bulk bilstm-tagger-withchar-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 50 | 51 | rnnlm-batch-gpu: rnnlm-batch.cc 52 | ${CC} -o rnnlm-batch-gpu rnnlm-batch.cc ${CXX_FLAGS} ${DYNET_LIB} 53 | 54 | rnnlm-seq-gpu: rnnlm-seq.cc 55 | ${CC} -o rnnlm-seq-gpu rnnlm-seq.cc ${CXX_FLAGS} ${DYNET_LIB} 56 | 57 | treenn-gpu: treenn.cc 58 | ${CC} -o treenn-gpu treenn.cc ${CXX_FLAGS} ${DYNET_LIB} 59 | 60 | treenn-bulk-gpu: treenn-bulk.cc 61 | ${CC} -o treenn-bulk-gpu treenn-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 62 | 63 | bilstm-tagger-gpu: bilstm-tagger.cc 64 | ${CC} -o bilstm-tagger-gpu bilstm-tagger.cc ${CXX_FLAGS} ${DYNET_LIB} 65 | 66 | bilstm-tagger-bulk-gpu: bilstm-tagger-bulk.cc 67 | ${CC} -o bilstm-tagger-bulk-gpu bilstm-tagger-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 68 | 69 | bilstm-tagger-withchar-gpu: bilstm-tagger-withchar.cc 70 | ${CC} -o bilstm-tagger-withchar-gpu bilstm-tagger-withchar.cc ${CXX_FLAGS} ${DYNET_LIB} 71 | 72 | bilstm-tagger-withchar-bulk-gpu: bilstm-tagger-withchar-bulk.cc 73 | ${CC} -o bilstm-tagger-withchar-bulk-gpu bilstm-tagger-withchar-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 74 | -------------------------------------------------------------------------------- /dynet-cpp/bilstm-tagger-bulk.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef BOOST_REGEX 6 | #include 7 | using namespace boost; 8 | #else 9 | #include 10 | #endif 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | using namespace std::chrono; 19 | using namespace dynet; 20 | 21 | // Read a file where each line is of the form "word1|tag1 word2|tag2 ..." 22 | // Yields pairs of lists of the form < [word1, word2, ...], [tag1, tag2, ...] 
> 23 | vector, vector > > read(const string & fname) { 24 | ifstream fh(fname); 25 | if(!fh) throw std::runtime_error("Could not open file"); 26 | string str; 27 | regex re("[ |]"); 28 | vector, vector > > sents; 29 | while(getline(fh, str)) { 30 | pair,vector > word_tags; 31 | sregex_token_iterator first{str.begin(), str.end(), re, -1}, last; 32 | while(first != last) { 33 | word_tags.first.push_back(*first++); 34 | assert(first != last); 35 | word_tags.second.push_back(*first++); 36 | } 37 | sents.push_back(word_tags); 38 | } 39 | return sents; 40 | } 41 | 42 | class BiLSTMTagger { 43 | public: 44 | 45 | BiLSTMTagger(unsigned layers, unsigned wembed_dim, unsigned hidden_dim, unsigned mlp_dim, ParameterCollection & model, Dict & wv, Dict & tv, unordered_map & wc) 46 | : wv(wv), tv(tv), wc(wc) { 47 | unsigned nwords = wv.size(); 48 | unsigned ntags = tv.size(); 49 | word_lookup = model.add_lookup_parameters(nwords, {wembed_dim}); 50 | 51 | // MLP on top of biLSTM outputs 100 -> 32 -> ntags 52 | pH = model.add_parameters({mlp_dim, hidden_dim*2}); 53 | pO = model.add_parameters({ntags, mlp_dim}); 54 | 55 | // word-level LSTMs 56 | fwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model); // layers, in-dim, out-dim, model 57 | bwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model); 58 | } 59 | 60 | Dict &wv, &tv; 61 | unordered_map & wc; 62 | LookupParameter word_lookup; 63 | Parameter pH, pO; 64 | VanillaLSTMBuilder fwdRNN, bwdRNN; 65 | Expression H, O; 66 | 67 | void init(ComputationGraph & cg) { 68 | // parameters -> expressions 69 | H = parameter(cg, pH); 70 | O = parameter(cg, pO); 71 | 72 | // initialize the RNNs 73 | fwdRNN.new_graph(cg); 74 | bwdRNN.new_graph(cg); 75 | } 76 | 77 | // Do word representation 78 | Expression word_rep(ComputationGraph & cg, const string & w) { 79 | return lookup(cg, word_lookup, wv.convert(wc[w] > 5 ? w : "")); 80 | } 81 | 82 | vector build_tagging_graph(ComputationGraph & cg, const vector & words) { 83 | 84 | 85 | // get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
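    // (Indexing note for the loops below: wembs/fwds/bwds/fbwds are pre-sized to
    //  words.size() and filled by position, so although the backward LSTM consumes the
    //  sentence in reverse, bwds[i] still holds the backward state for word i and can be
    //  concatenated directly with fwds[i].)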
86 | vector wembs(words.size()), fwds(words.size()), bwds(words.size()), fbwds(words.size()); 87 | for(size_t i = 0; i < words.size(); ++i) 88 | wembs[i] = word_rep(cg, words[i]); 89 | 90 | // feed word vectors into biLSTM 91 | fwdRNN.start_new_sequence(); 92 | for(size_t i = 0; i < wembs.size(); ++i) 93 | fwds[i] = fwdRNN.add_input(wembs[i]); 94 | bwdRNN.start_new_sequence(); 95 | for(size_t i = wembs.size(); i > 0; --i) 96 | bwds[i-1] = bwdRNN.add_input(wembs[i-1]); 97 | 98 | // Concatenate and MLP 99 | for(size_t i = 0; i < wembs.size(); ++i) 100 | fbwds[i] = O * tanh( H * concatenate({fwds[i], bwds[i]}) ); 101 | 102 | return fbwds; 103 | } 104 | 105 | Expression sent_loss(ComputationGraph & cg, vector & words, vector & tags) { 106 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 107 | for(size_t i = 0; i < tags.size(); ++i) 108 | errs[i] = pickneglogsoftmax(exprs[i], tv.convert(tags[i])); 109 | return sum(errs); 110 | } 111 | 112 | vector tag_sent(vector & words, ComputationGraph &cg) { 113 | cg.clear(); 114 | init(cg); 115 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 116 | vector tags(words.size()); 117 | for(size_t i = 0; i < words.size(); ++i) { 118 | vector scores = as_vector(exprs[i].value()); 119 | size_t max_id = distance(scores.begin(), max_element(scores.begin(), scores.end())); 120 | tags[i] = tv.convert(max_id); 121 | } 122 | return tags; 123 | } 124 | 125 | }; 126 | 127 | int main(int argc, char**argv) { 128 | 129 | time_point start = system_clock::now(); 130 | 131 | vector, vector > > train = read("data/tags/train.txt"); 132 | vector, vector > > dev = read("data/tags/dev.txt"); 133 | Dict word_voc, tag_voc; 134 | unordered_map word_cnt; 135 | for(auto & sent : train) { 136 | for(auto & w : sent.first) { 137 | word_voc.convert(w); 138 | word_cnt[w]++; 139 | } 140 | for(auto & t : sent.second) 141 | tag_voc.convert(t); 142 | } 143 | tag_voc.freeze(); 144 | word_voc.convert(""); word_voc.freeze(); word_voc.set_unk(""); 145 | 146 | // DyNet Starts 147 | dynet::initialize(argc, argv); 148 | ParameterCollection model; 149 | AdamTrainer trainer(model); 150 | trainer.clipping_enabled = false; 151 | 152 | if(argc != 8) { 153 | cerr << "Usage: " << argv[0] << " WEMBED_SIZE HIDDEN_SIZE MLP_SIZE SPARSE BATCH_SIZE LAST_STEP TIMEOUT" << endl; 154 | return 1; 155 | } 156 | int WEMBED_SIZE = atoi(argv[1]); 157 | int HIDDEN_SIZE = atoi(argv[2]); 158 | int MLP_SIZE = atoi(argv[3]); 159 | trainer.sparse_updates_enabled = atoi(argv[4]); 160 | int BATCH_SIZE = atoi(argv[5]); 161 | int LAST_STEP = atoi(argv[6]); 162 | int TIMEOUT = atoi(argv[7]); 163 | 164 | // Initilaize the tagger 165 | BiLSTMTagger tagger(1, WEMBED_SIZE, HIDDEN_SIZE, MLP_SIZE, model, word_voc, tag_voc, word_cnt); 166 | 167 | { 168 | duration fs = (system_clock::now() - start); 169 | float startup_time = duration_cast(fs).count() / float(1000); 170 | cout << "startup time: " << startup_time << endl; 171 | } 172 | 173 | // Do training 174 | shuffle(train.begin(), train.end(), *dynet::rndeng); 175 | start = system_clock::now(); 176 | int i = 0, bi = 0, all_tagged = 0, this_words = 0; 177 | float this_loss = 0.f, all_time = 0.f; 178 | unsigned batch = BATCH_SIZE; 179 | ComputationGraph cg; 180 | for(int iter = 0; iter < 100; iter++) { 181 | for(size_t id1 = 0; id1 <= train.size()-batch; id1 += batch) { 182 | i += batch; 183 | bi++; 184 | if(bi % (500/BATCH_SIZE) == 0) { 185 | trainer.status(); 186 | cout << this_loss/this_words << endl; 187 | all_tagged += this_words; 188 | this_loss = 
0.f; 189 | this_words = 0; 190 | } 191 | if(bi % (5000/BATCH_SIZE) == 0) { 192 | duration fs = (system_clock::now() - start); 193 | all_time += duration_cast(fs).count() / float(1000); 194 | int dev_words = 0, dev_good = 0; 195 | float dev_loss = 0; 196 | for(auto & sent : dev) { 197 | vector tags = tagger.tag_sent(sent.first, cg); 198 | for(size_t j = 0; j < tags.size(); ++j) 199 | if(tags[j] == sent.second[j]) 200 | dev_good++; 201 | dev_words += sent.second.size(); 202 | } 203 | cout << "acc=" << dev_good/float(dev_words) << ", time=" << all_time << ", word_per_sec=" << all_tagged/all_time << ", sent_per_sec=" << i/all_time << ", sec_per_sent=" << all_time/i << endl; 204 | if(all_time > TIMEOUT) 205 | exit(0); 206 | start = system_clock::now(); 207 | } 208 | 209 | cg.clear(); 210 | tagger.init(cg); 211 | vector losses; 212 | for(size_t id2 = 0; id2 < batch; ++id2) { 213 | auto & s = train[id1+id2]; 214 | losses.push_back(tagger.sent_loss(cg, s.first, s.second)); 215 | this_words += s.first.size(); 216 | } 217 | Expression loss_exp = sum(losses); 218 | float my_loss = as_scalar(cg.forward(loss_exp)); 219 | this_loss += my_loss; 220 | if(LAST_STEP > 0) { 221 | cg.backward(loss_exp); 222 | if(LAST_STEP > 1) 223 | trainer.update(); 224 | } 225 | } 226 | } 227 | return 0; 228 | } 229 | -------------------------------------------------------------------------------- /dynet-cpp/bilstm-tagger-withchar.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef BOOST_REGEX 6 | #include 7 | using namespace boost; 8 | #else 9 | #include 10 | #endif 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | using namespace std::chrono; 19 | using namespace dynet; 20 | 21 | // Read a file where each line is of the form "word1|tag1 word2|tag2 ..." 22 | // Yields pairs of lists of the form < [word1, word2, ...], [tag1, tag2, ...] 
> 23 | vector, vector > > read(const string & fname) { 24 | ifstream fh(fname); 25 | if(!fh) throw std::runtime_error("Could not open file"); 26 | string str; 27 | regex re("[ |]"); 28 | vector, vector > > sents; 29 | while(getline(fh, str)) { 30 | pair,vector > word_tags; 31 | sregex_token_iterator first{str.begin(), str.end(), re, -1}, last; 32 | while(first != last) { 33 | word_tags.first.push_back(*first++); 34 | assert(first != last); 35 | word_tags.second.push_back(*first++); 36 | } 37 | sents.push_back(word_tags); 38 | } 39 | return sents; 40 | } 41 | 42 | class BiLSTMTagger { 43 | public: 44 | 45 | BiLSTMTagger(unsigned layers, unsigned cembed_dim, unsigned wembed_dim, unsigned hidden_dim, unsigned mlp_dim, ParameterCollection & model, Dict & wv, Dict & cv, Dict & tv, unordered_map & wc) 46 | : wv(wv), cv(cv), tv(tv), wc(wc) { 47 | unsigned nwords = wv.size(); 48 | unsigned ntags = tv.size(); 49 | unsigned nchars = cv.size(); 50 | word_lookup = model.add_lookup_parameters(nwords, {wembed_dim}); 51 | char_lookup = model.add_lookup_parameters(nchars, {cembed_dim}); 52 | 53 | // MLP on top of biLSTM outputs 100 -> mlp_dim -> ntags 54 | pH = model.add_parameters({mlp_dim, hidden_dim*2}); 55 | pO = model.add_parameters({ntags, mlp_dim}); 56 | 57 | // word-level LSTMs 58 | fwdRNN = VanillaLSTMBuilder(1, wembed_dim, hidden_dim, model); // layers, in-dim, out-dim, model 59 | bwdRNN = VanillaLSTMBuilder(1, wembed_dim, hidden_dim, model); 60 | 61 | // char-level LSTMs 62 | cFwdRNN = VanillaLSTMBuilder(1, cembed_dim, wembed_dim/2, model); 63 | cBwdRNN = VanillaLSTMBuilder(1, cembed_dim, wembed_dim/2, model); 64 | } 65 | 66 | Dict &wv, &cv, &tv; 67 | unordered_map & wc; 68 | LookupParameter word_lookup, char_lookup; 69 | Parameter p_t1, pH, pO; 70 | VanillaLSTMBuilder fwdRNN, bwdRNN, cFwdRNN, cBwdRNN; 71 | 72 | // Do word representation 73 | Expression word_rep(ComputationGraph & cg, const string & w) { 74 | if(wc[w] > 5) { 75 | return lookup(cg, word_lookup, wv.convert(w)); 76 | } else { 77 | Expression pad = lookup(cg, char_lookup, cv.convert("<*>")); 78 | vector cembs(w.size()+2, pad); 79 | for(size_t i = 0; i < w.size(); ++i) 80 | cembs[i+1] = lookup(cg, char_lookup, cv.convert(w.substr(i, 1))); 81 | cFwdRNN.start_new_sequence(); 82 | for(size_t i = 0; i < cembs.size(); ++i) cFwdRNN.add_input(cembs[i]); 83 | cBwdRNN.start_new_sequence(); 84 | for(size_t i = cembs.size(); i > 0; --i) cBwdRNN.add_input(cembs[i-1]); 85 | return concatenate({cFwdRNN.back(), cBwdRNN.back()}); 86 | } 87 | } 88 | 89 | vector build_tagging_graph(ComputationGraph & cg, const vector & words) { 90 | // parameters -> expressions 91 | Expression H = parameter(cg, pH); 92 | Expression O = parameter(cg, pO); 93 | 94 | // initialize the RNNs 95 | fwdRNN.new_graph(cg); 96 | bwdRNN.new_graph(cg); 97 | cFwdRNN.new_graph(cg); 98 | cBwdRNN.new_graph(cg); 99 | 100 | // get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
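    // (word_rep above falls back to a character biLSTM for words seen 5 times or fewer
    //  in training: the word is wrapped in <*> padding symbols, fed through cFwdRNN and
    //  cBwdRNN, and their final states, each of size wembed_dim/2, are concatenated so
    //  the result has the same dimensionality as a regular word embedding.)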
101 | vector wembs(words.size()), fwds(words.size()), bwds(words.size()), fbwds(words.size()); 102 | for(size_t i = 0; i < words.size(); ++i) 103 | wembs[i] = word_rep(cg, words[i]); 104 | 105 | // feed word vectors into biLSTM 106 | fwdRNN.start_new_sequence(); 107 | for(size_t i = 0; i < wembs.size(); ++i) 108 | fwds[i] = fwdRNN.add_input(wembs[i]); 109 | bwdRNN.start_new_sequence(); 110 | for(size_t i = wembs.size(); i > 0; --i) { 111 | bwds[i-1] = bwdRNN.add_input(wembs[i-1]); 112 | fbwds[i-1] = O * tanh( H * concatenate({fwds[i-1], bwds[i-1]}) ); 113 | } 114 | 115 | return fbwds; 116 | } 117 | 118 | Expression sent_loss(ComputationGraph & cg, vector & words, vector & tags) { 119 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 120 | for(size_t i = 0; i < tags.size(); ++i) 121 | errs[i] = pickneglogsoftmax(exprs[i], tv.convert(tags[i])); 122 | return sum(errs); 123 | } 124 | 125 | vector tag_sent(vector & words) { 126 | ComputationGraph cg; 127 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 128 | vector tags(words.size()); 129 | for(size_t i = 0; i < words.size(); ++i) { 130 | vector scores = as_vector(exprs[i].value()); 131 | size_t max_id = distance(scores.begin(), max_element(scores.begin(), scores.end())); 132 | tags[i] = tv.convert(max_id); 133 | } 134 | return tags; 135 | } 136 | 137 | }; 138 | 139 | int main(int argc, char**argv) { 140 | 141 | time_point start = system_clock::now(); 142 | 143 | vector, vector > > train = read("data/tags/train.txt"); 144 | vector, vector > > dev = read("data/tags/dev.txt"); 145 | Dict word_voc, tag_voc, char_voc; 146 | unordered_map word_cnt; 147 | for(auto & sent : train) { 148 | for(auto & w : sent.first) { 149 | word_voc.convert(w); 150 | word_cnt[w]++; 151 | for(size_t i = 0; i < w.size(); ++i) 152 | char_voc.convert(w.substr(i,1)); 153 | } 154 | for(auto & t : sent.second) 155 | tag_voc.convert(t); 156 | } 157 | tag_voc.freeze(); 158 | word_voc.convert(""); word_voc.freeze(); word_voc.set_unk(""); 159 | char_voc.convert("<*>"); char_voc.freeze(); 160 | 161 | // DyNet Starts 162 | dynet::initialize(argc, argv); 163 | ParameterCollection model; 164 | AdamTrainer trainer(model, 0.001); 165 | trainer.clipping_enabled = false; 166 | 167 | if(argc != 7) { 168 | cerr << "Usage: " << argv[0] << " CEMBED_SIZE WEMBED_SIZE HIDDEN_SIZE MLP_SIZE SPARSE TIMEOUT" << endl; 169 | return 1; 170 | } 171 | int CEMBED_SIZE = atoi(argv[1]); 172 | int WEMBED_SIZE = atoi(argv[2]); 173 | int HIDDEN_SIZE = atoi(argv[3]); 174 | int MLP_SIZE = atoi(argv[4]); 175 | trainer.sparse_updates_enabled = atoi(argv[5]); 176 | int TIMEOUT = atoi(argv[6]); 177 | 178 | // Initilaize the tagger 179 | BiLSTMTagger tagger(1, CEMBED_SIZE, WEMBED_SIZE, HIDDEN_SIZE, MLP_SIZE, model, word_voc, char_voc, tag_voc, word_cnt); 180 | 181 | { 182 | duration fs = (system_clock::now() - start); 183 | float startup_time = duration_cast(fs).count() / float(1000); 184 | cout << "startup time: " << startup_time << endl; 185 | } 186 | 187 | // Do training 188 | start = system_clock::now(); 189 | int i = 0, all_tagged = 0, this_words = 0; 190 | float this_loss = 0.f, all_time = 0.f; 191 | for(int iter = 0; iter < 100; iter++) { 192 | shuffle(train.begin(), train.end(), *dynet::rndeng); 193 | for(auto & s : train) { 194 | i++; 195 | if(i % 500 == 0) { 196 | trainer.status(); 197 | cout << this_loss/this_words << endl; 198 | all_tagged += this_words; 199 | this_loss = 0.f; 200 | this_words = 0; 201 | } 202 | if(i % 10000 == 0) { 203 | duration fs = 
(system_clock::now() - start); 204 | all_time += duration_cast(fs).count() / float(1000); 205 | int dev_words = 0, dev_good = 0; 206 | float dev_loss = 0; 207 | for(auto & sent : dev) { 208 | vector tags = tagger.tag_sent(sent.first); 209 | for(size_t j = 0; j < tags.size(); ++j) 210 | if(tags[j] == sent.second[j]) 211 | dev_good++; 212 | dev_words += sent.second.size(); 213 | } 214 | cout << "acc=" << dev_good/float(dev_words) << ", time=" << all_time << ", word_per_sec=" << all_tagged/all_time << endl; 215 | if(all_time > TIMEOUT) 216 | exit(0); 217 | start = system_clock::now(); 218 | } 219 | 220 | ComputationGraph cg; 221 | Expression loss_exp = tagger.sent_loss(cg, s.first, s.second); 222 | this_loss += as_scalar(cg.forward(loss_exp)); 223 | this_words += s.first.size(); 224 | cg.backward(loss_exp); 225 | trainer.update(); 226 | } 227 | } 228 | return 0; 229 | } 230 | -------------------------------------------------------------------------------- /dynet-cpp/bilstm-tagger.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef BOOST_REGEX 6 | #include 7 | using namespace boost; 8 | #else 9 | #include 10 | #endif 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | using namespace std::chrono; 19 | using namespace dynet; 20 | 21 | // Read a file where each line is of the form "word1|tag1 word2|tag2 ..." 22 | // Yields pairs of lists of the form < [word1, word2, ...], [tag1, tag2, ...] > 23 | vector, vector > > read(const string & fname) { 24 | ifstream fh(fname); 25 | if(!fh) throw std::runtime_error("Could not open file"); 26 | string str; 27 | regex re("[ |]"); 28 | vector, vector > > sents; 29 | while(getline(fh, str)) { 30 | pair,vector > word_tags; 31 | sregex_token_iterator first{str.begin(), str.end(), re, -1}, last; 32 | while(first != last) { 33 | word_tags.first.push_back(*first++); 34 | assert(first != last); 35 | word_tags.second.push_back(*first++); 36 | } 37 | sents.push_back(word_tags); 38 | } 39 | return sents; 40 | } 41 | 42 | class BiLSTMTagger { 43 | public: 44 | 45 | BiLSTMTagger(unsigned layers, unsigned wembed_dim, unsigned hidden_dim, unsigned mlp_dim, ParameterCollection & model, Dict & wv, Dict & tv, unordered_map & wc) 46 | : wv(wv), tv(tv), wc(wc) { 47 | unsigned nwords = wv.size(); 48 | unsigned ntags = tv.size(); 49 | word_lookup = model.add_lookup_parameters(nwords, {wembed_dim}); 50 | 51 | // MLP on top of biLSTM outputs 100 -> 32 -> ntags 52 | pH = model.add_parameters({mlp_dim, hidden_dim*2}); 53 | pO = model.add_parameters({ntags, mlp_dim}); 54 | 55 | // word-level LSTMs 56 | fwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model); // layers, in-dim, out-dim, model 57 | bwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model); 58 | } 59 | 60 | Dict &wv, &tv; 61 | unordered_map & wc; 62 | LookupParameter word_lookup; 63 | Parameter pH, pO; 64 | VanillaLSTMBuilder fwdRNN, bwdRNN; 65 | 66 | // Do word representation 67 | Expression word_rep(ComputationGraph & cg, const string & w) { 68 | return lookup(cg, word_lookup, wv.convert(wc[w] > 5 ? w : "")); 69 | } 70 | 71 | vector build_tagging_graph(ComputationGraph & cg, const vector & words) { 72 | // parameters -> expressions 73 | Expression H = parameter(cg, pH); 74 | Expression O = parameter(cg, pO); 75 | 76 | // initialize the RNNs 77 | fwdRNN.new_graph(cg); 78 | bwdRNN.new_graph(cg); 79 | 80 | // get the word vectors. word_rep(...) 
returns a 128-dim vector expression for each word. 81 | vector wembs(words.size()), fwds(words.size()), bwds(words.size()), fbwds(words.size()); 82 | for(size_t i = 0; i < words.size(); ++i) 83 | wembs[i] = word_rep(cg, words[i]); 84 | 85 | // feed word vectors into biLSTM 86 | fwdRNN.start_new_sequence(); 87 | for(size_t i = 0; i < wembs.size(); ++i) 88 | fwds[i] = fwdRNN.add_input(wembs[i]); 89 | bwdRNN.start_new_sequence(); 90 | for(size_t i = wembs.size(); i > 0; --i) 91 | bwds[i-1] = bwdRNN.add_input(wembs[i-1]); 92 | 93 | // Concatenate and MLP 94 | for(size_t i = 0; i < wembs.size(); ++i) 95 | fbwds[i] = O * tanh( H * concatenate({fwds[i], bwds[i]}) ); 96 | 97 | return fbwds; 98 | } 99 | 100 | Expression sent_loss(ComputationGraph & cg, vector & words, vector & tags) { 101 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 102 | for(size_t i = 0; i < tags.size(); ++i) 103 | errs[i] = pickneglogsoftmax(exprs[i], tv.convert(tags[i])); 104 | return sum(errs); 105 | } 106 | 107 | vector tag_sent(vector & words) { 108 | ComputationGraph cg; 109 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 110 | vector tags(words.size()); 111 | for(size_t i = 0; i < words.size(); ++i) { 112 | vector scores = as_vector(exprs[i].value()); 113 | size_t max_id = distance(scores.begin(), max_element(scores.begin(), scores.end())); 114 | tags[i] = tv.convert(max_id); 115 | } 116 | return tags; 117 | } 118 | 119 | }; 120 | 121 | int main(int argc, char**argv) { 122 | 123 | time_point start = system_clock::now(); 124 | 125 | vector, vector > > train = read("data/tags/train.txt"); 126 | vector, vector > > dev = read("data/tags/dev.txt"); 127 | Dict word_voc, tag_voc; 128 | unordered_map word_cnt; 129 | for(auto & sent : train) { 130 | for(auto & w : sent.first) { 131 | word_voc.convert(w); 132 | word_cnt[w]++; 133 | } 134 | for(auto & t : sent.second) 135 | tag_voc.convert(t); 136 | } 137 | tag_voc.freeze(); 138 | word_voc.convert(""); word_voc.freeze(); word_voc.set_unk(""); 139 | 140 | // DyNet Starts 141 | dynet::initialize(argc, argv); 142 | ParameterCollection model; 143 | AdamTrainer trainer(model); 144 | trainer.clipping_enabled = false; 145 | 146 | if(argc != 6) { 147 | cerr << "Usage: " << argv[0] << " WEMBED_SIZE HIDDEN_SIZE MLP_SIZE SPARSE TIMEOUT" << endl; 148 | return 1; 149 | } 150 | int WEMBED_SIZE = atoi(argv[1]); 151 | int HIDDEN_SIZE = atoi(argv[2]); 152 | int MLP_SIZE = atoi(argv[3]); 153 | trainer.sparse_updates_enabled = atoi(argv[4]); 154 | int TIMEOUT = atoi(argv[5]); 155 | 156 | // Initilaize the tagger 157 | BiLSTMTagger tagger(1, WEMBED_SIZE, HIDDEN_SIZE, MLP_SIZE, model, word_voc, tag_voc, word_cnt); 158 | 159 | { 160 | duration fs = (system_clock::now() - start); 161 | float startup_time = duration_cast(fs).count() / float(1000); 162 | cout << "startup time: " << startup_time << endl; 163 | } 164 | 165 | // Do training 166 | start = system_clock::now(); 167 | int i = 0, all_tagged = 0, this_words = 0; 168 | float this_loss = 0.f, all_time = 0.f; 169 | for(int iter = 0; iter < 100; iter++) { 170 | shuffle(train.begin(), train.end(), *dynet::rndeng); 171 | for(auto & s : train) { 172 | i++; 173 | if(i % 500 == 0) { 174 | trainer.status(); 175 | cout << this_loss/this_words << endl; 176 | all_tagged += this_words; 177 | this_loss = 0.f; 178 | this_words = 0; 179 | } 180 | if(i % 10000 == 0) { 181 | duration fs = (system_clock::now() - start); 182 | all_time += duration_cast(fs).count() / float(1000); 183 | int dev_words = 0, dev_good = 0; 184 | 
float dev_loss = 0; 185 | for(auto & sent : dev) { 186 | vector tags = tagger.tag_sent(sent.first); 187 | for(size_t j = 0; j < tags.size(); ++j) 188 | if(tags[j] == sent.second[j]) 189 | dev_good++; 190 | dev_words += sent.second.size(); 191 | } 192 | cout << "acc=" << dev_good/float(dev_words) << ", time=" << all_time << ", word_per_sec=" << all_tagged/all_time << endl; 193 | if(all_time > TIMEOUT) 194 | exit(0); 195 | start = system_clock::now(); 196 | } 197 | 198 | ComputationGraph cg; 199 | Expression loss_exp = tagger.sent_loss(cg, s.first, s.second); 200 | float my_loss = as_scalar(cg.forward(loss_exp)); 201 | this_loss += my_loss; 202 | this_words += s.first.size(); 203 | cg.backward(loss_exp); 204 | trainer.update(); 205 | } 206 | } 207 | return 0; 208 | } 209 | -------------------------------------------------------------------------------- /dynet-cpp/rnnlm-batch.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | using namespace std; 17 | using namespace std::chrono; 18 | using namespace dynet; 19 | 20 | // Read a file where each line is of the form "word1 word2 ..." 21 | // Yields lists of the form [word1, word2, ...] 22 | vector > read(const string & fname, Dict & vw) { 23 | ifstream fh(fname); 24 | if(!fh) throw std::runtime_error("Could not open file"); 25 | string str; 26 | vector > sents; 27 | while(getline(fh, str)) { 28 | istringstream iss(str); 29 | vector tokens; 30 | while(iss >> str) 31 | tokens.push_back(vw.convert(str)); 32 | tokens.push_back(vw.convert("")); 33 | sents.push_back(tokens); 34 | } 35 | return sents; 36 | } 37 | 38 | struct RNNLanguageModel { 39 | LookupParameter p_c; 40 | Parameter W_sm; 41 | Parameter b_sm; 42 | VanillaLSTMBuilder builder; 43 | explicit RNNLanguageModel(unsigned layers, unsigned input_dim, unsigned hidden_dim, unsigned vocab_size, ParameterCollection& model) : builder(layers, input_dim, hidden_dim, model) { 44 | p_c = model.add_lookup_parameters(vocab_size, {input_dim}, ParameterInitUniform(0.1)); 45 | W_sm = model.add_parameters({vocab_size, hidden_dim}, ParameterInitUniform(0.5)); 46 | b_sm = model.add_parameters({vocab_size}, ParameterInitUniform(0.5)); 47 | } 48 | 49 | Expression calc_lm_loss(const vector > & sent, int pos, int mb_size, ComputationGraph & cg) { 50 | 51 | // parameters -> expressions 52 | Expression W_exp = parameter(cg, W_sm); 53 | Expression b_exp = parameter(cg, b_sm); 54 | 55 | // initialize the RNN 56 | builder.new_graph(cg); // reset RNN builder for new graph 57 | builder.start_new_sequence(); 58 | 59 | // start the rnn by inputting "" 60 | size_t tot_sents = min(sent.size()-pos, (size_t)mb_size); 61 | vector wids(tot_sents, 0); 62 | vector masks(tot_sents); 63 | Expression s = builder.add_input(lookup(cg, p_c, wids)); 64 | 65 | // feed word vectors into the RNN and predict the next word 66 | vector losses; 67 | size_t j; 68 | for(size_t i = 0; i < sent[pos].size(); ++i) { 69 | // Get the words 70 | for(j = 0; j < tot_sents && i < sent[pos+j].size(); ++j) { 71 | wids[j] = sent[pos+j][i]; 72 | masks[j] = 1.f; 73 | } 74 | // And the masks 75 | for(; j < tot_sents; ++j) { 76 | wids[j] = 0; 77 | masks[j] = 0.f; 78 | } 79 | // calculate the softmax and loss 80 | Expression score = affine_transform({b_exp, W_exp, s}); 81 | Expression loss = pickneglogsoftmax(score, wids); 82 | if(0.f == 
*masks.rbegin()) 83 | loss = cmult(loss, input(cg, Dim({1}, tot_sents), masks)); 84 | losses.push_back(loss); 85 | // update the state of the RNN 86 | s = builder.add_input(lookup(cg, p_c, wids)); 87 | } 88 | 89 | return sum_batches(sum(losses)); 90 | } 91 | 92 | }; 93 | 94 | struct length_greater_then { 95 | inline bool operator() (const vector & struct1, const vector & struct2) { 96 | return (struct1.size() > struct2.size()); 97 | } 98 | }; 99 | 100 | vector prepare_minibatch(int mb_size, vector > & data) { 101 | stable_sort(data.begin(), data.end(), length_greater_then()); 102 | vector ids; 103 | for(size_t i = 0; i < data.size(); i += mb_size) 104 | ids.push_back(i); 105 | return ids; 106 | } 107 | 108 | int main(int argc, char** argv) { 109 | 110 | time_point start = system_clock::now(); 111 | 112 | // format of files: each line is "word1 word2 ..." 113 | string train_file = "data/text/train.txt"; 114 | string test_file = "data/text/dev.txt"; 115 | 116 | // DyNet Starts 117 | dynet::initialize(argc, argv); 118 | ParameterCollection model; 119 | 120 | if(argc != 6) { 121 | cerr << "Usage: " << argv[0] << " MB_SIZE EMBED_SIZE HIDDEN_SIZE SPARSE TIMEOUT" << endl; 122 | return 1; 123 | } 124 | int MB_SIZE = atoi(argv[1]); 125 | int EMBED_SIZE = atoi(argv[2]); 126 | int HIDDEN_SIZE = atoi(argv[3]); 127 | int SPARSE = atoi(argv[4]); 128 | int TIMEOUT = atoi(argv[5]); 129 | 130 | AdamTrainer trainer(model, 0.001); 131 | trainer.sparse_updates_enabled = SPARSE; 132 | trainer.clipping_enabled = false; 133 | 134 | Dict vw; 135 | vw.convert(""); 136 | vector > train = read(train_file, vw); 137 | vw.freeze(); 138 | vector > test = read(test_file, vw); 139 | vector train_ids = prepare_minibatch(MB_SIZE, train); 140 | vector test_ids = prepare_minibatch(MB_SIZE, test); 141 | int test_words = 0; 142 | for(auto & sent : test) test_words += sent.size(); 143 | 144 | int nwords = vw.size(); 145 | 146 | RNNLanguageModel rnnlm(1, EMBED_SIZE, HIDDEN_SIZE, nwords, model); 147 | 148 | { 149 | duration fs = (system_clock::now() - start); 150 | float startup_time = duration_cast(fs).count() / float(1000); 151 | cout << "startup time: " << startup_time << endl; 152 | } 153 | 154 | start = system_clock::now(); 155 | int i = 0, all_words = 0, this_words = 0; 156 | float this_loss = 0.f, all_time = 0.f; 157 | for(int iter = 0; iter < 100; iter++) { 158 | shuffle(train_ids.begin(), train_ids.end(), *dynet::rndeng); 159 | for(auto sid : train_ids) { 160 | i++; 161 | if(i % (500/MB_SIZE) == 0) { 162 | trainer.status(); 163 | cout << this_loss/this_words << endl; 164 | all_words += this_words; 165 | this_loss = 0.f; 166 | this_words = 0; 167 | } 168 | if(i % (10000/MB_SIZE) == 0) { 169 | duration fs = (system_clock::now() - start); 170 | all_time += duration_cast(fs).count() / float(1000); 171 | float test_loss = 0; 172 | for(auto sentid : test_ids) { 173 | ComputationGraph cg; 174 | Expression loss_exp = rnnlm.calc_lm_loss(test, sentid, MB_SIZE, cg); 175 | test_loss += as_scalar(cg.forward(loss_exp)); 176 | } 177 | cout << "nll=" << test_loss/test_words << ", ppl=" << exp(test_loss/test_words) << ", words=" << test_words << ", time=" << all_time << ", word_per_sec=" << all_words/all_time << endl; 178 | if(all_time > TIMEOUT) 179 | exit(0); 180 | start = system_clock::now(); 181 | } 182 | 183 | ComputationGraph cg; 184 | Expression loss_exp = rnnlm.calc_lm_loss(train, sid, MB_SIZE, cg); 185 | this_loss += as_scalar(cg.forward(loss_exp)); 186 | for(size_t pos = sid; pos < min((size_t)sid+MB_SIZE, train.size()); ++pos) 
187 | this_words += train[pos].size(); 188 | cg.backward(loss_exp); 189 | trainer.update(); 190 | } 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /dynet-cpp/rnnlm-seq.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace std::chrono; 17 | using namespace dynet; 18 | 19 | // Read a file where each line is of the form "word1 word2 ..." 20 | // Yields lists of the form [word1, word2, ...] 21 | vector > read(const string & fname, Dict & vw) { 22 | ifstream fh(fname); 23 | if(!fh) throw std::runtime_error("Could not open file"); 24 | string str; 25 | vector > sents; 26 | while(getline(fh, str)) { 27 | istringstream iss(str); 28 | vector tokens; 29 | while(iss >> str) 30 | tokens.push_back(vw.convert(str)); 31 | tokens.push_back(vw.convert("")); 32 | sents.push_back(tokens); 33 | } 34 | return sents; 35 | } 36 | 37 | struct RNNLanguageModel { 38 | LookupParameter p_c; 39 | Parameter W_sm; 40 | Parameter b_sm; 41 | VanillaLSTMBuilder builder; 42 | explicit RNNLanguageModel(unsigned layers, unsigned input_dim, unsigned hidden_dim, unsigned vocab_size, ParameterCollection& model) : builder(layers, input_dim, hidden_dim, model) { 43 | p_c = model.add_lookup_parameters(vocab_size, {input_dim}, ParameterInitUniform(0.1)); 44 | W_sm = model.add_parameters({vocab_size, hidden_dim}, ParameterInitUniform(0.5)); 45 | b_sm = model.add_parameters({vocab_size}, ParameterInitUniform(0.5)); 46 | } 47 | 48 | Expression calc_lm_loss(const vector > & sent, int pos, int mb_size, ComputationGraph & cg) { 49 | 50 | // parameters -> expressions 51 | Expression W_exp = parameter(cg, W_sm); 52 | Expression b_exp = parameter(cg, b_sm); 53 | 54 | // initialize the RNN 55 | builder.new_graph(cg); // reset RNN builder for new graph 56 | builder.start_new_sequence(); 57 | 58 | // Create contexts and perform lookup 59 | size_t tot_sents = min(sent.size()-pos, (size_t)mb_size); 60 | vector > ctxts(sent.begin()+pos, sent.begin()+pos+tot_sents); 61 | for(auto & ctxt : ctxts) 62 | rotate(ctxt.begin(), ctxt.begin()+ctxt.size()-1, ctxt.end()); 63 | Expression looks = lookup_seq(cg, p_c, ctxts); 64 | 65 | // Generate the contexts and scores 66 | Expression states = builder.transduce_seq(looks); 67 | Expression scores = affine_transform({b_exp, W_exp, states}); 68 | 69 | // Calculate the loss 70 | vector > predicts(sent.begin()+pos, sent.begin()+pos+tot_sents); 71 | Expression losses = pickneglogsoftmax_seq(scores, predicts); 72 | 73 | return sum_batches(sum_rows(losses)); 74 | } 75 | 76 | }; 77 | 78 | struct length_greater_then { 79 | inline bool operator() (const vector & struct1, const vector & struct2) { 80 | return (struct1.size() > struct2.size()); 81 | } 82 | }; 83 | 84 | vector prepare_minibatch(int mb_size, vector > & data) { 85 | stable_sort(data.begin(), data.end(), length_greater_then()); 86 | vector ids; 87 | for(size_t i = 0; i < data.size(); i += mb_size) 88 | ids.push_back(i); 89 | return ids; 90 | } 91 | 92 | int main(int argc, char** argv) { 93 | 94 | time_point start = system_clock::now(); 95 | 96 | // format of files: each line is "word1 word2 ..." 
97 | string train_file = "data/text/train.txt"; 98 | string test_file = "data/text/dev.txt"; 99 | 100 | // DyNet Starts 101 | dynet::initialize(argc, argv); 102 | ParameterCollection model; 103 | 104 | if(argc != 6) { 105 | cerr << "Usage: " << argv[0] << " MB_SIZE EMBED_SIZE HIDDEN_SIZE SPARSE TIMEOUT" << endl; 106 | return 1; 107 | } 108 | int MB_SIZE = atoi(argv[1]); 109 | int EMBED_SIZE = atoi(argv[2]); 110 | int HIDDEN_SIZE = atoi(argv[3]); 111 | int SPARSE = atoi(argv[4]); 112 | int TIMEOUT = atoi(argv[5]); 113 | 114 | AdamTrainer trainer(model, 0.001); 115 | trainer.sparse_updates_enabled = SPARSE; 116 | trainer.clipping_enabled = false; 117 | 118 | Dict vw; 119 | vw.convert(""); 120 | vector > train = read(train_file, vw); 121 | vw.freeze(); 122 | vector > test = read(test_file, vw); 123 | vector train_ids = prepare_minibatch(MB_SIZE, train); 124 | vector test_ids = prepare_minibatch(MB_SIZE, test); 125 | int test_words = 0; 126 | for(auto & sent : test) test_words += sent.size(); 127 | 128 | int nwords = vw.size(); 129 | 130 | RNNLanguageModel rnnlm(1, EMBED_SIZE, HIDDEN_SIZE, nwords, model); 131 | 132 | { 133 | duration fs = (system_clock::now() - start); 134 | float startup_time = duration_cast(fs).count() / float(1000); 135 | cout << "startup time: " << startup_time << endl; 136 | } 137 | 138 | start = system_clock::now(); 139 | int i = 0, all_words = 0, this_words = 0; 140 | float this_loss = 0.f, all_time = 0.f; 141 | for(int iter = 0; iter < 100; iter++) { 142 | shuffle(train_ids.begin(), train_ids.end(), *dynet::rndeng); 143 | for(auto sid : train_ids) { 144 | i++; 145 | if(i % (500/MB_SIZE) == 0) { 146 | trainer.status(); 147 | cout << this_loss/this_words << endl; 148 | all_words += this_words; 149 | this_loss = 0.f; 150 | this_words = 0; 151 | } 152 | if(i % (10000/MB_SIZE) == 0) { 153 | duration fs = (system_clock::now() - start); 154 | all_time += duration_cast(fs).count() / float(1000); 155 | float test_loss = 0; 156 | for(auto sentid : test_ids) { 157 | ComputationGraph cg; 158 | Expression loss_exp = rnnlm.calc_lm_loss(test, sentid, MB_SIZE, cg); 159 | test_loss += as_scalar(cg.forward(loss_exp)); 160 | } 161 | cout << "nll=" << test_loss/test_words << ", ppl=" << exp(test_loss/test_words) << ", words=" << test_words << ", time=" << all_time << ", word_per_sec=" << all_words/all_time << endl; 162 | if(all_time > TIMEOUT) 163 | exit(0); 164 | start = system_clock::now(); 165 | } 166 | 167 | ComputationGraph cg; 168 | Expression loss_exp = rnnlm.calc_lm_loss(train, sid, MB_SIZE, cg); 169 | this_loss += as_scalar(cg.forward(loss_exp)); 170 | for(size_t pos = sid; pos < min((size_t)sid+MB_SIZE, train.size()); ++pos) 171 | this_words += train[pos].size(); 172 | cg.backward(loss_exp); 173 | trainer.update(); 174 | } 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /dynet-cpp/treenn.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef BOOST_REGEX 6 | #include 7 | using namespace boost; 8 | #else 9 | #include 10 | #endif 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | using namespace std; 17 | using namespace std::chrono; 18 | using namespace dynet; 19 | 20 | class Tree { 21 | public: 22 | 23 | Tree(const string & label, vector children = vector()) 24 | : label(label), children(children) { } 25 | ~Tree() { 26 | for(auto child : children) delete child; 27 | } 28 | 29 | static Tree* from_sexpr(const string & str) { 30 | 
vector toks = tokenize_sexpr(str); 31 | vector::const_iterator tokit = toks.begin(); 32 | if(*(tokit++) != "(") throw runtime_error("Poorly structured tree"); 33 | return Tree::within_bracket(tokit); 34 | } 35 | 36 | static vector tokenize_sexpr(const string & s) { 37 | regex tokker(" +|[()]|[^ ()]+"); 38 | vector toks; 39 | for(auto it = sregex_iterator(s.begin(), s.end(), tokker); it != sregex_iterator(); ++it) { 40 | string m = it->str(); 41 | if(m != " ") 42 | toks.push_back(m); 43 | } 44 | return toks; 45 | } 46 | 47 | static Tree* within_bracket(vector::const_iterator & tokit) { 48 | const string & label = *(tokit++); 49 | vector children; 50 | while(true) { 51 | const string & tok = *(tokit++); 52 | if(tok == "(") { 53 | children.push_back(within_bracket(tokit)); 54 | } else if(tok == ")") { 55 | return new Tree(label, children); 56 | } else { 57 | children.push_back(new Tree(tok)); 58 | } 59 | } 60 | throw runtime_error("Poorly structured tree"); 61 | } 62 | 63 | void nonterms(vector & ret) { 64 | if(!isleaf()) { 65 | ret.push_back(this); 66 | for(Tree* child : children) child->nonterms(ret); 67 | } 68 | } 69 | 70 | bool isleaf() const { return children.size() == 0; } 71 | 72 | void make_vocab(Dict & nonterm_voc, Dict & term_voc) { 73 | (isleaf() ? term_voc : nonterm_voc).convert(label); 74 | for(Tree* tr : children) tr->make_vocab(nonterm_voc, term_voc); 75 | } 76 | 77 | string label; 78 | vector children; 79 | Expression expr; 80 | 81 | }; 82 | 83 | ostream& operator<<(ostream& os, const Tree& tr) { 84 | if(tr.isleaf()) { 85 | os << tr.label; 86 | } else { 87 | os << '(' << tr.label; 88 | for(auto child : tr.children) os << ' ' << *child; 89 | os << ')'; 90 | } 91 | return os; 92 | } 93 | 94 | vector read_dataset(const string & filename) { 95 | ifstream file(filename); 96 | if(!file) throw runtime_error("Missing file"); 97 | string line; 98 | vector ret; 99 | while(getline(file, line)) ret.push_back(Tree::from_sexpr(line)); 100 | return ret; 101 | } 102 | 103 | class TreeLSTMBuilder { 104 | public: 105 | TreeLSTMBuilder(ParameterCollection & model, Dict & word_vocab, unsigned wdim, unsigned hdim) : 106 | model(model), word_vocab(word_vocab), wdim(wdim), hdim(hdim) { 107 | WS = {model.add_parameters({hdim, wdim}), // 0: Wi 108 | model.add_parameters({hdim, wdim}), // 1: Wo 109 | model.add_parameters({hdim, wdim}), // 2: Wu 110 | model.add_parameters({hdim, 2*hdim}), // 3: Ui 111 | model.add_parameters({hdim, 2*hdim}), // 4: Uo 112 | model.add_parameters({hdim, 2*hdim}), // 5: Uu 113 | model.add_parameters({hdim, hdim}), // 6: UFS1 114 | model.add_parameters({hdim, hdim}), // 7: UFS2 115 | model.add_parameters({hdim}), // 8: Bi 116 | model.add_parameters({hdim}), // 9: Bo 117 | model.add_parameters({hdim}), // 10: Bu 118 | model.add_parameters({hdim})};// 11: Bf 119 | E = model.add_lookup_parameters(word_vocab.size(),{wdim}); 120 | cg_WS.resize(WS.size()); 121 | } 122 | 123 | void start_graph(ComputationGraph & c) { 124 | cg = &c; 125 | for(size_t i = 0; i < WS.size(); ++i) 126 | cg_WS[i] = parameter(*cg, WS[i]); 127 | } 128 | 129 | pair expr_for_tree(Tree & tree, bool decorate = false) { 130 | assert(!tree.isleaf()); 131 | pair hc_ret; 132 | if(tree.children.size() == 1) { 133 | assert(tree.children[0]->isleaf()); 134 | Expression emb, i, o, u, c, expr; 135 | emb = lookup(*cg, E, word_vocab.convert(tree.children[0]->label)); 136 | i = logistic(affine_transform({cg_WS[8], cg_WS[0], emb})); 137 | o = logistic(affine_transform({cg_WS[9], cg_WS[1], emb})); 138 | u = tanh( 
affine_transform({cg_WS[10], cg_WS[2], emb})); 139 | hc_ret.second = cmult(i,u); 140 | hc_ret.first = cmult(o,tanh(hc_ret.second)); 141 | } else { 142 | assert(tree.children.size() == 2); 143 | Expression e, i, o, u, f1, f2, c, expr; 144 | pair hc1, hc2; 145 | hc1 = expr_for_tree(*tree.children[0], decorate); 146 | hc2 = expr_for_tree(*tree.children[1], decorate); 147 | e = concatenate({hc1.first,hc2.first}); 148 | i = logistic(affine_transform({cg_WS[8], cg_WS[3], e})); 149 | o = logistic(affine_transform({cg_WS[9], cg_WS[4], e})); 150 | u = tanh( affine_transform({cg_WS[10], cg_WS[5], e})); 151 | f1 = logistic(affine_transform({cg_WS[11], cg_WS[6], hc1.first})); 152 | f2 = logistic(affine_transform({cg_WS[11], cg_WS[7], hc2.first})); 153 | hc_ret.second = cmult(i,u) + cmult(f1,hc1.second) + cmult(f2,hc2.second); 154 | hc_ret.first = cmult(o,tanh(hc_ret.second)); 155 | } 156 | if(decorate) { tree.expr = hc_ret.first; } 157 | return hc_ret; 158 | } 159 | 160 | ParameterCollection & model; 161 | Dict & word_vocab; 162 | unsigned wdim, hdim; 163 | vector WS; 164 | LookupParameter E; 165 | 166 | ComputationGraph * cg; 167 | vector cg_WS; 168 | 169 | }; 170 | 171 | int main(int argc, char**argv) { 172 | 173 | time_point start = system_clock::now(); 174 | 175 | vector train = read_dataset("data/trees/train.txt"); 176 | vector dev = read_dataset("data/trees/dev.txt"); 177 | Dict nonterm_voc, term_voc; 178 | for(auto tree : train) tree->make_vocab(nonterm_voc, term_voc); 179 | nonterm_voc.freeze(); 180 | term_voc.convert(""); term_voc.freeze(); term_voc.set_unk(""); 181 | 182 | // DyNet Starts 183 | dynet::initialize(argc, argv); 184 | ParameterCollection model; 185 | AdamTrainer trainer(model, 0.001); 186 | trainer.clipping_enabled = false; 187 | 188 | if(argc != 5) { 189 | cerr << "Usage: " << argv[0] << " WEMBED_SIZE HIDDEN_SIZE SPARSE TIMEOUT" << endl; 190 | return 1; 191 | } 192 | unsigned WEMBED_SIZE = atoi(argv[1]); 193 | unsigned HIDDEN_SIZE = atoi(argv[2]); 194 | trainer.sparse_updates_enabled = atoi(argv[3]); 195 | int TIMEOUT = atoi(argv[4]); 196 | 197 | // Builder 198 | Parameter W_param = model.add_parameters({nonterm_voc.size(), HIDDEN_SIZE}); 199 | TreeLSTMBuilder builder(model, term_voc, WEMBED_SIZE, HIDDEN_SIZE); 200 | 201 | { 202 | duration fs = (system_clock::now() - start); 203 | float startup_time = duration_cast(fs).count() / float(1000); 204 | cout << "startup time: " << startup_time << endl; 205 | } 206 | 207 | int i = 0, all_tagged = 0, this_nodes = 0; 208 | float this_loss = 0.f, all_time = 0.f; 209 | for(int iter = 0; iter < 100; iter++) { 210 | shuffle(train.begin(), train.end(), *dynet::rndeng); 211 | start = system_clock::now(); 212 | for(auto tree : train) { 213 | ComputationGraph cg; 214 | builder.start_graph(cg); 215 | Expression W = parameter(cg, W_param); 216 | pair hc = builder.expr_for_tree(*tree, true); 217 | vector losses; 218 | vector nodes; tree->nonterms(nodes); 219 | for(auto nt : nodes) 220 | losses.push_back(pickneglogsoftmax(W*nt->expr, nonterm_voc.convert(nt->label))); 221 | Expression loss = sum(losses); 222 | cg.forward(loss); 223 | this_loss += as_scalar(loss.value()); 224 | this_nodes += nodes.size(); 225 | cg.backward(loss); 226 | trainer.update(); 227 | if(++i % 1000 == 0) { 228 | trainer.status(); 229 | cout << this_loss / this_nodes << endl; 230 | this_loss = 0; this_nodes = 0; 231 | } 232 | } 233 | std::chrono::duration fs = (system_clock::now() - start); 234 | all_time += duration_cast(fs).count() / float(1000); 235 | int good = 0, bad = 0; 
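// Dev evaluation: rebuild the graph for each dev tree, score the root hidden
// state with W, and count the tree as correct when the argmax over label
// scores matches the root label.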
236 | for(auto tree : dev) { 237 | ComputationGraph cg; 238 | builder.start_graph(cg); 239 | Expression W = parameter(cg, W_param); 240 | pair hc = builder.expr_for_tree(*tree, false); 241 | vector scores = as_vector((W*hc.first).value()); 242 | size_t max_id = std::distance(scores.begin(), std::max_element(scores.begin(), scores.end())); 243 | (nonterm_voc.convert(max_id) == tree->label ? good : bad)++; 244 | } 245 | cout << "acc=" << good/float(good+bad) << ", time=" << all_time << ", sent_per_sec=" << i/all_time << endl; 246 | if(all_time > TIMEOUT) 247 | exit(0); 248 | } 249 | } 250 | -------------------------------------------------------------------------------- /dynet-py/bilstm-tagger-withchar.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | import random 7 | import sys 8 | import argparse 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--dynet-seed", default=0, type=int) 15 | parser.add_argument("--dynet-gpus", default=0, type=int) 16 | parser.add_argument("--dynet-mem", default=512, type=int) 17 | parser.add_argument('CEMBED_SIZE', type=int, help='char embedding size') 18 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 19 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 20 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | # format of files: each line is "word1|tag2 word2|tag2 ..." 26 | train_file="data/tags/train.txt" 27 | dev_file="data/tags/dev.txt" 28 | 29 | class Vocab: 30 | def __init__(self, w2i=None): 31 | if w2i is None: w2i = defaultdict(lambda: len(w2i)) 32 | self.w2i = dict(w2i) 33 | self.i2w = {i:w for w,i in w2i.items()} 34 | @classmethod 35 | def from_corpus(cls, corpus): 36 | w2i = defaultdict(lambda: len(w2i)) 37 | for sent in corpus: 38 | [w2i[word] for word in sent] 39 | return Vocab(w2i) 40 | 41 | def size(self): return len(self.w2i.keys()) 42 | 43 | def read(fname): 44 | """ 45 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 46 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
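    e.g. the line "the|DT dog|NN barks|VBZ" yields [("the", "DT"), ("dog", "NN"), ("barks", "VBZ")]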
47 | """ 48 | with open(fname, "r") as fh: 49 | for line in fh: 50 | line = line.strip().split() 51 | sent = [tuple(x.rsplit("|",1)) for x in line] 52 | yield sent 53 | 54 | train=list(read(train_file)) 55 | dev=list(read(dev_file)) 56 | words=[] 57 | tags=[] 58 | chars=set() 59 | wc=Counter() 60 | for sent in train: 61 | for w,p in sent: 62 | words.append(w) 63 | tags.append(p) 64 | chars.update(w) 65 | wc[w]+=1 66 | words.append("_UNK_") 67 | chars.add("<*>") 68 | chars.add("_UNK_") 69 | 70 | vw = Vocab.from_corpus([words]) 71 | vt = Vocab.from_corpus([tags]) 72 | vc = Vocab.from_corpus([chars]) 73 | UNK = vw.w2i["_UNK_"] 74 | CUNK = vc.w2i["_UNK_"] 75 | 76 | nwords = vw.size() 77 | ntags = vt.size() 78 | nchars = vc.size() 79 | print ("nwords=%r, ntags=%r, nchars=%r" % (nwords, ntags, nchars)) 80 | 81 | # DyNet Starts 82 | 83 | model = dy.Model() 84 | trainer = dy.AdamTrainer(model) 85 | trainer.set_clip_threshold(-1.0) 86 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 87 | 88 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, args.WEMBED_SIZE)) 89 | CHARS_LOOKUP = model.add_lookup_parameters((nchars, args.CEMBED_SIZE)) 90 | 91 | # MLP on top of biLSTM outputs 100 -> 32 -> ntags 92 | pH = model.add_parameters((args.MLP_SIZE, args.HIDDEN_SIZE*2)) 93 | pO = model.add_parameters((ntags, args.MLP_SIZE)) 94 | 95 | # word-level LSTMs 96 | fwdRNN = dy.VanillaLSTMBuilder(1, args.WEMBED_SIZE, args.HIDDEN_SIZE, model) # layers, in-dim, out-dim, model 97 | bwdRNN = dy.VanillaLSTMBuilder(1, args.WEMBED_SIZE, args.HIDDEN_SIZE, model) 98 | 99 | # char-level LSTMs 100 | cFwdRNN = dy.VanillaLSTMBuilder(1, args.CEMBED_SIZE, args.WEMBED_SIZE/2, model) 101 | cBwdRNN = dy.VanillaLSTMBuilder(1, args.CEMBED_SIZE, args.WEMBED_SIZE/2, model) 102 | 103 | def word_rep(w, cf_init, cb_init): 104 | if wc[w] > 5: 105 | w_index = vw.w2i[w] 106 | return WORDS_LOOKUP[w_index] 107 | else: 108 | pad_char = vc.w2i["<*>"] 109 | char_ids = [pad_char] + [vc.w2i.get(c,CUNK) for c in w] + [pad_char] 110 | char_embs = [CHARS_LOOKUP[cid] for cid in char_ids] 111 | fw_exps = cf_init.transduce(char_embs) 112 | bw_exps = cb_init.transduce(reversed(char_embs)) 113 | return dy.concatenate([ fw_exps[-1], bw_exps[-1] ]) 114 | 115 | def build_tagging_graph(words): 116 | dy.renew_cg() 117 | # parameters -> expressions 118 | H = dy.parameter(pH) 119 | O = dy.parameter(pO) 120 | 121 | # initialize the RNNs 122 | f_init = fwdRNN.initial_state() 123 | b_init = bwdRNN.initial_state() 124 | 125 | cf_init = cFwdRNN.initial_state() 126 | cb_init = cBwdRNN.initial_state() 127 | 128 | # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
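    # Frequent words (training count > 5) use the word lookup table directly; rarer words
    # fall back to a character-level biLSTM over the padded character sequence in word_rep(...).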
129 | wembs = [word_rep(w, cf_init, cb_init) for w in words] 130 | 131 | # feed word vectors into biLSTM 132 | fw_exps = f_init.transduce(wembs) 133 | bw_exps = b_init.transduce(reversed(wembs)) 134 | 135 | # biLSTM states 136 | bi_exps = [dy.concatenate([f,b]) for f,b in zip(fw_exps, reversed(bw_exps))] 137 | 138 | # feed each biLSTM state to an MLP 139 | exps = [] 140 | for x in bi_exps: 141 | r_t = O*(dy.tanh(H * x)) 142 | exps.append(r_t) 143 | 144 | return exps 145 | 146 | def sent_loss_precalc(words, tags, vecs): 147 | errs = [] 148 | for v,t in zip(vecs,tags): 149 | tid = vt.w2i[t] 150 | err = dy.pickneglogsoftmax(v, tid) 151 | errs.append(err) 152 | return dy.esum(errs) 153 | 154 | def sent_loss(words, tags): 155 | return sent_loss_precalc(words, tags, build_tagging_graph(words)) 156 | 157 | def tag_sent_precalc(words, vecs): 158 | log_probs = [v.npvalue() for v in vecs] 159 | tags = [] 160 | for prb in log_probs: 161 | tag = np.argmax(prb) 162 | tags.append(vt.i2w[tag]) 163 | return zip(words, tags) 164 | 165 | def tag_sent(words): 166 | return tag_sent_precalc(words, build_tagging_graph(words)) 167 | 168 | print ("startup time: %r" % (time.time() - start)) 169 | start = time.time() 170 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 171 | for ITER in range(100): 172 | random.shuffle(train) 173 | for s in train: 174 | i += 1 175 | if i % 500 == 0: # print status 176 | trainer.status() 177 | print (this_loss / this_tagged, file=sys.stderr) 178 | all_tagged += this_tagged 179 | this_loss = this_tagged = 0 180 | all_time = time.time() - start 181 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 182 | dev_start = time.time() 183 | good_sent = bad_sent = good = bad = 0.0 184 | for sent in dev: 185 | words = [w for w,t in sent] 186 | golds = [t for w,t in sent] 187 | tags = [t for w,t in tag_sent(words)] 188 | if tags == golds: good_sent += 1 189 | else: bad_sent += 1 190 | for go,gu in zip(golds,tags): 191 | if go == gu: good += 1 192 | else: bad += 1 193 | dev_time += time.time() - dev_start 194 | train_time = time.time() - start - dev_time 195 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 196 | if all_time > args.TIMEOUT: 197 | sys.exit(0) 198 | # train on sent 199 | words = [w for w,t in s] 200 | golds = [t for w,t in s] 201 | 202 | loss_exp = sent_loss(words, golds) 203 | this_loss += loss_exp.scalar_value() 204 | this_tagged += len(golds) 205 | loss_exp.backward() 206 | trainer.update() 207 | print ("epoch %r finished" % ITER) 208 | trainer.update_epoch(1.0) 209 | -------------------------------------------------------------------------------- /dynet-py/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | import random 7 | import sys 8 | import argparse 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--dynet-seed", default=0, type=int) 15 | parser.add_argument("--dynet-gpus", default=0, type=int) 16 | parser.add_argument("--dynet-mem", default=512, type=int) 17 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 18 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 19 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 20 | 
parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 21 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 22 | args = parser.parse_args() 23 | 24 | # format of files: each line is "word1|tag2 word2|tag2 ..." 25 | train_file="data/tags/train.txt" 26 | dev_file="data/tags/dev.txt" 27 | 28 | class Vocab: 29 | def __init__(self, w2i=None): 30 | if w2i is None: w2i = defaultdict(lambda: len(w2i)) 31 | self.w2i = dict(w2i) 32 | self.i2w = {i:w for w,i in w2i.items()} 33 | @classmethod 34 | def from_corpus(cls, corpus): 35 | w2i = defaultdict(lambda: len(w2i)) 36 | for sent in corpus: 37 | [w2i[word] for word in sent] 38 | return Vocab(w2i) 39 | 40 | def size(self): return len(self.w2i.keys()) 41 | 42 | def read(fname): 43 | """ 44 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 45 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 46 | """ 47 | with open(fname, "r") as fh: 48 | for line in fh: 49 | line = line.strip().split() 50 | sent = [tuple(x.rsplit("|",1)) for x in line] 51 | yield sent 52 | 53 | train=list(read(train_file)) 54 | dev=list(read(dev_file)) 55 | words=[] 56 | tags=[] 57 | wc=Counter() 58 | for sent in train: 59 | for w,p in sent: 60 | words.append(w) 61 | tags.append(p) 62 | wc[w]+=1 63 | words.append("_UNK_") 64 | 65 | vw = Vocab.from_corpus([words]) 66 | vt = Vocab.from_corpus([tags]) 67 | UNK = vw.w2i["_UNK_"] 68 | 69 | nwords = vw.size() 70 | ntags = vt.size() 71 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 72 | 73 | # DyNet Starts 74 | 75 | model = dy.Model() 76 | trainer = dy.AdamTrainer(model) 77 | trainer.set_clip_threshold(-1.0) 78 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 79 | 80 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, args.WEMBED_SIZE)) 81 | 82 | # MLP on top of biLSTM outputs 100 -> 32 -> ntags 83 | pH = model.add_parameters((args.MLP_SIZE, args.HIDDEN_SIZE*2)) 84 | pO = model.add_parameters((ntags, args.MLP_SIZE)) 85 | 86 | # word-level LSTMs 87 | fwdRNN = dy.VanillaLSTMBuilder(1, args.WEMBED_SIZE, args.HIDDEN_SIZE, model) # layers, in-dim, out-dim, model 88 | bwdRNN = dy.VanillaLSTMBuilder(1, args.WEMBED_SIZE, args.HIDDEN_SIZE, model) 89 | 90 | def word_rep(w): 91 | widx = vw.w2i[w] if wc[w] > 5 else UNK 92 | return WORDS_LOOKUP[widx] 93 | 94 | def build_tagging_graph(words): 95 | dy.renew_cg() 96 | # parameters -> expressions 97 | H = dy.parameter(pH) 98 | O = dy.parameter(pO) 99 | 100 | # initialize the RNNs 101 | f_init = fwdRNN.initial_state() 102 | b_init = bwdRNN.initial_state() 103 | 104 | # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
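    # Words seen 5 times or fewer in training are mapped to the _UNK_ embedding by word_rep(...).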
105 | wembs = [] 106 | for i, w in enumerate(words): 107 | wembs.append(word_rep(w)) 108 | 109 | # feed word vectors into biLSTM 110 | fw_exps = f_init.transduce(wembs) 111 | bw_exps = b_init.transduce(reversed(wembs)) 112 | 113 | # biLSTM states 114 | bi_exps = [dy.concatenate([f,b]) for f,b in zip(fw_exps, reversed(bw_exps))] 115 | 116 | # feed each biLSTM state to an MLP 117 | exps = [] 118 | for x in bi_exps: 119 | r_t = O*(dy.tanh(H * x)) 120 | exps.append(r_t) 121 | 122 | return exps 123 | 124 | def sent_loss_precalc(words, tags, vecs): 125 | errs = [] 126 | for v,t in zip(vecs,tags): 127 | tid = vt.w2i[t] 128 | err = dy.pickneglogsoftmax(v, tid) 129 | errs.append(err) 130 | return dy.esum(errs) 131 | 132 | def sent_loss(words, tags): 133 | return sent_loss_precalc(words, tags, build_tagging_graph(words)) 134 | 135 | def tag_sent_precalc(words, vecs): 136 | log_probs = [v.npvalue() for v in vecs] 137 | tags = [] 138 | for prb in log_probs: 139 | tag = np.argmax(prb) 140 | tags.append(vt.i2w[tag]) 141 | return zip(words, tags) 142 | 143 | def tag_sent(words): 144 | return tag_sent_precalc(words, build_tagging_graph(words)) 145 | 146 | print ("startup time: %r" % (time.time() - start)) 147 | start = time.time() 148 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 149 | for ITER in range(100): 150 | random.shuffle(train) 151 | for s in train: 152 | i += 1 153 | if i % 500 == 0: # print status 154 | trainer.status() 155 | print(this_loss / this_tagged, file=sys.stderr) 156 | all_tagged += this_tagged 157 | this_loss = this_tagged = 0 158 | all_time = time.time() - start 159 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 160 | dev_start = time.time() 161 | good_sent = bad_sent = good = bad = 0.0 162 | for sent in dev: 163 | words = [w for w,t in sent] 164 | golds = [t for w,t in sent] 165 | tags = [t for w,t in tag_sent(words)] 166 | if tags == golds: good_sent += 1 167 | else: bad_sent += 1 168 | for go,gu in zip(golds,tags): 169 | if go == gu: good += 1 170 | else: bad += 1 171 | dev_time += time.time() - dev_start 172 | train_time = time.time() - start - dev_time 173 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 174 | if all_time > args.TIMEOUT: 175 | sys.exit(0) 176 | # train on sent 177 | words = [w for w,t in s] 178 | golds = [t for w,t in s] 179 | 180 | loss_exp = sent_loss(words, golds) 181 | my_loss = loss_exp.scalar_value() 182 | this_loss += my_loss; 183 | this_tagged += len(golds) 184 | loss_exp.backward() 185 | trainer.update() 186 | print("epoch %r finished" % ITER) 187 | trainer.update_epoch(1.0) 188 | -------------------------------------------------------------------------------- /dynet-py/bow.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import defaultdict 6 | import random 7 | import dynet as dy 8 | import numpy as np 9 | import sys 10 | 11 | # Functions to read in the corpus 12 | w2i = defaultdict(lambda: len(w2i)) 13 | t2i = defaultdict(lambda: len(t2i)) 14 | UNK = w2i[""] 15 | def read_dataset(filename): 16 | with open(filename, "r") as f: 17 | for line in f: 18 | tag, words = line.lower().strip().split(" ||| ") 19 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 20 | 21 | # Read in the data 22 | train = list(read_dataset("data/classes/train.txt")) 23 | w2i = 
defaultdict(lambda: UNK, w2i) 24 | dev = list(read_dataset("data/classes/test.txt")) 25 | nwords = len(w2i) 26 | ntags = len(t2i) 27 | 28 | # Start DyNet and define trainer 29 | model = dy.Model() 30 | trainer = dy.AdamTrainer(model, 0.001) 31 | trainer.set_clip_threshold(-1.0) 32 | trainer.set_sparse_updates(False) 33 | 34 | # Define the model 35 | W_sm = model.add_lookup_parameters((nwords, ntags), dy.ConstInitializer(0.0)) # Word weights 36 | b_sm = model.add_parameters((ntags), dy.ConstInitializer(0.0)) # Softmax bias 37 | 38 | # A function to calculate scores for one value 39 | def calc_scores(words): 40 | dy.renew_cg() 41 | score = dy.esum([dy.lookup(W_sm, x) for x in words]) 42 | b_sm_exp = dy.parameter(b_sm) 43 | return score + b_sm_exp 44 | 45 | print ("startup time: %r" % (time.time() - start)) 46 | for ITER in range(100): 47 | # Perform training 48 | # random.shuffle(train) 49 | train_loss = 0.0 50 | start = time.time() 51 | for i, (words, tag) in enumerate(train): 52 | scores = calc_scores(words) 53 | my_loss = dy.pickneglogsoftmax(scores, tag) 54 | train_loss += my_loss.value() 55 | my_loss.backward() 56 | trainer.update() 57 | # print(b_sm.as_array()) 58 | # if i > 5: 59 | # sys.exit(0) 60 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 61 | # Perform testing 62 | test_correct = 0.0 63 | for words, tag in dev: 64 | scores = calc_scores(words).npvalue() 65 | predict = np.argmax(scores) 66 | if predict == tag: 67 | test_correct += 1 68 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 69 | -------------------------------------------------------------------------------- /dynet-py/rnnlm-batch-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | import random 7 | import math 8 | import sys 9 | import argparse 10 | 11 | import dynet as dy 12 | import numpy as np 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--dynet-seed", default=0, type=int) 16 | parser.add_argument("--dynet-mem", default=512, type=int) 17 | parser.add_argument("--dynet-gpus", default=0, type=int) 18 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 19 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | BATCHED_SENT = True 26 | 27 | def split_cols(matrix): 28 | total, rows, cols, bt = matrix.dim() 29 | assert bt == 1 30 | return [dy.reshape(dy.select_cols(matrix, [i]), (rows,), batch_size=bt) for i in xrange(cols)] 31 | 32 | def batch(cols): 33 | total,nrows,ncols,nbatch=cols[0].dim() 34 | assert(nbatch==1)# doesn't currently work with batched 35 | cols_ = dy.concatenate_cols(cols) 36 | return dy.reshape(cols_, (nrows,ncols), batch_size=len(cols)) 37 | 38 | def unbatch(B): 39 | d = B.dim() 40 | return split_cols(dy.reshape(B,(d[1],d[3]), batch_size=1)) 41 | 42 | # format of files: each line is "word1/tag2 word2/tag2 ..." 43 | train_file="data/text/train.txt" 44 | test_file="data/text/dev.txt" 45 | 46 | w2i = defaultdict(lambda: len(w2i)) 47 | 48 | def read(fname): 49 | """ 50 | Read a file where each line is of the form "word1 word2 ..." 
51 | Yields lists of the form [word1, word2, ...] 52 | """ 53 | with open(fname, "r") as fh: 54 | for line in fh: 55 | sent = [w2i[x] for x in line.strip().split()] 56 | sent.append(w2i[""]) 57 | yield sent 58 | 59 | train=list(read(train_file)) 60 | nwords = len(w2i) 61 | test=list(read(test_file)) 62 | S = w2i[""] 63 | assert(nwords == len(w2i)) 64 | 65 | # DyNet Starts 66 | 67 | model = dy.Model() 68 | trainer = dy.AdamTrainer(model) 69 | trainer.set_clip_threshold(-1.0) 70 | #trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 71 | 72 | # Lookup parameters for word embeddings 73 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, args.EMBED_SIZE)) 74 | 75 | # Word-level LSTM (layers=1, input=64, output=128, model) 76 | RNN = dy.VanillaLSTMBuilder(1, args.EMBED_SIZE, args.HIDDEN_SIZE, model) 77 | 78 | # Softmax weights/biases on top of LSTM outputs 79 | W_sm = model.add_parameters((nwords, args.HIDDEN_SIZE)) 80 | b_sm = model.add_parameters(nwords) 81 | 82 | # Build the language model graph 83 | def calc_lm_loss(sents): 84 | 85 | dy.renew_cg() 86 | # parameters -> expressions 87 | W_exp = dy.parameter(W_sm) 88 | b_exp = dy.parameter(b_sm) 89 | 90 | # initialize the RNN 91 | f_init = RNN.initial_state() 92 | 93 | # get the wids and masks for each step 94 | tot_words = 0 95 | wids = [] 96 | masks = [] 97 | for i in range(len(sents[0])): 98 | wids.append([ 99 | (sent[i] if len(sent)>i else S) for sent in sents]) 100 | mask = [(1 if len(sent)>i else 0) for sent in sents] 101 | masks.append(mask) 102 | tot_words += sum(mask) 103 | 104 | # start the rnn by inputting "" 105 | init_ids = [S] * len(sents) 106 | sequence = [init_ids] 107 | sequence.extend(wids[:-1]) # no need to enter the last element, which is EOS or equiv. 108 | outputs = f_init.transduce([dy.lookup_batch(WORDS_LOOKUP, x) for x in sequence]) 109 | if BATCHED_SENT: 110 | scores = unbatch(W_exp*batch(outputs)+b_exp) 111 | else: 112 | scores = [b_exp+(W_exp * o) for o in outputs] 113 | assert(len(scores)==len(wids)) 114 | losses = [dy.pickneglogsoftmax_batch(score, wid) for (score,wid) in zip(scores,wids)] 115 | 116 | for i,mask in enumerate(masks): 117 | if mask[-1] != 1: 118 | mask_expr = dy.inputVector(mask) 119 | mask_expr = dy.reshape(mask_expr, (1,), len(sents)) 120 | losses[i] = losses[i] * mask_expr 121 | 122 | return dy.sum_batches(dy.esum(losses)), tot_words 123 | 124 | # Sort training sentences in descending order and count minibatches 125 | train.sort(key=lambda x: -len(x)) 126 | test.sort(key=lambda x: -len(x)) 127 | train_order = [x*args.MB_SIZE for x in range(int((len(train)-1)/args.MB_SIZE + 1))] 128 | test_order = [x*args.MB_SIZE for x in range(int((len(test)-1)/args.MB_SIZE + 1))] 129 | 130 | print ("startup time: %r" % (time.time() - start)) 131 | # Perform training 132 | start = time.time() 133 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 134 | for ITER in range(100): 135 | random.shuffle(train_order) 136 | for sid in train_order: 137 | i += 1 138 | if i % int(500/args.MB_SIZE) == 0: 139 | trainer.status() 140 | print (this_loss / this_words, file=sys.stderr) 141 | all_tagged += this_words 142 | this_loss = this_words = 0 143 | all_time = time.time() - start 144 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 145 | dev_start = time.time() 146 | dev_loss = dev_words = 0 147 | for sid in test_order: 148 | loss_exp, mb_words = calc_lm_loss(test[sid:sid+args.MB_SIZE]) 149 | dev_loss += loss_exp.scalar_value() 150 | dev_words += mb_words 151 | dev_time += 
time.time() - dev_start 152 | train_time = time.time() - start - dev_time 153 | print ("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % (dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words, train_time, all_tagged/train_time)) 154 | if all_time > args.TIMEOUT: 155 | sys.exit(0) 156 | # train on the minibatch 157 | loss_exp, mb_words = calc_lm_loss(train[sid:sid+args.MB_SIZE]) 158 | this_loss += loss_exp.scalar_value() 159 | # print("loss @ %r: %r" % (i, this_loss)) 160 | this_words += mb_words 161 | loss_exp.backward() 162 | trainer.update() 163 | print ("epoch %r finished" % ITER) 164 | trainer.update_epoch(1.0) 165 | -------------------------------------------------------------------------------- /dynet-py/rnnlm-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | import random 7 | import math 8 | import sys 9 | import argparse 10 | 11 | import dynet as dy 12 | import numpy as np 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--dynet-seed", default=0, type=int) 16 | parser.add_argument("--dynet-mem", default=512, type=int) 17 | parser.add_argument("--dynet-gpus", default=0, type=int) 18 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 19 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | # format of files: each line is "word1/tag2 word2/tag2 ..." 26 | train_file="data/text/train.txt" 27 | test_file="data/text/dev.txt" 28 | 29 | w2i = defaultdict(lambda: len(w2i)) 30 | 31 | def read(fname): 32 | """ 33 | Read a file where each line is of the form "word1 word2 ..." 34 | Yields lists of the form [word1, word2, ...] 
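    Each word is mapped to an integer id via the module-level w2i dictionary, and a
    sentence-final symbol id is appended to every sentence.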
35 | """ 36 | with open(fname, "r") as fh: 37 | for line in fh: 38 | sent = [w2i[x] for x in line.strip().split()] 39 | sent.append(w2i[""]) 40 | yield sent 41 | 42 | train=list(read(train_file)) 43 | nwords = len(w2i) 44 | test=list(read(test_file)) 45 | S = w2i[""] 46 | assert(nwords == len(w2i)) 47 | 48 | # DyNet Starts 49 | 50 | model = dy.Model() 51 | trainer = dy.AdamTrainer(model) 52 | trainer.set_clip_threshold(-1.0) 53 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 54 | 55 | # Lookup parameters for word embeddings 56 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, args.EMBED_SIZE)) 57 | 58 | # Word-level LSTM (layers=1, input=64, output=128, model) 59 | RNN = dy.VanillaLSTMBuilder(1, args.EMBED_SIZE, args.HIDDEN_SIZE, model) 60 | 61 | # Softmax weights/biases on top of LSTM outputs 62 | W_sm = model.add_parameters((nwords, args.HIDDEN_SIZE)) 63 | b_sm = model.add_parameters(nwords) 64 | 65 | # Build the language model graph 66 | def calc_lm_loss(sents): 67 | 68 | dy.renew_cg() 69 | # parameters -> expressions 70 | W_exp = dy.parameter(W_sm) 71 | b_exp = dy.parameter(b_sm) 72 | 73 | # initialize the RNN 74 | f_init = RNN.initial_state() 75 | 76 | # get the wids and masks for each step 77 | tot_words = 0 78 | wids = [] 79 | masks = [] 80 | for i in range(len(sents[0])): 81 | wids.append([ 82 | (sent[i] if len(sent)>i else S) for sent in sents]) 83 | mask = [(1 if len(sent)>i else 0) for sent in sents] 84 | masks.append(mask) 85 | tot_words += sum(mask) 86 | 87 | # start the rnn by inputting "" 88 | init_ids = [S] * len(sents) 89 | s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP,init_ids)) 90 | 91 | # feed word vectors into the RNN and predict the next word 92 | losses = [] 93 | for wid, mask in zip(wids, masks): 94 | # calculate the softmax and loss 95 | score = dy.affine_transform([b_exp, W_exp, s.output()]) 96 | loss = dy.pickneglogsoftmax_batch(score, wid) 97 | # mask the loss if at least one sentence is shorter 98 | if mask[-1] != 1: 99 | mask_expr = dy.inputVector(mask) 100 | mask_expr = dy.reshape(mask_expr, (1,), len(sents)) 101 | loss = loss * mask_expr 102 | losses.append(loss) 103 | # update the state of the RNN 104 | wemb = dy.lookup_batch(WORDS_LOOKUP, wid) 105 | s = s.add_input(wemb) 106 | 107 | return dy.sum_batches(dy.esum(losses)), tot_words 108 | 109 | # Sort training sentences in descending order and count minibatches 110 | train.sort(key=lambda x: -len(x)) 111 | test.sort(key=lambda x: -len(x)) 112 | train_order = [x*args.MB_SIZE for x in range(int((len(train)-1)/args.MB_SIZE + 1))] 113 | test_order = [x*args.MB_SIZE for x in range(int((len(test)-1)/args.MB_SIZE + 1))] 114 | 115 | print ("startup time: %r" % (time.time() - start)) 116 | # Perform training 117 | start = time.time() 118 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 119 | for ITER in range(100): 120 | random.shuffle(train_order) 121 | for sid in train_order: 122 | i += 1 123 | if i % int(500/args.MB_SIZE) == 0: 124 | trainer.status() 125 | print (this_loss / this_words, file=sys.stderr) 126 | all_tagged += this_words 127 | this_loss = this_words = 0 128 | all_time = time.time() - start 129 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 130 | dev_start = time.time() 131 | dev_loss = dev_words = 0 132 | for sid in test_order: 133 | loss_exp, mb_words = calc_lm_loss(test[sid:sid+args.MB_SIZE]) 134 | dev_loss += loss_exp.scalar_value() 135 | dev_words += mb_words 136 | dev_time += time.time() - dev_start 137 | train_time = time.time() 
- start - dev_time 138 | print ("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % (dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words, train_time, all_tagged/train_time)) 139 | if all_time > args.TIMEOUT: 140 | sys.exit(0) 141 | # train on the minibatch 142 | loss_exp, mb_words = calc_lm_loss(train[sid:sid+args.MB_SIZE]) 143 | this_loss += loss_exp.scalar_value() 144 | # print("loss @ %r: %r" % (i, this_loss)) 145 | this_words += mb_words 146 | loss_exp.backward() 147 | trainer.update() 148 | print ("epoch %r finished" % ITER) 149 | trainer.update_epoch(1.0) 150 | -------------------------------------------------------------------------------- /dynet-py/treenn-bulk.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | import re 6 | import codecs 7 | import sys 8 | from collections import Counter 9 | import random 10 | import argparse 11 | 12 | import numpy as np 13 | import dynet as dy 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--dynet-seed", default=0, type=int) 17 | parser.add_argument("--dynet-mem", default=512, type=int) 18 | parser.add_argument("--dynet-gpus", default=0, type=int) 19 | parser.add_argument("--dynet-exp", default=1, type=int) 20 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 21 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 22 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 23 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 24 | args = parser.parse_args() 25 | 26 | def _tokenize_sexpr(s): 27 | tokker = re.compile(r" +|[()]|[^ ()]+") 28 | toks = [t for t in [match.group(0) for match in tokker.finditer(s)] if t[0] != " "] 29 | return toks 30 | 31 | def _within_bracket(toks): 32 | label = next(toks) 33 | children = [] 34 | for tok in toks: 35 | if tok == "(": 36 | children.append(_within_bracket(toks)) 37 | elif tok == ")": 38 | return Tree(label, children) 39 | else: children.append(Tree(tok, None)) 40 | assert(False),list(toks) 41 | 42 | class Tree(object): 43 | def __init__(self, label, children=None): 44 | self.label = label 45 | self.children = children 46 | 47 | @staticmethod 48 | def from_sexpr(string): 49 | toks = iter(_tokenize_sexpr(string)) 50 | assert next(toks) == "(" 51 | return _within_bracket(toks) 52 | 53 | def __str__(self): 54 | if self.children is None: return self.label 55 | return "[%s %s]" % (self.label, " ".join([str(c) for c in self.children])) 56 | 57 | def isleaf(self): return self.children==None 58 | 59 | def leaves_iter(self): 60 | if self.isleaf(): 61 | yield self 62 | else: 63 | for c in self.children: 64 | for l in c.leaves_iter(): yield l 65 | 66 | def leaves(self): return list(self.leaves_iter()) 67 | 68 | def nonterms_iter(self): 69 | if not self.isleaf(): 70 | yield self 71 | for c in self.children: 72 | for n in c.nonterms_iter(): yield n 73 | 74 | def nonterms(self): return list(self.nonterms_iter()) 75 | 76 | def read_dataset(filename): 77 | return [Tree.from_sexpr(line.strip()) for line in codecs.open(filename,"r")] 78 | 79 | def get_vocabs(trees): 80 | label_vocab = Counter() 81 | word_vocab = Counter() 82 | for tree in trees: 83 | label_vocab.update([n.label for n in tree.nonterms()]) 84 | word_vocab.update([l.label for l in tree.leaves()]) 85 | labels = [x for x,c in label_vocab.items() if c > 0] 86 | words = ["_UNK_"] + [x for x,c in word_vocab.items() if c > 0] 87 | l2i = {l:i 
for i,l in enumerate(labels)} 88 | w2i = {w:i for i,w in enumerate(words)} 89 | return l2i, w2i, labels, words 90 | 91 | class TreeLSTMBuilder(object): 92 | def __init__(self, model, word_vocab, wdim, hdim): 93 | self.WS = [model.add_parameters((hdim, wdim)) for _ in "iou"] 94 | self.US = [model.add_parameters((hdim, 2*hdim)) for _ in "iou"] 95 | self.UFS =[model.add_parameters((hdim, hdim)) for _ in "ff"] 96 | self.BS = [model.add_parameters(hdim) for _ in "iouf"] 97 | self.E = model.add_lookup_parameters((len(word_vocab),wdim)) 98 | self.w2i = word_vocab 99 | 100 | def expr_for_tree(self, tree, decorate=False): 101 | assert(not tree.isleaf()) 102 | if len(tree.children) == 1: 103 | assert(tree.children[0].isleaf()) 104 | emb = self.E[self.w2i.get(tree.children[0].label,0)] 105 | Wi,Wo,Wu = [dy.parameter(w) for w in self.WS] 106 | bi,bo,bu,_ = [dy.parameter(b) for b in self.BS] 107 | #i = dy.logistic(dy.affine_transform([bi, Wi, emb])) 108 | #o = dy.logistic(dy.affine_transform([bo, Wo, emb])) 109 | #u = dy.tanh( dy.affine_transform([bu, Wu, emb])) 110 | i = dy.logistic(bi+Wi*emb) 111 | o = dy.logistic(bo+Wo*emb) 112 | u = dy.tanh( bu+Wu*emb) 113 | c = dy.cmult(i,u) 114 | h = dy.cmult(o,dy.tanh(c)) 115 | if decorate: tree._e = h 116 | return h, c 117 | assert(len(tree.children) == 2),tree.children[0] 118 | e1, c1 = self.expr_for_tree(tree.children[0], decorate) 119 | e2, c2 = self.expr_for_tree(tree.children[1], decorate) 120 | Ui,Uo,Uu = [dy.parameter(u) for u in self.US] 121 | Uf1,Uf2 = [dy.parameter(u) for u in self.UFS] 122 | bi,bo,bu,bf = [dy.parameter(b) for b in self.BS] 123 | e = dy.concatenate([e1,e2]) 124 | i = dy.logistic(bi+Ui*e) 125 | o = dy.logistic(bi+Uo*e) 126 | f1 = dy.logistic(bf+Uf1*e1) 127 | f2 = dy.logistic(bf+Uf2*e2) 128 | u = dy.tanh( bu+Uu*e) 129 | c = dy.cmult(i,u) + dy.cmult(f1,c1) + dy.cmult(f2,c2) 130 | h = dy.cmult(o,dy.tanh(c)) 131 | if decorate: tree._e = h 132 | return h, c 133 | 134 | train = read_dataset("data/trees/train.txt") 135 | dev = read_dataset("data/trees/dev.txt") 136 | 137 | l2i, w2i, i2l, i2w = get_vocabs(train) 138 | 139 | model = dy.Model() 140 | builder = TreeLSTMBuilder(model, w2i, args.WEMBED_SIZE, args.HIDDEN_SIZE) 141 | W_ = model.add_parameters((len(l2i), args.HIDDEN_SIZE)) 142 | trainer = dy.AdamTrainer(model) 143 | trainer.set_clip_threshold(-1.0) 144 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 145 | 146 | print ("startup time: %r" % (time.time() - start)) 147 | sents = 0 148 | all_time = 0 149 | for ITER in range(100): 150 | random.shuffle(train) 151 | closs = 0.0 152 | cwords = 0 153 | start = time.time() 154 | batch = [] 155 | for i,tree in enumerate(train,1): 156 | sents += 1 157 | W = dy.parameter(W_) 158 | h, c = builder.expr_for_tree(tree,True) 159 | nodes = tree.nonterms() 160 | losses = [dy.pickneglogsoftmax(W*nt._e,l2i[nt.label]) for nt in nodes] 161 | loss = dy.esum(losses) 162 | batch.append(loss) 163 | if len(batch) == 50: 164 | loss = dy.esum(batch) 165 | closs += loss.value() 166 | cwords += len(nodes) 167 | loss.backward() 168 | trainer.update() 169 | batch = [] 170 | dy.renew_cg() 171 | if sents % 1000 == 0: 172 | trainer.status() 173 | print (closs / cwords, file=sys.stderr) 174 | closs = 0.0 175 | cwords = 0 176 | all_time += time.time() - start 177 | trainer.update_epoch(1.0) 178 | good = bad = 0.0 179 | 180 | batch = [] 181 | dy.renew_cg() 182 | for tree in dev: 183 | W = dy.parameter(W_) 184 | h, c = builder.expr_for_tree(tree,False) 185 | scores = W*h 186 | batch.append(scores) 187 | if 
len(batch)==50: 188 | dy.esum(batch).forward() # TODO need nicer API for running the list w/o dummy op. 189 | for scores in batch: 190 | pred = i2l[np.argmax(scores.npvalue())] 191 | if pred == tree.label: good += 1 192 | else: bad += 1 193 | batch = [] 194 | dy.renew_cg() 195 | print ("acc=%.4f, time=%.4f, sent_per_sec=%.4f" % (good/(good+bad), all_time, sents/all_time)) 196 | if all_time > args.TIMEOUT: 197 | sys.exit(0) 198 | -------------------------------------------------------------------------------- /dynet-py/treenn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | import re 6 | import codecs 7 | import sys 8 | from collections import Counter 9 | import random 10 | import argparse 11 | 12 | import numpy as np 13 | import dynet as dy 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--dynet-seed", default=0, type=int) 17 | parser.add_argument("--dynet-mem", default=512, type=int) 18 | parser.add_argument("--dynet-gpus", default=0, type=int) 19 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | def _tokenize_sexpr(s): 26 | tokker = re.compile(r" +|[()]|[^ ()]+") 27 | toks = [t for t in [match.group(0) for match in tokker.finditer(s)] if t[0] != " "] 28 | return toks 29 | 30 | def _within_bracket(toks): 31 | label = next(toks) 32 | children = [] 33 | for tok in toks: 34 | if tok == "(": 35 | children.append(_within_bracket(toks)) 36 | elif tok == ")": 37 | return Tree(label, children) 38 | else: children.append(Tree(tok, None)) 39 | assert(False),list(toks) 40 | 41 | class Tree(object): 42 | def __init__(self, label, children=None): 43 | self.label = label 44 | self.children = children 45 | 46 | @staticmethod 47 | def from_sexpr(string): 48 | toks = iter(_tokenize_sexpr(string)) 49 | assert next(toks) == "(" 50 | return _within_bracket(toks) 51 | 52 | def __str__(self): 53 | if self.children is None: return self.label 54 | return "[%s %s]" % (self.label, " ".join([str(c) for c in self.children])) 55 | 56 | def isleaf(self): return self.children==None 57 | 58 | def leaves_iter(self): 59 | if self.isleaf(): 60 | yield self 61 | else: 62 | for c in self.children: 63 | for l in c.leaves_iter(): yield l 64 | 65 | def leaves(self): return list(self.leaves_iter()) 66 | 67 | def nonterms_iter(self): 68 | if not self.isleaf(): 69 | yield self 70 | for c in self.children: 71 | for n in c.nonterms_iter(): yield n 72 | 73 | def nonterms(self): return list(self.nonterms_iter()) 74 | 75 | def read_dataset(filename): 76 | return [Tree.from_sexpr(line.strip()) for line in codecs.open(filename,"r")] 77 | 78 | def get_vocabs(trees): 79 | label_vocab = Counter() 80 | word_vocab = Counter() 81 | for tree in trees: 82 | label_vocab.update([n.label for n in tree.nonterms()]) 83 | word_vocab.update([l.label for l in tree.leaves()]) 84 | labels = [x for x,c in label_vocab.items() if c > 0] 85 | words = ["_UNK_"] + [x for x,c in word_vocab.items() if c > 0] 86 | l2i = {l:i for i,l in enumerate(labels)} 87 | w2i = {w:i for i,w in enumerate(words)} 88 | return l2i, w2i, labels, words 89 | 90 | class TreeLSTMBuilder(object): 91 | def __init__(self, model, word_vocab, wdim, hdim): 92 | self.WS = 
[model.add_parameters((hdim, wdim)) for _ in "iou"] 93 | self.US = [model.add_parameters((hdim, 2*hdim)) for _ in "iou"] 94 | self.UFS =[model.add_parameters((hdim, hdim)) for _ in "ff"] 95 | self.BS = [model.add_parameters(hdim) for _ in "iouf"] 96 | self.E = model.add_lookup_parameters((len(word_vocab),wdim)) 97 | self.w2i = word_vocab 98 | 99 | def expr_for_tree(self, tree, decorate=False): 100 | assert(not tree.isleaf()) 101 | if len(tree.children) == 1: 102 | assert(tree.children[0].isleaf()) 103 | emb = self.E[self.w2i.get(tree.children[0].label,0)] 104 | Wi,Wo,Wu = [dy.parameter(w) for w in self.WS] 105 | bi,bo,bu,_ = [dy.parameter(b) for b in self.BS] 106 | i = dy.logistic(dy.affine_transform([bi, Wi, emb])) 107 | o = dy.logistic(dy.affine_transform([bo, Wo, emb])) 108 | u = dy.tanh( dy.affine_transform([bu, Wu, emb])) 109 | c = dy.cmult(i,u) 110 | h = dy.cmult(o,dy.tanh(c)) 111 | if decorate: tree._e = h 112 | return h, c 113 | assert(len(tree.children) == 2),tree.children[0] 114 | e1, c1 = self.expr_for_tree(tree.children[0], decorate) 115 | e2, c2 = self.expr_for_tree(tree.children[1], decorate) 116 | Ui,Uo,Uu = [dy.parameter(u) for u in self.US] 117 | Uf1,Uf2 = [dy.parameter(u) for u in self.UFS] 118 | bi,bo,bu,bf = [dy.parameter(b) for b in self.BS] 119 | e = dy.concatenate([e1,e2]) 120 | i = dy.logistic(dy.affine_transform([bi, Ui, e])) 121 | o = dy.logistic(dy.affine_transform([bo, Uo, e])) 122 | f1 = dy.logistic(dy.affine_transform([bf, Uf1, e1])) 123 | f2 = dy.logistic(dy.affine_transform([bf, Uf2, e2])) 124 | u = dy.tanh( dy.affine_transform([bu, Uu, e])) 125 | c = dy.cmult(i,u) + dy.cmult(f1,c1) + dy.cmult(f2,c2) 126 | h = dy.cmult(o,dy.tanh(c)) 127 | if decorate: tree._e = h 128 | return h, c 129 | 130 | train = read_dataset("data/trees/train.txt") 131 | dev = read_dataset("data/trees/dev.txt") 132 | 133 | l2i, w2i, i2l, i2w = get_vocabs(train) 134 | 135 | model = dy.Model() 136 | builder = TreeLSTMBuilder(model, w2i, args.WEMBED_SIZE, args.HIDDEN_SIZE) 137 | W_ = model.add_parameters((len(l2i), args.HIDDEN_SIZE)) 138 | trainer = dy.AdamTrainer(model) 139 | trainer.set_clip_threshold(-1.0) 140 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 141 | 142 | print ("startup time: %r" % (time.time() - start)) 143 | sents = 0 144 | all_time = 0 145 | for ITER in range(100): 146 | random.shuffle(train) 147 | closs = 0.0 148 | cwords = 0 149 | start = time.time() 150 | for i,tree in enumerate(train,1): 151 | sents += 1 152 | dy.renew_cg() 153 | W = dy.parameter(W_) 154 | h, c = builder.expr_for_tree(tree,True) 155 | nodes = tree.nonterms() 156 | losses = [dy.pickneglogsoftmax(W*nt._e,l2i[nt.label]) for nt in nodes] 157 | loss = dy.esum(losses) 158 | closs += loss.value() 159 | cwords += len(nodes) 160 | loss.backward() 161 | trainer.update() 162 | if sents % 1000 == 0: 163 | trainer.status() 164 | print (closs / cwords, file=sys.stderr) 165 | closs = 0.0 166 | cwords = 0 167 | all_time += time.time() - start 168 | trainer.update_epoch(1.0) 169 | good = bad = 0.0 170 | for tree in dev: 171 | dy.renew_cg() 172 | W = dy.parameter(W_) 173 | h, c = builder.expr_for_tree(tree,False) 174 | pred = i2l[np.argmax((W*h).npvalue())] 175 | if pred == tree.label: good += 1 176 | else: bad += 1 177 | print ("acc=%.4f, time=%.4f, sent_per_sec=%.4f" % (good/(good+bad), all_time, sents/all_time)) 178 | if all_time > args.TIMEOUT: 179 | sys.exit(0) 180 | -------------------------------------------------------------------------------- /make-report.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This should be used as 4 | # mkdir -p report 5 | # grep '\(per_sec\|startup\)' log/*/*.log | python make-report.py 6 | 7 | import sys 8 | import re 9 | from collections import defaultdict 10 | 11 | stats = defaultdict(lambda: {}) 12 | allstats = defaultdict(lambda: []) 13 | 14 | ##### Regexes 15 | fnameregex = re.compile(r"log/([a-z-]+?)(-gpu|)/(dynet-py|dynet-cpp|dynet-seq|chainer|theano|tensorflow)-(.*?)-t([123]).log:(.*)") 16 | startregex = re.compile(r"startup time: (.*)") 17 | eqregex = re.compile(r"(.*)=(.*)") 18 | commentregex = re.compile(r"^ *((#|//).*)?") 19 | 20 | ##### Various data 21 | canonicalize = { 22 | "word_per_sec": "speed", 23 | "words_per_sec": "speed", 24 | "sent_per_sec": "speed", 25 | "nll": "accuracy", 26 | "tag_acc": "accuracy", 27 | "acc": "accuracy", 28 | "time": "time" 29 | } 30 | taskna = { 31 | ("tensorflow", "bilstm-tagger-withchar"): 1, 32 | ("tensorflow", "treenn"): 1, 33 | ("theano", "treenn"): 1, 34 | ("dynet-seq", "bilstm-tagger"): 1, 35 | ("dynet-seq", "bilstm-tagger-withchar"): 1, 36 | ("dynet-seq", "treenn"): 1, 37 | } 38 | toolkits = ["dynet-cpp", "dynet-py", "chainer", "dynet-seq", "theano", "tensorflow"] 39 | prettyname = { 40 | "dynet-cpp": "DyC++", 41 | "dynet-py": "DyPy", 42 | "dynet-seq": "DyC++ Seq", 43 | "tensorflow":"TF", 44 | "chainer": "Chainer", 45 | "theano": "Theano" 46 | } 47 | 48 | ##### Load from log files 49 | for line in sys.stdin: 50 | line = line.replace("rnnlm-seq/dynet-cpp", "rnnlm-batch/dynet-seq") 51 | line = line.replace("rnnlm-seq-gpu/dynet-cpp", "rnnlm-batch-gpu/dynet-seq") 52 | m = re.search(fnameregex, line.strip()) 53 | if m: 54 | task = m.group(1) 55 | device = "gpu" if m.group(2) == "-gpu" else "cpu" 56 | toolkit = m.group(3) 57 | params = m.group(4) 58 | trial = int(m.group(5)) 59 | idtup = (task, device, toolkit, params, trial) 60 | data = m.group(6) 61 | m = re.search(startregex, data) 62 | if m: 63 | stats[idtup]["startup"] = float(m.group(1)) 64 | else: 65 | mystats = {} 66 | for val in data.split(", "): 67 | m = re.search(eqregex, val) 68 | if not m: 69 | print("unmatched line: %s" % line) 70 | sys.exit(1) 71 | if m.group(1) in canonicalize: 72 | can = canonicalize[m.group(1)] 73 | val = float(m.group(2)) 74 | mystats[can] = val 75 | if can == "accuracy": 76 | if "rnnlm" not in task: val *= 100 77 | else: val *= -1 78 | stats[idtup][can] = max(val, stats[idtup].get(can,-1e10)) 79 | else: 80 | stats[idtup][can] = val 81 | allstats[idtup].append(mystats) 82 | else: 83 | print("unmatched line: %s" % line) 84 | sys.exit(1) 85 | # print(stats) 86 | 87 | # def format_num(num): 88 | # if num > 1e6: 89 | # return "%.03gM" % (float(num)/1e6) 90 | # elif num > 1e3: 91 | # return "%.03gk" % (float(num)/1e3) 92 | # else: 93 | # return "%.03g" % float(num) 94 | 95 | # TODO: There must be a better way to do this... 
96 | def format_num(num): 97 | fnum = float(num) 98 | val = "%.03g" % fnum 99 | if fnum >= 1 and fnum < 10: 100 | val = "%.2f" % fnum 101 | elif fnum >= 10 and fnum < 100: 102 | val = "%.1f" % fnum 103 | elif float(num) > 1000: 104 | val = "%.f" % float(val) 105 | return val 106 | 107 | def getmaxstat(task, device, toolkit, setting, stat, mult=1): 108 | my_stats = [] 109 | for trial in range(1,4): 110 | my_id = (task, device, toolkit, setting, trial) 111 | if my_id in stats and stat in stats[my_id]: 112 | my_stats.append(mult*stats[my_id][stat]) 113 | return format_num(mult*max(my_stats)) if len(my_stats) > 0 else "TODO" 114 | def getminstat(task, device, toolkit, setting, stat): 115 | return getmaxstat(task, device ,toolkit, setting, stat, mult=-1) 116 | 117 | ###### First section: toolkit comparison 118 | 119 | # CPU/GPU speeds for all toolkits/tasks 120 | tasks = [ 121 | ("RNNLM (MB=1) ", "rnnlm-batch", "ms01-es128-hs256-sp0"), 122 | ("RNNLM (MB=4)", "rnnlm-batch", "ms04-es128-hs256-sp0"), 123 | ("RNNLM (MB=16)", "rnnlm-batch", "ms16-es128-hs256-sp0"), 124 | ("RNNLM (MB=64)", "rnnlm-batch", "ms64-es128-hs256-sp0"), 125 | ("BiLSTM Tag", "bilstm-tagger", "ws128-hs50-mlps32-su0"), 126 | ("BiLSTM Tag +sparse", "bilstm-tagger", "ws128-hs50-mlps32-su1"), 127 | ("BiLSTM Tag+Char", "bilstm-tagger-withchar", "cs20-ws128-hs50-mlps32-su0"), 128 | ("BiLSTM Tag+Char +sparse", "bilstm-tagger-withchar", "cs20-ws128-hs50-mlps32-su1"), 129 | ("TreeLSTM", "treenn", "ws128-hs128-su0"), 130 | ("TreeLSTM +sparse", "treenn", "ws128-hs128-su1"), 131 | ] 132 | def make_speed_table(device): 133 | print("\\begin{table}") 134 | print("\\begin{tabular}{c|rrr|rrr}") 135 | print(" & "+" & ".join([prettyname[x] for x in toolkits])+" \\\\ \hline") 136 | for name, task, setting in tasks: 137 | cols = [name] 138 | for i, toolkit in enumerate(toolkits): 139 | if (toolkit, task) in taskna: 140 | cols.append("\\multicolumn{1}{c}{-}") 141 | else: 142 | cols.append(getmaxstat(task, device, toolkit, setting, "speed")) 143 | print(" & ".join(cols)+" \\\\") 144 | print("\\end{tabular}") 145 | print("\\caption{Processing speed for each toolkit on %s. 
Speeds are measured in words/sec for RNNLM and Tagger and sentences/sec for TreeLSTM.}" % device.upper()) 146 | print("\\label{tab:speeds%s}" % device) 147 | print("\\end{table}") 148 | print("") 149 | make_speed_table("cpu") 150 | make_speed_table("gpu") 151 | 152 | # Startup time table 153 | tasks = [ 154 | ("RNNLM", "rnnlm-batch", "ms01-es128-hs256-sp0"), 155 | ("BiLSTM Tag", "bilstm-tagger", "ws128-hs50-mlps32-su0"), 156 | ("BiLSTM Tag+Char", "bilstm-tagger-withchar", "cs20-ws128-hs50-mlps32-su0"), 157 | ("TreeLSTM", "treenn", "ws128-hs128-su0"), 158 | ] 159 | print("\\begin{table}") 160 | print("\\begin{tabular}{c|rrr|rrr}") 161 | print(" & "+" & ".join([prettyname[x] for x in toolkits])+" \\\\ \hline") 162 | for name, task, setting in tasks: 163 | cols = [name] 164 | for i, toolkit in enumerate(toolkits): 165 | if (toolkit, task) in taskna: 166 | cols.append("\\multicolumn{1}{c}{-}") 167 | else: 168 | cols.append(getminstat(task, device, toolkit, setting, "startup")) 169 | print(" & ".join(cols)+" \\\\") 170 | print("\\end{tabular}") 171 | print("\\caption{Startup time for programs written in each toolkit.}") 172 | print("\\label{tab:startup}") 173 | print("\\end{table}") 174 | print("") 175 | 176 | # Code complexities 177 | def get_code_complexity(toolkit, task): 178 | chars = 0 179 | if toolkit == "dynet-seq": 180 | if not task == "rnnlm-batch": 181 | return "\\multicolumn{1}{c}{-}" 182 | toolkit = "dynet-cpp" 183 | task = "rnnlm-seq" 184 | if (toolkit, task) in taskna: 185 | return "\\multicolumn{1}{c}{-}" 186 | with open("%s/%s.%s" % (toolkit, task, "cc" if toolkit == "dynet-cpp" else "py"), "r") as f: 187 | for line in f: 188 | line = re.sub(commentregex, "", line.strip()) 189 | chars += len(line) 190 | return str(chars) 191 | 192 | tasks = [ 193 | ("RNNLM", "rnnlm-batch"), 194 | ("BiLSTM Tag", "bilstm-tagger"), 195 | ("BiLSTM Tag+Char", "bilstm-tagger-withchar"), 196 | ("TreeLSTM", "treenn"), 197 | ] 198 | print("\\begin{table}") 199 | print("\\begin{tabular}{c|rrrrrr}") 200 | print(" & "+" & ".join([prettyname[x] for x in toolkits])+" \\\\ \hline") 201 | for name, task in tasks: 202 | cols = [name] 203 | for i, toolkit in enumerate(toolkits): 204 | cols.append(get_code_complexity(toolkit, task)) 205 | print(" & ".join(cols)+" \\\\") 206 | print("\\end{tabular}") 207 | print("\\caption{Number of non-comment characters in the implementation of each toolkit.}") 208 | print("\\label{tab:complexity}") 209 | print("\\end{table}") 210 | print("") 211 | 212 | 213 | ###### Second section: effect of minibatching and net size 214 | 215 | 216 | ###### Third section: effect of sparse update 217 | tasks = [ 218 | ("RNNLM (MB=1) ", "rnnlm-batch", "ms01-es128-hs256-sp"), 219 | ("RNNLM (MB=16)", "rnnlm-batch", "ms16-es128-hs256-sp"), 220 | ("BiLSTM Tag", "bilstm-tagger", "ws128-hs50-mlps32-su"), 221 | ("BiLSTM Tag+Char", "bilstm-tagger-withchar", "cs20-ws128-hs50-mlps32-su"), 222 | ("TreeLSTM", "treenn", "ws128-hs128-su"), 223 | ] 224 | print("\\begin{table}") 225 | print("\\begin{tabular}{c|rr|rr|rr|rr}") 226 | print(" & \\multicolumn{4}{c|}{Speed} & \\multicolumn{4}{c}{Accuracy} \\\\") 227 | print(" & \\multicolumn{2}{c|}{Dense} & \\multicolumn{2}{c|}{Sparse} & \\multicolumn{2}{c|}{Dense} & \\multicolumn{2}{c}{Sparse} \\\\") 228 | print(" & "+" & ".join(["CPU & GPU"] * 4)+" \\\\ \\hline") 229 | for name, task, setting in tasks: 230 | cols = [name] 231 | for criterion in ("speed", "accuracy"): 232 | for ds in ("0", "1"): 233 | for device in ("cpu", "gpu"): 234 | cols.append(getmaxstat(task, 
device, "dynet-cpp", setting+ds, criterion)) 235 | print(" & ".join(cols)+" \\\\") 236 | print("\\end{tabular}") 237 | print("\\caption{Processing speed and accuracy after 10 minutes with dense or sparse updates.}") 238 | print("\\label{tab:sparseresults}") 239 | print("\\end{table}") 240 | print("") 241 | -------------------------------------------------------------------------------- /pytorch/bilstm-tagger-withchar.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import print_function 3 | import time 4 | start = time.time() 5 | 6 | from collections import Counter, defaultdict 7 | import random 8 | import sys 9 | import argparse 10 | import numpy as np 11 | import torch 12 | from torch import nn 13 | from torch import optim 14 | from torch.autograd import Variable 15 | from torch.nn import functional as F 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('CEMBED_SIZE', type=int, help='char embedding size') 20 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 21 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 22 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 23 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 24 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 25 | parser.add_argument('--CUDA', default=1, type=int) 26 | args = parser.parse_args() 27 | 28 | 29 | # format of files: each line is "word1|tag2 word2|tag2 ..." 30 | train_file = "data/tags/train.txt" 31 | dev_file = "data/tags/dev.txt" 32 | 33 | 34 | class Vocab: 35 | 36 | def __init__(self, w2i=None): 37 | if w2i is None: w2i = defaultdict(lambda: len(w2i)) 38 | self.w2i = dict(w2i) 39 | self.i2w = {i: w for w, i in w2i.items()} 40 | 41 | @classmethod 42 | def from_corpus(cls, corpus): 43 | w2i = defaultdict(lambda: len(w2i)) 44 | for sent in corpus: 45 | [w2i[word] for word in sent] 46 | return Vocab(w2i) 47 | 48 | def size(self): 49 | return len(self.w2i.keys()) 50 | 51 | 52 | def read(fname): 53 | """ 54 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 55 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
56 | """ 57 | with open(fname, "r") as fh: 58 | for line in fh: 59 | line = line.strip().split() 60 | sent = [tuple(x.rsplit("|", 1)) for x in line] 61 | yield sent 62 | 63 | 64 | train = list(read(train_file)) 65 | dev = list(read(dev_file)) 66 | words = [] 67 | tags = [] 68 | chars = set() 69 | wc = Counter() 70 | for sent in train: 71 | for w, p in sent: 72 | words.append(w) 73 | tags.append(p) 74 | wc[w] += 1 75 | chars.update(w) 76 | words.append("_UNK_") 77 | chars.add("_UNK_") 78 | chars.add("<*>") 79 | 80 | vw = Vocab.from_corpus([words]) 81 | vt = Vocab.from_corpus([tags]) 82 | vc = Vocab.from_corpus([chars]) 83 | UNK = vw.w2i["_UNK_"] 84 | CUNK = vc.w2i["_UNK_"] 85 | pad_char = vc.w2i["<*>"] 86 | 87 | nwords = vw.size() 88 | ntags = vt.size() 89 | nchars = vc.size() 90 | print ("nwords=%r, ntags=%r, nchars=%r" % (nwords, ntags, nchars)) 91 | 92 | 93 | def get_var(x, volatile=False): 94 | x = Variable(x, volatile=volatile) 95 | return x.cuda() if args.CUDA else x 96 | 97 | 98 | class Model(nn.Module): 99 | 100 | def __init__(self, args): 101 | super(Model, self).__init__() 102 | self.lookup_w = nn.Embedding(nwords, args.WEMBED_SIZE, padding_idx=UNK) 103 | self.lookup_c = nn.Embedding(nchars, args.CEMBED_SIZE, padding_idx=CUNK) 104 | self.lstm = nn.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, 1, bidirectional=True) 105 | self.lstm_c_f = nn.LSTM(args.CEMBED_SIZE, args.WEMBED_SIZE / 2, 1) 106 | self.lstm_c_r = nn.LSTM(args.CEMBED_SIZE, args.WEMBED_SIZE / 2, 1) 107 | self.proj1 = nn.Linear(2 * args.HIDDEN_SIZE, args.MLP_SIZE) 108 | self.proj2 = nn.Linear(args.MLP_SIZE, ntags) 109 | 110 | def forward(self, words, volatile=False): 111 | word_ids = [] 112 | needs_chars = [] 113 | char_ids = [] 114 | for i, w in enumerate(words): 115 | if wc[w] > 5: 116 | word_ids.append(vw.w2i[w]) 117 | else: 118 | word_ids.append(UNK) 119 | needs_chars.append(i) 120 | char_ids.append([pad_char] + [vc.w2i.get(c, CUNK) for c in w] + [pad_char]) 121 | embeddings = self.lookup_w(get_var(torch.LongTensor(word_ids), volatile=volatile)) 122 | if needs_chars: 123 | max_len = max(len(x) for x in char_ids) 124 | fwd_char_ids = [ids + [pad_char for _ in range(max_len - len(ids))] for ids in char_ids] 125 | rev_char_ids = [ids[::-1] + [pad_char for _ in range(max_len - len(ids))] for ids in char_ids] 126 | char_embeddings = torch.cat([ 127 | self.lstm_c_f(self.lookup_c(get_var(torch.LongTensor(fwd_char_ids).t())))[0], 128 | self.lstm_c_r(self.lookup_c(get_var(torch.LongTensor(rev_char_ids).t())))[0] 129 | ], 2) 130 | unk_embeddings = torch.cat([char_embeddings[len(words[j]) + 1, i].unsqueeze(0) for i, j in enumerate(needs_chars)], 0) 131 | embeddings = embeddings.index_add(0, get_var(torch.LongTensor(needs_chars)), unk_embeddings) 132 | return self.proj2(self.proj1(self.lstm(embeddings.unsqueeze(1))[0].squeeze(1))) 133 | 134 | 135 | model = Model(args) 136 | if args.CUDA: 137 | model.cuda() 138 | optimizer = optim.Adam(model.parameters()) 139 | 140 | 141 | print("startup time: %r" % (time.time() - start)) 142 | start = time.time() 143 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 144 | 145 | for ITER in range(100): 146 | random.shuffle(train) 147 | for s in train: 148 | i += 1 149 | if i % 500 == 0: 150 | print(this_loss / this_tagged, file=sys.stderr) 151 | all_tagged += this_tagged 152 | this_loss = this_tagged = 0 153 | all_time = time.time() - start 154 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 155 | dev_start = time.time() 156 | good_sent = bad_sent = good = bad = 0.0 157 
| for sent in dev: 158 | words, golds = zip(*sent) 159 | tags = [vt.i2w[i] for i in model(words, volatile=True).max(1)[1].cpu().data.view(-1)] 160 | if tags == list(golds): good_sent += 1 161 | else: bad_sent += 1 162 | for go, gu in zip(golds, tags): 163 | if go == gu: good += 1 164 | else: bad += 1 165 | dev_time += time.time() - dev_start 166 | train_time = time.time() - start - dev_time 167 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 168 | if all_time > args.TIMEOUT: 169 | sys.exit(0) 170 | # batch / loss 171 | words, golds = zip(*s) 172 | preds = model(words) 173 | loss = F.cross_entropy(preds, get_var(torch.LongTensor([vt.w2i[t] for t in golds]))) 174 | # log / optim 175 | this_loss += loss.data[0]*len(golds) 176 | this_tagged += len(golds) 177 | optimizer.zero_grad() 178 | loss.backward() 179 | optimizer.step() 180 | print("epoch %r finished" % ITER) 181 | -------------------------------------------------------------------------------- /pytorch/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import print_function 3 | import time 4 | start = time.time() 5 | 6 | from collections import Counter, defaultdict 7 | import random 8 | import sys 9 | import argparse 10 | import numpy as np 11 | import torch 12 | from torch import nn 13 | from torch import optim 14 | from torch.autograd import Variable 15 | from torch.nn import functional as F 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('WEMBED_SIZE', type=int) 20 | parser.add_argument('HIDDEN_SIZE', type=int) 21 | parser.add_argument('MLP_SIZE', type=int) 22 | parser.add_argument('SPARSE', type=int) 23 | parser.add_argument('TIMEOUT', type=int) 24 | parser.add_argument('--CUDA', default=-1, type=int) 25 | args = parser.parse_args() 26 | 27 | 28 | # format of files: each line is "word1|tag2 word2|tag2 ..." 29 | train_file = "data/tags/train.txt" 30 | dev_file = "data/tags/dev.txt" 31 | 32 | 33 | class Vocab: 34 | 35 | def __init__(self, w2i=None): 36 | if w2i is None: w2i = defaultdict(lambda: len(w2i)) 37 | self.w2i = dict(w2i) 38 | self.i2w = {i: w for w, i in w2i.items()} 39 | 40 | @classmethod 41 | def from_corpus(cls, corpus): 42 | w2i = defaultdict(lambda: len(w2i)) 43 | for sent in corpus: 44 | [w2i[word] for word in sent] 45 | return Vocab(w2i) 46 | 47 | def size(self): 48 | return len(self.w2i.keys()) 49 | 50 | 51 | def read(fname): 52 | """ 53 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 54 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
55 | """ 56 | with open(fname, "r") as fh: 57 | for line in fh: 58 | line = line.strip().split() 59 | sent = [tuple(x.rsplit("|", 1)) for x in line] 60 | yield sent 61 | 62 | 63 | train = list(read(train_file)) 64 | dev = list(read(dev_file)) 65 | words = [] 66 | tags = [] 67 | wc = Counter() 68 | for sent in train: 69 | for w, p in sent: 70 | words.append(w) 71 | tags.append(p) 72 | wc[w] += 1 73 | words.append("_UNK_") 74 | 75 | vw = Vocab.from_corpus([words]) 76 | vt = Vocab.from_corpus([tags]) 77 | UNK = vw.w2i["_UNK_"] 78 | nwords = vw.size() 79 | ntags = vt.size() 80 | print("nwords=%r, ntags=%r" % (nwords, ntags)) 81 | 82 | 83 | def get_var(x, volatile=False): 84 | x = Variable(x, volatile=volatile) 85 | return x.cuda() if args.CUDA else x 86 | 87 | 88 | class Model(nn.Module): 89 | 90 | def __init__(self, args): 91 | super(Model, self).__init__() 92 | self.lookup = nn.Embedding(nwords, args.WEMBED_SIZE) 93 | self.lstm = nn.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, 1, bidirectional=True) 94 | self.proj1 = nn.Linear(2 * args.HIDDEN_SIZE, args.MLP_SIZE) 95 | self.proj2 = nn.Linear(args.MLP_SIZE, ntags) 96 | 97 | def forward(self, x): 98 | return nn.functional.softmax(nn.functional.tanh(self.proj2(self.proj1(self.lstm(self.lookup(x).unsqueeze(1))[0].squeeze(1))))) 99 | 100 | 101 | model = Model(args) 102 | if args.CUDA: 103 | model.cuda() 104 | optimizer = optim.Adam(model.parameters()) 105 | 106 | 107 | print("startup time: %r" % (time.time() - start)) 108 | start = time.time() 109 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 110 | 111 | for ITER in range(100): 112 | random.shuffle(train) 113 | for s in train: 114 | i += 1 115 | if i % 500 == 0: 116 | print(this_loss / this_tagged, file=sys.stderr) 117 | all_tagged += this_tagged 118 | this_loss = this_tagged = 0 119 | all_time = time.time() - start 120 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 121 | dev_start = time.time() 122 | good_sent = bad_sent = good = bad = 0.0 123 | for sent in dev: 124 | words = [vw.w2i[w] if wc[w] > 5 else UNK for w, _ in sent] 125 | golds = [t for w, t in sent] 126 | tags = [vt.i2w[i] for i in model(get_var(torch.LongTensor(words), volatile=True)).max(1)[1].cpu().data.view(-1)] 127 | if tags == golds: good_sent += 1 128 | else: bad_sent += 1 129 | for go, gu in zip(golds, tags): 130 | if go == gu: good += 1 131 | else: bad += 1 132 | dev_time += time.time() - dev_start 133 | train_time = time.time() - start - dev_time 134 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 135 | if all_time > args.TIMEOUT: 136 | sys.exit(0) 137 | # batch / loss 138 | words = [vw.w2i[w] if wc[w] > 5 else UNK for w, _ in s] 139 | golds = [vt.w2i[t] for _, t in s] 140 | preds = model(get_var(torch.LongTensor(words))) 141 | loss = F.cross_entropy(preds, get_var(torch.LongTensor(golds))) 142 | # log / optim 143 | this_loss += loss.data[0]*len(golds) 144 | this_tagged += len(golds) 145 | optimizer.zero_grad() 146 | loss.backward() 147 | optimizer.step() 148 | print("epoch %r finished" % ITER) 149 | -------------------------------------------------------------------------------- /pytorch/rnnlm.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | import sys 4 | import random 5 | import argparse 6 | from itertools import count 7 | from collections import defaultdict 8 | 9 | import 
numpy as np 10 | import torch 11 | from torch import nn 12 | from torch import optim 13 | from torch.autograd import Variable 14 | 15 | start = time.time() 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 19 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | parser.add_argument('--CUDA', type=int, default=-1, help='use CUDA') 24 | args = parser.parse_args() 25 | 26 | train_file = 'data/text/train.txt' 27 | test_file = 'data/text/dev.txt' 28 | 29 | def read(fname): 30 | """ 31 | Read a file where each line is of the form "word1 word2 ..." 32 | Yields lists of the form [word1, word2, ...] 33 | """ 34 | with open(fname, "r") as fh: 35 | for line in fh: 36 | sent = [w2i[x] for x in line.strip().split()] 37 | sent.append(w2i[""]) 38 | yield torch.LongTensor(sent) 39 | 40 | w2i = defaultdict(count(0).next) 41 | mask = w2i[''] 42 | assert mask == 0 43 | train = list(read(train_file)) 44 | vocab_size = len(w2i) 45 | test = list(read(test_file)) 46 | S = w2i[''] 47 | 48 | def get_batch(sequences, volatile=False): 49 | lengths = torch.LongTensor([len(s) for s in sequences]) 50 | batch = torch.LongTensor(lengths.max(), len(sequences)).fill_(mask) 51 | for i, s in enumerate(sequences): 52 | batch[:len(s), i] = s 53 | if args.CUDA: 54 | batch = batch.cuda() 55 | return Variable(batch, volatile=volatile), lengths 56 | 57 | class RNNLM(nn.Module): 58 | def __init__(self): 59 | super(RNNLM, self).__init__() 60 | self.embeddings = nn.Embedding(vocab_size, args.EMBED_SIZE) 61 | self.rnn = nn.LSTM(args.EMBED_SIZE, args.HIDDEN_SIZE) 62 | self.proj = nn.Linear(args.HIDDEN_SIZE, vocab_size) 63 | def forward(self, sequences): 64 | rnn_output, _ = self.rnn(self.embeddings(sequences)) 65 | return self.proj(rnn_output.view(-1, args.HIDDEN_SIZE)) 66 | 67 | # build the model 68 | rnnlm = RNNLM() 69 | optimizer = optim.Adam(rnnlm.parameters(), lr=0.001) 70 | weight = torch.FloatTensor(vocab_size).fill_(1) 71 | weight[mask] = 0 72 | loss_fn = nn.CrossEntropyLoss(weight, size_average=False) 73 | if args.CUDA: 74 | rnnlm.cuda() 75 | loss_fn.cuda() 76 | 77 | # Sort training sentences in descending order and count minibatches 78 | train.sort(key=lambda x: -len(x)) 79 | test.sort(key=lambda x: -len(x)) 80 | train_order = range(0, len(train), args.MB_SIZE) # [x*args.MB_SIZE for x in range(int((len(train)-1)/args.MB_SIZE + 1))] 81 | test_order = range(0, len(test), args.MB_SIZE) # [x*args.MB_SIZE for x in range(int((len(test)-1)/args.MB_SIZE + 1))] 82 | 83 | # Perform training 84 | print("startup time: %r" % (time.time() - start)) 85 | start = time.time() 86 | i = total_time = dev_time = total_tagged = current_words = current_loss = 0 87 | 88 | for ITER in range(100): 89 | random.shuffle(train_order) 90 | for sid in train_order: 91 | i += 1 92 | # train 93 | batch, lengths = get_batch(train[sid:sid + args.MB_SIZE]) 94 | scores = rnnlm(batch[:-1]) 95 | loss = loss_fn(scores, batch[1:].view(-1)) 96 | # optimization 97 | optimizer.zero_grad() 98 | loss.backward() 99 | optimizer.step() 100 | # log loss 101 | current_words += lengths.sum() - lengths.size(0) # ignore 102 | current_loss += loss.data[0] 103 | if i % int(500 / args.MB_SIZE) == 0: 104 | print(current_loss / current_words) 105 | total_tagged += current_words 106 | 
current_loss = current_words = 0 107 | total_time = time.time() - start 108 | # log perplexity 109 | if i % int(10000 / args.MB_SIZE) == 0 or total_time > args.TIMEOUT: 110 | dev_start = time.time() 111 | dev_loss = dev_words = 0 112 | for j in test_order: 113 | batch, lengths = get_batch(test[j:j + args.MB_SIZE], volatile=True) 114 | scores = rnnlm(batch[:-1]) 115 | dev_loss += loss_fn(scores, batch[1:].view(-1)).data[0] 116 | dev_words += lengths.sum() - lengths.size(0) # ignore 117 | dev_time += time.time() - dev_start 118 | train_time = time.time() - start - dev_time 119 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % ( 120 | dev_loss / dev_words, np.exp(dev_loss / dev_words), dev_words, train_time, total_tagged / train_time)) 121 | if total_time > args.TIMEOUT: 122 | sys.exit(0) 123 | 124 | print("epoch %r finished" % ITER) 125 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHON_PATH=${ANACONDA_PATH:-$HOME/usr/local/anaconda3/envs/benchmark2} 4 | export CUDA_PATH=/usr/local/cuda 5 | export DYNET_PATH=${DYNET_PATH:-$HOME/work/dynet} 6 | export LD_LIBRARY_PATH=$DYNET_PATH/build/dynet:$PYTHON_PATH/lib:$CUDA_PATH/lib64 7 | export LIBRARY_PATH=$DYNET_PATH/build/dynet:$PYTHON_PATH/lib:$CUDA_PATH/lib64 8 | export PYTHONPATH=$DYNET_PATH/build/python 9 | PYTHON=python 10 | 11 | DYFLAGS=${DYFLAGS:-"--dynet-mem 4096"} 12 | GPUSUF= 13 | if [[ $# == 1 ]]; then 14 | export CUDA_VISIBLE_DEVICES=$1 15 | export THEANO_FLAGS="device=gpu0,floatX=float32" 16 | DYFLAGS="$DYFLAGS --dynet-gpus 1" 17 | GPUSUF="-gpu" 18 | CGPU=0 19 | else 20 | export THEANO_FLAGS="device=cpu,floatX=float32" 21 | CGPU=-1 22 | fi 23 | 24 | TIMEOUT=${TIMEOUT:-600} 25 | LONGTIMEOUT=${LONGTIMEOUT:-600} 26 | 27 | runcmd() { 28 | LFILE=log/$2$GPUSUF/$4.log 29 | if [[ !
-e $LFILE ]]; then 30 | MYTIMEOUT=$TIMEOUT 31 | if [[ $1 == "dynet-cpp" ]]; then 32 | mycmd="$1/$2$GPUSUF $DYFLAGS" 33 | if [[ $4 =~ dynet-cpp-bs01-ws128-hs256-.* ]] || [[ $4 =~ dynet-cpp-bs16-ws128-hs256-.* ]] || [[ $2 =~ bilstm.* ]] || [[ $2 =~ treenn ]]; then 34 | MYTIMEOUT=$LONGTIMEOUT 35 | fi 36 | elif [[ $1 == "dynet-py" ]]; then 37 | mycmd="$PYTHON -u $1/$2.py $DYFLAGS" 38 | elif [[ $1 == "chainer" ]]; then 39 | mycmd="$PYTHON -u $1/$2.py --chainer_gpu $CGPU" 40 | elif [[ $1 == "tensorflow" ]]; then 41 | mycmd="$PYTHON -u $1/$2.py --gpu" 42 | else 43 | mycmd="$PYTHON -u $1/$2.py" 44 | fi 45 | mkdir -p log/$2$GPUSUF 46 | echo "$mycmd $3 $MYTIMEOUT &> $LFILE" 47 | EXTERNALTIMEOUT=$((MYTIMEOUT+60)) 48 | timeout $EXTERNALTIMEOUT $mycmd $3 $MYTIMEOUT &> $LFILE 49 | fi 50 | } 51 | 52 | NUM_TRIALS=${NUM_TRIALS:-3} 53 | 54 | for trial in `seq $NUM_TRIALS`; do 55 | 56 | if [[ -z "$TASK" || "$TASK" == "rnnlm-batch" ]]; then 57 | # Run rnnlm-batch 58 | for embsize in 128; do 59 | hidsize=$(($embsize*2)) 60 | for mbsize in 64 16 04 01; do 61 | if [[ -z "$MBSIZE" || "$MBSIZE" == "$mbsize" ]]; then 62 | for f in dynet-cpp dynet-py chainer theano tensorflow; do 63 | if [[ $f == "dynet-cpp" ]]; then 64 | runcmd $f rnnlm-seq "$mbsize $embsize $hidsize 0" $f-ms$mbsize-es$embsize-hs$hidsize-sp0-t$trial 65 | fi 66 | runcmd $f rnnlm-batch "$mbsize $embsize $hidsize 0" $f-ms$mbsize-es$embsize-hs$hidsize-sp0-t$trial 67 | done 68 | fi 69 | done 70 | done 71 | fi 72 | 73 | if [[ -z "$TASK" || "$TASK" == "sparse-rnnlm-batch" ]]; then 74 | # run sparse rnnlm-batch on a subset 75 | for embsize in 128; do 76 | hidsize=$(($embsize*2)) 77 | for mbsize in 16 01; do 78 | if [[ -z "$MBSIZE" || "$MBSIZE" == "$mbsize" ]]; then 79 | for f in dynet-cpp dynet-py; do 80 | runcmd $f rnnlm-batch "$mbsize $embsize $hidsize 1" $f-ms$mbsize-es$embsize-hs$hidsize-sp1-t$trial 81 | done 82 | fi 83 | done 84 | done 85 | fi 86 | 87 | if [[ -z "$TASK" || "$TASK" == "bilstm-tagger" ]]; then 88 | # Run bilstm-tagger 89 | wembsize=128 90 | hidsize=50 91 | mlpsize=32 92 | for f in dynet-cpp dynet-py chainer theano tensorflow; do 93 | runcmd $f bilstm-tagger "$wembsize $hidsize $mlpsize 0" $f-ws$wembsize-hs$hidsize-mlps$mlpsize-su0-t$trial 94 | if [[ $f == dynet* ]]; then 95 | runcmd $f bilstm-tagger "$wembsize $hidsize $mlpsize 1" $f-ws$wembsize-hs$hidsize-mlps$mlpsize-su1-t$trial 96 | fi 97 | done 98 | fi 99 | 100 | if [[ -z "$TASK" || "$TASK" == "bilstm-tagger-withchar" ]]; then 101 | # Run bilstm-tagger-withchar 102 | cembsize=20 103 | wembsize=128 104 | hidsize=50 105 | mlpsize=32 106 | for f in dynet-cpp dynet-py theano chainer; do 107 | runcmd $f bilstm-tagger-withchar "$cembsize $wembsize $hidsize $mlpsize 0" $f-cs$cembsize-ws$wembsize-hs$hidsize-mlps$mlpsize-su0-t$trial 108 | if [[ $f == dynet* ]]; then 109 | runcmd $f bilstm-tagger-withchar "$cembsize $wembsize $hidsize $mlpsize 1" $f-cs$cembsize-ws$wembsize-hs$hidsize-mlps$mlpsize-su1-t$trial 110 | fi 111 | done 112 | fi 113 | 114 | if [[ -z "$TASK" || "$TASK" == "treenn" ]]; then 115 | # Run treenn 116 | wembsize=128 117 | hidsize=128 118 | for f in dynet-cpp dynet-py chainer; do 119 | runcmd $f treenn "$wembsize $hidsize 0" $f-ws$wembsize-hs$hidsize-su0-t$trial 120 | if [[ $f == dynet* ]]; then 121 | runcmd $f treenn "$wembsize $hidsize 1" $f-ws$wembsize-hs$hidsize-su1-t$trial 122 | fi 123 | done 124 | fi 125 | 126 | done 127 | -------------------------------------------------------------------------------- /tensorflow/bilstm-tagger.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import math 9 | import sys 10 | import numpy as np 11 | import tensorflow as tf 12 | import argparse 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--gpu', dest='gpu', action='store_true') 16 | parser.set_defaults(gpu=False) 17 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 18 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 19 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 20 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') # sparse updates by default in tensorflow 21 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 22 | args = parser.parse_args() 23 | 24 | NUM_LAYERS = 1 25 | 26 | # format of files: each line is "word1/tag2 word2/tag2 ..." 27 | train_file='data/tags/train.txt' 28 | test_file='data/tags/dev.txt' 29 | 30 | class Vocab: 31 | def __init__(self, w2i=None): 32 | if w2i is None: w2i = defaultdict(count(0).next) 33 | self.w2i = dict(w2i) 34 | self.i2w = {i:w for w,i in w2i.iteritems()} 35 | @classmethod 36 | def from_corpus(cls, corpus): 37 | w2i = defaultdict(count(0).next) 38 | for sent in corpus: 39 | [w2i[word] for word in sent] 40 | return Vocab(w2i) 41 | 42 | def size(self): return len(self.w2i.keys()) 43 | 44 | def read(fname): 45 | """ 46 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 47 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 48 | """ 49 | with file(fname) as fh: 50 | for line in fh: 51 | line = line.strip().split() 52 | sent = [tuple(x.rsplit("|",1)) for x in line] 53 | yield sent 54 | 55 | train=list(read(train_file)) 56 | test=list(read(test_file)) 57 | words=[] 58 | tags=[] 59 | wc=Counter() 60 | for sent in train: 61 | for w,p in sent: 62 | words.append(w) 63 | tags.append(p) 64 | wc[w]+=1 65 | words.append("_UNK_") 66 | 67 | vw = Vocab.from_corpus([words]) 68 | vt = Vocab.from_corpus([tags]) 69 | UNK = vw.w2i["_UNK_"] 70 | 71 | nwords = vw.size() 72 | ntags = vt.size() 73 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 74 | 75 | def get_tags(log_probs): 76 | sent_tags = [] 77 | for word_probs in log_probs: 78 | tag = np.argmax(word_probs, axis=0) 79 | sent_tags.append(tag) 80 | return sent_tags 81 | 82 | if args.gpu: 83 | cpu_or_gpu = '/gpu:0' 84 | else: 85 | cpu_or_gpu = '/cpu:0' 86 | 87 | with tf.device(cpu_or_gpu): 88 | 89 | # Lookup parameters for word embeddings 90 | WORDS_LOOKUP = tf.Variable(tf.random_uniform([nwords, 1, args.WEMBED_SIZE], -1.0, 1.0)) 91 | 92 | mlp_hidden = tf.Variable(tf.random_uniform([args.HIDDEN_SIZE*2, args.MLP_SIZE], -1.0, 1.0)) 93 | mlp_out = tf.Variable(tf.random_uniform([args.MLP_SIZE, ntags], -1.0, 1.0)) 94 | 95 | # input sentence placeholder 96 | words_in = tf.placeholder(tf.int32, [None], name="input_sentence") 97 | golds = tf.placeholder(tf.int32, [None], name="golds") 98 | sent_len = tf.placeholder(tf.int32, shape=(1,), name="sent_len") 99 | 100 | wembs = tf.squeeze(tf.nn.embedding_lookup(WORDS_LOOKUP, words_in), axis=1) 101 | wembs = tf.expand_dims(wembs, axis=0) 102 | wembs.set_shape([1, words_in.get_shape()[0], args.WEMBED_SIZE]) 103 | 104 | # Word-level LSTM (configurable number of layers, input is unspecified, 105 | # but will be equal to the embedding dim, output=128) 106 | 
107 | cell = tf.nn.rnn_cell.BasicLSTMCell(args.HIDDEN_SIZE, forget_bias=0.0, state_is_tuple=True) 108 | cell = tf.nn.rnn_cell.MultiRNNCell([cell] * NUM_LAYERS, state_is_tuple=True) 109 | 110 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell, 111 | cell_bw=cell, 112 | dtype=tf.float32, 113 | sequence_length=sent_len, 114 | inputs=wembs) 115 | 116 | output_fw, output_bw = outputs 117 | output_concat = tf.squeeze(tf.concat(2, [output_fw, output_bw]), axis=0) # (input_length, 2 * HIDDEN_SIZE) 118 | output_concat.set_shape([None, 2*args.HIDDEN_SIZE]) 119 | 120 | # Pass to MLP 121 | mlp_activation = tf.tanh(tf.matmul(output_concat, mlp_hidden)) 122 | mlp_output = tf.matmul(mlp_activation, mlp_out) 123 | 124 | ## calculate the loss 125 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(mlp_output, golds) 126 | loss = tf.reduce_sum(losses) 127 | 128 | optimizer = tf.train.AdamOptimizer().minimize(loss) 129 | print('Graph created.' , file=sys.stderr) 130 | 131 | sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True)) 132 | tf.global_variables_initializer().run() 133 | print('Session initialized.' , file=sys.stderr) 134 | train_losses = [] 135 | print ("startup time: %r" % (time.time() - start)) 136 | start_train = time.time() 137 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 138 | 139 | for ITER in range(100): 140 | random.shuffle(train) 141 | start = time.time() 142 | for s in train: 143 | i += 1 144 | if i % 500 == 0: # print status 145 | print('Updates so far: %d Loss: %f wps: %f' % (i - 1, this_loss / this_tagged, this_tagged/(time.time() - start))) 146 | all_tagged += this_tagged 147 | this_loss = this_tagged = 0 148 | all_time = time.time() - start_train 149 | start = time.time() 150 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 151 | dev_start = time.time() 152 | good_sent = bad_sent = good = bad = 0.0 153 | for sent in test: 154 | x_in = [vw.w2i[w] if wc[w]>5 else UNK for w,_ in sent] 155 | golds_in = [vt.w2i[t] for _,t in sent] 156 | # log_probs = sess.run(mlp_output, feed_dict={words_in: x_in, golds: golds_in, sent_len: [len(sent)]}) 157 | log_probs = mlp_output.eval(feed_dict={words_in: x_in, golds: golds_in, sent_len: [len(sent)]}, session=sess) 158 | tags = get_tags(log_probs) 159 | if tags == golds_in: good_sent += 1 160 | else: bad_sent += 1 161 | for go,gu in zip(golds_in,tags): 162 | if go == gu: good += 1 163 | else: bad += 1 164 | dev_time += time.time() - dev_start 165 | train_time = time.time() - start_train - dev_time 166 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 167 | start = start + (time.time() - dev_start) 168 | if all_time > args.TIMEOUT: 169 | sys.exit(0) 170 | # train on sent 171 | x_in = [vw.w2i[w] if wc[w]>5 else UNK for w,_ in s] 172 | golds_in = [vt.w2i[t] for _,t in s] 173 | train_loss, _ = sess.run([loss, optimizer], feed_dict={words_in: x_in, golds: golds_in, sent_len: [len(s)]}) 174 | this_loss += train_loss 175 | this_tagged += len(golds_in) 176 | print("epoch %r finished" % ITER) 177 | -------------------------------------------------------------------------------- /tensorflow/bow.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import defaultdict 6 | from operator import itemgetter 7 | import random 8 | import tensorflow as tf 9 
| import numpy as np 10 | import sys 11 | 12 | # Functions to read in the corpus 13 | w2i = defaultdict(lambda: len(w2i)) 14 | t2i = defaultdict(lambda: len(t2i)) 15 | UNK = w2i[""] 16 | 17 | def read_dataset(filename): 18 | with open(filename, "r") as f: 19 | for line in f: 20 | tag, words = line.lower().strip().split(" ||| ") 21 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 22 | 23 | # Read in the data 24 | train = list(read_dataset("data/classes/train.txt")) 25 | w2i = defaultdict(lambda: UNK, w2i) 26 | dev = list(read_dataset("data/classes/test.txt")) 27 | nwords = len(w2i) 28 | ntags = len(t2i) 29 | EPOCHS = 100 30 | GPU = False 31 | 32 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 33 | # Determine max length across train and dev set 34 | max_length = 0 35 | for sent in train: 36 | if len(sent[0]) > max_length: 37 | max_length = len(sent[0]) 38 | 39 | for sent in dev: 40 | if len(sent[0]) > max_length: 41 | max_length = len(sent[0]) 42 | 43 | def pad(seq, element, length): 44 | r = seq + [element] * (length - len(seq)) 45 | assert len(r) == length 46 | return r 47 | 48 | def main(_): 49 | if GPU: 50 | cpu_or_gpu = '/gpu:0' 51 | else: 52 | cpu_or_gpu = '/cpu:0' 53 | 54 | with tf.device(cpu_or_gpu): 55 | W_sm = tf.Variable(tf.random_uniform([nwords, ntags], -1.0, 1.0)) # Word weights 56 | b_sm = tf.Variable(tf.random_uniform([ntags], -1.0, 1.0)) # Softmax bias 57 | words_in = tf.placeholder(tf.int32, shape=[max_length]) 58 | tags_in = tf.placeholder(tf.int32, shape=[1]) 59 | masks_in = tf.placeholder(tf.float32, shape=[max_length]) 60 | 61 | ##Calculate scores 62 | embs = [tf.expand_dims(tf.nn.embedding_lookup(W_sm, x), axis=1) for x in tf.unstack(words_in)] 63 | embs_concat = tf.concat(1, embs) # embedding matrix 64 | score = tf.mul(embs_concat, masks_in) # truncate padded tokens' embeddings 65 | score = tf.reduce_sum(score, axis=1) 66 | score_out = tf.add(score, b_sm) 67 | 68 | # Add dims to match cross entropy func definition 69 | score_to_loss = tf.expand_dims(score_out, axis=0) 70 | 71 | # Calculate loss 72 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(score_to_loss, tags_in) 73 | # losses = tf.nn.seq2seq.sequence_loss_by_example(tf.unstack(score_to_loss), tf.unstack(tags_in), loss_weights) 74 | loss = tf.reduce_mean(losses) 75 | 76 | optimizer = tf.train.AdamOptimizer().minimize(loss) 77 | 78 | print >>sys.stderr, 'Graph created.' 79 | 80 | sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True)) 81 | tf.global_variables_initializer().run() 82 | print >>sys.stderr, 'Session initialized.' 
83 | 84 | print ("startup time: %r" % (time.time() - start)) 85 | for ITER in range(EPOCHS): 86 | # Perform training 87 | random.shuffle(train) 88 | train_loss = 0.0 89 | start = time.time() 90 | for i, (words, tag) in enumerate(train): 91 | padded_words = pad(words, UNK, max_length) 92 | mask = [1.0] * len(words) + [0.0] * (max_length - len(words)) 93 | _, cur_loss, _ = sess.run([score_out, loss, optimizer], feed_dict={words_in: padded_words, tags_in: [tag], masks_in: mask}) 94 | train_loss += cur_loss 95 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 96 | 97 | # Perform testing 98 | test_correct = 0.0 99 | for words, tag in dev: 100 | padded_words = pad(words, UNK, max_length) 101 | mask = [1.0] * len(words) + [0.0] * (max_length - len(words)) 102 | prob_scores = sess.run(score_out, feed_dict={words_in: padded_words, tags_in: [tag], masks_in: mask}) 103 | predict = np.argmax(prob_scores) 104 | if predict == tag: 105 | test_correct += 1 106 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 107 | 108 | if __name__ == "__main__": 109 | tf.app.run() 110 | -------------------------------------------------------------------------------- /tensorflow/rnnlm-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start_ = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import math 9 | import sys 10 | import argparse 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--gpu', dest='gpu', action='store_true') 17 | parser.set_defaults(gpu=False) 18 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 19 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') # sparse updates by default in tensorflow 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | NUM_LAYERS = 1 26 | 27 | # format of files: each line is "word1/tag2 word2/tag2 ..." 28 | train_file='data/text/train.txt' 29 | test_file='data/text/dev.txt' 30 | w2i = defaultdict(count(0).next) 31 | eos = '' 32 | 33 | def read(fname): 34 | """ 35 | Read a file where each line is of the form "word1 word2 ..." 36 | Yields lists of the form [word1, word2, ...] 
37 | """ 38 | with file(fname) as fh: 39 | for line in fh: 40 | sent = [w2i[eos]] 41 | sent += [w2i[x] for x in line.strip().split()] 42 | sent.append(w2i[eos]) 43 | yield sent 44 | 45 | train = list(read(train_file)) 46 | nwords = len(w2i) 47 | test = list(read(test_file)) 48 | S = w2i[eos] 49 | assert(nwords == len(w2i)) 50 | 51 | train.sort(key=lambda x: len(x), reverse=True) 52 | test.sort(key=lambda x: len(x), reverse=True) 53 | 54 | if args.MB_SIZE != 0: 55 | train_order = [x*args.MB_SIZE for x in range((len(train)-1)/args.MB_SIZE + 1)] 56 | test_order = [x*args.MB_SIZE for x in range((len(test)-1)/args.MB_SIZE + 1)] 57 | else: 58 | train_order = range(len(train)) 59 | test_order = range(len(test)) 60 | 61 | def pad(seq, element, length): 62 | assert len(seq) <= length 63 | r = seq + [element] * (length - len(seq)) 64 | assert len(r) == length 65 | return r 66 | 67 | if args.gpu: 68 | cpu_or_gpu = '/gpu:0' 69 | else: 70 | cpu_or_gpu = '/cpu:0' 71 | 72 | with tf.device(cpu_or_gpu): 73 | # Lookup parameters for word embeddings 74 | WORDS_LOOKUP = tf.Variable(tf.random_uniform([nwords, 1, args.EMBED_SIZE], -1.0, 1.0)) 75 | 76 | # Word-level LSTM (configurable number of layers, input is unspecified, 77 | # but will be equal to the embedding dim, output=128) 78 | cell = tf.nn.rnn_cell.BasicLSTMCell(args.HIDDEN_SIZE, forget_bias=0.0, state_is_tuple=True) 79 | cell = tf.nn.rnn_cell.MultiRNNCell([cell] * NUM_LAYERS, state_is_tuple=True) 80 | 81 | # input sentence placeholder 82 | x_input = tf.placeholder(tf.int32, [None, None], name="x_input") 83 | x_lens = tf.placeholder(tf.int32, [None], name='x_lens') 84 | 85 | x_embs = tf.squeeze(tf.nn.embedding_lookup(WORDS_LOOKUP, x_input), axis=2) 86 | # Hack to fix shape so dynamic_rnn will accept this as input 87 | x_embs.set_shape([None, None, args.EMBED_SIZE]) 88 | 89 | # Actually run the RNN 90 | outputs, _ = tf.nn.dynamic_rnn(cell, x_embs, sequence_length=x_lens, dtype=tf.float32) 91 | 92 | # Affine transform 93 | output = tf.reshape(tf.concat(1, outputs), [-1, args.HIDDEN_SIZE]) 94 | W_sm = tf.Variable(tf.random_uniform([args.HIDDEN_SIZE, nwords])) 95 | b_sm = tf.Variable(tf.random_uniform([nwords])) 96 | logits = tf.matmul(tf.squeeze(output), W_sm) + b_sm 97 | 98 | # Compute categorical loss 99 | # Don't predict the first input (), and don't worry about the last output (after we've input ) 100 | # losses = tf.nn.sparse_softmax_cross_entropy_with_logits(outputs[:-1], x_input[1:]) 101 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits[:-1], tf.reshape(x_input, [-1])[1:]) 102 | loss = tf.reduce_mean(losses) 103 | optimizer = tf.train.AdamOptimizer().minimize(loss) 104 | 105 | print('Graph created.', file=sys.stderr) 106 | 107 | sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True)) 108 | tf.global_variables_initializer().run() 109 | print('Session initialized.', file=sys.stderr) 110 | 111 | train_losses = [] 112 | print('startup time: %r' % (time.time() - start_)) 113 | i = all_time = dev_time = all_tagged = train_words = 0 114 | start_train = time.time() 115 | for ITER in range(10): 116 | random.shuffle(train_order) 117 | start_ = time.time() 118 | for i, sid in enumerate(train_order, start=1): 119 | if i % int(500 / args.MB_SIZE) == 0: 120 | print('Updates so far: %d Loss: %f wps: %f' % (i - 1, sum(train_losses) / train_words, train_words/(time.time() - start_))) 121 | all_tagged += train_words 122 | train_losses = [] 123 | train_words = 0 124 | all_time = time.time() - start_train 125 | start_ = 
time.time() 126 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 127 | dev_start = time.time() 128 | test_losses = [] 129 | test_words = 0 130 | all_time += time.time() - start_train 131 | print('Testing on dev set...') 132 | for tid in test_order: 133 | t_examples = test[tid:tid+args.MB_SIZE] 134 | x_lens_in = [len(example) for example in t_examples] 135 | x_in = [pad(example, S, max(x_lens_in)) for example in t_examples] 136 | test_loss = sess.run(loss, feed_dict={x_input: x_in, x_lens: x_lens_in}) 137 | tot_words = sum(x_lens_in) - len(x_lens_in) # Subtract out from the denominator - to be in line with other toolkits 138 | test_losses.append(test_loss * tot_words) 139 | test_words += tot_words 140 | nll = sum(test_losses) / test_words 141 | dev_time += time.time() - dev_start 142 | train_time = time.time() - start_train - dev_time 143 | print ('nll=%.4f, ppl=%.4f, time=%.4f, words_per_sec=%.4f' % (nll, math.exp(nll), train_time, all_tagged/train_time), file=sys.stderr) 144 | start_ = start_ + (time.time() - dev_start) 145 | if all_time > args.TIMEOUT: 146 | sys.exit(0) 147 | # train on sent 148 | examples = train[sid : sid+args.MB_SIZE] 149 | x_lens_in = [len(example) for example in examples] 150 | if x_lens_in.count(x_lens_in[0])!=len(x_lens_in): x_in = [pad(example, S, max(x_lens_in)) for example in examples] 151 | else: x_in = examples 152 | train_loss, _ = sess.run([loss, optimizer], feed_dict={x_input: x_in, x_lens: x_lens_in}) 153 | tot_words = sum(x_lens_in) - len(x_lens_in) # Subtract out from the denominator 154 | train_losses.append(train_loss * tot_words) 155 | train_words += tot_words 156 | -------------------------------------------------------------------------------- /theano/README.md: -------------------------------------------------------------------------------- 1 | * rnnlm-batch.py: LSTM RNN language model with (or without) minibatching. Set `batch_size=1` to disable minibatching. 2 | -------------------------------------------------------------------------------- /theano/bilstm-tagger-withchar.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | start = time.time() 4 | 5 | import random 6 | 7 | import theano.tensor as T 8 | import theano 9 | from theano.ifelse import ifelse 10 | import numpy as np 11 | import sys 12 | import argparse 13 | from itertools import chain 14 | 15 | from nn.layers.recurrent import LSTM, BiLSTM 16 | from nn.layers.embeddings import Embedding 17 | from nn.activations import softmax 18 | from nn.optimizers import Adam 19 | from nn.initializations import uniform 20 | 21 | from collections import Counter, defaultdict 22 | from itertools import count 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--dynet_mem", default=512, type=int) 26 | parser.add_argument('CEMBED_SIZE', type=int, help='embedding size') 27 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 28 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 29 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 30 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 31 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 32 | args = parser.parse_args() 33 | 34 | # format of files: each line is "word1|tag2 word2|tag2 ..."
35 | train_file="data/tags/train.txt" 36 | dev_file="data/tags/dev.txt" 37 | 38 | 39 | class Vocab: 40 | def __init__(self, w2i=None): 41 | if w2i is None: w2i = defaultdict(count(0).next) 42 | self.w2i = dict(w2i) 43 | self.i2w = {i:w for w,i in w2i.iteritems()} 44 | 45 | @classmethod 46 | def from_corpus(cls, corpus): 47 | w2i = defaultdict(count(0).next) 48 | for sent in corpus: 49 | [w2i[word] for word in sent] 50 | return Vocab(w2i) 51 | 52 | def size(self): 53 | return len(self.w2i.keys()) 54 | 55 | 56 | def read(fname): 57 | """ 58 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 59 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 60 | """ 61 | with file(fname) as fh: 62 | for line in fh: 63 | line = line.strip().split() 64 | sent = [tuple(x.rsplit("|",1)) for x in line] 65 | yield sent 66 | 67 | 68 | train=list(read(train_file)) 69 | dev=list(read(dev_file)) 70 | words=[] 71 | tags=[] 72 | chars=set() 73 | wc=Counter() 74 | for sent in train: 75 | for w,p in sent: 76 | words.append(w) 77 | tags.append(p) 78 | chars.update(w) 79 | wc[w]+=1 80 | words.append("_UNK_") 81 | chars.add("<*>") 82 | 83 | vw = Vocab.from_corpus([words]) 84 | vt = Vocab.from_corpus([tags]) 85 | vc = Vocab.from_corpus([['_CHAR_MASK_'] + list(chars)]) 86 | UNK = vw.w2i["_UNK_"] 87 | 88 | char_mask = vc.w2i['_CHAR_MASK_'] 89 | # mask of chars must be zero 90 | assert char_mask == 0 91 | 92 | nwords = vw.size() 93 | ntags = vt.size() 94 | nchars = vc.size() 95 | print("nwords=%r, ntags=%r, nchars=%r" % (nwords, ntags, nchars)) 96 | 97 | 98 | def word2id(w): 99 | if wc[w] > 5: 100 | w_index = vw.w2i[w] 101 | return w_index 102 | else: 103 | return UNK 104 | 105 | 106 | def build_tag_graph(): 107 | print('build graph..', file=sys.stderr) 108 | 109 | # (sentence_length) 110 | # word indices for a sentence 111 | x = T.ivector(name='sentence') 112 | 113 | # (sentence_length, max_char_num_per_word) 114 | # character indices for each word in a sentence 115 | x_chars = T.imatrix(name='sent_word_chars') 116 | 117 | # (sentence_length) 118 | # target tag 119 | y = T.ivector(name='tag') 120 | 121 | # Lookup parameters for word embeddings 122 | word_embeddings = Embedding(nwords, args.WEMBED_SIZE, name='word_embeddings') 123 | 124 | # Lookup parameters for character embeddings 125 | char_embeddings = Embedding(nchars, args.CEMBED_SIZE, name='char_embeddings') 126 | 127 | # lstm for encoding word characters 128 | char_lstm = BiLSTM(args.CEMBED_SIZE, int(args.WEMBED_SIZE / 2), name='char_lstm') 129 | 130 | # bi-lstm 131 | lstm = BiLSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, return_sequences=True, name='lstm') 132 | 133 | # MLP 134 | W_mlp_hidden = uniform((args.HIDDEN_SIZE * 2, args.MLP_SIZE), name='W_mlp_hidden') 135 | W_mlp = uniform((args.MLP_SIZE, ntags), name='W_mlp') 136 | 137 | # def get_word_embed_from_chars(word_chars): 138 | # # (max_char_num_per_word, char_embed_dim) 139 | # # (max_char_num_per_word) 140 | # word_char_embeds, word_char_masks = char_embeddings(word_chars, mask_zero=True) 141 | # word_embed = char_lstm(T.unbroadcast(word_char_embeds[None, :, :], 0), mask=T.unbroadcast(word_char_masks[None, :], 0))[0] 142 | # 143 | # return word_embed 144 | 145 | # def word_embed_look_up_step(word_id, word_chars): 146 | # word_embed = ifelse(T.eq(word_id, UNK), 147 | # get_word_embed_from_chars(word_chars), # if it's a unk 148 | # word_embeddings(word_id)) 149 | # 150 | # return word_embed 151 | 152 | word_embed_src = T.eq(x, UNK).astype('float32')[:, None] 153 | 154 | # 
(sentence_length, word_embedding_dim) 155 | word_embed = word_embeddings(x) 156 | 157 | # (sentence_length, max_char_num_per_word, char_embed_dim) 158 | # (sentence_length, max_char_num_per_word) 159 | word_char_embeds, word_char_masks = char_embeddings(x_chars, mask_zero=True) 160 | 161 | # (sentence_length, word_embedding_dim) 162 | word_embed_from_char = char_lstm(word_char_embeds, mask=word_char_masks) 163 | 164 | sent_embed = word_embed_src * word_embed_from_char + (1 - word_embed_src) * word_embed 165 | 166 | # # (sentence_length, embedding_dim) 167 | # sent_embed, _ = theano.scan(word_embed_look_up_step, sequences=[x, x_chars]) 168 | 169 | # (sentence_length, lstm_hidden_dim) 170 | lstm_output = lstm(T.unbroadcast(sent_embed[None, :, :], 0))[0] 171 | 172 | # (sentence_length, ntags) 173 | mlp_output = T.dot(T.tanh(T.dot(lstm_output, W_mlp_hidden)), W_mlp) 174 | 175 | tag_prob = T.log(T.nnet.softmax(mlp_output)) 176 | 177 | tag_nll = - tag_prob[T.arange(tag_prob.shape[0]), y] 178 | 179 | loss = tag_nll.sum() 180 | 181 | params = word_embeddings.params + char_embeddings.params + char_lstm.params + lstm.params + [W_mlp_hidden, W_mlp] 182 | updates = Adam().get_updates(params, loss) 183 | train_loss_func = theano.function([x, x_chars, y], loss, updates=updates) 184 | 185 | # build the decoding graph 186 | decode_func = theano.function([x, x_chars], tag_prob) 187 | 188 | return train_loss_func, decode_func 189 | 190 | 191 | def sent_to_theano_input(sent): 192 | tags = np.asarray([vt.w2i[t] for w, t in sent], dtype='int32') 193 | words = np.asarray([word2id(w) for w, t in sent], dtype='int32') 194 | 195 | max_char_num_per_word = max(len(w) + 2 for w, t in sent) 196 | word_chars = np.zeros((len(words), max_char_num_per_word), dtype='int32') 197 | pad_char = vc.w2i["<*>"] 198 | for i, (word, tag) in enumerate(sent): 199 | word_chars[i, :len(word) + 2] = [pad_char] + [vc.w2i[c] for c in word] + [pad_char] 200 | 201 | return words, word_chars, tags 202 | 203 | 204 | def tag_sent(sent, decode_func): 205 | words, word_chars, ref_tags = sent_to_theano_input(sent) 206 | 207 | # (sentence_length, tag_num) 208 | tag_prob = decode_func(words, word_chars) 209 | 210 | tag_results = tag_prob.argmax(axis=-1) 211 | tag_results = [vt.i2w[tid] for tid in tag_results] 212 | 213 | return tag_results 214 | 215 | train_func, decode_func = build_tag_graph() 216 | 217 | print("startup time: %r" % (time.time() - start)) 218 | start = time.time() 219 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 220 | 221 | for ITER in range(100): 222 | random.shuffle(train) 223 | for s in train: 224 | i += 1 225 | 226 | if i % 500 == 0: # print status 227 | print(this_loss / this_tagged) 228 | all_tagged += this_tagged 229 | this_loss = this_tagged = 0 230 | all_time = time.time() - start 231 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 232 | dev_start = time.time() 233 | all_time += time.time() - start 234 | good_sent = bad_sent = good = bad = 0.0 235 | for sent in dev: 236 | golds = [t for w, t in sent] 237 | 238 | # package words in a batch 239 | tags = tag_sent(sent, decode_func) 240 | 241 | if tags == golds: 242 | good_sent += 1 243 | else: 244 | bad_sent += 1 245 | for go, gu in zip(golds, tags): 246 | if go == gu: 247 | good += 1 248 | else: 249 | bad += 1 250 | 251 | dev_time += time.time() - dev_start 252 | train_time = time.time() - start - dev_time 253 | print("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % ( 254 | good / (good + bad), good_sent / (good_sent + 
bad_sent), train_time, all_tagged / train_time)) 255 | 256 | if all_time > args.TIMEOUT: 257 | sys.exit(0) 258 | 259 | # train on training sentences 260 | 261 | # word indices 262 | # char indices for each word 263 | # gold tags 264 | words, word_chars, tags = sent_to_theano_input(s) 265 | 266 | loss = train_func(words, word_chars, tags) 267 | 268 | this_loss += loss 269 | this_tagged += len(s) 270 | # print('loss: %f' % loss) 271 | 272 | print("epoch %r finished" % ITER) 273 | -------------------------------------------------------------------------------- /theano/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | start = time.time() 4 | 5 | import random 6 | 7 | import theano.tensor as T 8 | import theano 9 | import numpy as np 10 | import sys 11 | import argparse 12 | from itertools import chain 13 | 14 | from nn.layers.recurrent import LSTM, BiLSTM 15 | from nn.layers.embeddings import Embedding 16 | from nn.activations import softmax 17 | from nn.optimizers import Adam 18 | from nn.initializations import uniform 19 | 20 | from collections import Counter, defaultdict 21 | from itertools import count 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 25 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 26 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 27 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 28 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 29 | args = parser.parse_args() 30 | 31 | MB_SIZE = 1 32 | 33 | # format of files: each line is "word1|tag2 word2|tag2 ..." 34 | train_file="data/tags/train.txt" 35 | dev_file="data/tags/dev.txt" 36 | 37 | 38 | class Vocab: 39 | def __init__(self, w2i=None): 40 | if w2i is None: w2i = defaultdict(count(0).next) 41 | self.w2i = dict(w2i) 42 | self.i2w = {i:w for w,i in w2i.iteritems()} 43 | 44 | @classmethod 45 | def from_corpus(cls, corpus): 46 | w2i = defaultdict(count(0).next) 47 | for sent in corpus: 48 | [w2i[word] for word in sent] 49 | return Vocab(w2i) 50 | 51 | def size(self): 52 | return len(self.w2i.keys()) 53 | 54 | 55 | def read(fname): 56 | """ 57 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 58 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
59 | """ 60 | with file(fname) as fh: 61 | for line in fh: 62 | line = line.strip().split() 63 | sent = [tuple(x.rsplit("|",1)) for x in line] 64 | yield sent 65 | 66 | 67 | train=list(read(train_file)) 68 | dev=list(read(dev_file)) 69 | words=[] 70 | tags=[] 71 | wc=Counter() 72 | words.append('_MASK_') 73 | for sent in train: 74 | for w,p in sent: 75 | words.append(w) 76 | tags.append(p) 77 | wc[w]+=1 78 | words.append("_UNK_") 79 | 80 | vw = Vocab.from_corpus([words]) 81 | vt = Vocab.from_corpus([tags]) 82 | UNK = vw.w2i["_UNK_"] 83 | 84 | # mask token must be of index 0 85 | assert vw.w2i['_MASK_'] == 0 86 | 87 | nwords = vw.size() 88 | ntags = vt.size() 89 | 90 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 91 | 92 | 93 | def word2id(w): 94 | if wc[w] > 5: 95 | w_index = vw.w2i[w] 96 | return w_index 97 | else: 98 | return UNK 99 | 100 | 101 | def pad(seq): 102 | """ 103 | pad a mini-batch input with ending zeros 104 | """ 105 | batch_size = len(seq) 106 | max_len = max(len(seq[i]) for i in xrange(batch_size)) 107 | padded_seq = np.zeros((batch_size, max_len), dtype='int32') 108 | for i in xrange(batch_size): 109 | padded_seq[i, :len(seq[i])] = seq[i] 110 | 111 | return padded_seq 112 | 113 | 114 | def build_tag_graph(): 115 | print ('build graph..', file=sys.stderr) 116 | 117 | # (batch_size, sentence_length) 118 | x = T.imatrix(name='sentence') 119 | 120 | # (batch_size, sentence_length) 121 | y = T.imatrix(name='tag') 122 | 123 | # Lookup parameters for word embeddings 124 | embedding_table = Embedding(nwords, args.WEMBED_SIZE) 125 | 126 | # bi-lstm 127 | lstm = BiLSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, return_sequences=True) 128 | 129 | # MLP 130 | W_mlp_hidden = uniform((args.HIDDEN_SIZE * 2, args.MLP_SIZE), name='W_mlp_hidden') 131 | W_mlp = uniform((args.MLP_SIZE, ntags), name='W_mlp') 132 | 133 | # (batch_size, sentence_length, embedding_dim) 134 | sent_embed, sent_mask = embedding_table(x, mask_zero=True) 135 | 136 | # (batch_size, sentence_length, lstm_hidden_dim) 137 | lstm_output = lstm(sent_embed, mask=sent_mask) 138 | 139 | # (batch_size, sentence_length, ntags) 140 | mlp_output = T.dot(T.tanh(T.dot(lstm_output, W_mlp_hidden)), W_mlp) 141 | 142 | # (batch_size * sentence_length, ntags) 143 | mlp_output = mlp_output.reshape((mlp_output.shape[0] * mlp_output.shape[1], -1)) 144 | 145 | tag_prob_f = T.log(T.nnet.softmax(mlp_output)) 146 | 147 | y_f = y.flatten() 148 | mask_f = sent_mask.flatten() 149 | 150 | tag_nll = - tag_prob_f[T.arange(tag_prob_f.shape[0]), y_f] * mask_f 151 | 152 | loss = tag_nll.sum() 153 | 154 | params = embedding_table.params + lstm.params + [W_mlp_hidden, W_mlp] 155 | updates = Adam().get_updates(params, loss) 156 | train_loss_func = theano.function([x, y], loss, updates=updates) 157 | 158 | # build the decoding graph 159 | tag_prob = tag_prob_f.reshape((x.shape[0], x.shape[1], -1)) 160 | decode_func = theano.function([x], tag_prob) 161 | 162 | return train_loss_func, decode_func 163 | 164 | 165 | def data2ids(batch_data): 166 | batch_sent_ids = [[word2id(w) for w, t in sent] for sent in batch_data] 167 | batch_tag_ids = [[vt.w2i[t] for w, t in sent] for sent in batch_data] 168 | 169 | return batch_sent_ids, batch_tag_ids 170 | 171 | 172 | def tag_sent(batch_sents, decode_func): 173 | batch_sent_ids = [[word2id(w) for w in sent] for sent in batch_sents] 174 | batch_sents_x = pad(batch_sent_ids) 175 | batch_sents_len = [len(sent) for sent in batch_sents] 176 | 177 | # (batch_size, sentence_length, tag_num) 178 | tag_prob = 
decode_func(batch_sents_x) 179 | batch_tag_results = [] 180 | 181 | for i, sent in enumerate(batch_sents): 182 | sent_len = batch_sents_len[i] 183 | tag_results = tag_prob[i].argmax(axis=-1)[:sent_len] 184 | tag_results = [vt.i2w[tid] for tid in tag_results] 185 | batch_tag_results.append(tag_results) 186 | 187 | return batch_tag_results 188 | 189 | train_func, decode_func = build_tag_graph() 190 | 191 | batch_num = int(np.ceil(len(train) / float(MB_SIZE))) 192 | batches = [(i * MB_SIZE, min(len(train), (i + 1) * MB_SIZE)) for i in range(0, batch_num)] 193 | 194 | print ("startup time: %r" % (time.time() - start)) 195 | start = time.time() 196 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 197 | 198 | for ITER in range(100): 199 | random.shuffle(train) 200 | for batch_id, (batch_start, batch_end) in enumerate(batches): 201 | i += MB_SIZE 202 | 203 | if i % 500 == 0: # print status 204 | print (this_loss / this_tagged) 205 | all_tagged += this_tagged 206 | this_loss = this_tagged = 0 207 | all_time = time.time() - start 208 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 209 | dev_start = time.time() 210 | good_sent = bad_sent = good = bad = 0.0 211 | for sent in dev: 212 | words = [w for w, t in sent] 213 | golds = [t for w, t in sent] 214 | 215 | # package words in a batch 216 | tags = tag_sent([words], decode_func) 217 | tags = tags[0] 218 | 219 | if tags == golds: 220 | good_sent += 1 221 | else: 222 | bad_sent += 1 223 | for go, gu in zip(golds, tags): 224 | if go == gu: 225 | good += 1 226 | else: 227 | bad += 1 228 | 229 | dev_time += time.time() - dev_start 230 | train_time = time.time() - start - dev_time 231 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % ( 232 | good / (good + bad), good_sent / (good_sent + bad_sent), train_time, all_tagged / train_time)) 233 | 234 | if all_time > args.TIMEOUT: 235 | sys.exit(0) 236 | 237 | # train on training sentences 238 | 239 | batch_data = train[batch_start:batch_end] 240 | batch_sent_ids, batch_tag_ids = data2ids(batch_data) 241 | 242 | batch_x = pad(batch_sent_ids) 243 | batch_y = pad(batch_tag_ids) 244 | 245 | batch_loss = train_func(batch_x, batch_y) 246 | 247 | this_loss += batch_loss 248 | this_tagged += len(list(chain(*batch_data))) 249 | 250 | print ("epoch %r finished" % ITER) 251 | -------------------------------------------------------------------------------- /theano/bow.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | start = time.time() 4 | 5 | import theano.tensor as T 6 | import theano 7 | import numpy as np 8 | import sys 9 | import random 10 | 11 | from nn.optimizers import SGD, Adam 12 | from nn.initializations import uniform, zero 13 | 14 | from collections import defaultdict 15 | 16 | 17 | # Functions to read in the corpus 18 | w2i = defaultdict(lambda: len(w2i)) 19 | t2i = defaultdict(lambda: len(t2i)) 20 | UNK = w2i[""] 21 | def read_dataset(filename): 22 | with open(filename, "r") as f: 23 | for line in f: 24 | tag, words = line.lower().strip().split(" ||| ") 25 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 26 | 27 | # Read in the data 28 | train = list(read_dataset("data/classes/train.txt")) 29 | w2i = defaultdict(lambda: UNK, w2i) 30 | dev = list(read_dataset("data/classes/test.txt")) 31 | nwords = len(w2i) 32 | ntags = len(t2i) 33 | 34 | 35 | # Define the model 36 | W_sm = zero((nwords, ntags)) # Word weights 37 | b_sm = zero((ntags)) # 
Softmax bias 38 | 39 | # bag of words input 40 | x = T.ivector('words') 41 | # gold class 42 | y = T.iscalar('class') 43 | 44 | score = T.sum(W_sm[x], axis=0) + b_sm 45 | # log likelihood 46 | ll = T.log(T.nnet.softmax(score)).flatten() 47 | # negative log likelihood loss 48 | loss = - ll[y] 49 | 50 | params = [W_sm, b_sm] 51 | updates = Adam(lr=0.001).get_updates(params, loss) 52 | 53 | train_func = theano.function([x, y], loss, updates=updates) 54 | test_func = theano.function([x], score) 55 | 56 | print ("startup time: %r" % (time.time() - start)) 57 | for ITER in range(100): 58 | # Perform training 59 | random.shuffle(train) 60 | train_loss = 0.0 61 | start = time.time() 62 | for i, (words, tag) in enumerate(train): 63 | my_loss = train_func(words, tag) 64 | train_loss += my_loss 65 | # print(b_sm.get_value()) 66 | # if i > 5: 67 | # sys.exit(0) 68 | 69 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 70 | 71 | # Perform testing 72 | test_correct = 0.0 73 | for words, tag in dev: 74 | scores = test_func(words) 75 | predict = np.argmax(scores) 76 | if predict == tag: 77 | test_correct += 1 78 | 79 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /theano/nn/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'yinpengcheng' 2 | -------------------------------------------------------------------------------- /theano/nn/activations.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def softmax(x): 5 | return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) 6 | 7 | 8 | def time_distributed_softmax(x): 9 | import warnings 10 | warnings.warn("time_distributed_softmax is deprecated. 
Just use softmax!", DeprecationWarning) 11 | return softmax(x) 12 | 13 | 14 | def softplus(x): 15 | return T.nnet.softplus(x) 16 | 17 | 18 | def relu(x): 19 | return T.nnet.relu(x) 20 | 21 | 22 | def tanh(x): 23 | return T.tanh(x) 24 | 25 | 26 | def sigmoid(x): 27 | return T.nnet.sigmoid(x) 28 | 29 | 30 | def hard_sigmoid(x): 31 | return T.nnet.hard_sigmoid(x) 32 | 33 | 34 | def linear(x): 35 | ''' 36 | The function returns the variable that is passed in, so all types work 37 | ''' 38 | return x 39 | 40 | 41 | from .utils.generic_utils import get_from_module 42 | def get(identifier): 43 | return get_from_module(identifier, globals(), 'activation function') 44 | -------------------------------------------------------------------------------- /theano/nn/initializations.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | from .utils.theano_utils import sharedX, shared_zeros, shared_ones 6 | 7 | 8 | def get_fans(shape): 9 | fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:]) 10 | fan_out = shape[1] if len(shape) == 2 else shape[0] 11 | return fan_in, fan_out 12 | 13 | 14 | def uniform(shape, scale=0.1, name=None): 15 | return sharedX(np.random.uniform(low=-scale, high=scale, size=shape), name=name) 16 | 17 | 18 | def normal(shape, scale=0.05, name=None): 19 | return sharedX(np.random.randn(*shape) * scale, name=name) 20 | 21 | 22 | def lecun_uniform(shape): 23 | ''' Reference: LeCun 98, Efficient Backprop 24 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 25 | ''' 26 | fan_in, fan_out = get_fans(shape) 27 | scale = np.sqrt(3. / fan_in) 28 | return uniform(shape, scale) 29 | 30 | 31 | def glorot_normal(shape): 32 | ''' Reference: Glorot & Bengio, AISTATS 2010 33 | ''' 34 | fan_in, fan_out = get_fans(shape) 35 | s = np.sqrt(2. / (fan_in + fan_out)) 36 | return normal(shape, s) 37 | 38 | 39 | def glorot_uniform(shape, name=None): 40 | fan_in, fan_out = get_fans(shape) 41 | s = np.sqrt(6. / (fan_in + fan_out)) 42 | return uniform(shape, s, name=name) 43 | 44 | 45 | def he_normal(shape): 46 | ''' Reference: He et al., http://arxiv.org/abs/1502.01852 47 | ''' 48 | fan_in, fan_out = get_fans(shape) 49 | s = np.sqrt(2. / fan_in) 50 | return normal(shape, s) 51 | 52 | 53 | def he_uniform(shape): 54 | fan_in, fan_out = get_fans(shape) 55 | s = np.sqrt(6. 
/ fan_in) 56 | return uniform(shape, s) 57 | 58 | 59 | def orthogonal(shape, scale=1.1): 60 | ''' From Lasagne 61 | ''' 62 | flat_shape = (shape[0], np.prod(shape[1:])) 63 | a = np.random.normal(0.0, 1.0, flat_shape) 64 | u, _, v = np.linalg.svd(a, full_matrices=False) 65 | # pick the one with the correct shape 66 | q = u if u.shape == flat_shape else v 67 | q = q.reshape(shape) 68 | return sharedX(scale * q[:shape[0], :shape[1]]) 69 | 70 | 71 | def identity(shape, scale=1): 72 | if len(shape) != 2 or shape[0] != shape[1]: 73 | raise Exception("Identity matrix initialization can only be used for 2D square matrices") 74 | else: 75 | return sharedX(scale * np.identity(shape[0])) 76 | 77 | 78 | def zero(shape): 79 | return shared_zeros(shape) 80 | 81 | 82 | def one(shape): 83 | return shared_ones(shape) 84 | 85 | 86 | from .utils.generic_utils import get_from_module 87 | def get(identifier): 88 | return get_from_module(identifier, globals(), 'initialization') 89 | -------------------------------------------------------------------------------- /theano/nn/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/dynet-benchmark/b23f01f6f3c5386f67c9a355d9b25d72deb03ced/theano/nn/layers/__init__.py -------------------------------------------------------------------------------- /theano/nn/layers/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import theano 4 | import theano.tensor as T 5 | import numpy as np 6 | 7 | from nn.utils.theano_utils import * 8 | import nn.initializations as initializations 9 | import nn.activations as activations 10 | 11 | from theano.tensor.shared_randomstreams import RandomStreams 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams 13 | 14 | 15 | class Layer(object): 16 | def __init__(self): 17 | self.params = [] 18 | 19 | def __call__(self, X): 20 | return X 21 | 22 | def get_params(self): 23 | return self.params 24 | 25 | def set_name(self, name): 26 | if name: 27 | for i in range(len(self.params)): 28 | if self.params[i].name is None: 29 | self.params[i].name = '%s_p%d' % (name, i) 30 | else: 31 | self.params[i].name = name + '_' + self.params[i].name 32 | 33 | self.name = name 34 | 35 | 36 | class Dense(Layer): 37 | def __init__(self, input_dim, output_dim, init='glorot_uniform', activation='tanh', name='Dense'): 38 | 39 | super(Dense, self).__init__() 40 | self.init = initializations.get(init) 41 | self.activation = activations.get(activation) 42 | self.input_dim = input_dim 43 | self.output_dim = output_dim 44 | 45 | self.input = T.matrix() 46 | self.W = self.init((self.input_dim, self.output_dim)) 47 | self.b = shared_zeros((self.output_dim)) 48 | 49 | self.params = [self.W, self.b] 50 | 51 | if name is not None: 52 | self.set_name(name) 53 | 54 | def set_name(self, name): 55 | self.W.name = '%s_W' % name 56 | self.b.name = '%s_b' % name 57 | 58 | def __call__(self, X): 59 | output = self.activation(T.dot(X, self.W) + self.b) 60 | return output 61 | 62 | 63 | class Dropout(Layer): 64 | def __init__(self, p, srng, name='dropout'): 65 | super(Dropout, self).__init__() 66 | 67 | assert 0. < p < 1. 68 | 69 | self.p = p 70 | self.srng = srng 71 | 72 | if name is not None: 73 | self.set_name(name) 74 | 75 | def __call__(self, X, train_only=True): 76 | retain_prob = 1. 
- self.p 77 | 78 | X_train = X * self.srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) 79 | X_test = X * retain_prob 80 | 81 | if train_only: 82 | return X_train 83 | else: 84 | return X_train, X_test 85 | -------------------------------------------------------------------------------- /theano/nn/layers/embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .core import Layer 4 | from nn.utils.theano_utils import * 5 | import nn.initializations as initializations 6 | 7 | import nn.activations as activations 8 | from theano.ifelse import ifelse 9 | 10 | class Embedding(Layer): 11 | ''' 12 | Turn positive integers (indexes) into denses vectors of fixed size. 13 | eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]] 14 | 15 | @input_dim: size of vocabulary (highest input integer + 1) 16 | @out_dim: size of dense representation 17 | ''' 18 | def __init__(self, input_dim, output_dim, init='uniform', name=None): 19 | 20 | super(Embedding, self).__init__() 21 | self.init = initializations.get(init) 22 | self.input_dim = input_dim 23 | self.output_dim = output_dim 24 | 25 | self.W = self.init((self.input_dim, self.output_dim)) 26 | 27 | self.params = [self.W] 28 | 29 | if name is not None: 30 | self.set_name(name) 31 | 32 | def get_output_mask(self, X): 33 | return T.ones_like(X, dtype=theano.config.floatX) * (1. - T.eq(X, 0)) 34 | 35 | def __call__(self, X, mask_zero=False): 36 | out = self.W[X] 37 | if mask_zero: 38 | return out, self.get_output_mask(X) 39 | else: 40 | return out 41 | -------------------------------------------------------------------------------- /theano/nn/optimizers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import theano 3 | import theano.tensor as T 4 | 5 | from .utils.theano_utils import shared_zeros, shared_scalar, floatX 6 | from .utils.generic_utils import get_from_module 7 | from six.moves import zip 8 | from theano.sandbox.rng_mrg import MRG_RandomStreams 9 | from theano.tensor.shared_randomstreams import RandomStreams 10 | import math 11 | from nn.utils.config_factory import config 12 | 13 | 14 | def clip_norm(g, c, n): 15 | if c > 0: 16 | g = T.switch(T.ge(n, c), g * c / n, g) 17 | return g 18 | 19 | 20 | def kl_divergence(p, p_hat): 21 | return p_hat - p + p * T.log(p / p_hat) 22 | 23 | 24 | class Optimizer(object): 25 | def __init__(self, **kwargs): 26 | self.__dict__.update(kwargs) 27 | self.updates = [] 28 | 29 | def get_state(self): 30 | return [u[0].get_value() for u in self.updates] 31 | 32 | def set_state(self, value_list): 33 | assert len(self.updates) == len(value_list) 34 | for u, v in zip(self.updates, value_list): 35 | u[0].set_value(floatX(v)) 36 | 37 | def get_updates(self, params, constraints, loss, **kwargs): 38 | raise NotImplementedError 39 | 40 | def get_gradients(self, loss, params, **kwargs): 41 | 42 | grads = T.grad(loss, params, disconnected_inputs='warn', **kwargs) 43 | 44 | if hasattr(self, 'clipnorm') and self.clipnorm > 0: 45 | norm = T.sqrt(sum([T.sum(g ** 2) for g in grads])) 46 | norm = theano.printing.Print('gradient norm::')(norm) 47 | grads = [clip_norm(g, self.clipnorm, norm) for g in grads] 48 | 49 | return grads 50 | 51 | def get_config(self): 52 | return {"name": self.__class__.__name__} 53 | 54 | 55 | class SGD(Optimizer): 56 | 57 | def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, *args, **kwargs): 58 | super(SGD, 
self).__init__(**kwargs) 59 | self.__dict__.update(locals()) 60 | self.iterations = shared_scalar(0) 61 | self.lr = shared_scalar(lr) 62 | self.momentum = shared_scalar(momentum) 63 | 64 | def get_updates(self, params, loss): 65 | grads = self.get_gradients(loss, params) 66 | lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations)) 67 | self.updates = [(self.iterations, self.iterations + 1.)] 68 | 69 | for p, g in zip(params, grads): 70 | m = shared_zeros(p.get_value().shape) # momentum 71 | v = self.momentum * m - lr * g # velocity 72 | self.updates.append((m, v)) 73 | 74 | if self.nesterov: 75 | new_p = p + self.momentum * v - lr * g 76 | else: 77 | new_p = p + v 78 | 79 | self.updates.append((p, new_p)) 80 | return self.updates 81 | 82 | def get_config(self): 83 | return {"name": self.__class__.__name__, 84 | "lr": float(self.lr.get_value()), 85 | "momentum": float(self.momentum.get_value()), 86 | "decay": float(self.decay.get_value()), 87 | "nesterov": self.nesterov} 88 | 89 | 90 | class RMSprop(Optimizer): 91 | def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs): 92 | super(RMSprop, self).__init__(**kwargs) 93 | self.__dict__.update(locals()) 94 | self.lr = shared_scalar(lr) 95 | self.rho = shared_scalar(rho) 96 | 97 | def get_updates(self, params, constraints, loss): 98 | grads = self.get_gradients(loss, params) 99 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 100 | self.updates = [] 101 | 102 | for p, g, a, c in zip(params, grads, accumulators, constraints): 103 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 104 | self.updates.append((a, new_a)) 105 | 106 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 107 | self.updates.append((p, c(new_p))) # apply constraints 108 | return self.updates 109 | 110 | def get_config(self): 111 | return {"name": self.__class__.__name__, 112 | "lr": float(self.lr.get_value()), 113 | "rho": float(self.rho.get_value()), 114 | "epsilon": self.epsilon} 115 | 116 | 117 | class Adagrad(Optimizer): 118 | def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs): 119 | super(Adagrad, self).__init__(**kwargs) 120 | self.__dict__.update(locals()) 121 | self.lr = shared_scalar(lr) 122 | 123 | def get_updates(self, params, constraints, loss): 124 | grads = self.get_gradients(loss, params) 125 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 126 | self.updates = [] 127 | 128 | for p, g, a, c in zip(params, grads, accumulators, constraints): 129 | new_a = a + g ** 2 # update accumulator 130 | self.updates.append((a, new_a)) 131 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 132 | self.updates.append((p, c(new_p))) # apply constraints 133 | return self.updates 134 | 135 | def get_config(self): 136 | return {"name": self.__class__.__name__, 137 | "lr": float(self.lr.get_value()), 138 | "epsilon": self.epsilon} 139 | 140 | 141 | class Adadelta(Optimizer): 142 | ''' 143 | Reference: http://arxiv.org/abs/1212.5701 144 | ''' 145 | def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs): 146 | super(Adadelta, self).__init__(**kwargs) 147 | self.__dict__.update(locals()) 148 | self.lr = shared_scalar(lr) 149 | 150 | def get_updates(self, params, loss): 151 | grads = self.get_gradients(loss, params) 152 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 153 | delta_accumulators = [shared_zeros(p.get_value().shape) for p in params] 154 | self.updates = [] 155 | 156 | for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): 157 
| new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 158 | self.updates.append((a, new_a)) 159 | 160 | # use the new accumulator and the *old* delta_accumulator 161 | update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + 162 | self.epsilon) 163 | 164 | new_p = p - self.lr * update 165 | self.updates.append((p, new_p)) 166 | 167 | # update delta_accumulator 168 | new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2 169 | self.updates.append((d_a, new_d_a)) 170 | return self.updates, grads 171 | 172 | def get_config(self): 173 | return {"name": self.__class__.__name__, 174 | "lr": float(self.lr.get_value()), 175 | "rho": self.rho, 176 | "epsilon": self.epsilon} 177 | 178 | 179 | class Adam(Optimizer): 180 | ''' 181 | Reference: http://arxiv.org/abs/1412.6980v8 182 | 183 | Default parameters follow those provided in the original paper. 184 | ''' 185 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, *args, **kwargs): 186 | super(Adam, self).__init__(**kwargs) 187 | self.__dict__.update(locals()) 188 | self.iterations = shared_scalar(0) 189 | self.lr = shared_scalar(lr) 190 | 191 | def get_updates(self, params, loss, **kwargs): 192 | grads = self.get_gradients(loss, params, **kwargs) 193 | self.updates = [(self.iterations, self.iterations+1.)] 194 | 195 | t = self.iterations + 1 196 | lr_t = self.lr * T.sqrt(1-self.beta_2**t)/(1-self.beta_1**t) 197 | 198 | 199 | for p, g in zip(params, grads): 200 | m = theano.shared(p.get_value() * 0.) # zero init of moment 201 | v = theano.shared(p.get_value() * 0.) # zero init of velocity 202 | 203 | m_t = (self.beta_1 * m) + (1 - self.beta_1) * g 204 | v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g**2) 205 | p_t = p - lr_t * m_t / (T.sqrt(v_t) + self.epsilon) 206 | 207 | self.updates.append((m, m_t)) 208 | self.updates.append((v, v_t)) 209 | self.updates.append((p, p_t)) 210 | 211 | return self.updates 212 | 213 | def get_config(self): 214 | return {"name": self.__class__.__name__, 215 | "lr": float(self.lr.get_value()), 216 | "beta_1": self.beta_1, 217 | "beta_2": self.beta_2, 218 | "epsilon": self.epsilon} 219 | 220 | 221 | # aliases 222 | sgd = SGD 223 | rmsprop = RMSprop 224 | adagrad = Adagrad 225 | adadelta = Adadelta 226 | adam = Adam 227 | 228 | 229 | def get(identifier, kwargs=None): 230 | return get_from_module(identifier, globals(), 'optimizer', instantiate=True, 231 | kwargs=kwargs) 232 | -------------------------------------------------------------------------------- /theano/nn/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/dynet-benchmark/b23f01f6f3c5386f67c9a355d9b25d72deb03ced/theano/nn/utils/__init__.py -------------------------------------------------------------------------------- /theano/nn/utils/config_factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class MetaConfig(type): 5 | def __getitem__(self, key): 6 | return config._config[key] 7 | 8 | def __setitem__(self, key, value): 9 | config._config[key] = value 10 | 11 | 12 | class config(object): 13 | _config = {} 14 | __metaclass__ = MetaConfig 15 | 16 | @staticmethod 17 | def set(key, val): 18 | config._config[key] = val 19 | 20 | @staticmethod 21 | def init_config(file='config.py'): 22 | if len(config._config) > 0: 23 | return 24 | 25 | logging.info('use configuration: %s', file) 26 | data = {} 27 | execfile(file, data) 28 | config._config = data['config'] 
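# --- editorial usage sketch, not part of the original file ---
# Because MetaConfig installs __getitem__/__setitem__ on the class object
# itself (Python 2 metaclass syntax), `config` acts as a process-wide
# key/value store, and init_config() execfile()s a Python file that must
# define a dict literally named `config`. The keys and file name below are
# made up purely for illustration.
if __name__ == '__main__':
    config.set('dropout', 0.5)           # explicit setter
    config['hidden_size'] = 128          # routed through MetaConfig.__setitem__
    print(config['dropout'])             # routed through MetaConfig.__getitem__
    # config.init_config('experiment_config.py')  # hypothetical config module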
-------------------------------------------------------------------------------- /theano/nn/utils/generic_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import time 4 | import sys 5 | import six 6 | import logging 7 | 8 | 9 | def get_from_module(identifier, module_params, module_name, instantiate=False, kwargs=None): 10 | if isinstance(identifier, six.string_types): 11 | res = module_params.get(identifier) 12 | if not res: 13 | raise Exception('Invalid ' + str(module_name) + ': ' + str(identifier)) 14 | if instantiate and not kwargs: 15 | return res() 16 | elif instantiate and kwargs: 17 | return res(**kwargs) 18 | else: 19 | return res 20 | return identifier 21 | 22 | 23 | def make_tuple(*args): 24 | return args 25 | 26 | 27 | def make_batches(size, batch_size): 28 | nb_batch = int(np.ceil(size/float(batch_size))) 29 | return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)] 30 | 31 | 32 | def pad_sequences(sequences, maxlen=None, dtype='int32', 33 | padding='pre', truncating='pre', value=0.): 34 | '''Pads each sequence to the same length: 35 | the length of the longest sequence. 36 | 37 | If maxlen is provided, any sequence longer 38 | than maxlen is truncated to maxlen. 39 | Truncation happens off either the beginning (default) or 40 | the end of the sequence. 41 | 42 | Supports post-padding and pre-padding (default). 43 | 44 | # Arguments 45 | sequences: list of lists where each element is a sequence 46 | maxlen: int, maximum length 47 | dtype: type to cast the resulting sequence. 48 | padding: 'pre' or 'post', pad either before or after each sequence. 49 | truncating: 'pre' or 'post', remove values from sequences larger than 50 | maxlen either in the beginning or in the end of the sequence 51 | value: float, value to pad the sequences to the desired value. 52 | 53 | # Returns 54 | x: numpy array with dimensions (number_of_sequences, maxlen) 55 | ''' 56 | lengths = [len(s) for s in sequences] 57 | 58 | nb_samples = len(sequences) 59 | if maxlen is None: 60 | maxlen = np.max(lengths) 61 | 62 | # take the sample shape from the first non empty sequence 63 | # checking for consistency in the main loop below. 
64 | sample_shape = tuple() 65 | for s in sequences: 66 | if len(s) > 0: 67 | sample_shape = np.asarray(s).shape[1:] 68 | break 69 | 70 | x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype) 71 | for idx, s in enumerate(sequences): 72 | if len(s) == 0: 73 | continue # empty list was found 74 | if truncating == 'pre': 75 | trunc = s[-maxlen:] 76 | elif truncating == 'post': 77 | trunc = s[:maxlen] 78 | else: 79 | raise ValueError('Truncating type "%s" not understood' % truncating) 80 | 81 | # check `trunc` has expected shape 82 | trunc = np.asarray(trunc, dtype=dtype) 83 | if trunc.shape[1:] != sample_shape: 84 | raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' % 85 | (trunc.shape[1:], idx, sample_shape)) 86 | 87 | if padding == 'post': 88 | x[idx, :len(trunc)] = trunc 89 | elif padding == 'pre': 90 | x[idx, -len(trunc):] = trunc 91 | else: 92 | raise ValueError('Padding type "%s" not understood' % padding) 93 | return x 94 | -------------------------------------------------------------------------------- /theano/nn/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import cPickle 4 | 5 | 6 | def serialize_to_file(obj, path, protocol=cPickle.HIGHEST_PROTOCOL): 7 | f = open(path, 'wb') 8 | cPickle.dump(obj, f, protocol=protocol) 9 | f.close() 10 | 11 | 12 | def deserialize_from_file(path): 13 | f = open(path, 'rb') 14 | obj = cPickle.load(f) 15 | f.close() 16 | return obj -------------------------------------------------------------------------------- /theano/nn/utils/theano_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | 7 | def floatX(X): 8 | return np.asarray(X, dtype=theano.config.floatX) 9 | 10 | 11 | def sharedX(X, dtype=theano.config.floatX, name=None): 12 | return theano.shared(np.asarray(X, dtype=dtype), name=name) 13 | 14 | 15 | def shared_zeros(shape, dtype=theano.config.floatX, name=None): 16 | return sharedX(np.zeros(shape), dtype=dtype, name=name) 17 | 18 | 19 | def shared_scalar(val=0., dtype=theano.config.floatX, name=None): 20 | return theano.shared(np.cast[dtype](val)) 21 | 22 | 23 | def shared_ones(shape, dtype=theano.config.floatX, name=None): 24 | return sharedX(np.ones(shape), dtype=dtype, name=name) 25 | 26 | 27 | def alloc_zeros_matrix(*dims): 28 | return T.alloc(np.cast[theano.config.floatX](0.), *dims) 29 | 30 | 31 | def tensor_right_shift(tensor): 32 | temp = T.zeros_like(tensor) 33 | temp = T.set_subtensor(temp[:, 1:, :], tensor[:, :-1, :]) 34 | 35 | return temp 36 | 37 | 38 | def ndim_tensor(ndim, name=None): 39 | if ndim == 1: 40 | return T.vector() 41 | elif ndim == 2: 42 | return T.matrix() 43 | elif ndim == 3: 44 | return T.tensor3() 45 | elif ndim == 4: 46 | return T.tensor4() 47 | return T.matrix(name=name) 48 | 49 | 50 | # get int32 tensor 51 | def ndim_itensor(ndim, name=None): 52 | if ndim == 2: 53 | return T.imatrix(name) 54 | elif ndim == 3: 55 | return T.itensor3(name) 56 | elif ndim == 4: 57 | return T.itensor4(name) 58 | return T.imatrix(name=name) 59 | 60 | 61 | # get int8 tensor 62 | def ndim_btensor(ndim, name=None): 63 | if ndim == 2: 64 | return T.bmatrix(name) 65 | elif ndim == 3: 66 | return T.btensor3(name) 67 | elif ndim == 4: 68 | return T.btensor4(name) 69 | return T.imatrix(name) 
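# --- editorial usage sketch, not part of the original file ---
# tensor_right_shift() shifts a (batch, time, dim) tensor one step along the
# time axis and zero-fills t=0; theano/rnnlm-batch.py builds the same shifted
# input inline with T.set_subtensor when feeding previous-word embeddings to
# its LSTM. Shapes and values below are arbitrary illustration data.
if __name__ == '__main__':
    x = T.tensor3('x')
    shift = theano.function([x], tensor_right_shift(x))
    batch = floatX(np.arange(2 * 3 * 4).reshape((2, 3, 4)))
    out = shift(batch)
    assert np.allclose(out[:, 0, :], 0.)                  # first step zeroed
    assert np.allclose(out[:, 1:, :], batch[:, :-1, :])   # rest shifted right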
-------------------------------------------------------------------------------- /theano/rnnlm-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | start = time.time() 4 | 5 | import theano.tensor as T 6 | import theano 7 | import numpy as np 8 | import sys, time 9 | import random 10 | import cProfile 11 | import argparse 12 | from itertools import chain 13 | 14 | from nn.layers.recurrent import LSTM 15 | from nn.layers.embeddings import Embedding 16 | from nn.optimizers import Adam, SGD 17 | from nn.initializations import uniform 18 | 19 | from collections import Counter, defaultdict 20 | from itertools import count 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 24 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 25 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 26 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 27 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 28 | args = parser.parse_args() 29 | 30 | train_file = 'data/text/train.txt' 31 | test_file = 'data/text/dev.txt' 32 | 33 | w2i = defaultdict(count(0).next) 34 | 35 | 36 | def read(fname): 37 | """ 38 | Read a file where each line is of the form "word1 word2 ..." 39 | Yields lists of the form [word1, word2, ...] 40 | """ 41 | with file(fname) as fh: 42 | for line in fh: 43 | sent = [w2i[x] for x in line.strip().split()] 44 | sent.append(w2i[""]) 45 | yield sent 46 | 47 | mask = w2i[''] 48 | assert mask == 0 49 | 50 | train = list(read(train_file)) 51 | vocab_size = len(w2i) 52 | test = list(read(test_file)) 53 | S = w2i[''] 54 | 55 | 56 | def pad(seq): 57 | """ 58 | pad a mini-batch input with ending zeros 59 | """ 60 | batch_size = len(seq) 61 | max_len = max(len(seq[i]) for i in xrange(batch_size)) 62 | padded_seq = np.zeros((batch_size, max_len), dtype='int32') 63 | for i in xrange(batch_size): 64 | padded_seq[i, :len(seq[i])] = seq[i] 65 | 66 | return padded_seq 67 | 68 | 69 | def build_graph(): 70 | print('build graph..') 71 | # Lookup parameters for word embeddings 72 | embedding_table = Embedding(vocab_size, args.EMBED_SIZE) 73 | 74 | lstm = LSTM(args.EMBED_SIZE, args.HIDDEN_SIZE, inner_init="identity", return_sequences=True) 75 | 76 | # Softmax weights/biases on top of LSTM outputs 77 | W_sm = uniform((args.HIDDEN_SIZE, vocab_size), scale=.5, name='W_sm') 78 | b_sm = uniform(vocab_size, scale=.5, name='b_sm') 79 | 80 | # (batch_size, sentence_length) 81 | x = T.imatrix(name='sentence') 82 | 83 | # (batch_size, sentence_length, embedding_dim) 84 | sent_embed, sent_mask = embedding_table(x, mask_zero=True) 85 | 86 | lstm_input = T.set_subtensor(T.zeros_like(sent_embed)[:, 1:, :], sent_embed[:, :-1, :]) 87 | lstm_input = T.set_subtensor(lstm_input[:, 0, :], embedding_table(S)[None, :]) 88 | 89 | # (batch_size, sentence_length, output_dim) 90 | lstm_output = lstm(lstm_input) 91 | 92 | # (batch_size, sentence_length, vocab_size) 93 | logits = T.dot(lstm_output, W_sm) + b_sm 94 | logits = T.nnet.softmax(logits.reshape((logits.shape[0] * logits.shape[1], vocab_size))).reshape(logits.shape) 95 | 96 | loss = T.log(logits).reshape((-1, logits.shape[-1])) 97 | # (batch_size * sentence_length) 98 | loss = loss[T.arange(loss.shape[0]), x.flatten()] 99 | # (batch_size, sentence_length) 100 | loss = - loss.reshape((x.shape[0], x.shape[1])) * sent_mask 101 | # loss = loss.sum(axis=-1) 
/ sent_mask.sum(axis=-1) 102 | # loss = -T.mean(loss) 103 | 104 | # loss is the sum of nll over all words over all examples in the mini-batch 105 | loss = loss.sum() 106 | 107 | params = embedding_table.params + lstm.params + [W_sm, b_sm] 108 | updates = Adam(lr=0.001).get_updates(params, loss) 109 | # updates = SGD(lr=0.01).get_updates(params, loss) 110 | train_loss_func = theano.function([x], loss, updates=updates) 111 | test_loss_func = theano.function([x], loss) 112 | 113 | return train_loss_func, test_loss_func 114 | 115 | train_loss_func, test_loss_func = build_graph() 116 | 117 | # Sort training sentences in descending order and count minibatches 118 | train.sort(key=lambda x: -len(x)) 119 | test.sort(key=lambda x: -len(x)) 120 | train_order = [x * args.MB_SIZE for x in range(int((len(train) - 1) / args.MB_SIZE + 1))] 121 | test_order = [x * args.MB_SIZE for x in range(int((len(test) - 1) / args.MB_SIZE + 1))] 122 | 123 | # Perform training 124 | print("startup time: %r" % (time.time() - start)) 125 | start = time.time() 126 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 127 | for ITER in range(100): 128 | random.shuffle(train_order) 129 | for sid in train_order: 130 | i += 1 131 | if i % int(500 / args.MB_SIZE) == 0: 132 | print(this_loss / this_words) 133 | all_tagged += this_words 134 | this_loss = this_words = 0 135 | all_time = time.time() - start 136 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 137 | dev_start = time.time() 138 | dev_loss = dev_words = 0 139 | for test_sid in test_order: 140 | batch_sents = test[test_sid:test_sid + args.MB_SIZE] 141 | batch_sents_x = pad(batch_sents) 142 | 143 | batch_loss = test_loss_func(batch_sents_x) 144 | dev_loss += batch_loss 145 | 146 | mb_words = sum(len(s) for s in batch_sents) 147 | dev_words += mb_words 148 | dev_time += time.time() - dev_start 149 | train_time = time.time() - start - dev_time 150 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % ( 151 | dev_loss / dev_words, np.exp(dev_loss / dev_words), dev_words, train_time, all_tagged / train_time)) 152 | if all_time > args.TIMEOUT: 153 | sys.exit(0) 154 | 155 | # train on the minibatch 156 | 157 | batch_sents = train[sid:sid + args.MB_SIZE] 158 | batch_sents_x = pad(batch_sents) 159 | 160 | batch_loss = train_loss_func(batch_sents_x) 161 | this_loss += batch_loss 162 | # print("loss @ %r: %r" % (i, this_loss)) 163 | mb_words = sum(len(s) for s in batch_sents) 164 | this_words += mb_words 165 | 166 | print("epoch %r finished" % ITER) 167 | --------------------------------------------------------------------------------
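A note on the batching convention shared by theano/rnnlm-batch.py and theano/bilstm-tagger.py above: word index 0 is reserved for the padding/mask token, pad() right-pads every mini-batch with zeros, and the per-token loss is multiplied by the mask returned by Embedding(..., mask_zero=True), so padded positions contribute nothing to the summed NLL. The stand-alone sketch below is editorial, uses toy numbers, and re-implements pad() locally rather than importing it; it only illustrates the convention.

import numpy as np

def pad(seq):
    # right-pad a list of int lists with zeros, as in theano/rnnlm-batch.py
    max_len = max(len(s) for s in seq)
    padded = np.zeros((len(seq), max_len), dtype='int32')
    for i, s in enumerate(seq):
        padded[i, :len(s)] = s
    return padded

batch = [[5, 2, 7], [3, 9]]              # toy word ids; 0 is the mask id
x = pad(batch)                           # [[5 2 7], [3 9 0]]
mask = (x != 0).astype('float32')        # what Embedding(mask_zero=True) yields
per_token_nll = np.ones_like(mask)       # pretend each real token costs 1 nat
print((per_token_nll * mask).sum())      # 5.0 -- the padded cell is ignored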