├── .gitignore ├── .travis.yml ├── README.md ├── chainer ├── bilstm-tagger-withchar.py ├── bilstm-tagger.py ├── rnnlm-batch.py └── treenn.py ├── data ├── README.md ├── classes │ ├── dev.txt │ ├── test.txt │ └── train.txt ├── tags │ ├── dev.txt │ └── train.txt ├── text │ ├── dev.txt │ ├── test.txt │ └── train.txt └── trees │ ├── dev.txt │ ├── test.txt │ └── train.txt ├── dynet-cpp ├── Makefile ├── bilstm-tagger-bulk.cc ├── bilstm-tagger-withchar-bulk.cc ├── bilstm-tagger-withchar.cc ├── bilstm-tagger.cc ├── rnnlm-batch.cc ├── rnnlm-seq.cc ├── treenn-bulk.cc └── treenn.cc ├── dynet-py ├── bilstm-tagger-withchar.py ├── bilstm-tagger.py ├── bow.py ├── rnnlm-batch-batch.py ├── rnnlm-batch.py ├── treenn-bulk.py └── treenn.py ├── make-report.py ├── pytorch ├── bilstm-tagger-withchar.py ├── bilstm-tagger.py └── rnnlm.py ├── run-tests.sh ├── tensorflow ├── bilstm-tagger.py ├── bow.py └── rnnlm-batch.py └── theano ├── README.md ├── bilstm-tagger-withchar.py ├── bilstm-tagger.py ├── bow.py ├── nn ├── __init__.py ├── activations.py ├── initializations.py ├── layers │ ├── __init__.py │ ├── core.py │ ├── embeddings.py │ └── recurrent.py ├── optimizers.py └── utils │ ├── __init__.py │ ├── config_factory.py │ ├── generic_utils.py │ ├── io_utils.py │ └── theano_utils.py └── rnnlm-batch.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.dSYM 3 | *.pyc 4 | log 5 | dynet-cpp/rnnlm-batch 6 | dynet-cpp/rnnlm-seq 7 | dynet-cpp/treenn 8 | dynet-cpp/treenn-bulk 9 | dynet-cpp/bilstm-tagger 10 | dynet-cpp/bilstm-tagger-bulk 11 | dynet-cpp/bilstm-tagger-withchar 12 | dynet-cpp/bilstm-tagger-withchar-bulk 13 | dynet-cpp/rnnlm-batch-gpu 14 | dynet-cpp/rnnlm-seq-gpu 15 | dynet-cpp/treenn-gpu 16 | dynet-cpp/treenn-bulk-gpu 17 | dynet-cpp/bilstm-tagger-gpu 18 | dynet-cpp/bilstm-tagger-bulk-gpu 19 | dynet-cpp/bilstm-tagger-withchar-gpu 20 | dynet-cpp/bilstm-tagger-withchar-bulk-gpu 21 | dynet-benchmark-results*.tar.gz 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | language: python 3 | python: 4 | - 2.7 5 | env: 6 | global: 7 | - DYNET_PATH=$PWD/dynet EIGEN_PATH=$PWD/eigen DYFLAGS="--dynet-mem 2048" TIMEOUT=200 LONGTIMEOUT=300 8 | matrix: 9 | - TEST=dynet TASK=rnnlm-batch MBSIZE=64 10 | - TEST=dynet TASK=rnnlm-batch MBSIZE=16 11 | - TEST=dynet TASK=rnnlm-batch MBSIZE=04 12 | - TEST=dynet TASK=rnnlm-batch MBSIZE=01 13 | - TEST=dynet TASK=sparse-rnnlm-batch MBSIZE=16 14 | - TEST=dynet TASK=sparse-rnnlm-batch MBSIZE=01 15 | - TEST=dynet TASK=bilstm-tagger 16 | - TEST=dynet TASK=bilstm-tagger-withchar 17 | - TEST=dynet TASK=treenn 18 | - TEST=chainer TASK=rnnlm-batch MBSIZE=64 19 | - TEST=chainer TASK=rnnlm-batch MBSIZE=16 20 | - TEST=chainer TASK=rnnlm-batch MBSIZE=04 21 | - TEST=chainer TASK=rnnlm-batch MBSIZE=01 22 | - TEST=chainer TASK=bilstm-tagger 23 | - TEST=chainer TASK=bilstm-tagger-withchar 24 | - TEST=chainer TASK=treenn 25 | - TEST=theano TASK=rnnlm-batch MBSIZE=64 26 | - TEST=theano TASK=rnnlm-batch MBSIZE=16 27 | - TEST=theano TASK=rnnlm-batch MBSIZE=04 28 | - TEST=theano TASK=rnnlm-batch MBSIZE=01 29 | - TEST=theano TASK=bilstm-tagger 30 | - TEST=theano TASK=bilstm-tagger-withchar 31 | cache: 32 | directories: 33 | - dynet 34 | - eigen 35 | stages: 36 | - compile 37 | - test 38 | jobs: 39 | include: 40 | - stage: compile 41 | env: 42 | language: cpp 43 | python: 44 | addons: 45 | apt: 46 | sources: 
47 | - ubuntu-toolchain-r-test 48 | - boost-latest 49 | packages: 50 | - g++-4.8 51 | - libboost-regex1.55-dev 52 | install: skip 53 | script: 54 | - hg clone https://bitbucket.org/eigen/eigen -r 699b659 || (cd eigen && hg pull && hg update -r 699b659) 55 | - git clone https://github.com/clab/dynet || (cd dynet; git pull) 56 | - mkdir -p dynet/build 57 | - cd dynet/build 58 | - cmake .. -DEIGEN3_INCLUDE_DIR=$TRAVIS_BUILD_DIR/eigen 59 | - make -j$(nproc) 60 | - stage: test 61 | - language: cpp 62 | python: 63 | env: TEST=dynet TASK=rnnlm-batch 64 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 65 | - language: cpp 66 | python: 67 | env: TEST=dynet TASK=sparse-rnnlm-batch 68 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 69 | - language: cpp 70 | python: 71 | env: TEST=dynet TASK=bilstm-tagger 72 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 73 | - language: cpp 74 | python: 75 | env: TEST=dynet TASK=bilstm-tagger-withchar 76 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 77 | - language: cpp 78 | python: 79 | env: TEST=dynet TASK=treenn 80 | install: cd $TRAVIS_BUILD_DIR/dynet-cpp && make -j$(nproc) DYNET_PATH=$DYNET_PATH EIGEN_PATH=$EIGEN_PATH $TASK && cd $TRAVIS_BUILD_DIR 81 | 82 | install: 83 | - pip install -q cython numpy 84 | - pip install -U $TEST 85 | 86 | script: 87 | - ./run-tests.sh 88 | - grep '\(per_sec\|startup\)' log/*/*.log 89 | 90 | after_failure: 91 | - cat $TRAVIS_BUILD_DIR/log/*/*.log 92 | 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DyNet Benchmarks 2 | ---------------- 3 | by Graham Neubig, Yoav Goldberg, Chaitanya Malaviya, Austin Matthews, Yusuke Oda, and Pengcheng Yin 4 | 5 | These are benchmarks to compare [DyNet](http://github.com/clab/dynet) against several other neural network toolkits: TensorFlow, Theano, and Chainer. It covers four different natural language processing tasks, some of which are only implemented in a subset of the toolkits as they wouldn't be straightforward to implement in the others: 6 | 7 | * rnnlm-batch: A recurrent neural network language model with mini-batched training. 8 | * bilstm-tagger: A tagger that runs a bi-directional LSTM and selects a tag for each word. 9 | * bilstm-tagger-withchar: Similar to bilstm-tagger, but uses characer-based embeddings for unknown words. 10 | * treelstm: A text tagger based on tree-structured LSTMs. 11 | 12 | The benchmarks can be run by first compiling the `dynet-cpp` examples, then running run-tests.sh. 13 | 14 | **Note:** `dynet-cpp` needs the sequence-ops branch of DyNet to compile. 
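For example, one possible way to build and run everything locally (a sketch only: the paths below are placeholders for your own DyNet and Eigen checkouts, and depending on your setup `run-tests.sh` may expect the same environment variables that `.travis.yml` sets, such as `TEST`, `TASK` and `MBSIZE`):

```bash
# Build the C++ benchmarks against an existing DyNet/Eigen build
cd dynet-cpp
make DYNET_PATH=/path/to/dynet EIGEN_PATH=/path/to/eigen
cd ..

# Run the benchmarks; timing numbers are written to logs under log/
./run-tests.sh
```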
15 | -------------------------------------------------------------------------------- /chainer/bilstm-tagger-withchar.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import sys 9 | import argparse 10 | 11 | from chainer import Chain, Variable 12 | import chainer.functions as F 13 | import chainer.links as L 14 | import chainer.optimizers as O 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--chainer_gpu', type=int, default=-1, help='GPU id') 18 | parser.add_argument('CEMBED_SIZE', type=int, help='char embedding size') 19 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 22 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 23 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 24 | args = parser.parse_args() 25 | 26 | if args.chainer_gpu >= 0: 27 | # use GPU 28 | from chainer.cuda import cupy as xp, get_device 29 | get_device(args.chainer_gpu).use() 30 | else: 31 | # use CPU 32 | import numpy as xp 33 | 34 | def makevar(x): 35 | return Variable(xp.array([x], dtype=xp.int32)) 36 | 37 | # format of files: each line is "word1|tag2 word2|tag2 ..." 38 | train_file="data/tags/train.txt" 39 | dev_file="data/tags/dev.txt" 40 | 41 | class Vocab: 42 | def __init__(self, w2i=None): 43 | if w2i is None: w2i = defaultdict(count(0).next) 44 | self.w2i = dict(w2i) 45 | self.i2w = {i:w for w,i in w2i.iteritems()} 46 | @classmethod 47 | def from_corpus(cls, corpus): 48 | w2i = defaultdict(count(0).next) 49 | for sent in corpus: 50 | [w2i[word] for word in sent] 51 | return Vocab(w2i) 52 | 53 | def size(self): return len(self.w2i.keys()) 54 | 55 | def read(fname): 56 | """ 57 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 58 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
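    For example, an input line like "the|DT dog|NN" (an illustrative line, not one
    taken from the actual data files) would be yielded as
    [("the", "DT"), ("dog", "NN")].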
59 | """ 60 | with open(fname) as fh: 61 | for line in fh: 62 | line = line.strip().split() 63 | sent = [tuple(x.rsplit("|",1)) for x in line] 64 | yield sent 65 | 66 | train=list(read(train_file)) 67 | dev=list(read(dev_file)) 68 | words=[] 69 | tags=[] 70 | chars=set() 71 | wc=Counter() 72 | for sent in train: 73 | for w,p in sent: 74 | words.append(w) 75 | tags.append(p) 76 | chars.update(w) 77 | wc[w]+=1 78 | words.append("_UNK_") 79 | chars.add("<*>") 80 | 81 | vw = Vocab.from_corpus([words]) 82 | vt = Vocab.from_corpus([tags]) 83 | vc = Vocab.from_corpus([chars]) 84 | UNK = vw.w2i["_UNK_"] 85 | 86 | nwords = vw.size() 87 | ntags = vt.size() 88 | nchars = vc.size() 89 | print("nwords=%r, ntags=%r, nchars=%r" % (nwords, ntags, nchars)) 90 | 91 | # Chainer Starts 92 | 93 | class Tagger(Chain): 94 | def __init__(self): 95 | super(Tagger, self).__init__( 96 | embedW=L.EmbedID(nwords, args.WEMBED_SIZE), 97 | embedC=L.EmbedID(nwords, args.CEMBED_SIZE), 98 | # MLP on top of biLSTM outputs 100 -> 32 -> ntags 99 | WH=L.Linear(args.HIDDEN_SIZE*2, args.MLP_SIZE, nobias=True), 100 | WO=L.Linear(args.MLP_SIZE, ntags, nobias=True), 101 | # word-level LSTMs 102 | fwdRNN=L.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE), 103 | bwdRNN=L.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE), 104 | # char-level LSTMs, 105 | cFwdRNN=L.LSTM(args.CEMBED_SIZE, args.WEMBED_SIZE/2), 106 | cBwdRNN=L.LSTM(args.CEMBED_SIZE, args.WEMBED_SIZE/2), 107 | ) 108 | 109 | def word_rep(self, w): 110 | if wc[w] > 5: 111 | return self.embedW(makevar(vw.w2i[w])) 112 | else: 113 | pad_char = vc.w2i["<*>"] 114 | char_ids = [pad_char] + [vc.w2i[c] for c in w] + [pad_char] 115 | char_embs = [self.embedC(makevar(cid)) for cid in char_ids] 116 | self.cFwdRNN.reset_state() 117 | self.cBwdRNN.reset_state() 118 | for e in char_embs: 119 | fw_exp = self.cFwdRNN(e) 120 | for e in reversed(char_embs): 121 | bw_exp = self.cBwdRNN(e) 122 | return F.concat([fw_exp, bw_exp]) 123 | 124 | def build_tagging_graph(self, words): 125 | # initialize the RNNs 126 | self.fwdRNN.reset_state() 127 | self.bwdRNN.reset_state() 128 | 129 | # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
130 | wembs = [self.word_rep(w) for w in words] 131 | 132 | # feed word vectors into biLSTM 133 | fw_exps = [] 134 | for e in wembs: 135 | fw_exps.append(self.fwdRNN(e)) 136 | bw_exps = [] 137 | for e in reversed(wembs): 138 | bw_exps.append(self.bwdRNN(e)) 139 | 140 | # biLSTM states 141 | bi_exps = [F.concat([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))] 142 | 143 | # feed each biLSTM state to an MLP 144 | exps = [self.WO(F.tanh(self.WH(x))) for x in bi_exps] 145 | return exps 146 | 147 | def sent_loss(self, words, tags): 148 | vecs = self.build_tagging_graph(words) 149 | return sum(F.softmax_cross_entropy(v, makevar(vt.w2i[t])) for v, t in zip(vecs, tags)) 150 | 151 | def tag_sent(self, words): 152 | vecs = self.build_tagging_graph(words) 153 | tags = [vt.i2w[int(v.data.argmax())] for v in vecs] 154 | return zip(words, tags) 155 | 156 | tagger = Tagger() 157 | 158 | if args.chainer_gpu >= 0: 159 | tagger.to_gpu() 160 | 161 | trainer = O.Adam() 162 | trainer.use_cleargrads() 163 | trainer.setup(tagger) 164 | 165 | print("startup time: %r" % (time.time() - start)) 166 | start = time.time() 167 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 168 | for ITER in xrange(100): 169 | random.shuffle(train) 170 | for s in train: 171 | i += 1 172 | if i % 500 == 0: # print status 173 | print(this_loss / this_tagged) 174 | all_tagged += this_tagged 175 | this_loss = this_tagged = 0 176 | all_time = time.time() - start 177 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 178 | dev_start = time.time() 179 | good_sent = bad_sent = good = bad = 0.0 180 | for sent in dev: 181 | words = [w for w, _ in sent] 182 | golds = [t for _, t in sent] 183 | tags = [t for _, t in tagger.tag_sent(words)] 184 | if tags == golds: 185 | good_sent += 1 186 | else: 187 | bad_sent += 1 188 | for go,gu in zip(golds,tags): 189 | if go == gu: 190 | good += 1 191 | else: 192 | bad += 1 193 | dev_time += time.time() - dev_start 194 | train_time = time.time() - start - dev_time 195 | print("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 196 | if all_time > args.TIMEOUT: 197 | sys.exit(0) 198 | # train on sent 199 | words = [w for w, _ in s] 200 | golds = [t for _, t in s] 201 | 202 | loss_exp = tagger.sent_loss(words, golds) 203 | this_loss += float(loss_exp.data) 204 | this_tagged += len(golds) 205 | tagger.cleargrads() 206 | loss_exp.backward() 207 | trainer.update() 208 | 209 | print("epoch %r finished" % ITER) 210 | trainer.update_epoch(1.0) 211 | -------------------------------------------------------------------------------- /chainer/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import time 9 | import sys 10 | import argparse 11 | 12 | from chainer import Chain, Variable 13 | import chainer.functions as F 14 | import chainer.links as L 15 | import chainer.optimizers as O 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--chainer_gpu', type=int, default=-1, help='GPU id') 19 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 22 | parser.add_argument('SPARSE', type=int, 
help='sparse update 0/1') 23 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 24 | args = parser.parse_args() 25 | 26 | if args.chainer_gpu >= 0: 27 | # use GPU 28 | import cupy as xp 29 | from chainer.cuda import get_device 30 | # from chainer.cuda import cupy as xp, get_device 31 | get_device(args.chainer_gpu).use() 32 | else: 33 | # use CPU 34 | import numpy as xp 35 | 36 | def makevar(x): 37 | return Variable(xp.array([x], dtype=xp.int32)) 38 | 39 | # format of files: each line is "word1|tag2 word2|tag2 ..." 40 | train_file="data/tags/train.txt" 41 | dev_file="data/tags/dev.txt" 42 | 43 | class Vocab: 44 | def __init__(self, w2i=None): 45 | if w2i is None: w2i = defaultdict(count(0).next) 46 | self.w2i = dict(w2i) 47 | self.i2w = {i:w for w,i in w2i.iteritems()} 48 | @classmethod 49 | def from_corpus(cls, corpus): 50 | w2i = defaultdict(count(0).next) 51 | for sent in corpus: 52 | [w2i[word] for word in sent] 53 | return Vocab(w2i) 54 | 55 | def size(self): return len(self.w2i.keys()) 56 | 57 | def read(fname): 58 | """ 59 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 60 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 61 | """ 62 | with open(fname) as fh: 63 | for line in fh: 64 | line = line.strip().split() 65 | sent = [tuple(x.rsplit("|",1)) for x in line] 66 | yield sent 67 | 68 | train=list(read(train_file)) 69 | dev=list(read(dev_file)) 70 | words=[] 71 | tags=[] 72 | wc=Counter() 73 | for sent in train: 74 | for w,p in sent: 75 | words.append(w) 76 | tags.append(p) 77 | wc[w]+=1 78 | words.append("_UNK_") 79 | 80 | vw = Vocab.from_corpus([words]) 81 | vt = Vocab.from_corpus([tags]) 82 | UNK = vw.w2i["_UNK_"] 83 | 84 | nwords = vw.size() 85 | ntags = vt.size() 86 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 87 | 88 | # Chainer Starts 89 | 90 | class Tagger(Chain): 91 | def __init__(self): 92 | super(Tagger, self).__init__( 93 | embed=L.EmbedID(nwords, args.WEMBED_SIZE), 94 | # MLP on top of biLSTM outputs 100 -> 32 -> ntags 95 | WH=L.Linear(args.HIDDEN_SIZE*2, args.MLP_SIZE, nobias=True), 96 | WO=L.Linear(args.MLP_SIZE, ntags, nobias=True), 97 | # word-level LSTMs 98 | fwdRNN=L.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE), 99 | bwdRNN=L.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE), 100 | ) 101 | 102 | def word_rep(self, w): 103 | val = vw.w2i[w] if wc[w] > 5 else UNK 104 | return self.embed(makevar(val)) 105 | 106 | def build_tagging_graph(self, words): 107 | #initialize the RNNs 108 | self.fwdRNN.reset_state() 109 | self.bwdRNN.reset_state() 110 | 111 | # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
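        # (How the two directions line up below: fw_exps[i] summarizes words[0..i], and
        #  because bw_exps is filled while walking the sentence in reverse,
        #  reversed(bw_exps)[i] summarizes words[i..end]; zipping the two gives each
        #  position its full left and right context before the MLP.)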
112 | wembs = [self.word_rep(w) for w in words] 113 | 114 | # feed word vectors into biLSTM 115 | fw_exps = [] 116 | for e in wembs: 117 | fw_exps.append(self.fwdRNN(e)) 118 | bw_exps = [] 119 | for e in reversed(wembs): 120 | bw_exps.append(self.bwdRNN(e)) 121 | 122 | # biLSTM states 123 | bi_exps = [F.concat([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))] 124 | 125 | # feed each biLSTM state to an MLP 126 | exps = [self.WO(F.tanh(self.WH(x))) for x in bi_exps] 127 | return exps 128 | 129 | def sent_loss(self, words, tags): 130 | vecs = self.build_tagging_graph(words) 131 | return sum(F.softmax_cross_entropy(v, makevar(vt.w2i[t])) for v, t in zip(vecs, tags)) 132 | 133 | def tag_sent(self, words): 134 | vecs = self.build_tagging_graph(words) 135 | tags = [vt.i2w[int(v.data.argmax())] for v in vecs] 136 | return zip(words, tags) 137 | 138 | tagger = Tagger() 139 | 140 | if args.chainer_gpu >= 0: 141 | tagger.to_gpu() 142 | 143 | trainer = O.Adam() 144 | trainer.use_cleargrads() 145 | trainer.setup(tagger) 146 | 147 | print ("startup time: %r" % (time.time() - start)) 148 | start = time.time() 149 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 150 | for ITER in xrange(100): 151 | random.shuffle(train) 152 | for s in train: 153 | i += 1 154 | if i % 500 == 0: # print status 155 | print (this_loss / this_tagged) 156 | all_tagged += this_tagged 157 | this_loss = this_tagged = 0 158 | all_time = time.time() - start 159 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 160 | dev_start = time.time() 161 | good_sent = bad_sent = good = bad = 0.0 162 | for sent in dev: 163 | words = [w for w, _ in sent] 164 | golds = [t for _, t in sent] 165 | tags = [t for _, t in tagger.tag_sent(words)] 166 | if tags == golds: 167 | good_sent += 1 168 | else: 169 | bad_sent += 1 170 | for go, gu in zip(golds, tags): 171 | if go == gu: 172 | good += 1 173 | else: 174 | bad += 1 175 | dev_time += time.time() - dev_start 176 | train_time = time.time() - start - dev_time 177 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 178 | if all_time > args.TIMEOUT: 179 | sys.exit(0) 180 | # train on sent 181 | words = [w for w, _ in s] 182 | golds = [t for _, t in s] 183 | 184 | loss_exp = tagger.sent_loss(words, golds) 185 | this_loss += float(loss_exp.data) 186 | this_tagged += len(golds) 187 | tagger.cleargrads() 188 | loss_exp.backward() 189 | trainer.update() 190 | 191 | print ("epoch %r finished" % ITER) 192 | trainer.update_epoch(1.0) 193 | -------------------------------------------------------------------------------- /chainer/rnnlm-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import math 9 | import sys 10 | import argparse 11 | 12 | from chainer import Chain, Variable 13 | import chainer.functions as F 14 | import chainer.links as L 15 | import chainer.optimizers as O 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--chainer_gpu', type=int, default=-1, help='GPU id') 19 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 20 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 21 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 22 | parser.add_argument('SPARSE', type=int, 
help='sparse update 0/1') 23 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 24 | args = parser.parse_args() 25 | 26 | # format of files: each line is "word1/tag2 word2/tag2 ..." 27 | train_file="data/text/train.txt" 28 | test_file="data/text/dev.txt" 29 | 30 | w2i = defaultdict(count(0).next) 31 | 32 | def read(fname): 33 | """ 34 | Read a file where each line is of the form "word1 word2 ..." 35 | Yields lists of the form [word1, word2, ...] 36 | """ 37 | with file(fname) as fh: 38 | for line in fh: 39 | sent = [w2i[x] for x in line.strip().split()] 40 | sent.append(w2i[""]) 41 | yield sent 42 | 43 | train=list(read(train_file)) 44 | nwords = len(w2i) 45 | test=list(read(test_file)) 46 | S = w2i[""] 47 | assert(nwords == len(w2i)) 48 | 49 | # Chainer Starts 50 | 51 | class RNNLM(Chain): 52 | def __init__(self): 53 | super(RNNLM, self).__init__( 54 | embed=L.EmbedID(nwords, args.EMBED_SIZE), 55 | rnn=L.LSTM(args.EMBED_SIZE, args.HIDDEN_SIZE), 56 | h2y=L.Linear(args.HIDDEN_SIZE, nwords), 57 | ) 58 | 59 | def reset(self): 60 | self.rnn.reset_state() 61 | 62 | def add_input(self, x): 63 | h = self.rnn(self.embed(x)) 64 | return self.h2y(h) 65 | 66 | lm = RNNLM() 67 | 68 | if args.chainer_gpu >= 0: 69 | # use GPU 70 | from chainer.cuda import cupy as xp, get_device 71 | get_device(args.chainer_gpu).use() 72 | lm.to_gpu() 73 | else: 74 | # use CPU 75 | import numpy as xp 76 | 77 | def makevar(arr): 78 | return Variable(xp.array(arr, dtype=xp.int32)) 79 | 80 | trainer = O.Adam() 81 | trainer.use_cleargrads() 82 | trainer.setup(lm) 83 | 84 | # Build the language model graph 85 | # 86 | # Note: Chainer could not consider masking using default cross entropy function 87 | # which returns an integrated scalar. 88 | # 89 | def calc_lm_loss(sents): 90 | # initialize the RNN 91 | lm.reset() 92 | 93 | # get the wids for each step 94 | tot_words = 0 95 | wids = [] 96 | for i in range(len(sents[0])): 97 | # Note: -1 is the default padding tag in Chainer. 98 | wids.append([ 99 | (sent[i] if len(sent)>i else -1) for sent in sents]) 100 | mask = [(1 if len(sent)>i else 0) for sent in sents] 101 | tot_words += sum(mask) 102 | 103 | # start the rnn by inputting "" 104 | init_ids = [S] * len(sents) 105 | y = lm.add_input(makevar(init_ids)) 106 | 107 | # feed word vectors into the RNN and predict the next word 108 | losses = [] 109 | for wid in wids: 110 | # calculate the softmax and loss 111 | t = makevar(wid) 112 | # Note: Chainer calculates the average. We have to multiply the batch size 113 | # to adjust dynamic range of the loss. 
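        # (Padded positions were assigned wid -1 above, which matches Chainer's default
        #  ignore_label, so they add nothing to the loss; with normalize=False the loss is
        #  averaged over the batch dimension only, and the * len(sents) below turns that
        #  mean back into a per-minibatch sum.)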
114 | loss = F.softmax_cross_entropy(y, t, normalize=False) * len(sents) 115 | losses.append(loss) 116 | # update the state of the RNN 117 | y = lm.add_input(t) 118 | 119 | return sum(losses), tot_words 120 | 121 | # Sort training sentences in descending order and count minibatches 122 | train.sort(key=lambda x: -len(x)) 123 | test.sort(key=lambda x: -len(x)) 124 | train_order = [x*args.MB_SIZE for x in range((len(train)-1)/args.MB_SIZE + 1)] 125 | test_order = [x*args.MB_SIZE for x in range((len(test)-1)/args.MB_SIZE + 1)] 126 | # Perform training 127 | print ("startup time: %r" % (time.time() - start)) 128 | start = time.time() 129 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 130 | for ITER in xrange(100): 131 | random.shuffle(train_order) 132 | for sid in train_order: 133 | i += 1 134 | if i % int(500/args.MB_SIZE) == 0: 135 | print(this_loss / this_words) 136 | all_tagged += this_words 137 | this_loss = this_words = 0 138 | all_time = time.time() - start 139 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 140 | dev_start = time.time() 141 | dev_loss = dev_words = 0 142 | for sid in test_order: 143 | loss_exp, mb_words = calc_lm_loss(test[sid:sid+args.MB_SIZE]) 144 | dev_loss += float(loss_exp.data) 145 | dev_words += mb_words 146 | dev_time += time.time() - dev_start 147 | train_time = time.time() - start - dev_time 148 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % (dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words, train_time, all_tagged/train_time)) 149 | if all_time > args.TIMEOUT: 150 | sys.exit(0) 151 | # train on the minibatch 152 | loss_exp, mb_words = calc_lm_loss(train[sid:sid+args.MB_SIZE]) 153 | this_loss += float(loss_exp.data) 154 | this_words += mb_words 155 | lm.cleargrads() 156 | loss_exp.backward() 157 | trainer.update() 158 | print ("epoch %r finished" % ITER) 159 | 160 | 161 | -------------------------------------------------------------------------------- /chainer/treenn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | import re 6 | import codecs 7 | from collections import Counter 8 | import random 9 | import sys 10 | import argparse 11 | 12 | from chainer import Chain, Variable 13 | import chainer.functions as F 14 | import chainer.links as L 15 | import chainer.optimizers as O 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--chainer_gpu', type=int, default=-1, help='GPU id') 19 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | if args.chainer_gpu >= 0: 26 | # use GPU 27 | from chainer.cuda import cupy as xp, get_device 28 | get_device(args.chainer_gpu).use() 29 | else: 30 | # use CPU 31 | import numpy as xp 32 | 33 | def makevar(x): 34 | return Variable(xp.array([x], dtype=xp.int32)) 35 | 36 | def zeros(dim): 37 | return Variable(xp.zeros(dim, dtype=xp.float32)) 38 | 39 | def _tokenize_sexpr(s): 40 | tokker = re.compile(r" +|[()]|[^ ()]+") 41 | toks = [t for t in [match.group(0) for match in tokker.finditer(s)] if t[0] != " "] 42 | return toks 43 | 44 | def _within_bracket(toks): 45 | label = next(toks) 46 | children = [] 47 | for tok in toks: 48 | if tok == "(": 49 | 
children.append(_within_bracket(toks)) 50 | elif tok == ")": 51 | return Tree(label, children) 52 | else: children.append(Tree(tok, None)) 53 | assert(False),list(toks) 54 | 55 | class Tree(object): 56 | def __init__(self, label, children=None): 57 | self.label = label 58 | self.children = children 59 | 60 | @staticmethod 61 | def from_sexpr(string): 62 | toks = iter(_tokenize_sexpr(string)) 63 | assert next(toks) == "(" 64 | return _within_bracket(toks) 65 | 66 | def __str__(self): 67 | if self.children is None: return self.label 68 | return "[%s %s]" % (self.label, " ".join([str(c) for c in self.children])) 69 | 70 | def isleaf(self): return self.children==None 71 | 72 | def leaves_iter(self): 73 | if self.isleaf(): 74 | yield self 75 | else: 76 | for c in self.children: 77 | for l in c.leaves_iter(): yield l 78 | 79 | def leaves(self): return list(self.leaves_iter()) 80 | 81 | def nonterms_iter(self): 82 | if not self.isleaf(): 83 | yield self 84 | for c in self.children: 85 | for n in c.nonterms_iter(): yield n 86 | 87 | def nonterms(self): return list(self.nonterms_iter()) 88 | 89 | def read_dataset(filename): 90 | return [Tree.from_sexpr(line.strip()) for line in codecs.open(filename,"r")] 91 | 92 | def get_vocabs(trees): 93 | label_vocab = Counter() 94 | word_vocab = Counter() 95 | for tree in trees: 96 | label_vocab.update([n.label for n in tree.nonterms()]) 97 | word_vocab.update([l.label for l in tree.leaves()]) 98 | labels = [x for x,c in label_vocab.iteritems() if c > 0] 99 | words = ["_UNK_"] + [x for x,c in word_vocab.iteritems() if c > 0] 100 | l2i = {l:i for i,l in enumerate(labels)} 101 | w2i = {w:i for i,w in enumerate(words)} 102 | return l2i, w2i, labels, words 103 | 104 | class TreeRNN(Chain): 105 | def __init__(self, word_vocab, hdim, nc): 106 | super(TreeRNN, self).__init__( 107 | embed=L.EmbedID(len(word_vocab), hdim), 108 | WR=L.Linear(2*hdim, hdim, nobias=True), 109 | WO=L.Linear(hdim, nc, nobias=True), 110 | ) 111 | self.w2i = word_vocab 112 | 113 | def expr_for_tree(self, tree, decorate=False): 114 | if tree.isleaf(): 115 | return self.embed(makevar(self.w2i.get(tree.label, 0))) 116 | if len(tree.children) == 1: 117 | assert(tree.children[0].isleaf()) 118 | expr = self.expr_for_tree(tree.children[0]) 119 | if decorate: 120 | tree._e = expr 121 | return expr 122 | assert(len(tree.children) == 2), tree.children[0] 123 | e1 = self.expr_for_tree(tree.children[0], decorate) 124 | e2 = self.expr_for_tree(tree.children[1], decorate) 125 | expr = F.tanh(self.WR(F.concat(e1, e2))) 126 | if decorate: 127 | tree._e = expr 128 | return expr 129 | 130 | def classify(self, e): 131 | return self.WO(e) 132 | 133 | class TreeLSTM(Chain): 134 | def __init__(self, word_vocab, wdim, hdim, nc): 135 | super(TreeLSTM, self).__init__( 136 | embed=L.EmbedID(len(word_vocab), wdim), 137 | WU=L.Linear(wdim, 4 * hdim), # i,f,o,u with bias (semiterminal) 138 | W1=L.Linear(hdim, 4 * hdim), # i,f,o,u with bias (left) 139 | W2=L.Linear(hdim, 4 * hdim), # i,f,o,u with bias (right) 140 | WO=L.Linear(hdim, nc, nobias=True), 141 | ) 142 | self.w2i = word_vocab 143 | self.hdim = hdim 144 | 145 | def expr_for_tree(self, tree, decorate=False): 146 | if tree.isleaf(): 147 | return zeros((1, self.hdim)), self.embed(makevar(self.w2i.get(tree.label, 0))) 148 | if len(tree.children) == 1: 149 | assert(tree.children[0].isleaf()) 150 | c0, e0 = self.expr_for_tree(tree.children[0]) 151 | c, h = F.lstm(c0, self.WU(e0)) 152 | if decorate: 153 | tree._e = (c, h) 154 | return c, h 155 | assert(len(tree.children) 
== 2), tree.children[0] 156 | c1, e1 = self.expr_for_tree(tree.children[0], decorate) 157 | c2, e2 = self.expr_for_tree(tree.children[1], decorate) 158 | c, h = F.slstm(c1, c2, self.W1(e1), self.W2(e2)) 159 | if decorate: 160 | tree._e = (c, h) 161 | return c, h 162 | 163 | def classify(self, e): 164 | return self.WO(e[1]) 165 | 166 | train = read_dataset("data/trees/train.txt") 167 | dev = read_dataset("data/trees/dev.txt") 168 | 169 | l2i, w2i, i2l, i2w = get_vocabs(train) 170 | 171 | tlm = TreeLSTM(w2i, args.WEMBED_SIZE, args.HIDDEN_SIZE, len(l2i)) 172 | if args.chainer_gpu >= 0: 173 | tlm.to_gpu() 174 | 175 | trainer = O.Adam() 176 | trainer.use_cleargrads() 177 | trainer.setup(tlm) 178 | 179 | print("startup time: %r" % (time.time() - start)) 180 | sents = 0 181 | all_time = 0 182 | for ITER in range(100): 183 | random.shuffle(train) 184 | closs = 0.0 185 | cwords = 0 186 | start = time.time() 187 | for i,tree in enumerate(train,1): 188 | sents += 1 189 | d = tlm.expr_for_tree(tree,True) 190 | nodes = tree.nonterms() 191 | losses = [F.softmax_cross_entropy(tlm.classify(nt._e), makevar(l2i[nt.label])) for nt in nodes] 192 | loss = sum(losses) 193 | closs += float(loss.data) 194 | cwords += len(nodes) 195 | tlm.cleargrads() 196 | loss.backward() 197 | trainer.update() 198 | if sents % 1000 == 0: 199 | print(closs / cwords) 200 | closs = 0.0 201 | cwords = 0 202 | all_time += time.time() - start 203 | good = bad = 0.0 204 | for tree in dev: 205 | my_data = tlm.classify(tlm.expr_for_tree(tree,False)).data 206 | if args.chainer_gpu >= 0: 207 | my_data = xp.asnumpy(my_data) 208 | pred = i2l[my_data.argmax()] 209 | if pred == tree.label: 210 | good += 1 211 | else: 212 | bad += 1 213 | print("acc=%.4f, time=%.4f, sent_per_sec=%.4f" % (good/(good+bad), all_time, sents/all_time)) 214 | if all_time > args.TIMEOUT: 215 | sys.exit(0) 216 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This contains three example data sets: 2 | 3 | 1. **Text Data (text):** Mikolov's pre-processed version of the Wall Street Journal used for language modeling: http://www.fit.vutbr.cz/~imikolov/rnnlm/ 4 | 2. **Tree Data (trees):** The tree data from the Stanford Sentiment Treebank: http://nlp.stanford.edu/sentiment/index.html 5 | 3. **Tag Data (tags):** Tagged data from WikiNER: http://schwa.org/projects/resources/wiki/Wikiner 6 | -------------------------------------------------------------------------------- /dynet-cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | CUDA_PATH=/usr/local/cuda/targets/x86_64-linux/lib 4 | EIGEN_PATH=${HOME}/usr/local/eigen 5 | DYNET_PATH=${HOME}/work/dynet 6 | 7 | DYNET_LIB=-ldynet 8 | 9 | # *** Mac or linux 10 | UNAME_S := $(shell uname -s) 11 | ifeq ($(UNAME_S),Darwin) 12 | CXX_FLAGS=-std=c++11 -I${EIGEN_PATH} -I${DYNET_PATH} -L${DYNET_PATH}/build/dynet -lc++ -Ofast 13 | else 14 | CC=g++ 15 | CXX_FLAGS=-std=c++11 -I${EIGEN_PATH} -I${DYNET_PATH} -L${DYNET_PATH}/build/dynet -DBOOST_REGEX -lboost_regex -Ofast 16 | endif 17 | 18 | # The -seq version is commented out for compatibility with master. 
If using the sequence-ops 19 | # branch you can compile this program as well 20 | all: rnnlm-batch treenn treenn-bulk bilstm-tagger bilstm-tagger-bulk bilstm-tagger-withchar bilstm-tagger-withchar-bulk 21 | 22 | gpu: rnnlm-batch-gpu treenn-gpu treenn-bulk-gpu bilstm-tagger-gpu bilstm-tagger-bulk-gpu bilstm-tagger-withchar-gpu bilstm-tagger-withchar-bulk-gpu 23 | 24 | clean: 25 | rm -f rnnlm-batch treenn treenn-bulk bilstm-tagger bilstm-tagger-bulk bilstm-tagger-withchar bilstm-tagger-withchar-bulk rnnlm-batch-gpu treenn-gpu treenn-bulk-gpu bilstm-tagger-gpu bilstm-tagger-bulk-gpu bilstm-tagger-withchar-gpu bilstm-tagger-withchar-bulk-gpu 26 | 27 | rnnlm-batch: rnnlm-batch.cc 28 | ${CC} -o rnnlm-batch rnnlm-batch.cc ${CXX_FLAGS} ${DYNET_LIB} 29 | 30 | rnnlm-seq: rnnlm-seq.cc 31 | ${CC} -o rnnlm-seq rnnlm-seq.cc ${CXX_FLAGS} ${DYNET_LIB} 32 | 33 | treenn: treenn.cc 34 | ${CC} -o treenn treenn.cc ${CXX_FLAGS} ${DYNET_LIB} 35 | 36 | treenn-bulk: treenn-bulk.cc 37 | ${CC} -o treenn-bulk treenn-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 38 | 39 | bilstm-tagger: bilstm-tagger.cc 40 | ${CC} -o bilstm-tagger bilstm-tagger.cc ${CXX_FLAGS} ${DYNET_LIB} 41 | 42 | bilstm-tagger-bulk: bilstm-tagger-bulk.cc 43 | ${CC} -o bilstm-tagger-bulk bilstm-tagger-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 44 | 45 | bilstm-tagger-withchar: bilstm-tagger-withchar.cc 46 | ${CC} -o bilstm-tagger-withchar bilstm-tagger-withchar.cc ${CXX_FLAGS} ${DYNET_LIB} 47 | 48 | bilstm-tagger-withchar-bulk: bilstm-tagger-withchar-bulk.cc 49 | ${CC} -o bilstm-tagger-withchar-bulk bilstm-tagger-withchar-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 50 | 51 | rnnlm-batch-gpu: rnnlm-batch.cc 52 | ${CC} -o rnnlm-batch-gpu rnnlm-batch.cc ${CXX_FLAGS} ${DYNET_LIB} 53 | 54 | rnnlm-seq-gpu: rnnlm-seq.cc 55 | ${CC} -o rnnlm-seq-gpu rnnlm-seq.cc ${CXX_FLAGS} ${DYNET_LIB} 56 | 57 | treenn-gpu: treenn.cc 58 | ${CC} -o treenn-gpu treenn.cc ${CXX_FLAGS} ${DYNET_LIB} 59 | 60 | treenn-bulk-gpu: treenn-bulk.cc 61 | ${CC} -o treenn-bulk-gpu treenn-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 62 | 63 | bilstm-tagger-gpu: bilstm-tagger.cc 64 | ${CC} -o bilstm-tagger-gpu bilstm-tagger.cc ${CXX_FLAGS} ${DYNET_LIB} 65 | 66 | bilstm-tagger-bulk-gpu: bilstm-tagger-bulk.cc 67 | ${CC} -o bilstm-tagger-bulk-gpu bilstm-tagger-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 68 | 69 | bilstm-tagger-withchar-gpu: bilstm-tagger-withchar.cc 70 | ${CC} -o bilstm-tagger-withchar-gpu bilstm-tagger-withchar.cc ${CXX_FLAGS} ${DYNET_LIB} 71 | 72 | bilstm-tagger-withchar-bulk-gpu: bilstm-tagger-withchar-bulk.cc 73 | ${CC} -o bilstm-tagger-withchar-bulk-gpu bilstm-tagger-withchar-bulk.cc ${CXX_FLAGS} ${DYNET_LIB} 74 | -------------------------------------------------------------------------------- /dynet-cpp/bilstm-tagger-bulk.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef BOOST_REGEX 6 | #include 7 | using namespace boost; 8 | #else 9 | #include 10 | #endif 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | using namespace std::chrono; 19 | using namespace dynet; 20 | 21 | // Read a file where each line is of the form "word1|tag1 word2|tag2 ..." 22 | // Yields pairs of lists of the form < [word1, word2, ...], [tag1, tag2, ...] 
> 23 | vector, vector > > read(const string & fname) { 24 | ifstream fh(fname); 25 | if(!fh) throw std::runtime_error("Could not open file"); 26 | string str; 27 | regex re("[ |]"); 28 | vector, vector > > sents; 29 | while(getline(fh, str)) { 30 | pair,vector > word_tags; 31 | sregex_token_iterator first{str.begin(), str.end(), re, -1}, last; 32 | while(first != last) { 33 | word_tags.first.push_back(*first++); 34 | assert(first != last); 35 | word_tags.second.push_back(*first++); 36 | } 37 | sents.push_back(word_tags); 38 | } 39 | return sents; 40 | } 41 | 42 | class BiLSTMTagger { 43 | public: 44 | 45 | BiLSTMTagger(unsigned layers, unsigned wembed_dim, unsigned hidden_dim, unsigned mlp_dim, ParameterCollection & model, Dict & wv, Dict & tv, unordered_map & wc) 46 | : wv(wv), tv(tv), wc(wc) { 47 | unsigned nwords = wv.size(); 48 | unsigned ntags = tv.size(); 49 | word_lookup = model.add_lookup_parameters(nwords, {wembed_dim}); 50 | 51 | // MLP on top of biLSTM outputs 100 -> 32 -> ntags 52 | pH = model.add_parameters({mlp_dim, hidden_dim*2}); 53 | pO = model.add_parameters({ntags, mlp_dim}); 54 | 55 | // word-level LSTMs 56 | fwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model); // layers, in-dim, out-dim, model 57 | bwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model); 58 | } 59 | 60 | Dict &wv, &tv; 61 | unordered_map & wc; 62 | LookupParameter word_lookup; 63 | Parameter pH, pO; 64 | VanillaLSTMBuilder fwdRNN, bwdRNN; 65 | Expression H, O; 66 | 67 | void init(ComputationGraph & cg) { 68 | // parameters -> expressions 69 | H = parameter(cg, pH); 70 | O = parameter(cg, pO); 71 | 72 | // initialize the RNNs 73 | fwdRNN.new_graph(cg); 74 | bwdRNN.new_graph(cg); 75 | } 76 | 77 | // Do word representation 78 | Expression word_rep(ComputationGraph & cg, const string & w) { 79 | return lookup(cg, word_lookup, wv.convert(wc[w] > 5 ? w : "")); 80 | } 81 | 82 | vector build_tagging_graph(ComputationGraph & cg, const vector & words) { 83 | 84 | 85 | // get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
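    // (Indexing note for the loops below: wembs/fwds/bwds/fbwds are pre-sized to
    //  words.size() and filled by position, so although the backward LSTM consumes the
    //  sentence in reverse, bwds[i] still holds the backward state for word i and can be
    //  concatenated directly with fwds[i].)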
86 | vector wembs(words.size()), fwds(words.size()), bwds(words.size()), fbwds(words.size()); 87 | for(size_t i = 0; i < words.size(); ++i) 88 | wembs[i] = word_rep(cg, words[i]); 89 | 90 | // feed word vectors into biLSTM 91 | fwdRNN.start_new_sequence(); 92 | for(size_t i = 0; i < wembs.size(); ++i) 93 | fwds[i] = fwdRNN.add_input(wembs[i]); 94 | bwdRNN.start_new_sequence(); 95 | for(size_t i = wembs.size(); i > 0; --i) 96 | bwds[i-1] = bwdRNN.add_input(wembs[i-1]); 97 | 98 | // Concatenate and MLP 99 | for(size_t i = 0; i < wembs.size(); ++i) 100 | fbwds[i] = O * tanh( H * concatenate({fwds[i], bwds[i]}) ); 101 | 102 | return fbwds; 103 | } 104 | 105 | Expression sent_loss(ComputationGraph & cg, vector & words, vector & tags) { 106 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 107 | for(size_t i = 0; i < tags.size(); ++i) 108 | errs[i] = pickneglogsoftmax(exprs[i], tv.convert(tags[i])); 109 | return sum(errs); 110 | } 111 | 112 | vector tag_sent(vector & words, ComputationGraph &cg) { 113 | cg.clear(); 114 | init(cg); 115 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 116 | vector tags(words.size()); 117 | for(size_t i = 0; i < words.size(); ++i) { 118 | vector scores = as_vector(exprs[i].value()); 119 | size_t max_id = distance(scores.begin(), max_element(scores.begin(), scores.end())); 120 | tags[i] = tv.convert(max_id); 121 | } 122 | return tags; 123 | } 124 | 125 | }; 126 | 127 | int main(int argc, char**argv) { 128 | 129 | time_point start = system_clock::now(); 130 | 131 | vector, vector > > train = read("data/tags/train.txt"); 132 | vector, vector > > dev = read("data/tags/dev.txt"); 133 | Dict word_voc, tag_voc; 134 | unordered_map word_cnt; 135 | for(auto & sent : train) { 136 | for(auto & w : sent.first) { 137 | word_voc.convert(w); 138 | word_cnt[w]++; 139 | } 140 | for(auto & t : sent.second) 141 | tag_voc.convert(t); 142 | } 143 | tag_voc.freeze(); 144 | word_voc.convert(""); word_voc.freeze(); word_voc.set_unk(""); 145 | 146 | // DyNet Starts 147 | dynet::initialize(argc, argv); 148 | ParameterCollection model; 149 | AdamTrainer trainer(model); 150 | trainer.clipping_enabled = false; 151 | 152 | if(argc != 8) { 153 | cerr << "Usage: " << argv[0] << " WEMBED_SIZE HIDDEN_SIZE MLP_SIZE SPARSE BATCH_SIZE LAST_STEP TIMEOUT" << endl; 154 | return 1; 155 | } 156 | int WEMBED_SIZE = atoi(argv[1]); 157 | int HIDDEN_SIZE = atoi(argv[2]); 158 | int MLP_SIZE = atoi(argv[3]); 159 | trainer.sparse_updates_enabled = atoi(argv[4]); 160 | int BATCH_SIZE = atoi(argv[5]); 161 | int LAST_STEP = atoi(argv[6]); 162 | int TIMEOUT = atoi(argv[7]); 163 | 164 | // Initilaize the tagger 165 | BiLSTMTagger tagger(1, WEMBED_SIZE, HIDDEN_SIZE, MLP_SIZE, model, word_voc, tag_voc, word_cnt); 166 | 167 | { 168 | duration fs = (system_clock::now() - start); 169 | float startup_time = duration_cast(fs).count() / float(1000); 170 | cout << "startup time: " << startup_time << endl; 171 | } 172 | 173 | // Do training 174 | shuffle(train.begin(), train.end(), *dynet::rndeng); 175 | start = system_clock::now(); 176 | int i = 0, bi = 0, all_tagged = 0, this_words = 0; 177 | float this_loss = 0.f, all_time = 0.f; 178 | unsigned batch = BATCH_SIZE; 179 | ComputationGraph cg; 180 | for(int iter = 0; iter < 100; iter++) { 181 | for(size_t id1 = 0; id1 <= train.size()-batch; id1 += batch) { 182 | i += batch; 183 | bi++; 184 | if(bi % (500/BATCH_SIZE) == 0) { 185 | trainer.status(); 186 | cout << this_loss/this_words << endl; 187 | all_tagged += this_words; 188 | this_loss = 
0.f; 189 | this_words = 0; 190 | } 191 | if(bi % (5000/BATCH_SIZE) == 0) { 192 | duration fs = (system_clock::now() - start); 193 | all_time += duration_cast(fs).count() / float(1000); 194 | int dev_words = 0, dev_good = 0; 195 | float dev_loss = 0; 196 | for(auto & sent : dev) { 197 | vector tags = tagger.tag_sent(sent.first, cg); 198 | for(size_t j = 0; j < tags.size(); ++j) 199 | if(tags[j] == sent.second[j]) 200 | dev_good++; 201 | dev_words += sent.second.size(); 202 | } 203 | cout << "acc=" << dev_good/float(dev_words) << ", time=" << all_time << ", word_per_sec=" << all_tagged/all_time << ", sent_per_sec=" << i/all_time << ", sec_per_sent=" << all_time/i << endl; 204 | if(all_time > TIMEOUT) 205 | exit(0); 206 | start = system_clock::now(); 207 | } 208 | 209 | cg.clear(); 210 | tagger.init(cg); 211 | vector losses; 212 | for(size_t id2 = 0; id2 < batch; ++id2) { 213 | auto & s = train[id1+id2]; 214 | losses.push_back(tagger.sent_loss(cg, s.first, s.second)); 215 | this_words += s.first.size(); 216 | } 217 | Expression loss_exp = sum(losses); 218 | float my_loss = as_scalar(cg.forward(loss_exp)); 219 | this_loss += my_loss; 220 | if(LAST_STEP > 0) { 221 | cg.backward(loss_exp); 222 | if(LAST_STEP > 1) 223 | trainer.update(); 224 | } 225 | } 226 | } 227 | return 0; 228 | } 229 | -------------------------------------------------------------------------------- /dynet-cpp/bilstm-tagger-withchar.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef BOOST_REGEX 6 | #include 7 | using namespace boost; 8 | #else 9 | #include 10 | #endif 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | using namespace std::chrono; 19 | using namespace dynet; 20 | 21 | // Read a file where each line is of the form "word1|tag1 word2|tag2 ..." 22 | // Yields pairs of lists of the form < [word1, word2, ...], [tag1, tag2, ...] 
> 23 | vector, vector > > read(const string & fname) { 24 | ifstream fh(fname); 25 | if(!fh) throw std::runtime_error("Could not open file"); 26 | string str; 27 | regex re("[ |]"); 28 | vector, vector > > sents; 29 | while(getline(fh, str)) { 30 | pair,vector > word_tags; 31 | sregex_token_iterator first{str.begin(), str.end(), re, -1}, last; 32 | while(first != last) { 33 | word_tags.first.push_back(*first++); 34 | assert(first != last); 35 | word_tags.second.push_back(*first++); 36 | } 37 | sents.push_back(word_tags); 38 | } 39 | return sents; 40 | } 41 | 42 | class BiLSTMTagger { 43 | public: 44 | 45 | BiLSTMTagger(unsigned layers, unsigned cembed_dim, unsigned wembed_dim, unsigned hidden_dim, unsigned mlp_dim, ParameterCollection & model, Dict & wv, Dict & cv, Dict & tv, unordered_map & wc) 46 | : wv(wv), cv(cv), tv(tv), wc(wc) { 47 | unsigned nwords = wv.size(); 48 | unsigned ntags = tv.size(); 49 | unsigned nchars = cv.size(); 50 | word_lookup = model.add_lookup_parameters(nwords, {wembed_dim}); 51 | char_lookup = model.add_lookup_parameters(nchars, {cembed_dim}); 52 | 53 | // MLP on top of biLSTM outputs 100 -> mlp_dim -> ntags 54 | pH = model.add_parameters({mlp_dim, hidden_dim*2}); 55 | pO = model.add_parameters({ntags, mlp_dim}); 56 | 57 | // word-level LSTMs 58 | fwdRNN = VanillaLSTMBuilder(1, wembed_dim, hidden_dim, model); // layers, in-dim, out-dim, model 59 | bwdRNN = VanillaLSTMBuilder(1, wembed_dim, hidden_dim, model); 60 | 61 | // char-level LSTMs 62 | cFwdRNN = VanillaLSTMBuilder(1, cembed_dim, wembed_dim/2, model); 63 | cBwdRNN = VanillaLSTMBuilder(1, cembed_dim, wembed_dim/2, model); 64 | } 65 | 66 | Dict &wv, &cv, &tv; 67 | unordered_map & wc; 68 | LookupParameter word_lookup, char_lookup; 69 | Parameter p_t1, pH, pO; 70 | VanillaLSTMBuilder fwdRNN, bwdRNN, cFwdRNN, cBwdRNN; 71 | 72 | // Do word representation 73 | Expression word_rep(ComputationGraph & cg, const string & w) { 74 | if(wc[w] > 5) { 75 | return lookup(cg, word_lookup, wv.convert(w)); 76 | } else { 77 | Expression pad = lookup(cg, char_lookup, cv.convert("<*>")); 78 | vector cembs(w.size()+2, pad); 79 | for(size_t i = 0; i < w.size(); ++i) 80 | cembs[i+1] = lookup(cg, char_lookup, cv.convert(w.substr(i, 1))); 81 | cFwdRNN.start_new_sequence(); 82 | for(size_t i = 0; i < cembs.size(); ++i) cFwdRNN.add_input(cembs[i]); 83 | cBwdRNN.start_new_sequence(); 84 | for(size_t i = cembs.size(); i > 0; --i) cBwdRNN.add_input(cembs[i-1]); 85 | return concatenate({cFwdRNN.back(), cBwdRNN.back()}); 86 | } 87 | } 88 | 89 | vector build_tagging_graph(ComputationGraph & cg, const vector & words) { 90 | // parameters -> expressions 91 | Expression H = parameter(cg, pH); 92 | Expression O = parameter(cg, pO); 93 | 94 | // initialize the RNNs 95 | fwdRNN.new_graph(cg); 96 | bwdRNN.new_graph(cg); 97 | cFwdRNN.new_graph(cg); 98 | cBwdRNN.new_graph(cg); 99 | 100 | // get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
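    // (word_rep above falls back to a character biLSTM for words seen 5 times or fewer
    //  in training: the word is wrapped in <*> padding symbols, fed through cFwdRNN and
    //  cBwdRNN, and their final states, each of size wembed_dim/2, are concatenated so
    //  the result has the same dimensionality as a regular word embedding.)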
101 | vector wembs(words.size()), fwds(words.size()), bwds(words.size()), fbwds(words.size()); 102 | for(size_t i = 0; i < words.size(); ++i) 103 | wembs[i] = word_rep(cg, words[i]); 104 | 105 | // feed word vectors into biLSTM 106 | fwdRNN.start_new_sequence(); 107 | for(size_t i = 0; i < wembs.size(); ++i) 108 | fwds[i] = fwdRNN.add_input(wembs[i]); 109 | bwdRNN.start_new_sequence(); 110 | for(size_t i = wembs.size(); i > 0; --i) { 111 | bwds[i-1] = bwdRNN.add_input(wembs[i-1]); 112 | fbwds[i-1] = O * tanh( H * concatenate({fwds[i-1], bwds[i-1]}) ); 113 | } 114 | 115 | return fbwds; 116 | } 117 | 118 | Expression sent_loss(ComputationGraph & cg, vector & words, vector & tags) { 119 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 120 | for(size_t i = 0; i < tags.size(); ++i) 121 | errs[i] = pickneglogsoftmax(exprs[i], tv.convert(tags[i])); 122 | return sum(errs); 123 | } 124 | 125 | vector tag_sent(vector & words) { 126 | ComputationGraph cg; 127 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 128 | vector tags(words.size()); 129 | for(size_t i = 0; i < words.size(); ++i) { 130 | vector scores = as_vector(exprs[i].value()); 131 | size_t max_id = distance(scores.begin(), max_element(scores.begin(), scores.end())); 132 | tags[i] = tv.convert(max_id); 133 | } 134 | return tags; 135 | } 136 | 137 | }; 138 | 139 | int main(int argc, char**argv) { 140 | 141 | time_point start = system_clock::now(); 142 | 143 | vector, vector > > train = read("data/tags/train.txt"); 144 | vector, vector > > dev = read("data/tags/dev.txt"); 145 | Dict word_voc, tag_voc, char_voc; 146 | unordered_map word_cnt; 147 | for(auto & sent : train) { 148 | for(auto & w : sent.first) { 149 | word_voc.convert(w); 150 | word_cnt[w]++; 151 | for(size_t i = 0; i < w.size(); ++i) 152 | char_voc.convert(w.substr(i,1)); 153 | } 154 | for(auto & t : sent.second) 155 | tag_voc.convert(t); 156 | } 157 | tag_voc.freeze(); 158 | word_voc.convert(""); word_voc.freeze(); word_voc.set_unk(""); 159 | char_voc.convert("<*>"); char_voc.freeze(); 160 | 161 | // DyNet Starts 162 | dynet::initialize(argc, argv); 163 | ParameterCollection model; 164 | AdamTrainer trainer(model, 0.001); 165 | trainer.clipping_enabled = false; 166 | 167 | if(argc != 7) { 168 | cerr << "Usage: " << argv[0] << " CEMBED_SIZE WEMBED_SIZE HIDDEN_SIZE MLP_SIZE SPARSE TIMEOUT" << endl; 169 | return 1; 170 | } 171 | int CEMBED_SIZE = atoi(argv[1]); 172 | int WEMBED_SIZE = atoi(argv[2]); 173 | int HIDDEN_SIZE = atoi(argv[3]); 174 | int MLP_SIZE = atoi(argv[4]); 175 | trainer.sparse_updates_enabled = atoi(argv[5]); 176 | int TIMEOUT = atoi(argv[6]); 177 | 178 | // Initilaize the tagger 179 | BiLSTMTagger tagger(1, CEMBED_SIZE, WEMBED_SIZE, HIDDEN_SIZE, MLP_SIZE, model, word_voc, char_voc, tag_voc, word_cnt); 180 | 181 | { 182 | duration fs = (system_clock::now() - start); 183 | float startup_time = duration_cast(fs).count() / float(1000); 184 | cout << "startup time: " << startup_time << endl; 185 | } 186 | 187 | // Do training 188 | start = system_clock::now(); 189 | int i = 0, all_tagged = 0, this_words = 0; 190 | float this_loss = 0.f, all_time = 0.f; 191 | for(int iter = 0; iter < 100; iter++) { 192 | shuffle(train.begin(), train.end(), *dynet::rndeng); 193 | for(auto & s : train) { 194 | i++; 195 | if(i % 500 == 0) { 196 | trainer.status(); 197 | cout << this_loss/this_words << endl; 198 | all_tagged += this_words; 199 | this_loss = 0.f; 200 | this_words = 0; 201 | } 202 | if(i % 10000 == 0) { 203 | duration fs = 
(system_clock::now() - start); 204 | all_time += duration_cast(fs).count() / float(1000); 205 | int dev_words = 0, dev_good = 0; 206 | float dev_loss = 0; 207 | for(auto & sent : dev) { 208 | vector tags = tagger.tag_sent(sent.first); 209 | for(size_t j = 0; j < tags.size(); ++j) 210 | if(tags[j] == sent.second[j]) 211 | dev_good++; 212 | dev_words += sent.second.size(); 213 | } 214 | cout << "acc=" << dev_good/float(dev_words) << ", time=" << all_time << ", word_per_sec=" << all_tagged/all_time << endl; 215 | if(all_time > TIMEOUT) 216 | exit(0); 217 | start = system_clock::now(); 218 | } 219 | 220 | ComputationGraph cg; 221 | Expression loss_exp = tagger.sent_loss(cg, s.first, s.second); 222 | this_loss += as_scalar(cg.forward(loss_exp)); 223 | this_words += s.first.size(); 224 | cg.backward(loss_exp); 225 | trainer.update(); 226 | } 227 | } 228 | return 0; 229 | } 230 | -------------------------------------------------------------------------------- /dynet-cpp/bilstm-tagger.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef BOOST_REGEX 6 | #include 7 | using namespace boost; 8 | #else 9 | #include 10 | #endif 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace std; 18 | using namespace std::chrono; 19 | using namespace dynet; 20 | 21 | // Read a file where each line is of the form "word1|tag1 word2|tag2 ..." 22 | // Yields pairs of lists of the form < [word1, word2, ...], [tag1, tag2, ...] > 23 | vector, vector > > read(const string & fname) { 24 | ifstream fh(fname); 25 | if(!fh) throw std::runtime_error("Could not open file"); 26 | string str; 27 | regex re("[ |]"); 28 | vector, vector > > sents; 29 | while(getline(fh, str)) { 30 | pair,vector > word_tags; 31 | sregex_token_iterator first{str.begin(), str.end(), re, -1}, last; 32 | while(first != last) { 33 | word_tags.first.push_back(*first++); 34 | assert(first != last); 35 | word_tags.second.push_back(*first++); 36 | } 37 | sents.push_back(word_tags); 38 | } 39 | return sents; 40 | } 41 | 42 | class BiLSTMTagger { 43 | public: 44 | 45 | BiLSTMTagger(unsigned layers, unsigned wembed_dim, unsigned hidden_dim, unsigned mlp_dim, ParameterCollection & model, Dict & wv, Dict & tv, unordered_map & wc) 46 | : wv(wv), tv(tv), wc(wc) { 47 | unsigned nwords = wv.size(); 48 | unsigned ntags = tv.size(); 49 | word_lookup = model.add_lookup_parameters(nwords, {wembed_dim}); 50 | 51 | // MLP on top of biLSTM outputs 100 -> 32 -> ntags 52 | pH = model.add_parameters({mlp_dim, hidden_dim*2}); 53 | pO = model.add_parameters({ntags, mlp_dim}); 54 | 55 | // word-level LSTMs 56 | fwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model); // layers, in-dim, out-dim, model 57 | bwdRNN = VanillaLSTMBuilder(layers, wembed_dim, hidden_dim, model); 58 | } 59 | 60 | Dict &wv, &tv; 61 | unordered_map & wc; 62 | LookupParameter word_lookup; 63 | Parameter pH, pO; 64 | VanillaLSTMBuilder fwdRNN, bwdRNN; 65 | 66 | // Do word representation 67 | Expression word_rep(ComputationGraph & cg, const string & w) { 68 | return lookup(cg, word_lookup, wv.convert(wc[w] > 5 ? w : "")); 69 | } 70 | 71 | vector build_tagging_graph(ComputationGraph & cg, const vector & words) { 72 | // parameters -> expressions 73 | Expression H = parameter(cg, pH); 74 | Expression O = parameter(cg, pO); 75 | 76 | // initialize the RNNs 77 | fwdRNN.new_graph(cg); 78 | bwdRNN.new_graph(cg); 79 | 80 | // get the word vectors. word_rep(...) 
returns a 128-dim vector expression for each word. 81 | vector wembs(words.size()), fwds(words.size()), bwds(words.size()), fbwds(words.size()); 82 | for(size_t i = 0; i < words.size(); ++i) 83 | wembs[i] = word_rep(cg, words[i]); 84 | 85 | // feed word vectors into biLSTM 86 | fwdRNN.start_new_sequence(); 87 | for(size_t i = 0; i < wembs.size(); ++i) 88 | fwds[i] = fwdRNN.add_input(wembs[i]); 89 | bwdRNN.start_new_sequence(); 90 | for(size_t i = wembs.size(); i > 0; --i) 91 | bwds[i-1] = bwdRNN.add_input(wembs[i-1]); 92 | 93 | // Concatenate and MLP 94 | for(size_t i = 0; i < wembs.size(); ++i) 95 | fbwds[i] = O * tanh( H * concatenate({fwds[i], bwds[i]}) ); 96 | 97 | return fbwds; 98 | } 99 | 100 | Expression sent_loss(ComputationGraph & cg, vector & words, vector & tags) { 101 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 102 | for(size_t i = 0; i < tags.size(); ++i) 103 | errs[i] = pickneglogsoftmax(exprs[i], tv.convert(tags[i])); 104 | return sum(errs); 105 | } 106 | 107 | vector tag_sent(vector & words) { 108 | ComputationGraph cg; 109 | vector exprs = build_tagging_graph(cg, words), errs(words.size()); 110 | vector tags(words.size()); 111 | for(size_t i = 0; i < words.size(); ++i) { 112 | vector scores = as_vector(exprs[i].value()); 113 | size_t max_id = distance(scores.begin(), max_element(scores.begin(), scores.end())); 114 | tags[i] = tv.convert(max_id); 115 | } 116 | return tags; 117 | } 118 | 119 | }; 120 | 121 | int main(int argc, char**argv) { 122 | 123 | time_point start = system_clock::now(); 124 | 125 | vector, vector > > train = read("data/tags/train.txt"); 126 | vector, vector > > dev = read("data/tags/dev.txt"); 127 | Dict word_voc, tag_voc; 128 | unordered_map word_cnt; 129 | for(auto & sent : train) { 130 | for(auto & w : sent.first) { 131 | word_voc.convert(w); 132 | word_cnt[w]++; 133 | } 134 | for(auto & t : sent.second) 135 | tag_voc.convert(t); 136 | } 137 | tag_voc.freeze(); 138 | word_voc.convert(""); word_voc.freeze(); word_voc.set_unk(""); 139 | 140 | // DyNet Starts 141 | dynet::initialize(argc, argv); 142 | ParameterCollection model; 143 | AdamTrainer trainer(model); 144 | trainer.clipping_enabled = false; 145 | 146 | if(argc != 6) { 147 | cerr << "Usage: " << argv[0] << " WEMBED_SIZE HIDDEN_SIZE MLP_SIZE SPARSE TIMEOUT" << endl; 148 | return 1; 149 | } 150 | int WEMBED_SIZE = atoi(argv[1]); 151 | int HIDDEN_SIZE = atoi(argv[2]); 152 | int MLP_SIZE = atoi(argv[3]); 153 | trainer.sparse_updates_enabled = atoi(argv[4]); 154 | int TIMEOUT = atoi(argv[5]); 155 | 156 | // Initilaize the tagger 157 | BiLSTMTagger tagger(1, WEMBED_SIZE, HIDDEN_SIZE, MLP_SIZE, model, word_voc, tag_voc, word_cnt); 158 | 159 | { 160 | duration fs = (system_clock::now() - start); 161 | float startup_time = duration_cast(fs).count() / float(1000); 162 | cout << "startup time: " << startup_time << endl; 163 | } 164 | 165 | // Do training 166 | start = system_clock::now(); 167 | int i = 0, all_tagged = 0, this_words = 0; 168 | float this_loss = 0.f, all_time = 0.f; 169 | for(int iter = 0; iter < 100; iter++) { 170 | shuffle(train.begin(), train.end(), *dynet::rndeng); 171 | for(auto & s : train) { 172 | i++; 173 | if(i % 500 == 0) { 174 | trainer.status(); 175 | cout << this_loss/this_words << endl; 176 | all_tagged += this_words; 177 | this_loss = 0.f; 178 | this_words = 0; 179 | } 180 | if(i % 10000 == 0) { 181 | duration fs = (system_clock::now() - start); 182 | all_time += duration_cast(fs).count() / float(1000); 183 | int dev_words = 0, dev_good = 0; 184 | 
float dev_loss = 0; 185 | for(auto & sent : dev) { 186 | vector tags = tagger.tag_sent(sent.first); 187 | for(size_t j = 0; j < tags.size(); ++j) 188 | if(tags[j] == sent.second[j]) 189 | dev_good++; 190 | dev_words += sent.second.size(); 191 | } 192 | cout << "acc=" << dev_good/float(dev_words) << ", time=" << all_time << ", word_per_sec=" << all_tagged/all_time << endl; 193 | if(all_time > TIMEOUT) 194 | exit(0); 195 | start = system_clock::now(); 196 | } 197 | 198 | ComputationGraph cg; 199 | Expression loss_exp = tagger.sent_loss(cg, s.first, s.second); 200 | float my_loss = as_scalar(cg.forward(loss_exp)); 201 | this_loss += my_loss; 202 | this_words += s.first.size(); 203 | cg.backward(loss_exp); 204 | trainer.update(); 205 | } 206 | } 207 | return 0; 208 | } 209 | -------------------------------------------------------------------------------- /dynet-cpp/rnnlm-batch.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | using namespace std; 17 | using namespace std::chrono; 18 | using namespace dynet; 19 | 20 | // Read a file where each line is of the form "word1 word2 ..." 21 | // Yields lists of the form [word1, word2, ...] 22 | vector > read(const string & fname, Dict & vw) { 23 | ifstream fh(fname); 24 | if(!fh) throw std::runtime_error("Could not open file"); 25 | string str; 26 | vector > sents; 27 | while(getline(fh, str)) { 28 | istringstream iss(str); 29 | vector tokens; 30 | while(iss >> str) 31 | tokens.push_back(vw.convert(str)); 32 | tokens.push_back(vw.convert("")); 33 | sents.push_back(tokens); 34 | } 35 | return sents; 36 | } 37 | 38 | struct RNNLanguageModel { 39 | LookupParameter p_c; 40 | Parameter W_sm; 41 | Parameter b_sm; 42 | VanillaLSTMBuilder builder; 43 | explicit RNNLanguageModel(unsigned layers, unsigned input_dim, unsigned hidden_dim, unsigned vocab_size, ParameterCollection& model) : builder(layers, input_dim, hidden_dim, model) { 44 | p_c = model.add_lookup_parameters(vocab_size, {input_dim}, ParameterInitUniform(0.1)); 45 | W_sm = model.add_parameters({vocab_size, hidden_dim}, ParameterInitUniform(0.5)); 46 | b_sm = model.add_parameters({vocab_size}, ParameterInitUniform(0.5)); 47 | } 48 | 49 | Expression calc_lm_loss(const vector > & sent, int pos, int mb_size, ComputationGraph & cg) { 50 | 51 | // parameters -> expressions 52 | Expression W_exp = parameter(cg, W_sm); 53 | Expression b_exp = parameter(cg, b_sm); 54 | 55 | // initialize the RNN 56 | builder.new_graph(cg); // reset RNN builder for new graph 57 | builder.start_new_sequence(); 58 | 59 | // start the rnn by inputting "" 60 | size_t tot_sents = min(sent.size()-pos, (size_t)mb_size); 61 | vector wids(tot_sents, 0); 62 | vector masks(tot_sents); 63 | Expression s = builder.add_input(lookup(cg, p_c, wids)); 64 | 65 | // feed word vectors into the RNN and predict the next word 66 | vector losses; 67 | size_t j; 68 | for(size_t i = 0; i < sent[pos].size(); ++i) { 69 | // Get the words 70 | for(j = 0; j < tot_sents && i < sent[pos+j].size(); ++j) { 71 | wids[j] = sent[pos+j][i]; 72 | masks[j] = 1.f; 73 | } 74 | // And the masks 75 | for(; j < tot_sents; ++j) { 76 | wids[j] = 0; 77 | masks[j] = 0.f; 78 | } 79 | // calculate the softmax and loss 80 | Expression score = affine_transform({b_exp, W_exp, s}); 81 | Expression loss = pickneglogsoftmax(score, wids); 82 | if(0.f == 
*masks.rbegin()) 83 | loss = cmult(loss, input(cg, Dim({1}, tot_sents), masks)); 84 | losses.push_back(loss); 85 | // update the state of the RNN 86 | s = builder.add_input(lookup(cg, p_c, wids)); 87 | } 88 | 89 | return sum_batches(sum(losses)); 90 | } 91 | 92 | }; 93 | 94 | struct length_greater_then { 95 | inline bool operator() (const vector & struct1, const vector & struct2) { 96 | return (struct1.size() > struct2.size()); 97 | } 98 | }; 99 | 100 | vector prepare_minibatch(int mb_size, vector > & data) { 101 | stable_sort(data.begin(), data.end(), length_greater_then()); 102 | vector ids; 103 | for(size_t i = 0; i < data.size(); i += mb_size) 104 | ids.push_back(i); 105 | return ids; 106 | } 107 | 108 | int main(int argc, char** argv) { 109 | 110 | time_point start = system_clock::now(); 111 | 112 | // format of files: each line is "word1 word2 ..." 113 | string train_file = "data/text/train.txt"; 114 | string test_file = "data/text/dev.txt"; 115 | 116 | // DyNet Starts 117 | dynet::initialize(argc, argv); 118 | ParameterCollection model; 119 | 120 | if(argc != 6) { 121 | cerr << "Usage: " << argv[0] << " MB_SIZE EMBED_SIZE HIDDEN_SIZE SPARSE TIMEOUT" << endl; 122 | return 1; 123 | } 124 | int MB_SIZE = atoi(argv[1]); 125 | int EMBED_SIZE = atoi(argv[2]); 126 | int HIDDEN_SIZE = atoi(argv[3]); 127 | int SPARSE = atoi(argv[4]); 128 | int TIMEOUT = atoi(argv[5]); 129 | 130 | AdamTrainer trainer(model, 0.001); 131 | trainer.sparse_updates_enabled = SPARSE; 132 | trainer.clipping_enabled = false; 133 | 134 | Dict vw; 135 | vw.convert(""); 136 | vector > train = read(train_file, vw); 137 | vw.freeze(); 138 | vector > test = read(test_file, vw); 139 | vector train_ids = prepare_minibatch(MB_SIZE, train); 140 | vector test_ids = prepare_minibatch(MB_SIZE, test); 141 | int test_words = 0; 142 | for(auto & sent : test) test_words += sent.size(); 143 | 144 | int nwords = vw.size(); 145 | 146 | RNNLanguageModel rnnlm(1, EMBED_SIZE, HIDDEN_SIZE, nwords, model); 147 | 148 | { 149 | duration fs = (system_clock::now() - start); 150 | float startup_time = duration_cast(fs).count() / float(1000); 151 | cout << "startup time: " << startup_time << endl; 152 | } 153 | 154 | start = system_clock::now(); 155 | int i = 0, all_words = 0, this_words = 0; 156 | float this_loss = 0.f, all_time = 0.f; 157 | for(int iter = 0; iter < 100; iter++) { 158 | shuffle(train_ids.begin(), train_ids.end(), *dynet::rndeng); 159 | for(auto sid : train_ids) { 160 | i++; 161 | if(i % (500/MB_SIZE) == 0) { 162 | trainer.status(); 163 | cout << this_loss/this_words << endl; 164 | all_words += this_words; 165 | this_loss = 0.f; 166 | this_words = 0; 167 | } 168 | if(i % (10000/MB_SIZE) == 0) { 169 | duration fs = (system_clock::now() - start); 170 | all_time += duration_cast(fs).count() / float(1000); 171 | float test_loss = 0; 172 | for(auto sentid : test_ids) { 173 | ComputationGraph cg; 174 | Expression loss_exp = rnnlm.calc_lm_loss(test, sentid, MB_SIZE, cg); 175 | test_loss += as_scalar(cg.forward(loss_exp)); 176 | } 177 | cout << "nll=" << test_loss/test_words << ", ppl=" << exp(test_loss/test_words) << ", words=" << test_words << ", time=" << all_time << ", word_per_sec=" << all_words/all_time << endl; 178 | if(all_time > TIMEOUT) 179 | exit(0); 180 | start = system_clock::now(); 181 | } 182 | 183 | ComputationGraph cg; 184 | Expression loss_exp = rnnlm.calc_lm_loss(train, sid, MB_SIZE, cg); 185 | this_loss += as_scalar(cg.forward(loss_exp)); 186 | for(size_t pos = sid; pos < min((size_t)sid+MB_SIZE, train.size()); ++pos) 
187 | this_words += train[pos].size(); 188 | cg.backward(loss_exp); 189 | trainer.update(); 190 | } 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /dynet-cpp/rnnlm-seq.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace std::chrono; 17 | using namespace dynet; 18 | 19 | // Read a file where each line is of the form "word1 word2 ..." 20 | // Yields lists of the form [word1, word2, ...] 21 | vector > read(const string & fname, Dict & vw) { 22 | ifstream fh(fname); 23 | if(!fh) throw std::runtime_error("Could not open file"); 24 | string str; 25 | vector > sents; 26 | while(getline(fh, str)) { 27 | istringstream iss(str); 28 | vector tokens; 29 | while(iss >> str) 30 | tokens.push_back(vw.convert(str)); 31 | tokens.push_back(vw.convert("")); 32 | sents.push_back(tokens); 33 | } 34 | return sents; 35 | } 36 | 37 | struct RNNLanguageModel { 38 | LookupParameter p_c; 39 | Parameter W_sm; 40 | Parameter b_sm; 41 | VanillaLSTMBuilder builder; 42 | explicit RNNLanguageModel(unsigned layers, unsigned input_dim, unsigned hidden_dim, unsigned vocab_size, ParameterCollection& model) : builder(layers, input_dim, hidden_dim, model) { 43 | p_c = model.add_lookup_parameters(vocab_size, {input_dim}, ParameterInitUniform(0.1)); 44 | W_sm = model.add_parameters({vocab_size, hidden_dim}, ParameterInitUniform(0.5)); 45 | b_sm = model.add_parameters({vocab_size}, ParameterInitUniform(0.5)); 46 | } 47 | 48 | Expression calc_lm_loss(const vector > & sent, int pos, int mb_size, ComputationGraph & cg) { 49 | 50 | // parameters -> expressions 51 | Expression W_exp = parameter(cg, W_sm); 52 | Expression b_exp = parameter(cg, b_sm); 53 | 54 | // initialize the RNN 55 | builder.new_graph(cg); // reset RNN builder for new graph 56 | builder.start_new_sequence(); 57 | 58 | // Create contexts and perform lookup 59 | size_t tot_sents = min(sent.size()-pos, (size_t)mb_size); 60 | vector > ctxts(sent.begin()+pos, sent.begin()+pos+tot_sents); 61 | for(auto & ctxt : ctxts) 62 | rotate(ctxt.begin(), ctxt.begin()+ctxt.size()-1, ctxt.end()); 63 | Expression looks = lookup_seq(cg, p_c, ctxts); 64 | 65 | // Generate the contexts and scores 66 | Expression states = builder.transduce_seq(looks); 67 | Expression scores = affine_transform({b_exp, W_exp, states}); 68 | 69 | // Calculate the loss 70 | vector > predicts(sent.begin()+pos, sent.begin()+pos+tot_sents); 71 | Expression losses = pickneglogsoftmax_seq(scores, predicts); 72 | 73 | return sum_batches(sum_rows(losses)); 74 | } 75 | 76 | }; 77 | 78 | struct length_greater_then { 79 | inline bool operator() (const vector & struct1, const vector & struct2) { 80 | return (struct1.size() > struct2.size()); 81 | } 82 | }; 83 | 84 | vector prepare_minibatch(int mb_size, vector > & data) { 85 | stable_sort(data.begin(), data.end(), length_greater_then()); 86 | vector ids; 87 | for(size_t i = 0; i < data.size(); i += mb_size) 88 | ids.push_back(i); 89 | return ids; 90 | } 91 | 92 | int main(int argc, char** argv) { 93 | 94 | time_point start = system_clock::now(); 95 | 96 | // format of files: each line is "word1 word2 ..." 
97 | string train_file = "data/text/train.txt"; 98 | string test_file = "data/text/dev.txt"; 99 | 100 | // DyNet Starts 101 | dynet::initialize(argc, argv); 102 | ParameterCollection model; 103 | 104 | if(argc != 6) { 105 | cerr << "Usage: " << argv[0] << " MB_SIZE EMBED_SIZE HIDDEN_SIZE SPARSE TIMEOUT" << endl; 106 | return 1; 107 | } 108 | int MB_SIZE = atoi(argv[1]); 109 | int EMBED_SIZE = atoi(argv[2]); 110 | int HIDDEN_SIZE = atoi(argv[3]); 111 | int SPARSE = atoi(argv[4]); 112 | int TIMEOUT = atoi(argv[5]); 113 | 114 | AdamTrainer trainer(model, 0.001); 115 | trainer.sparse_updates_enabled = SPARSE; 116 | trainer.clipping_enabled = false; 117 | 118 | Dict vw; 119 | vw.convert(""); 120 | vector > train = read(train_file, vw); 121 | vw.freeze(); 122 | vector > test = read(test_file, vw); 123 | vector train_ids = prepare_minibatch(MB_SIZE, train); 124 | vector test_ids = prepare_minibatch(MB_SIZE, test); 125 | int test_words = 0; 126 | for(auto & sent : test) test_words += sent.size(); 127 | 128 | int nwords = vw.size(); 129 | 130 | RNNLanguageModel rnnlm(1, EMBED_SIZE, HIDDEN_SIZE, nwords, model); 131 | 132 | { 133 | duration fs = (system_clock::now() - start); 134 | float startup_time = duration_cast(fs).count() / float(1000); 135 | cout << "startup time: " << startup_time << endl; 136 | } 137 | 138 | start = system_clock::now(); 139 | int i = 0, all_words = 0, this_words = 0; 140 | float this_loss = 0.f, all_time = 0.f; 141 | for(int iter = 0; iter < 100; iter++) { 142 | shuffle(train_ids.begin(), train_ids.end(), *dynet::rndeng); 143 | for(auto sid : train_ids) { 144 | i++; 145 | if(i % (500/MB_SIZE) == 0) { 146 | trainer.status(); 147 | cout << this_loss/this_words << endl; 148 | all_words += this_words; 149 | this_loss = 0.f; 150 | this_words = 0; 151 | } 152 | if(i % (10000/MB_SIZE) == 0) { 153 | duration fs = (system_clock::now() - start); 154 | all_time += duration_cast(fs).count() / float(1000); 155 | float test_loss = 0; 156 | for(auto sentid : test_ids) { 157 | ComputationGraph cg; 158 | Expression loss_exp = rnnlm.calc_lm_loss(test, sentid, MB_SIZE, cg); 159 | test_loss += as_scalar(cg.forward(loss_exp)); 160 | } 161 | cout << "nll=" << test_loss/test_words << ", ppl=" << exp(test_loss/test_words) << ", words=" << test_words << ", time=" << all_time << ", word_per_sec=" << all_words/all_time << endl; 162 | if(all_time > TIMEOUT) 163 | exit(0); 164 | start = system_clock::now(); 165 | } 166 | 167 | ComputationGraph cg; 168 | Expression loss_exp = rnnlm.calc_lm_loss(train, sid, MB_SIZE, cg); 169 | this_loss += as_scalar(cg.forward(loss_exp)); 170 | for(size_t pos = sid; pos < min((size_t)sid+MB_SIZE, train.size()); ++pos) 171 | this_words += train[pos].size(); 172 | cg.backward(loss_exp); 173 | trainer.update(); 174 | } 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /dynet-cpp/treenn.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef BOOST_REGEX 6 | #include 7 | using namespace boost; 8 | #else 9 | #include 10 | #endif 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | using namespace std; 17 | using namespace std::chrono; 18 | using namespace dynet; 19 | 20 | class Tree { 21 | public: 22 | 23 | Tree(const string & label, vector children = vector()) 24 | : label(label), children(children) { } 25 | ~Tree() { 26 | for(auto child : children) delete child; 27 | } 28 | 29 | static Tree* from_sexpr(const string & str) { 30 | 
vector toks = tokenize_sexpr(str); 31 | vector::const_iterator tokit = toks.begin(); 32 | if(*(tokit++) != "(") throw runtime_error("Poorly structured tree"); 33 | return Tree::within_bracket(tokit); 34 | } 35 | 36 | static vector tokenize_sexpr(const string & s) { 37 | regex tokker(" +|[()]|[^ ()]+"); 38 | vector toks; 39 | for(auto it = sregex_iterator(s.begin(), s.end(), tokker); it != sregex_iterator(); ++it) { 40 | string m = it->str(); 41 | if(m != " ") 42 | toks.push_back(m); 43 | } 44 | return toks; 45 | } 46 | 47 | static Tree* within_bracket(vector::const_iterator & tokit) { 48 | const string & label = *(tokit++); 49 | vector children; 50 | while(true) { 51 | const string & tok = *(tokit++); 52 | if(tok == "(") { 53 | children.push_back(within_bracket(tokit)); 54 | } else if(tok == ")") { 55 | return new Tree(label, children); 56 | } else { 57 | children.push_back(new Tree(tok)); 58 | } 59 | } 60 | throw runtime_error("Poorly structured tree"); 61 | } 62 | 63 | void nonterms(vector & ret) { 64 | if(!isleaf()) { 65 | ret.push_back(this); 66 | for(Tree* child : children) child->nonterms(ret); 67 | } 68 | } 69 | 70 | bool isleaf() const { return children.size() == 0; } 71 | 72 | void make_vocab(Dict & nonterm_voc, Dict & term_voc) { 73 | (isleaf() ? term_voc : nonterm_voc).convert(label); 74 | for(Tree* tr : children) tr->make_vocab(nonterm_voc, term_voc); 75 | } 76 | 77 | string label; 78 | vector children; 79 | Expression expr; 80 | 81 | }; 82 | 83 | ostream& operator<<(ostream& os, const Tree& tr) { 84 | if(tr.isleaf()) { 85 | os << tr.label; 86 | } else { 87 | os << '(' << tr.label; 88 | for(auto child : tr.children) os << ' ' << *child; 89 | os << ')'; 90 | } 91 | return os; 92 | } 93 | 94 | vector read_dataset(const string & filename) { 95 | ifstream file(filename); 96 | if(!file) throw runtime_error("Missing file"); 97 | string line; 98 | vector ret; 99 | while(getline(file, line)) ret.push_back(Tree::from_sexpr(line)); 100 | return ret; 101 | } 102 | 103 | class TreeLSTMBuilder { 104 | public: 105 | TreeLSTMBuilder(ParameterCollection & model, Dict & word_vocab, unsigned wdim, unsigned hdim) : 106 | model(model), word_vocab(word_vocab), wdim(wdim), hdim(hdim) { 107 | WS = {model.add_parameters({hdim, wdim}), // 0: Wi 108 | model.add_parameters({hdim, wdim}), // 1: Wo 109 | model.add_parameters({hdim, wdim}), // 2: Wu 110 | model.add_parameters({hdim, 2*hdim}), // 3: Ui 111 | model.add_parameters({hdim, 2*hdim}), // 4: Uo 112 | model.add_parameters({hdim, 2*hdim}), // 5: Uu 113 | model.add_parameters({hdim, hdim}), // 6: UFS1 114 | model.add_parameters({hdim, hdim}), // 7: UFS2 115 | model.add_parameters({hdim}), // 8: Bi 116 | model.add_parameters({hdim}), // 9: Bo 117 | model.add_parameters({hdim}), // 10: Bu 118 | model.add_parameters({hdim})};// 11: Bf 119 | E = model.add_lookup_parameters(word_vocab.size(),{wdim}); 120 | cg_WS.resize(WS.size()); 121 | } 122 | 123 | void start_graph(ComputationGraph & c) { 124 | cg = &c; 125 | for(size_t i = 0; i < WS.size(); ++i) 126 | cg_WS[i] = parameter(*cg, WS[i]); 127 | } 128 | 129 | pair expr_for_tree(Tree & tree, bool decorate = false) { 130 | assert(!tree.isleaf()); 131 | pair hc_ret; 132 | if(tree.children.size() == 1) { 133 | assert(tree.children[0]->isleaf()); 134 | Expression emb, i, o, u, c, expr; 135 | emb = lookup(*cg, E, word_vocab.convert(tree.children[0]->label)); 136 | i = logistic(affine_transform({cg_WS[8], cg_WS[0], emb})); 137 | o = logistic(affine_transform({cg_WS[9], cg_WS[1], emb})); 138 | u = tanh( 
affine_transform({cg_WS[10], cg_WS[2], emb})); 139 | hc_ret.second = cmult(i,u); 140 | hc_ret.first = cmult(o,tanh(hc_ret.second)); 141 | } else { 142 | assert(tree.children.size() == 2); 143 | Expression e, i, o, u, f1, f2, c, expr; 144 | pair hc1, hc2; 145 | hc1 = expr_for_tree(*tree.children[0], decorate); 146 | hc2 = expr_for_tree(*tree.children[1], decorate); 147 | e = concatenate({hc1.first,hc2.first}); 148 | i = logistic(affine_transform({cg_WS[8], cg_WS[3], e})); 149 | o = logistic(affine_transform({cg_WS[9], cg_WS[4], e})); 150 | u = tanh( affine_transform({cg_WS[10], cg_WS[5], e})); 151 | f1 = logistic(affine_transform({cg_WS[11], cg_WS[6], hc1.first})); 152 | f2 = logistic(affine_transform({cg_WS[11], cg_WS[7], hc2.first})); 153 | hc_ret.second = cmult(i,u) + cmult(f1,hc1.second) + cmult(f2,hc2.second); 154 | hc_ret.first = cmult(o,tanh(hc_ret.second)); 155 | } 156 | if(decorate) { tree.expr = hc_ret.first; } 157 | return hc_ret; 158 | } 159 | 160 | ParameterCollection & model; 161 | Dict & word_vocab; 162 | unsigned wdim, hdim; 163 | vector WS; 164 | LookupParameter E; 165 | 166 | ComputationGraph * cg; 167 | vector cg_WS; 168 | 169 | }; 170 | 171 | int main(int argc, char**argv) { 172 | 173 | time_point start = system_clock::now(); 174 | 175 | vector train = read_dataset("data/trees/train.txt"); 176 | vector dev = read_dataset("data/trees/dev.txt"); 177 | Dict nonterm_voc, term_voc; 178 | for(auto tree : train) tree->make_vocab(nonterm_voc, term_voc); 179 | nonterm_voc.freeze(); 180 | term_voc.convert(""); term_voc.freeze(); term_voc.set_unk(""); 181 | 182 | // DyNet Starts 183 | dynet::initialize(argc, argv); 184 | ParameterCollection model; 185 | AdamTrainer trainer(model, 0.001); 186 | trainer.clipping_enabled = false; 187 | 188 | if(argc != 5) { 189 | cerr << "Usage: " << argv[0] << " WEMBED_SIZE HIDDEN_SIZE SPARSE TIMEOUT" << endl; 190 | return 1; 191 | } 192 | unsigned WEMBED_SIZE = atoi(argv[1]); 193 | unsigned HIDDEN_SIZE = atoi(argv[2]); 194 | trainer.sparse_updates_enabled = atoi(argv[3]); 195 | int TIMEOUT = atoi(argv[4]); 196 | 197 | // Builder 198 | Parameter W_param = model.add_parameters({nonterm_voc.size(), HIDDEN_SIZE}); 199 | TreeLSTMBuilder builder(model, term_voc, WEMBED_SIZE, HIDDEN_SIZE); 200 | 201 | { 202 | duration fs = (system_clock::now() - start); 203 | float startup_time = duration_cast(fs).count() / float(1000); 204 | cout << "startup time: " << startup_time << endl; 205 | } 206 | 207 | int i = 0, all_tagged = 0, this_nodes = 0; 208 | float this_loss = 0.f, all_time = 0.f; 209 | for(int iter = 0; iter < 100; iter++) { 210 | shuffle(train.begin(), train.end(), *dynet::rndeng); 211 | start = system_clock::now(); 212 | for(auto tree : train) { 213 | ComputationGraph cg; 214 | builder.start_graph(cg); 215 | Expression W = parameter(cg, W_param); 216 | pair hc = builder.expr_for_tree(*tree, true); 217 | vector losses; 218 | vector nodes; tree->nonterms(nodes); 219 | for(auto nt : nodes) 220 | losses.push_back(pickneglogsoftmax(W*nt->expr, nonterm_voc.convert(nt->label))); 221 | Expression loss = sum(losses); 222 | cg.forward(loss); 223 | this_loss += as_scalar(loss.value()); 224 | this_nodes += nodes.size(); 225 | cg.backward(loss); 226 | trainer.update(); 227 | if(++i % 1000 == 0) { 228 | trainer.status(); 229 | cout << this_loss / this_nodes << endl; 230 | this_loss = 0; this_nodes = 0; 231 | } 232 | } 233 | std::chrono::duration fs = (system_clock::now() - start); 234 | all_time += duration_cast(fs).count() / float(1000); 235 | int good = 0, bad = 0; 
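// Dev evaluation: rebuild the graph for each dev tree, score the root hidden
// state with W, and count the tree as correct when the argmax over label
// scores matches the root label.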
236 | for(auto tree : dev) { 237 | ComputationGraph cg; 238 | builder.start_graph(cg); 239 | Expression W = parameter(cg, W_param); 240 | pair hc = builder.expr_for_tree(*tree, false); 241 | vector scores = as_vector((W*hc.first).value()); 242 | size_t max_id = std::distance(scores.begin(), std::max_element(scores.begin(), scores.end())); 243 | (nonterm_voc.convert(max_id) == tree->label ? good : bad)++; 244 | } 245 | cout << "acc=" << good/float(good+bad) << ", time=" << all_time << ", sent_per_sec=" << i/all_time << endl; 246 | if(all_time > TIMEOUT) 247 | exit(0); 248 | } 249 | } 250 | -------------------------------------------------------------------------------- /dynet-py/bilstm-tagger-withchar.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | import random 7 | import sys 8 | import argparse 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--dynet-seed", default=0, type=int) 15 | parser.add_argument("--dynet-gpus", default=0, type=int) 16 | parser.add_argument("--dynet-mem", default=512, type=int) 17 | parser.add_argument('CEMBED_SIZE', type=int, help='char embedding size') 18 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 19 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 20 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | # format of files: each line is "word1|tag2 word2|tag2 ..." 26 | train_file="data/tags/train.txt" 27 | dev_file="data/tags/dev.txt" 28 | 29 | class Vocab: 30 | def __init__(self, w2i=None): 31 | if w2i is None: w2i = defaultdict(lambda: len(w2i)) 32 | self.w2i = dict(w2i) 33 | self.i2w = {i:w for w,i in w2i.items()} 34 | @classmethod 35 | def from_corpus(cls, corpus): 36 | w2i = defaultdict(lambda: len(w2i)) 37 | for sent in corpus: 38 | [w2i[word] for word in sent] 39 | return Vocab(w2i) 40 | 41 | def size(self): return len(self.w2i.keys()) 42 | 43 | def read(fname): 44 | """ 45 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 46 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
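    e.g. the line "the|DT dog|NN barks|VBZ" yields [("the", "DT"), ("dog", "NN"), ("barks", "VBZ")]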
47 | """ 48 | with open(fname, "r") as fh: 49 | for line in fh: 50 | line = line.strip().split() 51 | sent = [tuple(x.rsplit("|",1)) for x in line] 52 | yield sent 53 | 54 | train=list(read(train_file)) 55 | dev=list(read(dev_file)) 56 | words=[] 57 | tags=[] 58 | chars=set() 59 | wc=Counter() 60 | for sent in train: 61 | for w,p in sent: 62 | words.append(w) 63 | tags.append(p) 64 | chars.update(w) 65 | wc[w]+=1 66 | words.append("_UNK_") 67 | chars.add("<*>") 68 | chars.add("_UNK_") 69 | 70 | vw = Vocab.from_corpus([words]) 71 | vt = Vocab.from_corpus([tags]) 72 | vc = Vocab.from_corpus([chars]) 73 | UNK = vw.w2i["_UNK_"] 74 | CUNK = vc.w2i["_UNK_"] 75 | 76 | nwords = vw.size() 77 | ntags = vt.size() 78 | nchars = vc.size() 79 | print ("nwords=%r, ntags=%r, nchars=%r" % (nwords, ntags, nchars)) 80 | 81 | # DyNet Starts 82 | 83 | model = dy.Model() 84 | trainer = dy.AdamTrainer(model) 85 | trainer.set_clip_threshold(-1.0) 86 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 87 | 88 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, args.WEMBED_SIZE)) 89 | CHARS_LOOKUP = model.add_lookup_parameters((nchars, args.CEMBED_SIZE)) 90 | 91 | # MLP on top of biLSTM outputs 100 -> 32 -> ntags 92 | pH = model.add_parameters((args.MLP_SIZE, args.HIDDEN_SIZE*2)) 93 | pO = model.add_parameters((ntags, args.MLP_SIZE)) 94 | 95 | # word-level LSTMs 96 | fwdRNN = dy.VanillaLSTMBuilder(1, args.WEMBED_SIZE, args.HIDDEN_SIZE, model) # layers, in-dim, out-dim, model 97 | bwdRNN = dy.VanillaLSTMBuilder(1, args.WEMBED_SIZE, args.HIDDEN_SIZE, model) 98 | 99 | # char-level LSTMs 100 | cFwdRNN = dy.VanillaLSTMBuilder(1, args.CEMBED_SIZE, args.WEMBED_SIZE/2, model) 101 | cBwdRNN = dy.VanillaLSTMBuilder(1, args.CEMBED_SIZE, args.WEMBED_SIZE/2, model) 102 | 103 | def word_rep(w, cf_init, cb_init): 104 | if wc[w] > 5: 105 | w_index = vw.w2i[w] 106 | return WORDS_LOOKUP[w_index] 107 | else: 108 | pad_char = vc.w2i["<*>"] 109 | char_ids = [pad_char] + [vc.w2i.get(c,CUNK) for c in w] + [pad_char] 110 | char_embs = [CHARS_LOOKUP[cid] for cid in char_ids] 111 | fw_exps = cf_init.transduce(char_embs) 112 | bw_exps = cb_init.transduce(reversed(char_embs)) 113 | return dy.concatenate([ fw_exps[-1], bw_exps[-1] ]) 114 | 115 | def build_tagging_graph(words): 116 | dy.renew_cg() 117 | # parameters -> expressions 118 | H = dy.parameter(pH) 119 | O = dy.parameter(pO) 120 | 121 | # initialize the RNNs 122 | f_init = fwdRNN.initial_state() 123 | b_init = bwdRNN.initial_state() 124 | 125 | cf_init = cFwdRNN.initial_state() 126 | cb_init = cBwdRNN.initial_state() 127 | 128 | # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
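    # Frequent words (training count > 5) use the word lookup table directly; rarer words
    # fall back to a character-level biLSTM over the padded character sequence in word_rep(...).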
129 | wembs = [word_rep(w, cf_init, cb_init) for w in words] 130 | 131 | # feed word vectors into biLSTM 132 | fw_exps = f_init.transduce(wembs) 133 | bw_exps = b_init.transduce(reversed(wembs)) 134 | 135 | # biLSTM states 136 | bi_exps = [dy.concatenate([f,b]) for f,b in zip(fw_exps, reversed(bw_exps))] 137 | 138 | # feed each biLSTM state to an MLP 139 | exps = [] 140 | for x in bi_exps: 141 | r_t = O*(dy.tanh(H * x)) 142 | exps.append(r_t) 143 | 144 | return exps 145 | 146 | def sent_loss_precalc(words, tags, vecs): 147 | errs = [] 148 | for v,t in zip(vecs,tags): 149 | tid = vt.w2i[t] 150 | err = dy.pickneglogsoftmax(v, tid) 151 | errs.append(err) 152 | return dy.esum(errs) 153 | 154 | def sent_loss(words, tags): 155 | return sent_loss_precalc(words, tags, build_tagging_graph(words)) 156 | 157 | def tag_sent_precalc(words, vecs): 158 | log_probs = [v.npvalue() for v in vecs] 159 | tags = [] 160 | for prb in log_probs: 161 | tag = np.argmax(prb) 162 | tags.append(vt.i2w[tag]) 163 | return zip(words, tags) 164 | 165 | def tag_sent(words): 166 | return tag_sent_precalc(words, build_tagging_graph(words)) 167 | 168 | print ("startup time: %r" % (time.time() - start)) 169 | start = time.time() 170 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 171 | for ITER in range(100): 172 | random.shuffle(train) 173 | for s in train: 174 | i += 1 175 | if i % 500 == 0: # print status 176 | trainer.status() 177 | print (this_loss / this_tagged, file=sys.stderr) 178 | all_tagged += this_tagged 179 | this_loss = this_tagged = 0 180 | all_time = time.time() - start 181 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 182 | dev_start = time.time() 183 | good_sent = bad_sent = good = bad = 0.0 184 | for sent in dev: 185 | words = [w for w,t in sent] 186 | golds = [t for w,t in sent] 187 | tags = [t for w,t in tag_sent(words)] 188 | if tags == golds: good_sent += 1 189 | else: bad_sent += 1 190 | for go,gu in zip(golds,tags): 191 | if go == gu: good += 1 192 | else: bad += 1 193 | dev_time += time.time() - dev_start 194 | train_time = time.time() - start - dev_time 195 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 196 | if all_time > args.TIMEOUT: 197 | sys.exit(0) 198 | # train on sent 199 | words = [w for w,t in s] 200 | golds = [t for w,t in s] 201 | 202 | loss_exp = sent_loss(words, golds) 203 | this_loss += loss_exp.scalar_value() 204 | this_tagged += len(golds) 205 | loss_exp.backward() 206 | trainer.update() 207 | print ("epoch %r finished" % ITER) 208 | trainer.update_epoch(1.0) 209 | -------------------------------------------------------------------------------- /dynet-py/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | import random 7 | import sys 8 | import argparse 9 | 10 | import dynet as dy 11 | import numpy as np 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--dynet-seed", default=0, type=int) 15 | parser.add_argument("--dynet-gpus", default=0, type=int) 16 | parser.add_argument("--dynet-mem", default=512, type=int) 17 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 18 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 19 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 20 | 
parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 21 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 22 | args = parser.parse_args() 23 | 24 | # format of files: each line is "word1|tag2 word2|tag2 ..." 25 | train_file="data/tags/train.txt" 26 | dev_file="data/tags/dev.txt" 27 | 28 | class Vocab: 29 | def __init__(self, w2i=None): 30 | if w2i is None: w2i = defaultdict(lambda: len(w2i)) 31 | self.w2i = dict(w2i) 32 | self.i2w = {i:w for w,i in w2i.items()} 33 | @classmethod 34 | def from_corpus(cls, corpus): 35 | w2i = defaultdict(lambda: len(w2i)) 36 | for sent in corpus: 37 | [w2i[word] for word in sent] 38 | return Vocab(w2i) 39 | 40 | def size(self): return len(self.w2i.keys()) 41 | 42 | def read(fname): 43 | """ 44 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 45 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 46 | """ 47 | with open(fname, "r") as fh: 48 | for line in fh: 49 | line = line.strip().split() 50 | sent = [tuple(x.rsplit("|",1)) for x in line] 51 | yield sent 52 | 53 | train=list(read(train_file)) 54 | dev=list(read(dev_file)) 55 | words=[] 56 | tags=[] 57 | wc=Counter() 58 | for sent in train: 59 | for w,p in sent: 60 | words.append(w) 61 | tags.append(p) 62 | wc[w]+=1 63 | words.append("_UNK_") 64 | 65 | vw = Vocab.from_corpus([words]) 66 | vt = Vocab.from_corpus([tags]) 67 | UNK = vw.w2i["_UNK_"] 68 | 69 | nwords = vw.size() 70 | ntags = vt.size() 71 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 72 | 73 | # DyNet Starts 74 | 75 | model = dy.Model() 76 | trainer = dy.AdamTrainer(model) 77 | trainer.set_clip_threshold(-1.0) 78 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 79 | 80 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, args.WEMBED_SIZE)) 81 | 82 | # MLP on top of biLSTM outputs 100 -> 32 -> ntags 83 | pH = model.add_parameters((args.MLP_SIZE, args.HIDDEN_SIZE*2)) 84 | pO = model.add_parameters((ntags, args.MLP_SIZE)) 85 | 86 | # word-level LSTMs 87 | fwdRNN = dy.VanillaLSTMBuilder(1, args.WEMBED_SIZE, args.HIDDEN_SIZE, model) # layers, in-dim, out-dim, model 88 | bwdRNN = dy.VanillaLSTMBuilder(1, args.WEMBED_SIZE, args.HIDDEN_SIZE, model) 89 | 90 | def word_rep(w): 91 | widx = vw.w2i[w] if wc[w] > 5 else UNK 92 | return WORDS_LOOKUP[widx] 93 | 94 | def build_tagging_graph(words): 95 | dy.renew_cg() 96 | # parameters -> expressions 97 | H = dy.parameter(pH) 98 | O = dy.parameter(pO) 99 | 100 | # initialize the RNNs 101 | f_init = fwdRNN.initial_state() 102 | b_init = bwdRNN.initial_state() 103 | 104 | # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word. 
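    # Words seen 5 times or fewer in training are mapped to the _UNK_ embedding by word_rep(...).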
105 | wembs = [] 106 | for i, w in enumerate(words): 107 | wembs.append(word_rep(w)) 108 | 109 | # feed word vectors into biLSTM 110 | fw_exps = f_init.transduce(wembs) 111 | bw_exps = b_init.transduce(reversed(wembs)) 112 | 113 | # biLSTM states 114 | bi_exps = [dy.concatenate([f,b]) for f,b in zip(fw_exps, reversed(bw_exps))] 115 | 116 | # feed each biLSTM state to an MLP 117 | exps = [] 118 | for x in bi_exps: 119 | r_t = O*(dy.tanh(H * x)) 120 | exps.append(r_t) 121 | 122 | return exps 123 | 124 | def sent_loss_precalc(words, tags, vecs): 125 | errs = [] 126 | for v,t in zip(vecs,tags): 127 | tid = vt.w2i[t] 128 | err = dy.pickneglogsoftmax(v, tid) 129 | errs.append(err) 130 | return dy.esum(errs) 131 | 132 | def sent_loss(words, tags): 133 | return sent_loss_precalc(words, tags, build_tagging_graph(words)) 134 | 135 | def tag_sent_precalc(words, vecs): 136 | log_probs = [v.npvalue() for v in vecs] 137 | tags = [] 138 | for prb in log_probs: 139 | tag = np.argmax(prb) 140 | tags.append(vt.i2w[tag]) 141 | return zip(words, tags) 142 | 143 | def tag_sent(words): 144 | return tag_sent_precalc(words, build_tagging_graph(words)) 145 | 146 | print ("startup time: %r" % (time.time() - start)) 147 | start = time.time() 148 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 149 | for ITER in range(100): 150 | random.shuffle(train) 151 | for s in train: 152 | i += 1 153 | if i % 500 == 0: # print status 154 | trainer.status() 155 | print(this_loss / this_tagged, file=sys.stderr) 156 | all_tagged += this_tagged 157 | this_loss = this_tagged = 0 158 | all_time = time.time() - start 159 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 160 | dev_start = time.time() 161 | good_sent = bad_sent = good = bad = 0.0 162 | for sent in dev: 163 | words = [w for w,t in sent] 164 | golds = [t for w,t in sent] 165 | tags = [t for w,t in tag_sent(words)] 166 | if tags == golds: good_sent += 1 167 | else: bad_sent += 1 168 | for go,gu in zip(golds,tags): 169 | if go == gu: good += 1 170 | else: bad += 1 171 | dev_time += time.time() - dev_start 172 | train_time = time.time() - start - dev_time 173 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 174 | if all_time > args.TIMEOUT: 175 | sys.exit(0) 176 | # train on sent 177 | words = [w for w,t in s] 178 | golds = [t for w,t in s] 179 | 180 | loss_exp = sent_loss(words, golds) 181 | my_loss = loss_exp.scalar_value() 182 | this_loss += my_loss; 183 | this_tagged += len(golds) 184 | loss_exp.backward() 185 | trainer.update() 186 | print("epoch %r finished" % ITER) 187 | trainer.update_epoch(1.0) 188 | -------------------------------------------------------------------------------- /dynet-py/bow.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import defaultdict 6 | import random 7 | import dynet as dy 8 | import numpy as np 9 | import sys 10 | 11 | # Functions to read in the corpus 12 | w2i = defaultdict(lambda: len(w2i)) 13 | t2i = defaultdict(lambda: len(t2i)) 14 | UNK = w2i[""] 15 | def read_dataset(filename): 16 | with open(filename, "r") as f: 17 | for line in f: 18 | tag, words = line.lower().strip().split(" ||| ") 19 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 20 | 21 | # Read in the data 22 | train = list(read_dataset("data/classes/train.txt")) 23 | w2i = 
defaultdict(lambda: UNK, w2i) 24 | dev = list(read_dataset("data/classes/test.txt")) 25 | nwords = len(w2i) 26 | ntags = len(t2i) 27 | 28 | # Start DyNet and define trainer 29 | model = dy.Model() 30 | trainer = dy.AdamTrainer(model, 0.001) 31 | trainer.set_clip_threshold(-1.0) 32 | trainer.set_sparse_updates(False) 33 | 34 | # Define the model 35 | W_sm = model.add_lookup_parameters((nwords, ntags), dy.ConstInitializer(0.0)) # Word weights 36 | b_sm = model.add_parameters((ntags), dy.ConstInitializer(0.0)) # Softmax bias 37 | 38 | # A function to calculate scores for one value 39 | def calc_scores(words): 40 | dy.renew_cg() 41 | score = dy.esum([dy.lookup(W_sm, x) for x in words]) 42 | b_sm_exp = dy.parameter(b_sm) 43 | return score + b_sm_exp 44 | 45 | print ("startup time: %r" % (time.time() - start)) 46 | for ITER in range(100): 47 | # Perform training 48 | # random.shuffle(train) 49 | train_loss = 0.0 50 | start = time.time() 51 | for i, (words, tag) in enumerate(train): 52 | scores = calc_scores(words) 53 | my_loss = dy.pickneglogsoftmax(scores, tag) 54 | train_loss += my_loss.value() 55 | my_loss.backward() 56 | trainer.update() 57 | # print(b_sm.as_array()) 58 | # if i > 5: 59 | # sys.exit(0) 60 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 61 | # Perform testing 62 | test_correct = 0.0 63 | for words, tag in dev: 64 | scores = calc_scores(words).npvalue() 65 | predict = np.argmax(scores) 66 | if predict == tag: 67 | test_correct += 1 68 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 69 | -------------------------------------------------------------------------------- /dynet-py/rnnlm-batch-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | import random 7 | import math 8 | import sys 9 | import argparse 10 | 11 | import dynet as dy 12 | import numpy as np 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--dynet-seed", default=0, type=int) 16 | parser.add_argument("--dynet-mem", default=512, type=int) 17 | parser.add_argument("--dynet-gpus", default=0, type=int) 18 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 19 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | BATCHED_SENT = True 26 | 27 | def split_cols(matrix): 28 | total, rows, cols, bt = matrix.dim() 29 | assert bt == 1 30 | return [dy.reshape(dy.select_cols(matrix, [i]), (rows,), batch_size=bt) for i in xrange(cols)] 31 | 32 | def batch(cols): 33 | total,nrows,ncols,nbatch=cols[0].dim() 34 | assert(nbatch==1)# doesn't currently work with batched 35 | cols_ = dy.concatenate_cols(cols) 36 | return dy.reshape(cols_, (nrows,ncols), batch_size=len(cols)) 37 | 38 | def unbatch(B): 39 | d = B.dim() 40 | return split_cols(dy.reshape(B,(d[1],d[3]), batch_size=1)) 41 | 42 | # format of files: each line is "word1/tag2 word2/tag2 ..." 43 | train_file="data/text/train.txt" 44 | test_file="data/text/dev.txt" 45 | 46 | w2i = defaultdict(lambda: len(w2i)) 47 | 48 | def read(fname): 49 | """ 50 | Read a file where each line is of the form "word1 word2 ..." 
51 | Yields lists of the form [word1, word2, ...] 52 | """ 53 | with open(fname, "r") as fh: 54 | for line in fh: 55 | sent = [w2i[x] for x in line.strip().split()] 56 | sent.append(w2i[""]) 57 | yield sent 58 | 59 | train=list(read(train_file)) 60 | nwords = len(w2i) 61 | test=list(read(test_file)) 62 | S = w2i[""] 63 | assert(nwords == len(w2i)) 64 | 65 | # DyNet Starts 66 | 67 | model = dy.Model() 68 | trainer = dy.AdamTrainer(model) 69 | trainer.set_clip_threshold(-1.0) 70 | #trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 71 | 72 | # Lookup parameters for word embeddings 73 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, args.EMBED_SIZE)) 74 | 75 | # Word-level LSTM (layers=1, input=64, output=128, model) 76 | RNN = dy.VanillaLSTMBuilder(1, args.EMBED_SIZE, args.HIDDEN_SIZE, model) 77 | 78 | # Softmax weights/biases on top of LSTM outputs 79 | W_sm = model.add_parameters((nwords, args.HIDDEN_SIZE)) 80 | b_sm = model.add_parameters(nwords) 81 | 82 | # Build the language model graph 83 | def calc_lm_loss(sents): 84 | 85 | dy.renew_cg() 86 | # parameters -> expressions 87 | W_exp = dy.parameter(W_sm) 88 | b_exp = dy.parameter(b_sm) 89 | 90 | # initialize the RNN 91 | f_init = RNN.initial_state() 92 | 93 | # get the wids and masks for each step 94 | tot_words = 0 95 | wids = [] 96 | masks = [] 97 | for i in range(len(sents[0])): 98 | wids.append([ 99 | (sent[i] if len(sent)>i else S) for sent in sents]) 100 | mask = [(1 if len(sent)>i else 0) for sent in sents] 101 | masks.append(mask) 102 | tot_words += sum(mask) 103 | 104 | # start the rnn by inputting "" 105 | init_ids = [S] * len(sents) 106 | sequence = [init_ids] 107 | sequence.extend(wids[:-1]) # no need to enter the last element, which is EOS or equiv. 108 | outputs = f_init.transduce([dy.lookup_batch(WORDS_LOOKUP, x) for x in sequence]) 109 | if BATCHED_SENT: 110 | scores = unbatch(W_exp*batch(outputs)+b_exp) 111 | else: 112 | scores = [b_exp+(W_exp * o) for o in outputs] 113 | assert(len(scores)==len(wids)) 114 | losses = [dy.pickneglogsoftmax_batch(score, wid) for (score,wid) in zip(scores,wids)] 115 | 116 | for i,mask in enumerate(masks): 117 | if mask[-1] != 1: 118 | mask_expr = dy.inputVector(mask) 119 | mask_expr = dy.reshape(mask_expr, (1,), len(sents)) 120 | losses[i] = losses[i] * mask_expr 121 | 122 | return dy.sum_batches(dy.esum(losses)), tot_words 123 | 124 | # Sort training sentences in descending order and count minibatches 125 | train.sort(key=lambda x: -len(x)) 126 | test.sort(key=lambda x: -len(x)) 127 | train_order = [x*args.MB_SIZE for x in range(int((len(train)-1)/args.MB_SIZE + 1))] 128 | test_order = [x*args.MB_SIZE for x in range(int((len(test)-1)/args.MB_SIZE + 1))] 129 | 130 | print ("startup time: %r" % (time.time() - start)) 131 | # Perform training 132 | start = time.time() 133 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 134 | for ITER in range(100): 135 | random.shuffle(train_order) 136 | for sid in train_order: 137 | i += 1 138 | if i % int(500/args.MB_SIZE) == 0: 139 | trainer.status() 140 | print (this_loss / this_words, file=sys.stderr) 141 | all_tagged += this_words 142 | this_loss = this_words = 0 143 | all_time = time.time() - start 144 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 145 | dev_start = time.time() 146 | dev_loss = dev_words = 0 147 | for sid in test_order: 148 | loss_exp, mb_words = calc_lm_loss(test[sid:sid+args.MB_SIZE]) 149 | dev_loss += loss_exp.scalar_value() 150 | dev_words += mb_words 151 | dev_time += 
time.time() - dev_start 152 | train_time = time.time() - start - dev_time 153 | print ("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % (dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words, train_time, all_tagged/train_time)) 154 | if all_time > args.TIMEOUT: 155 | sys.exit(0) 156 | # train on the minibatch 157 | loss_exp, mb_words = calc_lm_loss(train[sid:sid+args.MB_SIZE]) 158 | this_loss += loss_exp.scalar_value() 159 | # print("loss @ %r: %r" % (i, this_loss)) 160 | this_words += mb_words 161 | loss_exp.backward() 162 | trainer.update() 163 | print ("epoch %r finished" % ITER) 164 | trainer.update_epoch(1.0) 165 | -------------------------------------------------------------------------------- /dynet-py/rnnlm-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | import random 7 | import math 8 | import sys 9 | import argparse 10 | 11 | import dynet as dy 12 | import numpy as np 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--dynet-seed", default=0, type=int) 16 | parser.add_argument("--dynet-mem", default=512, type=int) 17 | parser.add_argument("--dynet-gpus", default=0, type=int) 18 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 19 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | # format of files: each line is "word1/tag2 word2/tag2 ..." 26 | train_file="data/text/train.txt" 27 | test_file="data/text/dev.txt" 28 | 29 | w2i = defaultdict(lambda: len(w2i)) 30 | 31 | def read(fname): 32 | """ 33 | Read a file where each line is of the form "word1 word2 ..." 34 | Yields lists of the form [word1, word2, ...] 
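    Each word is mapped to an integer id via the module-level w2i dictionary, and a
    sentence-final symbol id is appended to every sentence.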
35 | """ 36 | with open(fname, "r") as fh: 37 | for line in fh: 38 | sent = [w2i[x] for x in line.strip().split()] 39 | sent.append(w2i[""]) 40 | yield sent 41 | 42 | train=list(read(train_file)) 43 | nwords = len(w2i) 44 | test=list(read(test_file)) 45 | S = w2i[""] 46 | assert(nwords == len(w2i)) 47 | 48 | # DyNet Starts 49 | 50 | model = dy.Model() 51 | trainer = dy.AdamTrainer(model) 52 | trainer.set_clip_threshold(-1.0) 53 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 54 | 55 | # Lookup parameters for word embeddings 56 | WORDS_LOOKUP = model.add_lookup_parameters((nwords, args.EMBED_SIZE)) 57 | 58 | # Word-level LSTM (layers=1, input=64, output=128, model) 59 | RNN = dy.VanillaLSTMBuilder(1, args.EMBED_SIZE, args.HIDDEN_SIZE, model) 60 | 61 | # Softmax weights/biases on top of LSTM outputs 62 | W_sm = model.add_parameters((nwords, args.HIDDEN_SIZE)) 63 | b_sm = model.add_parameters(nwords) 64 | 65 | # Build the language model graph 66 | def calc_lm_loss(sents): 67 | 68 | dy.renew_cg() 69 | # parameters -> expressions 70 | W_exp = dy.parameter(W_sm) 71 | b_exp = dy.parameter(b_sm) 72 | 73 | # initialize the RNN 74 | f_init = RNN.initial_state() 75 | 76 | # get the wids and masks for each step 77 | tot_words = 0 78 | wids = [] 79 | masks = [] 80 | for i in range(len(sents[0])): 81 | wids.append([ 82 | (sent[i] if len(sent)>i else S) for sent in sents]) 83 | mask = [(1 if len(sent)>i else 0) for sent in sents] 84 | masks.append(mask) 85 | tot_words += sum(mask) 86 | 87 | # start the rnn by inputting "" 88 | init_ids = [S] * len(sents) 89 | s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP,init_ids)) 90 | 91 | # feed word vectors into the RNN and predict the next word 92 | losses = [] 93 | for wid, mask in zip(wids, masks): 94 | # calculate the softmax and loss 95 | score = dy.affine_transform([b_exp, W_exp, s.output()]) 96 | loss = dy.pickneglogsoftmax_batch(score, wid) 97 | # mask the loss if at least one sentence is shorter 98 | if mask[-1] != 1: 99 | mask_expr = dy.inputVector(mask) 100 | mask_expr = dy.reshape(mask_expr, (1,), len(sents)) 101 | loss = loss * mask_expr 102 | losses.append(loss) 103 | # update the state of the RNN 104 | wemb = dy.lookup_batch(WORDS_LOOKUP, wid) 105 | s = s.add_input(wemb) 106 | 107 | return dy.sum_batches(dy.esum(losses)), tot_words 108 | 109 | # Sort training sentences in descending order and count minibatches 110 | train.sort(key=lambda x: -len(x)) 111 | test.sort(key=lambda x: -len(x)) 112 | train_order = [x*args.MB_SIZE for x in range(int((len(train)-1)/args.MB_SIZE + 1))] 113 | test_order = [x*args.MB_SIZE for x in range(int((len(test)-1)/args.MB_SIZE + 1))] 114 | 115 | print ("startup time: %r" % (time.time() - start)) 116 | # Perform training 117 | start = time.time() 118 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 119 | for ITER in range(100): 120 | random.shuffle(train_order) 121 | for sid in train_order: 122 | i += 1 123 | if i % int(500/args.MB_SIZE) == 0: 124 | trainer.status() 125 | print (this_loss / this_words, file=sys.stderr) 126 | all_tagged += this_words 127 | this_loss = this_words = 0 128 | all_time = time.time() - start 129 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 130 | dev_start = time.time() 131 | dev_loss = dev_words = 0 132 | for sid in test_order: 133 | loss_exp, mb_words = calc_lm_loss(test[sid:sid+args.MB_SIZE]) 134 | dev_loss += loss_exp.scalar_value() 135 | dev_words += mb_words 136 | dev_time += time.time() - dev_start 137 | train_time = time.time() 
- start - dev_time 138 | print ("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % (dev_loss/dev_words, math.exp(dev_loss/dev_words), dev_words, train_time, all_tagged/train_time)) 139 | if all_time > args.TIMEOUT: 140 | sys.exit(0) 141 | # train on the minibatch 142 | loss_exp, mb_words = calc_lm_loss(train[sid:sid+args.MB_SIZE]) 143 | this_loss += loss_exp.scalar_value() 144 | # print("loss @ %r: %r" % (i, this_loss)) 145 | this_words += mb_words 146 | loss_exp.backward() 147 | trainer.update() 148 | print ("epoch %r finished" % ITER) 149 | trainer.update_epoch(1.0) 150 | -------------------------------------------------------------------------------- /dynet-py/treenn-bulk.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | import re 6 | import codecs 7 | import sys 8 | from collections import Counter 9 | import random 10 | import argparse 11 | 12 | import numpy as np 13 | import dynet as dy 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--dynet-seed", default=0, type=int) 17 | parser.add_argument("--dynet-mem", default=512, type=int) 18 | parser.add_argument("--dynet-gpus", default=0, type=int) 19 | parser.add_argument("--dynet-exp", default=1, type=int) 20 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 21 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 22 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 23 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 24 | args = parser.parse_args() 25 | 26 | def _tokenize_sexpr(s): 27 | tokker = re.compile(r" +|[()]|[^ ()]+") 28 | toks = [t for t in [match.group(0) for match in tokker.finditer(s)] if t[0] != " "] 29 | return toks 30 | 31 | def _within_bracket(toks): 32 | label = next(toks) 33 | children = [] 34 | for tok in toks: 35 | if tok == "(": 36 | children.append(_within_bracket(toks)) 37 | elif tok == ")": 38 | return Tree(label, children) 39 | else: children.append(Tree(tok, None)) 40 | assert(False),list(toks) 41 | 42 | class Tree(object): 43 | def __init__(self, label, children=None): 44 | self.label = label 45 | self.children = children 46 | 47 | @staticmethod 48 | def from_sexpr(string): 49 | toks = iter(_tokenize_sexpr(string)) 50 | assert next(toks) == "(" 51 | return _within_bracket(toks) 52 | 53 | def __str__(self): 54 | if self.children is None: return self.label 55 | return "[%s %s]" % (self.label, " ".join([str(c) for c in self.children])) 56 | 57 | def isleaf(self): return self.children==None 58 | 59 | def leaves_iter(self): 60 | if self.isleaf(): 61 | yield self 62 | else: 63 | for c in self.children: 64 | for l in c.leaves_iter(): yield l 65 | 66 | def leaves(self): return list(self.leaves_iter()) 67 | 68 | def nonterms_iter(self): 69 | if not self.isleaf(): 70 | yield self 71 | for c in self.children: 72 | for n in c.nonterms_iter(): yield n 73 | 74 | def nonterms(self): return list(self.nonterms_iter()) 75 | 76 | def read_dataset(filename): 77 | return [Tree.from_sexpr(line.strip()) for line in codecs.open(filename,"r")] 78 | 79 | def get_vocabs(trees): 80 | label_vocab = Counter() 81 | word_vocab = Counter() 82 | for tree in trees: 83 | label_vocab.update([n.label for n in tree.nonterms()]) 84 | word_vocab.update([l.label for l in tree.leaves()]) 85 | labels = [x for x,c in label_vocab.items() if c > 0] 86 | words = ["_UNK_"] + [x for x,c in word_vocab.items() if c > 0] 87 | l2i = {l:i 
for i,l in enumerate(labels)} 88 | w2i = {w:i for i,w in enumerate(words)} 89 | return l2i, w2i, labels, words 90 | 91 | class TreeLSTMBuilder(object): 92 | def __init__(self, model, word_vocab, wdim, hdim): 93 | self.WS = [model.add_parameters((hdim, wdim)) for _ in "iou"] 94 | self.US = [model.add_parameters((hdim, 2*hdim)) for _ in "iou"] 95 | self.UFS =[model.add_parameters((hdim, hdim)) for _ in "ff"] 96 | self.BS = [model.add_parameters(hdim) for _ in "iouf"] 97 | self.E = model.add_lookup_parameters((len(word_vocab),wdim)) 98 | self.w2i = word_vocab 99 | 100 | def expr_for_tree(self, tree, decorate=False): 101 | assert(not tree.isleaf()) 102 | if len(tree.children) == 1: 103 | assert(tree.children[0].isleaf()) 104 | emb = self.E[self.w2i.get(tree.children[0].label,0)] 105 | Wi,Wo,Wu = [dy.parameter(w) for w in self.WS] 106 | bi,bo,bu,_ = [dy.parameter(b) for b in self.BS] 107 | #i = dy.logistic(dy.affine_transform([bi, Wi, emb])) 108 | #o = dy.logistic(dy.affine_transform([bo, Wo, emb])) 109 | #u = dy.tanh( dy.affine_transform([bu, Wu, emb])) 110 | i = dy.logistic(bi+Wi*emb) 111 | o = dy.logistic(bo+Wo*emb) 112 | u = dy.tanh( bu+Wu*emb) 113 | c = dy.cmult(i,u) 114 | h = dy.cmult(o,dy.tanh(c)) 115 | if decorate: tree._e = h 116 | return h, c 117 | assert(len(tree.children) == 2),tree.children[0] 118 | e1, c1 = self.expr_for_tree(tree.children[0], decorate) 119 | e2, c2 = self.expr_for_tree(tree.children[1], decorate) 120 | Ui,Uo,Uu = [dy.parameter(u) for u in self.US] 121 | Uf1,Uf2 = [dy.parameter(u) for u in self.UFS] 122 | bi,bo,bu,bf = [dy.parameter(b) for b in self.BS] 123 | e = dy.concatenate([e1,e2]) 124 | i = dy.logistic(bi+Ui*e) 125 | o = dy.logistic(bi+Uo*e) 126 | f1 = dy.logistic(bf+Uf1*e1) 127 | f2 = dy.logistic(bf+Uf2*e2) 128 | u = dy.tanh( bu+Uu*e) 129 | c = dy.cmult(i,u) + dy.cmult(f1,c1) + dy.cmult(f2,c2) 130 | h = dy.cmult(o,dy.tanh(c)) 131 | if decorate: tree._e = h 132 | return h, c 133 | 134 | train = read_dataset("data/trees/train.txt") 135 | dev = read_dataset("data/trees/dev.txt") 136 | 137 | l2i, w2i, i2l, i2w = get_vocabs(train) 138 | 139 | model = dy.Model() 140 | builder = TreeLSTMBuilder(model, w2i, args.WEMBED_SIZE, args.HIDDEN_SIZE) 141 | W_ = model.add_parameters((len(l2i), args.HIDDEN_SIZE)) 142 | trainer = dy.AdamTrainer(model) 143 | trainer.set_clip_threshold(-1.0) 144 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 145 | 146 | print ("startup time: %r" % (time.time() - start)) 147 | sents = 0 148 | all_time = 0 149 | for ITER in range(100): 150 | random.shuffle(train) 151 | closs = 0.0 152 | cwords = 0 153 | start = time.time() 154 | batch = [] 155 | for i,tree in enumerate(train,1): 156 | sents += 1 157 | W = dy.parameter(W_) 158 | h, c = builder.expr_for_tree(tree,True) 159 | nodes = tree.nonterms() 160 | losses = [dy.pickneglogsoftmax(W*nt._e,l2i[nt.label]) for nt in nodes] 161 | loss = dy.esum(losses) 162 | batch.append(loss) 163 | if len(batch) == 50: 164 | loss = dy.esum(batch) 165 | closs += loss.value() 166 | cwords += len(nodes) 167 | loss.backward() 168 | trainer.update() 169 | batch = [] 170 | dy.renew_cg() 171 | if sents % 1000 == 0: 172 | trainer.status() 173 | print (closs / cwords, file=sys.stderr) 174 | closs = 0.0 175 | cwords = 0 176 | all_time += time.time() - start 177 | trainer.update_epoch(1.0) 178 | good = bad = 0.0 179 | 180 | batch = [] 181 | dy.renew_cg() 182 | for tree in dev: 183 | W = dy.parameter(W_) 184 | h, c = builder.expr_for_tree(tree,False) 185 | scores = W*h 186 | batch.append(scores) 187 | if 
len(batch)==50: 188 | dy.esum(batch).forward() # TODO need nicer API for running the list w/o dummy op. 189 | for scores in batch: 190 | pred = i2l[np.argmax(scores.npvalue())] 191 | if pred == tree.label: good += 1 192 | else: bad += 1 193 | batch = [] 194 | dy.renew_cg() 195 | print ("acc=%.4f, time=%.4f, sent_per_sec=%.4f" % (good/(good+bad), all_time, sents/all_time)) 196 | if all_time > args.TIMEOUT: 197 | sys.exit(0) 198 | -------------------------------------------------------------------------------- /dynet-py/treenn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | import re 6 | import codecs 7 | import sys 8 | from collections import Counter 9 | import random 10 | import argparse 11 | 12 | import numpy as np 13 | import dynet as dy 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--dynet-seed", default=0, type=int) 17 | parser.add_argument("--dynet-mem", default=512, type=int) 18 | parser.add_argument("--dynet-gpus", default=0, type=int) 19 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | def _tokenize_sexpr(s): 26 | tokker = re.compile(r" +|[()]|[^ ()]+") 27 | toks = [t for t in [match.group(0) for match in tokker.finditer(s)] if t[0] != " "] 28 | return toks 29 | 30 | def _within_bracket(toks): 31 | label = next(toks) 32 | children = [] 33 | for tok in toks: 34 | if tok == "(": 35 | children.append(_within_bracket(toks)) 36 | elif tok == ")": 37 | return Tree(label, children) 38 | else: children.append(Tree(tok, None)) 39 | assert(False),list(toks) 40 | 41 | class Tree(object): 42 | def __init__(self, label, children=None): 43 | self.label = label 44 | self.children = children 45 | 46 | @staticmethod 47 | def from_sexpr(string): 48 | toks = iter(_tokenize_sexpr(string)) 49 | assert next(toks) == "(" 50 | return _within_bracket(toks) 51 | 52 | def __str__(self): 53 | if self.children is None: return self.label 54 | return "[%s %s]" % (self.label, " ".join([str(c) for c in self.children])) 55 | 56 | def isleaf(self): return self.children==None 57 | 58 | def leaves_iter(self): 59 | if self.isleaf(): 60 | yield self 61 | else: 62 | for c in self.children: 63 | for l in c.leaves_iter(): yield l 64 | 65 | def leaves(self): return list(self.leaves_iter()) 66 | 67 | def nonterms_iter(self): 68 | if not self.isleaf(): 69 | yield self 70 | for c in self.children: 71 | for n in c.nonterms_iter(): yield n 72 | 73 | def nonterms(self): return list(self.nonterms_iter()) 74 | 75 | def read_dataset(filename): 76 | return [Tree.from_sexpr(line.strip()) for line in codecs.open(filename,"r")] 77 | 78 | def get_vocabs(trees): 79 | label_vocab = Counter() 80 | word_vocab = Counter() 81 | for tree in trees: 82 | label_vocab.update([n.label for n in tree.nonterms()]) 83 | word_vocab.update([l.label for l in tree.leaves()]) 84 | labels = [x for x,c in label_vocab.items() if c > 0] 85 | words = ["_UNK_"] + [x for x,c in word_vocab.items() if c > 0] 86 | l2i = {l:i for i,l in enumerate(labels)} 87 | w2i = {w:i for i,w in enumerate(words)} 88 | return l2i, w2i, labels, words 89 | 90 | class TreeLSTMBuilder(object): 91 | def __init__(self, model, word_vocab, wdim, hdim): 92 | self.WS = 
[model.add_parameters((hdim, wdim)) for _ in "iou"] 93 | self.US = [model.add_parameters((hdim, 2*hdim)) for _ in "iou"] 94 | self.UFS =[model.add_parameters((hdim, hdim)) for _ in "ff"] 95 | self.BS = [model.add_parameters(hdim) for _ in "iouf"] 96 | self.E = model.add_lookup_parameters((len(word_vocab),wdim)) 97 | self.w2i = word_vocab 98 | 99 | def expr_for_tree(self, tree, decorate=False): 100 | assert(not tree.isleaf()) 101 | if len(tree.children) == 1: 102 | assert(tree.children[0].isleaf()) 103 | emb = self.E[self.w2i.get(tree.children[0].label,0)] 104 | Wi,Wo,Wu = [dy.parameter(w) for w in self.WS] 105 | bi,bo,bu,_ = [dy.parameter(b) for b in self.BS] 106 | i = dy.logistic(dy.affine_transform([bi, Wi, emb])) 107 | o = dy.logistic(dy.affine_transform([bo, Wo, emb])) 108 | u = dy.tanh( dy.affine_transform([bu, Wu, emb])) 109 | c = dy.cmult(i,u) 110 | h = dy.cmult(o,dy.tanh(c)) 111 | if decorate: tree._e = h 112 | return h, c 113 | assert(len(tree.children) == 2),tree.children[0] 114 | e1, c1 = self.expr_for_tree(tree.children[0], decorate) 115 | e2, c2 = self.expr_for_tree(tree.children[1], decorate) 116 | Ui,Uo,Uu = [dy.parameter(u) for u in self.US] 117 | Uf1,Uf2 = [dy.parameter(u) for u in self.UFS] 118 | bi,bo,bu,bf = [dy.parameter(b) for b in self.BS] 119 | e = dy.concatenate([e1,e2]) 120 | i = dy.logistic(dy.affine_transform([bi, Ui, e])) 121 | o = dy.logistic(dy.affine_transform([bo, Uo, e])) 122 | f1 = dy.logistic(dy.affine_transform([bf, Uf1, e1])) 123 | f2 = dy.logistic(dy.affine_transform([bf, Uf2, e2])) 124 | u = dy.tanh( dy.affine_transform([bu, Uu, e])) 125 | c = dy.cmult(i,u) + dy.cmult(f1,c1) + dy.cmult(f2,c2) 126 | h = dy.cmult(o,dy.tanh(c)) 127 | if decorate: tree._e = h 128 | return h, c 129 | 130 | train = read_dataset("data/trees/train.txt") 131 | dev = read_dataset("data/trees/dev.txt") 132 | 133 | l2i, w2i, i2l, i2w = get_vocabs(train) 134 | 135 | model = dy.Model() 136 | builder = TreeLSTMBuilder(model, w2i, args.WEMBED_SIZE, args.HIDDEN_SIZE) 137 | W_ = model.add_parameters((len(l2i), args.HIDDEN_SIZE)) 138 | trainer = dy.AdamTrainer(model) 139 | trainer.set_clip_threshold(-1.0) 140 | trainer.set_sparse_updates(True if args.SPARSE == 1 else False) 141 | 142 | print ("startup time: %r" % (time.time() - start)) 143 | sents = 0 144 | all_time = 0 145 | for ITER in range(100): 146 | random.shuffle(train) 147 | closs = 0.0 148 | cwords = 0 149 | start = time.time() 150 | for i,tree in enumerate(train,1): 151 | sents += 1 152 | dy.renew_cg() 153 | W = dy.parameter(W_) 154 | h, c = builder.expr_for_tree(tree,True) 155 | nodes = tree.nonterms() 156 | losses = [dy.pickneglogsoftmax(W*nt._e,l2i[nt.label]) for nt in nodes] 157 | loss = dy.esum(losses) 158 | closs += loss.value() 159 | cwords += len(nodes) 160 | loss.backward() 161 | trainer.update() 162 | if sents % 1000 == 0: 163 | trainer.status() 164 | print (closs / cwords, file=sys.stderr) 165 | closs = 0.0 166 | cwords = 0 167 | all_time += time.time() - start 168 | trainer.update_epoch(1.0) 169 | good = bad = 0.0 170 | for tree in dev: 171 | dy.renew_cg() 172 | W = dy.parameter(W_) 173 | h, c = builder.expr_for_tree(tree,False) 174 | pred = i2l[np.argmax((W*h).npvalue())] 175 | if pred == tree.label: good += 1 176 | else: bad += 1 177 | print ("acc=%.4f, time=%.4f, sent_per_sec=%.4f" % (good/(good+bad), all_time, sents/all_time)) 178 | if all_time > args.TIMEOUT: 179 | sys.exit(0) 180 | -------------------------------------------------------------------------------- /make-report.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This should be used as 4 | # mkdir -p report 5 | # grep '\(per_sec\|startup\)' log/*/*.log | python make-report.py 6 | 7 | import sys 8 | import re 9 | from collections import defaultdict 10 | 11 | stats = defaultdict(lambda: {}) 12 | allstats = defaultdict(lambda: []) 13 | 14 | ##### Regexes 15 | fnameregex = re.compile(r"log/([a-z-]+?)(-gpu|)/(dynet-py|dynet-cpp|dynet-seq|chainer|theano|tensorflow)-(.*?)-t([123]).log:(.*)") 16 | startregex = re.compile(r"startup time: (.*)") 17 | eqregex = re.compile(r"(.*)=(.*)") 18 | commentregex = re.compile(r"^ *((#|//).*)?") 19 | 20 | ##### Various data 21 | canonicalize = { 22 | "word_per_sec": "speed", 23 | "words_per_sec": "speed", 24 | "sent_per_sec": "speed", 25 | "nll": "accuracy", 26 | "tag_acc": "accuracy", 27 | "acc": "accuracy", 28 | "time": "time" 29 | } 30 | taskna = { 31 | ("tensorflow", "bilstm-tagger-withchar"): 1, 32 | ("tensorflow", "treenn"): 1, 33 | ("theano", "treenn"): 1, 34 | ("dynet-seq", "bilstm-tagger"): 1, 35 | ("dynet-seq", "bilstm-tagger-withchar"): 1, 36 | ("dynet-seq", "treenn"): 1, 37 | } 38 | toolkits = ["dynet-cpp", "dynet-py", "chainer", "dynet-seq", "theano", "tensorflow"] 39 | prettyname = { 40 | "dynet-cpp": "DyC++", 41 | "dynet-py": "DyPy", 42 | "dynet-seq": "DyC++ Seq", 43 | "tensorflow":"TF", 44 | "chainer": "Chainer", 45 | "theano": "Theano" 46 | } 47 | 48 | ##### Load from log files 49 | for line in sys.stdin: 50 | line = line.replace("rnnlm-seq/dynet-cpp", "rnnlm-batch/dynet-seq") 51 | line = line.replace("rnnlm-seq-gpu/dynet-cpp", "rnnlm-batch-gpu/dynet-seq") 52 | m = re.search(fnameregex, line.strip()) 53 | if m: 54 | task = m.group(1) 55 | device = "gpu" if m.group(2) == "-gpu" else "cpu" 56 | toolkit = m.group(3) 57 | params = m.group(4) 58 | trial = int(m.group(5)) 59 | idtup = (task, device, toolkit, params, trial) 60 | data = m.group(6) 61 | m = re.search(startregex, data) 62 | if m: 63 | stats[idtup]["startup"] = float(m.group(1)) 64 | else: 65 | mystats = {} 66 | for val in data.split(", "): 67 | m = re.search(eqregex, val) 68 | if not m: 69 | print("unmatched line: %s" % line) 70 | sys.exit(1) 71 | if m.group(1) in canonicalize: 72 | can = canonicalize[m.group(1)] 73 | val = float(m.group(2)) 74 | mystats[can] = val 75 | if can == "accuracy": 76 | if "rnnlm" not in task: val *= 100 77 | else: val *= -1 78 | stats[idtup][can] = max(val, stats[idtup].get(can,-1e10)) 79 | else: 80 | stats[idtup][can] = val 81 | allstats[idtup].append(mystats) 82 | else: 83 | print("unmatched line: %s" % line) 84 | sys.exit(1) 85 | # print(stats) 86 | 87 | # def format_num(num): 88 | # if num > 1e6: 89 | # return "%.03gM" % (float(num)/1e6) 90 | # elif num > 1e3: 91 | # return "%.03gk" % (float(num)/1e3) 92 | # else: 93 | # return "%.03g" % float(num) 94 | 95 | # TODO: There must be a better way to do this... 
96 | def format_num(num): 97 | fnum = float(num) 98 | val = "%.03g" % fnum 99 | if fnum >= 1 and fnum < 10: 100 | val = "%.2f" % fnum 101 | elif fnum >= 10 and fnum < 100: 102 | val = "%.1f" % fnum 103 | elif float(num) > 1000: 104 | val = "%.f" % float(val) 105 | return val 106 | 107 | def getmaxstat(task, device, toolkit, setting, stat, mult=1): 108 | my_stats = [] 109 | for trial in range(1,4): 110 | my_id = (task, device, toolkit, setting, trial) 111 | if my_id in stats and stat in stats[my_id]: 112 | my_stats.append(mult*stats[my_id][stat]) 113 | return format_num(mult*max(my_stats)) if len(my_stats) > 0 else "TODO" 114 | def getminstat(task, device, toolkit, setting, stat): 115 | return getmaxstat(task, device ,toolkit, setting, stat, mult=-1) 116 | 117 | ###### First section: toolkit comparison 118 | 119 | # CPU/GPU speeds for all toolkits/tasks 120 | tasks = [ 121 | ("RNNLM (MB=1) ", "rnnlm-batch", "ms01-es128-hs256-sp0"), 122 | ("RNNLM (MB=4)", "rnnlm-batch", "ms04-es128-hs256-sp0"), 123 | ("RNNLM (MB=16)", "rnnlm-batch", "ms16-es128-hs256-sp0"), 124 | ("RNNLM (MB=64)", "rnnlm-batch", "ms64-es128-hs256-sp0"), 125 | ("BiLSTM Tag", "bilstm-tagger", "ws128-hs50-mlps32-su0"), 126 | ("BiLSTM Tag +sparse", "bilstm-tagger", "ws128-hs50-mlps32-su1"), 127 | ("BiLSTM Tag+Char", "bilstm-tagger-withchar", "cs20-ws128-hs50-mlps32-su0"), 128 | ("BiLSTM Tag+Char +sparse", "bilstm-tagger-withchar", "cs20-ws128-hs50-mlps32-su1"), 129 | ("TreeLSTM", "treenn", "ws128-hs128-su0"), 130 | ("TreeLSTM +sparse", "treenn", "ws128-hs128-su1"), 131 | ] 132 | def make_speed_table(device): 133 | print("\\begin{table}") 134 | print("\\begin{tabular}{c|rrr|rrr}") 135 | print(" & "+" & ".join([prettyname[x] for x in toolkits])+" \\\\ \hline") 136 | for name, task, setting in tasks: 137 | cols = [name] 138 | for i, toolkit in enumerate(toolkits): 139 | if (toolkit, task) in taskna: 140 | cols.append("\\multicolumn{1}{c}{-}") 141 | else: 142 | cols.append(getmaxstat(task, device, toolkit, setting, "speed")) 143 | print(" & ".join(cols)+" \\\\") 144 | print("\\end{tabular}") 145 | print("\\caption{Processing speed for each toolkit on %s. 
Speeds are measured in words/sec for RNNLM and Tagger and sentences/sec for TreeLSTM.}" % device.upper()) 146 | print("\\label{tab:speeds%s}" % device) 147 | print("\\end{table}") 148 | print("") 149 | make_speed_table("cpu") 150 | make_speed_table("gpu") 151 | 152 | # Startup time table 153 | tasks = [ 154 | ("RNNLM", "rnnlm-batch", "ms01-es128-hs256-sp0"), 155 | ("BiLSTM Tag", "bilstm-tagger", "ws128-hs50-mlps32-su0"), 156 | ("BiLSTM Tag+Char", "bilstm-tagger-withchar", "cs20-ws128-hs50-mlps32-su0"), 157 | ("TreeLSTM", "treenn", "ws128-hs128-su0"), 158 | ] 159 | print("\\begin{table}") 160 | print("\\begin{tabular}{c|rrr|rrr}") 161 | print(" & "+" & ".join([prettyname[x] for x in toolkits])+" \\\\ \hline") 162 | for name, task, setting in tasks: 163 | cols = [name] 164 | for i, toolkit in enumerate(toolkits): 165 | if (toolkit, task) in taskna: 166 | cols.append("\\multicolumn{1}{c}{-}") 167 | else: 168 | cols.append(getminstat(task, device, toolkit, setting, "startup")) 169 | print(" & ".join(cols)+" \\\\") 170 | print("\\end{tabular}") 171 | print("\\caption{Startup time for programs written in each toolkit.}") 172 | print("\\label{tab:startup}") 173 | print("\\end{table}") 174 | print("") 175 | 176 | # Code complexities 177 | def get_code_complexity(toolkit, task): 178 | chars = 0 179 | if toolkit == "dynet-seq": 180 | if not task == "rnnlm-batch": 181 | return "\\multicolumn{1}{c}{-}" 182 | toolkit = "dynet-cpp" 183 | task = "rnnlm-seq" 184 | if (toolkit, task) in taskna: 185 | return "\\multicolumn{1}{c}{-}" 186 | with open("%s/%s.%s" % (toolkit, task, "cc" if toolkit == "dynet-cpp" else "py"), "r") as f: 187 | for line in f: 188 | line = re.sub(commentregex, "", line.strip()) 189 | chars += len(line) 190 | return str(chars) 191 | 192 | tasks = [ 193 | ("RNNLM", "rnnlm-batch"), 194 | ("BiLSTM Tag", "bilstm-tagger"), 195 | ("BiLSTM Tag+Char", "bilstm-tagger-withchar"), 196 | ("TreeLSTM", "treenn"), 197 | ] 198 | print("\\begin{table}") 199 | print("\\begin{tabular}{c|rrrrrr}") 200 | print(" & "+" & ".join([prettyname[x] for x in toolkits])+" \\\\ \hline") 201 | for name, task in tasks: 202 | cols = [name] 203 | for i, toolkit in enumerate(toolkits): 204 | cols.append(get_code_complexity(toolkit, task)) 205 | print(" & ".join(cols)+" \\\\") 206 | print("\\end{tabular}") 207 | print("\\caption{Number of non-comment characters in the implementation of each toolkit.}") 208 | print("\\label{tab:complexity}") 209 | print("\\end{table}") 210 | print("") 211 | 212 | 213 | ###### Second section: effect of minibatching and net size 214 | 215 | 216 | ###### Third section: effect of sparse update 217 | tasks = [ 218 | ("RNNLM (MB=1) ", "rnnlm-batch", "ms01-es128-hs256-sp"), 219 | ("RNNLM (MB=16)", "rnnlm-batch", "ms16-es128-hs256-sp"), 220 | ("BiLSTM Tag", "bilstm-tagger", "ws128-hs50-mlps32-su"), 221 | ("BiLSTM Tag+Char", "bilstm-tagger-withchar", "cs20-ws128-hs50-mlps32-su"), 222 | ("TreeLSTM", "treenn", "ws128-hs128-su"), 223 | ] 224 | print("\\begin{table}") 225 | print("\\begin{tabular}{c|rr|rr|rr|rr}") 226 | print(" & \\multicolumn{4}{c|}{Speed} & \\multicolumn{4}{c}{Accuracy} \\\\") 227 | print(" & \\multicolumn{2}{c|}{Dense} & \\multicolumn{2}{c|}{Sparse} & \\multicolumn{2}{c|}{Dense} & \\multicolumn{2}{c}{Sparse} \\\\") 228 | print(" & "+" & ".join(["CPU & GPU"] * 4)+" \\\\ \\hline") 229 | for name, task, setting in tasks: 230 | cols = [name] 231 | for criterion in ("speed", "accuracy"): 232 | for ds in ("0", "1"): 233 | for device in ("cpu", "gpu"): 234 | cols.append(getmaxstat(task, 
device, "dynet-cpp", setting+ds, criterion)) 235 | print(" & ".join(cols)+" \\\\") 236 | print("\\end{tabular}") 237 | print("\\caption{Processing speed and accuracy after 10 minutes with dense or sparse updates.}") 238 | print("\\label{tab:sparseresults}") 239 | print("\\end{table}") 240 | print("") 241 | -------------------------------------------------------------------------------- /pytorch/bilstm-tagger-withchar.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import print_function 3 | import time 4 | start = time.time() 5 | 6 | from collections import Counter, defaultdict 7 | import random 8 | import sys 9 | import argparse 10 | import numpy as np 11 | import torch 12 | from torch import nn 13 | from torch import optim 14 | from torch.autograd import Variable 15 | from torch.nn import functional as F 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('CEMBED_SIZE', type=int, help='char embedding size') 20 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 21 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 22 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 23 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 24 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 25 | parser.add_argument('--CUDA', default=1, type=int) 26 | args = parser.parse_args() 27 | 28 | 29 | # format of files: each line is "word1|tag2 word2|tag2 ..." 30 | train_file = "data/tags/train.txt" 31 | dev_file = "data/tags/dev.txt" 32 | 33 | 34 | class Vocab: 35 | 36 | def __init__(self, w2i=None): 37 | if w2i is None: w2i = defaultdict(lambda: len(w2i)) 38 | self.w2i = dict(w2i) 39 | self.i2w = {i: w for w, i in w2i.items()} 40 | 41 | @classmethod 42 | def from_corpus(cls, corpus): 43 | w2i = defaultdict(lambda: len(w2i)) 44 | for sent in corpus: 45 | [w2i[word] for word in sent] 46 | return Vocab(w2i) 47 | 48 | def size(self): 49 | return len(self.w2i.keys()) 50 | 51 | 52 | def read(fname): 53 | """ 54 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 55 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
56 | """ 57 | with open(fname, "r") as fh: 58 | for line in fh: 59 | line = line.strip().split() 60 | sent = [tuple(x.rsplit("|", 1)) for x in line] 61 | yield sent 62 | 63 | 64 | train = list(read(train_file)) 65 | dev = list(read(dev_file)) 66 | words = [] 67 | tags = [] 68 | chars = set() 69 | wc = Counter() 70 | for sent in train: 71 | for w, p in sent: 72 | words.append(w) 73 | tags.append(p) 74 | wc[w] += 1 75 | chars.update(w) 76 | words.append("_UNK_") 77 | chars.add("_UNK_") 78 | chars.add("<*>") 79 | 80 | vw = Vocab.from_corpus([words]) 81 | vt = Vocab.from_corpus([tags]) 82 | vc = Vocab.from_corpus([chars]) 83 | UNK = vw.w2i["_UNK_"] 84 | CUNK = vc.w2i["_UNK_"] 85 | pad_char = vc.w2i["<*>"] 86 | 87 | nwords = vw.size() 88 | ntags = vt.size() 89 | nchars = vc.size() 90 | print ("nwords=%r, ntags=%r, nchars=%r" % (nwords, ntags, nchars)) 91 | 92 | 93 | def get_var(x, volatile=False): 94 | x = Variable(x, volatile=volatile) 95 | return x.cuda() if args.CUDA else x 96 | 97 | 98 | class Model(nn.Module): 99 | 100 | def __init__(self, args): 101 | super(Model, self).__init__() 102 | self.lookup_w = nn.Embedding(nwords, args.WEMBED_SIZE, padding_idx=UNK) 103 | self.lookup_c = nn.Embedding(nchars, args.CEMBED_SIZE, padding_idx=CUNK) 104 | self.lstm = nn.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, 1, bidirectional=True) 105 | self.lstm_c_f = nn.LSTM(args.CEMBED_SIZE, args.WEMBED_SIZE / 2, 1) 106 | self.lstm_c_r = nn.LSTM(args.CEMBED_SIZE, args.WEMBED_SIZE / 2, 1) 107 | self.proj1 = nn.Linear(2 * args.HIDDEN_SIZE, args.MLP_SIZE) 108 | self.proj2 = nn.Linear(args.MLP_SIZE, ntags) 109 | 110 | def forward(self, words, volatile=False): 111 | word_ids = [] 112 | needs_chars = [] 113 | char_ids = [] 114 | for i, w in enumerate(words): 115 | if wc[w] > 5: 116 | word_ids.append(vw.w2i[w]) 117 | else: 118 | word_ids.append(UNK) 119 | needs_chars.append(i) 120 | char_ids.append([pad_char] + [vc.w2i.get(c, CUNK) for c in w] + [pad_char]) 121 | embeddings = self.lookup_w(get_var(torch.LongTensor(word_ids), volatile=volatile)) 122 | if needs_chars: 123 | max_len = max(len(x) for x in char_ids) 124 | fwd_char_ids = [ids + [pad_char for _ in range(max_len - len(ids))] for ids in char_ids] 125 | rev_char_ids = [ids[::-1] + [pad_char for _ in range(max_len - len(ids))] for ids in char_ids] 126 | char_embeddings = torch.cat([ 127 | self.lstm_c_f(self.lookup_c(get_var(torch.LongTensor(fwd_char_ids).t())))[0], 128 | self.lstm_c_r(self.lookup_c(get_var(torch.LongTensor(rev_char_ids).t())))[0] 129 | ], 2) 130 | unk_embeddings = torch.cat([char_embeddings[len(words[j]) + 1, i].unsqueeze(0) for i, j in enumerate(needs_chars)], 0) 131 | embeddings = embeddings.index_add(0, get_var(torch.LongTensor(needs_chars)), unk_embeddings) 132 | return self.proj2(self.proj1(self.lstm(embeddings.unsqueeze(1))[0].squeeze(1))) 133 | 134 | 135 | model = Model(args) 136 | if args.CUDA: 137 | model.cuda() 138 | optimizer = optim.Adam(model.parameters()) 139 | 140 | 141 | print("startup time: %r" % (time.time() - start)) 142 | start = time.time() 143 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 144 | 145 | for ITER in range(100): 146 | random.shuffle(train) 147 | for s in train: 148 | i += 1 149 | if i % 500 == 0: 150 | print(this_loss / this_tagged, file=sys.stderr) 151 | all_tagged += this_tagged 152 | this_loss = this_tagged = 0 153 | all_time = time.time() - start 154 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 155 | dev_start = time.time() 156 | good_sent = bad_sent = good = bad = 0.0 157 
| for sent in dev: 158 | words, golds = zip(*sent) 159 | tags = [vt.i2w[i] for i in model(words, volatile=True).max(1)[1].cpu().data.view(-1)] 160 | if tags == list(golds): good_sent += 1 161 | else: bad_sent += 1 162 | for go, gu in zip(golds, tags): 163 | if go == gu: good += 1 164 | else: bad += 1 165 | dev_time += time.time() - dev_start 166 | train_time = time.time() - start - dev_time 167 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 168 | if all_time > args.TIMEOUT: 169 | sys.exit(0) 170 | # batch / loss 171 | words, golds = zip(*s) 172 | preds = model(words) 173 | loss = F.cross_entropy(preds, get_var(torch.LongTensor([vt.w2i[t] for t in golds]))) 174 | # log / optim 175 | this_loss += loss.data[0]*len(golds) 176 | this_tagged += len(golds) 177 | optimizer.zero_grad() 178 | loss.backward() 179 | optimizer.step() 180 | print("epoch %r finished" % ITER) 181 | -------------------------------------------------------------------------------- /pytorch/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from __future__ import print_function 3 | import time 4 | start = time.time() 5 | 6 | from collections import Counter, defaultdict 7 | import random 8 | import sys 9 | import argparse 10 | import numpy as np 11 | import torch 12 | from torch import nn 13 | from torch import optim 14 | from torch.autograd import Variable 15 | from torch.nn import functional as F 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('WEMBED_SIZE', type=int) 20 | parser.add_argument('HIDDEN_SIZE', type=int) 21 | parser.add_argument('MLP_SIZE', type=int) 22 | parser.add_argument('SPARSE', type=int) 23 | parser.add_argument('TIMEOUT', type=int) 24 | parser.add_argument('--CUDA', default=-1, type=int) 25 | args = parser.parse_args() 26 | 27 | 28 | # format of files: each line is "word1|tag2 word2|tag2 ..." 29 | train_file = "data/tags/train.txt" 30 | dev_file = "data/tags/dev.txt" 31 | 32 | 33 | class Vocab: 34 | 35 | def __init__(self, w2i=None): 36 | if w2i is None: w2i = defaultdict(lambda: len(w2i)) 37 | self.w2i = dict(w2i) 38 | self.i2w = {i: w for w, i in w2i.items()} 39 | 40 | @classmethod 41 | def from_corpus(cls, corpus): 42 | w2i = defaultdict(lambda: len(w2i)) 43 | for sent in corpus: 44 | [w2i[word] for word in sent] 45 | return Vocab(w2i) 46 | 47 | def size(self): 48 | return len(self.w2i.keys()) 49 | 50 | 51 | def read(fname): 52 | """ 53 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 54 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
55 | """ 56 | with open(fname, "r") as fh: 57 | for line in fh: 58 | line = line.strip().split() 59 | sent = [tuple(x.rsplit("|", 1)) for x in line] 60 | yield sent 61 | 62 | 63 | train = list(read(train_file)) 64 | dev = list(read(dev_file)) 65 | words = [] 66 | tags = [] 67 | wc = Counter() 68 | for sent in train: 69 | for w, p in sent: 70 | words.append(w) 71 | tags.append(p) 72 | wc[w] += 1 73 | words.append("_UNK_") 74 | 75 | vw = Vocab.from_corpus([words]) 76 | vt = Vocab.from_corpus([tags]) 77 | UNK = vw.w2i["_UNK_"] 78 | nwords = vw.size() 79 | ntags = vt.size() 80 | print("nwords=%r, ntags=%r" % (nwords, ntags)) 81 | 82 | 83 | def get_var(x, volatile=False): 84 | x = Variable(x, volatile=volatile) 85 | return x.cuda() if args.CUDA else x 86 | 87 | 88 | class Model(nn.Module): 89 | 90 | def __init__(self, args): 91 | super(Model, self).__init__() 92 | self.lookup = nn.Embedding(nwords, args.WEMBED_SIZE) 93 | self.lstm = nn.LSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, 1, bidirectional=True) 94 | self.proj1 = nn.Linear(2 * args.HIDDEN_SIZE, args.MLP_SIZE) 95 | self.proj2 = nn.Linear(args.MLP_SIZE, ntags) 96 | 97 | def forward(self, x): 98 | return nn.functional.softmax(nn.functional.tanh(self.proj2(self.proj1(self.lstm(self.lookup(x).unsqueeze(1))[0].squeeze(1))))) 99 | 100 | 101 | model = Model(args) 102 | if args.CUDA: 103 | model.cuda() 104 | optimizer = optim.Adam(model.parameters()) 105 | 106 | 107 | print("startup time: %r" % (time.time() - start)) 108 | start = time.time() 109 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 110 | 111 | for ITER in range(100): 112 | random.shuffle(train) 113 | for s in train: 114 | i += 1 115 | if i % 500 == 0: 116 | print(this_loss / this_tagged, file=sys.stderr) 117 | all_tagged += this_tagged 118 | this_loss = this_tagged = 0 119 | all_time = time.time() - start 120 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 121 | dev_start = time.time() 122 | good_sent = bad_sent = good = bad = 0.0 123 | for sent in dev: 124 | words = [vw.w2i[w] if wc[w] > 5 else UNK for w, _ in sent] 125 | golds = [t for w, t in sent] 126 | tags = [vt.i2w[i] for i in model(get_var(torch.LongTensor(words), volatile=True)).max(1)[1].cpu().data.view(-1)] 127 | if tags == golds: good_sent += 1 128 | else: bad_sent += 1 129 | for go, gu in zip(golds, tags): 130 | if go == gu: good += 1 131 | else: bad += 1 132 | dev_time += time.time() - dev_start 133 | train_time = time.time() - start - dev_time 134 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 135 | if all_time > args.TIMEOUT: 136 | sys.exit(0) 137 | # batch / loss 138 | words = [vw.w2i[w] if wc[w] > 5 else UNK for w, _ in s] 139 | golds = [vt.w2i[t] for _, t in s] 140 | preds = model(get_var(torch.LongTensor(words))) 141 | loss = F.cross_entropy(preds, get_var(torch.LongTensor(golds))) 142 | # log / optim 143 | this_loss += loss.data[0]*len(golds) 144 | this_tagged += len(golds) 145 | optimizer.zero_grad() 146 | loss.backward() 147 | optimizer.step() 148 | print("epoch %r finished" % ITER) 149 | -------------------------------------------------------------------------------- /pytorch/rnnlm.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | import sys 4 | import random 5 | import argparse 6 | from itertools import count 7 | from collections import defaultdict 8 | 9 | import 
numpy as np 10 | import torch 11 | from torch import nn 12 | from torch import optim 13 | from torch.autograd import Variable 14 | 15 | start = time.time() 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 19 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | parser.add_argument('--CUDA', type=int, default=-1, help='use CUDA') 24 | args = parser.parse_args() 25 | 26 | train_file = 'data/text/train.txt' 27 | test_file = 'data/text/dev.txt' 28 | 29 | def read(fname): 30 | """ 31 | Read a file where each line is of the form "word1 word2 ..." 32 | Yields lists of the form [word1, word2, ...] 33 | """ 34 | with open(fname, "r") as fh: 35 | for line in fh: 36 | sent = [w2i[x] for x in line.strip().split()] 37 | sent.append(w2i[""]) 38 | yield torch.LongTensor(sent) 39 | 40 | w2i = defaultdict(count(0).next) 41 | mask = w2i[''] 42 | assert mask == 0 43 | train = list(read(train_file)) 44 | vocab_size = len(w2i) 45 | test = list(read(test_file)) 46 | S = w2i[''] 47 | 48 | def get_batch(sequences, volatile=False): 49 | lengths = torch.LongTensor([len(s) for s in sequences]) 50 | batch = torch.LongTensor(lengths.max(), len(sequences)).fill_(mask) 51 | for i, s in enumerate(sequences): 52 | batch[:len(s), i] = s 53 | if args.CUDA: 54 | batch = batch.cuda() 55 | return Variable(batch, volatile=volatile), lengths 56 | 57 | class RNNLM(nn.Module): 58 | def __init__(self): 59 | super(RNNLM, self).__init__() 60 | self.embeddings = nn.Embedding(vocab_size, args.EMBED_SIZE) 61 | self.rnn = nn.LSTM(args.EMBED_SIZE, args.HIDDEN_SIZE) 62 | self.proj = nn.Linear(args.HIDDEN_SIZE, vocab_size) 63 | def forward(self, sequences): 64 | rnn_output, _ = self.rnn(self.embeddings(sequences)) 65 | return self.proj(rnn_output.view(-1, args.HIDDEN_SIZE)) 66 | 67 | # build the model 68 | rnnlm = RNNLM() 69 | optimizer = optim.Adam(rnnlm.parameters(), lr=0.001) 70 | weight = torch.FloatTensor(vocab_size).fill_(1) 71 | weight[mask] = 0 72 | loss_fn = nn.CrossEntropyLoss(weight, size_average=False) 73 | if args.CUDA: 74 | rnnlm.cuda() 75 | loss_fn.cuda() 76 | 77 | # Sort training sentences in descending order and count minibatches 78 | train.sort(key=lambda x: -len(x)) 79 | test.sort(key=lambda x: -len(x)) 80 | train_order = range(0, len(train), args.MB_SIZE) # [x*args.MB_SIZE for x in range(int((len(train)-1)/args.MB_SIZE + 1))] 81 | test_order = range(0, len(test), args.MB_SIZE) # [x*args.MB_SIZE for x in range(int((len(test)-1)/args.MB_SIZE + 1))] 82 | 83 | # Perform training 84 | print("startup time: %r" % (time.time() - start)) 85 | start = time.time() 86 | i = total_time = dev_time = total_tagged = current_words = current_loss = 0 87 | 88 | for ITER in range(100): 89 | random.shuffle(train_order) 90 | for sid in train_order: 91 | i += 1 92 | # train 93 | batch, lengths = get_batch(train[sid:sid + args.MB_SIZE]) 94 | scores = rnnlm(batch[:-1]) 95 | loss = loss_fn(scores, batch[1:].view(-1)) 96 | # optimization 97 | optimizer.zero_grad() 98 | loss.backward() 99 | optimizer.step() 100 | # log loss 101 | current_words += lengths.sum() - lengths.size(0) # ignore 102 | current_loss += loss.data[0] 103 | if i % int(500 / args.MB_SIZE) == 0: 104 | print(current_loss / current_words) 105 | total_tagged += current_words 106 | 
current_loss = current_words = 0 107 | total_time = time.time() - start 108 | # log perplexity 109 | if i % int(10000 / args.MB_SIZE) == 0 or total_time > args.TIMEOUT: 110 | dev_start = time.time() 111 | dev_loss = dev_words = 0 112 | for j in test_order: 113 | batch, lengths = get_batch(test[j:j + args.MB_SIZE], volatile=True) 114 | scores = rnnlm(batch[:-1]) 115 | dev_loss += loss_fn(scores, batch[1:].view(-1)).data[0] 116 | dev_words += lengths.sum() - lengths.size(0) # ignore 117 | dev_time += time.time() - dev_start 118 | train_time = time.time() - start - dev_time 119 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % ( 120 | dev_loss / dev_words, np.exp(dev_loss / dev_words), dev_words, train_time, total_tagged / train_time)) 121 | if total_time > args.TIMEOUT: 122 | sys.exit(0) 123 | 124 | print("epoch %r finished" % ITER) 125 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHON_PATH=${ANACONDA_PATH:-$HOME/usr/local/anaconda3/envs/benchmark2} 4 | export CUDA_PATH=/usr/local/cuda 5 | export DYNET_PATH=${DYNET_PATH:-$HOME/work/dynet} 6 | export LD_LIBRARY_PATH=$DYNET_PATH/build/dynet:$PYTHON_PATH/lib:$CUDA_PATH/lib64 7 | export LIBRARY_PATH=$DYNET_PATH/build/dynet:$PYTHON_PATH/lib:$CUDA_PATH/lib64 8 | export PYTHONPATH=$DYNET_PATH/build/python 9 | PYTHON=python 10 | 11 | DYFLAGS=${DYFLAGS:-"--dynet-mem 4096"} 12 | GPUSUF= 13 | if [[ $# == 1 ]]; then 14 | export CUDA_VISIBLE_DEVICES=$1 15 | export THEANO_FLAGS="device=gpu0,floatX=float32" 16 | DYFLAGS="$DYFLAGS --dynet-gpus 1" 17 | GPUSUF="-gpu" 18 | CGPU=0 19 | else 20 | export THEANO_FLAGS="device=cpu,floatX=float32" 21 | CGPU=-1 22 | fi 23 | 24 | TIMEOUT=${TIMEOUT:-600} 25 | LONGTIMEOUT=${LONGTIMEOUT:-600} 26 | 27 | runcmd() { 28 | LFILE=log/$2$GPUSUF/$4.log 29 | if [[ !
-e $LFILE ]]; then 30 | MYTIMEOUT=$TIMEOUT 31 | if [[ $1 == "dynet-cpp" ]]; then 32 | mycmd="$1/$2$GPUSUF $DYFLAGS" 33 | if [[ $4 =~ dynet-cpp-bs01-ws128-hs256-.* ]] || [[ $4 =~ dynet-cpp-bs16-ws128-hs256-.* ]] || [[ $2 =~ bilstm.* ]] || [[ $2 =~ treenn ]]; then 34 | MYTIMEOUT=$LONGTIMEOUT 35 | fi 36 | elif [[ $1 == "dynet-py" ]]; then 37 | mycmd="$PYTHON -u $1/$2.py $DYFLAGS" 38 | elif [[ $1 == "chainer" ]]; then 39 | mycmd="$PYTHON -u $1/$2.py --chainer_gpu $CGPU" 40 | elif [[ $1 == "tensorflow" ]]; then 41 | mycmd="$PYTHON -u $1/$2.py --gpu" 42 | else 43 | mycmd="$PYTHON -u $1/$2.py" 44 | fi 45 | mkdir -p log/$2$GPUSUF 46 | echo "$mycmd $3 $MYTIMEOUT &> $LFILE" 47 | EXTERNALTIMEOUT=$((MYTIMEOUT+60)) 48 | timeout $EXTERNALTIMEOUT $mycmd $3 $MYTIMEOUT &> $LFILE 49 | fi 50 | } 51 | 52 | NUM_TRIALS=${NUM_TRIALS:-3} 53 | 54 | for trial in `seq $NUM_TRIALS`; do 55 | 56 | if [[ -z "$TASK" || "$TASK" == "rnnlm-batch" ]]; then 57 | # Run rnnlm-batch 58 | for embsize in 128; do 59 | hidsize=$(($embsize*2)) 60 | for mbsize in 64 16 04 01; do 61 | if [[ -z "$MBSIZE" || "$MBSIZE" == "$mbsize" ]]; then 62 | for f in dynet-cpp dynet-py chainer theano tensorflow; do 63 | if [[ $f == "dynet-cpp" ]]; then 64 | runcmd $f rnnlm-seq "$mbsize $embsize $hidsize 0" $f-ms$mbsize-es$embsize-hs$hidsize-sp0-t$trial 65 | fi 66 | runcmd $f rnnlm-batch "$mbsize $embsize $hidsize 0" $f-ms$mbsize-es$embsize-hs$hidsize-sp0-t$trial 67 | done 68 | fi 69 | done 70 | done 71 | fi 72 | 73 | if [[ -z "$TASK" || "$TASK" == "sparse-rnnlm-batch" ]]; then 74 | # run sparse rnnlm-batch on a subset 75 | for embsize in 128; do 76 | hidsize=$(($embsize*2)) 77 | for mbsize in 16 01; do 78 | if [[ -z "$MBSIZE" || "$MBSIZE" == "$mbsize" ]]; then 79 | for f in dynet-cpp dynet-py; do 80 | runcmd $f rnnlm-batch "$mbsize $embsize $hidsize 1" $f-ms$mbsize-es$embsize-hs$hidsize-sp1-t$trial 81 | done 82 | fi 83 | done 84 | done 85 | fi 86 | 87 | if [[ -z "$TASK" || "$TASK" == "bilstm-tagger" ]]; then 88 | # Run bilstm-tagger 89 | wembsize=128 90 | hidsize=50 91 | mlpsize=32 92 | for f in dynet-cpp dynet-py chainer theano tensorflow; do 93 | runcmd $f bilstm-tagger "$wembsize $hidsize $mlpsize 0" $f-ws$wembsize-hs$hidsize-mlps$mlpsize-su0-t$trial 94 | if [[ $f == dynet* ]]; then 95 | runcmd $f bilstm-tagger "$wembsize $hidsize $mlpsize 1" $f-ws$wembsize-hs$hidsize-mlps$mlpsize-su1-t$trial 96 | fi 97 | done 98 | fi 99 | 100 | if [[ -z "$TASK" || "$TASK" == "bilstm-tagger-withchar" ]]; then 101 | # Run bilstm-tagger-withchar 102 | cembsize=20 103 | wembsize=128 104 | hidsize=50 105 | mlpsize=32 106 | for f in dynet-cpp dynet-py theano chainer; do 107 | runcmd $f bilstm-tagger-withchar "$cembsize $wembsize $hidsize $mlpsize 0" $f-cs$cembsize-ws$wembsize-hs$hidsize-mlps$mlpsize-su0-t$trial 108 | if [[ $f == dynet* ]]; then 109 | runcmd $f bilstm-tagger-withchar "$cembsize $wembsize $hidsize $mlpsize 1" $f-cs$cembsize-ws$wembsize-hs$hidsize-mlps$mlpsize-su1-t$trial 110 | fi 111 | done 112 | fi 113 | 114 | if [[ -z "$TASK" || "$TASK" == "treenn" ]]; then 115 | # Run treenn 116 | wembsize=128 117 | hidsize=128 118 | for f in dynet-cpp dynet-py chainer; do 119 | runcmd $f treenn "$wembsize $hidsize 0" $f-ws$wembsize-hs$hidsize-su0-t$trial 120 | if [[ $f == dynet* ]]; then 121 | runcmd $f treenn "$wembsize $hidsize 1" $f-ws$wembsize-hs$hidsize-su1-t$trial 122 | fi 123 | done 124 | fi 125 | 126 | done 127 | -------------------------------------------------------------------------------- /tensorflow/bilstm-tagger.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import math 9 | import sys 10 | import numpy as np 11 | import tensorflow as tf 12 | import argparse 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--gpu', dest='gpu', action='store_true') 16 | parser.set_defaults(gpu=False) 17 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 18 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 19 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 20 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') # sparse updates by default in tensorflow 21 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 22 | args = parser.parse_args() 23 | 24 | NUM_LAYERS = 1 25 | 26 | # format of files: each line is "word1/tag2 word2/tag2 ..." 27 | train_file='data/tags/train.txt' 28 | test_file='data/tags/dev.txt' 29 | 30 | class Vocab: 31 | def __init__(self, w2i=None): 32 | if w2i is None: w2i = defaultdict(count(0).next) 33 | self.w2i = dict(w2i) 34 | self.i2w = {i:w for w,i in w2i.iteritems()} 35 | @classmethod 36 | def from_corpus(cls, corpus): 37 | w2i = defaultdict(count(0).next) 38 | for sent in corpus: 39 | [w2i[word] for word in sent] 40 | return Vocab(w2i) 41 | 42 | def size(self): return len(self.w2i.keys()) 43 | 44 | def read(fname): 45 | """ 46 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 47 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 48 | """ 49 | with file(fname) as fh: 50 | for line in fh: 51 | line = line.strip().split() 52 | sent = [tuple(x.rsplit("|",1)) for x in line] 53 | yield sent 54 | 55 | train=list(read(train_file)) 56 | test=list(read(test_file)) 57 | words=[] 58 | tags=[] 59 | wc=Counter() 60 | for sent in train: 61 | for w,p in sent: 62 | words.append(w) 63 | tags.append(p) 64 | wc[w]+=1 65 | words.append("_UNK_") 66 | 67 | vw = Vocab.from_corpus([words]) 68 | vt = Vocab.from_corpus([tags]) 69 | UNK = vw.w2i["_UNK_"] 70 | 71 | nwords = vw.size() 72 | ntags = vt.size() 73 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 74 | 75 | def get_tags(log_probs): 76 | sent_tags = [] 77 | for word_probs in log_probs: 78 | tag = np.argmax(word_probs, axis=0) 79 | sent_tags.append(tag) 80 | return sent_tags 81 | 82 | if args.gpu: 83 | cpu_or_gpu = '/gpu:0' 84 | else: 85 | cpu_or_gpu = '/cpu:0' 86 | 87 | with tf.device(cpu_or_gpu): 88 | 89 | # Lookup parameters for word embeddings 90 | WORDS_LOOKUP = tf.Variable(tf.random_uniform([nwords, 1, args.WEMBED_SIZE], -1.0, 1.0)) 91 | 92 | mlp_hidden = tf.Variable(tf.random_uniform([args.HIDDEN_SIZE*2, args.MLP_SIZE], -1.0, 1.0)) 93 | mlp_out = tf.Variable(tf.random_uniform([args.MLP_SIZE, ntags], -1.0, 1.0)) 94 | 95 | # input sentence placeholder 96 | words_in = tf.placeholder(tf.int32, [None], name="input_sentence") 97 | golds = tf.placeholder(tf.int32, [None], name="golds") 98 | sent_len = tf.placeholder(tf.int32, shape=(1,), name="sent_len") 99 | 100 | wembs = tf.squeeze(tf.nn.embedding_lookup(WORDS_LOOKUP, words_in), axis=1) 101 | wembs = tf.expand_dims(wembs, axis=0) 102 | wembs.set_shape([1, words_in.get_shape()[0], args.WEMBED_SIZE]) 103 | 104 | # Word-level LSTM (configurable number of layers, input is unspecified, 105 | # but will be equal to the embedding dim, output=128) 106 | 
107 | cell = tf.nn.rnn_cell.BasicLSTMCell(args.HIDDEN_SIZE, forget_bias=0.0, state_is_tuple=True) 108 | cell = tf.nn.rnn_cell.MultiRNNCell([cell] * NUM_LAYERS, state_is_tuple=True) 109 | 110 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell, 111 | cell_bw=cell, 112 | dtype=tf.float32, 113 | sequence_length=sent_len, 114 | inputs=wembs) 115 | 116 | output_fw, output_bw = outputs 117 | output_concat = tf.squeeze(tf.concat(2, [output_fw, output_bw]), axis=0) # (input_length, 2 * HIDDEN_SIZE) 118 | output_concat.set_shape([None, 2*args.HIDDEN_SIZE]) 119 | 120 | # Pass to MLP 121 | mlp_activation = tf.tanh(tf.matmul(output_concat, mlp_hidden)) 122 | mlp_output = tf.matmul(mlp_activation, mlp_out) 123 | 124 | ## calculate the loss 125 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(mlp_output, golds) 126 | loss = tf.reduce_sum(losses) 127 | 128 | optimizer = tf.train.AdamOptimizer().minimize(loss) 129 | print('Graph created.' , file=sys.stderr) 130 | 131 | sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True)) 132 | tf.global_variables_initializer().run() 133 | print('Session initialized.' , file=sys.stderr) 134 | train_losses = [] 135 | print ("startup time: %r" % (time.time() - start)) 136 | start_train = time.time() 137 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 138 | 139 | for ITER in range(100): 140 | random.shuffle(train) 141 | start = time.time() 142 | for s in train: 143 | i += 1 144 | if i % 500 == 0: # print status 145 | print('Updates so far: %d Loss: %f wps: %f' % (i - 1, this_loss / this_tagged, this_tagged/(time.time() - start))) 146 | all_tagged += this_tagged 147 | this_loss = this_tagged = 0 148 | all_time = time.time() - start_train 149 | start = time.time() 150 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 151 | dev_start = time.time() 152 | good_sent = bad_sent = good = bad = 0.0 153 | for sent in test: 154 | x_in = [vw.w2i[w] if wc[w]>5 else UNK for w,_ in sent] 155 | golds_in = [vt.w2i[t] for _,t in sent] 156 | # log_probs = sess.run(mlp_output, feed_dict={words_in: x_in, golds: golds_in, sent_len: [len(sent)]}) 157 | log_probs = mlp_output.eval(feed_dict={words_in: x_in, golds: golds_in, sent_len: [len(sent)]}, session=sess) 158 | tags = get_tags(log_probs) 159 | if tags == golds_in: good_sent += 1 160 | else: bad_sent += 1 161 | for go,gu in zip(golds_in,tags): 162 | if go == gu: good += 1 163 | else: bad += 1 164 | dev_time += time.time() - dev_start 165 | train_time = time.time() - start_train - dev_time 166 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % (good/(good+bad), good_sent/(good_sent+bad_sent), train_time, all_tagged/train_time)) 167 | start = start + (time.time() - dev_start) 168 | if all_time > args.TIMEOUT: 169 | sys.exit(0) 170 | # train on sent 171 | x_in = [vw.w2i[w] if wc[w]>5 else UNK for w,_ in s] 172 | golds_in = [vt.w2i[t] for _,t in s] 173 | train_loss, _ = sess.run([loss, optimizer], feed_dict={words_in: x_in, golds: golds_in, sent_len: [len(s)]}) 174 | this_loss += train_loss 175 | this_tagged += len(golds_in) 176 | print("epoch %r finished" % ITER) 177 | -------------------------------------------------------------------------------- /tensorflow/bow.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start = time.time() 4 | 5 | from collections import defaultdict 6 | from operator import itemgetter 7 | import random 8 | import tensorflow as tf 9 
| import numpy as np 10 | import sys 11 | 12 | # Functions to read in the corpus 13 | w2i = defaultdict(lambda: len(w2i)) 14 | t2i = defaultdict(lambda: len(t2i)) 15 | UNK = w2i[""] 16 | 17 | def read_dataset(filename): 18 | with open(filename, "r") as f: 19 | for line in f: 20 | tag, words = line.lower().strip().split(" ||| ") 21 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 22 | 23 | # Read in the data 24 | train = list(read_dataset("data/classes/train.txt")) 25 | w2i = defaultdict(lambda: UNK, w2i) 26 | dev = list(read_dataset("data/classes/test.txt")) 27 | nwords = len(w2i) 28 | ntags = len(t2i) 29 | EPOCHS = 100 30 | GPU = False 31 | 32 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 33 | # Determine max length across train and dev set 34 | max_length = 0 35 | for sent in train: 36 | if len(sent[0]) > max_length: 37 | max_length = len(sent[0]) 38 | 39 | for sent in dev: 40 | if len(sent[0]) > max_length: 41 | max_length = len(sent[0]) 42 | 43 | def pad(seq, element, length): 44 | r = seq + [element] * (length - len(seq)) 45 | assert len(r) == length 46 | return r 47 | 48 | def main(_): 49 | if GPU: 50 | cpu_or_gpu = '/gpu:0' 51 | else: 52 | cpu_or_gpu = '/cpu:0' 53 | 54 | with tf.device(cpu_or_gpu): 55 | W_sm = tf.Variable(tf.random_uniform([nwords, ntags], -1.0, 1.0)) # Word weights 56 | b_sm = tf.Variable(tf.random_uniform([ntags], -1.0, 1.0)) # Softmax bias 57 | words_in = tf.placeholder(tf.int32, shape=[max_length]) 58 | tags_in = tf.placeholder(tf.int32, shape=[1]) 59 | masks_in = tf.placeholder(tf.float32, shape=[max_length]) 60 | 61 | ##Calculate scores 62 | embs = [tf.expand_dims(tf.nn.embedding_lookup(W_sm, x), axis=1) for x in tf.unstack(words_in)] 63 | embs_concat = tf.concat(1, embs) # embedding matrix 64 | score = tf.mul(embs_concat, masks_in) # truncate padded tokens' embeddings 65 | score = tf.reduce_sum(score, axis=1) 66 | score_out = tf.add(score, b_sm) 67 | 68 | # Add dims to match cross entropy func definition 69 | score_to_loss = tf.expand_dims(score_out, axis=0) 70 | 71 | # Calculate loss 72 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(score_to_loss, tags_in) 73 | # losses = tf.nn.seq2seq.sequence_loss_by_example(tf.unstack(score_to_loss), tf.unstack(tags_in), loss_weights) 74 | loss = tf.reduce_mean(losses) 75 | 76 | optimizer = tf.train.AdamOptimizer().minimize(loss) 77 | 78 | print >>sys.stderr, 'Graph created.' 79 | 80 | sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True)) 81 | tf.global_variables_initializer().run() 82 | print >>sys.stderr, 'Session initialized.' 
83 | 84 | print ("startup time: %r" % (time.time() - start)) 85 | for ITER in range(EPOCHS): 86 | # Perform training 87 | random.shuffle(train) 88 | train_loss = 0.0 89 | start = time.time() 90 | for i, (words, tag) in enumerate(train): 91 | padded_words = pad(words, UNK, max_length) 92 | mask = [1.0] * len(words) + [0.0] * (max_length - len(words)) 93 | _, cur_loss, _ = sess.run([score_out, loss, optimizer], feed_dict={words_in: padded_words, tags_in: [tag], masks_in: mask}) 94 | train_loss += cur_loss 95 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 96 | 97 | # Perform testing 98 | test_correct = 0.0 99 | for words, tag in dev: 100 | padded_words = pad(words, UNK, max_length) 101 | mask = [1.0] * len(words) + [0.0] * (max_length - len(words)) 102 | prob_scores = sess.run(score_out, feed_dict={words_in: padded_words, tags_in: [tag], masks_in: mask}) 103 | predict = np.argmax(prob_scores) 104 | if predict == tag: 105 | test_correct += 1 106 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 107 | 108 | if __name__ == "__main__": 109 | tf.app.run() 110 | -------------------------------------------------------------------------------- /tensorflow/rnnlm-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | start_ = time.time() 4 | 5 | from collections import Counter, defaultdict 6 | from itertools import count 7 | import random 8 | import math 9 | import sys 10 | import argparse 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--gpu', dest='gpu', action='store_true') 17 | parser.set_defaults(gpu=False) 18 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 19 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 20 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 21 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') # sparse updates by default in tensorflow 22 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 23 | args = parser.parse_args() 24 | 25 | NUM_LAYERS = 1 26 | 27 | # format of files: each line is "word1/tag2 word2/tag2 ..." 28 | train_file='data/text/train.txt' 29 | test_file='data/text/dev.txt' 30 | w2i = defaultdict(count(0).next) 31 | eos = '' 32 | 33 | def read(fname): 34 | """ 35 | Read a file where each line is of the form "word1 word2 ..." 36 | Yields lists of the form [word1, word2, ...] 
37 | """ 38 | with file(fname) as fh: 39 | for line in fh: 40 | sent = [w2i[eos]] 41 | sent += [w2i[x] for x in line.strip().split()] 42 | sent.append(w2i[eos]) 43 | yield sent 44 | 45 | train = list(read(train_file)) 46 | nwords = len(w2i) 47 | test = list(read(test_file)) 48 | S = w2i[eos] 49 | assert(nwords == len(w2i)) 50 | 51 | train.sort(key=lambda x: len(x), reverse=True) 52 | test.sort(key=lambda x: len(x), reverse=True) 53 | 54 | if args.MB_SIZE != 0: 55 | train_order = [x*args.MB_SIZE for x in range((len(train)-1)/args.MB_SIZE + 1)] 56 | test_order = [x*args.MB_SIZE for x in range((len(test)-1)/args.MB_SIZE + 1)] 57 | else: 58 | train_order = range(len(train)) 59 | test_order = range(len(test)) 60 | 61 | def pad(seq, element, length): 62 | assert len(seq) <= length 63 | r = seq + [element] * (length - len(seq)) 64 | assert len(r) == length 65 | return r 66 | 67 | if args.gpu: 68 | cpu_or_gpu = '/gpu:0' 69 | else: 70 | cpu_or_gpu = '/cpu:0' 71 | 72 | with tf.device(cpu_or_gpu): 73 | # Lookup parameters for word embeddings 74 | WORDS_LOOKUP = tf.Variable(tf.random_uniform([nwords, 1, args.EMBED_SIZE], -1.0, 1.0)) 75 | 76 | # Word-level LSTM (configurable number of layers, input is unspecified, 77 | # but will be equal to the embedding dim, output=128) 78 | cell = tf.nn.rnn_cell.BasicLSTMCell(args.HIDDEN_SIZE, forget_bias=0.0, state_is_tuple=True) 79 | cell = tf.nn.rnn_cell.MultiRNNCell([cell] * NUM_LAYERS, state_is_tuple=True) 80 | 81 | # input sentence placeholder 82 | x_input = tf.placeholder(tf.int32, [None, None], name="x_input") 83 | x_lens = tf.placeholder(tf.int32, [None], name='x_lens') 84 | 85 | x_embs = tf.squeeze(tf.nn.embedding_lookup(WORDS_LOOKUP, x_input), axis=2) 86 | # Hack to fix shape so dynamic_rnn will accept this as input 87 | x_embs.set_shape([None, None, args.EMBED_SIZE]) 88 | 89 | # Actually run the RNN 90 | outputs, _ = tf.nn.dynamic_rnn(cell, x_embs, sequence_length=x_lens, dtype=tf.float32) 91 | 92 | # Affine transform 93 | output = tf.reshape(tf.concat(1, outputs), [-1, args.HIDDEN_SIZE]) 94 | W_sm = tf.Variable(tf.random_uniform([args.HIDDEN_SIZE, nwords])) 95 | b_sm = tf.Variable(tf.random_uniform([nwords])) 96 | logits = tf.matmul(tf.squeeze(output), W_sm) + b_sm 97 | 98 | # Compute categorical loss 99 | # Don't predict the first input (), and don't worry about the last output (after we've input ) 100 | # losses = tf.nn.sparse_softmax_cross_entropy_with_logits(outputs[:-1], x_input[1:]) 101 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits[:-1], tf.reshape(x_input, [-1])[1:]) 102 | loss = tf.reduce_mean(losses) 103 | optimizer = tf.train.AdamOptimizer().minimize(loss) 104 | 105 | print('Graph created.', file=sys.stderr) 106 | 107 | sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True)) 108 | tf.global_variables_initializer().run() 109 | print('Session initialized.', file=sys.stderr) 110 | 111 | train_losses = [] 112 | print('startup time: %r' % (time.time() - start_)) 113 | i = all_time = dev_time = all_tagged = train_words = 0 114 | start_train = time.time() 115 | for ITER in range(10): 116 | random.shuffle(train_order) 117 | start_ = time.time() 118 | for i, sid in enumerate(train_order, start=1): 119 | if i % int(500 / args.MB_SIZE) == 0: 120 | print('Updates so far: %d Loss: %f wps: %f' % (i - 1, sum(train_losses) / train_words, train_words/(time.time() - start_))) 121 | all_tagged += train_words 122 | train_losses = [] 123 | train_words = 0 124 | all_time = time.time() - start_train 125 | start_ = 
time.time() 126 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 127 | dev_start = time.time() 128 | test_losses = [] 129 | test_words = 0 130 | all_time += time.time() - start_train 131 | print('Testing on dev set...') 132 | for tid in test_order: 133 | t_examples = test[tid:tid+args.MB_SIZE] 134 | x_lens_in = [len(example) for example in t_examples] 135 | x_in = [pad(example, S, max(x_lens_in)) for example in t_examples] 136 | test_loss = sess.run(loss, feed_dict={x_input: x_in, x_lens: x_lens_in}) 137 | tot_words = sum(x_lens_in) - len(x_lens_in) # Subtract out from the denominator - to be in line with other toolkits 138 | test_losses.append(test_loss * tot_words) 139 | test_words += tot_words 140 | nll = sum(test_losses) / test_words 141 | dev_time += time.time() - dev_start 142 | train_time = time.time() - start_train - dev_time 143 | print ('nll=%.4f, ppl=%.4f, time=%.4f, words_per_sec=%.4f' % (nll, math.exp(nll), train_time, all_tagged/train_time), file=sys.stderr) 144 | start_ = start_ + (time.time() - dev_start) 145 | if all_time > args.TIMEOUT: 146 | sys.exit(0) 147 | # train on sent 148 | examples = train[sid : sid+args.MB_SIZE] 149 | x_lens_in = [len(example) for example in examples] 150 | if x_lens_in.count(x_lens_in[0])!=len(x_lens_in): x_in = [pad(example, S, max(x_lens_in)) for example in examples] 151 | else: x_in = examples 152 | train_loss, _ = sess.run([loss, optimizer], feed_dict={x_input: x_in, x_lens: x_lens_in}) 153 | tot_words = sum(x_lens_in) - len(x_lens_in) # Subtract out from the denominator 154 | train_losses.append(train_loss * tot_words) 155 | train_words += tot_words 156 | -------------------------------------------------------------------------------- /theano/README.md: -------------------------------------------------------------------------------- 1 | * rnnlm-batch.py: LSTM RNN language model with (or without) minibatching. Set `batch_size=1` to disable minibatching. 2 | -------------------------------------------------------------------------------- /theano/bilstm-tagger-withchar.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | start = time.time() 4 | 5 | import random 6 | 7 | import theano.tensor as T 8 | import theano 9 | from theano.ifelse import ifelse 10 | import numpy as np 11 | import sys 12 | import argparse 13 | from itertools import chain 14 | 15 | from nn.layers.recurrent import LSTM, BiLSTM 16 | from nn.layers.embeddings import Embedding 17 | from nn.activations import softmax 18 | from nn.optimizers import Adam 19 | from nn.initializations import uniform 20 | 21 | from collections import Counter, defaultdict 22 | from itertools import count 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--dynet_mem", default=512, type=int) 26 | parser.add_argument('CEMBED_SIZE', type=int, help='embedding size') 27 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 28 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 29 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 30 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 31 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 32 | args = parser.parse_args() 33 | 34 | # format of files: each line is "word1|tag2 word2|tag2 ..."
35 | train_file="data/tags/train.txt" 36 | dev_file="data/tags/dev.txt" 37 | 38 | 39 | class Vocab: 40 | def __init__(self, w2i=None): 41 | if w2i is None: w2i = defaultdict(count(0).next) 42 | self.w2i = dict(w2i) 43 | self.i2w = {i:w for w,i in w2i.iteritems()} 44 | 45 | @classmethod 46 | def from_corpus(cls, corpus): 47 | w2i = defaultdict(count(0).next) 48 | for sent in corpus: 49 | [w2i[word] for word in sent] 50 | return Vocab(w2i) 51 | 52 | def size(self): 53 | return len(self.w2i.keys()) 54 | 55 | 56 | def read(fname): 57 | """ 58 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 59 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 60 | """ 61 | with file(fname) as fh: 62 | for line in fh: 63 | line = line.strip().split() 64 | sent = [tuple(x.rsplit("|",1)) for x in line] 65 | yield sent 66 | 67 | 68 | train=list(read(train_file)) 69 | dev=list(read(dev_file)) 70 | words=[] 71 | tags=[] 72 | chars=set() 73 | wc=Counter() 74 | for sent in train: 75 | for w,p in sent: 76 | words.append(w) 77 | tags.append(p) 78 | chars.update(w) 79 | wc[w]+=1 80 | words.append("_UNK_") 81 | chars.add("<*>") 82 | 83 | vw = Vocab.from_corpus([words]) 84 | vt = Vocab.from_corpus([tags]) 85 | vc = Vocab.from_corpus([['_CHAR_MASK_'] + list(chars)]) 86 | UNK = vw.w2i["_UNK_"] 87 | 88 | char_mask = vc.w2i['_CHAR_MASK_'] 89 | # mask of chars must be zero 90 | assert char_mask == 0 91 | 92 | nwords = vw.size() 93 | ntags = vt.size() 94 | nchars = vc.size() 95 | print("nwords=%r, ntags=%r, nchars=%r" % (nwords, ntags, nchars)) 96 | 97 | 98 | def word2id(w): 99 | if wc[w] > 5: 100 | w_index = vw.w2i[w] 101 | return w_index 102 | else: 103 | return UNK 104 | 105 | 106 | def build_tag_graph(): 107 | print('build graph..', file=sys.stderr) 108 | 109 | # (sentence_length) 110 | # word indices for a sentence 111 | x = T.ivector(name='sentence') 112 | 113 | # (sentence_length, max_char_num_per_word) 114 | # character indices for each word in a sentence 115 | x_chars = T.imatrix(name='sent_word_chars') 116 | 117 | # (sentence_length) 118 | # target tag 119 | y = T.ivector(name='tag') 120 | 121 | # Lookup parameters for word embeddings 122 | word_embeddings = Embedding(nwords, args.WEMBED_SIZE, name='word_embeddings') 123 | 124 | # Lookup parameters for character embeddings 125 | char_embeddings = Embedding(nchars, args.CEMBED_SIZE, name='char_embeddings') 126 | 127 | # lstm for encoding word characters 128 | char_lstm = BiLSTM(args.CEMBED_SIZE, int(args.WEMBED_SIZE / 2), name='char_lstm') 129 | 130 | # bi-lstm 131 | lstm = BiLSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, return_sequences=True, name='lstm') 132 | 133 | # MLP 134 | W_mlp_hidden = uniform((args.HIDDEN_SIZE * 2, args.MLP_SIZE), name='W_mlp_hidden') 135 | W_mlp = uniform((args.MLP_SIZE, ntags), name='W_mlp') 136 | 137 | # def get_word_embed_from_chars(word_chars): 138 | # # (max_char_num_per_word, char_embed_dim) 139 | # # (max_char_num_per_word) 140 | # word_char_embeds, word_char_masks = char_embeddings(word_chars, mask_zero=True) 141 | # word_embed = char_lstm(T.unbroadcast(word_char_embeds[None, :, :], 0), mask=T.unbroadcast(word_char_masks[None, :], 0))[0] 142 | # 143 | # return word_embed 144 | 145 | # def word_embed_look_up_step(word_id, word_chars): 146 | # word_embed = ifelse(T.eq(word_id, UNK), 147 | # get_word_embed_from_chars(word_chars), # if it's a unk 148 | # word_embeddings(word_id)) 149 | # 150 | # return word_embed 151 | 152 | word_embed_src = T.eq(x, UNK).astype('float32')[:, None] 153 | 154 | # 
(sentence_length, word_embedding_dim) 155 | word_embed = word_embeddings(x) 156 | 157 | # (sentence_length, max_char_num_per_word, char_embed_dim) 158 | # (sentence_length, max_char_num_per_word) 159 | word_char_embeds, word_char_masks = char_embeddings(x_chars, mask_zero=True) 160 | 161 | # (sentence_length, word_embedding_dim) 162 | word_embed_from_char = char_lstm(word_char_embeds, mask=word_char_masks) 163 | 164 | sent_embed = word_embed_src * word_embed_from_char + (1 - word_embed_src) * word_embed 165 | 166 | # # (sentence_length, embedding_dim) 167 | # sent_embed, _ = theano.scan(word_embed_look_up_step, sequences=[x, x_chars]) 168 | 169 | # (sentence_length, lstm_hidden_dim) 170 | lstm_output = lstm(T.unbroadcast(sent_embed[None, :, :], 0))[0] 171 | 172 | # (sentence_length, ntags) 173 | mlp_output = T.dot(T.tanh(T.dot(lstm_output, W_mlp_hidden)), W_mlp) 174 | 175 | tag_prob = T.log(T.nnet.softmax(mlp_output)) 176 | 177 | tag_nll = - tag_prob[T.arange(tag_prob.shape[0]), y] 178 | 179 | loss = tag_nll.sum() 180 | 181 | params = word_embeddings.params + char_embeddings.params + char_lstm.params + lstm.params + [W_mlp_hidden, W_mlp] 182 | updates = Adam().get_updates(params, loss) 183 | train_loss_func = theano.function([x, x_chars, y], loss, updates=updates) 184 | 185 | # build the decoding graph 186 | decode_func = theano.function([x, x_chars], tag_prob) 187 | 188 | return train_loss_func, decode_func 189 | 190 | 191 | def sent_to_theano_input(sent): 192 | tags = np.asarray([vt.w2i[t] for w, t in sent], dtype='int32') 193 | words = np.asarray([word2id(w) for w, t in sent], dtype='int32') 194 | 195 | max_char_num_per_word = max(len(w) + 2 for w, t in sent) 196 | word_chars = np.zeros((len(words), max_char_num_per_word), dtype='int32') 197 | pad_char = vc.w2i["<*>"] 198 | for i, (word, tag) in enumerate(sent): 199 | word_chars[i, :len(word) + 2] = [pad_char] + [vc.w2i[c] for c in word] + [pad_char] 200 | 201 | return words, word_chars, tags 202 | 203 | 204 | def tag_sent(sent, decode_func): 205 | words, word_chars, ref_tags = sent_to_theano_input(sent) 206 | 207 | # (sentence_length, tag_num) 208 | tag_prob = decode_func(words, word_chars) 209 | 210 | tag_results = tag_prob.argmax(axis=-1) 211 | tag_results = [vt.i2w[tid] for tid in tag_results] 212 | 213 | return tag_results 214 | 215 | train_func, decode_func = build_tag_graph() 216 | 217 | print("startup time: %r" % (time.time() - start)) 218 | start = time.time() 219 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 220 | 221 | for ITER in range(100): 222 | random.shuffle(train) 223 | for s in train: 224 | i += 1 225 | 226 | if i % 500 == 0: # print status 227 | print(this_loss / this_tagged) 228 | all_tagged += this_tagged 229 | this_loss = this_tagged = 0 230 | all_time = time.time() - start 231 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 232 | dev_start = time.time() 233 | all_time += time.time() - start 234 | good_sent = bad_sent = good = bad = 0.0 235 | for sent in dev: 236 | golds = [t for w, t in sent] 237 | 238 | # package words in a batch 239 | tags = tag_sent(sent, decode_func) 240 | 241 | if tags == golds: 242 | good_sent += 1 243 | else: 244 | bad_sent += 1 245 | for go, gu in zip(golds, tags): 246 | if go == gu: 247 | good += 1 248 | else: 249 | bad += 1 250 | 251 | dev_time += time.time() - dev_start 252 | train_time = time.time() - start - dev_time 253 | print("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % ( 254 | good / (good + bad), good_sent / (good_sent + 
bad_sent), train_time, all_tagged / train_time)) 255 | 256 | if all_time > args.TIMEOUT: 257 | sys.exit(0) 258 | 259 | # train on training sentences 260 | 261 | # word indices 262 | # char indices for each word 263 | # gold tags 264 | words, word_chars, tags = sent_to_theano_input(s) 265 | 266 | loss = train_func(words, word_chars, tags) 267 | 268 | this_loss += loss 269 | this_tagged += len(s) 270 | # print('loss: %f' % loss) 271 | 272 | print("epoch %r finished" % ITER) 273 | -------------------------------------------------------------------------------- /theano/bilstm-tagger.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | start = time.time() 4 | 5 | import random 6 | 7 | import theano.tensor as T 8 | import theano 9 | import numpy as np 10 | import sys 11 | import argparse 12 | from itertools import chain 13 | 14 | from nn.layers.recurrent import LSTM, BiLSTM 15 | from nn.layers.embeddings import Embedding 16 | from nn.activations import softmax 17 | from nn.optimizers import Adam 18 | from nn.initializations import uniform 19 | 20 | from collections import Counter, defaultdict 21 | from itertools import count 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument('WEMBED_SIZE', type=int, help='embedding size') 25 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 26 | parser.add_argument('MLP_SIZE', type=int, help='embedding size') 27 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 28 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 29 | args = parser.parse_args() 30 | 31 | MB_SIZE = 1 32 | 33 | # format of files: each line is "word1|tag2 word2|tag2 ..." 34 | train_file="data/tags/train.txt" 35 | dev_file="data/tags/dev.txt" 36 | 37 | 38 | class Vocab: 39 | def __init__(self, w2i=None): 40 | if w2i is None: w2i = defaultdict(count(0).next) 41 | self.w2i = dict(w2i) 42 | self.i2w = {i:w for w,i in w2i.iteritems()} 43 | 44 | @classmethod 45 | def from_corpus(cls, corpus): 46 | w2i = defaultdict(count(0).next) 47 | for sent in corpus: 48 | [w2i[word] for word in sent] 49 | return Vocab(w2i) 50 | 51 | def size(self): 52 | return len(self.w2i.keys()) 53 | 54 | 55 | def read(fname): 56 | """ 57 | Read a POS-tagged file where each line is of the form "word1|tag2 word2|tag2 ..." 58 | Yields lists of the form [(word1,tag1), (word2,tag2), ...] 
59 | """ 60 | with file(fname) as fh: 61 | for line in fh: 62 | line = line.strip().split() 63 | sent = [tuple(x.rsplit("|",1)) for x in line] 64 | yield sent 65 | 66 | 67 | train=list(read(train_file)) 68 | dev=list(read(dev_file)) 69 | words=[] 70 | tags=[] 71 | wc=Counter() 72 | words.append('_MASK_') 73 | for sent in train: 74 | for w,p in sent: 75 | words.append(w) 76 | tags.append(p) 77 | wc[w]+=1 78 | words.append("_UNK_") 79 | 80 | vw = Vocab.from_corpus([words]) 81 | vt = Vocab.from_corpus([tags]) 82 | UNK = vw.w2i["_UNK_"] 83 | 84 | # mask token must be of index 0 85 | assert vw.w2i['_MASK_'] == 0 86 | 87 | nwords = vw.size() 88 | ntags = vt.size() 89 | 90 | print ("nwords=%r, ntags=%r" % (nwords, ntags)) 91 | 92 | 93 | def word2id(w): 94 | if wc[w] > 5: 95 | w_index = vw.w2i[w] 96 | return w_index 97 | else: 98 | return UNK 99 | 100 | 101 | def pad(seq): 102 | """ 103 | pad a mini-batch input with ending zeros 104 | """ 105 | batch_size = len(seq) 106 | max_len = max(len(seq[i]) for i in xrange(batch_size)) 107 | padded_seq = np.zeros((batch_size, max_len), dtype='int32') 108 | for i in xrange(batch_size): 109 | padded_seq[i, :len(seq[i])] = seq[i] 110 | 111 | return padded_seq 112 | 113 | 114 | def build_tag_graph(): 115 | print ('build graph..', file=sys.stderr) 116 | 117 | # (batch_size, sentence_length) 118 | x = T.imatrix(name='sentence') 119 | 120 | # (batch_size, sentence_length) 121 | y = T.imatrix(name='tag') 122 | 123 | # Lookup parameters for word embeddings 124 | embedding_table = Embedding(nwords, args.WEMBED_SIZE) 125 | 126 | # bi-lstm 127 | lstm = BiLSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, return_sequences=True) 128 | 129 | # MLP 130 | W_mlp_hidden = uniform((args.HIDDEN_SIZE * 2, args.MLP_SIZE), name='W_mlp_hidden') 131 | W_mlp = uniform((args.MLP_SIZE, ntags), name='W_mlp') 132 | 133 | # (batch_size, sentence_length, embedding_dim) 134 | sent_embed, sent_mask = embedding_table(x, mask_zero=True) 135 | 136 | # (batch_size, sentence_length, lstm_hidden_dim) 137 | lstm_output = lstm(sent_embed, mask=sent_mask) 138 | 139 | # (batch_size, sentence_length, ntags) 140 | mlp_output = T.dot(T.tanh(T.dot(lstm_output, W_mlp_hidden)), W_mlp) 141 | 142 | # (batch_size * sentence_length, ntags) 143 | mlp_output = mlp_output.reshape((mlp_output.shape[0] * mlp_output.shape[1], -1)) 144 | 145 | tag_prob_f = T.log(T.nnet.softmax(mlp_output)) 146 | 147 | y_f = y.flatten() 148 | mask_f = sent_mask.flatten() 149 | 150 | tag_nll = - tag_prob_f[T.arange(tag_prob_f.shape[0]), y_f] * mask_f 151 | 152 | loss = tag_nll.sum() 153 | 154 | params = embedding_table.params + lstm.params + [W_mlp_hidden, W_mlp] 155 | updates = Adam().get_updates(params, loss) 156 | train_loss_func = theano.function([x, y], loss, updates=updates) 157 | 158 | # build the decoding graph 159 | tag_prob = tag_prob_f.reshape((x.shape[0], x.shape[1], -1)) 160 | decode_func = theano.function([x], tag_prob) 161 | 162 | return train_loss_func, decode_func 163 | 164 | 165 | def data2ids(batch_data): 166 | batch_sent_ids = [[word2id(w) for w, t in sent] for sent in batch_data] 167 | batch_tag_ids = [[vt.w2i[t] for w, t in sent] for sent in batch_data] 168 | 169 | return batch_sent_ids, batch_tag_ids 170 | 171 | 172 | def tag_sent(batch_sents, decode_func): 173 | batch_sent_ids = [[word2id(w) for w in sent] for sent in batch_sents] 174 | batch_sents_x = pad(batch_sent_ids) 175 | batch_sents_len = [len(sent) for sent in batch_sents] 176 | 177 | # (batch_size, sentence_length, tag_num) 178 | tag_prob = 
decode_func(batch_sents_x) 179 | batch_tag_results = [] 180 | 181 | for i, sent in enumerate(batch_sents): 182 | sent_len = batch_sents_len[i] 183 | tag_results = tag_prob[i].argmax(axis=-1)[:sent_len] 184 | tag_results = [vt.i2w[tid] for tid in tag_results] 185 | batch_tag_results.append(tag_results) 186 | 187 | return batch_tag_results 188 | 189 | train_func, decode_func = build_tag_graph() 190 | 191 | batch_num = int(np.ceil(len(train) / float(MB_SIZE))) 192 | batches = [(i * MB_SIZE, min(len(train), (i + 1) * MB_SIZE)) for i in range(0, batch_num)] 193 | 194 | print ("startup time: %r" % (time.time() - start)) 195 | start = time.time() 196 | i = all_time = dev_time = all_tagged = this_tagged = this_loss = 0 197 | 198 | for ITER in range(100): 199 | random.shuffle(train) 200 | for batch_id, (batch_start, batch_end) in enumerate(batches): 201 | i += MB_SIZE 202 | 203 | if i % 500 == 0: # print status 204 | print (this_loss / this_tagged) 205 | all_tagged += this_tagged 206 | this_loss = this_tagged = 0 207 | all_time = time.time() - start 208 | if i % 10000 == 0 or all_time > args.TIMEOUT: # eval on dev 209 | dev_start = time.time() 210 | good_sent = bad_sent = good = bad = 0.0 211 | for sent in dev: 212 | words = [w for w, t in sent] 213 | golds = [t for w, t in sent] 214 | 215 | # package words in a batch 216 | tags = tag_sent([words], decode_func) 217 | tags = tags[0] 218 | 219 | if tags == golds: 220 | good_sent += 1 221 | else: 222 | bad_sent += 1 223 | for go, gu in zip(golds, tags): 224 | if go == gu: 225 | good += 1 226 | else: 227 | bad += 1 228 | 229 | dev_time += time.time() - dev_start 230 | train_time = time.time() - start - dev_time 231 | print ("tag_acc=%.4f, sent_acc=%.4f, time=%.4f, word_per_sec=%.4f" % ( 232 | good / (good + bad), good_sent / (good_sent + bad_sent), train_time, all_tagged / train_time)) 233 | 234 | if all_time > args.TIMEOUT: 235 | sys.exit(0) 236 | 237 | # train on training sentences 238 | 239 | batch_data = train[batch_start:batch_end] 240 | batch_sent_ids, batch_tag_ids = data2ids(batch_data) 241 | 242 | batch_x = pad(batch_sent_ids) 243 | batch_y = pad(batch_tag_ids) 244 | 245 | batch_loss = train_func(batch_x, batch_y) 246 | 247 | this_loss += batch_loss 248 | this_tagged += len(list(chain(*batch_data))) 249 | 250 | print ("epoch %r finished" % ITER) 251 | -------------------------------------------------------------------------------- /theano/bow.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | start = time.time() 4 | 5 | import theano.tensor as T 6 | import theano 7 | import numpy as np 8 | import sys 9 | import random 10 | 11 | from nn.optimizers import SGD, Adam 12 | from nn.initializations import uniform, zero 13 | 14 | from collections import defaultdict 15 | 16 | 17 | # Functions to read in the corpus 18 | w2i = defaultdict(lambda: len(w2i)) 19 | t2i = defaultdict(lambda: len(t2i)) 20 | UNK = w2i[""] 21 | def read_dataset(filename): 22 | with open(filename, "r") as f: 23 | for line in f: 24 | tag, words = line.lower().strip().split(" ||| ") 25 | yield ([w2i[x] for x in words.split(" ")], t2i[tag]) 26 | 27 | # Read in the data 28 | train = list(read_dataset("data/classes/train.txt")) 29 | w2i = defaultdict(lambda: UNK, w2i) 30 | dev = list(read_dataset("data/classes/test.txt")) 31 | nwords = len(w2i) 32 | ntags = len(t2i) 33 | 34 | 35 | # Define the model 36 | W_sm = zero((nwords, ntags)) # Word weights 37 | b_sm = zero((ntags)) # 
Softmax bias 38 | 39 | # bag of words input 40 | x = T.ivector('words') 41 | # gold class 42 | y = T.iscalar('class') 43 | 44 | score = T.sum(W_sm[x], axis=0) + b_sm 45 | # log likelihood 46 | ll = T.log(T.nnet.softmax(score)).flatten() 47 | # negative log likelihood loss 48 | loss = - ll[y] 49 | 50 | params = [W_sm, b_sm] 51 | updates = Adam(lr=0.001).get_updates(params, loss) 52 | 53 | train_func = theano.function([x, y], loss, updates=updates) 54 | test_func = theano.function([x], score) 55 | 56 | print ("startup time: %r" % (time.time() - start)) 57 | for ITER in range(100): 58 | # Perform training 59 | random.shuffle(train) 60 | train_loss = 0.0 61 | start = time.time() 62 | for i, (words, tag) in enumerate(train): 63 | my_loss = train_func(words, tag) 64 | train_loss += my_loss 65 | # print(b_sm.get_value()) 66 | # if i > 5: 67 | # sys.exit(0) 68 | 69 | print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) 70 | 71 | # Perform testing 72 | test_correct = 0.0 73 | for words, tag in dev: 74 | scores = test_func(words) 75 | predict = np.argmax(scores) 76 | if predict == tag: 77 | test_correct += 1 78 | 79 | print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /theano/nn/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'yinpengcheng' 2 | -------------------------------------------------------------------------------- /theano/nn/activations.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def softmax(x): 5 | return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) 6 | 7 | 8 | def time_distributed_softmax(x): 9 | import warnings 10 | warnings.warn("time_distributed_softmax is deprecated. 
Just use softmax!", DeprecationWarning) 11 | return softmax(x) 12 | 13 | 14 | def softplus(x): 15 | return T.nnet.softplus(x) 16 | 17 | 18 | def relu(x): 19 | return T.nnet.relu(x) 20 | 21 | 22 | def tanh(x): 23 | return T.tanh(x) 24 | 25 | 26 | def sigmoid(x): 27 | return T.nnet.sigmoid(x) 28 | 29 | 30 | def hard_sigmoid(x): 31 | return T.nnet.hard_sigmoid(x) 32 | 33 | 34 | def linear(x): 35 | ''' 36 | The function returns the variable that is passed in, so all types work 37 | ''' 38 | return x 39 | 40 | 41 | from .utils.generic_utils import get_from_module 42 | def get(identifier): 43 | return get_from_module(identifier, globals(), 'activation function') 44 | -------------------------------------------------------------------------------- /theano/nn/initializations.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | from .utils.theano_utils import sharedX, shared_zeros, shared_ones 6 | 7 | 8 | def get_fans(shape): 9 | fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:]) 10 | fan_out = shape[1] if len(shape) == 2 else shape[0] 11 | return fan_in, fan_out 12 | 13 | 14 | def uniform(shape, scale=0.1, name=None): 15 | return sharedX(np.random.uniform(low=-scale, high=scale, size=shape), name=name) 16 | 17 | 18 | def normal(shape, scale=0.05, name=None): 19 | return sharedX(np.random.randn(*shape) * scale, name=name) 20 | 21 | 22 | def lecun_uniform(shape): 23 | ''' Reference: LeCun 98, Efficient Backprop 24 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 25 | ''' 26 | fan_in, fan_out = get_fans(shape) 27 | scale = np.sqrt(3. / fan_in) 28 | return uniform(shape, scale) 29 | 30 | 31 | def glorot_normal(shape): 32 | ''' Reference: Glorot & Bengio, AISTATS 2010 33 | ''' 34 | fan_in, fan_out = get_fans(shape) 35 | s = np.sqrt(2. / (fan_in + fan_out)) 36 | return normal(shape, s) 37 | 38 | 39 | def glorot_uniform(shape, name=None): 40 | fan_in, fan_out = get_fans(shape) 41 | s = np.sqrt(6. / (fan_in + fan_out)) 42 | return uniform(shape, s, name=name) 43 | 44 | 45 | def he_normal(shape): 46 | ''' Reference: He et al., http://arxiv.org/abs/1502.01852 47 | ''' 48 | fan_in, fan_out = get_fans(shape) 49 | s = np.sqrt(2. / fan_in) 50 | return normal(shape, s) 51 | 52 | 53 | def he_uniform(shape): 54 | fan_in, fan_out = get_fans(shape) 55 | s = np.sqrt(6. 
/ fan_in) 56 | return uniform(shape, s) 57 | 58 | 59 | def orthogonal(shape, scale=1.1): 60 | ''' From Lasagne 61 | ''' 62 | flat_shape = (shape[0], np.prod(shape[1:])) 63 | a = np.random.normal(0.0, 1.0, flat_shape) 64 | u, _, v = np.linalg.svd(a, full_matrices=False) 65 | # pick the one with the correct shape 66 | q = u if u.shape == flat_shape else v 67 | q = q.reshape(shape) 68 | return sharedX(scale * q[:shape[0], :shape[1]]) 69 | 70 | 71 | def identity(shape, scale=1): 72 | if len(shape) != 2 or shape[0] != shape[1]: 73 | raise Exception("Identity matrix initialization can only be used for 2D square matrices") 74 | else: 75 | return sharedX(scale * np.identity(shape[0])) 76 | 77 | 78 | def zero(shape): 79 | return shared_zeros(shape) 80 | 81 | 82 | def one(shape): 83 | return shared_ones(shape) 84 | 85 | 86 | from .utils.generic_utils import get_from_module 87 | def get(identifier): 88 | return get_from_module(identifier, globals(), 'initialization') 89 | -------------------------------------------------------------------------------- /theano/nn/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/dynet-benchmark/b23f01f6f3c5386f67c9a355d9b25d72deb03ced/theano/nn/layers/__init__.py -------------------------------------------------------------------------------- /theano/nn/layers/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import theano 4 | import theano.tensor as T 5 | import numpy as np 6 | 7 | from nn.utils.theano_utils import * 8 | import nn.initializations as initializations 9 | import nn.activations as activations 10 | 11 | from theano.tensor.shared_randomstreams import RandomStreams 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams 13 | 14 | 15 | class Layer(object): 16 | def __init__(self): 17 | self.params = [] 18 | 19 | def __call__(self, X): 20 | return X 21 | 22 | def get_params(self): 23 | return self.params 24 | 25 | def set_name(self, name): 26 | if name: 27 | for i in range(len(self.params)): 28 | if self.params[i].name is None: 29 | self.params[i].name = '%s_p%d' % (name, i) 30 | else: 31 | self.params[i].name = name + '_' + self.params[i].name 32 | 33 | self.name = name 34 | 35 | 36 | class Dense(Layer): 37 | def __init__(self, input_dim, output_dim, init='glorot_uniform', activation='tanh', name='Dense'): 38 | 39 | super(Dense, self).__init__() 40 | self.init = initializations.get(init) 41 | self.activation = activations.get(activation) 42 | self.input_dim = input_dim 43 | self.output_dim = output_dim 44 | 45 | self.input = T.matrix() 46 | self.W = self.init((self.input_dim, self.output_dim)) 47 | self.b = shared_zeros((self.output_dim)) 48 | 49 | self.params = [self.W, self.b] 50 | 51 | if name is not None: 52 | self.set_name(name) 53 | 54 | def set_name(self, name): 55 | self.W.name = '%s_W' % name 56 | self.b.name = '%s_b' % name 57 | 58 | def __call__(self, X): 59 | output = self.activation(T.dot(X, self.W) + self.b) 60 | return output 61 | 62 | 63 | class Dropout(Layer): 64 | def __init__(self, p, srng, name='dropout'): 65 | super(Dropout, self).__init__() 66 | 67 | assert 0. < p < 1. 68 | 69 | self.p = p 70 | self.srng = srng 71 | 72 | if name is not None: 73 | self.set_name(name) 74 | 75 | def __call__(self, X, train_only=True): 76 | retain_prob = 1. 
- self.p 77 | 78 | X_train = X * self.srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) 79 | X_test = X * retain_prob 80 | 81 | if train_only: 82 | return X_train 83 | else: 84 | return X_train, X_test 85 | -------------------------------------------------------------------------------- /theano/nn/layers/embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .core import Layer 4 | from nn.utils.theano_utils import * 5 | import nn.initializations as initializations 6 | 7 | import nn.activations as activations 8 | from theano.ifelse import ifelse 9 | 10 | class Embedding(Layer): 11 | ''' 12 | Turn positive integers (indexes) into denses vectors of fixed size. 13 | eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]] 14 | 15 | @input_dim: size of vocabulary (highest input integer + 1) 16 | @out_dim: size of dense representation 17 | ''' 18 | def __init__(self, input_dim, output_dim, init='uniform', name=None): 19 | 20 | super(Embedding, self).__init__() 21 | self.init = initializations.get(init) 22 | self.input_dim = input_dim 23 | self.output_dim = output_dim 24 | 25 | self.W = self.init((self.input_dim, self.output_dim)) 26 | 27 | self.params = [self.W] 28 | 29 | if name is not None: 30 | self.set_name(name) 31 | 32 | def get_output_mask(self, X): 33 | return T.ones_like(X, dtype=theano.config.floatX) * (1. - T.eq(X, 0)) 34 | 35 | def __call__(self, X, mask_zero=False): 36 | out = self.W[X] 37 | if mask_zero: 38 | return out, self.get_output_mask(X) 39 | else: 40 | return out 41 | -------------------------------------------------------------------------------- /theano/nn/optimizers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import theano 3 | import theano.tensor as T 4 | 5 | from .utils.theano_utils import shared_zeros, shared_scalar, floatX 6 | from .utils.generic_utils import get_from_module 7 | from six.moves import zip 8 | from theano.sandbox.rng_mrg import MRG_RandomStreams 9 | from theano.tensor.shared_randomstreams import RandomStreams 10 | import math 11 | from nn.utils.config_factory import config 12 | 13 | 14 | def clip_norm(g, c, n): 15 | if c > 0: 16 | g = T.switch(T.ge(n, c), g * c / n, g) 17 | return g 18 | 19 | 20 | def kl_divergence(p, p_hat): 21 | return p_hat - p + p * T.log(p / p_hat) 22 | 23 | 24 | class Optimizer(object): 25 | def __init__(self, **kwargs): 26 | self.__dict__.update(kwargs) 27 | self.updates = [] 28 | 29 | def get_state(self): 30 | return [u[0].get_value() for u in self.updates] 31 | 32 | def set_state(self, value_list): 33 | assert len(self.updates) == len(value_list) 34 | for u, v in zip(self.updates, value_list): 35 | u[0].set_value(floatX(v)) 36 | 37 | def get_updates(self, params, constraints, loss, **kwargs): 38 | raise NotImplementedError 39 | 40 | def get_gradients(self, loss, params, **kwargs): 41 | 42 | grads = T.grad(loss, params, disconnected_inputs='warn', **kwargs) 43 | 44 | if hasattr(self, 'clipnorm') and self.clipnorm > 0: 45 | norm = T.sqrt(sum([T.sum(g ** 2) for g in grads])) 46 | norm = theano.printing.Print('gradient norm::')(norm) 47 | grads = [clip_norm(g, self.clipnorm, norm) for g in grads] 48 | 49 | return grads 50 | 51 | def get_config(self): 52 | return {"name": self.__class__.__name__} 53 | 54 | 55 | class SGD(Optimizer): 56 | 57 | def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, *args, **kwargs): 58 | super(SGD, 
self).__init__(**kwargs) 59 | self.__dict__.update(locals()) 60 | self.iterations = shared_scalar(0) 61 | self.lr = shared_scalar(lr) 62 | self.momentum = shared_scalar(momentum) 63 | 64 | def get_updates(self, params, loss): 65 | grads = self.get_gradients(loss, params) 66 | lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations)) 67 | self.updates = [(self.iterations, self.iterations + 1.)] 68 | 69 | for p, g in zip(params, grads): 70 | m = shared_zeros(p.get_value().shape) # momentum 71 | v = self.momentum * m - lr * g # velocity 72 | self.updates.append((m, v)) 73 | 74 | if self.nesterov: 75 | new_p = p + self.momentum * v - lr * g 76 | else: 77 | new_p = p + v 78 | 79 | self.updates.append((p, new_p)) 80 | return self.updates 81 | 82 | def get_config(self): 83 | return {"name": self.__class__.__name__, 84 | "lr": float(self.lr.get_value()), 85 | "momentum": float(self.momentum.get_value()), 86 | "decay": float(self.decay.get_value()), 87 | "nesterov": self.nesterov} 88 | 89 | 90 | class RMSprop(Optimizer): 91 | def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs): 92 | super(RMSprop, self).__init__(**kwargs) 93 | self.__dict__.update(locals()) 94 | self.lr = shared_scalar(lr) 95 | self.rho = shared_scalar(rho) 96 | 97 | def get_updates(self, params, constraints, loss): 98 | grads = self.get_gradients(loss, params) 99 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 100 | self.updates = [] 101 | 102 | for p, g, a, c in zip(params, grads, accumulators, constraints): 103 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 104 | self.updates.append((a, new_a)) 105 | 106 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 107 | self.updates.append((p, c(new_p))) # apply constraints 108 | return self.updates 109 | 110 | def get_config(self): 111 | return {"name": self.__class__.__name__, 112 | "lr": float(self.lr.get_value()), 113 | "rho": float(self.rho.get_value()), 114 | "epsilon": self.epsilon} 115 | 116 | 117 | class Adagrad(Optimizer): 118 | def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs): 119 | super(Adagrad, self).__init__(**kwargs) 120 | self.__dict__.update(locals()) 121 | self.lr = shared_scalar(lr) 122 | 123 | def get_updates(self, params, constraints, loss): 124 | grads = self.get_gradients(loss, params) 125 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 126 | self.updates = [] 127 | 128 | for p, g, a, c in zip(params, grads, accumulators, constraints): 129 | new_a = a + g ** 2 # update accumulator 130 | self.updates.append((a, new_a)) 131 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 132 | self.updates.append((p, c(new_p))) # apply constraints 133 | return self.updates 134 | 135 | def get_config(self): 136 | return {"name": self.__class__.__name__, 137 | "lr": float(self.lr.get_value()), 138 | "epsilon": self.epsilon} 139 | 140 | 141 | class Adadelta(Optimizer): 142 | ''' 143 | Reference: http://arxiv.org/abs/1212.5701 144 | ''' 145 | def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs): 146 | super(Adadelta, self).__init__(**kwargs) 147 | self.__dict__.update(locals()) 148 | self.lr = shared_scalar(lr) 149 | 150 | def get_updates(self, params, loss): 151 | grads = self.get_gradients(loss, params) 152 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 153 | delta_accumulators = [shared_zeros(p.get_value().shape) for p in params] 154 | self.updates = [] 155 | 156 | for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): 157 
| new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 158 | self.updates.append((a, new_a)) 159 | 160 | # use the new accumulator and the *old* delta_accumulator 161 | update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + 162 | self.epsilon) 163 | 164 | new_p = p - self.lr * update 165 | self.updates.append((p, new_p)) 166 | 167 | # update delta_accumulator 168 | new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2 169 | self.updates.append((d_a, new_d_a)) 170 | return self.updates, grads 171 | 172 | def get_config(self): 173 | return {"name": self.__class__.__name__, 174 | "lr": float(self.lr.get_value()), 175 | "rho": self.rho, 176 | "epsilon": self.epsilon} 177 | 178 | 179 | class Adam(Optimizer): 180 | ''' 181 | Reference: http://arxiv.org/abs/1412.6980v8 182 | 183 | Default parameters follow those provided in the original paper. 184 | ''' 185 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, *args, **kwargs): 186 | super(Adam, self).__init__(**kwargs) 187 | self.__dict__.update(locals()) 188 | self.iterations = shared_scalar(0) 189 | self.lr = shared_scalar(lr) 190 | 191 | def get_updates(self, params, loss, **kwargs): 192 | grads = self.get_gradients(loss, params, **kwargs) 193 | self.updates = [(self.iterations, self.iterations+1.)] 194 | 195 | t = self.iterations + 1 196 | lr_t = self.lr * T.sqrt(1-self.beta_2**t)/(1-self.beta_1**t) 197 | 198 | 199 | for p, g in zip(params, grads): 200 | m = theano.shared(p.get_value() * 0.) # zero init of moment 201 | v = theano.shared(p.get_value() * 0.) # zero init of velocity 202 | 203 | m_t = (self.beta_1 * m) + (1 - self.beta_1) * g 204 | v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g**2) 205 | p_t = p - lr_t * m_t / (T.sqrt(v_t) + self.epsilon) 206 | 207 | self.updates.append((m, m_t)) 208 | self.updates.append((v, v_t)) 209 | self.updates.append((p, p_t)) 210 | 211 | return self.updates 212 | 213 | def get_config(self): 214 | return {"name": self.__class__.__name__, 215 | "lr": float(self.lr.get_value()), 216 | "beta_1": self.beta_1, 217 | "beta_2": self.beta_2, 218 | "epsilon": self.epsilon} 219 | 220 | 221 | # aliases 222 | sgd = SGD 223 | rmsprop = RMSprop 224 | adagrad = Adagrad 225 | adadelta = Adadelta 226 | adam = Adam 227 | 228 | 229 | def get(identifier, kwargs=None): 230 | return get_from_module(identifier, globals(), 'optimizer', instantiate=True, 231 | kwargs=kwargs) 232 | -------------------------------------------------------------------------------- /theano/nn/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neulab/dynet-benchmark/b23f01f6f3c5386f67c9a355d9b25d72deb03ced/theano/nn/utils/__init__.py -------------------------------------------------------------------------------- /theano/nn/utils/config_factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class MetaConfig(type): 5 | def __getitem__(self, key): 6 | return config._config[key] 7 | 8 | def __setitem__(self, key, value): 9 | config._config[key] = value 10 | 11 | 12 | class config(object): 13 | _config = {} 14 | __metaclass__ = MetaConfig 15 | 16 | @staticmethod 17 | def set(key, val): 18 | config._config[key] = val 19 | 20 | @staticmethod 21 | def init_config(file='config.py'): 22 | if len(config._config) > 0: 23 | return 24 | 25 | logging.info('use configuration: %s', file) 26 | data = {} 27 | execfile(file, data) 28 | config._config = data['config'] 
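# --- editorial usage sketch, not part of the original file ---
# Because MetaConfig installs __getitem__/__setitem__ on the class object
# itself (Python 2 metaclass syntax), `config` acts as a process-wide
# key/value store, and init_config() execfile()s a Python file that must
# define a dict literally named `config`. The keys and file name below are
# made up purely for illustration.
if __name__ == '__main__':
    config.set('dropout', 0.5)           # explicit setter
    config['hidden_size'] = 128          # routed through MetaConfig.__setitem__
    print(config['dropout'])             # routed through MetaConfig.__getitem__
    # config.init_config('experiment_config.py')  # hypothetical config module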
-------------------------------------------------------------------------------- /theano/nn/utils/generic_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import time 4 | import sys 5 | import six 6 | import logging 7 | 8 | 9 | def get_from_module(identifier, module_params, module_name, instantiate=False, kwargs=None): 10 | if isinstance(identifier, six.string_types): 11 | res = module_params.get(identifier) 12 | if not res: 13 | raise Exception('Invalid ' + str(module_name) + ': ' + str(identifier)) 14 | if instantiate and not kwargs: 15 | return res() 16 | elif instantiate and kwargs: 17 | return res(**kwargs) 18 | else: 19 | return res 20 | return identifier 21 | 22 | 23 | def make_tuple(*args): 24 | return args 25 | 26 | 27 | def make_batches(size, batch_size): 28 | nb_batch = int(np.ceil(size/float(batch_size))) 29 | return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)] 30 | 31 | 32 | def pad_sequences(sequences, maxlen=None, dtype='int32', 33 | padding='pre', truncating='pre', value=0.): 34 | '''Pads each sequence to the same length: 35 | the length of the longest sequence. 36 | 37 | If maxlen is provided, any sequence longer 38 | than maxlen is truncated to maxlen. 39 | Truncation happens off either the beginning (default) or 40 | the end of the sequence. 41 | 42 | Supports post-padding and pre-padding (default). 43 | 44 | # Arguments 45 | sequences: list of lists where each element is a sequence 46 | maxlen: int, maximum length 47 | dtype: type to cast the resulting sequence. 48 | padding: 'pre' or 'post', pad either before or after each sequence. 49 | truncating: 'pre' or 'post', remove values from sequences larger than 50 | maxlen either in the beginning or in the end of the sequence 51 | value: float, value to pad the sequences to the desired value. 52 | 53 | # Returns 54 | x: numpy array with dimensions (number_of_sequences, maxlen) 55 | ''' 56 | lengths = [len(s) for s in sequences] 57 | 58 | nb_samples = len(sequences) 59 | if maxlen is None: 60 | maxlen = np.max(lengths) 61 | 62 | # take the sample shape from the first non empty sequence 63 | # checking for consistency in the main loop below. 
64 | sample_shape = tuple() 65 | for s in sequences: 66 | if len(s) > 0: 67 | sample_shape = np.asarray(s).shape[1:] 68 | break 69 | 70 | x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype) 71 | for idx, s in enumerate(sequences): 72 | if len(s) == 0: 73 | continue # empty list was found 74 | if truncating == 'pre': 75 | trunc = s[-maxlen:] 76 | elif truncating == 'post': 77 | trunc = s[:maxlen] 78 | else: 79 | raise ValueError('Truncating type "%s" not understood' % truncating) 80 | 81 | # check `trunc` has expected shape 82 | trunc = np.asarray(trunc, dtype=dtype) 83 | if trunc.shape[1:] != sample_shape: 84 | raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' % 85 | (trunc.shape[1:], idx, sample_shape)) 86 | 87 | if padding == 'post': 88 | x[idx, :len(trunc)] = trunc 89 | elif padding == 'pre': 90 | x[idx, -len(trunc):] = trunc 91 | else: 92 | raise ValueError('Padding type "%s" not understood' % padding) 93 | return x 94 | -------------------------------------------------------------------------------- /theano/nn/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import cPickle 4 | 5 | 6 | def serialize_to_file(obj, path, protocol=cPickle.HIGHEST_PROTOCOL): 7 | f = open(path, 'wb') 8 | cPickle.dump(obj, f, protocol=protocol) 9 | f.close() 10 | 11 | 12 | def deserialize_from_file(path): 13 | f = open(path, 'rb') 14 | obj = cPickle.load(f) 15 | f.close() 16 | return obj -------------------------------------------------------------------------------- /theano/nn/utils/theano_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | 7 | def floatX(X): 8 | return np.asarray(X, dtype=theano.config.floatX) 9 | 10 | 11 | def sharedX(X, dtype=theano.config.floatX, name=None): 12 | return theano.shared(np.asarray(X, dtype=dtype), name=name) 13 | 14 | 15 | def shared_zeros(shape, dtype=theano.config.floatX, name=None): 16 | return sharedX(np.zeros(shape), dtype=dtype, name=name) 17 | 18 | 19 | def shared_scalar(val=0., dtype=theano.config.floatX, name=None): 20 | return theano.shared(np.cast[dtype](val)) 21 | 22 | 23 | def shared_ones(shape, dtype=theano.config.floatX, name=None): 24 | return sharedX(np.ones(shape), dtype=dtype, name=name) 25 | 26 | 27 | def alloc_zeros_matrix(*dims): 28 | return T.alloc(np.cast[theano.config.floatX](0.), *dims) 29 | 30 | 31 | def tensor_right_shift(tensor): 32 | temp = T.zeros_like(tensor) 33 | temp = T.set_subtensor(temp[:, 1:, :], tensor[:, :-1, :]) 34 | 35 | return temp 36 | 37 | 38 | def ndim_tensor(ndim, name=None): 39 | if ndim == 1: 40 | return T.vector() 41 | elif ndim == 2: 42 | return T.matrix() 43 | elif ndim == 3: 44 | return T.tensor3() 45 | elif ndim == 4: 46 | return T.tensor4() 47 | return T.matrix(name=name) 48 | 49 | 50 | # get int32 tensor 51 | def ndim_itensor(ndim, name=None): 52 | if ndim == 2: 53 | return T.imatrix(name) 54 | elif ndim == 3: 55 | return T.itensor3(name) 56 | elif ndim == 4: 57 | return T.itensor4(name) 58 | return T.imatrix(name=name) 59 | 60 | 61 | # get int8 tensor 62 | def ndim_btensor(ndim, name=None): 63 | if ndim == 2: 64 | return T.bmatrix(name) 65 | elif ndim == 3: 66 | return T.btensor3(name) 67 | elif ndim == 4: 68 | return T.btensor4(name) 69 | return T.imatrix(name) 
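# --- editorial usage sketch, not part of the original file ---
# tensor_right_shift() shifts a (batch, time, dim) tensor one step along the
# time axis and zero-fills t=0; theano/rnnlm-batch.py builds the same shifted
# input inline with T.set_subtensor when feeding previous-word embeddings to
# its LSTM. Shapes and values below are arbitrary illustration data.
if __name__ == '__main__':
    x = T.tensor3('x')
    shift = theano.function([x], tensor_right_shift(x))
    batch = floatX(np.arange(2 * 3 * 4).reshape((2, 3, 4)))
    out = shift(batch)
    assert np.allclose(out[:, 0, :], 0.)                  # first step zeroed
    assert np.allclose(out[:, 1:, :], batch[:, :-1, :])   # rest shifted right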
-------------------------------------------------------------------------------- /theano/rnnlm-batch.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import time 3 | start = time.time() 4 | 5 | import theano.tensor as T 6 | import theano 7 | import numpy as np 8 | import sys, time 9 | import random 10 | import cProfile 11 | import argparse 12 | from itertools import chain 13 | 14 | from nn.layers.recurrent import LSTM 15 | from nn.layers.embeddings import Embedding 16 | from nn.optimizers import Adam, SGD 17 | from nn.initializations import uniform 18 | 19 | from collections import Counter, defaultdict 20 | from itertools import count 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('MB_SIZE', type=int, help='minibatch size') 24 | parser.add_argument('EMBED_SIZE', type=int, help='embedding size') 25 | parser.add_argument('HIDDEN_SIZE', type=int, help='hidden size') 26 | parser.add_argument('SPARSE', type=int, help='sparse update 0/1') 27 | parser.add_argument('TIMEOUT', type=int, help='timeout in seconds') 28 | args = parser.parse_args() 29 | 30 | train_file = 'data/text/train.txt' 31 | test_file = 'data/text/dev.txt' 32 | 33 | w2i = defaultdict(count(0).next) 34 | 35 | 36 | def read(fname): 37 | """ 38 | Read a file where each line is of the form "word1 word2 ..." 39 | Yields lists of the form [word1, word2, ...] 40 | """ 41 | with file(fname) as fh: 42 | for line in fh: 43 | sent = [w2i[x] for x in line.strip().split()] 44 | sent.append(w2i[""]) 45 | yield sent 46 | 47 | mask = w2i[''] 48 | assert mask == 0 49 | 50 | train = list(read(train_file)) 51 | vocab_size = len(w2i) 52 | test = list(read(test_file)) 53 | S = w2i[''] 54 | 55 | 56 | def pad(seq): 57 | """ 58 | pad a mini-batch input with ending zeros 59 | """ 60 | batch_size = len(seq) 61 | max_len = max(len(seq[i]) for i in xrange(batch_size)) 62 | padded_seq = np.zeros((batch_size, max_len), dtype='int32') 63 | for i in xrange(batch_size): 64 | padded_seq[i, :len(seq[i])] = seq[i] 65 | 66 | return padded_seq 67 | 68 | 69 | def build_graph(): 70 | print('build graph..') 71 | # Lookup parameters for word embeddings 72 | embedding_table = Embedding(vocab_size, args.EMBED_SIZE) 73 | 74 | lstm = LSTM(args.EMBED_SIZE, args.HIDDEN_SIZE, inner_init="identity", return_sequences=True) 75 | 76 | # Softmax weights/biases on top of LSTM outputs 77 | W_sm = uniform((args.HIDDEN_SIZE, vocab_size), scale=.5, name='W_sm') 78 | b_sm = uniform(vocab_size, scale=.5, name='b_sm') 79 | 80 | # (batch_size, sentence_length) 81 | x = T.imatrix(name='sentence') 82 | 83 | # (batch_size, sentence_length, embedding_dim) 84 | sent_embed, sent_mask = embedding_table(x, mask_zero=True) 85 | 86 | lstm_input = T.set_subtensor(T.zeros_like(sent_embed)[:, 1:, :], sent_embed[:, :-1, :]) 87 | lstm_input = T.set_subtensor(lstm_input[:, 0, :], embedding_table(S)[None, :]) 88 | 89 | # (batch_size, sentence_length, output_dim) 90 | lstm_output = lstm(lstm_input) 91 | 92 | # (batch_size, sentence_length, vocab_size) 93 | logits = T.dot(lstm_output, W_sm) + b_sm 94 | logits = T.nnet.softmax(logits.reshape((logits.shape[0] * logits.shape[1], vocab_size))).reshape(logits.shape) 95 | 96 | loss = T.log(logits).reshape((-1, logits.shape[-1])) 97 | # (batch_size * sentence_length) 98 | loss = loss[T.arange(loss.shape[0]), x.flatten()] 99 | # (batch_size, sentence_length) 100 | loss = - loss.reshape((x.shape[0], x.shape[1])) * sent_mask 101 | # loss = loss.sum(axis=-1) 
/ sent_mask.sum(axis=-1) 102 | # loss = -T.mean(loss) 103 | 104 | # loss is the sum of nll over all words over all examples in the mini-batch 105 | loss = loss.sum() 106 | 107 | params = embedding_table.params + lstm.params + [W_sm, b_sm] 108 | updates = Adam(lr=0.001).get_updates(params, loss) 109 | # updates = SGD(lr=0.01).get_updates(params, loss) 110 | train_loss_func = theano.function([x], loss, updates=updates) 111 | test_loss_func = theano.function([x], loss) 112 | 113 | return train_loss_func, test_loss_func 114 | 115 | train_loss_func, test_loss_func = build_graph() 116 | 117 | # Sort training sentences in descending order and count minibatches 118 | train.sort(key=lambda x: -len(x)) 119 | test.sort(key=lambda x: -len(x)) 120 | train_order = [x * args.MB_SIZE for x in range(int((len(train) - 1) / args.MB_SIZE + 1))] 121 | test_order = [x * args.MB_SIZE for x in range(int((len(test) - 1) / args.MB_SIZE + 1))] 122 | 123 | # Perform training 124 | print("startup time: %r" % (time.time() - start)) 125 | start = time.time() 126 | i = all_time = dev_time = all_tagged = this_words = this_loss = 0 127 | for ITER in range(100): 128 | random.shuffle(train_order) 129 | for sid in train_order: 130 | i += 1 131 | if i % int(500 / args.MB_SIZE) == 0: 132 | print(this_loss / this_words) 133 | all_tagged += this_words 134 | this_loss = this_words = 0 135 | all_time = time.time() - start 136 | if i % int(10000 / args.MB_SIZE) == 0 or all_time > args.TIMEOUT: 137 | dev_start = time.time() 138 | dev_loss = dev_words = 0 139 | for test_sid in test_order: 140 | batch_sents = test[test_sid:test_sid + args.MB_SIZE] 141 | batch_sents_x = pad(batch_sents) 142 | 143 | batch_loss = test_loss_func(batch_sents_x) 144 | dev_loss += batch_loss 145 | 146 | mb_words = sum(len(s) for s in batch_sents) 147 | dev_words += mb_words 148 | dev_time += time.time() - dev_start 149 | train_time = time.time() - start - dev_time 150 | print("nll=%.4f, ppl=%.4f, words=%r, time=%.4f, word_per_sec=%.4f" % ( 151 | dev_loss / dev_words, np.exp(dev_loss / dev_words), dev_words, train_time, all_tagged / train_time)) 152 | if all_time > args.TIMEOUT: 153 | sys.exit(0) 154 | 155 | # train on the minibatch 156 | 157 | batch_sents = train[sid:sid + args.MB_SIZE] 158 | batch_sents_x = pad(batch_sents) 159 | 160 | batch_loss = train_loss_func(batch_sents_x) 161 | this_loss += batch_loss 162 | # print("loss @ %r: %r" % (i, this_loss)) 163 | mb_words = sum(len(s) for s in batch_sents) 164 | this_words += mb_words 165 | 166 | print("epoch %r finished" % ITER) 167 | --------------------------------------------------------------------------------
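A note on the batching convention shared by theano/rnnlm-batch.py and theano/bilstm-tagger.py above: word index 0 is reserved for the padding/mask token, pad() right-pads every mini-batch with zeros, and the per-token loss is multiplied by the mask returned by Embedding(..., mask_zero=True), so padded positions contribute nothing to the summed NLL. The stand-alone sketch below is editorial, uses toy numbers, and re-implements pad() locally rather than importing it; it only illustrates the convention.

import numpy as np

def pad(seq):
    # right-pad a list of int lists with zeros, as in theano/rnnlm-batch.py
    max_len = max(len(s) for s in seq)
    padded = np.zeros((len(seq), max_len), dtype='int32')
    for i, s in enumerate(seq):
        padded[i, :len(s)] = s
    return padded

batch = [[5, 2, 7], [3, 9]]              # toy word ids; 0 is the mask id
x = pad(batch)                           # [[5 2 7], [3 9 0]]
mask = (x != 0).astype('float32')        # what Embedding(mask_zero=True) yields
per_token_nll = np.ones_like(mask)       # pretend each real token costs 1 nat
print((per_token_nll * mask).sum())      # 5.0 -- the padded cell is ignored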