├── .gitignore
├── LICENSE
├── README.md
├── chainer-1.4
│   ├── lm_rnn.py
│   ├── mt_s2s_attention.py
│   ├── mt_s2s_encdec.py
│   ├── seg_ffnn.py
│   ├── seg_rnn.py
│   └── util
│       ├── __init__.py
│       ├── chainer_cpu_wrapper.py
│       ├── chainer_gpu_wrapper.py
│       ├── functions.py
│       ├── generators.py
│       ├── model_file.py
│       └── vocabulary.py
└── chainer-1.5
    ├── LSTMVariants.py
    ├── attention_lm.py
    ├── mt_s2s_attention.py
    ├── mt_s2s_encdec.py
    └── util
        ├── __init__.py
        ├── functions.py
        ├── generators.py
        └── vocabulary.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | data
3 | hyp
4 | model
5 | nohup.out
6 | test
7 | my_settings.py
8 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | chainer_examples License
2 | ()
3 |
4 | Copyright (c) 2015~ Yusuke Oda
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Chainer example code for NLP
2 | ============================
3 |
4 | **This repository is out of date and rough. I do not guarantee that this code works correctly.**
5 |
6 | **I am developing a new NMT toolkit [NMTKit](https://github.com/odashi/nmtkit) and strongly recommend using it instead of these samples to train neural translation models.**
7 |
8 | This repository contains some neural network examples
9 | for natural language processing (NLP)
10 | using the **Chainer** framework.
11 |
12 | [Chainer Official](http://chainer.org/ "Chainer official") ([GitHub](https://github.com/pfnet/chainer "Github"))
13 |
14 | Making a Local Environment
15 | ---------------------------
16 | 
17 | Before running these scripts, creating a local Python environment with
18 | `pyenv` is recommended, like:
19 |
20 | $ pyenv install 3.5.0
21 | $ pyenv virtualenv 3.5.0 example
22 | $ pyenv shell example
23 | $ pip install chainer
24 |
25 | Contents
26 | --------
27 |
28 | * **Machine Translation**
29 | * `mt_s2s_encdec.py` - Using encoder-decoder style recurrent neural network
30 | * `mt_s2s_attention.py` - Using attentional neural network
31 |
32 | * **Word Segmentation (Tokenization)**
33 | * `seg_ffnn.py` - Using feedforward neural network
34 | * `seg_rnn.py` - Using recurrent neural network
35 |
36 | * **Language Model**
37 | * `lm_rnn.py` - Using recurrent neural network (RNNLM)
38 |
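For example, the encoder-decoder translator can be trained and then applied to
a test set as follows (file names here are placeholders; models are saved with
a `.NNN` epoch suffix):

    $ python mt_s2s_encdec.py train train.src train.trg model/encdec
    $ python mt_s2s_encdec.py test test.src test.hyp model/encdec.100

Note that the `chainer-1.4` scripts import a local `my_settings` module
(ignored by git); an empty `my_settings.py` is sufficient if you have nothing
to configure.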
39 | Contact
40 | -------
41 |
42 | If you find an issue or have any questions, please contact Yusuke Oda:
43 | * @odashi_t on Twitter (faster than other methods)
44 | * yus.takara (at) gmail.com
45 |
46 |
--------------------------------------------------------------------------------
/chainer-1.4/lm_rnn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | import my_settings
4 |
5 | import datetime
6 | import sys
7 | import math
8 | import numpy as np
9 | from argparse import ArgumentParser
10 | from collections import defaultdict
11 |
12 | from chainer import FunctionSet, Variable, cuda, functions, optimizers
13 |
14 |
15 | def trace(text):
16 | print(datetime.datetime.now(), '...', text, file=sys.stderr)
17 |
18 |
19 | def make_var(array, dtype=np.float32):
20 | #return Variable(np.array(array, dtype=dtype))
21 | return Variable(cuda.to_gpu(np.array(array, dtype=dtype)))
22 |
23 | def get_data(variable):
24 | #return variable.data
25 | return cuda.to_cpu(variable.data)
26 |
27 | def zeros(shape, dtype=np.float32):
28 | #return Variable(np.zeros(shape, dtype=dtype))
29 | return Variable(cuda.zeros(shape, dtype=dtype))
30 |
31 | def make_model(**kwargs):
32 | #return FunctionSet(**kwargs)
33 | return FunctionSet(**kwargs).to_gpu()
34 |
35 |
36 | def make_vocab(filename, vocab_size):
37 | word_freq = defaultdict(lambda: 0)
38 | num_lines = 0
39 | num_words = 0
40 | with open(filename) as fp:
41 | for line in fp:
42 | words = line.split()
43 | num_lines += 1
44 | num_words += len(words)
45 | for word in words:
46 | word_freq[word] += 1
47 |
48 | # 0: <unk>
49 | # 1: <s>
50 | # 2: </s>
51 | vocab = defaultdict(lambda: 0)
52 | vocab['<s>'] = 1
53 | vocab['</s>'] = 2
54 | for i,(k,v) in zip(range(vocab_size - 3), sorted(word_freq.items(), key=lambda x: -x[1])):
55 | vocab[k] = i + 3
56 |
57 | return vocab, num_lines, num_words
58 |
59 |
60 | def generate_batch(filename, batch_size):
61 | with open(filename) as fp:
62 | batch = []
63 | try:
64 | while True:
65 | for i in range(batch_size):
66 | batch.append(next(fp).split())
67 |
68 | max_len = max(len(x) for x in batch)
69 | batch = [['<s>'] + x + ['</s>'] * (max_len - len(x) + 1) for x in batch]
70 | yield batch
71 |
72 | batch = []
73 | except StopIteration:
74 | pass
75 |
76 | if batch:
77 | max_len = max(len(x) for x in batch)
78 | batch = [['<s>'] + x + ['</s>'] * (max_len - len(x) + 1) for x in batch]
79 | yield batch
80 |
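# Illustrative example: with batch_size = 2 and input lines "a b" / "c",
# generate_batch yields [['<s>', 'a', 'b', '</s>'],
#                        ['<s>', 'c', '</s>', '</s>']]
# -- all rows padded to the same length, with at least one trailing '</s>'.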
81 |
82 | def make_rnnlm_model(n_vocab, n_embed, n_hidden):
83 | return make_model(
84 | w_xe = functions.EmbedID(n_vocab, n_embed),
85 | w_eh = functions.Linear(n_embed, n_hidden),
86 | w_hh = functions.Linear(n_hidden, n_hidden),
87 | w_hy = functions.Linear(n_hidden, n_vocab),
88 | )
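# Layer roles in this Elman-style RNNLM (see the training loop in main()):
#   e_t = tanh(w_xe x_t)                  -- word embedding
#   h_t = tanh(w_eh e_t + w_hh h_{t-1})   -- recurrent hidden state
#   y_t = w_hy h_t                        -- logits over the vocabulary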
89 |
90 |
91 | def save_rnnlm_model(filename, n_vocab, n_embed, n_hidden, vocab, model):
92 | fmt = '%.8e'
93 | dlm = ' '
94 |
95 | model.to_cpu()
96 |
97 | with open(filename, 'w') as fp:
98 | print(n_vocab, file=fp)
99 | print(n_embed, file=fp)
100 | print(n_hidden, file=fp)
101 |
102 | for k, v in vocab.items():
103 | if v == 0:
104 | continue
105 | print('%s %d' % (k, v), file=fp)
106 |
107 | for row in model.w_xe.W:
108 | print(dlm.join(fmt % x for x in row), file=fp)
109 |
110 | for row in model.w_eh.W:
111 | print(dlm.join(fmt % x for x in row), file=fp)
112 | print(dlm.join(fmt % x for x in model.w_eh.b), file=fp)
113 |
114 | for row in model.w_hh.W:
115 | print(dlm.join(fmt % x for x in row), file=fp)
116 | print(dlm.join(fmt % x for x in model.w_hh.b), file=fp)
117 |
118 | for row in model.w_hy.W:
119 | print(dlm.join(fmt % x for x in row), file=fp)
120 | print(dlm.join(fmt % x for x in model.w_hy.b), file=fp)
121 |
122 | model.to_gpu()
123 |
124 |
125 | def parse_args():
126 | def_vocab = 40000
127 | def_embed = 200
128 | def_hidden = 200
129 | def_epoch = 10
130 | def_minibatch = 256
131 |
132 | p = ArgumentParser(description='RNNLM trainer')
133 |
134 | p.add_argument('corpus', help='[in] training corpus')
135 | p.add_argument('model', help='[out] model file')
136 | p.add_argument('-V', '--vocab', default=def_vocab, metavar='INT', type=int,
137 | help='vocabulary size (default: %d)' % def_vocab)
138 | p.add_argument('-E', '--embed', default=def_embed, metavar='INT', type=int,
139 | help='embedding layer size (default: %d)' % def_embed)
140 | p.add_argument('-H', '--hidden', default=def_hidden, metavar='INT', type=int,
141 | help='hidden layer size (default: %d)' % def_hidden)
142 | p.add_argument('-I', '--epoch', default=def_epoch, metavar='INT', type=int,
143 | help='number of training epoch (default: %d)' % def_epoch)
144 | p.add_argument('-B', '--minibatch', default=def_minibatch, metavar='INT', type=int,
145 | help='minibatch size (default: %d)' % def_minibatch)
146 |
147 | args = p.parse_args()
148 |
149 | # check args
150 | try:
151 | if (args.vocab < 1): raise ValueError('you must set --vocab >= 1')
152 | if (args.embed < 1): raise ValueError('you must set --embed >= 1')
153 | if (args.hidden < 1): raise ValueError('you must set --hidden >= 1')
154 | if (args.epoch < 1): raise ValueError('you must set --epoch >= 1')
155 | if (args.minibatch < 1): raise ValueError('you must set --minibatch >= 1')
156 | except Exception as ex:
157 | p.print_usage(file=sys.stderr)
158 | print(ex, file=sys.stderr)
159 | sys.exit()
160 |
161 | return args
162 |
163 |
164 | def main():
165 | args = parse_args()
166 |
167 | trace('making vocabulary ...')
168 | vocab, num_lines, num_words = make_vocab(args.corpus, args.vocab)
169 |
170 | trace('initializing CUDA ...')
171 | cuda.init()
172 |
173 | trace('start training ...')
174 | model = make_rnnlm_model(args.vocab, args.embed, args.hidden)
175 |
176 | for epoch in range(args.epoch):
177 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
178 | log_ppl = 0.0
179 | trained = 0
180 |
181 | opt = optimizers.SGD()
182 | opt.setup(model)
183 |
184 | for batch in generate_batch(args.corpus, args.minibatch):
185 | batch = [[vocab[x] for x in words] for words in batch]
186 | K = len(batch)
187 | L = len(batch[0]) - 1
188 |
189 | opt.zero_grads()
190 | s_h = zeros((K, args.hidden))
191 |
192 | for l in range(L):
193 | s_x = make_var([batch[k][l] for k in range(K)], dtype=np.int32)
194 | s_t = make_var([batch[k][l + 1] for k in range(K)], dtype=np.int32)
195 |
196 | s_e = functions.tanh(model.w_xe(s_x))
197 | s_h = functions.tanh(model.w_eh(s_e) + model.w_hh(s_h))
198 | s_y = model.w_hy(s_h)
199 |
200 | loss = functions.softmax_cross_entropy(s_y, s_t)
201 | loss.backward()
202 |
203 | log_ppl += get_data(loss).reshape(()) * K
204 |
205 | opt.update()
206 | trained += K
207 | trace(' %d/%d' % (trained, num_lines))
208 |
209 | log_ppl /= float(num_words)
210 | trace(' log(PPL) = %.10f' % log_ppl)
211 | trace(' PPL = %.10f' % math.exp(log_ppl))
212 |
213 | trace(' writing model ...')
214 | save_rnnlm_model(args.model + '.%d' % (epoch + 1), args.vocab, args.embed, args.hidden, vocab, model)
215 |
216 | trace('training finished.')
217 |
218 |
219 | if __name__ == '__main__':
220 | main()
221 |
222 |
--------------------------------------------------------------------------------
/chainer-1.4/mt_s2s_attention.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | import my_settings
4 |
5 | import sys
6 | import math
7 | import numpy as np
8 | from argparse import ArgumentParser
9 |
10 | from chainer import functions, optimizers
11 | import chainer.computational_graph as cg
12 |
13 | import util.generators as gens
14 | from util.functions import trace, fill_batch2
15 | from util.model_file import ModelFile
16 | from util.vocabulary import Vocabulary
17 |
18 | #from util.chainer_cpu_wrapper import wrapper
19 | from util.chainer_gpu_wrapper import wrapper
20 |
21 |
22 | class AttentionalTranslationModel:
23 | def __init__(self):
24 | pass
25 |
26 | def __make_model(self):
27 | self.__model = wrapper.make_model(
28 | # input embedding
29 | w_xi = functions.EmbedID(len(self.__src_vocab), self.__n_embed),
30 | # forward encoder
31 | w_ia = functions.Linear(self.__n_embed, 4 * self.__n_hidden),
32 | w_aa = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
33 | # backward encoder
34 | w_ib = functions.Linear(self.__n_embed, 4 * self.__n_hidden),
35 | w_bb = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
36 | # attentional weight estimator
37 | w_aw = functions.Linear(self.__n_hidden, self.__n_hidden),
38 | w_bw = functions.Linear(self.__n_hidden, self.__n_hidden),
39 | w_pw = functions.Linear(self.__n_hidden, self.__n_hidden),
40 | w_we = functions.Linear(self.__n_hidden, 1),
41 | # decoder
42 | w_ap = functions.Linear(self.__n_hidden, self.__n_hidden),
43 | w_bp = functions.Linear(self.__n_hidden, self.__n_hidden),
44 | w_yp = functions.EmbedID(len(self.__trg_vocab), 4 * self.__n_hidden),
45 | w_pp = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
46 | w_cp = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
47 | w_dp = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
48 | w_py = functions.Linear(self.__n_hidden, len(self.__trg_vocab)),
49 | )
50 |
51 | @staticmethod
52 | def new(src_vocab, trg_vocab, n_embed, n_hidden):
53 | self = AttentionalTranslationModel()
54 | self.__src_vocab = src_vocab
55 | self.__trg_vocab = trg_vocab
56 | self.__n_embed = n_embed
57 | self.__n_hidden = n_hidden
58 | self.__make_model()
59 | return self
60 |
61 | def save(self, filename):
62 | with ModelFile(filename, 'w') as fp:
63 | self.__src_vocab.save(fp.get_file_pointer())
64 | self.__trg_vocab.save(fp.get_file_pointer())
65 | fp.write(self.__n_embed)
66 | fp.write(self.__n_hidden)
67 | wrapper.begin_model_access(self.__model)
68 | fp.write_embed(self.__model.w_xi)
69 | fp.write_linear(self.__model.w_ia)
70 | fp.write_linear(self.__model.w_aa)
71 | fp.write_linear(self.__model.w_ib)
72 | fp.write_linear(self.__model.w_bb)
73 | fp.write_linear(self.__model.w_aw)
74 | fp.write_linear(self.__model.w_bw)
75 | fp.write_linear(self.__model.w_pw)
76 | fp.write_linear(self.__model.w_we)
77 | fp.write_linear(self.__model.w_ap)
78 | fp.write_linear(self.__model.w_bp)
79 | fp.write_embed(self.__model.w_yp)
80 | fp.write_linear(self.__model.w_pp)
81 | fp.write_linear(self.__model.w_cp)
82 | fp.write_linear(self.__model.w_dp)
83 | fp.write_linear(self.__model.w_py)
84 | wrapper.end_model_access(self.__model)
85 |
86 | @staticmethod
87 | def load(filename):
88 | self = AttentionalTranslationModel()
89 | with ModelFile(filename) as fp:
90 | self.__src_vocab = Vocabulary.load(fp.get_file_pointer())
91 | self.__trg_vocab = Vocabulary.load(fp.get_file_pointer())
92 | self.__n_embed = int(fp.read())
93 | self.__n_hidden = int(fp.read())
94 | self.__make_model()
95 | wrapper.begin_model_access(self.__model)
96 | fp.read_embed(self.__model.w_xi)
97 | fp.read_linear(self.__model.w_ia)
98 | fp.read_linear(self.__model.w_aa)
99 | fp.read_linear(self.__model.w_ib)
100 | fp.read_linear(self.__model.w_bb)
101 | fp.read_linear(self.__model.w_aw)
102 | fp.read_linear(self.__model.w_bw)
103 | fp.read_linear(self.__model.w_pw)
104 | fp.read_linear(self.__model.w_we)
105 | fp.read_linear(self.__model.w_ap)
106 | fp.read_linear(self.__model.w_bp)
107 | fp.read_embed(self.__model.w_yp)
108 | fp.read_linear(self.__model.w_pp)
109 | fp.read_linear(self.__model.w_cp)
110 | fp.read_linear(self.__model.w_dp)
111 | fp.read_linear(self.__model.w_py)
112 | wrapper.end_model_access(self.__model)
113 | return self
114 |
115 | def init_optimizer(self):
116 | self.__opt = optimizers.AdaGrad(lr=0.01)
117 | self.__opt.setup(self.__model)
118 |
119 | def __forward(self, is_training, src_batch, trg_batch = None, generation_limit = None):
120 | m = self.__model
121 | tanh = functions.tanh
122 | lstm = functions.lstm
123 | batch_size = len(src_batch)
124 | hidden_size = self.__n_hidden
125 | src_len = len(src_batch[0])
126 | trg_len = len(trg_batch[0]) - 1 if is_training else generation_limit
127 | src_stoi = self.__src_vocab.stoi
128 | trg_stoi = self.__trg_vocab.stoi
129 | trg_itos = self.__trg_vocab.itos
130 |
131 | hidden_zeros = wrapper.zeros((batch_size, hidden_size))
132 | sum_e_zeros = wrapper.zeros((batch_size, 1))
133 |
134 | # make embedding
135 | list_x = []
136 | for l in range(src_len):
137 | s_x = wrapper.make_var([src_stoi(src_batch[k][l]) for k in range(batch_size)], dtype=np.int32)
138 | list_x.append(s_x)
139 |
140 | # forward encoding
141 | c = hidden_zeros
142 | s_a = hidden_zeros
143 | list_a = []
144 | for l in range(src_len):
145 | s_x = list_x[l]
146 | s_i = tanh(m.w_xi(s_x))
147 | c, s_a = lstm(c, m.w_ia(s_i) + m.w_aa(s_a))
148 | list_a.append(s_a)
149 |
150 | # backward encoding
151 | c = hidden_zeros
152 | s_b = hidden_zeros
153 | list_b = []
154 | for l in reversed(range(src_len)):
155 | s_x = list_x[l]
156 | s_i = tanh(m.w_xi(s_x))
157 | c, s_b = lstm(c, m.w_ib(s_i) + m.w_bb(s_b))
158 | list_b.insert(0, s_b)
159 |
160 | # decoding
161 | c = hidden_zeros
162 | s_p = tanh(m.w_ap(list_a[-1]) + m.w_bp(list_b[0]))
163 | s_y = wrapper.make_var([trg_stoi('<s>') for k in range(batch_size)], dtype=np.int32)
164 |
165 | hyp_batch = [[] for _ in range(batch_size)]
166 | accum_loss = wrapper.zeros(()) if is_training else None
167 |
168 | #for n in range(src_len):
169 | # print(src_batch[0][n], end=' ')
170 | #print()
171 |
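# One decoding step, in equations (a_n/b_n: encoder states, p: decoder state):
#   energy   e_n = exp(w_we . tanh(w_aw a_n + w_bw b_n + w_pw p))
#   weight   alpha_n = e_n / sum_m e_m     (softmax over source positions)
#   context  c_vec = sum_n alpha_n a_n,  d_vec = sum_n alpha_n b_n
#   update   LSTM input = w_yp y + w_pp p + w_cp c_vec + w_dp d_vec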
172 | for l in range(trg_len):
173 | # calculate attention weights
174 | list_e = []
175 | sum_e = sum_e_zeros
176 | for n in range(src_len):
177 | s_w = tanh(m.w_aw(list_a[n]) + m.w_bw(list_b[n]) + m.w_pw(s_p))
178 | r_e = functions.exp(m.w_we(s_w))
179 | #list_e.append(functions.concat(r_e for _ in range(self.__n_hidden)))
180 | list_e.append(r_e)
181 | sum_e += r_e
182 | #sum_e = functions.concat(sum_e for _ in range(self.__n_hidden))
183 |
184 | # make attention vector
185 | s_c = hidden_zeros
186 | s_d = hidden_zeros
187 | for n in range(src_len):
188 | s_e = list_e[n] / sum_e
189 | #s_c += s_e * list_a[n]
190 | #s_d += s_e * list_b[n]
191 | s_c += functions.reshape(functions.batch_matmul(list_a[n], s_e), (batch_size, hidden_size))
192 | s_d += functions.reshape(functions.batch_matmul(list_b[n], s_e), (batch_size, hidden_size))
193 |
194 | #zxcv = wrapper.get_data(s_e)[0][0]
195 | #if zxcv > 0.9: asdf='#'
196 | #elif zxcv > 0.7: asdf='*'
197 | #elif zxcv > 0.3: asdf='+'
198 | #elif zxcv > 0.1: asdf='.'
199 | #else: asdf=' '
200 | #print(asdf * len(src_batch[0][n]), end=' ')
201 |
202 | # generate next word
203 | c, s_p = lstm(c, m.w_yp(s_y) + m.w_pp(s_p) + m.w_cp(s_c) + m.w_dp(s_d))
204 | r_y = m.w_py(s_p)
205 | output = wrapper.get_data(r_y).argmax(1)
206 | for k in range(batch_size):
207 | hyp_batch[k].append(trg_itos(output[k]))
208 |
209 | #print(hyp_batch[0][-1])
210 |
211 | if is_training:
212 | s_t = wrapper.make_var([trg_stoi(trg_batch[k][l + 1]) for k in range(batch_size)], dtype=np.int32)
213 | accum_loss += functions.softmax_cross_entropy(r_y, s_t)
214 | s_y = s_t
215 | else:
216 | if all(hyp_batch[k][-1] == '</s>' for k in range(batch_size)): break
217 | s_y = wrapper.make_var(output, dtype=np.int32)
218 |
219 | return hyp_batch, accum_loss
220 |
221 | def train(self, src_batch, trg_batch):
222 | self.__opt.zero_grads()
223 | hyp_batch, accum_loss = self.__forward(True, src_batch, trg_batch=trg_batch)
224 | #g = cg.build_computational_graph([accum_loss])
225 | #with open('asdf', 'w') as fp: fp.write(g.dump())
226 | #sys.exit()
227 | accum_loss.backward()
228 | self.__opt.clip_grads(10)
229 | self.__opt.update()
230 | return hyp_batch
231 |
232 | def predict(self, src_batch, generation_limit):
233 | return self.__forward(False, src_batch, generation_limit=generation_limit)[0]
234 |
235 |
236 | def parse_args():
237 | def_vocab = 32768
238 | def_embed = 256
239 | def_hidden = 512
240 | def_epoch = 100
241 | def_minibatch = 64
242 | def_generation_limit = 256
243 |
244 | p = ArgumentParser(description='Attentional neural machine translation')
245 |
246 | p.add_argument('mode', help='\'train\' or \'test\'')
247 | p.add_argument('source', help='[in] source corpus')
248 | p.add_argument('target', help='[in/out] target corpus')
249 | p.add_argument('model', help='[in/out] model file')
250 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int,
251 | help='vocabulary size (default: %d)' % def_vocab)
252 | p.add_argument('--embed', default=def_embed, metavar='INT', type=int,
253 | help='embedding layer size (default: %d)' % def_embed)
254 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int,
255 | help='hidden layer size (default: %d)' % def_hidden)
256 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int,
257 | help='number of training epoch (default: %d)' % def_epoch)
258 | p.add_argument('--minibatch', default=def_minibatch, metavar='INT', type=int,
259 | help='minibatch size (default: %d)' % def_minibatch)
260 | p.add_argument('--generation-limit', default=def_generation_limit, metavar='INT', type=int,
261 | help='maximum number of words to be generated for test input (default: %d)' % def_generation_limit)
262 |
263 | args = p.parse_args()
264 |
265 | # check args
266 | try:
267 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'')
268 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1')
269 | if args.embed < 1: raise ValueError('you must set --embed >= 1')
270 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1')
271 | if args.epoch < 1: raise ValueError('you must set --epoch >= 1')
272 | if args.minibatch < 1: raise ValueError('you must set --minibatch >= 1')
273 | if args.generation_limit < 1: raise ValueError('you must set --generation-limit >= 1')
274 | except Exception as ex:
275 | p.print_usage(file=sys.stderr)
276 | print(ex, file=sys.stderr)
277 | sys.exit()
278 |
279 | return args
280 |
281 |
282 | def train_model(args):
283 | trace('making vocabularies ...')
284 | src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
285 | trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)
286 |
287 | trace('making model ...')
288 | model = AttentionalTranslationModel.new(src_vocab, trg_vocab, args.embed, args.hidden)
289 |
290 | for epoch in range(args.epoch):
291 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
292 | trained = 0
293 | gen1 = gens.word_list(args.source)
294 | gen2 = gens.word_list(args.target)
295 | gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch, order=0), args.minibatch)
296 | model.init_optimizer()
297 |
298 | for src_batch, trg_batch in gen3:
299 | src_batch = fill_batch2(src_batch)
300 | trg_batch = fill_batch2(trg_batch)
301 | K = len(src_batch)
302 | hyp_batch = model.train(src_batch, trg_batch)
303 |
304 | for k in range(K):
305 | trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
306 | trace(' src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[k]]))
307 | trace(' trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
308 | trace(' hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))
309 |
310 | trained += K
311 |
312 | trace('saving model ...')
313 | model.save(args.model + '.%03d' % (epoch + 1))
314 |
315 | trace('finished.')
316 |
317 |
318 | def test_model(args):
319 | trace('loading model ...')
320 | model = AttentionalTranslationModel.load(args.model)
321 |
322 | trace('generating translation ...')
323 | generated = 0
324 |
325 | with open(args.target, 'w') as fp:
326 | for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
327 | src_batch = fill_batch2(src_batch)
328 | K = len(src_batch)
329 |
330 | trace('sample %8d - %8d ...' % (generated + 1, generated + K))
331 | hyp_batch = model.predict(src_batch, args.generation_limit)
332 |
333 | for hyp in hyp_batch:
334 | hyp.append('</s>')
335 | hyp = hyp[:hyp.index('</s>')]
336 | print(' '.join(hyp), file=fp)
337 |
338 | generated += K
339 |
340 | trace('finished.')
341 |
342 |
343 | def main():
344 | args = parse_args()
345 |
346 | trace('initializing ...')
347 | wrapper.init()
348 |
349 | if args.mode == 'train': train_model(args)
350 | elif args.mode == 'test': test_model(args)
351 |
352 |
353 | if __name__ == '__main__':
354 | main()
355 |
356 |
--------------------------------------------------------------------------------
/chainer-1.4/mt_s2s_encdec.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | import my_settings
4 |
5 | import sys
6 | import math
7 | import numpy as np
8 | from argparse import ArgumentParser
9 |
10 | from chainer import functions, optimizers
11 |
12 | import util.generators as gens
13 | from util.functions import trace, fill_batch
14 | from util.model_file import ModelFile
15 | from util.vocabulary import Vocabulary
16 |
17 | #from util.chainer_cpu_wrapper import wrapper
18 | from util.chainer_gpu_wrapper import wrapper
19 |
20 |
21 | class EncoderDecoderModel:
22 | def __init__(self):
23 | pass
24 |
25 | def __make_model(self):
26 | self.__model = wrapper.make_model(
27 | # encoder
28 | w_xi = functions.EmbedID(len(self.__src_vocab), self.__n_embed),
29 | w_ip = functions.Linear(self.__n_embed, 4 * self.__n_hidden),
30 | w_pp = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
31 | # decoder
32 | w_pq = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
33 | w_qj = functions.Linear(self.__n_hidden, self.__n_embed),
34 | w_jy = functions.Linear(self.__n_embed, len(self.__trg_vocab)),
35 | w_yq = functions.EmbedID(len(self.__trg_vocab), 4 * self.__n_hidden),
36 | w_qq = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
37 | )
38 |
39 | @staticmethod
40 | def new(src_vocab, trg_vocab, n_embed, n_hidden):
41 | self = EncoderDecoderModel()
42 | self.__src_vocab = src_vocab
43 | self.__trg_vocab = trg_vocab
44 | self.__n_embed = n_embed
45 | self.__n_hidden = n_hidden
46 | self.__make_model()
47 | return self
48 |
49 | def save(self, filename):
50 | with ModelFile(filename, 'w') as fp:
51 | self.__src_vocab.save(fp.get_file_pointer())
52 | self.__trg_vocab.save(fp.get_file_pointer())
53 | fp.write(self.__n_embed)
54 | fp.write(self.__n_hidden)
55 | wrapper.begin_model_access(self.__model)
56 | fp.write_embed(self.__model.w_xi)
57 | fp.write_linear(self.__model.w_ip)
58 | fp.write_linear(self.__model.w_pp)
59 | fp.write_linear(self.__model.w_pq)
60 | fp.write_linear(self.__model.w_qj)
61 | fp.write_linear(self.__model.w_jy)
62 | fp.write_embed(self.__model.w_yq)
63 | fp.write_linear(self.__model.w_qq)
64 | wrapper.end_model_access(self.__model)
65 |
66 | @staticmethod
67 | def load(filename):
68 | self = EncoderDecoderModel()
69 | with ModelFile(filename) as fp:
70 | self.__src_vocab = Vocabulary.load(fp.get_file_pointer())
71 | self.__trg_vocab = Vocabulary.load(fp.get_file_pointer())
72 | self.__n_embed = int(fp.read())
73 | self.__n_hidden = int(fp.read())
74 | self.__make_model()
75 | wrapper.begin_model_access(self.__model)
76 | fp.read_embed(self.__model.w_xi)
77 | fp.read_linear(self.__model.w_ip)
78 | fp.read_linear(self.__model.w_pp)
79 | fp.read_linear(self.__model.w_pq)
80 | fp.read_linear(self.__model.w_qj)
81 | fp.read_linear(self.__model.w_jy)
82 | fp.read_embed(self.__model.w_yq)
83 | fp.read_linear(self.__model.w_qq)
84 | wrapper.end_model_access(self.__model)
85 | return self
86 |
87 | def init_optimizer(self):
88 | self.__opt = optimizers.AdaGrad(lr=0.01)
89 | self.__opt.setup(self.__model)
90 |
91 | def __forward(self, is_training, src_batch, trg_batch = None, generation_limit = None):
92 | m = self.__model
93 | tanh = functions.tanh
94 | lstm = functions.lstm
95 | batch_size = len(src_batch)
96 | src_len = len(src_batch[0])
97 | src_stoi = self.__src_vocab.stoi
98 | trg_stoi = self.__trg_vocab.stoi
99 | trg_itos = self.__trg_vocab.itos
100 | s_c = wrapper.zeros((batch_size, self.__n_hidden))
101 |
102 | # encoding
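# The source is read right-to-left, starting from a '</s>' symbol; feeding the
# input in reverse is a common trick for encoder-decoder models. The final
# encoder state is handed to the decoder through w_pq below.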
103 | s_x = wrapper.make_var([src_stoi('</s>') for _ in range(batch_size)], dtype=np.int32)
104 | s_i = tanh(m.w_xi(s_x))
105 | s_c, s_p = lstm(s_c, m.w_ip(s_i))
106 |
107 | for l in reversed(range(src_len)):
108 | s_x = wrapper.make_var([src_stoi(src_batch[k][l]) for k in range(batch_size)], dtype=np.int32)
109 | s_i = tanh(m.w_xi(s_x))
110 | s_c, s_p = lstm(s_c, m.w_ip(s_i) + m.w_pp(s_p))
111 |
112 | s_c, s_q = lstm(s_c, m.w_pq(s_p))
113 | hyp_batch = [[] for _ in range(batch_size)]
114 |
115 | # decoding
116 | if is_training:
117 | accum_loss = wrapper.zeros(())
118 | trg_len = len(trg_batch[0])
119 |
120 | for l in range(trg_len):
121 | s_j = tanh(m.w_qj(s_q))
122 | r_y = m.w_jy(s_j)
123 | s_t = wrapper.make_var([trg_stoi(trg_batch[k][l]) for k in range(batch_size)], dtype=np.int32)
124 | accum_loss += functions.softmax_cross_entropy(r_y, s_t)
125 | output = wrapper.get_data(r_y).argmax(1)
126 |
127 | for k in range(batch_size):
128 | hyp_batch[k].append(trg_itos(output[k]))
129 |
130 | s_c, s_q = lstm(s_c, m.w_yq(s_t) + m.w_qq(s_q))
131 |
132 | return hyp_batch, accum_loss
133 | else:
134 | while len(hyp_batch[0]) < generation_limit:
135 | s_j = tanh(m.w_qj(s_q))
136 | r_y = m.w_jy(s_j)
137 | output = wrapper.get_data(r_y).argmax(1)
138 |
139 | for k in range(batch_size):
140 | hyp_batch[k].append(trg_itos(output[k]))
141 |
142 | if all(hyp_batch[k][-1] == '</s>' for k in range(batch_size)): break
143 |
144 | s_y = wrapper.make_var(output, dtype=np.int32)
145 | s_c, s_q = lstm(s_c, m.w_yq(s_y) + m.w_qq(s_q))
146 |
147 | return hyp_batch
148 |
149 | def train(self, src_batch, trg_batch):
150 | self.__opt.zero_grads()
151 | hyp_batch, accum_loss = self.__forward(True, src_batch, trg_batch=trg_batch)
152 | accum_loss.backward()
153 | self.__opt.clip_grads(10)
154 | self.__opt.update()
155 | return hyp_batch
156 |
157 | def predict(self, src_batch, generation_limit):
158 | return self.__forward(False, src_batch, generation_limit=generation_limit)
159 |
160 |
161 | def parse_args():
162 | def_vocab = 32768
163 | def_embed = 256
164 | def_hidden = 512
165 | def_epoch = 100
166 | def_minibatch = 64
167 | def_generation_limit = 256
168 |
169 | p = ArgumentParser(description='Encoder-decoder neural machine translation')
170 |
171 | p.add_argument('mode', help='\'train\' or \'test\'')
172 | p.add_argument('source', help='[in] source corpus')
173 | p.add_argument('target', help='[in/out] target corpus')
174 | p.add_argument('model', help='[in/out] model file')
175 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int,
176 | help='vocabulary size (default: %d)' % def_vocab)
177 | p.add_argument('--embed', default=def_embed, metavar='INT', type=int,
178 | help='embedding layer size (default: %d)' % def_embed)
179 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int,
180 | help='hidden layer size (default: %d)' % def_hidden)
181 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int,
182 | help='number of training epoch (default: %d)' % def_epoch)
183 | p.add_argument('--minibatch', default=def_minibatch, metavar='INT', type=int,
184 | help='minibatch size (default: %d)' % def_minibatch)
185 | p.add_argument('--generation-limit', default=def_generation_limit, metavar='INT', type=int,
186 | help='maximum number of words to be generated for test input (default: %d)' % def_generation_limit)
187 |
188 | args = p.parse_args()
189 |
190 | # check args
191 | try:
192 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'')
193 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1')
194 | if args.embed < 1: raise ValueError('you must set --embed >= 1')
195 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1')
196 | if args.epoch < 1: raise ValueError('you must set --epoch >= 1')
197 | if args.minibatch < 1: raise ValueError('you must set --minibatch >= 1')
198 | if args.generation_limit < 1: raise ValueError('you must set --generation-limit >= 1')
199 | except Exception as ex:
200 | p.print_usage(file=sys.stderr)
201 | print(ex, file=sys.stderr)
202 | sys.exit()
203 |
204 | return args
205 |
206 |
207 | def train_model(args):
208 | trace('making vocabularies ...')
209 | src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
210 | trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)
211 |
212 | trace('making model ...')
213 | model = EncoderDecoderModel.new(src_vocab, trg_vocab, args.embed, args.hidden)
214 |
215 | for epoch in range(args.epoch):
216 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
217 | trained = 0
218 | gen1 = gens.word_list(args.source)
219 | gen2 = gens.word_list(args.target)
220 | gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch), args.minibatch)
221 | model.init_optimizer()
222 |
223 | for src_batch, trg_batch in gen3:
224 | src_batch = fill_batch(src_batch)
225 | trg_batch = fill_batch(trg_batch)
226 | K = len(src_batch)
227 | hyp_batch = model.train(src_batch, trg_batch)
228 |
229 | for k in range(K):
230 | trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
231 | trace(' src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[k]]))
232 | trace(' trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
233 | trace(' hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))
234 |
235 | trained += K
236 |
237 | trace('saving model ...')
238 | model.save(args.model + '.%03d' % (epoch + 1))
239 |
240 | trace('finished.')
241 |
242 |
243 | def test_model(args):
244 | trace('loading model ...')
245 | model = EncoderDecoderModel.load(args.model)
246 |
247 | trace('generating translation ...')
248 | generated = 0
249 |
250 | with open(args.target, 'w') as fp:
251 | for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
252 | src_batch = fill_batch(src_batch)
253 | K = len(src_batch)
254 |
255 | trace('sample %8d - %8d ...' % (generated + 1, generated + K))
256 | hyp_batch = model.predict(src_batch, args.generation_limit)
257 |
258 | for hyp in hyp_batch:
259 | hyp.append('</s>')
260 | hyp = hyp[:hyp.index('</s>')]
261 | print(' '.join(hyp), file=fp)
262 |
263 | generated += K
264 |
265 | trace('finished.')
266 |
267 |
268 | def main():
269 | args = parse_args()
270 |
271 | trace('initializing ...')
272 | wrapper.init()
273 |
274 | if args.mode == 'train': train_model(args)
275 | elif args.mode == 'test': test_model(args)
276 |
277 |
278 | if __name__ == '__main__':
279 | main()
280 |
281 |
--------------------------------------------------------------------------------
/chainer-1.4/seg_ffnn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | import my_settings
4 |
5 | import sys
6 | import math
7 | import numpy as np
8 | from argparse import ArgumentParser
9 |
10 | from chainer import functions, optimizers
11 |
12 | import util.generators as gens
13 | from util.functions import trace, fill_batch
14 | from util.model_file import ModelFile
15 | from util.vocabulary import Vocabulary
16 |
17 | from util.chainer_cpu_wrapper import wrapper
18 | #from util.chainer_gpu_wrapper import wrapper
19 |
20 |
21 | class SegmentationModel:
22 | def __init__(self):
23 | pass
24 |
25 | def __make_model(self):
26 | self.__model = wrapper.make_model(
27 | w_xh = functions.EmbedID(2 * self.__n_context * len(self.__vocab), self.__n_hidden),
28 | w_hy = functions.Linear(self.__n_hidden, 1),
29 | )
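# w_xh embeds (context position, letter) pairs: each of the 2*n_context window
# slots owns its own block of len(vocab) embedding rows, addressed below as
# wid = k * len(vocab) + letter_id.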
30 |
31 | @staticmethod
32 | def new(vocab, n_context, n_hidden):
33 | self = SegmentationModel()
34 | self.__vocab = vocab
35 | self.__n_context = n_context
36 | self.__n_hidden = n_hidden
37 | self.__make_model()
38 | return self
39 |
40 | def save(self, filename):
41 | with ModelFile(filename, 'w') as fp:
42 | self.__vocab.save(fp.get_file_pointer())
43 | fp.write(self.__n_context)
44 | fp.write(self.__n_hidden)
45 | wrapper.begin_model_access(self.__model)
46 | fp.write_embed(self.__model.w_xh)
47 | fp.write_linear(self.__model.w_hy)
48 | wrapper.end_model_access(self.__model)
49 |
50 | @staticmethod
51 | def load(filename):
52 | self = SegmentationModel()
53 | with ModelFile(filename) as fp:
54 | self.__vocab = Vocabulary.load(fp.get_file_pointer())
55 | self.__n_context = int(fp.read())
56 | self.__n_hidden = int(fp.read())
57 | self.__make_model()
58 | wrapper.begin_model_access(self.__model)
59 | fp.read_embed(self.__model.w_xh)
60 | fp.read_linear(self.__model.w_hy)
61 | wrapper.end_model_access(self.__model)
62 | return self
63 |
64 | def init_optimizer(self):
65 | self.__opt = optimizers.AdaGrad(lr=0.01)
66 | self.__opt.setup(self.__model)
67 |
68 | def __make_input(self, is_training, text):
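# Converts text to letter IDs, padded with n_context-1 '<s>'/'</s>' symbols on
# each side; in training, also builds one label per letter gap:
# +1 = word boundary after this letter, -1 = none.
# Illustrative example: "ab c" -> labels [-1, +1].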
69 | c = self.__vocab.stoi
70 | k = self.__n_context - 1
71 | word_list = text.split()
72 | letters = [c('<s>')] * k + [c(x) for x in ''.join(word_list)] + [c('</s>')] * k
73 | if is_training:
74 | labels = []
75 | for x in word_list:
76 | labels += [-1] * (len(x) - 1) + [1]
77 | return letters, labels[:-1]
78 | else:
79 | return letters, None
80 |
81 | def __forward(self, is_training, text):
82 | m = self.__model
83 | tanh = functions.tanh
84 | letters, labels = self.__make_input(is_training, text)
85 | scores = []
86 | accum_loss = wrapper.zeros(()) if is_training else None
87 |
88 | for n in range(len(letters) - 2 * self.__n_context + 1):
89 | s_hu = wrapper.zeros((1, self.__n_hidden))
90 |
91 | for k in range(2 * self.__n_context):
92 | wid = k * len(self.__vocab) + letters[n + k]
93 | s_x = wrapper.make_var([wid], dtype=np.int32)
94 | s_hu += m.w_xh(s_x)
95 |
96 | s_hv = tanh(s_hu)
97 | s_y = tanh(m.w_hy(s_hv))
98 | scores.append(float(wrapper.get_data(s_y)))
99 |
100 | if is_training:
101 | s_t = wrapper.make_var([[labels[n]]])
102 | accum_loss += functions.mean_squared_error(s_y, s_t)
103 |
104 | return scores, accum_loss
105 |
106 | def train(self, text):
107 | self.__opt.zero_grads()
108 | scores, accum_loss = self.__forward(True, text)
109 | accum_loss.backward()
110 | self.__opt.clip_grads(5)
111 | self.__opt.update()
112 | return scores
113 |
114 | def predict(self, text):
115 | return self.__forward(False, text)[0]
116 |
117 |
118 | def parse_args():
119 | def_vocab = 2500
120 | def_hidden = 100
121 | def_epoch = 100
122 | def_context = 3
123 |
124 | p = ArgumentParser(description='Word segmentation using feedforward neural network')
125 |
126 | p.add_argument('mode', help='\'train\' or \'test\'')
127 | p.add_argument('corpus', help='[in] source corpus')
128 | p.add_argument('model', help='[in/out] model file')
129 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int,
130 | help='vocabulary size (default: %d)' % def_vocab)
131 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int,
132 | help='hidden layer size (default: %d)' % def_hidden)
133 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int,
134 | help='number of training epoch (default: %d)' % def_epoch)
135 | p.add_argument('--context', default=def_context, metavar='INT', type=int,
136 | help='width of context window (default: %d)' % def_context)
137 |
138 | args = p.parse_args()
139 |
140 | # check args
141 | try:
142 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'')
143 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1')
144 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1')
145 | if args.epoch < 1: raise ValueError('you must set --epoch >= 1')
146 | if args.context < 1: raise ValueError('you must set --context >= 1')
147 | except Exception as ex:
148 | p.print_usage(file=sys.stderr)
149 | print(ex, file=sys.stderr)
150 | sys.exit()
151 |
152 | return args
153 |
154 |
155 | def make_hyp(letters, scores):
156 | hyp = letters[0]
157 | for w, s in zip(letters[1:], scores):
158 | if s >= 0:
159 | hyp += ' '
160 | hyp += w
161 | return hyp
162 |
163 |
164 | def train_model(args):
165 | trace('making vocabularies ...')
166 | vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab)
167 |
168 | trace('start training ...')
169 | model = SegmentationModel.new(vocab, args.context, args.hidden)
170 |
171 | for epoch in range(args.epoch):
172 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
173 | trained = 0
174 |
175 | model.init_optimizer()
176 |
177 | with open(args.corpus) as fp:
178 | for text in fp:
179 | word_list = text.split()
180 | if not word_list:
181 | continue
182 |
183 | text = ' '.join(word_list)
184 | letters = ''.join(word_list)
185 | scores = model.train(text)
186 | trained += 1
187 | hyp = make_hyp(letters, scores)
188 |
189 | trace(trained)
190 | trace(text)
191 | trace(hyp)
192 | trace(' '.join('%+.1f' % x for x in scores))
193 |
194 | if trained % 100 == 0:
195 | trace(' %8d' % trained)
196 |
197 | trace('saving model ...')
198 | model.save(args.model + '.%03d' % (epoch + 1))
199 |
200 | trace('finished.')
201 |
202 |
203 | def test_model(args):
204 | trace('loading model ...')
205 | model = SegmentationModel.load(args.model)
206 |
207 | trace('generating output ...')
208 |
209 | with open(args.corpus) as fp:
210 | for text in fp:
211 | letters = ''.join(text.split())
212 | if not letters:
213 | print()
214 | continue
215 | scores = model.predict(text)
216 | hyp = make_hyp(letters, scores)
217 | print(hyp)
218 |
219 | trace('finished.')
220 |
221 |
222 | def main():
223 | args = parse_args()
224 |
225 | trace('initializing CUDA ...')
226 | wrapper.init()
227 |
228 | if args.mode == 'train': train_model(args)
229 | elif args.mode == 'test': test_model(args)
230 |
231 |
232 | if __name__ == '__main__':
233 | main()
234 |
235 |
--------------------------------------------------------------------------------
/chainer-1.4/seg_rnn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | #import my_settings
4 |
5 | import sys
6 | import math
7 | import numpy as np
8 | from argparse import ArgumentParser
9 |
10 | from chainer import functions, optimizers
11 |
12 | import util.generators as gens
13 | from util.functions import trace, fill_batch
14 | from util.model_file import ModelFile
15 | from util.vocabulary import Vocabulary
16 |
17 | from util.chainer_cpu_wrapper import wrapper
18 | #from util.chainer_gpu_wrapper import wrapper
19 |
20 |
21 | class RNNSegmentationModel:
22 | def __init__(self):
23 | pass
24 |
25 | def __make_model(self):
26 | self.__model = wrapper.make_model(
27 | w_xe = functions.EmbedID(len(self.__vocab), self.__n_embed),
28 | w_ea = functions.Linear(self.__n_embed, 4 * self.__n_hidden),
29 | w_aa = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
30 | w_eb = functions.Linear(self.__n_embed, 4 * self.__n_hidden),
31 | w_bb = functions.Linear(self.__n_hidden, 4 * self.__n_hidden),
32 | w_ay1 = functions.Linear(self.__n_hidden, 1),
33 | w_by1 = functions.Linear(self.__n_hidden, 1),
34 | w_ay2 = functions.Linear(self.__n_hidden, 1),
35 | w_by2 = functions.Linear(self.__n_hidden, 1),
36 | )
37 |
38 | @staticmethod
39 | def new(vocab, n_embed, n_hidden):
40 | self = RNNSegmentationModel()
41 | self.__vocab = vocab
42 | self.__n_embed = n_embed
43 | self.__n_hidden = n_hidden
44 | self.__make_model()
45 | return self
46 |
47 | def save(self, filename):
48 | with ModelFile(filename, 'w') as fp:
49 | self.__vocab.save(fp.get_file_pointer())
50 | fp.write(self.__n_embed)
51 | fp.write(self.__n_hidden)
52 | wrapper.begin_model_access(self.__model)
53 | fp.write_embed(self.__model.w_xe)
54 | fp.write_linear(self.__model.w_ea)
55 | fp.write_linear(self.__model.w_aa)
56 | fp.write_linear(self.__model.w_eb)
57 | fp.write_linear(self.__model.w_bb)
58 | fp.write_linear(self.__model.w_ay1)
59 | fp.write_linear(self.__model.w_by1)
60 | fp.write_linear(self.__model.w_ay2)
61 | fp.write_linear(self.__model.w_by2)
62 | wrapper.end_model_access(self.__model)
63 |
64 | @staticmethod
65 | def load(filename):
66 | self = RNNSegmentationModel()
67 | with ModelFile(filename) as fp:
68 | self.__vocab = Vocabulary.load(fp.get_file_pointer())
69 | self.__n_embed = int(fp.read())
70 | self.__n_hidden = int(fp.read())
71 | self.__make_model()
72 | wrapper.begin_model_access(self.__model)
73 | fp.read_embed(self.__model.w_xe)
74 | fp.read_linear(self.__model.w_ea)
75 | fp.read_linear(self.__model.w_aa)
76 | fp.read_linear(self.__model.w_eb)
77 | fp.read_linear(self.__model.w_bb)
78 | fp.read_linear(self.__model.w_ay1)
79 | fp.read_linear(self.__model.w_by1)
80 | fp.read_linear(self.__model.w_ay2)
81 | fp.read_linear(self.__model.w_by2)
82 | wrapper.end_model_access(self.__model)
83 | return self
84 |
85 | def init_optimizer(self):
86 | self.__opt = optimizers.AdaGrad(lr=0.001)
87 | self.__opt.setup(self.__model)
88 |
89 | def __make_input(self, is_training, text):
90 | word_list = text.split()
91 | letters = [self.__vocab.stoi(x) for x in ''.join(word_list)]
92 | if is_training:
93 | labels = []
94 | for x in word_list:
95 | labels += [-1] * (len(x) - 1) + [1]
96 | return letters, labels[:-1]
97 | else:
98 | return letters, None
99 |
100 | def __forward(self, is_training, text):
101 | m = self.__model
102 | tanh = functions.tanh
103 | lstm = functions.lstm
104 | letters, labels = self.__make_input(is_training, text)
105 | n_letters = len(letters)
106 |
107 | accum_loss = wrapper.zeros(()) if is_training else None
108 | hidden_zeros = wrapper.zeros((1, self.__n_hidden))
109 |
110 | # embedding
111 | list_e = []
112 | for i in range(n_letters):
113 | s_x = wrapper.make_var([letters[i]], dtype=np.int32)
114 | list_e.append(tanh(m.w_xe(s_x)))
115 |
116 | # forward encoding
117 | s_a = hidden_zeros
118 | c = hidden_zeros
119 | list_a = []
120 | for i in range(n_letters):
121 | c, s_a = lstm(c, m.w_ea(list_e[i]) + m.w_aa(s_a))
122 | list_a.append(s_a)
123 |
124 | # backward encoding
125 | s_b = hidden_zeros
126 | c = hidden_zeros
127 | list_b = []
128 | for i in reversed(range(n_letters)):
129 | c, s_b = lstm(c, m.w_eb(list_e[i]) + m.w_bb(s_b))
130 | list_b.insert(0, s_b)  # prepend so list_b[i] aligns with letter i (cf. mt_s2s_attention)
131 |
132 | # segmentation
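# The score for the gap between letters i and i+1 combines the forward (list_a)
# and backward (list_b) LSTM states on both sides of the gap; scores >= 0 are
# later read as word boundaries (see make_hyp).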
133 | scores = []
134 | for i in range(n_letters - 1):
135 | s_y = tanh(m.w_ay1(list_a[i]) + m.w_by1(list_b[i]) + m.w_ay2(list_a[i + 1]) + m.w_by2(list_b[i + 1]))
136 | scores.append(float(wrapper.get_data(s_y)))
137 |
138 | if is_training:
139 | s_t = wrapper.make_var([[labels[i]]])
140 | accum_loss += functions.mean_squared_error(s_y, s_t)
141 |
142 | return scores, accum_loss
143 |
144 | def train(self, text):
145 | self.__opt.zero_grads()
146 | scores, accum_loss = self.__forward(True, text)
147 | accum_loss.backward()
148 | self.__opt.clip_grads(5)
149 | self.__opt.update()
150 | return scores
151 |
152 | def predict(self, text):
153 | return self.__forward(False, text)[0]
154 |
155 |
156 | def parse_args():
157 | def_vocab = 2500
158 | def_embed = 100
159 | def_hidden = 100
160 | def_epoch = 20
161 |
162 | p = ArgumentParser(description='Word segmentation using LSTM-RNN')
163 |
164 | p.add_argument('mode', help='\'train\' or \'test\'')
165 | p.add_argument('corpus', help='[in] source corpus')
166 | p.add_argument('model', help='[in/out] model file')
167 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int,
168 | help='vocabulary size (default: %d)' % def_vocab)
169 | p.add_argument('--embed', default=def_embed, metavar='INT', type=int,
170 | help='embedding layer size (default: %d)' % def_embed)
171 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int,
172 | help='hidden layer size (default: %d)' % def_hidden)
173 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int,
174 | help='number of training epoch (default: %d)' % def_epoch)
175 |
176 | args = p.parse_args()
177 |
178 | # check args
179 | try:
180 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'')
181 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1')
182 | if args.embed < 1: raise ValueError('you must set --embed >= 1')
183 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1')
184 | if args.epoch < 1: raise ValueError('you must set --epoch >= 1')
185 | except Exception as ex:
186 | p.print_usage(file=sys.stderr)
187 | print(ex, file=sys.stderr)
188 | sys.exit()
189 |
190 | return args
191 |
192 |
193 | def make_hyp(letters, scores):
194 | hyp = letters[0]
195 | for w, s in zip(letters[1:], scores):
196 | if s >= 0:
197 | hyp += ' '
198 | hyp += w
199 | return hyp
200 |
201 |
202 | def train_model(args):
203 | trace('making vocabularies ...')
204 | vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab)
205 |
206 | trace('start training ...')
207 | model = RNNSegmentationModel.new(vocab, args.embed, args.hidden)
208 |
209 | for epoch in range(args.epoch):
210 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
211 | trained = 0
212 |
213 | model.init_optimizer()
214 |
215 | with open(args.corpus) as fp:
216 | for text in fp:
217 | word_list = text.split()
218 | if not word_list:
219 | continue
220 |
221 | text = ' '.join(word_list)
222 | letters = ''.join(word_list)
223 | scores = model.train(text)
224 | trained += 1
225 | hyp = make_hyp(letters, scores)
226 |
227 | trace(trained)
228 | trace(text)
229 | trace(hyp)
230 | trace(' '.join('%+.1f' % x for x in scores))
231 |
232 | if trained % 100 == 0:
233 | trace(' %8d' % trained)
234 |
235 | trace('saving model ...')
236 | model.save(args.model + '.%03d' % (epoch + 1))
237 |
238 | trace('finished.')
239 |
240 |
241 | def test_model(args):
242 | trace('loading model ...')
243 | model = RNNSegmentationModel.load(args.model)
244 |
245 | trace('generating output ...')
246 |
247 | with open(args.corpus) as fp:
248 | for text in fp:
249 | letters = ''.join(text.split())
250 | if not letters:
251 | print()
252 | continue
253 | scores = model.predict(text)
254 | hyp = make_hyp(letters, scores)
255 | print(hyp)
256 |
257 | trace('finished.')
258 |
259 |
260 | def main():
261 | args = parse_args()
262 |
263 | trace('initializing ...')
264 | wrapper.init()
265 |
266 | if args.mode == 'train': train_model(args)
267 | elif args.mode == 'test': test_model(args)
268 |
269 |
270 | if __name__ == '__main__':
271 | main()
272 |
273 |
--------------------------------------------------------------------------------
/chainer-1.4/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/odashi/chainer_examples/b13ec64e5035b1eb75b873431786d880577b7370/chainer-1.4/util/__init__.py
--------------------------------------------------------------------------------
/chainer-1.4/util/chainer_cpu_wrapper.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import chainer
3 |
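# CPU counterpart of util.chainer_gpu_wrapper: both expose the same static API,
# so the example scripts switch devices by swapping a single import line.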
4 | class wrapper:
5 | @staticmethod
6 | def init():
7 | pass
8 |
9 | @staticmethod
10 | def make_var(array, dtype=numpy.float32):
11 | return chainer.Variable(numpy.array(array, dtype=dtype))
12 |
13 | @staticmethod
14 | def get_data(variable):
15 | return variable.data
16 |
17 | @staticmethod
18 | def zeros(shape, dtype=numpy.float32):
19 | return chainer.Variable(numpy.zeros(shape, dtype=dtype))
20 |
21 | @staticmethod
22 | def ones(shape, dtype=numpy.float32):
23 | return chainer.Variable(numpy.ones(shape, dtype=dtype))
24 |
25 | @staticmethod
26 | def make_model(**kwargs):
27 | return chainer.FunctionSet(**kwargs)
28 |
29 | @staticmethod
30 | def begin_model_access(model):
31 | pass
32 |
33 | @staticmethod
34 | def end_model_access(model):
35 | pass
36 |
37 |
--------------------------------------------------------------------------------
/chainer-1.4/util/chainer_gpu_wrapper.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import chainer
3 |
4 | class wrapper:
5 | @staticmethod
6 | def init():
7 | chainer.cuda.init()
8 |
9 | @staticmethod
10 | def make_var(array, dtype=numpy.float32):
11 | return chainer.Variable(chainer.cuda.to_gpu(numpy.array(array, dtype=dtype)))
12 |
13 | @staticmethod
14 | def get_data(variable):
15 | return chainer.cuda.to_cpu(variable.data)
16 |
17 | @staticmethod
18 | def zeros(shape, dtype=numpy.float32):
19 | return chainer.Variable(chainer.cuda.zeros(shape, dtype=dtype))
20 |
21 | @staticmethod
22 | def ones(shape, dtype=numpy.float32):
23 | return chainer.Variable(chainer.cuda.ones(shape, dtype=dtype))
24 |
25 | @staticmethod
26 | def make_model(**kwargs):
27 | return chainer.FunctionSet(**kwargs).to_gpu()
28 |
29 | @staticmethod
30 | def begin_model_access(model):
31 | model.to_cpu()
32 |
33 | @staticmethod
34 | def end_model_access(model):
35 | model.to_gpu()
36 |
37 |
--------------------------------------------------------------------------------
/chainer-1.4/util/functions.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import datetime
3 |
4 | def trace(*args):
5 | print(datetime.datetime.now(), '...', *args, file=sys.stderr)
6 | sys.stderr.flush()
7 |
8 | def fill_batch(batch, token='</s>'):
9 | max_len = max(len(x) for x in batch)
10 | return [x + [token] * (max_len - len(x) + 1) for x in batch]
11 |
12 | def fill_batch2(batch, start_token='<s>', end_token='</s>'):
13 | max_len = max(len(x) for x in batch)
14 | return [[start_token] + x + [end_token] * (max_len - len(x) + 1) for x in batch]
15 |
16 | def vtos(v, fmt='%.8e'):
17 | return ' '.join(fmt % x for x in v)
18 |
19 | def stov(s, tp=float):
20 | return [tp(x) for x in s.split()]
21 |
22 |
--------------------------------------------------------------------------------
/chainer-1.4/util/generators.py:
--------------------------------------------------------------------------------
1 | def batch(generator, batch_size):
2 | batch = []
3 | is_tuple = False
4 | for l in generator:
5 | is_tuple = isinstance(l, tuple)
6 | batch.append(l)
7 | if len(batch) == batch_size:
8 | yield tuple(list(x) for x in zip(*batch)) if is_tuple else batch
9 | batch = []
10 | if batch:
11 | yield tuple(list(x) for x in zip(*batch)) if is_tuple else batch
12 |
13 | def sorted_parallel(generator1, generator2, pooling, order=1):
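# Pools `pooling` parallel pairs and re-emits them sorted by the length of the
# `order`-th sequence, so minibatches built on top of this generator contain
# similarly sized sentences and need little padding.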
14 | gen1 = batch(generator1, pooling)
15 | gen2 = batch(generator2, pooling)
16 | for batch1, batch2 in zip(gen1, gen2):
17 | #yield from sorted(zip(batch1, batch2), key=lambda x: len(x[1]))
18 | for x in sorted(zip(batch1, batch2), key=lambda x: len(x[order])):
19 | yield x
20 |
21 | def word_list(filename):
22 | with open(filename) as fp:
23 | for l in fp:
24 | yield l.split()
25 |
26 | def letter_list(filename):
27 | with open(filename) as fp:
28 | for l in fp:
29 | yield list(''.join(l.split()))
30 |
31 |
--------------------------------------------------------------------------------
/chainer-1.4/util/model_file.py:
--------------------------------------------------------------------------------
1 | from .functions import vtos, stov
2 |
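# Models are stored as plain text: one scalar per line, matrices as one
# whitespace-separated row per line (serialized via vtos/stov from .functions).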
3 | class ModelFile:
4 | def __init__(self, filename, mode='r'):
5 | self.__fp = open(filename, mode)
6 |
7 | def __enter__(self):
8 | return self
9 |
10 | def __exit__(self, exc_type, exc_value, traceback):
11 | self.__fp.close()
12 | return False
13 |
14 | def write(self, x):
15 | print(x, file=self.__fp)
16 |
17 | def __write_vector(self, x):
18 | self.write(vtos(x))
19 |
20 | def __write_matrix(self, x):
21 | for row in x:
22 | self.__write_vector(row)
23 |
24 | def read(self):
25 | return next(self.__fp).strip()
26 |
27 | def __read_vector(self, x, tp):
28 | data = stov(self.read(), tp)
29 | for i in range(len(data)):
30 | x[i] = data[i]
31 |
32 | def __read_matrix(self, x, tp):
33 | for row in x:
34 | self.__read_vector(row, tp)
35 |
36 | def write_embed(self, f):
37 | self.__write_matrix(f.W)
38 |
39 | def write_linear(self, f):
40 | self.__write_matrix(f.W)
41 | self.__write_vector(f.b)
42 |
43 | def read_embed(self, f):
44 | self.__read_matrix(f.W, float)
45 |
46 | def read_linear(self, f):
47 | self.__read_matrix(f.W, float)
48 | self.__read_vector(f.b, float)
49 |
50 | def get_file_pointer(self):
51 | return self.__fp
52 |
53 |
--------------------------------------------------------------------------------
/chainer-1.4/util/vocabulary.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 |
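# Word/ID mapping with three reserved IDs: 0 = '<unk>' (also the defaultdict
# fallback for out-of-vocabulary words), 1 = '<s>', 2 = '</s>'.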
4 | class Vocabulary:
5 | def __init__(self):
6 | pass
7 |
8 | def __len__(self):
9 | return self.__size
10 |
11 | def stoi(self, s):
12 | return self.__stoi[s]
13 |
14 | def itos(self, i):
15 | return self.__itos[i]
16 |
17 | @staticmethod
18 | def new(list_generator, size):
19 | self = Vocabulary()
20 | self.__size = size
21 |
22 | word_freq = defaultdict(lambda: 0)
23 | for words in list_generator:
24 | for word in words:
25 | word_freq[word] += 1
26 |
27 | self.__stoi = defaultdict(lambda: 0)
28 | self.__stoi['<unk>'] = 0
29 | self.__stoi['<s>'] = 1
30 | self.__stoi['</s>'] = 2
31 | self.__itos = [''] * self.__size
32 | self.__itos[0] = '<unk>'
33 | self.__itos[1] = '<s>'
34 | self.__itos[2] = '</s>'
35 |
36 | for i, (k, v) in zip(range(self.__size - 3), sorted(word_freq.items(), key=lambda x: -x[1])):
37 | self.__stoi[k] = i + 3
38 | self.__itos[i + 3] = k
39 |
40 | return self
41 |
42 | def save(self, fp):
43 | print(self.__size, file=fp)
44 | for i in range(self.__size):
45 | print(self.__itos[i], file=fp)
46 |
47 | @staticmethod
48 | def load(line_gen):
49 | self = Vocabulary()
50 |
51 | self.__size = int(next(line_gen))
52 |
53 | self.__stoi = defaultdict(lambda: 0)
54 | self.__itos = [''] * self.__size
55 | for i in range(self.__size):
56 | s = next(line_gen).strip()
57 | if s:
58 | self.__stoi[s] = i
59 | self.__itos[i] = s
60 |
61 | return self
62 |
63 |
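64 | # Editor's note: a minimal usage sketch (not part of the original file).
65 | # `new` reserves ids 0-2 for <unk>, <s> and </s> and assigns the remaining
66 | # ids by descending frequency; unseen words fall back to 0 (<unk>) through
67 | # the defaultdict ('train.txt' is a hypothetical file name):
68 | #
69 | #   import util.generators as gens
70 | #   vocab = Vocabulary.new(gens.word_list('train.txt'), 1000)
71 | #   vocab.stoi('the')     # some id in [3, 1000)
72 | #   vocab.stoi('qwerty')  # 0 if 'qwerty' was rare or unseen
73 | #   vocab.itos(2)         # '</s>'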
--------------------------------------------------------------------------------
/chainer-1.5/LSTMVariants.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 | import chainer
4 | from chainer.functions.activation import sigmoid
5 | from chainer.functions.activation import tanh
6 | from chainer import link
7 | from chainer.links.connection import linear
8 |
9 |
10 | class LSTMBase(link.Chain):
11 |
12 | def __init__(self, n_units, n_inputs=None):
13 | if n_inputs is None:
14 | n_inputs = n_units
15 |         super(LSTMBase, self).__init__(
16 |             W_fh=linear.Linear(n_units, n_units),
17 |             W_ih=linear.Linear(n_units, n_units),
18 |             W_oh=linear.Linear(n_units, n_units),
19 |             W_ch=linear.Linear(n_units, n_units),
20 |             W_fx=linear.Linear(n_inputs, n_units),
21 |             W_ix=linear.Linear(n_inputs, n_units),
22 |             W_ox=linear.Linear(n_inputs, n_units),
23 |             W_cx=linear.Linear(n_inputs, n_units),
24 |         )
25 |
26 | class CoupledForgetLSTMBase(link.Chain):
27 |
28 | def __init__(self, n_units, n_inputs=None):
29 | if n_inputs is None:
30 | n_inputs = n_units
31 |         super(CoupledForgetLSTMBase, self).__init__(
32 |             W_fh=linear.Linear(n_units, n_units),
33 |             W_oh=linear.Linear(n_units, n_units),
34 |             W_ch=linear.Linear(n_units, n_units),
35 |             W_fx=linear.Linear(n_inputs, n_units),
36 |             W_ox=linear.Linear(n_inputs, n_units),
37 |             W_cx=linear.Linear(n_inputs, n_units),
38 |         )
39 |
40 | class PeepHoleLSTMBase(link.Chain):
41 |
42 | def __init__(self, n_units, n_inputs=None):
43 | if n_inputs is None:
44 | n_inputs = n_units
45 |         super(PeepHoleLSTMBase, self).__init__(
46 |             W_fh=linear.Linear(n_units, n_units),
47 |             W_fc=linear.Linear(n_units, n_units),
48 |             W_ih=linear.Linear(n_units, n_units),
49 |             W_ic=linear.Linear(n_units, n_units),
50 |             W_oh=linear.Linear(n_units, n_units),
51 |             W_oc=linear.Linear(n_units, n_units),
52 |             W_ch=linear.Linear(n_units, n_units),
53 |             W_fx=linear.Linear(n_inputs, n_units),
54 |             W_ix=linear.Linear(n_inputs, n_units),
55 |             W_ox=linear.Linear(n_inputs, n_units),
56 |             W_cx=linear.Linear(n_inputs, n_units),
57 |         )
58 |
59 | class CoupledForgetPeepHoleLSTMBase(link.Chain):
60 |
61 | def __init__(self, n_units, n_inputs=None):
62 | if n_inputs is None:
63 | n_inputs = n_units
64 |         super(CoupledForgetPeepHoleLSTMBase, self).__init__(
65 |             W_fh=linear.Linear(n_units, n_units),
66 |             W_fc=linear.Linear(n_units, n_units),
67 |             W_oh=linear.Linear(n_units, n_units),
68 |             W_oc=linear.Linear(n_units, n_units),
69 |             W_ch=linear.Linear(n_units, n_units),
70 |             W_fx=linear.Linear(n_inputs, n_units),
71 |             W_ox=linear.Linear(n_inputs, n_units),
72 |             W_cx=linear.Linear(n_inputs, n_units),
73 |         )
74 |
75 | class StatefulLSTM(LSTMBase):
76 |
77 |
78 | def __init__(self, in_size, out_size):
79 | super(StatefulLSTM, self).__init__(out_size, in_size)
80 | self.state_size = out_size
81 | self.reset_state()
82 |
83 | def to_cpu(self):
84 | super(StatefulLSTM, self).to_cpu()
85 | if self.h is not None:
86 | self.h.to_cpu()
87 | if self.c is not None:
88 | self.c.to_cpu()
89 |
90 | def to_gpu(self, device=None):
91 | super(StatefulLSTM, self).to_gpu(device)
92 | if self.c is not None:
93 | self.c.to_gpu(device)
94 | if self.h is not None:
95 | self.h.to_gpu(device)
96 |
97 | def set_state(self, h, c):
98 | assert isinstance(h, chainer.Variable)
99 | assert isinstance(c, chainer.Variable)
100 | h_ = h
101 | c_ = c
102 | if self.xp == numpy:
103 | h_.to_cpu()
104 | c_.to_cpu()
105 | else:
106 | h_.to_gpu()
107 | c_.to_gpu()
108 | self.h = h_
109 | self.c = c_
110 |
111 | def reset_state(self):
112 | self.h = None
113 | self.c = None
114 |
115 | def __call__(self, x):
116 | ft = self.W_fx(x)
117 | it = self.W_ix(x)
118 | ct = self.W_cx(x)
119 | ot = self.W_ox(x)
120 |
121 |         if self.h is not None:
122 |             ft += self.W_fh(self.h)
123 |             it += self.W_ih(self.h)
124 |             ct += self.W_ch(self.h)
125 |             ot += self.W_oh(self.h)
126 |         ft = sigmoid.sigmoid(ft)
127 |         it = sigmoid.sigmoid(it)
128 |         ct = tanh.tanh(ct)
129 |         ot = sigmoid.sigmoid(ot)
130 |
131 |         c = it * ct
132 |         if self.c is not None:
133 |             c += ft * self.c
134 |         self.c = c
135 | self.h = ot * tanh.tanh(self.c)
136 | return self.h
137 |
138 |     def get_state(self):
139 | return self.c
140 |
141 |
142 | class StatelessLSTM(LSTMBase):
143 | def __init__(self, in_size, out_size):
144 | super(StatelessLSTM, self).__init__(out_size, in_size)
145 | self.state_size = out_size
146 |
147 | def __call__(self, x, h, c):
148 | ft = sigmoid.sigmoid(self.W_fx(x) + self.W_fh(h))
149 | it = sigmoid.sigmoid(self.W_ix(x) + self.W_ih(h))
150 | ct = tanh.tanh(self.W_cx(x) + self.W_ch(h))
151 | ot = sigmoid.sigmoid(self.W_ox(x) + self.W_oh(h))
152 | c = ft * c + it * ct
153 | h = ot * tanh.tanh(c)
154 | return h, c
155 |
156 | class StatefulPeepHoleLSTM(PeepHoleLSTMBase):
157 |
158 |
159 | def __init__(self, in_size, out_size):
160 | super(StatefulPeepHoleLSTM, self).__init__(out_size, in_size)
161 | self.state_size = out_size
162 | self.reset_state()
163 |
164 | def to_cpu(self):
165 | super(StatefulPeepHoleLSTM, self).to_cpu()
166 | if self.h is not None:
167 | self.h.to_cpu()
168 | if self.c is not None:
169 | self.c.to_cpu()
170 |
171 | def to_gpu(self, device=None):
172 | super(StatefulPeepHoleLSTM, self).to_gpu(device)
173 | if self.c is not None:
174 | self.c.to_gpu(device)
175 | if self.h is not None:
176 | self.h.to_gpu(device)
177 |
178 | def set_state(self, h, c):
179 | assert isinstance(h, chainer.Variable)
180 | assert isinstance(c, chainer.Variable)
181 | h_ = h
182 | c_ = c
183 | if self.xp == numpy:
184 | h_.to_cpu()
185 | c_.to_cpu()
186 | else:
187 | h_.to_gpu()
188 | c_.to_gpu()
189 | self.h = h_
190 | self.c = c_
191 |
192 | def reset_state(self):
193 | self.h = None
194 | self.c = None
195 |
196 | def __call__(self, x):
197 | ft = self.W_fx(x)
198 | it = self.W_ix(x)
199 | ct = self.W_cx(x)
200 | ot = self.W_ox(x)
201 |
202 |         if self.h is not None and self.c is not None:
203 |             ft += self.W_fh(self.h) + self.W_fc(self.c)
204 |             it += self.W_ih(self.h) + self.W_ic(self.c)
205 |             ct += self.W_ch(self.h)
206 |             ot += self.W_oh(self.h)
207 |         ft = sigmoid.sigmoid(ft)
208 |         it = sigmoid.sigmoid(it)
209 |         ct = tanh.tanh(ct)
210 |         c = it * ct
211 |         if self.c is not None:
212 |             c += ft * self.c
213 |         self.c = c
214 |         # output gate peeps at the updated cell, as in the stateless variant
215 |         ot = sigmoid.sigmoid(ot + self.W_oc(self.c))
216 |         self.h = ot * tanh.tanh(self.c)
217 |         return self.h
218 |
219 |     def get_state(self):
220 | return self.c
221 |
222 |
223 | class StatelessPeepHoleLSTM(PeepHoleLSTMBase):
224 |
225 |
226 | def __init__(self, in_size, out_size):
227 | super(StatelessPeepHoleLSTM, self).__init__(out_size, in_size)
228 | self.state_size = out_size
229 |
230 |
231 | def __call__(self, x, h, c):
232 | ft = sigmoid.sigmoid(self.W_fx(x) + self.W_fh(h) + self.W_fc(c))
233 | it = sigmoid.sigmoid(self.W_ix(x) + self.W_ih(h) + self.W_ic(c))
234 | ct = tanh.tanh(self.W_cx(x) + self.W_ch(h))
235 | c = ft * c + it * ct
236 | ot = sigmoid.sigmoid(self.W_ox(x) + self.W_oh(h) + self.W_oc(c))
237 | h = ot * tanh.tanh(c)
238 | return h, c
239 |
240 | class CoupledForgetStatefulLSTM(CoupledForgetLSTMBase):
241 |
242 |
243 | def __init__(self, in_size, out_size):
244 | super(CoupledForgetStatefulLSTM, self).__init__(out_size, in_size)
245 | self.state_size = out_size
246 | self.reset_state()
247 |
248 | def to_cpu(self):
249 | super(CoupledForgetStatefulLSTM, self).to_cpu()
250 | if self.h is not None:
251 | self.h.to_cpu()
252 | if self.c is not None:
253 | self.c.to_cpu()
254 |
255 | def to_gpu(self, device=None):
256 | super(CoupledForgetStatefulLSTM, self).to_gpu(device)
257 | if self.c is not None:
258 | self.c.to_gpu(device)
259 | if self.h is not None:
260 | self.h.to_gpu(device)
261 |
262 | def set_state(self, h, c):
263 | assert isinstance(h, chainer.Variable)
264 | assert isinstance(c, chainer.Variable)
265 | h_ = h
266 | c_ = c
267 | if self.xp == numpy:
268 | h_.to_cpu()
269 | c_.to_cpu()
270 | else:
271 | h_.to_gpu()
272 | c_.to_gpu()
273 | self.h = h_
274 | self.c = c_
275 |
276 | def reset_state(self):
277 | self.h = None
278 | self.c = None
279 |
280 | def __call__(self, x):
281 | ft = self.W_fx(x)
282 | ct = self.W_cx(x)
283 | ot = self.W_ox(x)
284 |
285 |         if self.h is not None:
286 |             ft += self.W_fh(self.h)
287 |             ct += self.W_ch(self.h)
288 |             ot += self.W_oh(self.h)
289 |         ft = sigmoid.sigmoid(ft)
290 |         ct = tanh.tanh(ct)
291 |         ot = sigmoid.sigmoid(ot)
292 |
293 |         c = (1 - ft) * ct
294 |         if self.c is not None:
295 |             c += ft * self.c
296 |         self.c = c
297 | self.h = ot * tanh.tanh(self.c)
298 | return self.h
299 |
300 |     def get_state(self):
301 | return self.c
302 |
303 |
304 | class CoupledForgetStatelessLSTM(CoupledForgetLSTMBase):
305 | def __init__(self, in_size, out_size):
306 | super(CoupledForgetStatelessLSTM, self).__init__(out_size, in_size)
307 | self.state_size = out_size
308 |
309 | def __call__(self, x, h, c):
310 | ft = sigmoid.sigmoid(self.W_fx(x) + self.W_fh(h))
311 | ct = tanh.tanh(self.W_cx(x) + self.W_ch(h))
312 | ot = sigmoid.sigmoid(self.W_ox(x) + self.W_oh(h))
313 |         c = ft * c + (1 - ft) * ct
314 | h = ot * tanh.tanh(c)
315 | return h, c
316 |
317 | class CoupledForgetStatefulPeepHoleLSTM(CoupledForgetPeepHoleLSTMBase):
318 |
319 |
320 | def __init__(self, in_size, out_size):
321 | super(CoupledForgetStatefulPeepHoleLSTM, self).__init__(out_size, in_size)
322 | self.state_size = out_size
323 | self.reset_state()
324 |
325 | def to_cpu(self):
326 | super(CoupledForgetStatefulPeepHoleLSTM, self).to_cpu()
327 | if self.h is not None:
328 | self.h.to_cpu()
329 | if self.c is not None:
330 | self.c.to_cpu()
331 |
332 | def to_gpu(self, device=None):
333 | super(CoupledForgetStatefulPeepHoleLSTM, self).to_gpu(device)
334 | if self.c is not None:
335 | self.c.to_gpu(device)
336 | if self.h is not None:
337 | self.h.to_gpu(device)
338 |
339 | def set_state(self, h, c):
340 | assert isinstance(h, chainer.Variable)
341 | assert isinstance(c, chainer.Variable)
342 | h_ = h
343 | c_ = c
344 | if self.xp == numpy:
345 | h_.to_cpu()
346 | c_.to_cpu()
347 | else:
348 | h_.to_gpu()
349 | c_.to_gpu()
350 | self.h = h_
351 | self.c = c_
352 |
353 | def reset_state(self):
354 | self.h = None
355 | self.c = None
356 |
357 | def __call__(self, x):
358 | ft = self.W_fx(x)
359 | ct = self.W_cx(x)
360 | ot = self.W_ox(x)
361 |
362 |         if self.h is not None and self.c is not None:
363 |             ft += self.W_fh(self.h) + self.W_fc(self.c)
364 |             ct += self.W_ch(self.h)
365 |             ot += self.W_oh(self.h)
366 |         ft = sigmoid.sigmoid(ft)
367 |         ct = tanh.tanh(ct)
368 |         c = (1 - ft) * ct
369 |         if self.c is not None:
370 |             c += ft * self.c
371 |         self.c = c
372 |         # output gate peeps at the updated cell, as in the stateless variant
373 |         ot = sigmoid.sigmoid(ot + self.W_oc(self.c))
374 |         self.h = ot * tanh.tanh(self.c)
375 |         return self.h
376 |
377 |     def get_state(self):
378 | return self.c
379 |
380 |
381 | class CoupledForgetStatelessPeepHoleLSTM(CoupledForgetPeepHoleLSTMBase):
382 |
383 |
384 | def __init__(self, in_size, out_size):
385 | super(CoupledForgetStatelessPeepHoleLSTM, self).__init__(out_size, in_size)
386 | self.state_size = out_size
387 |
388 |
389 | def __call__(self, x, h, c):
390 | ft = sigmoid.sigmoid(self.W_fx(x) + self.W_fh(h) + self.W_fc(c))
391 | ct = tanh.tanh(self.W_cx(x) + self.W_ch(h))
392 | c = ft * c + (1 - ft) * ct
393 | ot = sigmoid.sigmoid(self.W_ox(x) + self.W_oh(h) + self.W_oc(c))
394 | h = ot * tanh.tanh(c)
395 | return h, c
396 |
397 |
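398 | # Editor's note: a minimal usage sketch (not part of the original file).
399 | # Stateful variants keep (c, h) on the link; stateless ones take and return
400 | # them explicitly. `xs` stands for a hypothetical list of input Variables:
401 | #
402 | #   lstm = StatefulLSTM(in_size=100, out_size=200)
403 | #   lstm.reset_state()
404 | #   for x in xs:
405 | #       h = lstm(x)   # updates lstm.c and lstm.h internally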
--------------------------------------------------------------------------------
/chainer-1.5/attention_lm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy
3 | from argparse import ArgumentParser
4 | from chainer import Chain, Variable, cuda, functions, links, optimizer, optimizers, serializers
5 | import util.generators as gens
6 | from util.functions import trace, fill_batch
7 | from util.vocabulary import Vocabulary
8 | import math
9 | from collections import defaultdict
10 |
11 |
12 | def make_vocab(filename, vocab_size):
13 | word_freq = defaultdict(lambda: 0)
14 | num_lines = 0
15 | num_words = 0
16 | with open(filename) as fp:
17 | for line in fp:
18 | words = line.split()
19 | num_lines += 1
20 | num_words += len(words)
21 | for word in words:
22 | word_freq[word] += 1
23 |
24 |     # 0: <unk>
25 |     # 1: <s>
26 |     # 2: </s>
27 |     vocab = defaultdict(lambda: 0)
28 |     vocab['<s>'] = 1
29 |     vocab['</s>'] = 2
30 | for i,(k,v) in zip(range(vocab_size - 3), sorted(word_freq.items(), key=lambda x: -x[1])):
31 | vocab[k] = i + 3
32 |
33 | return vocab, num_lines, num_words
34 |
35 |
36 | def generate_batch(filename, batch_size):
37 | with open(filename) as fp:
38 | batch = []
39 | try:
40 | while True:
41 | for i in range(batch_size):
42 | batch.append(next(fp).split())
43 |
44 | max_len = max(len(x) for x in batch)
45 |                 batch = [['<s>'] + x + ['</s>'] * (max_len - len(x) + 1) for x in batch]
46 | yield batch
47 |
48 | batch = []
49 |         except StopIteration:
50 |             pass
51 |
52 | if batch:
53 | max_len = max(len(x) for x in batch)
54 |         batch = [['<s>'] + x + ['</s>'] * (max_len - len(x) + 1) for x in batch]
55 | yield batch
56 |
57 |
58 |
59 | def get_data(variable):
60 | #return variable.data
61 | return cuda.to_cpu(variable.data)
62 |
63 | def parse_args():
64 | def_vocab = 40000
65 | def_embed = 200
66 | def_hidden = 200
67 | def_epoch = 10
68 | def_minibatch = 256
69 | def_model = 0
70 | p = ArgumentParser(description='RNNLM trainer')
71 |
72 | p.add_argument('corpus', help='[in] training corpus')
73 | p.add_argument('valid', help='[in] validation corpus')
74 | p.add_argument('model', help='[out] model file')
75 | p.add_argument('-V', '--vocab', default=def_vocab, metavar='INT', type=int,
76 | help='vocabulary size (default: %d)' % def_vocab)
77 | p.add_argument('-E', '--embed', default=def_embed, metavar='INT', type=int,
78 | help='embedding layer size (default: %d)' % def_embed)
79 | p.add_argument('-H', '--hidden', default=def_hidden, metavar='INT', type=int,
80 | help='hidden layer size (default: %d)' % def_hidden)
81 | p.add_argument('-I', '--epoch', default=def_epoch, metavar='INT', type=int,
82 | help='number of training epoch (default: %d)' % def_epoch)
83 | p.add_argument('-B', '--minibatch', default=def_minibatch, metavar='INT', type=int,
84 | help='minibatch size (default: %d)' % def_minibatch)
85 |     # renamed from --model: the option shared its dest with the positional 'model'
86 |     p.add_argument('-M', '--rnn-model', default=def_model, metavar='INT', type=int,
87 |                    help='RNN used for LM (default: %d) where 0: Default RNNLM, 1: LSTM RNNLM, 2: Attention RNNLM' % def_model)
88 |     p.add_argument('--use-gpu', action='store_true', default=False,
89 |                    help='use GPU calculation')
90 |     p.add_argument('--gpu-device', default=0, metavar='INT', type=int,
91 |                    help='GPU device ID to be used (default: %(default)d)')
92 |
88 | args = p.parse_args()
89 |
90 | # check args
91 | try:
92 | if (args.vocab < 1): raise ValueError('you must set --vocab >= 1')
93 | if (args.embed < 1): raise ValueError('you must set --embed >= 1')
94 | if (args.hidden < 1): raise ValueError('you must set --hidden >= 1')
95 | if (args.epoch < 1): raise ValueError('you must set --epoch >= 1')
96 | if (args.minibatch < 1): raise ValueError('you must set --minibatch >= 1')
97 | except Exception as ex:
98 | p.print_usage(file=sys.stderr)
99 | print(ex)
100 | sys.exit()
101 |
102 | return args
103 |
104 |
105 | class XP:
106 | __lib = None
107 |
108 | @staticmethod
109 | def set_library(args):
110 | if args.use_gpu:
111 | XP.__lib = cuda.cupy
112 | cuda.get_device(args.gpu_device).use()
113 | else:
114 | XP.__lib = numpy
115 |
116 | @staticmethod
117 | def __zeros(shape, dtype):
118 | return Variable(XP.__lib.zeros(shape, dtype=dtype))
119 |
120 | @staticmethod
121 | def fzeros(shape):
122 | return XP.__zeros(shape, XP.__lib.float32)
123 |
124 | @staticmethod
125 | def __nonzeros(shape, dtype, val):
126 | return Variable(val * XP.__lib.ones(shape, dtype=dtype))
127 |
128 | @staticmethod
129 | def fnonzeros(shape, val=1):
130 | return XP.__nonzeros(shape, XP.__lib.float32, val)
131 |
132 | @staticmethod
133 | def __array(array, dtype):
134 | return Variable(XP.__lib.array(array, dtype=dtype))
135 |
136 | @staticmethod
137 | def iarray(array):
138 | return XP.__array(array, XP.__lib.int32)
139 |
140 | @staticmethod
141 | def farray(array):
142 | return XP.__array(array, XP.__lib.float32)
143 |
144 | class SrcEmbed(Chain):
145 | def __init__(self, vocab_size, embed_size):
146 | super(SrcEmbed, self).__init__(
147 | xe = links.EmbedID(vocab_size, embed_size),
148 | )
149 |
150 | def __call__(self, x):
151 | return functions.tanh(self.xe(x))
152 |
153 | class BasicRnnLM(Chain):
154 | def __init__(self, embed_size, hidden_size, vocab_size):
155 |         super(BasicRnnLM, self).__init__(
156 | xe = SrcEmbed(vocab_size, embed_size),
157 | eh = links.Linear(embed_size, hidden_size),
158 | hh = links.Linear(hidden_size, hidden_size),
159 | hy = links.Linear(hidden_size, vocab_size),
160 | )
161 |         self.reset()
162 |
163 |     def reset(self):
164 |         self.h = None
165 |
166 | def __call__(self, x):
167 |
168 | e = self.xe(x)
169 | h = self.eh(e)
170 | if self.h is not None:
171 | h += self.hh(self.h)
172 | self.h = functions.tanh(h)
173 | y = self.hy(self.h)
174 | return y
175 |
176 | class LSTMLM(Chain):
177 | def __init__(self, embed_size, hidden_size, vocab_size):
178 |         super(LSTMLM, self).__init__(
179 | xe = SrcEmbed(vocab_size, embed_size),
180 | lstm = links.LSTM(embed_size, hidden_size),
181 | hy = links.Linear(hidden_size, vocab_size),
182 | )
183 |
184 |     def reset(self):
185 |         self.zerograds()
186 |         self.lstm.reset_state()
187 | def __call__(self, x):
188 | e = self.xe(x)
189 | h = self.lstm(e)
190 | y = self.hy(h)
191 | return y
192 |
193 | class LSTMEncoder(Chain):
194 | def __init__(self, embed_size, hidden_size):
195 | super(LSTMEncoder, self).__init__(
196 | lstm = links.LSTM(embed_size, hidden_size),
197 | )
198 |     def reset(self):
199 |         self.zerograds()
200 |         self.lstm.reset_state()
200 | def __call__(self, x):
201 | h = self.lstm(x)
202 | return h
203 |
204 | class Attention(Chain):
205 | def __init__(self, hidden_size, embed_size):
206 | super(Attention, self).__init__(
207 | aw = links.Linear(embed_size, hidden_size),
208 | pw = links.Linear(hidden_size, hidden_size),
209 | we = links.Linear(hidden_size, 1),
210 | )
211 | self.hidden_size = hidden_size
212 |
213 |
214 |
215 | def __call__(self, a_list, p):
216 | batch_size = p.data.shape[0]
217 | e_list = []
218 | sum_e = XP.fzeros((batch_size, 1))
219 | for a in a_list:
220 | w = functions.tanh(self.aw(a) + self.pw(p))
221 | e = functions.exp(self.we(w))
222 | e_list.append(e)
223 | sum_e += e
224 | ZEROS = XP.fzeros((batch_size, self.hidden_size))
225 | aa = ZEROS
226 | for a, e in zip(a_list, e_list):
227 | e /= sum_e
228 | aa += a * e
229 | #aa += functions.reshape(functions.batch_matmul(a, e), (batch_size, self.hidden_size))
230 | return aa
231 |
232 | class AttentionLM(Chain):
233 | def __init__(self, embed_size, hidden_size, vocab_size):
234 |         super(AttentionLM, self).__init__(
235 | emb = SrcEmbed(vocab_size, embed_size),
236 | enc = LSTMEncoder(embed_size, hidden_size),
237 | att = Attention(hidden_size, embed_size),
238 | outhe = links.Linear(hidden_size, hidden_size),
239 | outae = links.Linear(hidden_size, hidden_size),
240 | outey = links.Linear(hidden_size, vocab_size),
241 | )
242 | self.vocab_size = vocab_size
243 | self.embed_size = embed_size
244 | self.hidden_size = hidden_size
245 |
246 |     def reset(self):
247 |         self.zerograds()
248 |         self.enc.reset()
249 |         self.x_list = []
249 |
250 | def embed(self, x):
251 | self.x_list.append(self.emb(x))
252 |
253 | def encode(self, x):
254 | self.h = self.enc(x)
255 |
256 |     def decode(self, atts_list):
257 |         aa = self.att(atts_list, self.h)
258 |         y = functions.tanh(self.outhe(self.h) + self.outae(aa))
259 | return self.outey(y)
260 |
261 | def save_spec(self, filename):
262 | with open(filename, 'w') as fp:
263 | print(self.vocab_size, file=fp)
264 | print(self.embed_size, file=fp)
265 | print(self.hidden_size, file=fp)
266 |
267 | @staticmethod
268 | def load_spec(filename):
269 | with open(filename) as fp:
270 | vocab_size = int(next(fp))
271 | embed_size = int(next(fp))
272 | hidden_size = int(next(fp))
273 | return AttentionLM(embed_size, hidden_size, vocab_size)
274 |
275 | def forward(batch, model, vocab, args):
276 | batch = [[vocab[x] for x in words] for words in batch]
277 | K = len(batch)
278 | L = len(batch[0]) - 1
279 |
280 |     model.zerograds()  # the optimizer is not in scope here; clear gradients via the model
281 |     accum_loss = XP.fzeros(())
282 |     accum_log_ppl = 0.0
283 |
284 |     if args.rnn_model == 0 or args.rnn_model == 1:
285 |
286 | for l in range(L):
287 |             s_x = XP.iarray([batch[k][l] for k in range(K)])
288 |             s_t = XP.iarray([batch[k][l + 1] for k in range(K)])
289 |
290 | s_y = model(s_x)
291 |
292 | loss_i = functions.softmax_cross_entropy(s_y, s_t)
293 | accum_loss += loss_i
294 |
295 |             accum_log_ppl += float(get_data(loss_i))
296 |
297 |
298 |
299 | else:
300 | for l in range(L):
301 |             s_x = XP.iarray([batch[k][l] for k in range(K)])
302 |             model.embed(s_x)
303 |         for l in range(L):
304 |             s_t = XP.iarray([batch[k][l + 1] for k in range(K)])
305 |             model.encode(model.x_list[l])
306 |             s_y = model.decode(model.x_list[0:l] + model.x_list[l + 1:L])
307 |
308 | loss_i = functions.softmax_cross_entropy(s_y, s_t)
309 | accum_loss += loss_i
310 |
311 |             accum_log_ppl += float(get_data(loss_i))
312 |
313 | return accum_loss, accum_log_ppl
314 |
315 |
316 | def main():
317 | args = parse_args()
318 |
319 | trace('making vocabulary ...')
320 | vocab, num_lines, num_words = make_vocab(args.corpus, args.vocab)
321 |
322 |     trace('initializing CUDA ...')
323 |     XP.set_library(args)
324 |
325 | trace('start training ...')
326 |     if args.rnn_model == 0:
327 |         model = BasicRnnLM(args.embed, args.hidden, args.vocab)
328 |     elif args.rnn_model == 1:
329 |         model = LSTMLM(args.embed, args.hidden, args.vocab)
330 |     else:
331 |         model = AttentionLM(args.embed, args.hidden, args.vocab)
332 |     model.reset()
333 |     if args.use_gpu:
334 |         model.to_gpu()
336 |
337 | for epoch in range(args.epoch):
338 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
339 | log_ppl = 0.0
340 | trained = 0
341 |
342 | opt = optimizers.AdaGrad(lr = 0.01)
343 | opt.setup(model)
344 | opt.add_hook(optimizer.GradientClipping(5))
345 |
346 | for batch in generate_batch(args.corpus, args.minibatch):
347 | K = len(batch)
348 |             loss, perplexity = forward(batch, model, vocab, args)
349 | loss.backward()
350 | log_ppl += perplexity
351 | opt.update()
352 | trained += K
353 | model.reset()
354 |
355 | trace(' %d/%d' % (trained, num_lines))
356 | log_ppl /= float(num_words)
357 | trace('Train log(PPL) = %.10f' % log_ppl)
358 | trace('Train PPL = %.10f' % math.exp(log_ppl))
359 |
360 | log_ppl = 0.0
361 |
362 | for batch in generate_batch(args.valid, args.minibatch):
363 | K = len(batch)
364 |             loss, perplexity = forward(batch, model, vocab, args)
365 | log_ppl += perplexity
366 | model.reset()
367 |
368 | trace('Valid log(PPL) = %.10f' % log_ppl)
369 | trace('Valid PPL = %.10f' % math.exp(log_ppl))
370 |
371 |         trace('saving model ...')
372 |         prefix = 'RNNLM-' + str(args.rnn_model) + '.%03.d' % (epoch + 1)
373 |         save_vocab(prefix + '.srcvocab', vocab)
375 | model.save_spec(prefix + '.spec')
376 | serializers.save_hdf5(prefix + '.weights', model)
377 |
378 | trace('training finished.')
379 |
380 |
381 | def save_vocab(filename, vocab):
382 |     with open(filename, 'w') as fp:
383 |         for k, v in vocab.items():
384 |             if v == 0:
385 |                 continue
386 |             print('%s %d' % (k, v), file=fp)
387 |
388 |
389 | if __name__ == '__main__':
390 |     main()
391 |
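392 | # Editor's note: an example invocation (not part of the original script);
393 | # corpus and model names are hypothetical. -M/--rnn-model picks the model
394 | # (0: basic RNN, 1: LSTM, 2: attention):
395 | #
396 | #   python attention_lm.py train.txt valid.txt rnnlm -M 2 -B 256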
--------------------------------------------------------------------------------
/chainer-1.5/mt_s2s_attention.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy
3 | from argparse import ArgumentParser
4 | from chainer import Chain, ChainList, Variable, cuda, functions, links, optimizer, optimizers, serializers
5 | import util.generators as gens
6 | from util.functions import trace, fill_batch
7 | from util.vocabulary import Vocabulary
8 |
9 | def parse_args():
10 | def_gpu_device = 0
11 | def_vocab = 1000
12 | def_embed = 100
13 | def_hidden = 200
14 | def_epoch = 10
15 | def_minibatch = 64
16 | def_generation_limit = 128
17 |
18 | p = ArgumentParser(
19 |         description='Attentional neural machine translation',
20 | usage=
21 | '\n %(prog)s train [options] source target model'
22 | '\n %(prog)s test source target model'
23 | '\n %(prog)s -h',
24 | )
25 |
26 |
27 | p.add_argument('mode', help='\'train\' or \'test\'')
28 | p.add_argument('source', help='[in] source corpus')
29 | p.add_argument('target', help='[in/out] target corpus')
30 | p.add_argument('model', help='[in/out] model file')
31 | p.add_argument('--use-gpu', action='store_true', default=False,
32 | help='use GPU calculation')
33 | p.add_argument('--gpu-device', default=def_gpu_device, metavar='INT', type=int,
34 | help='GPU device ID to be used (default: %(default)d)')
35 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int,
36 | help='vocabulary size (default: %(default)d)')
37 | p.add_argument('--embed', default=def_embed, metavar='INT', type=int,
38 | help='embedding layer size (default: %(default)d)')
39 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int,
40 | help='hidden layer size (default: %(default)d)')
41 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int,
42 | help='number of training epoch (default: %(default)d)')
43 | p.add_argument('--minibatch', default=def_minibatch, metavar='INT', type=int,
44 | help='minibatch size (default: %(default)d)')
45 | p.add_argument('--generation-limit', default=def_generation_limit, metavar='INT', type=int,
46 | help='maximum number of words to be generated for test input (default: %(default)d)')
47 |
48 | args = p.parse_args()
49 |
50 | # check args
51 | try:
52 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'')
53 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1')
54 | if args.embed < 1: raise ValueError('you must set --embed >= 1')
55 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1')
56 | if args.epoch < 1: raise ValueError('you must set --epoch >= 1')
57 | if args.minibatch < 1: raise ValueError('you must set --minibatch >= 1')
58 | if args.generation_limit < 1: raise ValueError('you must set --generation-limit >= 1')
59 | except Exception as ex:
60 | p.print_usage(file=sys.stderr)
61 | print(ex, file=sys.stderr)
62 | sys.exit()
63 |
64 | return args
65 |
66 | class XP:
67 | __lib = None
68 |
69 | @staticmethod
70 | def set_library(args):
71 | if args.use_gpu:
72 | XP.__lib = cuda.cupy
73 | cuda.get_device(args.gpu_device).use()
74 | else:
75 | XP.__lib = numpy
76 |
77 | @staticmethod
78 | def __zeros(shape, dtype):
79 | return Variable(XP.__lib.zeros(shape, dtype=dtype))
80 |
81 | @staticmethod
82 | def fzeros(shape):
83 | return XP.__zeros(shape, XP.__lib.float32)
84 |
85 | @staticmethod
86 | def __nonzeros(shape, dtype, val):
87 | return Variable(val * XP.__lib.ones(shape, dtype=dtype))
88 |
89 | @staticmethod
90 | def fnonzeros(shape, val=1):
91 | return XP.__nonzeros(shape, XP.__lib.float32, val)
92 |
93 | @staticmethod
94 | def __array(array, dtype):
95 | return Variable(XP.__lib.array(array, dtype=dtype))
96 |
97 | @staticmethod
98 | def iarray(array):
99 | return XP.__array(array, XP.__lib.int32)
100 |
101 | @staticmethod
102 | def farray(array):
103 | return XP.__array(array, XP.__lib.float32)
104 |
105 | class SrcEmbed(Chain):
106 | def __init__(self, vocab_size, embed_size):
107 | super(SrcEmbed, self).__init__(
108 | xe = links.EmbedID(vocab_size, embed_size),
109 | )
110 |
111 | def __call__(self, x):
112 | return functions.tanh(self.xe(x))
113 |
114 |
115 |
116 |
117 | class MultiLayerStatefulLSTMEncoder(ChainList):
118 | """
119 | This is an implementation of a Multilayered Stateful LSTM.
120 | The underlying idea is to simply stack multiple LSTMs where the LSTM at the bottom takes the regular input,
121 |     and the LSTMs after that simply take the outputs (represented by h) of the previous LSTMs as inputs.
122 |     This is simply an analogous version of the Multilayered Stateless LSTM Encoder where the LSTM states are kept hidden.
123 |     This LSTM is to be called only by passing the input (x).
124 |     To access the cell states, call the "get_states" method, which returns the cell state (c) of each of the num_layers layers.
125 |     Although the cell outputs for each layer are returned, typically only the output of the topmost layer is used for various purposes like attention.
126 | Note that in Tensorflow the concept of "number of attention heads" is used which probably points to attention using the output of each layer.
127 |
128 | Args:
129 | embed_size - The size of embeddings of the inputs
130 | hidden_size - The size of the hidden layer representation of the RNN
131 | num_layers - The number of layers of the RNN (Indicates the number of RNNS stacked on top of each other)
132 |
133 | Attributes:
134 | num_layers: Indicates the number of layers in the RNN
135 | User Defined Methods:
136 | get_states: This simply returns the latest cell states (c) as an array for all layers.
137 |
138 | """
139 |
140 | def __init__(self, embed_size, hidden_size, num_layers):
141 | super(MultiLayerStatefulLSTMEncoder, self).__init__()
142 | self.add_link(links.LSTM(embed_size,hidden_size))
143 | for i in range(1, num_layers):
144 | self.add_link(links.LSTM(hidden_size, hidden_size))
145 | self.num_layers = num_layers
146 |
147 | def __call__(self, x):
148 | """
149 | Updates the internal state and returns the RNN outputs for each layer as a list.
150 |
151 | Args:
152 | x : A new batch from the input sequence.
153 |
154 | Returns:
155 | A list of the outputs (h) of updated RNN units over all the layers.
156 |
157 | """
158 | h_list = []
159 | h_curr = self[0](x)
160 | h_list.append(h_curr)
161 | for i in range(1,self.num_layers):
162 |             h_curr = self[i](h_curr)
163 | h_list.append(h_curr)
164 | return h_list
165 |
166 |     def get_states(self):
167 | c_list = []
168 | for i in range(self.num_layers):
169 | c_list.append(self[i].c)
170 | return c_list
171 |
172 | class MultiLayerStatelessLSTMEncoder(ChainList):
173 | """
174 | This is an implementation of a Multilayered Stateless LSTM.
175 | The underlying idea is to simply stack multiple LSTMs where the LSTM at the bottom takes the regular input,
176 |     and the LSTMs after that simply take the outputs (represented by h) of the previous LSTMs as inputs.
177 |     This is simply an analogous version of the Multilayered Stateful LSTM Encoder where the LSTM states are not hidden.
178 |     You have to pass the previous cell states (c) and outputs (h) along with the input (x) when calling the LSTM.
179 |     Although the cell outputs for each layer are returned, typically only the output of the topmost layer is used for various purposes like attention.
180 | Note that in Tensorflow the concept of "number of attention heads" is used which probably points to attention using the output of each layer.
181 |
182 | Args:
183 | embed_size - The size of embeddings of the inputs
184 | hidden_size - The size of the hidden layer representation of the RNN
185 | num_layers - The number of layers of the RNN (Indicates the number of RNNS stacked on top of each other)
186 |
187 | Attributes:
188 | num_layers: Indicates the number of layers in the RNN
189 | User Defined Methods:
190 |
191 | """
192 | def __init__(self, embed_size, hidden_size, num_layers):
193 | super(MultiLayerStatelessLSTMEncoder, self).__init__()
194 |
195 | self.add_link(links.Linear(embed_size, 4 * hidden_size))
196 | self.add_link(links.Linear(hidden_size, 4 * hidden_size))
197 | for i in range(1,num_layers):
198 | self.add_link(links.Linear(hidden_size, 4 * hidden_size))
199 | self.add_link(links.Linear(hidden_size, 4 * hidden_size))
200 | self.num_layers = num_layers
201 | def __call__(self, x, c, h):
202 | """
203 | Updates the internal state and returns the RNN outputs for each layer as a list.
204 |
205 | Args:
206 | x : A new batch from the input sequence.
207 | c : The list of the previous cell states.
208 | h : The list of the previous cell outputs.
209 | Returns:
210 | A list of the outputs (h) and another of the states (c) of the updated RNN units over all the layers.
211 |
212 | """
213 | c_list = []
214 | h_list = []
215 | c_curr, h_curr = functions.lstm(c[0], self[0](x) + self[1](h[0]))
216 | c_list.append(c_curr)
217 | h_list.append(h_curr)
218 | for i in range(1,self.num_layers):
219 |             c_curr, h_curr = functions.lstm(c[i], self[2 * i](h_curr) + self[2 * i + 1](h[i]))  # links 2i, 2i+1 belong to layer i
220 | c_list.append(c_curr)
221 | h_list.append(h_curr)
222 | return c_list, h_list
223 |
224 | class MultiLayerGRUEncoder(ChainList):
225 | """
226 | This is an implementation of a Multilayered Stateless GRU.
227 | The underlying idea is to simply stack multiple GRUs where the GRU at the bottom takes the regular input,
228 | and the GRUs after that simply take the outputs (represented by h) of the previous GRUs as inputs.
229 | You have to pass the previous cell outputs (h) along with the input (x) when calling the LSTM.
230 |     The implementation for the Stateful GRU just saves the cell state, and thus its multilayered version won't be implemented unless demanded.
231 |
232 | Args:
233 | embed_size - The size of embeddings of the inputs
234 | hidden_size - The size of the hidden layer representation of the RNN
235 | num_layers - The number of layers of the RNN (Indicates the number of RNNS stacked on top of each other)
236 |
237 | Attributes:
238 | num_layers: Indicates the number of layers in the RNN
239 | User Defined Methods:
240 |
241 | """
242 |
243 | def __init__(self, embed_size, hidden_size, num_layers):
244 | super(MultiLayerGRUEncoder, self).__init__()
245 | self.add_link(links.GRU(hidden_size,embed_size))
246 |         for i in range(1, num_layers):
247 | self.add_link(links.GRU(hidden_size,hidden_size))
248 | self.num_layers = num_layers
249 |
250 | def __call__(self, x, h):
251 | """
252 | Updates the internal state and returns the RNN outputs for each layer as a list.
253 |
254 | Args:
255 | x : A new batch from the input sequence.
256 | h : The list of the previous cell outputs.
257 | Returns:
258 | A list of the outputs (h) of the updated RNN units over all the layers.
259 |
260 | """
261 | h_list = []
262 | h_curr = self[0](h[0], x)
263 | h_list.append(h_curr)
264 | for i in range(1,self.num_layers):
265 | h_curr = self[i](h[i], h_curr)
266 | h_list.append(h_curr)
267 | return h_list
268 |
269 |
270 | class GRUEncoder(Chain):
271 |
272 | """
273 | This is just the same Encoder as below.
274 | The only difference is that the RNN cell is a GRU.
275 |
276 |
277 | Args:
278 | embed_size - The size of embeddings of the inputs
279 | hidden_size - The size of the hidden layer representation of the RNN
280 |
281 |
282 | Attributes:
283 |
284 | User Defined Methods:
285 |
286 | """
287 |
288 | def __init__(self, embed_size, hidden_size):
289 |         super(GRUEncoder, self).__init__(
290 | GRU = links.GRU(embed_size, hidden_size),
291 | )
292 |
293 | def __call__(self, x):
294 | """
295 | Updates the internal state and returns the RNN output (h).
296 | Note that for a GRU the internal state is the same as the output. (c and h are the same)
297 |
298 | Args:
299 | x : A new batch from the input sequence.
300 |
301 | Returns:
302 | The output (h) of updated RNN unit.
303 |
304 | """
305 | return self.GRU(x)
306 |
307 | class StatefulEncoder(Chain):
308 |
309 | """
310 | This is just the same Encoder as below.
311 | The only difference is that the LSTM class implementation is used instead of the LSTM function.
312 | Instead of explicitly defining the LSTM components, the LSTM class encapsulates these components making the Encoder look simpler.
313 |
314 | Args:
315 | embed_size - The size of embeddings of the inputs
316 | hidden_size - The size of the hidden layer representation of the RNN
317 |
318 |
319 | Attributes:
320 |
321 | User Defined Methods:
322 | get_state: This simply returns the latest cell state (c).
323 | """
324 |
325 | def __init__(self, embed_size, hidden_size):
326 |         super(StatefulEncoder, self).__init__(
327 | LSTM = links.LSTM(embed_size, hidden_size),
328 | )
329 |
330 | def __call__(self, x):
331 | """
332 | Updates the internal state and returns the RNN output (h).
333 |
334 | Args:
335 | x : A new batch from the input sequence.
336 |
337 | Returns:
338 | The output (h) of updated RNN unit.
339 |
340 | """
341 | return self.LSTM(x)
342 |
343 |     def get_state(self):
344 | return self.LSTM.c
345 |
346 | class StateLessEncoder(Chain):
347 | """
348 | This is just the same Encoder as below. The name is changed for the sake of disambiguation.
349 | The LSTM components are explicitly defined and the LSTM function is used in place of the LSTM class.
350 |
351 | Args:
352 | embed_size - The size of embeddings of the inputs
353 | hidden_size - The size of the hidden layer representation of the RNN
354 |
355 |
356 | Attributes:
357 |
358 | User Defined Methods:
359 | """
360 | def __init__(self, embed_size, hidden_size):
361 |         super(StateLessEncoder, self).__init__(
362 | xh = links.Linear(embed_size, 4 * hidden_size),
363 | hh = links.Linear(hidden_size, 4 * hidden_size),
364 | )
365 |
366 | def __call__(self, x, c, h):
367 | """
368 | Updates the internal state and returns the RNN outputs for each layer as a list.
369 |
370 | Args:
371 | x : A new batch from the input sequence.
372 | c : The previous cell state.
373 | h : The previous cell output.
374 | Returns:
375 | The output (h) and the state (c) of the updated RNN unit.
376 |
377 | """
378 | return functions.lstm(c, self.xh(x) + self.hh(h))
379 |
380 | class Encoder(Chain):
381 | def __init__(self, embed_size, hidden_size):
382 | super(Encoder, self).__init__(
383 | xh = links.Linear(embed_size, 4 * hidden_size),
384 | hh = links.Linear(hidden_size, 4 * hidden_size),
385 | )
386 |
387 | def __call__(self, x, c, h):
388 | return functions.lstm(c, self.xh(x) + self.hh(h))
389 |
390 | class Attention(Chain):
391 | def __init__(self, hidden_size):
392 | super(Attention, self).__init__(
393 | aw = links.Linear(hidden_size, hidden_size),
394 | bw = links.Linear(hidden_size, hidden_size),
395 | pw = links.Linear(hidden_size, hidden_size),
396 | we = links.Linear(hidden_size, 1),
397 | )
398 | self.hidden_size = hidden_size
399 |
400 | def __call__(self, a_list, b_list, p):
401 | batch_size = p.data.shape[0]
402 | e_list = []
403 | sum_e = XP.fzeros((batch_size, 1))
404 | for a, b in zip(a_list, b_list):
405 | w = functions.tanh(self.aw(a) + self.bw(b) + self.pw(p))
406 | e = functions.exp(self.we(w))
407 | e_list.append(e)
408 | sum_e += e
409 | ZEROS = XP.fzeros((batch_size, self.hidden_size))
410 | aa = ZEROS
411 | bb = ZEROS
412 | for a, b, e in zip(a_list, b_list, e_list):
413 | e /= sum_e
414 | aa += functions.reshape(functions.batch_matmul(a, e), (batch_size, self.hidden_size))
415 | bb += functions.reshape(functions.batch_matmul(b, e), (batch_size, self.hidden_size))
416 | return aa, bb
417 |
418 | class LocalAttention(Chain):
419 | def __init__(self, hidden_size):
420 |         super(LocalAttention, self).__init__(
421 | aw = links.Linear(hidden_size, hidden_size),
422 | bw = links.Linear(hidden_size, hidden_size),
423 | pw = links.Linear(hidden_size, hidden_size),
424 | we = links.Linear(hidden_size, 1),
425 | ts = links.Linear(hidden_size, hidden_size),
426 | sp = links.Linear(hidden_size, 1),
427 | )
428 | self.hidden_size = hidden_size
429 |
430 | def __call__(self, a_list, b_list, p, sentence_length, window_size):
431 | batch_size = p.data.shape[0]
432 | SENTENCE_LENGTH = XP.fnonzeros((batch_size, 1),sentence_length)
433 | e_list = []
434 | sum_e = XP.fzeros((batch_size, 1))
435 | s = functions.tanh(self.ts(p))
436 | pos = SENTENCE_LENGTH * functions.sigmoid(self.sp(s))
437 |
438 | # Develop batch logic to set to zero the components of a and b which are out of the window
439 | # Big question: Do I have to iterate over each element in the batch? That would suck.
440 | # One logic: Get global alignment matrix of (batch x) hidden size x sentence length and then another matrix of (batch x) sentence length which
441 |         # will essentially be a matrix containing the gaussian distribution weight and there will be zeros where the sentence position falls out of the window
442 | # Another logic: Create a matrix of (batch x) sentence length where there will be 1 for each position in the window
443 |
444 | # Separate the attention weights for a and b cause forward is different from backward.
445 |
446 | for a, b in zip(a_list, b_list):
447 | w = functions.tanh(self.aw(a) + self.bw(b) + self.pw(p))
448 | e = functions.exp(self.we(w))
449 | e_list.append(e)
450 | sum_e += e
451 | ZEROS = XP.fzeros((batch_size, self.hidden_size))
452 | aa = ZEROS
453 | bb = ZEROS
454 | for a, b, e in zip(a_list, b_list, e_list):
455 | e /= sum_e
456 | aa += a * e
457 | bb += b * e
458 | return aa, bb
459 |
460 |
461 | class Decoder(Chain):
462 | def __init__(self, vocab_size, embed_size, hidden_size):
463 | super(Decoder, self).__init__(
464 | ye = links.EmbedID(vocab_size, embed_size),
465 | eh = links.Linear(embed_size, 4 * hidden_size),
466 | hh = links.Linear(hidden_size, 4 * hidden_size),
467 | ah = links.Linear(hidden_size, 4 * hidden_size),
468 | bh = links.Linear(hidden_size, 4 * hidden_size),
469 | hf = links.Linear(hidden_size, embed_size),
470 | fy = links.Linear(embed_size, vocab_size),
471 | )
472 |
473 | def __call__(self, y, c, h, a, b):
474 | e = functions.tanh(self.ye(y))
475 | c, h = functions.lstm(c, self.eh(e) + self.hh(h) + self.ah(a) + self.bh(b))
476 | f = functions.tanh(self.hf(h))
477 | return self.fy(f), c, h
478 |
479 | class AttentionMT(Chain):
480 | def __init__(self, vocab_size, embed_size, hidden_size):
481 | super(AttentionMT, self).__init__(
482 | emb = SrcEmbed(vocab_size, embed_size),
483 | fenc = Encoder(embed_size, hidden_size),
484 | benc = Encoder(embed_size, hidden_size),
485 | att = Attention(hidden_size),
486 | dec = Decoder(vocab_size, embed_size, hidden_size),
487 | )
488 | self.vocab_size = vocab_size
489 | self.embed_size = embed_size
490 | self.hidden_size = hidden_size
491 |
492 | def reset(self, batch_size):
493 | self.zerograds()
494 | self.x_list = []
495 |
496 | def embed(self, x):
497 | self.x_list.append(self.emb(x))
498 |
499 | def encode(self):
500 | src_len = len(self.x_list)
501 | batch_size = self.x_list[0].data.shape[0]
502 | ZEROS = XP.fzeros((batch_size, self.hidden_size))
503 | c = ZEROS
504 | a = ZEROS
505 | a_list = []
506 | for x in self.x_list:
507 | c, a = self.fenc(x, c, a)
508 | a_list.append(a)
509 | c = ZEROS
510 | b = ZEROS
511 | b_list = []
512 | for x in reversed(self.x_list):
513 | c, b = self.benc(x, c, b)
514 | b_list.insert(0, b)
515 | self.a_list = a_list
516 | self.b_list = b_list
517 | self.c = ZEROS
518 | self.h = ZEROS
519 |
520 | def decode(self, y):
521 | aa, bb = self.att(self.a_list, self.b_list, self.h)
522 | y, self.c, self.h = self.dec(y, self.c, self.h, aa, bb)
523 | return y
524 |
525 | def save_spec(self, filename):
526 | with open(filename, 'w') as fp:
527 | print(self.vocab_size, file=fp)
528 | print(self.embed_size, file=fp)
529 | print(self.hidden_size, file=fp)
530 |
531 | @staticmethod
532 | def load_spec(filename):
533 | with open(filename) as fp:
534 | vocab_size = int(next(fp))
535 | embed_size = int(next(fp))
536 | hidden_size = int(next(fp))
537 | return AttentionMT(vocab_size, embed_size, hidden_size)
538 |
539 | def forward(src_batch, trg_batch, src_vocab, trg_vocab, attmt, is_training, generation_limit):
540 | batch_size = len(src_batch)
541 | src_len = len(src_batch[0])
542 | trg_len = len(trg_batch[0]) if trg_batch else 0
543 | src_stoi = src_vocab.stoi
544 | trg_stoi = trg_vocab.stoi
545 | trg_itos = trg_vocab.itos
546 | attmt.reset(batch_size)
547 |
548 |     x = XP.iarray([src_stoi('<s>') for _ in range(batch_size)])
549 | attmt.embed(x)
550 | for l in range(src_len):
551 | x = XP.iarray([src_stoi(src_batch[k][l]) for k in range(batch_size)])
552 | attmt.embed(x)
553 |     x = XP.iarray([src_stoi('</s>') for _ in range(batch_size)])
554 | attmt.embed(x)
555 |
556 | attmt.encode()
557 |
558 |     t = XP.iarray([trg_stoi('<s>') for _ in range(batch_size)])
559 | hyp_batch = [[] for _ in range(batch_size)]
560 |
561 | if is_training:
562 | loss = XP.fzeros(())
563 | for l in range(trg_len):
564 | y = attmt.decode(t)
565 | t = XP.iarray([trg_stoi(trg_batch[k][l]) for k in range(batch_size)])
566 | loss += functions.softmax_cross_entropy(y, t)
567 | output = cuda.to_cpu(y.data.argmax(1))
568 | for k in range(batch_size):
569 | hyp_batch[k].append(trg_itos(output[k]))
570 | return hyp_batch, loss
571 |
572 | else:
573 | while len(hyp_batch[0]) < generation_limit:
574 | y = attmt.decode(t)
575 | output = cuda.to_cpu(y.data.argmax(1))
576 | t = XP.iarray(output)
577 | for k in range(batch_size):
578 | hyp_batch[k].append(trg_itos(output[k]))
579 |             if all(hyp_batch[k][-1] == '</s>' for k in range(batch_size)):
580 | break
581 |
582 | return hyp_batch
583 |
584 | def train(args):
585 | trace('making vocabularies ...')
586 | src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
587 | trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)
588 |
589 | trace('making model ...')
590 | attmt = AttentionMT(args.vocab, args.embed, args.hidden)
591 | if args.use_gpu:
592 | attmt.to_gpu()
593 |
594 | for epoch in range(args.epoch):
595 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
596 | trained = 0
597 | gen1 = gens.word_list(args.source)
598 | gen2 = gens.word_list(args.target)
599 | gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch), args.minibatch)
600 | opt = optimizers.AdaGrad(lr = 0.01)
601 | opt.setup(attmt)
602 | opt.add_hook(optimizer.GradientClipping(5))
603 |
604 | for src_batch, trg_batch in gen3:
605 | src_batch = fill_batch(src_batch)
606 | trg_batch = fill_batch(trg_batch)
607 | K = len(src_batch)
608 | hyp_batch, loss = forward(src_batch, trg_batch, src_vocab, trg_vocab, attmt, True, 0)
609 | loss.backward()
610 | opt.update()
611 |
612 | for k in range(K):
613 | trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
614 |                 trace('  src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[k]]))
615 |                 trace('  trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
616 |                 trace('  hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))
617 |
618 | trained += K
619 |
620 | trace('saving model ...')
621 | prefix = args.model + '.%03.d' % (epoch + 1)
622 | src_vocab.save(prefix + '.srcvocab')
623 | trg_vocab.save(prefix + '.trgvocab')
624 | attmt.save_spec(prefix + '.spec')
625 | serializers.save_hdf5(prefix + '.weights', attmt)
626 |
627 | trace('finished.')
628 |
629 | def test(args):
630 | trace('loading model ...')
631 | src_vocab = Vocabulary.load(args.model + '.srcvocab')
632 | trg_vocab = Vocabulary.load(args.model + '.trgvocab')
633 | attmt = AttentionMT.load_spec(args.model + '.spec')
634 | if args.use_gpu:
635 | attmt.to_gpu()
636 | serializers.load_hdf5(args.model + '.weights', attmt)
637 |
638 | trace('generating translation ...')
639 | generated = 0
640 |
641 | with open(args.target, 'w') as fp:
642 | for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
643 | src_batch = fill_batch(src_batch)
644 | K = len(src_batch)
645 |
646 | trace('sample %8d - %8d ...' % (generated + 1, generated + K))
647 | hyp_batch = forward(src_batch, None, src_vocab, trg_vocab, attmt, False, args.generation_limit)
648 |
649 | for hyp in hyp_batch:
650 |                 hyp.append('</s>')
651 |                 hyp = hyp[:hyp.index('</s>')]
652 | print(' '.join(hyp), file=fp)
653 |
654 | generated += K
655 |
656 | trace('finished.')
657 |
658 | def main():
659 | args = parse_args()
660 | XP.set_library(args)
661 | if args.mode == 'train': train(args)
662 | elif args.mode == 'test': test(args)
663 |
664 | if __name__ == '__main__':
665 | main()
666 |
667 |
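668 | # Editor's note: example invocations (not part of the original script);
669 | # corpus and model names are hypothetical. Training saves the vocabularies,
670 | # spec and weights per epoch under a numbered prefix; test mode loads them:
671 | #
672 | #   python mt_s2s_attention.py train src.txt trg.txt model --use-gpu
673 | #   python mt_s2s_attention.py test test.src test.hyp <saved model prefix>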
--------------------------------------------------------------------------------
/chainer-1.5/mt_s2s_encdec.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy
3 | from argparse import ArgumentParser
4 | from chainer import Chain, Variable, cuda, functions, links, optimizer, optimizers, serializers
5 | import util.generators as gens
6 | from util.functions import trace, fill_batch
7 | from util.vocabulary import Vocabulary
8 |
9 | def parse_args():
10 | def_gpu_device = 0
11 | def_vocab = 1000
12 | def_embed = 100
13 | def_hidden = 200
14 | def_epoch = 10
15 | def_minibatch = 64
16 | def_generation_limit = 128
17 |
18 | p = ArgumentParser(
19 |         description='Encoder-decoder neural machine translation',
20 | usage=
21 | '\n %(prog)s train [options] source target model'
22 | '\n %(prog)s test source target model'
23 | '\n %(prog)s -h',
24 | )
25 |
26 | p.add_argument('mode', help='\'train\' or \'test\'')
27 | p.add_argument('source', help='[in] source corpus')
28 | p.add_argument('target', help='[in/out] target corpus')
29 | p.add_argument('model', help='[in/out] model file')
30 | p.add_argument('--use-gpu', action='store_true', default=False,
31 | help='use GPU calculation')
32 | p.add_argument('--gpu-device', default=def_gpu_device, metavar='INT', type=int,
33 | help='GPU device ID to be used (default: %(default)d)')
34 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int,
35 | help='vocabulary size (default: %(default)d)')
36 | p.add_argument('--embed', default=def_embed, metavar='INT', type=int,
37 | help='embedding layer size (default: %(default)d)')
38 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int,
39 | help='hidden layer size (default: %(default)d)')
40 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int,
41 | help='number of training epoch (default: %(default)d)')
42 | p.add_argument('--minibatch', default=def_minibatch, metavar='INT', type=int,
43 | help='minibatch size (default: %(default)d)')
44 | p.add_argument('--generation-limit', default=def_generation_limit, metavar='INT', type=int,
45 | help='maximum number of words to be generated for test input (default: %(default)d)')
46 |
47 | args = p.parse_args()
48 |
49 | # check args
50 | try:
51 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'')
52 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1')
53 | if args.embed < 1: raise ValueError('you must set --embed >= 1')
54 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1')
55 | if args.epoch < 1: raise ValueError('you must set --epoch >= 1')
56 | if args.minibatch < 1: raise ValueError('you must set --minibatch >= 1')
57 | if args.generation_limit < 1: raise ValueError('you must set --generation-limit >= 1')
58 | except Exception as ex:
59 | p.print_usage(file=sys.stderr)
60 | print(ex, file=sys.stderr)
61 | sys.exit()
62 |
63 | return args
64 |
65 | class XP:
66 | __lib = None
67 |
68 | @staticmethod
69 | def set_library(args):
70 | if args.use_gpu:
71 | XP.__lib = cuda.cupy
72 | cuda.get_device(args.gpu_device).use()
73 | else:
74 | XP.__lib = numpy
75 |
76 | @staticmethod
77 | def __zeros(shape, dtype):
78 | return Variable(XP.__lib.zeros(shape, dtype=dtype))
79 |
80 | @staticmethod
81 | def fzeros(shape):
82 | return XP.__zeros(shape, XP.__lib.float32)
83 |
84 | @staticmethod
85 | def __array(array, dtype):
86 | return Variable(XP.__lib.array(array, dtype=dtype))
87 |
88 | @staticmethod
89 | def iarray(array):
90 | return XP.__array(array, XP.__lib.int32)
91 |
92 | @staticmethod
93 | def farray(array):
94 | return XP.__array(array, XP.__lib.float32)
95 |
96 | class Encoder(Chain):
97 | def __init__(self, vocab_size, embed_size, hidden_size):
98 | super(Encoder, self).__init__(
99 | xe = links.EmbedID(vocab_size, embed_size),
100 | eh = links.Linear(embed_size, 4 * hidden_size),
101 | hh = links.Linear(hidden_size, 4 * hidden_size),
102 | )
103 |
104 | def __call__(self, x, c, h):
105 | e = functions.tanh(self.xe(x))
106 | return functions.lstm(c, self.eh(e) + self.hh(h))
107 |
108 | class Decoder(Chain):
109 | def __init__(self, vocab_size, embed_size, hidden_size):
110 | super(Decoder, self).__init__(
111 | ye = links.EmbedID(vocab_size, embed_size),
112 | eh = links.Linear(embed_size, 4 * hidden_size),
113 | hh = links.Linear(hidden_size, 4 * hidden_size),
114 | hf = links.Linear(hidden_size, embed_size),
115 | fy = links.Linear(embed_size, vocab_size),
116 | )
117 |
118 | def __call__(self, y, c, h):
119 | e = functions.tanh(self.ye(y))
120 | c, h = functions.lstm(c, self.eh(e) + self.hh(h))
121 | f = functions.tanh(self.hf(h))
122 | return self.fy(f), c, h
123 |
124 | class EncoderDecoder(Chain):
125 | def __init__(self, vocab_size, embed_size, hidden_size):
126 | super(EncoderDecoder, self).__init__(
127 | enc = Encoder(vocab_size, embed_size, hidden_size),
128 | dec = Decoder(vocab_size, embed_size, hidden_size),
129 | )
130 | self.vocab_size = vocab_size
131 | self.embed_size = embed_size
132 | self.hidden_size = hidden_size
133 |
134 | def reset(self, batch_size):
135 | self.zerograds()
136 | self.c = XP.fzeros((batch_size, self.hidden_size))
137 | self.h = XP.fzeros((batch_size, self.hidden_size))
138 |
139 | def encode(self, x):
140 | self.c, self.h = self.enc(x, self.c, self.h)
141 |
142 | def decode(self, y):
143 | y, self.c, self.h = self.dec(y, self.c, self.h)
144 | return y
145 |
146 | def save_spec(self, filename):
147 | with open(filename, 'w') as fp:
148 | print(self.vocab_size, file=fp)
149 | print(self.embed_size, file=fp)
150 | print(self.hidden_size, file=fp)
151 |
152 | @staticmethod
153 | def load_spec(filename):
154 | with open(filename) as fp:
155 | vocab_size = int(next(fp))
156 | embed_size = int(next(fp))
157 | hidden_size = int(next(fp))
158 | return EncoderDecoder(vocab_size, embed_size, hidden_size)
159 |
160 | def forward(src_batch, trg_batch, src_vocab, trg_vocab, encdec, is_training, generation_limit):
161 | batch_size = len(src_batch)
162 | src_len = len(src_batch[0])
163 | trg_len = len(trg_batch[0]) if trg_batch else 0
164 | src_stoi = src_vocab.stoi
165 | trg_stoi = trg_vocab.stoi
166 | trg_itos = trg_vocab.itos
167 | encdec.reset(batch_size)
168 |
169 |     x = XP.iarray([src_stoi('</s>') for _ in range(batch_size)])
170 | encdec.encode(x)
171 | for l in reversed(range(src_len)):
172 | x = XP.iarray([src_stoi(src_batch[k][l]) for k in range(batch_size)])
173 | encdec.encode(x)
174 |
175 |     t = XP.iarray([trg_stoi('<s>') for _ in range(batch_size)])
176 | hyp_batch = [[] for _ in range(batch_size)]
177 |
178 | if is_training:
179 | loss = XP.fzeros(())
180 | for l in range(trg_len):
181 | y = encdec.decode(t)
182 | t = XP.iarray([trg_stoi(trg_batch[k][l]) for k in range(batch_size)])
183 | loss += functions.softmax_cross_entropy(y, t)
184 | output = cuda.to_cpu(y.data.argmax(1))
185 | for k in range(batch_size):
186 | hyp_batch[k].append(trg_itos(output[k]))
187 | return hyp_batch, loss
188 |
189 | else:
190 | while len(hyp_batch[0]) < generation_limit:
191 | y = encdec.decode(t)
192 | output = cuda.to_cpu(y.data.argmax(1))
193 | t = XP.iarray(output)
194 | for k in range(batch_size):
195 | hyp_batch[k].append(trg_itos(output[k]))
196 |             if all(hyp_batch[k][-1] == '</s>' for k in range(batch_size)):
197 | break
198 |
199 | return hyp_batch
200 |
201 | def train(args):
202 | trace('making vocabularies ...')
203 | src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
204 | trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)
205 |
206 | trace('making model ...')
207 | encdec = EncoderDecoder(args.vocab, args.embed, args.hidden)
208 | if args.use_gpu:
209 | encdec.to_gpu()
210 |
211 | for epoch in range(args.epoch):
212 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
213 | trained = 0
214 | gen1 = gens.word_list(args.source)
215 | gen2 = gens.word_list(args.target)
216 | gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch), args.minibatch)
217 | opt = optimizers.AdaGrad(lr = 0.01)
218 | opt.setup(encdec)
219 | opt.add_hook(optimizer.GradientClipping(5))
220 |
221 | for src_batch, trg_batch in gen3:
222 | src_batch = fill_batch(src_batch)
223 | trg_batch = fill_batch(trg_batch)
224 | K = len(src_batch)
225 | hyp_batch, loss = forward(src_batch, trg_batch, src_vocab, trg_vocab, encdec, True, 0)
226 | loss.backward()
227 | opt.update()
228 |
229 | for k in range(K):
230 | trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
231 |                 trace('  src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[k]]))
232 |                 trace('  trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
233 |                 trace('  hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))
234 |
235 | trained += K
236 |
237 | trace('saving model ...')
238 | prefix = args.model + '.%03.d' % (epoch + 1)
239 | src_vocab.save(prefix + '.srcvocab')
240 | trg_vocab.save(prefix + '.trgvocab')
241 | encdec.save_spec(prefix + '.spec')
242 | serializers.save_hdf5(prefix + '.weights', encdec)
243 |
244 | trace('finished.')
245 |
246 | def test(args):
247 | trace('loading model ...')
248 | src_vocab = Vocabulary.load(args.model + '.srcvocab')
249 | trg_vocab = Vocabulary.load(args.model + '.trgvocab')
250 | encdec = EncoderDecoder.load_spec(args.model + '.spec')
251 | if args.use_gpu:
252 | encdec.to_gpu()
253 | serializers.load_hdf5(args.model + '.weights', encdec)
254 |
255 | trace('generating translation ...')
256 | generated = 0
257 |
258 | with open(args.target, 'w') as fp:
259 | for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
260 | src_batch = fill_batch(src_batch)
261 | K = len(src_batch)
262 |
263 | trace('sample %8d - %8d ...' % (generated + 1, generated + K))
264 | hyp_batch = forward(src_batch, None, src_vocab, trg_vocab, encdec, False, args.generation_limit)
265 |
266 | for hyp in hyp_batch:
267 | hyp.append('</s>')
268 | hyp = hyp[:hyp.index('</s>')]
269 | print(' '.join(hyp), file=fp)
270 |
271 | generated += K
272 |
273 | trace('finished.')
274 |
275 | def main():
276 | args = parse_args()
277 | XP.set_library(args)
278 | if args.mode == 'train': train(args)
279 | elif args.mode == 'test': test(args)
280 |
281 | if __name__ == '__main__':
282 | main()
283 |
284 |
--------------------------------------------------------------------------------
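A note on the training loop in mt_s2s_encdec.py above: `loss.backward()` accumulates gradients in Chainer, and no gradient-clearing call appears before it, so gradients from successive minibatches would add up. A minimal corrected training step, assuming Chainer 1.5's `Link.zerograds()`, might look like:

    hyp_batch, loss = forward(src_batch, trg_batch, src_vocab, trg_vocab, encdec, True, 0)
    encdec.zerograds()  # clear gradients left over from the previous minibatch
    loss.backward()
    opt.update()

Note also that the optimizer is re-created at the top of each epoch, which resets AdaGrad's accumulated statistics every epoch.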
/chainer-1.5/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/odashi/chainer_examples/b13ec64e5035b1eb75b873431786d880577b7370/chainer-1.5/util/__init__.py
--------------------------------------------------------------------------------
/chainer-1.5/util/functions.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import datetime
3 |
4 | def trace(*args):
5 | print(datetime.datetime.now(), '...', *args, file=sys.stderr)
6 | sys.stderr.flush()
7 |
8 | def fill_batch(batch, token='</s>'):
9 | max_len = max(len(x) for x in batch)
10 | return [x + [token] * (max_len - len(x) + 1) for x in batch]
11 |
12 | def fill_batch2(batch, start_token='<s>', end_token='</s>'):
13 | max_len = max(len(x) for x in batch)
14 | return [[start_token] + x + [end_token] * (max_len - len(x) + 1) for x in batch]
15 |
16 |
--------------------------------------------------------------------------------
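In fill_batch above, every sequence is padded to the longest length in the batch plus one, using '</s>' as the pad token, so each sequence ends with at least one end-of-sentence marker. A quick check with made-up data (run from the chainer-1.5 directory so that util is importable):

    from util.functions import fill_batch

    batch = [['a', 'b'], ['a', 'b', 'c']]
    print(fill_batch(batch))
    # [['a', 'b', '</s>', '</s>'], ['a', 'b', 'c', '</s>']]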
/chainer-1.5/util/generators.py:
--------------------------------------------------------------------------------
1 | def batch(generator, batch_size):
2 | batch = []
3 | is_tuple = False
4 | for l in generator:
5 | is_tuple = isinstance(l, tuple)
6 | batch.append(l)
7 | if len(batch) == batch_size:
8 | yield tuple(list(x) for x in zip(*batch)) if is_tuple else batch
9 | batch = []
10 | if batch:
11 | yield tuple(list(x) for x in zip(*batch)) if is_tuple else batch
12 |
13 | def sorted_parallel(generator1, generator2, pooling, order=1):
14 | gen1 = batch(generator1, pooling)
15 | gen2 = batch(generator2, pooling)
16 | for batch1, batch2 in zip(gen1, gen2):
17 | #yield from sorted(zip(batch1, batch2), key=lambda x: len(x[1]))
18 | for x in sorted(zip(batch1, batch2), key=lambda x: len(x[order])):
19 | yield x
20 |
21 | def word_list(filename):
22 | with open(filename) as fp:
23 | for l in fp:
24 | yield l.split()
25 |
26 | def letter_list(filename):
27 | with open(filename) as fp:
28 | for l in fp:
29 | yield list(''.join(l.split()))
30 |
31 |
--------------------------------------------------------------------------------
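sorted_parallel above pools `pooling` parallel sentence pairs, sorts the pool by target-side length (order=1 by default), and re-yields the pairs one by one; wrapping it in batch then produces minibatches of similar-length sentences, which keeps padding to a minimum. A small illustration with made-up data:

    from util.generators import batch, sorted_parallel

    src = [['a'], ['b', 'b', 'b'], ['c', 'c']]
    trg = [['x'], ['y', 'y', 'y'], ['z', 'z']]
    for src_batch, trg_batch in batch(sorted_parallel(src, trg, 3), 2):
        print(src_batch, trg_batch)
    # [['a'], ['c', 'c']] [['x'], ['z', 'z']]
    # [['b', 'b', 'b']] [['y', 'y', 'y']]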
/chainer-1.5/util/vocabulary.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 |
4 | class Vocabulary:
5 | def __init__(self):
6 | pass
7 |
8 | def __len__(self):
9 | return self.__size
10 |
11 | def stoi(self, s):
12 | return self.__stoi[s]
13 |
14 | def itos(self, i):
15 | return self.__itos[i]
16 |
17 | @staticmethod
18 | def new(list_generator, size):
19 | self = Vocabulary()
20 | self.__size = size
21 |
22 | word_freq = defaultdict(lambda: 0)
23 | for words in list_generator:
24 | for word in words:
25 | word_freq[word] += 1
26 |
27 | self.__stoi = defaultdict(lambda: 0)
28 | self.__stoi['<unk>'] = 0
29 | self.__stoi['<s>'] = 1
30 | self.__stoi['</s>'] = 2
31 | self.__itos = [''] * self.__size
32 | self.__itos[0] = '<unk>'
33 | self.__itos[1] = '<s>'
34 | self.__itos[2] = '</s>'
35 |
36 | for i, (k, v) in zip(range(self.__size - 3), sorted(word_freq.items(), key=lambda x: -x[1])):
37 | self.__stoi[k] = i + 3
38 | self.__itos[i + 3] = k
39 |
40 | return self
41 |
42 | def save(self, filename):
43 | with open(filename, 'w') as fp:
44 | print(self.__size, file=fp)
45 | for i in range(self.__size):
46 | print(self.__itos[i], file=fp)
47 |
48 | @staticmethod
49 | def load(filename):
50 | with open(filename) as fp:
51 | self = Vocabulary()
52 | self.__size = int(next(fp))
53 | self.__stoi = defaultdict(lambda: 0)
54 | self.__itos = [''] * self.__size
55 | for i in range(self.__size):
56 | s = next(fp).strip()
57 | if s:
58 | self.__stoi[s] = i
59 | self.__itos[i] = s
60 |
61 | return self
62 |
63 |
--------------------------------------------------------------------------------
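Vocabulary.new above reserves ids 0, 1, and 2 for '<unk>', '<s>', and '</s>', assigns the remaining size - 3 ids to the most frequent words, and maps every out-of-vocabulary word to id 0 through the defaultdict fallback in stoi. A usage sketch with made-up data:

    from util.vocabulary import Vocabulary

    vocab = Vocabulary.new([['the', 'cat'], ['the', 'dog']], 8)
    print(vocab.stoi('the'))      # 3 (the most frequent word gets the first free id)
    print(vocab.stoi('penguin'))  # 0 (unknown words map to <unk>)
    print(vocab.itos(2))          # </s>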