├── .gitignore
├── LICENSE
├── README.md
├── chainer-1.4
│   ├── lm_rnn.py
│   ├── mt_s2s_attention.py
│   ├── mt_s2s_encdec.py
│   ├── seg_ffnn.py
│   ├── seg_rnn.py
│   └── util
│       ├── __init__.py
│       ├── chainer_cpu_wrapper.py
│       ├── chainer_gpu_wrapper.py
│       ├── functions.py
│       ├── generators.py
│       ├── model_file.py
│       └── vocabulary.py
└── chainer-1.5
    ├── LSTMVariants.py
    ├── attention_lm.py
    ├── mt_s2s_attention.py
    ├── mt_s2s_encdec.py
    └── util
        ├── __init__.py
        ├── functions.py
        ├── generators.py
        └── vocabulary.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__ 2 | data 3 | hyp 4 | model 5 | nohup.out 6 | test 7 | my_settings.py
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | chainer_examples License 2 | () 3 | 4 | Copyright (c) 2015~ Yusuke Oda 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Chainer example code for NLP 2 | ============================ 3 | 4 | **This repository is out of date and rough. I do not guarantee that this code works correctly.** 5 | 6 | **I am developing a new NMT toolkit [NMTKit](https://github.com/odashi/nmtkit) and strongly recommend using it instead of these samples to train neural translation models.** 7 | 8 | This repository contains some neural network examples 9 | for natural language processing (NLP) 10 | using the **Chainer** framework.
11 | 12 | [Chainer Official](http://chainer.org/ "Chainer official") ([GitHub](https://github.com/pfnet/chainer "Github")) 13 | 14 | Making a Local Environment 15 | -------------------------- 16 | 17 | Before running these scripts, creating a local Python environment using `pyenv` is 18 | recommended: 19 | 20 | $ pyenv install 3.5.0 21 | $ pyenv virtualenv 3.5.0 example 22 | $ pyenv shell example 23 | $ pip install chainer 24 | 25 | Contents 26 | -------- 27 | 28 | * **Machine Translation** 29 | * `mt_s2s_encdec.py` - Using encoder-decoder style recurrent neural network 30 | * `mt_s2s_attention.py` - Using attentional neural network 31 | 32 | * **Word Segmentation (Tokenization)** 33 | * `seg_ffnn.py` - Using feedforward neural network 34 | * `seg_rnn.py` - Using recurrent neural network 35 | 36 | * **Language Model** 37 | * `lm_rnn.py` - Using recurrent neural network (RNNLM) 38 | 39 | Contact 40 | ------- 41 | 42 | If you find an issue or have any questions, please contact Yusuke Oda: 43 | * @odashi_t on Twitter (faster than other methods) 44 | * yus.takara (at) gmail.com 45 | 46 |
--------------------------------------------------------------------------------
/chainer-1.4/lm_rnn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3 2 | 3 | import my_settings 4 | 5 | import datetime 6 | import sys 7 | import math 8 | import numpy as np 9 | from argparse import ArgumentParser 10 | from collections import defaultdict 11 | 12 | from chainer import FunctionSet, Variable, cuda, functions, optimizers 13 | 14 | 15 | def trace(text): 16 | print(datetime.datetime.now(), '...', text, file=sys.stderr) 17 | 18 | 19 | def make_var(array, dtype=np.float32): 20 | #return Variable(np.array(array, dtype=dtype)) 21 | return Variable(cuda.to_gpu(np.array(array, dtype=dtype))) 22 | 23 | def get_data(variable): 24 | #return variable.data 25 | return cuda.to_cpu(variable.data) 26 | 27 | def zeros(shape, dtype=np.float32): 28 | #return Variable(np.zeros(shape, dtype=dtype)) 29 | return Variable(cuda.zeros(shape, dtype=dtype)) 30 | 31 | def make_model(**kwargs): 32 | #return FunctionSet(**kwargs) 33 | return FunctionSet(**kwargs).to_gpu() 34 | 35 | 36 | def make_vocab(filename, vocab_size): 37 | word_freq = defaultdict(lambda: 0) 38 | num_lines = 0 39 | num_words = 0 40 | with open(filename) as fp: 41 | for line in fp: 42 | words = line.split() 43 | num_lines += 1 44 | num_words += len(words) 45 | for word in words: 46 | word_freq[word] += 1 47 | 48 | # 0: unk 49 | # 1: <s> 50 | # 2: </s> 51 | vocab = defaultdict(lambda: 0) 52 | vocab['<s>'] = 1 53 | vocab['</s>'] = 2 54 | for i,(k,v) in zip(range(vocab_size - 3), sorted(word_freq.items(), key=lambda x: -x[1])): 55 | vocab[k] = i + 3 56 | 57 | return vocab, num_lines, num_words 58 | 59 | 60 | def generate_batch(filename, batch_size): 61 | with open(filename) as fp: 62 | batch = [] 63 | try: 64 | while True: 65 | for i in range(batch_size): 66 | batch.append(next(fp).split()) 67 | 68 | max_len = max(len(x) for x in batch) 69 | batch = [['<s>'] + x + ['</s>'] * (max_len - len(x) + 1) for x in batch] 70 | yield batch 71 | 72 | batch = [] 73 | except StopIteration: 74 | pass 75 | 76 | if batch: 77 | max_len = max(len(x) for x in batch) 78 | batch = [['<s>'] + x + ['</s>'] * (max_len - len(x) + 1) for x in batch] 79 | yield batch 80 | 81 | 82 | def make_rnnlm_model(n_vocab, n_embed, n_hidden): 83 | return make_model( 84 | w_xe = functions.EmbedID(n_vocab, n_embed), 85 | w_eh = functions.Linear(n_embed, n_hidden), 86 | w_hh = functions.Linear(n_hidden, n_hidden),
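# A quick map of the recurrence these four parameter sets implement (see the
# training loop in main() below); with x_t the current word id, roughly:
#   e_t = tanh(w_xe(x_t))                   # word embedding
#   h_t = tanh(w_eh(e_t) + w_hh(h_{t-1}))   # Elman-style recurrent state
#   y_t = w_hy(h_t)                         # logits over the vocabulary
# softmax_cross_entropy(y_t, target) then scores the next-word prediction.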
w_hy = functions.Linear(n_hidden, n_vocab), 88 | ) 89 | 90 | 91 | def save_rnnlm_model(filename, n_vocab, n_embed, n_hidden, vocab, model): 92 | fmt = '%.8e' 93 | dlm = ' ' 94 | 95 | model.to_cpu() 96 | 97 | with open(filename, 'w') as fp: 98 | print(n_vocab, file=fp) 99 | print(n_embed, file=fp) 100 | print(n_hidden, file=fp) 101 | 102 | for k, v in vocab.items(): 103 | if v == 0: 104 | continue 105 | print('%s %d' % (k, v), file=fp) 106 | 107 | for row in model.w_xe.W: 108 | print(dlm.join(fmt % x for x in row), file=fp) 109 | 110 | for row in model.w_eh.W: 111 | print(dlm.join(fmt % x for x in row), file=fp) 112 | print(dlm.join(fmt % x for x in model.w_eh.b), file=fp) 113 | 114 | for row in model.w_hh.W: 115 | print(dlm.join(fmt % x for x in row), file=fp) 116 | print(dlm.join(fmt % x for x in model.w_hh.b), file=fp) 117 | 118 | for row in model.w_hy.W: 119 | print(dlm.join(fmt % x for x in row), file=fp) 120 | print(dlm.join(fmt % x for x in model.w_hy.b), file=fp) 121 | 122 | model.to_gpu() 123 | 124 | 125 | def parse_args(): 126 | def_vocab = 40000 127 | def_embed = 200 128 | def_hidden = 200 129 | def_epoch = 10 130 | def_minibatch = 256 131 | 132 | p = ArgumentParser(description='RNNLM trainer') 133 | 134 | p.add_argument('corpus', help='[in] training corpus') 135 | p.add_argument('model', help='[out] model file') 136 | p.add_argument('-V', '--vocab', default=def_vocab, metavar='INT', type=int, 137 | help='vocabulary size (default: %d)' % def_vocab) 138 | p.add_argument('-E', '--embed', default=def_embed, metavar='INT', type=int, 139 | help='embedding layer size (default: %d)' % def_embed) 140 | p.add_argument('-H', '--hidden', default=def_hidden, metavar='INT', type=int, 141 | help='hidden layer size (default: %d)' % def_hidden) 142 | p.add_argument('-I', '--epoch', default=def_epoch, metavar='INT', type=int, 143 | help='number of training epoch (default: %d)' % def_epoch) 144 | p.add_argument('-B', '--minibatch', default=def_minibatch, metavar='INT', type=int, 145 | help='minibatch size (default: %d)' % def_minibatch) 146 | 147 | args = p.parse_args() 148 | 149 | # check args 150 | try: 151 | if (args.vocab < 1): raise ValueError('you must set --vocab >= 1') 152 | if (args.embed < 1): raise ValueError('you must set --embed >= 1') 153 | if (args.hidden < 1): raise ValueError('you must set --hidden >= 1') 154 | if (args.epoch < 1): raise ValueError('you must set --epoch >= 1') 155 | if (args.minibatch < 1): raise ValueError('you must set --minibatch >= 1') 156 | except Exception as ex: 157 | p.print_usage(file=sys.stderr) 158 | print(ex, file=sys.stderr) 159 | sys.exit() 160 | 161 | return args 162 | 163 | 164 | def main(): 165 | args = parse_args() 166 | 167 | trace('making vocabulary ...') 168 | vocab, num_lines, num_words = make_vocab(args.corpus, args.vocab) 169 | 170 | trace('initializing CUDA ...') 171 | cuda.init() 172 | 173 | trace('start training ...') 174 | model = make_rnnlm_model(args.vocab, args.embed, args.hidden) 175 | 176 | for epoch in range(args.epoch): 177 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch)) 178 | log_ppl = 0.0 179 | trained = 0 180 | 181 | opt = optimizers.SGD() 182 | opt.setup(model) 183 | 184 | for batch in generate_batch(args.corpus, args.minibatch): 185 | batch = [[vocab[x] for x in words] for words in batch] 186 | K = len(batch) 187 | L = len(batch[0]) - 1 188 | 189 | opt.zero_grads() 190 | s_h = zeros((K, args.hidden)) 191 | 192 | for l in range(L): 193 | s_x = make_var([batch[k][l] for k in range(K)], dtype=np.int32) 194 | s_t = 
make_var([batch[k][l + 1] for k in range(K)], dtype=np.int32) 195 | 196 | s_e = functions.tanh(model.w_xe(s_x)) 197 | s_h = functions.tanh(model.w_eh(s_e) + model.w_hh(s_h)) 198 | s_y = model.w_hy(s_h) 199 | 200 | loss = functions.softmax_cross_entropy(s_y, s_t) 201 | loss.backward() 202 | 203 | log_ppl += get_data(loss).reshape(()) * K 204 | 205 | opt.update() 206 | trained += K 207 | trace(' %d/%d' % (trained, num_lines)) 208 | 209 | log_ppl /= float(num_words) 210 | trace(' log(PPL) = %.10f' % log_ppl) 211 | trace(' PPL = %.10f' % math.exp(log_ppl)) 212 | 213 | trace(' writing model ...') 214 | save_rnnlm_model(args.model + '.%d' % (epoch + 1), args.vocab, args.embed, args.hidden, vocab, model) 215 | 216 | trace('training finished.') 217 | 218 | 219 | if __name__ == '__main__': 220 | main() 221 | 222 | -------------------------------------------------------------------------------- /chainer-1.4/mt_s2s_attention.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import my_settings 4 | 5 | import sys 6 | import math 7 | import numpy as np 8 | from argparse import ArgumentParser 9 | 10 | from chainer import functions, optimizers 11 | import chainer.computational_graph as cg 12 | 13 | import util.generators as gens 14 | from util.functions import trace, fill_batch2 15 | from util.model_file import ModelFile 16 | from util.vocabulary import Vocabulary 17 | 18 | #from util.chainer_cpu_wrapper import wrapper 19 | from util.chainer_gpu_wrapper import wrapper 20 | 21 | 22 | class AttentionalTranslationModel: 23 | def __init__(self): 24 | pass 25 | 26 | def __make_model(self): 27 | self.__model = wrapper.make_model( 28 | # input embedding 29 | w_xi = functions.EmbedID(len(self.__src_vocab), self.__n_embed), 30 | # forward encoder 31 | w_ia = functions.Linear(self.__n_embed, 4 * self.__n_hidden), 32 | w_aa = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 33 | # backward encoder 34 | w_ib = functions.Linear(self.__n_embed, 4 * self.__n_hidden), 35 | w_bb = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 36 | # attentional weight estimator 37 | w_aw = functions.Linear(self.__n_hidden, self.__n_hidden), 38 | w_bw = functions.Linear(self.__n_hidden, self.__n_hidden), 39 | w_pw = functions.Linear(self.__n_hidden, self.__n_hidden), 40 | w_we = functions.Linear(self.__n_hidden, 1), 41 | # decoder 42 | w_ap = functions.Linear(self.__n_hidden, self.__n_hidden), 43 | w_bp = functions.Linear(self.__n_hidden, self.__n_hidden), 44 | w_yp = functions.EmbedID(len(self.__trg_vocab), 4 * self.__n_hidden), 45 | w_pp = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 46 | w_cp = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 47 | w_dp = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 48 | w_py = functions.Linear(self.__n_hidden, len(self.__trg_vocab)), 49 | ) 50 | 51 | @staticmethod 52 | def new(src_vocab, trg_vocab, n_embed, n_hidden): 53 | self = AttentionalTranslationModel() 54 | self.__src_vocab = src_vocab 55 | self.__trg_vocab = trg_vocab 56 | self.__n_embed = n_embed 57 | self.__n_hidden = n_hidden 58 | self.__make_model() 59 | return self 60 | 61 | def save(self, filename): 62 | with ModelFile(filename, 'w') as fp: 63 | self.__src_vocab.save(fp.get_file_pointer()) 64 | self.__trg_vocab.save(fp.get_file_pointer()) 65 | fp.write(self.__n_embed) 66 | fp.write(self.__n_hidden) 67 | wrapper.begin_model_access(self.__model) 68 | fp.write_embed(self.__model.w_xi) 69 | fp.write_linear(self.__model.w_ia) 70 | 
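# Serialization note: the write_* sequence here mirrors __make_model above and
# must stay in sync with the read_* sequence in load() below. Roughly: w_xi is
# the shared source embedding; (w_ia, w_aa) and (w_ib, w_bb) are the forward
# and backward encoder LSTMs; (w_aw, w_bw, w_pw, w_we) compute attention
# scores; the remaining weights drive the decoder LSTM and the output
# projection w_py.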
fp.write_linear(self.__model.w_aa) 71 | fp.write_linear(self.__model.w_ib) 72 | fp.write_linear(self.__model.w_bb) 73 | fp.write_linear(self.__model.w_aw) 74 | fp.write_linear(self.__model.w_bw) 75 | fp.write_linear(self.__model.w_pw) 76 | fp.write_linear(self.__model.w_we) 77 | fp.write_linear(self.__model.w_ap) 78 | fp.write_linear(self.__model.w_bp) 79 | fp.write_embed(self.__model.w_yp) 80 | fp.write_linear(self.__model.w_pp) 81 | fp.write_linear(self.__model.w_cp) 82 | fp.write_linear(self.__model.w_dp) 83 | fp.write_linear(self.__model.w_py) 84 | wrapper.end_model_access(self.__model) 85 | 86 | @staticmethod 87 | def load(filename): 88 | self = AttentionalTranslationModel() 89 | with ModelFile(filename) as fp: 90 | self.__src_vocab = Vocabulary.load(fp.get_file_pointer()) 91 | self.__trg_vocab = Vocabulary.load(fp.get_file_pointer()) 92 | self.__n_embed = int(fp.read()) 93 | self.__n_hidden = int(fp.read()) 94 | self.__make_model() 95 | wrapper.begin_model_access(self.__model) 96 | fp.read_embed(self.__model.w_xi) 97 | fp.read_linear(self.__model.w_ia) 98 | fp.read_linear(self.__model.w_aa) 99 | fp.read_linear(self.__model.w_ib) 100 | fp.read_linear(self.__model.w_bb) 101 | fp.read_linear(self.__model.w_aw) 102 | fp.read_linear(self.__model.w_bw) 103 | fp.read_linear(self.__model.w_pw) 104 | fp.read_linear(self.__model.w_we) 105 | fp.read_linear(self.__model.w_ap) 106 | fp.read_linear(self.__model.w_bp) 107 | fp.read_embed(self.__model.w_yp) 108 | fp.read_linear(self.__model.w_pp) 109 | fp.read_linear(self.__model.w_cp) 110 | fp.read_linear(self.__model.w_dp) 111 | fp.read_linear(self.__model.w_py) 112 | wrapper.end_model_access(self.__model) 113 | return self 114 | 115 | def init_optimizer(self): 116 | self.__opt = optimizers.AdaGrad(lr=0.01) 117 | self.__opt.setup(self.__model) 118 | 119 | def __forward(self, is_training, src_batch, trg_batch = None, generation_limit = None): 120 | m = self.__model 121 | tanh = functions.tanh 122 | lstm = functions.lstm 123 | batch_size = len(src_batch) 124 | hidden_size = self.__n_hidden 125 | src_len = len(src_batch[0]) 126 | trg_len = len(trg_batch[0]) - 1 if is_training else generation_limit 127 | src_stoi = self.__src_vocab.stoi 128 | trg_stoi = self.__trg_vocab.stoi 129 | trg_itos = self.__trg_vocab.itos 130 | 131 | hidden_zeros = wrapper.zeros((batch_size, hidden_size)) 132 | sum_e_zeros = wrapper.zeros((batch_size, 1)) 133 | 134 | # make embedding 135 | list_x = [] 136 | for l in range(src_len): 137 | s_x = wrapper.make_var([src_stoi(src_batch[k][l]) for k in range(batch_size)], dtype=np.int32) 138 | list_x.append(s_x) 139 | 140 | # forward encoding 141 | c = hidden_zeros 142 | s_a = hidden_zeros 143 | list_a = [] 144 | for l in range(src_len): 145 | s_x = list_x[l] 146 | s_i = tanh(m.w_xi(s_x)) 147 | c, s_a = lstm(c, m.w_ia(s_i) + m.w_aa(s_a)) 148 | list_a.append(s_a) 149 | 150 | # backward encoding 151 | c = hidden_zeros 152 | s_b = hidden_zeros 153 | list_b = [] 154 | for l in reversed(range(src_len)): 155 | s_x = list_x[l] 156 | s_i = tanh(m.w_xi(s_x)) 157 | c, s_b = lstm(c, m.w_ib(s_i) + m.w_bb(s_b)) 158 | list_b.insert(0, s_b) 159 | 160 | # decoding 161 | c = hidden_zeros 162 | s_p = tanh(m.w_ap(list_a[-1]) + m.w_bp(list_b[0])) 163 | s_y = wrapper.make_var([trg_stoi('') for k in range(batch_size)], dtype=np.int32) 164 | 165 | hyp_batch = [[] for _ in range(batch_size)] 166 | accum_loss = wrapper.zeros(()) if is_training else None 167 | 168 | #for n in range(src_len): 169 | # print(src_batch[0][n], end=' ') 170 | #print() 171 | 172 | 
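# Soft attention over the bidirectional encoder states: for each target
# position, the loop below scores every source position n as
#   e_n = exp(w_we(tanh(w_aw(a_n) + w_bw(b_n) + w_pw(p))))
# normalizes it to alpha_n = e_n / sum_m e_m, and forms the context vectors
#   c = sum_n alpha_n * a_n  and  d = sum_n alpha_n * b_n.
# The decoder LSTM input then combines the previous output word (w_yp), its
# own state (w_pp), and both context vectors (w_cp, w_dp).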
for l in range(trg_len): 173 | # calculate attention weights 174 | list_e = [] 175 | sum_e = sum_e_zeros 176 | for n in range(src_len): 177 | s_w = tanh(m.w_aw(list_a[n]) + m.w_bw(list_b[n]) + m.w_pw(s_p)) 178 | r_e = functions.exp(m.w_we(s_w)) 179 | #list_e.append(functions.concat(r_e for _ in range(self.__n_hidden))) 180 | list_e.append(r_e) 181 | sum_e += r_e 182 | #sum_e = functions.concat(sum_e for _ in range(self.__n_hidden)) 183 | 184 | # make attention vector 185 | s_c = hidden_zeros 186 | s_d = hidden_zeros 187 | for n in range(src_len): 188 | s_e = list_e[n] / sum_e 189 | #s_c += s_e * list_a[n] 190 | #s_d += s_e * list_b[n] 191 | s_c += functions.reshape(functions.batch_matmul(list_a[n], s_e), (batch_size, hidden_size)) 192 | s_d += functions.reshape(functions.batch_matmul(list_b[n], s_e), (batch_size, hidden_size)) 193 | 194 | #zxcv = wrapper.get_data(s_e)[0][0] 195 | #if zxcv > 0.9: asdf='#' 196 | #elif zxcv > 0.7: asdf='*' 197 | #elif zxcv > 0.3: asdf='+' 198 | #elif zxcv > 0.1: asdf='.' 199 | #else: asdf=' ' 200 | #print(asdf * len(src_batch[0][n]), end=' ') 201 | 202 | # generate next word 203 | c, s_p = lstm(c, m.w_yp(s_y) + m.w_pp(s_p) + m.w_cp(s_c) + m.w_dp(s_d)) 204 | r_y = m.w_py(s_p) 205 | output = wrapper.get_data(r_y).argmax(1) 206 | for k in range(batch_size): 207 | hyp_batch[k].append(trg_itos(output[k])) 208 | 209 | #print(hyp_batch[0][-1]) 210 | 211 | if is_training: 212 | s_t = wrapper.make_var([trg_stoi(trg_batch[k][l + 1]) for k in range(batch_size)], dtype=np.int32) 213 | accum_loss += functions.softmax_cross_entropy(r_y, s_t) 214 | s_y = s_t 215 | else: 216 | if all(hyp_batch[k][-1] == '' for k in range(batch_size)): break 217 | s_y = wrapper.make_var(output, dtype=np.int32) 218 | 219 | return hyp_batch, accum_loss 220 | 221 | def train(self, src_batch, trg_batch): 222 | self.__opt.zero_grads() 223 | hyp_batch, accum_loss = self.__forward(True, src_batch, trg_batch=trg_batch) 224 | #g = cg.build_computational_graph([accum_loss]) 225 | #with open('asdf', 'w') as fp: fp.write(g.dump()) 226 | #sys.exit() 227 | accum_loss.backward() 228 | self.__opt.clip_grads(10) 229 | self.__opt.update() 230 | return hyp_batch 231 | 232 | def predict(self, src_batch, generation_limit): 233 | return self.__forward(False, src_batch, generation_limit=generation_limit)[0] 234 | 235 | 236 | def parse_args(): 237 | def_vocab = 32768 238 | def_embed = 256 239 | def_hidden = 512 240 | def_epoch = 100 241 | def_minibatch = 64 242 | def_generation_limit = 256 243 | 244 | p = ArgumentParser(description='Attentional neural machine translation') 245 | 246 | p.add_argument('mode', help='\'train\' or \'test\'') 247 | p.add_argument('source', help='[in] source corpus') 248 | p.add_argument('target', help='[in/out] target corpus') 249 | p.add_argument('model', help='[in/out] model file') 250 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int, 251 | help='vocabulary size (default: %d)' % def_vocab) 252 | p.add_argument('--embed', default=def_embed, metavar='INT', type=int, 253 | help='embedding layer size (default: %d)' % def_embed) 254 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int, 255 | help='hidden layer size (default: %d)' % def_hidden) 256 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int, 257 | help='number of training epoch (default: %d)' % def_epoch) 258 | p.add_argument('--minibatch', default=def_minibatch, metavar='INT', type=int, 259 | help='minibatch size (default: %d)' % def_minibatch) 260 | 
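# Typical invocations (file names here are illustrative only; corpora are
# whitespace-tokenized, one sentence per line, source/target line-aligned):
#   ./mt_s2s_attention.py train train.src train.trg model
#   ./mt_s2s_attention.py test test.src out.hyp model.100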
p.add_argument('--generation-limit', default=def_generation_limit, metavar='INT', type=int, 261 | help='maximum number of words to be generated for test input') 262 | 263 | args = p.parse_args() 264 | 265 | # check args 266 | try: 267 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'') 268 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1') 269 | if args.embed < 1: raise ValueError('you must set --embed >= 1') 270 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1') 271 | if args.epoch < 1: raise ValueError('you must set --epoch >= 1') 272 | if args.minibatch < 1: raise ValueError('you must set --minibatch >= 1') 273 | if args.generation_limit < 1: raise ValueError('you must set --generation-limit >= 1') 274 | except Exception as ex: 275 | p.print_usage(file=sys.stderr) 276 | print(ex, file=sys.stderr) 277 | sys.exit() 278 | 279 | return args 280 | 281 | 282 | def train_model(args): 283 | trace('making vocabularies ...') 284 | src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab) 285 | trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab) 286 | 287 | trace('making model ...') 288 | model = AttentionalTranslationModel.new(src_vocab, trg_vocab, args.embed, args.hidden) 289 | 290 | for epoch in range(args.epoch): 291 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch)) 292 | trained = 0 293 | gen1 = gens.word_list(args.source) 294 | gen2 = gens.word_list(args.target) 295 | gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch, order=0), args.minibatch) 296 | model.init_optimizer() 297 | 298 | for src_batch, trg_batch in gen3: 299 | src_batch = fill_batch2(src_batch) 300 | trg_batch = fill_batch2(trg_batch) 301 | K = len(src_batch) 302 | hyp_batch = model.train(src_batch, trg_batch) 303 | 304 | for k in range(K): 305 | trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1)) 306 | trace(' src = ' + ' '.join([x if x != '' else '*' for x in src_batch[k]])) 307 | trace(' trg = ' + ' '.join([x if x != '' else '*' for x in trg_batch[k]])) 308 | trace(' hyp = ' + ' '.join([x if x != '' else '*' for x in hyp_batch[k]])) 309 | 310 | trained += K 311 | 312 | trace('saving model ...') 313 | model.save(args.model + '.%03d' % (epoch + 1)) 314 | 315 | trace('finished.') 316 | 317 | 318 | def test_model(args): 319 | trace('loading model ...') 320 | model = AttentionalTranslationModel.load(args.model) 321 | 322 | trace('generating translation ...') 323 | generated = 0 324 | 325 | with open(args.target, 'w') as fp: 326 | for src_batch in gens.batch(gens.word_list(args.source), args.minibatch): 327 | src_batch = fill_batch2(src_batch) 328 | K = len(src_batch) 329 | 330 | trace('sample %8d - %8d ...' 
% (generated + 1, generated + K)) 331 | hyp_batch = model.predict(src_batch, args.generation_limit) 332 | 333 | for hyp in hyp_batch: 334 | hyp.append('') 335 | hyp = hyp[:hyp.index('')] 336 | print(' '.join(hyp), file=fp) 337 | 338 | generated += K 339 | 340 | trace('finished.') 341 | 342 | 343 | def main(): 344 | args = parse_args() 345 | 346 | trace('initializing ...') 347 | wrapper.init() 348 | 349 | if args.mode == 'train': train_model(args) 350 | elif args.mode == 'test': test_model(args) 351 | 352 | 353 | if __name__ == '__main__': 354 | main() 355 | 356 | -------------------------------------------------------------------------------- /chainer-1.4/mt_s2s_encdec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import my_settings 4 | 5 | import sys 6 | import math 7 | import numpy as np 8 | from argparse import ArgumentParser 9 | 10 | from chainer import functions, optimizers 11 | 12 | import util.generators as gens 13 | from util.functions import trace, fill_batch 14 | from util.model_file import ModelFile 15 | from util.vocabulary import Vocabulary 16 | 17 | #from util.chainer_cpu_wrapper import wrapper 18 | from util.chainer_gpu_wrapper import wrapper 19 | 20 | 21 | class EncoderDecoderModel: 22 | def __init__(self): 23 | pass 24 | 25 | def __make_model(self): 26 | self.__model = wrapper.make_model( 27 | # encoder 28 | w_xi = functions.EmbedID(len(self.__src_vocab), self.__n_embed), 29 | w_ip = functions.Linear(self.__n_embed, 4 * self.__n_hidden), 30 | w_pp = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 31 | # decoder 32 | w_pq = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 33 | w_qj = functions.Linear(self.__n_hidden, self.__n_embed), 34 | w_jy = functions.Linear(self.__n_embed, len(self.__trg_vocab)), 35 | w_yq = functions.EmbedID(len(self.__trg_vocab), 4 * self.__n_hidden), 36 | w_qq = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 37 | ) 38 | 39 | @staticmethod 40 | def new(src_vocab, trg_vocab, n_embed, n_hidden): 41 | self = EncoderDecoderModel() 42 | self.__src_vocab = src_vocab 43 | self.__trg_vocab = trg_vocab 44 | self.__n_embed = n_embed 45 | self.__n_hidden = n_hidden 46 | self.__make_model() 47 | return self 48 | 49 | def save(self, filename): 50 | with ModelFile(filename, 'w') as fp: 51 | self.__src_vocab.save(fp.get_file_pointer()) 52 | self.__trg_vocab.save(fp.get_file_pointer()) 53 | fp.write(self.__n_embed) 54 | fp.write(self.__n_hidden) 55 | wrapper.begin_model_access(self.__model) 56 | fp.write_embed(self.__model.w_xi) 57 | fp.write_linear(self.__model.w_ip) 58 | fp.write_linear(self.__model.w_pp) 59 | fp.write_linear(self.__model.w_pq) 60 | fp.write_linear(self.__model.w_qj) 61 | fp.write_linear(self.__model.w_jy) 62 | fp.write_embed(self.__model.w_yq) 63 | fp.write_linear(self.__model.w_qq) 64 | wrapper.end_model_access(self.__model) 65 | 66 | @staticmethod 67 | def load(filename): 68 | self = EncoderDecoderModel() 69 | with ModelFile(filename) as fp: 70 | self.__src_vocab = Vocabulary.load(fp.get_file_pointer()) 71 | self.__trg_vocab = Vocabulary.load(fp.get_file_pointer()) 72 | self.__n_embed = int(fp.read()) 73 | self.__n_hidden = int(fp.read()) 74 | self.__make_model() 75 | wrapper.begin_model_access(self.__model) 76 | fp.read_embed(self.__model.w_xi) 77 | fp.read_linear(self.__model.w_ip) 78 | fp.read_linear(self.__model.w_pp) 79 | fp.read_linear(self.__model.w_pq) 80 | fp.read_linear(self.__model.w_qj) 81 | fp.read_linear(self.__model.w_jy) 82 | 
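# Parameter roles, mirroring save() above: w_xi embeds source words; (w_ip,
# w_pp) form the encoder LSTM, which reads the source sentence in reverse
# order; w_pq carries the final encoder state into the decoder; (w_yq, w_qq)
# form the decoder LSTM; w_qj and w_jy map the decoder state to target
# vocabulary logits through a tanh layer (see __forward below).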
fp.read_embed(self.__model.w_yq) 83 | fp.read_linear(self.__model.w_qq) 84 | wrapper.end_model_access(self.__model) 85 | return self 86 | 87 | def init_optimizer(self): 88 | self.__opt = optimizers.AdaGrad(lr=0.01) 89 | self.__opt.setup(self.__model) 90 | 91 | def __forward(self, is_training, src_batch, trg_batch = None, generation_limit = None): 92 | m = self.__model 93 | tanh = functions.tanh 94 | lstm = functions.lstm 95 | batch_size = len(src_batch) 96 | src_len = len(src_batch[0]) 97 | src_stoi = self.__src_vocab.stoi 98 | trg_stoi = self.__trg_vocab.stoi 99 | trg_itos = self.__trg_vocab.itos 100 | s_c = wrapper.zeros((batch_size, self.__n_hidden)) 101 | 102 | # encoding 103 | s_x = wrapper.make_var([src_stoi('') for _ in range(batch_size)], dtype=np.int32) 104 | s_i = tanh(m.w_xi(s_x)) 105 | s_c, s_p = lstm(s_c, m.w_ip(s_i)) 106 | 107 | for l in reversed(range(src_len)): 108 | s_x = wrapper.make_var([src_stoi(src_batch[k][l]) for k in range(batch_size)], dtype=np.int32) 109 | s_i = tanh(m.w_xi(s_x)) 110 | s_c, s_p = lstm(s_c, m.w_ip(s_i) + m.w_pp(s_p)) 111 | 112 | s_c, s_q = lstm(s_c, m.w_pq(s_p)) 113 | hyp_batch = [[] for _ in range(batch_size)] 114 | 115 | # decoding 116 | if is_training: 117 | accum_loss = wrapper.zeros(()) 118 | trg_len = len(trg_batch[0]) 119 | 120 | for l in range(trg_len): 121 | s_j = tanh(m.w_qj(s_q)) 122 | r_y = m.w_jy(s_j) 123 | s_t = wrapper.make_var([trg_stoi(trg_batch[k][l]) for k in range(batch_size)], dtype=np.int32) 124 | accum_loss += functions.softmax_cross_entropy(r_y, s_t) 125 | output = wrapper.get_data(r_y).argmax(1) 126 | 127 | for k in range(batch_size): 128 | hyp_batch[k].append(trg_itos(output[k])) 129 | 130 | s_c, s_q = lstm(s_c, m.w_yq(s_t) + m.w_qq(s_q)) 131 | 132 | return hyp_batch, accum_loss 133 | else: 134 | while len(hyp_batch[0]) < generation_limit: 135 | s_j = tanh(m.w_qj(s_q)) 136 | r_y = m.w_jy(s_j) 137 | output = wrapper.get_data(r_y).argmax(1) 138 | 139 | for k in range(batch_size): 140 | hyp_batch[k].append(trg_itos(output[k])) 141 | 142 | if all(hyp_batch[k][-1] == '' for k in range(batch_size)): break 143 | 144 | s_y = wrapper.make_var(output, dtype=np.int32) 145 | s_c, s_q = lstm(s_c, m.w_yq(s_y) + m.w_qq(s_q)) 146 | 147 | return hyp_batch 148 | 149 | def train(self, src_batch, trg_batch): 150 | self.__opt.zero_grads() 151 | hyp_batch, accum_loss = self.__forward(True, src_batch, trg_batch=trg_batch) 152 | accum_loss.backward() 153 | self.__opt.clip_grads(10) 154 | self.__opt.update() 155 | return hyp_batch 156 | 157 | def predict(self, src_batch, generation_limit): 158 | return self.__forward(False, src_batch, generation_limit=generation_limit) 159 | 160 | 161 | def parse_args(): 162 | def_vocab = 32768 163 | def_embed = 256 164 | def_hidden = 512 165 | def_epoch = 100 166 | def_minibatch = 64 167 | def_generation_limit = 256 168 | 169 | p = ArgumentParser(description='Encoder-decoder neural machine trainslation') 170 | 171 | p.add_argument('mode', help='\'train\' or \'test\'') 172 | p.add_argument('source', help='[in] source corpus') 173 | p.add_argument('target', help='[in/out] target corpus') 174 | p.add_argument('model', help='[in/out] model file') 175 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int, 176 | help='vocabulary size (default: %d)' % def_vocab) 177 | p.add_argument('--embed', default=def_embed, metavar='INT', type=int, 178 | help='embedding layer size (default: %d)' % def_embed) 179 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int, 180 | help='hidden layer size 
(default: %d)' % def_hidden) 181 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int, 182 | help='number of training epoch (default: %d)' % def_epoch) 183 | p.add_argument('--minibatch', default=def_minibatch, metavar='INT', type=int, 184 | help='minibatch size (default: %d)' % def_minibatch) 185 | p.add_argument('--generation-limit', default=def_generation_limit, metavar='INT', type=int, 186 | help='maximum number of words to be generated for test input') 187 | 188 | args = p.parse_args() 189 | 190 | # check args 191 | try: 192 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'') 193 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1') 194 | if args.embed < 1: raise ValueError('you must set --embed >= 1') 195 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1') 196 | if args.epoch < 1: raise ValueError('you must set --epoch >= 1') 197 | if args.minibatch < 1: raise ValueError('you must set --minibatch >= 1') 198 | if args.generation_limit < 1: raise ValueError('you must set --generation-limit >= 1') 199 | except Exception as ex: 200 | p.print_usage(file=sys.stderr) 201 | print(ex, file=sys.stderr) 202 | sys.exit() 203 | 204 | return args 205 | 206 | 207 | def train_model(args): 208 | trace('making vocabularies ...') 209 | src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab) 210 | trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab) 211 | 212 | trace('making model ...') 213 | model = EncoderDecoderModel.new(src_vocab, trg_vocab, args.embed, args.hidden) 214 | 215 | for epoch in range(args.epoch): 216 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch)) 217 | trained = 0 218 | gen1 = gens.word_list(args.source) 219 | gen2 = gens.word_list(args.target) 220 | gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch), args.minibatch) 221 | model.init_optimizer() 222 | 223 | for src_batch, trg_batch in gen3: 224 | src_batch = fill_batch(src_batch) 225 | trg_batch = fill_batch(trg_batch) 226 | K = len(src_batch) 227 | hyp_batch = model.train(src_batch, trg_batch) 228 | 229 | for k in range(K): 230 | trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1)) 231 | trace(' src = ' + ' '.join([x if x != '' else '*' for x in src_batch[k]])) 232 | trace(' trg = ' + ' '.join([x if x != '' else '*' for x in trg_batch[k]])) 233 | trace(' hyp = ' + ' '.join([x if x != '' else '*' for x in hyp_batch[k]])) 234 | 235 | trained += K 236 | 237 | trace('saving model ...') 238 | model.save(args.model + '.%03d' % (epoch + 1)) 239 | 240 | trace('finished.') 241 | 242 | 243 | def test_model(args): 244 | trace('loading model ...') 245 | model = EncoderDecoderModel.load(args.model) 246 | 247 | trace('generating translation ...') 248 | generated = 0 249 | 250 | with open(args.target, 'w') as fp: 251 | for src_batch in gens.batch(gens.word_list(args.source), args.minibatch): 252 | src_batch = fill_batch(src_batch) 253 | K = len(src_batch) 254 | 255 | trace('sample %8d - %8d ...' 
% (generated + 1, generated + K)) 256 | hyp_batch = model.predict(src_batch, args.generation_limit) 257 | 258 | for hyp in hyp_batch: 259 | hyp.append('') 260 | hyp = hyp[:hyp.index('')] 261 | print(' '.join(hyp), file=fp) 262 | 263 | generated += K 264 | 265 | trace('finished.') 266 | 267 | 268 | def main(): 269 | args = parse_args() 270 | 271 | trace('initializing ...') 272 | wrapper.init() 273 | 274 | if args.mode == 'train': train_model(args) 275 | elif args.mode == 'test': test_model(args) 276 | 277 | 278 | if __name__ == '__main__': 279 | main() 280 | 281 | -------------------------------------------------------------------------------- /chainer-1.4/seg_ffnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import my_settings 4 | 5 | import sys 6 | import math 7 | import numpy as np 8 | from argparse import ArgumentParser 9 | 10 | from chainer import functions, optimizers 11 | 12 | import util.generators as gens 13 | from util.functions import trace, fill_batch 14 | from util.model_file import ModelFile 15 | from util.vocabulary import Vocabulary 16 | 17 | from util.chainer_cpu_wrapper import wrapper 18 | #from util.chainer_gpu_wrapper import wrapper 19 | 20 | 21 | class SegmentationModel: 22 | def __init__(self): 23 | pass 24 | 25 | def __make_model(self): 26 | self.__model = wrapper.make_model( 27 | w_xh = functions.EmbedID(2 * self.__n_context * len(self.__vocab), self.__n_hidden), 28 | w_hy = functions.Linear(self.__n_hidden, 1), 29 | ) 30 | 31 | @staticmethod 32 | def new(vocab, n_context, n_hidden): 33 | self = SegmentationModel() 34 | self.__vocab = vocab 35 | self.__n_context = n_context 36 | self.__n_hidden = n_hidden 37 | self.__make_model() 38 | return self 39 | 40 | def save(self, filename): 41 | with ModelFile(filename, 'w') as fp: 42 | self.__vocab.save(fp.get_file_pointer()) 43 | fp.write(self.__n_context) 44 | fp.write(self.__n_hidden) 45 | wrapper.begin_model_access(self.__model) 46 | fp.write_embed(self.__model.w_xh) 47 | fp.write_linear(self.__model.w_hy) 48 | wrapper.end_model_access(self.__model) 49 | 50 | @staticmethod 51 | def load(filename): 52 | self = SegmentationModel() 53 | with ModelFile(filename) as fp: 54 | self.__vocab = Vocabulary.load(fp.get_file_pointer()) 55 | self.__n_context = int(fp.read()) 56 | self.__n_hidden = int(fp.read()) 57 | self.__make_model() 58 | wrapper.begin_model_access(self.__model) 59 | fp.read_embed(self.__model.w_xh) 60 | fp.read_linear(self.__model.w_hy) 61 | wrapper.end_model_access(self.__model) 62 | return self 63 | 64 | def init_optimizer(self): 65 | self.__opt = optimizers.AdaGrad(lr=0.01) 66 | self.__opt.setup(self.__model) 67 | 68 | def __make_input(self, is_training, text): 69 | c = self.__vocab.stoi 70 | k = self.__n_context - 1 71 | word_list = text.split() 72 | letters = [c('')] * k + [c(x) for x in ''.join(word_list)] + [c('')] * k 73 | if is_training: 74 | labels = [] 75 | for x in word_list: 76 | labels += [-1] * (len(x) - 1) + [1] 77 | return letters, labels[:-1] 78 | else: 79 | return letters, None 80 | 81 | def __forward(self, is_training, text): 82 | m = self.__model 83 | tanh = functions.tanh 84 | letters, labels = self.__make_input(is_training, text) 85 | scores = [] 86 | accum_loss = wrapper.zeros(()) if is_training else None 87 | 88 | for n in range(len(letters) - 2 * self.__n_context + 1): 89 | s_hu = wrapper.zeros((1, self.__n_hidden)) 90 | 91 | for k in range(2 * self.__n_context): 92 | wid = k * len(self.__vocab) + letters[n + k] 93 | 
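# Position-dependent embedding trick: w_xh is a single EmbedID over
# 2 * n_context * |V| ids, and the offset k * |V| selects the block belonging
# to context slot k, so the same letter gets a distinct vector in each slot.
# E.g. with |V| = 2500, slot k = 3 and letter id 17: wid = 3 * 2500 + 17 = 7517.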
s_x = wrapper.make_var([wid], dtype=np.int32) 94 | s_hu += m.w_xh(s_x) 95 | 96 | s_hv = tanh(s_hu) 97 | s_y = tanh(m.w_hy(s_hv)) 98 | scores.append(float(wrapper.get_data(s_y))) 99 | 100 | if is_training: 101 | s_t = wrapper.make_var([[labels[n]]]) 102 | accum_loss += functions.mean_squared_error(s_y, s_t) 103 | 104 | return scores, accum_loss 105 | 106 | def train(self, text): 107 | self.__opt.zero_grads() 108 | scores, accum_loss = self.__forward(True, text) 109 | accum_loss.backward() 110 | self.__opt.clip_grads(5) 111 | self.__opt.update() 112 | return scores 113 | 114 | def predict(self, text): 115 | return self.__forward(False, text)[0] 116 | 117 | 118 | def parse_args(): 119 | def_vocab = 2500 120 | def_hidden = 100 121 | def_epoch = 100 122 | def_context = 3 123 | 124 | p = ArgumentParser(description='Word segmentation using feedforward neural network') 125 | 126 | p.add_argument('mode', help='\'train\' or \'test\'') 127 | p.add_argument('corpus', help='[in] source corpus') 128 | p.add_argument('model', help='[in/out] model file') 129 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int, 130 | help='vocabulary size (default: %d)' % def_vocab) 131 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int, 132 | help='hidden layer size (default: %d)' % def_hidden) 133 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int, 134 | help='number of training epoch (default: %d)' % def_epoch) 135 | p.add_argument('--context', default=def_context, metavar='INT', type=int, 136 | help='width of context window (default: %d)' % def_context) 137 | 138 | args = p.parse_args() 139 | 140 | # check args 141 | try: 142 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'') 143 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1') 144 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1') 145 | if args.epoch < 1: raise ValueError('you must set --epoch >= 1') 146 | if args.context < 1: raise ValueError('you must set --context >= 1') 147 | except Exception as ex: 148 | p.print_usage(file=sys.stderr) 149 | print(ex, file=sys.stderr) 150 | sys.exit() 151 | 152 | return args 153 | 154 | 155 | def make_hyp(letters, scores): 156 | hyp = letters[0] 157 | for w, s in zip(letters[1:], scores): 158 | if s >= 0: 159 | hyp += ' ' 160 | hyp += w 161 | return hyp 162 | 163 | 164 | def train_model(args): 165 | trace('making vocabularies ...') 166 | vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab) 167 | 168 | trace('start training ...') 169 | model = SegmentationModel.new(vocab, args.context, args.hidden) 170 | 171 | for epoch in range(args.epoch): 172 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch)) 173 | trained = 0 174 | 175 | model.init_optimizer() 176 | 177 | with open(args.corpus) as fp: 178 | for text in fp: 179 | word_list = text.split() 180 | if not word_list: 181 | continue 182 | 183 | text = ' '.join(word_list) 184 | letters = ''.join(word_list) 185 | scores = model.train(text) 186 | trained += 1 187 | hyp = make_hyp(letters, scores) 188 | 189 | trace(trained) 190 | trace(text) 191 | trace(hyp) 192 | trace(' '.join('%+.1f' % x for x in scores)) 193 | 194 | if trained % 100 == 0: 195 | trace(' %8d' % trained) 196 | 197 | trace('saveing model ...') 198 | model.save(args.model + '.%03d' % (epoch + 1)) 199 | 200 | trace('finished.') 201 | 202 | 203 | def test_model(args): 204 | trace('loading model ...') 205 | model = SegmentationModel.load(args.model) 206 | 207 | 
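# Each predicted score is a soft boundary decision for the gap between two
# adjacent letters; make_hyp() above inserts a space wherever the score is
# >= 0. For example, letters 'abcd' with scores [0.8, -0.5, 0.3] decode to
# 'a bc d'.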
trace('generating output ...') 208 | 209 | with open(args.corpus) as fp: 210 | for text in fp: 211 | letters = ''.join(text.split()) 212 | if not letters: 213 | print() 214 | continue 215 | scores = model.predict(text) 216 | hyp = make_hyp(letters, scores) 217 | print(hyp) 218 | 219 | trace('finished.') 220 | 221 | 222 | def main(): 223 | args = parse_args() 224 | 225 | trace('initializing CUDA ...') 226 | wrapper.init() 227 | 228 | if args.mode == 'train': train_model(args) 229 | elif args.mode == 'test': test_model(args) 230 | 231 | 232 | if __name__ == '__main__': 233 | main() 234 | 235 | -------------------------------------------------------------------------------- /chainer-1.4/seg_rnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | #import my_settings 4 | 5 | import sys 6 | import math 7 | import numpy as np 8 | from argparse import ArgumentParser 9 | 10 | from chainer import functions, optimizers 11 | 12 | import util.generators as gens 13 | from util.functions import trace, fill_batch 14 | from util.model_file import ModelFile 15 | from util.vocabulary import Vocabulary 16 | 17 | from util.chainer_cpu_wrapper import wrapper 18 | #from util.chainer_gpu_wrapper import wrapper 19 | 20 | 21 | class RNNSegmentationModel: 22 | def __init__(self): 23 | pass 24 | 25 | def __make_model(self): 26 | self.__model = wrapper.make_model( 27 | w_xe = functions.EmbedID(len(self.__vocab), self.__n_embed), 28 | w_ea = functions.Linear(self.__n_embed, 4 * self.__n_hidden), 29 | w_aa = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 30 | w_eb = functions.Linear(self.__n_embed, 4 * self.__n_hidden), 31 | w_bb = functions.Linear(self.__n_hidden, 4 * self.__n_hidden), 32 | w_ay1 = functions.Linear(self.__n_hidden, 1), 33 | w_by1 = functions.Linear(self.__n_hidden, 1), 34 | w_ay2 = functions.Linear(self.__n_hidden, 1), 35 | w_by2 = functions.Linear(self.__n_hidden, 1), 36 | ) 37 | 38 | @staticmethod 39 | def new(vocab, n_embed, n_hidden): 40 | self = RNNSegmentationModel() 41 | self.__vocab = vocab 42 | self.__n_embed = n_embed 43 | self.__n_hidden = n_hidden 44 | self.__make_model() 45 | return self 46 | 47 | def save(self, filename): 48 | with ModelFile(filename, 'w') as fp: 49 | self.__vocab.save(fp.get_file_pointer()) 50 | fp.write(self.__n_embed) 51 | fp.write(self.__n_hidden) 52 | wrapper.begin_model_access(self.__model) 53 | fp.write_embed(self.__model.w_xe) 54 | fp.write_linear(self.__model.w_ea) 55 | fp.write_linear(self.__model.w_aa) 56 | fp.write_linear(self.__model.w_eb) 57 | fp.write_linear(self.__model.w_bb) 58 | fp.write_linear(self.__model.w_ay1) 59 | fp.write_linear(self.__model.w_by1) 60 | fp.write_linear(self.__model.w_ay2) 61 | fp.write_linear(self.__model.w_by2) 62 | wrapper.end_model_access(self.__model) 63 | 64 | @staticmethod 65 | def load(filename): 66 | self = RNNSegmentationModel() 67 | with ModelFile(filename) as fp: 68 | self.__vocab = Vocabulary.load(fp.get_file_pointer()) 69 | self.__n_embed = int(fp.read()) 70 | self.__n_hidden = int(fp.read()) 71 | self.__make_model() 72 | wrapper.begin_model_access(self.__model) 73 | fp.read_embed(self.__model.w_xe) 74 | fp.read_linear(self.__model.w_ea) 75 | fp.read_linear(self.__model.w_aa) 76 | fp.read_linear(self.__model.w_eb) 77 | fp.read_linear(self.__model.w_bb) 78 | fp.read_linear(self.__model.w_ay1) 79 | fp.read_linear(self.__model.w_by1) 80 | fp.read_linear(self.__model.w_ay2) 81 | fp.read_linear(self.__model.w_by2) 82 | 
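# Architecture recap: letters are embedded (w_xe), encoded by forward
# (w_ea, w_aa) and backward (w_eb, w_bb) LSTMs, and the gap between letters
# i and i+1 is scored from the four surrounding states:
#   y_i = tanh(w_ay1(a_i) + w_by1(b_i) + w_ay2(a_{i+1}) + w_by2(b_{i+1}))
# trained with mean squared error against labels in {-1, +1}, where +1 marks
# a word boundary (see __forward below).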
wrapper.end_model_access(self.__model) 83 | return self 84 | 85 | def init_optimizer(self): 86 | self.__opt = optimizers.AdaGrad(lr=0.001) 87 | self.__opt.setup(self.__model) 88 | 89 | def __make_input(self, is_training, text): 90 | word_list = text.split() 91 | letters = [self.__vocab.stoi(x) for x in ''.join(word_list)] 92 | if is_training: 93 | labels = [] 94 | for x in word_list: 95 | labels += [-1] * (len(x) - 1) + [1] 96 | return letters, labels[:-1] 97 | else: 98 | return letters, None 99 | 100 | def __forward(self, is_training, text): 101 | m = self.__model 102 | tanh = functions.tanh 103 | lstm = functions.lstm 104 | letters, labels = self.__make_input(is_training, text) 105 | n_letters = len(letters) 106 | 107 | accum_loss = wrapper.zeros(()) if is_training else None 108 | hidden_zeros = wrapper.zeros((1, self.__n_hidden)) 109 | 110 | # embedding 111 | list_e = [] 112 | for i in range(n_letters): 113 | s_x = wrapper.make_var([letters[i]], dtype=np.int32) 114 | list_e.append(tanh(m.w_xe(s_x))) 115 | 116 | # forward encoding 117 | s_a = hidden_zeros 118 | c = hidden_zeros 119 | list_a = [] 120 | for i in range(n_letters): 121 | c, s_a = lstm(c, m.w_ea(list_e[i]) + m.w_aa(s_a)) 122 | list_a.append(s_a) 123 | 124 | # backward encoding 125 | s_b = hidden_zeros 126 | c = hidden_zeros 127 | list_b = [] 128 | for i in reversed(range(n_letters)): 129 | c, s_b = lstm(c, m.w_eb(list_e[i]) + m.w_bb(s_b)) 130 | list_b.insert(0, s_b)  # prepend so list_b[i] is the backward state at letter i, aligned with list_a 131 | 132 | # segmentation 133 | scores = [] 134 | for i in range(n_letters - 1): 135 | s_y = tanh(m.w_ay1(list_a[i]) + m.w_by1(list_b[i]) + m.w_ay2(list_a[i + 1]) + m.w_by2(list_b[i + 1])) 136 | scores.append(float(wrapper.get_data(s_y))) 137 | 138 | if is_training: 139 | s_t = wrapper.make_var([[labels[i]]]) 140 | accum_loss += functions.mean_squared_error(s_y, s_t) 141 | 142 | return scores, accum_loss 143 | 144 | def train(self, text): 145 | self.__opt.zero_grads() 146 | scores, accum_loss = self.__forward(True, text) 147 | accum_loss.backward() 148 | self.__opt.clip_grads(5) 149 | self.__opt.update() 150 | return scores 151 | 152 | def predict(self, text): 153 | return self.__forward(False, text)[0] 154 | 155 | 156 | def parse_args(): 157 | def_vocab = 2500 158 | def_embed = 100 159 | def_hidden = 100 160 | def_epoch = 20 161 | 162 | p = ArgumentParser(description='Word segmentation using LSTM-RNN') 163 | 164 | p.add_argument('mode', help='\'train\' or \'test\'') 165 | p.add_argument('corpus', help='[in] source corpus') 166 | p.add_argument('model', help='[in/out] model file') 167 | p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int, 168 | help='vocabulary size (default: %d)' % def_vocab) 169 | p.add_argument('--embed', default=def_embed, metavar='INT', type=int, 170 | help='embedding layer size (default: %d)' % def_embed) 171 | p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int, 172 | help='hidden layer size (default: %d)' % def_hidden) 173 | p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int, 174 | help='number of training epoch (default: %d)' % def_epoch) 175 | 176 | args = p.parse_args() 177 | 178 | # check args 179 | try: 180 | if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'') 181 | if args.vocab < 1: raise ValueError('you must set --vocab >= 1') 182 | if args.embed < 1: raise ValueError('you must set --embed >= 1') 183 | if args.hidden < 1: raise ValueError('you must set --hidden >= 1') 184 | if args.epoch < 1: raise ValueError('you must set --epoch 
>= 1') 185 | except Exception as ex: 186 | p.print_usage(file=sys.stderr) 187 | print(ex, file=sys.stderr) 188 | sys.exit() 189 | 190 | return args 191 | 192 | 193 | def make_hyp(letters, scores): 194 | hyp = letters[0] 195 | for w, s in zip(letters[1:], scores): 196 | if s >= 0: 197 | hyp += ' ' 198 | hyp += w 199 | return hyp 200 | 201 | 202 | def train_model(args): 203 | trace('making vocabularies ...') 204 | vocab = Vocabulary.new(gens.letter_list(args.corpus), args.vocab) 205 | 206 | trace('start training ...') 207 | model = RNNSegmentationModel.new(vocab, args.embed, args.hidden) 208 | 209 | for epoch in range(args.epoch): 210 | trace('epoch %d/%d: ' % (epoch + 1, args.epoch)) 211 | trained = 0 212 | 213 | model.init_optimizer() 214 | 215 | with open(args.corpus) as fp: 216 | for text in fp: 217 | word_list = text.split() 218 | if not word_list: 219 | continue 220 | 221 | text = ' '.join(word_list) 222 | letters = ''.join(word_list) 223 | scores = model.train(text) 224 | trained += 1 225 | hyp = make_hyp(letters, scores) 226 | 227 | trace(trained) 228 | trace(text) 229 | trace(hyp) 230 | trace(' '.join('%+.1f' % x for x in scores)) 231 | 232 | if trained % 100 == 0: 233 | trace(' %8d' % trained) 234 | 235 | trace('saveing model ...') 236 | model.save(args.model + '.%03d' % (epoch + 1)) 237 | 238 | trace('finished.') 239 | 240 | 241 | def test_model(args): 242 | trace('loading model ...') 243 | model = RNNSegmentationModel.load(args.model) 244 | 245 | trace('generating output ...') 246 | 247 | with open(args.corpus) as fp: 248 | for text in fp: 249 | letters = ''.join(text.split()) 250 | if not letters: 251 | print() 252 | continue 253 | scores = model.predict(text) 254 | hyp = make_hyp(letters, scores) 255 | print(hyp) 256 | 257 | trace('finished.') 258 | 259 | 260 | def main(): 261 | args = parse_args() 262 | 263 | trace('initializing ...') 264 | wrapper.init() 265 | 266 | if args.mode == 'train': train_model(args) 267 | elif args.mode == 'test': test_model(args) 268 | 269 | 270 | if __name__ == '__main__': 271 | main() 272 | 273 | -------------------------------------------------------------------------------- /chainer-1.4/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/odashi/chainer_examples/b13ec64e5035b1eb75b873431786d880577b7370/chainer-1.4/util/__init__.py -------------------------------------------------------------------------------- /chainer-1.4/util/chainer_cpu_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import chainer 3 | 4 | class wrapper: 5 | @staticmethod 6 | def init(): 7 | pass 8 | 9 | @staticmethod 10 | def make_var(array, dtype=numpy.float32): 11 | return chainer.Variable(numpy.array(array, dtype=dtype)) 12 | 13 | @staticmethod 14 | def get_data(variable): 15 | return variable.data 16 | 17 | @staticmethod 18 | def zeros(shape, dtype=numpy.float32): 19 | return chainer.Variable(numpy.zeros(shape, dtype=dtype)) 20 | 21 | @staticmethod 22 | def ones(shape, dtype=numpy.float32): 23 | return chainer.Variable(numpy.ones(shape, dtype=dtype)) 24 | 25 | @staticmethod 26 | def make_model(**kwargs): 27 | return chainer.FunctionSet(**kwargs) 28 | 29 | @staticmethod 30 | def begin_model_access(model): 31 | pass 32 | 33 | @staticmethod 34 | def end_model_access(model): 35 | pass 36 | 37 | -------------------------------------------------------------------------------- /chainer-1.4/util/chainer_gpu_wrapper.py: 
-------------------------------------------------------------------------------- 1 | import numpy 2 | import chainer 3 | 4 | class wrapper: 5 | @staticmethod 6 | def init(): 7 | chainer.cuda.init() 8 | 9 | @staticmethod 10 | def make_var(array, dtype=numpy.float32): 11 | return chainer.Variable(chainer.cuda.to_gpu(numpy.array(array, dtype=dtype))) 12 | 13 | @staticmethod 14 | def get_data(variable): 15 | return chainer.cuda.to_cpu(variable.data) 16 | 17 | @staticmethod 18 | def zeros(shape, dtype=numpy.float32): 19 | return chainer.Variable(chainer.cuda.zeros(shape, dtype=dtype)) 20 | 21 | @staticmethod 22 | def ones(shape, dtype=numpy.float32): 23 | return chainer.Variable(chainer.cuda.ones(shape, dtype=dtype)) 24 | 25 | @staticmethod 26 | def make_model(**kwargs): 27 | return chainer.FunctionSet(**kwargs).to_gpu() 28 | 29 | @staticmethod 30 | def begin_model_access(model): 31 | model.to_cpu() 32 | 33 | @staticmethod 34 | def end_model_access(model): 35 | model.to_gpu() 36 | 37 | -------------------------------------------------------------------------------- /chainer-1.4/util/functions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import datetime 3 | 4 | def trace(*args): 5 | print(datetime.datetime.now(), '...', *args, file=sys.stderr) 6 | sys.stderr.flush() 7 | 8 | def fill_batch(batch, token=''): 9 | max_len = max(len(x) for x in batch) 10 | return [x + [token] * (max_len - len(x) + 1) for x in batch] 11 | 12 | def fill_batch2(batch, start_token='', end_token=''): 13 | max_len = max(len(x) for x in batch) 14 | return [[start_token] + x + [end_token] * (max_len - len(x) + 1) for x in batch] 15 | 16 | def vtos(v, fmt='%.8e'): 17 | return ' '.join(fmt % x for x in v) 18 | 19 | def stov(s, tp=float): 20 | return [tp(x) for x in s.split()] 21 | 22 | -------------------------------------------------------------------------------- /chainer-1.4/util/generators.py: -------------------------------------------------------------------------------- 1 | def batch(generator, batch_size): 2 | batch = [] 3 | is_tuple = False 4 | for l in generator: 5 | is_tuple = isinstance(l, tuple) 6 | batch.append(l) 7 | if len(batch) == batch_size: 8 | yield tuple(list(x) for x in zip(*batch)) if is_tuple else batch 9 | batch = [] 10 | if batch: 11 | yield tuple(list(x) for x in zip(*batch)) if is_tuple else batch 12 | 13 | def sorted_parallel(generator1, generator2, pooling, order=1): 14 | gen1 = batch(generator1, pooling) 15 | gen2 = batch(generator2, pooling) 16 | for batch1, batch2 in zip(gen1, gen2): 17 | #yield from sorted(zip(batch1, batch2), key=lambda x: len(x[1])) 18 | for x in sorted(zip(batch1, batch2), key=lambda x: len(x[order])): 19 | yield x 20 | 21 | def word_list(filename): 22 | with open(filename) as fp: 23 | for l in fp: 24 | yield l.split() 25 | 26 | def letter_list(filename): 27 | with open(filename) as fp: 28 | for l in fp: 29 | yield list(''.join(l.split())) 30 | 31 | -------------------------------------------------------------------------------- /chainer-1.4/util/model_file.py: -------------------------------------------------------------------------------- 1 | from .functions import vtos, stov 2 | 3 | class ModelFile: 4 | def __init__(self, filename, mode='r'): 5 | self.__fp = open(filename, mode) 6 | 7 | def __enter__(self): 8 | return self 9 | 10 | def __exit__(self, exc_type, exc_value, traceback): 11 | self.__fp.close() 12 | return False 13 | 14 | def write(self, x): 15 | print(x, file=self.__fp) 16 | 17 | def 
__write_vector(self, x): 18 | self.write(vtos(x)) 19 | 20 | def __write_matrix(self, x): 21 | for row in x: 22 | self.__write_vector(row) 23 | 24 | def read(self): 25 | return next(self.__fp).strip() 26 | 27 | def __read_vector(self, x, tp): 28 | data = stov(self.read(), tp) 29 | for i in range(len(data)): 30 | x[i] = data[i] 31 | 32 | def __read_matrix(self, x, tp): 33 | for row in x: 34 | self.__read_vector(row, tp) 35 | 36 | def write_embed(self, f): 37 | self.__write_matrix(f.W) 38 | 39 | def write_linear(self, f): 40 | self.__write_matrix(f.W) 41 | self.__write_vector(f.b) 42 | 43 | def read_embed(self, f): 44 | self.__read_matrix(f.W, float) 45 | 46 | def read_linear(self, f): 47 | self.__read_matrix(f.W, float) 48 | self.__read_vector(f.b, float) 49 | 50 | def get_file_pointer(self): 51 | return self.__fp 52 |
--------------------------------------------------------------------------------
/chainer-1.4/util/vocabulary.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict 2 | 3 | 4 | class Vocabulary: 5 | def __init__(self): 6 | pass 7 | 8 | def __len__(self): 9 | return self.__size 10 | 11 | def stoi(self, s): 12 | return self.__stoi[s] 13 | 14 | def itos(self, i): 15 | return self.__itos[i] 16 | 17 | @staticmethod 18 | def new(list_generator, size): 19 | self = Vocabulary() 20 | self.__size = size 21 | 22 | word_freq = defaultdict(lambda: 0) 23 | for words in list_generator: 24 | for word in words: 25 | word_freq[word] += 1 26 | 27 | self.__stoi = defaultdict(lambda: 0) 28 | self.__stoi['<unk>'] = 0 29 | self.__stoi['<s>'] = 1 30 | self.__stoi['</s>'] = 2 31 | self.__itos = [''] * self.__size 32 | self.__itos[0] = '<unk>' 33 | self.__itos[1] = '<s>' 34 | self.__itos[2] = '</s>' 35 | 36 | for i, (k, v) in zip(range(self.__size - 3), sorted(word_freq.items(), key=lambda x: -x[1])): 37 | self.__stoi[k] = i + 3 38 | self.__itos[i + 3] = k 39 | 40 | return self 41 | 42 | def save(self, fp): 43 | print(self.__size, file=fp) 44 | for i in range(self.__size): 45 | print(self.__itos[i], file=fp) 46 | 47 | @staticmethod 48 | def load(line_gen): 49 | self = Vocabulary() 50 | 51 | self.__size = int(next(line_gen)) 52 | 53 | self.__stoi = defaultdict(lambda: 0) 54 | self.__itos = [''] * self.__size 55 | for i in range(self.__size): 56 | s = next(line_gen).strip() 57 | if s: 58 | self.__stoi[s] = i 59 | self.__itos[i] = s 60 | 61 | return self 62 |
--------------------------------------------------------------------------------
/chainer-1.5/LSTMVariants.py:
--------------------------------------------------------------------------------
1 | import numpy 2 | 3 | import chainer 4 | from chainer.functions.activation import sigmoid 5 | from chainer.functions.activation import tanh 6 | from chainer import link 7 | from chainer.links.connection import linear 8 | 9 | 10 | class LSTMBase(link.Chain): 11 | 12 | def __init__(self, n_units, n_inputs=None): 13 | if n_inputs is None: 14 | n_inputs = n_units 15 | super(LSTMBase, self).__init__( 16 | W_fh=linear.Linear(n_units, n_units),  # recurrent weights act on h (size n_units); input weights on x (size n_inputs) 17 | W_ih=linear.Linear(n_units, n_units), 18 | W_oh=linear.Linear(n_units, n_units), 19 | W_ch=linear.Linear(n_units, n_units), 20 | W_fx=linear.Linear(n_inputs, n_units), 21 | W_ix=linear.Linear(n_inputs, n_units), 22 | W_ox=linear.Linear(n_inputs, n_units), 23 | W_cx=linear.Linear(n_inputs, n_units), 24 | ) 25 | 26 | class CoupledForgetLSTMBase(link.Chain): 27 | 28 | def __init__(self, n_units, n_inputs=None): 29 | if n_inputs is None: 30 | n_inputs = n_units 31 | 
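# Coupled-forget variant: the input gate is tied to the forget gate
# (effectively i_t = 1 - f_t), so the W_i* parameters are dropped; cf. the
# coupled input/forget gate (CIFG) variant studied in Greff et al.,
# "LSTM: A Search Space Odyssey".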
32 |         super(CoupledForgetLSTMBase, self).__init__(
33 |             W_fh=linear.Linear(n_units, n_units),
34 |             W_oh=linear.Linear(n_units, n_units),
35 |             W_ch=linear.Linear(n_units, n_units),
36 |             W_fx=linear.Linear(n_inputs, n_units),
37 |             W_ox=linear.Linear(n_inputs, n_units),
38 |             W_cx=linear.Linear(n_inputs, n_units),
39 |         )
40 | 
41 | class PeepHoleLSTMBase(link.Chain):
42 | 
43 |     def __init__(self, n_units, n_inputs=None):
44 |         if n_inputs is None:
45 |             n_inputs = n_units
46 |         super(PeepHoleLSTMBase, self).__init__(
47 |             W_fh=linear.Linear(n_units, n_units),
48 |             W_fc=linear.Linear(n_units, n_units),
49 |             W_ih=linear.Linear(n_units, n_units),
50 |             W_ic=linear.Linear(n_units, n_units),
51 |             W_oh=linear.Linear(n_units, n_units),
52 |             W_oc=linear.Linear(n_units, n_units),
53 |             W_ch=linear.Linear(n_units, n_units),
54 |             W_fx=linear.Linear(n_inputs, n_units),
55 |             W_ix=linear.Linear(n_inputs, n_units),
56 |             W_ox=linear.Linear(n_inputs, n_units),
57 |             W_cx=linear.Linear(n_inputs, n_units),
58 |         )
59 | 
60 | class CoupledForgetPeepHoleLSTMBase(link.Chain):
61 | 
62 |     def __init__(self, n_units, n_inputs=None):
63 |         if n_inputs is None:
64 |             n_inputs = n_units
65 |         super(CoupledForgetPeepHoleLSTMBase, self).__init__(
66 |             W_fh=linear.Linear(n_units, n_units),
67 |             W_fc=linear.Linear(n_units, n_units),
68 |             W_oh=linear.Linear(n_units, n_units),
69 |             W_oc=linear.Linear(n_units, n_units),
70 |             W_ch=linear.Linear(n_units, n_units),
71 |             W_fx=linear.Linear(n_inputs, n_units),
72 |             W_ox=linear.Linear(n_inputs, n_units),
73 |             W_cx=linear.Linear(n_inputs, n_units),
74 |         )
75 | 
76 | class StatefulLSTM(LSTMBase):
77 | 
78 |     def __init__(self, in_size, out_size):
79 |         super(StatefulLSTM, self).__init__(out_size, in_size)
80 |         self.state_size = out_size
81 |         self.reset_state()
82 | 
83 |     def to_cpu(self):
84 |         super(StatefulLSTM, self).to_cpu()
85 |         if self.h is not None:
86 |             self.h.to_cpu()
87 |         if self.c is not None:
88 |             self.c.to_cpu()
89 | 
90 |     def to_gpu(self, device=None):
91 |         super(StatefulLSTM, self).to_gpu(device)
92 |         if self.c is not None:
93 |             self.c.to_gpu(device)
94 |         if self.h is not None:
95 |             self.h.to_gpu(device)
96 | 
97 |     def set_state(self, h, c):
98 |         assert isinstance(h, chainer.Variable)
99 |         assert isinstance(c, chainer.Variable)
100 |         h_ = h
101 |         c_ = c
102 |         if self.xp == numpy:
103 |             h_.to_cpu()
104 |             c_.to_cpu()
105 |         else:
106 |             h_.to_gpu()
107 |             c_.to_gpu()
108 |         self.h = h_
109 |         self.c = c_
110 | 
111 |     def reset_state(self):
112 |         self.h = None
113 |         self.c = None
114 | 
115 |     def __call__(self, x):
116 |         # forget, input, candidate and output pre-activations from the current input
117 |         ft = self.W_fx(x)
118 |         it = self.W_ix(x)
119 |         ct = self.W_cx(x)
120 |         ot = self.W_ox(x)
121 |         if self.h is not None:
122 |             ft += self.W_fh(self.h)
123 |             it += self.W_ih(self.h)
124 |             ct += self.W_ch(self.h)
125 |             ot += self.W_oh(self.h)
126 |         ft = sigmoid.sigmoid(ft)
127 |         it = sigmoid.sigmoid(it)
128 |         ct = tanh.tanh(ct)
129 |         ot = sigmoid.sigmoid(ot)
130 | 
131 |         c = it * ct
132 |         if self.c is not None:
133 |             c += ft * self.c
134 |         self.c = c
135 |         self.h = ot * tanh.tanh(self.c)
136 |         return self.h
137 | 
138 |     def get_state(self):
139 |         return self.c
140 | 
141 | 
142 | class StatelessLSTM(LSTMBase):
143 |     def __init__(self, in_size, out_size):
144 |         super(StatelessLSTM, self).__init__(out_size, in_size)
145 |         self.state_size = out_size
146 | 
147 |     def __call__(self, x, h, c):
148 |         ft = sigmoid.sigmoid(self.W_fx(x) + self.W_fh(h))
149 |         it = sigmoid.sigmoid(self.W_ix(x) + self.W_ih(h))
150 |         ct = tanh.tanh(self.W_cx(x) + self.W_ch(h))
151 |         ot = sigmoid.sigmoid(self.W_ox(x) + self.W_oh(h))
152 |         c = ft * c + it * ct
153 |         h = ot * tanh.tanh(c)
154 |         return h, c
155 | 
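As a quick sanity check, the two classes above can be driven directly with small NumPy arrays. This is a minimal sketch, assuming Chainer 1.5 on the CPU; the sizes (batch 2, input width 4, hidden width 3) are illustrative only:

# --- usage sketch (illustrative, not part of the file) ---
import numpy
from chainer import Variable

x = Variable(numpy.random.rand(2, 4).astype(numpy.float32))

stateful = StatefulLSTM(4, 3)
h1 = stateful(x)           # first step: internal c/h start at None
h2 = stateful(x)           # second step: reuses stateful.c and stateful.h
assert h2.data.shape == (2, 3)

stateless = StatelessLSTM(4, 3)
h = Variable(numpy.zeros((2, 3), numpy.float32))
c = Variable(numpy.zeros((2, 3), numpy.float32))
h, c = stateless(x, h, c)  # the caller threads the state through explicitly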
156 | class StatefulPeepHoleLSTM(PeepHoleLSTMBase):
157 | 
158 | 
159 |     def __init__(self, in_size, out_size):
160 |         super(StatefulPeepHoleLSTM, self).__init__(out_size, in_size)
161 |         self.state_size = out_size
162 |         self.reset_state()
163 | 
164 |     def to_cpu(self):
165 |         super(StatefulPeepHoleLSTM, self).to_cpu()
166 |         if self.h is not None:
167 |             self.h.to_cpu()
168 |         if self.c is not None:
169 |             self.c.to_cpu()
170 | 
171 |     def to_gpu(self, device=None):
172 |         super(StatefulPeepHoleLSTM, self).to_gpu(device)
173 |         if self.c is not None:
174 |             self.c.to_gpu(device)
175 |         if self.h is not None:
176 |             self.h.to_gpu(device)
177 | 
178 |     def set_state(self, h, c):
179 |         assert isinstance(h, chainer.Variable)
180 |         assert isinstance(c, chainer.Variable)
181 |         h_ = h
182 |         c_ = c
183 |         if self.xp == numpy:
184 |             h_.to_cpu()
185 |             c_.to_cpu()
186 |         else:
187 |             h_.to_gpu()
188 |             c_.to_gpu()
189 |         self.h = h_
190 |         self.c = c_
191 | 
192 |     def reset_state(self):
193 |         self.h = None
194 |         self.c = None
195 | 
196 |     def __call__(self, x):
197 |         ft = self.W_fx(x)
198 |         it = self.W_ix(x)
199 |         ct = self.W_cx(x)
200 |         ot = self.W_ox(x)
201 | 
202 |         if self.h is not None and self.c is not None:
203 |             ft += self.W_fh(self.h) + self.W_fc(self.c)
204 |             it += self.W_ih(self.h) + self.W_ic(self.c)
205 |             ct += self.W_ch(self.h)
206 |             ot += self.W_oh(self.h)
207 |         ft = sigmoid.sigmoid(ft)
208 |         it = sigmoid.sigmoid(it)
209 |         ct = tanh.tanh(ct)
210 | 
211 |         c = it * ct
212 |         if self.c is not None:
213 |             c += ft * self.c
214 |         self.c = c
215 |         ot = sigmoid.sigmoid(ot + self.W_oc(self.c))  # output-gate peephole to the updated cell, as in the stateless variant
216 |         self.h = ot * tanh.tanh(self.c)
217 |         return self.h
218 | 
219 |     def get_state(self):
220 |         return self.c
221 | 
222 | 
223 | class StatelessPeepHoleLSTM(PeepHoleLSTMBase):
224 | 
225 | 
226 |     def __init__(self, in_size, out_size):
227 |         super(StatelessPeepHoleLSTM, self).__init__(out_size, in_size)
228 |         self.state_size = out_size
229 | 
230 | 
231 |     def __call__(self, x, h, c):
232 |         ft = sigmoid.sigmoid(self.W_fx(x) + self.W_fh(h) + self.W_fc(c))
233 |         it = sigmoid.sigmoid(self.W_ix(x) + self.W_ih(h) + self.W_ic(c))
234 |         ct = tanh.tanh(self.W_cx(x) + self.W_ch(h))
235 |         c = ft * c + it * ct
236 |         ot = sigmoid.sigmoid(self.W_ox(x) + self.W_oh(h) + self.W_oc(c))
237 |         h = ot * tanh.tanh(c)
238 |         return h, c
239 | 
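For reference, the peephole update implemented by the two classes above follows these equations (plain-text rendering; c_{t-1} and h_{t-1} are the previous cell state and output):

# f_t = sigmoid(W_fx x_t + W_fh h_{t-1} + W_fc c_{t-1})
# i_t = sigmoid(W_ix x_t + W_ih h_{t-1} + W_ic c_{t-1})
# g_t = tanh(W_cx x_t + W_ch h_{t-1})
# c_t = f_t * c_{t-1} + i_t * g_t
# o_t = sigmoid(W_ox x_t + W_oh h_{t-1} + W_oc c_t)
# h_t = o_t * tanh(c_t)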
240 | class CoupledForgetStatefulLSTM(CoupledForgetLSTMBase):
241 | 
242 | 
243 |     def __init__(self, in_size, out_size):
244 |         super(CoupledForgetStatefulLSTM, self).__init__(out_size, in_size)
245 |         self.state_size = out_size
246 |         self.reset_state()
247 | 
248 |     def to_cpu(self):
249 |         super(CoupledForgetStatefulLSTM, self).to_cpu()
250 |         if self.h is not None:
251 |             self.h.to_cpu()
252 |         if self.c is not None:
253 |             self.c.to_cpu()
254 | 
255 |     def to_gpu(self, device=None):
256 |         super(CoupledForgetStatefulLSTM, self).to_gpu(device)
257 |         if self.c is not None:
258 |             self.c.to_gpu(device)
259 |         if self.h is not None:
260 |             self.h.to_gpu(device)
261 | 
262 |     def set_state(self, h, c):
263 |         assert isinstance(h, chainer.Variable)
264 |         assert isinstance(c, chainer.Variable)
265 |         h_ = h
266 |         c_ = c
267 |         if self.xp == numpy:
268 |             h_.to_cpu()
269 |             c_.to_cpu()
270 |         else:
271 |             h_.to_gpu()
272 |             c_.to_gpu()
273 |         self.h = h_
274 |         self.c = c_
275 | 
276 |     def reset_state(self):
277 |         self.h = None
278 |         self.c = None
279 | 
280 |     def __call__(self, x):
281 |         ft = self.W_fx(x)
282 |         ct = self.W_cx(x)
283 |         ot = self.W_ox(x)
284 | 
285 |         if self.h is not None:
286 |             ft += self.W_fh(self.h)
287 |             ct += self.W_ch(self.h)
288 |             ot += self.W_oh(self.h)
289 |         ft = sigmoid.sigmoid(ft)
290 |         ct = tanh.tanh(ct)
291 |         ot = sigmoid.sigmoid(ot)
292 | 
293 |         c = (1 - ft) * ct  # the input gate is coupled to the forget gate
294 |         if self.c is not None:
295 |             c += ft * self.c
296 |         self.c = c
297 |         self.h = ot * tanh.tanh(self.c)
298 |         return self.h
299 | 
300 |     def get_state(self):
301 |         return self.c
302 | 
303 | 
304 | class CoupledForgetStatelessLSTM(CoupledForgetLSTMBase):
305 |     def __init__(self, in_size, out_size):
306 |         super(CoupledForgetStatelessLSTM, self).__init__(out_size, in_size)
307 |         self.state_size = out_size
308 | 
309 |     def __call__(self, x, h, c):
310 |         ft = sigmoid.sigmoid(self.W_fx(x) + self.W_fh(h))
311 |         ct = tanh.tanh(self.W_cx(x) + self.W_ch(h))
312 |         ot = sigmoid.sigmoid(self.W_ox(x) + self.W_oh(h))
313 |         c = ft * c + (1 - ft) * ct
314 |         h = ot * tanh.tanh(c)
315 |         return h, c
316 | 
317 | class CoupledForgetStatefulPeepHoleLSTM(CoupledForgetPeepHoleLSTMBase):
318 | 
319 | 
320 |     def __init__(self, in_size, out_size):
321 |         super(CoupledForgetStatefulPeepHoleLSTM, self).__init__(out_size, in_size)
322 |         self.state_size = out_size
323 |         self.reset_state()
324 | 
325 |     def to_cpu(self):
326 |         super(CoupledForgetStatefulPeepHoleLSTM, self).to_cpu()
327 |         if self.h is not None:
328 |             self.h.to_cpu()
329 |         if self.c is not None:
330 |             self.c.to_cpu()
331 | 
332 |     def to_gpu(self, device=None):
333 |         super(CoupledForgetStatefulPeepHoleLSTM, self).to_gpu(device)
334 |         if self.c is not None:
335 |             self.c.to_gpu(device)
336 |         if self.h is not None:
337 |             self.h.to_gpu(device)
338 | 
339 |     def set_state(self, h, c):
340 |         assert isinstance(h, chainer.Variable)
341 |         assert isinstance(c, chainer.Variable)
342 |         h_ = h
343 |         c_ = c
344 |         if self.xp == numpy:
345 |             h_.to_cpu()
346 |             c_.to_cpu()
347 |         else:
348 |             h_.to_gpu()
349 |             c_.to_gpu()
350 |         self.h = h_
351 |         self.c = c_
352 | 
353 |     def reset_state(self):
354 |         self.h = None
355 |         self.c = None
356 | 
357 |     def __call__(self, x):
358 |         ft = self.W_fx(x)
359 |         ct = self.W_cx(x)
360 |         ot = self.W_ox(x)
361 | 
362 |         if self.h is not None and self.c is not None:
363 |             ft += self.W_fh(self.h) + self.W_fc(self.c)
364 |             ct += self.W_ch(self.h)
365 |             ot += self.W_oh(self.h)
366 |         ft = sigmoid.sigmoid(ft)
367 |         ct = tanh.tanh(ct)
368 | 
369 |         c = (1 - ft) * ct
370 |         if self.c is not None:
371 |             c += ft * self.c
372 |         self.c = c
373 |         ot = sigmoid.sigmoid(ot + self.W_oc(self.c))
374 |         self.h = ot * tanh.tanh(self.c)
375 |         return self.h
376 | 
377 |     def get_state(self):
378 |         return self.c
379 | 
380 | 
381 | class CoupledForgetStatelessPeepHoleLSTM(CoupledForgetPeepHoleLSTMBase):
382 | 
383 | 
384 |     def __init__(self, in_size, out_size):
385 |         super(CoupledForgetStatelessPeepHoleLSTM, self).__init__(out_size, in_size)
386 |         self.state_size = out_size
387 | 
388 | 
389 |     def __call__(self, x, h, c):
390 |         ft = sigmoid.sigmoid(self.W_fx(x) + self.W_fh(h) + self.W_fc(c))
391 |         ct = tanh.tanh(self.W_cx(x) + self.W_ch(h))
392 |         c = ft * c + (1 - ft) * ct
393 |         ot = sigmoid.sigmoid(self.W_ox(x) + self.W_oh(h) + self.W_oc(c))
394 |         h = ot * tanh.tanh(c)
395 |         return h, c
396 | 
397 | 
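A minimal sketch of the coupled-forget variant in use, assuming the classes above and CPU execution; it shows that the stateless version threads state exactly like StatelessLSTM while a single gate plays both the forget and input roles (c = f * c_prev + (1 - f) * candidate):

# --- usage sketch (illustrative, not part of the file) ---
import numpy
from chainer import Variable

x = Variable(numpy.random.rand(2, 3).astype(numpy.float32))
h = Variable(numpy.zeros((2, 3), numpy.float32))
c = Variable(numpy.zeros((2, 3), numpy.float32))

cf = CoupledForgetStatelessLSTM(3, 3)
h, c = cf(x, h, c)  # one gate, two roles: forget the old cell, admit the new candidate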
--------------------------------------------------------------------------------
/chainer-1.5/attention_lm.py:
--------------------------------------------------------------------------------
1 | import math
2 | import sys
3 | import numpy
4 | from argparse import ArgumentParser
5 | from collections import defaultdict
6 | from chainer import Chain, Variable, cuda, functions, links, optimizer, optimizers, serializers
7 | import util.generators as gens
8 | from util.functions import trace, fill_batch
9 | from util.vocabulary import Vocabulary
10 | 
11 | 
12 | def make_vocab(filename, vocab_size):
13 |     word_freq = defaultdict(lambda: 0)
14 |     num_lines = 0
15 |     num_words = 0
16 |     with open(filename) as fp:
17 |         for line in fp:
18 |             words = line.split()
19 |             num_lines += 1
20 |             num_words += len(words)
21 |             for word in words:
22 |                 word_freq[word] += 1
23 | 
24 |     # 0: <unk>
25 |     # 1: <s>
26 |     # 2: </s>
27 |     vocab = defaultdict(lambda: 0)
28 |     vocab['<s>'] = 1
29 |     vocab['</s>'] = 2
30 |     for i, (k, v) in zip(range(vocab_size - 3), sorted(word_freq.items(), key=lambda x: -x[1])):
31 |         vocab[k] = i + 3
32 | 
33 |     return vocab, num_lines, num_words
34 | 
35 | 
36 | def generate_batch(filename, batch_size):
37 |     with open(filename) as fp:
38 |         batch = []
39 |         try:
40 |             while True:
41 |                 for i in range(batch_size):
42 |                     batch.append(next(fp).split())
43 | 
44 |                 max_len = max(len(x) for x in batch)
45 |                 batch = [['<s>'] + x + ['</s>'] * (max_len - len(x) + 1) for x in batch]
46 |                 yield batch
47 | 
48 |                 batch = []
49 |         except StopIteration:
50 |             pass
51 | 
52 |     if batch:
53 |         max_len = max(len(x) for x in batch)
54 |         batch = [['<s>'] + x + ['</s>'] * (max_len - len(x) + 1) for x in batch]
55 |         yield batch
56 | 
57 | 
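As a quick illustration of the padding behavior, each yielded batch is wrapped in <s>/</s> and right-padded to a common length. A minimal sketch (the three-line corpus `tiny.txt` is hypothetical):

# --- usage sketch (illustrative, not part of the file) ---
with open('tiny.txt', 'w') as fp:
    fp.write('a b c\nd\ne f\n')

for batch in generate_batch('tiny.txt', 2):
    print(batch)
# [['<s>', 'a', 'b', 'c', '</s>'], ['<s>', 'd', '</s>', '</s>', '</s>']]
# [['<s>', 'e', 'f', '</s>']]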
58 | def get_data(variable):
59 |     #return variable.data
60 |     return cuda.to_cpu(variable.data)
61 | 
62 | def parse_args():
63 |     def_vocab = 40000
64 |     def_embed = 200
65 |     def_hidden = 200
66 |     def_epoch = 10
67 |     def_minibatch = 256
68 |     def_model = 0
69 |     p = ArgumentParser(description='RNNLM trainer')
70 | 
71 |     p.add_argument('corpus', help='[in] training corpus')
72 |     p.add_argument('valid', help='[in] validation corpus')
73 |     p.add_argument('model', help='[out] model file prefix')
74 |     p.add_argument('--use-gpu', action='store_true', default=False,
75 |         help='use GPU calculation')
76 |     p.add_argument('--gpu-device', default=0, metavar='INT', type=int,
77 |         help='GPU device ID to be used (default: 0)')
78 |     p.add_argument('-V', '--vocab', default=def_vocab, metavar='INT', type=int,
79 |         help='vocabulary size (default: %d)' % def_vocab)
80 |     p.add_argument('-E', '--embed', default=def_embed, metavar='INT', type=int,
81 |         help='embedding layer size (default: %d)' % def_embed)
82 |     p.add_argument('-H', '--hidden', default=def_hidden, metavar='INT', type=int,
83 |         help='hidden layer size (default: %d)' % def_hidden)
84 |     p.add_argument('-I', '--epoch', default=def_epoch, metavar='INT', type=int,
85 |         help='number of training epoch (default: %d)' % def_epoch)
86 |     p.add_argument('-B', '--minibatch', default=def_minibatch, metavar='INT', type=int,
87 |         help='minibatch size (default: %d)' % def_minibatch)
88 |     p.add_argument('-M', '--rnn-model', default=def_model, metavar='INT', type=int,
89 |         help='RNN used for LM (default: %d) where 0: basic RNNLM, 1: LSTM RNNLM, 2: attention RNNLM' % def_model)
90 | 
91 |     args = p.parse_args()
92 | 
93 |     # check args
94 |     try:
95 |         if (args.vocab < 1): raise ValueError('you must set --vocab >= 1')
96 |         if (args.embed < 1): raise ValueError('you must set --embed >= 1')
97 |         if (args.hidden < 1): raise ValueError('you must set --hidden >= 1')
98 |         if (args.epoch < 1): raise ValueError('you must set --epoch >= 1')
99 |         if (args.minibatch < 1): raise ValueError('you must set --minibatch >= 1')
100 |     except Exception as ex:
101 |         p.print_usage(file=sys.stderr)
102 |         print(ex, file=sys.stderr)
103 |         sys.exit()
104 | 
105 |     return args
106 | 
107 | 
108 | class XP:
109 |     __lib = None
110 | 
111 |     @staticmethod
112 |     def set_library(args):
113 |         if args.use_gpu:
114 |             XP.__lib = cuda.cupy
115 |             cuda.get_device(args.gpu_device).use()
116 |         else:
117 |             XP.__lib = numpy
118 | 
119 |     @staticmethod
120 |     def __zeros(shape, dtype):
121 |         return Variable(XP.__lib.zeros(shape, dtype=dtype))
122 | 
123 |     @staticmethod
124 |     def fzeros(shape):
125 |         return XP.__zeros(shape, XP.__lib.float32)
126 | 
127 |     @staticmethod
128 |     def __nonzeros(shape, dtype, val):
129 |         return Variable(val * XP.__lib.ones(shape, dtype=dtype))
130 | 
131 |     @staticmethod
132 |     def fnonzeros(shape, val=1):
133 |         return XP.__nonzeros(shape, XP.__lib.float32, val)
134 | 
135 |     @staticmethod
136 |     def __array(array, dtype):
137 |         return Variable(XP.__lib.array(array, dtype=dtype))
138 | 
139 |     @staticmethod
140 |     def iarray(array):
141 |         return XP.__array(array, XP.__lib.int32)
142 | 
143 |     @staticmethod
144 |     def farray(array):
145 |         return XP.__array(array, XP.__lib.float32)
146 | 
147 | class SrcEmbed(Chain):
148 |     def __init__(self, vocab_size, embed_size):
149 |         super(SrcEmbed, self).__init__(
150 |             xe = links.EmbedID(vocab_size, embed_size),
151 |         )
152 | 
153 |     def __call__(self, x):
154 |         return functions.tanh(self.xe(x))
155 | 
156 | class BasicRnnLM(Chain):
157 |     def __init__(self, embed_size, hidden_size, vocab_size):
158 |         super(BasicRnnLM, self).__init__(
159 |             xe = SrcEmbed(vocab_size, embed_size),
160 |             eh = links.Linear(embed_size, hidden_size),
161 |             hh = links.Linear(hidden_size, hidden_size),
162 |             hy = links.Linear(hidden_size, vocab_size),
163 |         )
164 |         self.reset_state()
165 | 
166 |     def reset(self):
167 |         self.zerograds()
168 |         self.reset_state()
169 | 
170 |     def reset_state(self):
171 |         self.h = None
172 | 
173 |     def __call__(self, x):
174 |         e = self.xe(x)
175 |         h = self.eh(e)
176 |         if self.h is not None:
177 |             h += self.hh(self.h)
178 |         self.h = functions.tanh(h)
179 |         y = self.hy(self.h)
180 |         return y
181 | 
182 | class LSTMLM(Chain):
183 |     def __init__(self, embed_size, hidden_size, vocab_size):
184 |         super(LSTMLM, self).__init__(
185 |             xe = SrcEmbed(vocab_size, embed_size),
186 |             lstm = links.LSTM(embed_size, hidden_size),
187 |             hy = links.Linear(hidden_size, vocab_size),
188 |         )
189 | 
190 |     def reset(self):
191 |         self.zerograds()
192 |         self.lstm.reset_state()
193 | 
194 |     def __call__(self, x):
195 |         e = self.xe(x)
196 |         h = self.lstm(e)
197 |         y = self.hy(h)
198 |         return y
199 | 
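The XP helper above gives device-agnostic array construction: every model in these scripts builds its Variables through it so the same code runs on CPU or GPU. A minimal sketch, assuming CPU mode and using SimpleNamespace as a stand-in for the parsed arguments:

# --- usage sketch (illustrative, not part of the file) ---
from types import SimpleNamespace

XP.set_library(SimpleNamespace(use_gpu=False, gpu_device=0))
t = XP.iarray([1, 2, 3])   # int32 Variable on the selected device
z = XP.fzeros((2, 4))      # float32 zeros, same device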
200 | class LSTMEncoder(Chain):
201 |     def __init__(self, embed_size, hidden_size):
202 |         super(LSTMEncoder, self).__init__(
203 |             lstm = links.LSTM(embed_size, hidden_size),
204 |         )
205 |     def reset(self):
206 |         self.zerograds()
207 |     def __call__(self, x):
208 |         h = self.lstm(x)
209 |         return h
210 | 
211 | class Attention(Chain):
212 |     def __init__(self, hidden_size, embed_size):
213 |         super(Attention, self).__init__(
214 |             aw = links.Linear(embed_size, hidden_size),
215 |             pw = links.Linear(hidden_size, hidden_size),
216 |             we = links.Linear(hidden_size, 1),
217 |         )
218 |         self.hidden_size = hidden_size
219 | 
220 |     def __call__(self, a_list, p):
221 |         batch_size = p.data.shape[0]
222 |         e_list = []
223 |         sum_e = XP.fzeros((batch_size, 1))
224 |         for a in a_list:
225 |             w = functions.tanh(self.aw(a) + self.pw(p))
226 |             e = functions.exp(self.we(w))
227 |             e_list.append(e)
228 |             sum_e += e
229 |         ZEROS = XP.fzeros((batch_size, self.hidden_size))
230 |         aa = ZEROS
231 |         for a, e in zip(a_list, e_list):
232 |             e /= sum_e
233 |             aa += a * e
234 |         return aa
235 | 
236 | class AttentionLM(Chain):
237 |     def __init__(self, embed_size, hidden_size, vocab_size):
238 |         super(AttentionLM, self).__init__(
239 |             emb = SrcEmbed(vocab_size, embed_size),
240 |             enc = LSTMEncoder(embed_size, hidden_size),
241 |             att = Attention(hidden_size, embed_size),
242 |             outhe = links.Linear(hidden_size, hidden_size),
243 |             outae = links.Linear(hidden_size, hidden_size),
244 |             outey = links.Linear(hidden_size, vocab_size),
245 |         )
246 |         self.vocab_size = vocab_size
247 |         self.embed_size = embed_size
248 |         self.hidden_size = hidden_size
249 | 
250 |     def reset(self):
251 |         self.zerograds()
252 |         self.enc.lstm.reset_state()
253 |         self.x_list = []
254 | 
255 |     def embed(self, x):
256 |         self.x_list.append(self.emb(x))
257 | 
258 |     def encode(self, x):
259 |         self.h = self.enc(x)
260 | 
261 |     def decode(self, atts_list):
262 |         aa = self.att(atts_list, self.h)
263 |         y = functions.tanh(self.outhe(self.h) + self.outae(aa))
264 |         return self.outey(y)
265 | 
266 |     def save_spec(self, filename):
267 |         with open(filename, 'w') as fp:
268 |             print(self.vocab_size, file=fp)
269 |             print(self.embed_size, file=fp)
270 |             print(self.hidden_size, file=fp)
271 | 
272 |     @staticmethod
273 |     def load_spec(filename):
274 |         with open(filename) as fp:
275 |             vocab_size = int(next(fp))
276 |             embed_size = int(next(fp))
277 |             hidden_size = int(next(fp))
278 |             return AttentionLM(embed_size, hidden_size, vocab_size)
279 | 
280 | def forward(batch, model, vocab, opt, args):
281 |     batch = [[vocab[x] for x in words] for words in batch]
282 |     K = len(batch)
283 |     L = len(batch[0]) - 1
284 | 
285 |     opt.zero_grads()
286 |     accum_loss = XP.fzeros(())
287 |     accum_log_ppl = 0.0
288 | 
289 |     if args.rnn_model == 0 or args.rnn_model == 1:
290 |         for l in range(L):
291 |             s_x = XP.iarray([batch[k][l] for k in range(K)])
292 |             s_t = XP.iarray([batch[k][l + 1] for k in range(K)])
293 |             s_y = model(s_x)
294 |             loss_i = functions.softmax_cross_entropy(s_y, s_t)
295 |             accum_loss += loss_i
296 |             accum_log_ppl += float(get_data(loss_i))
297 | 
298 |     else:
299 |         for l in range(L):
300 |             s_x = XP.iarray([batch[k][l] for k in range(K)])
301 |             model.embed(s_x)
302 |         for l in range(L):
303 |             s_t = XP.iarray([batch[k][l + 1] for k in range(K)])
304 |             model.encode(model.x_list[l])
305 |             s_y = model.decode(model.x_list[0:l] + model.x_list[l + 1:L])
306 |             loss_i = functions.softmax_cross_entropy(s_y, s_t)
307 |             accum_loss += loss_i
308 |             accum_log_ppl += float(get_data(loss_i))
309 | 
310 |     return accum_loss, accum_log_ppl
311 | 
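The two accumulators make the perplexity bookkeeping explicit: summing the per-step cross-entropy (natural log) over the corpus and dividing by the word count gives log-perplexity, and exponentiating recovers perplexity. A tiny worked example with made-up numbers:

# --- worked example (illustrative numbers only) ---
import math

log_ppl = 5000.0 / 1000.0  # 5000 nats of summed cross-entropy over 1000 words -> 5.0
ppl = math.exp(log_ppl)    # ~148.4, the model's average branching factor per word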
312 | 
313 | def main():
314 |     args = parse_args()
315 | 
316 |     trace('making vocabulary ...')
317 |     vocab, num_lines, num_words = make_vocab(args.corpus, args.vocab)
318 | 
319 |     trace('initializing ...')
320 |     XP.set_library(args)
321 | 
322 |     trace('start training ...')
323 |     if args.rnn_model == 0:
324 |         model = BasicRnnLM(args.embed, args.hidden, args.vocab)
325 |     elif args.rnn_model == 1:
326 |         model = LSTMLM(args.embed, args.hidden, args.vocab)
327 |     elif args.rnn_model == 2:
328 |         model = AttentionLM(args.embed, args.hidden, args.vocab)
329 |     model.reset()
330 |     if args.use_gpu:
331 |         model.to_gpu()
332 | 
333 |     for epoch in range(args.epoch):
334 |         trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
335 |         log_ppl = 0.0
336 |         trained = 0
337 | 
338 |         opt = optimizers.AdaGrad(lr = 0.01)
339 |         opt.setup(model)
340 |         opt.add_hook(optimizer.GradientClipping(5))
341 | 
342 |         for batch in generate_batch(args.corpus, args.minibatch):
343 |             K = len(batch)
344 |             loss, perplexity = forward(batch, model, vocab, opt, args)
345 |             loss.backward()
346 |             log_ppl += perplexity
347 |             opt.update()
348 |             trained += K
349 |             model.reset()
350 | 
351 |             trace(' %d/%d' % (trained, num_lines))
352 |         log_ppl /= float(num_words)
353 |         trace('Train log(PPL) = %.10f' % log_ppl)
354 |         trace('Train PPL      = %.10f' % math.exp(log_ppl))
355 | 
356 |         log_ppl = 0.0
357 | 
358 |         for batch in generate_batch(args.valid, args.minibatch):
359 |             loss, perplexity = forward(batch, model, vocab, opt, args)
360 |             log_ppl += perplexity
361 |             model.reset()
362 | 
363 |         trace('Valid log(PPL) = %.10f' % log_ppl)
364 |         trace('Valid PPL      = %.10f' % math.exp(log_ppl))
365 | 
366 |         trace('saving model ...')
367 |         prefix = '%s-%d.%03d' % (args.model, args.rnn_model, epoch + 1)
368 |         save_vocab(prefix + '.srcvocab', vocab)
369 |         model.save_spec(prefix + '.spec')
370 |         serializers.save_hdf5(prefix + '.weights', model)
371 | 
372 |     trace('training finished.')
373 | 
374 | 
375 | def save_vocab(filename, vocab):
376 |     with open(filename, 'w') as fp:
377 |         for k, v in vocab.items():
378 |             if v == 0:
379 |                 continue
380 |             print('%s %d' % (k, v), file=fp)
381 | 
382 | 
383 | if __name__ == '__main__':
384 |     main()
385 | 
--------------------------------------------------------------------------------
/chainer-1.5/mt_s2s_attention.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy
3 | from argparse import ArgumentParser
4 | from chainer import Chain, ChainList, Variable, cuda, functions, links, optimizer, optimizers, serializers
5 | import util.generators as gens
6 | from util.functions import trace, fill_batch
7 | from util.vocabulary import Vocabulary
8 | 
9 | def parse_args():
10 |     def_gpu_device = 0
11 |     def_vocab = 1000
12 |     def_embed = 100
13 |     def_hidden = 200
14 |     def_epoch = 10
15 |     def_minibatch = 64
16 |     def_generation_limit = 128
17 | 
18 |     p = ArgumentParser(
19 |         description='Attentional neural machine translation',
20 |         usage=
21 |             '\n  %(prog)s train [options] source target model'
22 |             '\n  %(prog)s test source target model'
23 |             '\n  %(prog)s -h',
24 |     )
25 | 
26 | 
27 |     p.add_argument('mode', help='\'train\' or \'test\'')
28 |     p.add_argument('source', help='[in] source corpus')
29 |     p.add_argument('target', help='[in/out] target corpus')
30 |     p.add_argument('model', help='[in/out] model file')
31 |     p.add_argument('--use-gpu', action='store_true', default=False,
32 |         help='use GPU calculation')
33 |     p.add_argument('--gpu-device', default=def_gpu_device, metavar='INT', type=int,
34 |         help='GPU device ID to be used (default: %(default)d)')
35 |     p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int,
36 |         help='vocabulary size (default: %(default)d)')
37 |     p.add_argument('--embed', default=def_embed, metavar='INT', type=int,
38 |         help='embedding layer size (default: %(default)d)')
39 |     p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int,
40 |         help='hidden layer size (default: %(default)d)')
41 |     p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int,
42 |         help='number of training epoch (default: %(default)d)')
43 |     p.add_argument('--minibatch', default=def_minibatch, metavar='INT', type=int,
44 |         help='minibatch size (default: %(default)d)')
45 |     p.add_argument('--generation-limit', default=def_generation_limit, metavar='INT', type=int,
46 |         help='maximum number of words to be generated for test input (default: %(default)d)')
47 | 
48 |     args = p.parse_args()
49 | 
50 |     # check args
51 |     try:
52 |         if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'')
53 |         if args.vocab < 1: raise ValueError('you must set --vocab >= 1')
54 |         if args.embed < 1: raise ValueError('you must set --embed >= 1')
55 |         if args.hidden < 1: raise ValueError('you must set --hidden >= 1')
56 |         if args.epoch < 1: raise ValueError('you must set --epoch >= 1')
57 |         if args.minibatch < 1: raise ValueError('you must set --minibatch >= 1')
58 |         if args.generation_limit < 1: raise
ValueError('you must set --generation-limit >= 1') 59 | except Exception as ex: 60 | p.print_usage(file=sys.stderr) 61 | print(ex, file=sys.stderr) 62 | sys.exit() 63 | 64 | return args 65 | 66 | class XP: 67 | __lib = None 68 | 69 | @staticmethod 70 | def set_library(args): 71 | if args.use_gpu: 72 | XP.__lib = cuda.cupy 73 | cuda.get_device(args.gpu_device).use() 74 | else: 75 | XP.__lib = numpy 76 | 77 | @staticmethod 78 | def __zeros(shape, dtype): 79 | return Variable(XP.__lib.zeros(shape, dtype=dtype)) 80 | 81 | @staticmethod 82 | def fzeros(shape): 83 | return XP.__zeros(shape, XP.__lib.float32) 84 | 85 | @staticmethod 86 | def __nonzeros(shape, dtype, val): 87 | return Variable(val * XP.__lib.ones(shape, dtype=dtype)) 88 | 89 | @staticmethod 90 | def fnonzeros(shape, val=1): 91 | return XP.__nonzeros(shape, XP.__lib.float32, val) 92 | 93 | @staticmethod 94 | def __array(array, dtype): 95 | return Variable(XP.__lib.array(array, dtype=dtype)) 96 | 97 | @staticmethod 98 | def iarray(array): 99 | return XP.__array(array, XP.__lib.int32) 100 | 101 | @staticmethod 102 | def farray(array): 103 | return XP.__array(array, XP.__lib.float32) 104 | 105 | class SrcEmbed(Chain): 106 | def __init__(self, vocab_size, embed_size): 107 | super(SrcEmbed, self).__init__( 108 | xe = links.EmbedID(vocab_size, embed_size), 109 | ) 110 | 111 | def __call__(self, x): 112 | return functions.tanh(self.xe(x)) 113 | 114 | 115 | 116 | 117 | class MultiLayerStatefulLSTMEncoder(ChainList): 118 | """ 119 | This is an implementation of a Multilayered Stateful LSTM. 120 | The underlying idea is to simply stack multiple LSTMs where the LSTM at the bottom takes the regular input, 121 | and the LSTMs after that simply take the outputs (represented by h) of the previous LSMTs as inputs. 122 | This is simply an analogous version of the Multilayered Stateless LSTM Encoder where the LSTM states are kept hidden. 123 | This LSTM is to be called only by passing the input (x). 124 | To access the cell states you must call the "get_states" function with parameter "num_layers" indicating the number of layers. 125 | Although the cell outputs for each layer are returned, typically only the one of the topmost layer is used for various purposes like attention. 126 | Note that in Tensorflow the concept of "number of attention heads" is used which probably points to attention using the output of each layer. 127 | 128 | Args: 129 | embed_size - The size of embeddings of the inputs 130 | hidden_size - The size of the hidden layer representation of the RNN 131 | num_layers - The number of layers of the RNN (Indicates the number of RNNS stacked on top of each other) 132 | 133 | Attributes: 134 | num_layers: Indicates the number of layers in the RNN 135 | User Defined Methods: 136 | get_states: This simply returns the latest cell states (c) as an array for all layers. 137 | 138 | """ 139 | 140 | def __init__(self, embed_size, hidden_size, num_layers): 141 | super(MultiLayerStatefulLSTMEncoder, self).__init__() 142 | self.add_link(links.LSTM(embed_size,hidden_size)) 143 | for i in range(1, num_layers): 144 | self.add_link(links.LSTM(hidden_size, hidden_size)) 145 | self.num_layers = num_layers 146 | 147 | def __call__(self, x): 148 | """ 149 | Updates the internal state and returns the RNN outputs for each layer as a list. 150 | 151 | Args: 152 | x : A new batch from the input sequence. 153 | 154 | Returns: 155 | A list of the outputs (h) of updated RNN units over all the layers. 
156 | 
157 |         """
158 |         h_list = []
159 |         h_curr = self[0](x)
160 |         h_list.append(h_curr)
161 |         for i in range(1, self.num_layers):
162 |             h_curr = self[i](h_curr)
163 |             h_list.append(h_curr)
164 |         return h_list
165 | 
166 |     def get_states(self):
167 |         c_list = []
168 |         for i in range(self.num_layers):
169 |             c_list.append(self[i].c)
170 |         return c_list
171 | 
172 | class MultiLayerStatelessLSTMEncoder(ChainList):
173 |     """
174 |     This is an implementation of a multilayered stateless LSTM.
175 |     The underlying idea is to simply stack multiple LSTMs, where the LSTM at the bottom takes the regular input
176 |     and the LSTMs above it take the outputs (represented by h) of the previous LSTMs as inputs.
177 |     This is the analogous version of the multilayered stateful LSTM encoder, with the LSTM states exposed:
178 |     you have to pass the previous cell states (c) and outputs (h) along with the input (x) when calling the encoder.
179 |     Although the cell outputs for each layer are returned, typically only that of the topmost layer is used for purposes such as attention.
180 |     Note that TensorFlow uses the notion of a "number of attention heads", which probably refers to attending over the output of each layer.
181 | 
182 |     Args:
183 |         embed_size - The size of embeddings of the inputs
184 |         hidden_size - The size of the hidden layer representation of the RNN
185 |         num_layers - The number of layers of the RNN (indicates the number of RNNs stacked on top of each other)
186 | 
187 |     Attributes:
188 |         num_layers: Indicates the number of layers in the RNN
189 |     User Defined Methods:
190 | 
191 |     """
192 |     def __init__(self, embed_size, hidden_size, num_layers):
193 |         super(MultiLayerStatelessLSTMEncoder, self).__init__()
194 |         # two Linear links per layer: one for the layer input, one for the recurrent output
195 |         self.add_link(links.Linear(embed_size, 4 * hidden_size))
196 |         self.add_link(links.Linear(hidden_size, 4 * hidden_size))
197 |         for i in range(1, num_layers):
198 |             self.add_link(links.Linear(hidden_size, 4 * hidden_size))
199 |             self.add_link(links.Linear(hidden_size, 4 * hidden_size))
200 |         self.num_layers = num_layers
201 |     def __call__(self, x, c, h):
202 |         """
203 |         Updates the internal state and returns the RNN outputs for each layer as a list.
204 | 
205 |         Args:
206 |             x : A new batch from the input sequence.
207 |             c : The list of the previous cell states.
208 |             h : The list of the previous cell outputs.
209 |         Returns:
210 |             A list of the outputs (h) and another of the states (c) of the updated RNN units over all the layers.
211 | 
212 |         """
213 |         c_list = []
214 |         h_list = []
215 |         c_curr, h_curr = functions.lstm(c[0], self[0](x) + self[1](h[0]))
216 |         c_list.append(c_curr)
217 |         h_list.append(h_curr)
218 |         for i in range(1, self.num_layers):
219 |             c_curr, h_curr = functions.lstm(c[i], self[2 * i](h_curr) + self[2 * i + 1](h[i]))
220 |             c_list.append(c_curr)
221 |             h_list.append(h_curr)
222 |         return c_list, h_list
223 | 
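A minimal sketch of how a caller threads state through the stateless stack (sizes are illustrative; plain NumPy Variables stand in for real data):

# --- usage sketch (illustrative, not part of the file) ---
import numpy
from chainer import Variable

num_layers, batch, embed, hidden = 2, 3, 4, 5
enc = MultiLayerStatelessLSTMEncoder(embed, hidden, num_layers)

zeros = lambda: Variable(numpy.zeros((batch, hidden), numpy.float32))
c = [zeros() for _ in range(num_layers)]
h = [zeros() for _ in range(num_layers)]

x = Variable(numpy.random.rand(batch, embed).astype(numpy.float32))
c, h = enc(x, c, h)  # the caller keeps the per-layer state between time steps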
224 | class MultiLayerGRUEncoder(ChainList):
225 |     """
226 |     This is an implementation of a multilayered stateless GRU.
227 |     The underlying idea is to simply stack multiple GRUs, where the GRU at the bottom takes the regular input
228 |     and the GRUs above it take the outputs (represented by h) of the previous GRUs as inputs.
229 |     You have to pass the previous cell outputs (h) along with the input (x) when calling the encoder.
230 |     The stateful GRU just saves the cell state, so its multilayered version is not implemented unless demanded.
231 | 
232 |     Args:
233 |         embed_size - The size of embeddings of the inputs
234 |         hidden_size - The size of the hidden layer representation of the RNN
235 |         num_layers - The number of layers of the RNN (indicates the number of RNNs stacked on top of each other)
236 | 
237 |     Attributes:
238 |         num_layers: Indicates the number of layers in the RNN
239 |     User Defined Methods:
240 | 
241 |     """
242 | 
243 |     def __init__(self, embed_size, hidden_size, num_layers):
244 |         super(MultiLayerGRUEncoder, self).__init__()
245 |         self.add_link(links.GRU(hidden_size, embed_size))
246 |         for i in range(1, num_layers):
247 |             self.add_link(links.GRU(hidden_size, hidden_size))
248 |         self.num_layers = num_layers
249 | 
250 |     def __call__(self, x, h):
251 |         """
252 |         Updates the internal state and returns the RNN outputs for each layer as a list.
253 | 
254 |         Args:
255 |             x : A new batch from the input sequence.
256 |             h : The list of the previous cell outputs.
257 |         Returns:
258 |             A list of the outputs (h) of the updated RNN units over all the layers.
259 | 
260 |         """
261 |         h_list = []
262 |         h_curr = self[0](h[0], x)
263 |         h_list.append(h_curr)
264 |         for i in range(1, self.num_layers):
265 |             h_curr = self[i](h[i], h_curr)
266 |             h_list.append(h_curr)
267 |         return h_list
268 | 
269 | 
270 | class GRUEncoder(Chain):
271 | 
272 |     """
273 |     This is just the same Encoder as below.
274 |     The only difference is that the RNN cell is a GRU.
275 | 
276 | 
277 |     Args:
278 |         embed_size - The size of embeddings of the inputs
279 |         hidden_size - The size of the hidden layer representation of the RNN
280 | 
281 | 
282 |     Attributes:
283 | 
284 |     User Defined Methods:
285 | 
286 |     """
287 | 
288 |     def __init__(self, embed_size, hidden_size):
289 |         super(GRUEncoder, self).__init__(
290 |             GRU = links.GRU(hidden_size, embed_size),
291 |         )
292 | 
293 |     def __call__(self, x, h):
294 |         """
295 |         Updates the internal state and returns the RNN output (h).
296 |         Note that for a GRU the state carried between steps is the output itself (c and h coincide),
297 |         so the previous output h is passed back in on each call.
298 | 
299 |         Args:
300 |             x : A new batch from the input sequence.
301 |             h : The previous output.
302 |         Returns:
303 |             The output (h) of the updated RNN unit.
304 | 
305 |         """
306 |         return self.GRU(h, x)
307 | 
308 | class StatefulEncoder(Chain):
309 | 
310 |     """
311 |     This is just the same Encoder as below.
312 |     The only difference is that the LSTM class implementation is used instead of the LSTM function.
313 |     Instead of explicitly defining the LSTM components, the LSTM class encapsulates these components, making the Encoder look simpler.
314 | 
315 |     Args:
316 |         embed_size - The size of embeddings of the inputs
317 |         hidden_size - The size of the hidden layer representation of the RNN
318 | 
319 | 
320 |     Attributes:
321 | 
322 |     User Defined Methods:
323 |         get_state: This simply returns the latest cell state (c).
324 |     """
325 | 
326 |     def __init__(self, embed_size, hidden_size):
327 |         super(StatefulEncoder, self).__init__(
328 |             LSTM = links.LSTM(embed_size, hidden_size),
329 |         )
330 | 
331 |     def __call__(self, x):
332 |         """
333 |         Updates the internal state and returns the RNN output (h).
334 | 
335 |         Args:
336 |             x : A new batch from the input sequence.
337 | 
338 |         Returns:
339 |             The output (h) of the updated RNN unit.
340 | 
341 |         """
342 |         return self.LSTM(x)
343 | 
344 |     def get_state(self):
345 |         return self.LSTM.c
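For comparison with the stateless stack shown earlier, the stateful multilayer encoder hides all of this bookkeeping. A minimal sketch (illustrative sizes, CPU, Chainer 1.5):

# --- usage sketch (illustrative, not part of the file) ---
import numpy
from chainer import Variable

enc = MultiLayerStatefulLSTMEncoder(4, 5, num_layers=2)
x = Variable(numpy.random.rand(3, 4).astype(numpy.float32))
h_list = enc(x)            # one output per layer; h_list[-1] is what attention usually consumes
c_list = enc.get_states()  # the matching per-layer cell states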
346 | class StateLessEncoder(Chain):
347 |     """
348 |     This is just the same Encoder as below. The name is changed for the sake of disambiguation.
349 |     The LSTM components are explicitly defined, and the LSTM function is used in place of the LSTM class.
350 | 
351 |     Args:
352 |         embed_size - The size of embeddings of the inputs
353 |         hidden_size - The size of the hidden layer representation of the RNN
354 | 
355 | 
356 |     Attributes:
357 | 
358 |     User Defined Methods:
359 |     """
360 |     def __init__(self, embed_size, hidden_size):
361 |         super(StateLessEncoder, self).__init__(
362 |             xh = links.Linear(embed_size, 4 * hidden_size),
363 |             hh = links.Linear(hidden_size, 4 * hidden_size),
364 |         )
365 | 
366 |     def __call__(self, x, c, h):
367 |         """
368 |         Updates the state and returns the RNN output.
369 | 
370 |         Args:
371 |             x : A new batch from the input sequence.
372 |             c : The previous cell state.
373 |             h : The previous cell output.
374 |         Returns:
375 |             The output (h) and the state (c) of the updated RNN unit.
376 | 
377 |         """
378 |         return functions.lstm(c, self.xh(x) + self.hh(h))
379 | 
380 | class Encoder(Chain):
381 |     def __init__(self, embed_size, hidden_size):
382 |         super(Encoder, self).__init__(
383 |             xh = links.Linear(embed_size, 4 * hidden_size),
384 |             hh = links.Linear(hidden_size, 4 * hidden_size),
385 |         )
386 | 
387 |     def __call__(self, x, c, h):
388 |         return functions.lstm(c, self.xh(x) + self.hh(h))
389 | 
390 | class Attention(Chain):
391 |     def __init__(self, hidden_size):
392 |         super(Attention, self).__init__(
393 |             aw = links.Linear(hidden_size, hidden_size),
394 |             bw = links.Linear(hidden_size, hidden_size),
395 |             pw = links.Linear(hidden_size, hidden_size),
396 |             we = links.Linear(hidden_size, 1),
397 |         )
398 |         self.hidden_size = hidden_size
399 | 
400 |     def __call__(self, a_list, b_list, p):
401 |         batch_size = p.data.shape[0]
402 |         e_list = []
403 |         sum_e = XP.fzeros((batch_size, 1))
404 |         for a, b in zip(a_list, b_list):
405 |             w = functions.tanh(self.aw(a) + self.bw(b) + self.pw(p))
406 |             e = functions.exp(self.we(w))
407 |             e_list.append(e)
408 |             sum_e += e
409 |         ZEROS = XP.fzeros((batch_size, self.hidden_size))
410 |         aa = ZEROS
411 |         bb = ZEROS
412 |         for a, b, e in zip(a_list, b_list, e_list):
413 |             e /= sum_e
414 |             aa += functions.reshape(functions.batch_matmul(a, e), (batch_size, self.hidden_size))
415 |             bb += functions.reshape(functions.batch_matmul(b, e), (batch_size, self.hidden_size))
416 |         return aa, bb
417 | 
418 | class LocalAttention(Chain):
419 |     def __init__(self, hidden_size):
420 |         super(LocalAttention, self).__init__(
421 |             aw = links.Linear(hidden_size, hidden_size),
422 |             bw = links.Linear(hidden_size, hidden_size),
423 |             pw = links.Linear(hidden_size, hidden_size),
424 |             we = links.Linear(hidden_size, 1),
425 |             ts = links.Linear(hidden_size, hidden_size),
426 |             sp = links.Linear(hidden_size, 1),
427 |         )
428 |         self.hidden_size = hidden_size
429 | 
430 |     def __call__(self, a_list, b_list, p, sentence_length, window_size):
431 |         batch_size = p.data.shape[0]
432 |         SENTENCE_LENGTH = XP.fnonzeros((batch_size, 1), sentence_length)
433 |         e_list = []
434 |         sum_e = XP.fzeros((batch_size, 1))
435 |         s = functions.tanh(self.ts(p))
436 |         pos = SENTENCE_LENGTH * functions.sigmoid(self.sp(s))
437 | 
438 |         # TODO: develop batch logic to zero out the components of a and b which fall outside the window.
439 |         # Open question: do we have to iterate over each element in the batch? That would be slow.
440 | # One logic: Get global alignment matrix of (batch x) hidden size x sentence length and then another matrix of (batch x) sentence length which 441 | # will essentially be a matrix containing the gaussian distrubution weight and there will be zeros where the sentence position falls out of the window 442 | # Another logic: Create a matrix of (batch x) sentence length where there will be 1 for each position in the window 443 | 444 | # Separate the attention weights for a and b cause forward is different from backward. 445 | 446 | for a, b in zip(a_list, b_list): 447 | w = functions.tanh(self.aw(a) + self.bw(b) + self.pw(p)) 448 | e = functions.exp(self.we(w)) 449 | e_list.append(e) 450 | sum_e += e 451 | ZEROS = XP.fzeros((batch_size, self.hidden_size)) 452 | aa = ZEROS 453 | bb = ZEROS 454 | for a, b, e in zip(a_list, b_list, e_list): 455 | e /= sum_e 456 | aa += a * e 457 | bb += b * e 458 | return aa, bb 459 | 460 | 461 | class Decoder(Chain): 462 | def __init__(self, vocab_size, embed_size, hidden_size): 463 | super(Decoder, self).__init__( 464 | ye = links.EmbedID(vocab_size, embed_size), 465 | eh = links.Linear(embed_size, 4 * hidden_size), 466 | hh = links.Linear(hidden_size, 4 * hidden_size), 467 | ah = links.Linear(hidden_size, 4 * hidden_size), 468 | bh = links.Linear(hidden_size, 4 * hidden_size), 469 | hf = links.Linear(hidden_size, embed_size), 470 | fy = links.Linear(embed_size, vocab_size), 471 | ) 472 | 473 | def __call__(self, y, c, h, a, b): 474 | e = functions.tanh(self.ye(y)) 475 | c, h = functions.lstm(c, self.eh(e) + self.hh(h) + self.ah(a) + self.bh(b)) 476 | f = functions.tanh(self.hf(h)) 477 | return self.fy(f), c, h 478 | 479 | class AttentionMT(Chain): 480 | def __init__(self, vocab_size, embed_size, hidden_size): 481 | super(AttentionMT, self).__init__( 482 | emb = SrcEmbed(vocab_size, embed_size), 483 | fenc = Encoder(embed_size, hidden_size), 484 | benc = Encoder(embed_size, hidden_size), 485 | att = Attention(hidden_size), 486 | dec = Decoder(vocab_size, embed_size, hidden_size), 487 | ) 488 | self.vocab_size = vocab_size 489 | self.embed_size = embed_size 490 | self.hidden_size = hidden_size 491 | 492 | def reset(self, batch_size): 493 | self.zerograds() 494 | self.x_list = [] 495 | 496 | def embed(self, x): 497 | self.x_list.append(self.emb(x)) 498 | 499 | def encode(self): 500 | src_len = len(self.x_list) 501 | batch_size = self.x_list[0].data.shape[0] 502 | ZEROS = XP.fzeros((batch_size, self.hidden_size)) 503 | c = ZEROS 504 | a = ZEROS 505 | a_list = [] 506 | for x in self.x_list: 507 | c, a = self.fenc(x, c, a) 508 | a_list.append(a) 509 | c = ZEROS 510 | b = ZEROS 511 | b_list = [] 512 | for x in reversed(self.x_list): 513 | c, b = self.benc(x, c, b) 514 | b_list.insert(0, b) 515 | self.a_list = a_list 516 | self.b_list = b_list 517 | self.c = ZEROS 518 | self.h = ZEROS 519 | 520 | def decode(self, y): 521 | aa, bb = self.att(self.a_list, self.b_list, self.h) 522 | y, self.c, self.h = self.dec(y, self.c, self.h, aa, bb) 523 | return y 524 | 525 | def save_spec(self, filename): 526 | with open(filename, 'w') as fp: 527 | print(self.vocab_size, file=fp) 528 | print(self.embed_size, file=fp) 529 | print(self.hidden_size, file=fp) 530 | 531 | @staticmethod 532 | def load_spec(filename): 533 | with open(filename) as fp: 534 | vocab_size = int(next(fp)) 535 | embed_size = int(next(fp)) 536 | hidden_size = int(next(fp)) 537 | return AttentionMT(vocab_size, embed_size, hidden_size) 538 | 539 | def forward(src_batch, trg_batch, src_vocab, trg_vocab, attmt, 
is_training, generation_limit):
540 |     batch_size = len(src_batch)
541 |     src_len = len(src_batch[0])
542 |     trg_len = len(trg_batch[0]) if trg_batch else 0
543 |     src_stoi = src_vocab.stoi
544 |     trg_stoi = trg_vocab.stoi
545 |     trg_itos = trg_vocab.itos
546 |     attmt.reset(batch_size)
547 | 
548 |     x = XP.iarray([src_stoi('<s>') for _ in range(batch_size)])
549 |     attmt.embed(x)
550 |     for l in range(src_len):
551 |         x = XP.iarray([src_stoi(src_batch[k][l]) for k in range(batch_size)])
552 |         attmt.embed(x)
553 |     x = XP.iarray([src_stoi('</s>') for _ in range(batch_size)])
554 |     attmt.embed(x)
555 | 
556 |     attmt.encode()
557 | 
558 |     t = XP.iarray([trg_stoi('<s>') for _ in range(batch_size)])
559 |     hyp_batch = [[] for _ in range(batch_size)]
560 | 
561 |     if is_training:
562 |         loss = XP.fzeros(())
563 |         for l in range(trg_len):
564 |             y = attmt.decode(t)
565 |             t = XP.iarray([trg_stoi(trg_batch[k][l]) for k in range(batch_size)])
566 |             loss += functions.softmax_cross_entropy(y, t)
567 |             output = cuda.to_cpu(y.data.argmax(1))
568 |             for k in range(batch_size):
569 |                 hyp_batch[k].append(trg_itos(output[k]))
570 |         return hyp_batch, loss
571 | 
572 |     else:
573 |         while len(hyp_batch[0]) < generation_limit:
574 |             y = attmt.decode(t)
575 |             output = cuda.to_cpu(y.data.argmax(1))
576 |             t = XP.iarray(output)
577 |             for k in range(batch_size):
578 |                 hyp_batch[k].append(trg_itos(output[k]))
579 |             if all(hyp_batch[k][-1] == '</s>' for k in range(batch_size)):
580 |                 break
581 | 
582 |         return hyp_batch
583 | 
584 | def train(args):
585 |     trace('making vocabularies ...')
586 |     src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
587 |     trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)
588 | 
589 |     trace('making model ...')
590 |     attmt = AttentionMT(args.vocab, args.embed, args.hidden)
591 |     if args.use_gpu:
592 |         attmt.to_gpu()
593 | 
594 |     for epoch in range(args.epoch):
595 |         trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
596 |         trained = 0
597 |         gen1 = gens.word_list(args.source)
598 |         gen2 = gens.word_list(args.target)
599 |         gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch), args.minibatch)
600 |         opt = optimizers.AdaGrad(lr = 0.01)
601 |         opt.setup(attmt)
602 |         opt.add_hook(optimizer.GradientClipping(5))
603 | 
604 |         for src_batch, trg_batch in gen3:
605 |             src_batch = fill_batch(src_batch)
606 |             trg_batch = fill_batch(trg_batch)
607 |             K = len(src_batch)
608 |             hyp_batch, loss = forward(src_batch, trg_batch, src_vocab, trg_vocab, attmt, True, 0)
609 |             loss.backward()
610 |             opt.update()
611 | 
612 |             for k in range(K):
613 |                 trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
614 |                 trace('  src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[k]]))
615 |                 trace('  trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
616 |                 trace('  hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))
617 | 
618 |             trained += K
619 | 
620 |         trace('saving model ...')
621 |         prefix = args.model + '.%03d' % (epoch + 1)
622 |         src_vocab.save(prefix + '.srcvocab')
623 |         trg_vocab.save(prefix + '.trgvocab')
624 |         attmt.save_spec(prefix + '.spec')
625 |         serializers.save_hdf5(prefix + '.weights', attmt)
626 | 
627 |     trace('finished.')
628 | 
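train() relies on util.functions.fill_batch (defined later in util/functions.py) to right-pad every minibatch with the end-of-sentence token so all rows share one length. A quick sketch of its behavior:

# --- usage sketch (illustrative, not part of the file) ---
from util.functions import fill_batch

print(fill_batch([['a', 'b'], ['c']]))
# [['a', 'b', '</s>'], ['c', '</s>', '</s>']]  -- every row gets at least one trailing </s>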
629 | def test(args):
630 |     trace('loading model ...')
631 |     src_vocab = Vocabulary.load(args.model + '.srcvocab')
632 |     trg_vocab = Vocabulary.load(args.model + '.trgvocab')
633 |     attmt = AttentionMT.load_spec(args.model + '.spec')
634 |     if args.use_gpu:
635 |         attmt.to_gpu()
636 |     serializers.load_hdf5(args.model + '.weights', attmt)
637 | 
638 |     trace('generating translation ...')
639 |     generated = 0
640 | 
641 |     with open(args.target, 'w') as fp:
642 |         for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
643 |             src_batch = fill_batch(src_batch)
644 |             K = len(src_batch)
645 | 
646 |             trace('sample %8d - %8d ...' % (generated + 1, generated + K))
647 |             hyp_batch = forward(src_batch, None, src_vocab, trg_vocab, attmt, False, args.generation_limit)
648 | 
649 |             for hyp in hyp_batch:
650 |                 hyp.append('</s>')
651 |                 hyp = hyp[:hyp.index('</s>')]
652 |                 print(' '.join(hyp), file=fp)
653 | 
654 |             generated += K
655 | 
656 |     trace('finished.')
657 | 
658 | def main():
659 |     args = parse_args()
660 |     XP.set_library(args)
661 |     if args.mode == 'train': train(args)
662 |     elif args.mode == 'test': test(args)
663 | 
664 | if __name__ == '__main__':
665 |     main()
666 | 
--------------------------------------------------------------------------------
/chainer-1.5/mt_s2s_encdec.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import numpy
3 | from argparse import ArgumentParser
4 | from chainer import Chain, Variable, cuda, functions, links, optimizer, optimizers, serializers
5 | import util.generators as gens
6 | from util.functions import trace, fill_batch
7 | from util.vocabulary import Vocabulary
8 | 
9 | def parse_args():
10 |     def_gpu_device = 0
11 |     def_vocab = 1000
12 |     def_embed = 100
13 |     def_hidden = 200
14 |     def_epoch = 10
15 |     def_minibatch = 64
16 |     def_generation_limit = 128
17 | 
18 |     p = ArgumentParser(
19 |         description='Encoder-decoder neural machine translation',
20 |         usage=
21 |             '\n  %(prog)s train [options] source target model'
22 |             '\n  %(prog)s test source target model'
23 |             '\n  %(prog)s -h',
24 |     )
25 | 
26 |     p.add_argument('mode', help='\'train\' or \'test\'')
27 |     p.add_argument('source', help='[in] source corpus')
28 |     p.add_argument('target', help='[in/out] target corpus')
29 |     p.add_argument('model', help='[in/out] model file')
30 |     p.add_argument('--use-gpu', action='store_true', default=False,
31 |         help='use GPU calculation')
32 |     p.add_argument('--gpu-device', default=def_gpu_device, metavar='INT', type=int,
33 |         help='GPU device ID to be used (default: %(default)d)')
34 |     p.add_argument('--vocab', default=def_vocab, metavar='INT', type=int,
35 |         help='vocabulary size (default: %(default)d)')
36 |     p.add_argument('--embed', default=def_embed, metavar='INT', type=int,
37 |         help='embedding layer size (default: %(default)d)')
38 |     p.add_argument('--hidden', default=def_hidden, metavar='INT', type=int,
39 |         help='hidden layer size (default: %(default)d)')
40 |     p.add_argument('--epoch', default=def_epoch, metavar='INT', type=int,
41 |         help='number of training epoch (default: %(default)d)')
42 |     p.add_argument('--minibatch', default=def_minibatch, metavar='INT', type=int,
43 |         help='minibatch size (default: %(default)d)')
44 |     p.add_argument('--generation-limit', default=def_generation_limit, metavar='INT', type=int,
45 |         help='maximum number of words to be generated for test input (default: %(default)d)')
46 | 
47 |     args = p.parse_args()
48 | 
49 |     # check args
50 |     try:
51 |         if args.mode not in ['train', 'test']: raise ValueError('you must set mode = \'train\' or \'test\'')
52 |         if args.vocab < 1: raise ValueError('you must set --vocab >= 1')
53 |         if args.embed < 1: raise ValueError('you must set --embed >= 1')
54 |         if args.hidden < 1: raise ValueError('you must set --hidden >= 1')
55 |         if args.epoch < 1: raise ValueError('you must set --epoch >=
1') 56 | if args.minibatch < 1: raise ValueError('you must set --minibatch >= 1') 57 | if args.generation_limit < 1: raise ValueError('you must set --generation-limit >= 1') 58 | except Exception as ex: 59 | p.print_usage(file=sys.stderr) 60 | print(ex, file=sys.stderr) 61 | sys.exit() 62 | 63 | return args 64 | 65 | class XP: 66 | __lib = None 67 | 68 | @staticmethod 69 | def set_library(args): 70 | if args.use_gpu: 71 | XP.__lib = cuda.cupy 72 | cuda.get_device(args.gpu_device).use() 73 | else: 74 | XP.__lib = numpy 75 | 76 | @staticmethod 77 | def __zeros(shape, dtype): 78 | return Variable(XP.__lib.zeros(shape, dtype=dtype)) 79 | 80 | @staticmethod 81 | def fzeros(shape): 82 | return XP.__zeros(shape, XP.__lib.float32) 83 | 84 | @staticmethod 85 | def __array(array, dtype): 86 | return Variable(XP.__lib.array(array, dtype=dtype)) 87 | 88 | @staticmethod 89 | def iarray(array): 90 | return XP.__array(array, XP.__lib.int32) 91 | 92 | @staticmethod 93 | def farray(array): 94 | return XP.__array(array, XP.__lib.float32) 95 | 96 | class Encoder(Chain): 97 | def __init__(self, vocab_size, embed_size, hidden_size): 98 | super(Encoder, self).__init__( 99 | xe = links.EmbedID(vocab_size, embed_size), 100 | eh = links.Linear(embed_size, 4 * hidden_size), 101 | hh = links.Linear(hidden_size, 4 * hidden_size), 102 | ) 103 | 104 | def __call__(self, x, c, h): 105 | e = functions.tanh(self.xe(x)) 106 | return functions.lstm(c, self.eh(e) + self.hh(h)) 107 | 108 | class Decoder(Chain): 109 | def __init__(self, vocab_size, embed_size, hidden_size): 110 | super(Decoder, self).__init__( 111 | ye = links.EmbedID(vocab_size, embed_size), 112 | eh = links.Linear(embed_size, 4 * hidden_size), 113 | hh = links.Linear(hidden_size, 4 * hidden_size), 114 | hf = links.Linear(hidden_size, embed_size), 115 | fy = links.Linear(embed_size, vocab_size), 116 | ) 117 | 118 | def __call__(self, y, c, h): 119 | e = functions.tanh(self.ye(y)) 120 | c, h = functions.lstm(c, self.eh(e) + self.hh(h)) 121 | f = functions.tanh(self.hf(h)) 122 | return self.fy(f), c, h 123 | 124 | class EncoderDecoder(Chain): 125 | def __init__(self, vocab_size, embed_size, hidden_size): 126 | super(EncoderDecoder, self).__init__( 127 | enc = Encoder(vocab_size, embed_size, hidden_size), 128 | dec = Decoder(vocab_size, embed_size, hidden_size), 129 | ) 130 | self.vocab_size = vocab_size 131 | self.embed_size = embed_size 132 | self.hidden_size = hidden_size 133 | 134 | def reset(self, batch_size): 135 | self.zerograds() 136 | self.c = XP.fzeros((batch_size, self.hidden_size)) 137 | self.h = XP.fzeros((batch_size, self.hidden_size)) 138 | 139 | def encode(self, x): 140 | self.c, self.h = self.enc(x, self.c, self.h) 141 | 142 | def decode(self, y): 143 | y, self.c, self.h = self.dec(y, self.c, self.h) 144 | return y 145 | 146 | def save_spec(self, filename): 147 | with open(filename, 'w') as fp: 148 | print(self.vocab_size, file=fp) 149 | print(self.embed_size, file=fp) 150 | print(self.hidden_size, file=fp) 151 | 152 | @staticmethod 153 | def load_spec(filename): 154 | with open(filename) as fp: 155 | vocab_size = int(next(fp)) 156 | embed_size = int(next(fp)) 157 | hidden_size = int(next(fp)) 158 | return EncoderDecoder(vocab_size, embed_size, hidden_size) 159 | 160 | def forward(src_batch, trg_batch, src_vocab, trg_vocab, encdec, is_training, generation_limit): 161 | batch_size = len(src_batch) 162 | src_len = len(src_batch[0]) 163 | trg_len = len(trg_batch[0]) if trg_batch else 0 164 | src_stoi = src_vocab.stoi 165 | trg_stoi = trg_vocab.stoi 
166 |     trg_itos = trg_vocab.itos
167 |     encdec.reset(batch_size)
168 | 
169 |     x = XP.iarray([src_stoi('</s>') for _ in range(batch_size)])
170 |     encdec.encode(x)
171 |     for l in reversed(range(src_len)):
172 |         x = XP.iarray([src_stoi(src_batch[k][l]) for k in range(batch_size)])
173 |         encdec.encode(x)
174 | 
175 |     t = XP.iarray([trg_stoi('<s>') for _ in range(batch_size)])
176 |     hyp_batch = [[] for _ in range(batch_size)]
177 | 
178 |     if is_training:
179 |         loss = XP.fzeros(())
180 |         for l in range(trg_len):
181 |             y = encdec.decode(t)
182 |             t = XP.iarray([trg_stoi(trg_batch[k][l]) for k in range(batch_size)])
183 |             loss += functions.softmax_cross_entropy(y, t)
184 |             output = cuda.to_cpu(y.data.argmax(1))
185 |             for k in range(batch_size):
186 |                 hyp_batch[k].append(trg_itos(output[k]))
187 |         return hyp_batch, loss
188 | 
189 |     else:
190 |         while len(hyp_batch[0]) < generation_limit:
191 |             y = encdec.decode(t)
192 |             output = cuda.to_cpu(y.data.argmax(1))
193 |             t = XP.iarray(output)
194 |             for k in range(batch_size):
195 |                 hyp_batch[k].append(trg_itos(output[k]))
196 |             if all(hyp_batch[k][-1] == '</s>' for k in range(batch_size)):
197 |                 break
198 | 
199 |         return hyp_batch
200 | 
201 | def train(args):
202 |     trace('making vocabularies ...')
203 |     src_vocab = Vocabulary.new(gens.word_list(args.source), args.vocab)
204 |     trg_vocab = Vocabulary.new(gens.word_list(args.target), args.vocab)
205 | 
206 |     trace('making model ...')
207 |     encdec = EncoderDecoder(args.vocab, args.embed, args.hidden)
208 |     if args.use_gpu:
209 |         encdec.to_gpu()
210 | 
211 |     for epoch in range(args.epoch):
212 |         trace('epoch %d/%d: ' % (epoch + 1, args.epoch))
213 |         trained = 0
214 |         gen1 = gens.word_list(args.source)
215 |         gen2 = gens.word_list(args.target)
216 |         gen3 = gens.batch(gens.sorted_parallel(gen1, gen2, 100 * args.minibatch), args.minibatch)
217 |         opt = optimizers.AdaGrad(lr = 0.01)
218 |         opt.setup(encdec)
219 |         opt.add_hook(optimizer.GradientClipping(5))
220 | 
221 |         for src_batch, trg_batch in gen3:
222 |             src_batch = fill_batch(src_batch)
223 |             trg_batch = fill_batch(trg_batch)
224 |             K = len(src_batch)
225 |             hyp_batch, loss = forward(src_batch, trg_batch, src_vocab, trg_vocab, encdec, True, 0)
226 |             loss.backward()
227 |             opt.update()
228 | 
229 |             for k in range(K):
230 |                 trace('epoch %3d/%3d, sample %8d' % (epoch + 1, args.epoch, trained + k + 1))
231 |                 trace('  src = ' + ' '.join([x if x != '</s>' else '*' for x in src_batch[k]]))
232 |                 trace('  trg = ' + ' '.join([x if x != '</s>' else '*' for x in trg_batch[k]]))
233 |                 trace('  hyp = ' + ' '.join([x if x != '</s>' else '*' for x in hyp_batch[k]]))
234 | 
235 |             trained += K
236 | 
237 |         trace('saving model ...')
238 |         prefix = args.model + '.%03d' % (epoch + 1)
239 |         src_vocab.save(prefix + '.srcvocab')
240 |         trg_vocab.save(prefix + '.trgvocab')
241 |         encdec.save_spec(prefix + '.spec')
242 |         serializers.save_hdf5(prefix + '.weights', encdec)
243 | 
244 |     trace('finished.')
245 | 
246 | def test(args):
247 |     trace('loading model ...')
248 |     src_vocab = Vocabulary.load(args.model + '.srcvocab')
249 |     trg_vocab = Vocabulary.load(args.model + '.trgvocab')
250 |     encdec = EncoderDecoder.load_spec(args.model + '.spec')
251 |     if args.use_gpu:
252 |         encdec.to_gpu()
253 |     serializers.load_hdf5(args.model + '.weights', encdec)
254 | 
255 |     trace('generating translation ...')
256 |     generated = 0
257 | 
258 |     with open(args.target, 'w') as fp:
259 |         for src_batch in gens.batch(gens.word_list(args.source), args.minibatch):
260 |             src_batch = fill_batch(src_batch)
261 |             K = len(src_batch)
262 | 
263 |             trace('sample %8d - %8d ...' % (generated + 1, generated + K))
264 |             hyp_batch = forward(src_batch, None, src_vocab, trg_vocab, encdec, False, args.generation_limit)
265 | 
266 |             for hyp in hyp_batch:
267 |                 hyp.append('</s>')
268 |                 hyp = hyp[:hyp.index('</s>')]
269 |                 print(' '.join(hyp), file=fp)
270 | 
271 |             generated += K
272 | 
273 |     trace('finished.')
274 | 
275 | def main():
276 |     args = parse_args()
277 |     XP.set_library(args)
278 |     if args.mode == 'train': train(args)
279 |     elif args.mode == 'test': test(args)
280 | 
281 | if __name__ == '__main__':
282 |     main()
283 | 
--------------------------------------------------------------------------------
/chainer-1.5/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/odashi/chainer_examples/b13ec64e5035b1eb75b873431786d880577b7370/chainer-1.5/util/__init__.py
--------------------------------------------------------------------------------
/chainer-1.5/util/functions.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import datetime
3 | 
4 | def trace(*args):
5 |     print(datetime.datetime.now(), '...', *args, file=sys.stderr)
6 |     sys.stderr.flush()
7 | 
8 | def fill_batch(batch, token='</s>'):
9 |     max_len = max(len(x) for x in batch)
10 |     return [x + [token] * (max_len - len(x) + 1) for x in batch]
11 | 
12 | def fill_batch2(batch, start_token='<s>', end_token='</s>'):
13 |     max_len = max(len(x) for x in batch)
14 |     return [[start_token] + x + [end_token] * (max_len - len(x) + 1) for x in batch]
15 | 
--------------------------------------------------------------------------------
/chainer-1.5/util/generators.py:
--------------------------------------------------------------------------------
1 | def batch(generator, batch_size):
2 |     batch = []
3 |     is_tuple = False
4 |     for l in generator:
5 |         is_tuple = isinstance(l, tuple)
6 |         batch.append(l)
7 |         if len(batch) == batch_size:
8 |             yield tuple(list(x) for x in zip(*batch)) if is_tuple else batch
9 |             batch = []
10 |     if batch:
11 |         yield tuple(list(x) for x in zip(*batch)) if is_tuple else batch
12 | 
13 | def sorted_parallel(generator1, generator2, pooling, order=1):
14 |     gen1 = batch(generator1, pooling)
15 |     gen2 = batch(generator2, pooling)
16 |     for batch1, batch2 in zip(gen1, gen2):
17 |         #yield from sorted(zip(batch1, batch2), key=lambda x: len(x[1]))
18 |         for x in sorted(zip(batch1, batch2), key=lambda x: len(x[order])):
19 |             yield x
20 | 
21 | def word_list(filename):
22 |     with open(filename) as fp:
23 |         for l in fp:
24 |             yield l.split()
25 | 
26 | def letter_list(filename):
27 |     with open(filename) as fp:
28 |         for l in fp:
29 |             yield list(''.join(l.split()))
30 | 
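A small sketch of how these generators compose (toy data; within each pool, pairs come back ordered by target length, since sorted_parallel sorts by len(x[1]) by default):

# --- usage sketch (illustrative, not part of the file) ---
import util.generators as gens

src = iter([['a'], ['b', 'b', 'b'], ['c', 'c']])
trg = iter([['x', 'x'], ['y'], ['z', 'z', 'z']])

for pair in gens.sorted_parallel(src, trg, 3):
    print(pair)
# (['b', 'b', 'b'], ['y'])
# (['a'], ['x', 'x'])
# (['c', 'c'], ['z', 'z', 'z'])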
--------------------------------------------------------------------------------
/chainer-1.5/util/vocabulary.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | 
3 | 
4 | class Vocabulary:
5 |     def __init__(self):
6 |         pass
7 | 
8 |     def __len__(self):
9 |         return self.__size
10 | 
11 |     def stoi(self, s):
12 |         return self.__stoi[s]
13 | 
14 |     def itos(self, i):
15 |         return self.__itos[i]
16 | 
17 |     @staticmethod
18 |     def new(list_generator, size):
19 |         self = Vocabulary()
20 |         self.__size = size
21 | 
22 |         word_freq = defaultdict(lambda: 0)
23 |         for words in list_generator:
24 |             for word in words:
25 |                 word_freq[word] += 1
26 | 
27 |         self.__stoi = defaultdict(lambda: 0)
28 |         self.__stoi['<unk>'] = 0
29 |         self.__stoi['<s>'] = 1
30 |         self.__stoi['</s>'] = 2
31 |         self.__itos = [''] * self.__size
32 |         self.__itos[0] = '<unk>'
33 |         self.__itos[1] = '<s>'
34 |         self.__itos[2] = '</s>'
35 | 
36 |         for i, (k, v) in zip(range(self.__size - 3), sorted(word_freq.items(), key=lambda x: -x[1])):
37 |             self.__stoi[k] = i + 3
38 |             self.__itos[i + 3] = k
39 | 
40 |         return self
41 | 
42 |     def save(self, filename):
43 |         with open(filename, 'w') as fp:
44 |             print(self.__size, file=fp)
45 |             for i in range(self.__size):
46 |                 print(self.__itos[i], file=fp)
47 | 
48 |     @staticmethod
49 |     def load(filename):
50 |         with open(filename) as fp:
51 |             self = Vocabulary()
52 |             self.__size = int(next(fp))
53 |             self.__stoi = defaultdict(lambda: 0)
54 |             self.__itos = [''] * self.__size
55 |             for i in range(self.__size):
56 |                 s = next(fp).strip()
57 |                 if s:
58 |                     self.__stoi[s] = i
59 |                     self.__itos[i] = s
60 | 
61 |         return self
62 | 
--------------------------------------------------------------------------------
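A minimal round-trip sketch of the Vocabulary class (the file name `demo.vocab` is hypothetical): ids 0-2 are reserved for <unk>, <s> and </s>, the remaining slots are filled by frequency, and unseen words fall back to 0 via the defaultdict.

# --- usage sketch (illustrative, not part of the file) ---
from util.vocabulary import Vocabulary

v = Vocabulary.new(iter([['the', 'cat', 'the']]), 5)
print(len(v), v.stoi('the'), v.stoi('cat'), v.stoi('never-seen'))
# 5 3 4 0
v.save('demo.vocab')
w = Vocabulary.load('demo.vocab')
print(w.itos(3))  # 'the'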