├── .idea
│   └── vcs.xml
├── LICENSE
├── README.md
├── emolga
│   ├── __init__.py
│   ├── basic
│   │   ├── __init__.py
│   │   ├── activations.py
│   │   ├── initializations.py
│   │   ├── objectives.py
│   │   └── optimizers.py
│   ├── config.py
│   ├── config_variant.py
│   ├── dataset
│   │   └── build_dataset.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── core.py
│   │   ├── embeddings.py
│   │   ├── gridlstm.py
│   │   ├── ntm_minibatch.py
│   │   └── recurrent.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── core.py
│   │   ├── core.pyc
│   │   ├── covc_encdec.py
│   │   ├── encdec.py
│   │   ├── ntm_encdec.py
│   │   ├── pointers.py
│   │   └── variational.py
│   ├── run.py
│   ├── test_lm.py
│   ├── test_nvtm.py
│   ├── test_run.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── generic_utils.py
│   │   ├── io_utils.py
│   │   ├── np_utils.py
│   │   ├── test_utils.py
│   │   └── theano_utils.py
│   └── voc.pkl
└── experiments
    ├── __init__.py
    ├── bst_dataset.py
    ├── bst_vest.py
    ├── config.py
    ├── config.pyc
    ├── copynet.py
    ├── copynet_input.py
    ├── dataset.py
    ├── lcsts_dataset.py
    ├── lcsts_rouge.py
    ├── lcsts_sample.py
    ├── lcsts_test.py
    ├── lcsts_vest.py
    ├── lcsts_vest_new.py
    ├── movie_dataset.py
    ├── syn_vest.py
    ├── syntest.py
    ├── synthetic.py
    ├── weibo_dataset.py
    └── weibo_vest.py
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Jiatao Gu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CopyNet 2 | incorporating copying mechanism in sequence-to-sequence learning 3 | -------------------------------------------------------------------------------- /emolga/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'yinpengcheng' 2 | -------------------------------------------------------------------------------- /emolga/basic/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | -------------------------------------------------------------------------------- /emolga/basic/activations.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def softmax(x): 5 | return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) 6 | 7 | 8 | def vector_softmax(x): 9 | return T.nnet.softmax(x.reshape((1, x.shape[0])))[0] 10 | 11 | 12 | def time_distributed_softmax(x): 13 | import warnings 14 | warnings.warn("time_distributed_softmax is deprecated. Just use softmax!", DeprecationWarning) 15 | return softmax(x) 16 | 17 | 18 | def softplus(x): 19 | return T.nnet.softplus(x) 20 | 21 | 22 | def relu(x): 23 | return T.nnet.relu(x) 24 | 25 | 26 | def tanh(x): 27 | return T.tanh(x) 28 | 29 | 30 | def sigmoid(x): 31 | return T.nnet.sigmoid(x) 32 | 33 | 34 | def hard_sigmoid(x): 35 | return T.nnet.hard_sigmoid(x) 36 | 37 | 38 | def linear(x): 39 | ''' 40 | The function returns the variable that is passed in, so all types work 41 | ''' 42 | return x 43 | 44 | 45 | def maxout2(x): 46 | shape = x.shape 47 | if x.ndim == 1: 48 | shape1 = T.cast(shape[0] / 2, 'int64') 49 | shape2 = T.cast(2, 'int64') 50 | x = x.reshape([shape1, shape2]) 51 | x = x.max(1) 52 | elif x.ndim == 2: 53 | shape1 = T.cast(shape[1] / 2, 'int64') 54 | shape2 = T.cast(2, 'int64') 55 | x = x.reshape([shape[0], shape1, shape2]) 56 | x = x.max(2) 57 | elif x.ndim == 3: 58 | shape1 = T.cast(shape[2] / 2, 'int64') 59 | shape2 = T.cast(2, 'int64') 60 | x = x.reshape([shape[0], shape[1], shape1, shape2]) 61 | x = x.max(3) 62 | return x 63 | 64 | 65 | from emolga.utils.generic_utils import get_from_module 66 | 67 | 68 | def get(identifier): 69 | return get_from_module(identifier, globals(), 'activation function') 70 | -------------------------------------------------------------------------------- /emolga/basic/initializations.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | from emolga.utils.theano_utils import sharedX, shared_zeros, shared_ones 6 | 7 | 8 | def get_fans(shape): 9 | if isinstance(shape, int): 10 | shape = (1, shape) 11 | fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:]) 12 | fan_out = shape[1] if len(shape) == 2 else shape[0] 13 | return fan_in, fan_out 14 | 15 | 16 | def uniform(shape, scale=0.1): 17 | return sharedX(np.random.uniform(low=-scale, high=scale, size=shape)) 18 | 19 | 20 | def normal(shape, scale=0.05): 21 | return sharedX(np.random.randn(*shape) * scale) 22 | 23 | 24 | def lecun_uniform(shape): 25 | ''' Reference: LeCun 98, Efficient Backprop 26 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 27 | ''' 28 | fan_in, fan_out = get_fans(shape) 29 | scale = np.sqrt(3. 
/ fan_in) 30 | return uniform(shape, scale) 31 | 32 | 33 | def glorot_normal(shape): 34 | ''' Reference: Glorot & Bengio, AISTATS 2010 35 | ''' 36 | fan_in, fan_out = get_fans(shape) 37 | s = np.sqrt(2. / (fan_in + fan_out)) 38 | return normal(shape, s) 39 | 40 | 41 | def glorot_uniform(shape): 42 | fan_in, fan_out = get_fans(shape) 43 | s = np.sqrt(6. / (fan_in + fan_out)) 44 | return uniform(shape, s) 45 | 46 | 47 | def he_normal(shape): 48 | ''' Reference: He et al., http://arxiv.org/abs/1502.01852 49 | ''' 50 | fan_in, fan_out = get_fans(shape) 51 | s = np.sqrt(2. / fan_in) 52 | return normal(shape, s) 53 | 54 | 55 | def he_uniform(shape): 56 | fan_in, fan_out = get_fans(shape) 57 | s = np.sqrt(6. / fan_in) 58 | return uniform(shape, s) 59 | 60 | 61 | def orthogonal(shape, scale=1.1): 62 | ''' From Lasagne 63 | ''' 64 | flat_shape = (shape[0], np.prod(shape[1:])) 65 | a = np.random.normal(0.0, 1.0, flat_shape) 66 | u, _, v = np.linalg.svd(a, full_matrices=False) 67 | # pick the one with the correct shape 68 | q = u if u.shape == flat_shape else v 69 | q = q.reshape(shape) 70 | return sharedX(scale * q[:shape[0], :shape[1]]) 71 | 72 | 73 | def identity(shape, scale=1): 74 | if len(shape) != 2 or shape[0] != shape[1]: 75 | raise Exception("Identity matrix initialization can only be used for 2D square matrices") 76 | else: 77 | return sharedX(scale * np.identity(shape[0])) 78 | 79 | 80 | def zero(shape): 81 | return shared_zeros(shape) 82 | 83 | 84 | def one(shape): 85 | return shared_ones(shape) 86 | 87 | from emolga.utils.generic_utils import get_from_module 88 | def get(identifier): 89 | return get_from_module(identifier, globals(), 'initialization') 90 | -------------------------------------------------------------------------------- /emolga/basic/objectives.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import theano 3 | import theano.tensor as T 4 | import numpy as np 5 | from six.moves import range 6 | 7 | if theano.config.floatX == 'float64': 8 | epsilon = 1.0e-9 9 | else: 10 | epsilon = 1.0e-7 11 | 12 | 13 | def mean_squared_error(y_true, y_pred): 14 | return T.sqr(y_pred - y_true).mean(axis=-1) 15 | 16 | 17 | def mean_absolute_error(y_true, y_pred): 18 | return T.abs_(y_pred - y_true).mean(axis=-1) 19 | 20 | 21 | def mean_absolute_percentage_error(y_true, y_pred): 22 | return T.abs_((y_true - y_pred) / T.clip(T.abs_(y_true), epsilon, np.inf)).mean(axis=-1) * 100. 23 | 24 | 25 | def mean_squared_logarithmic_error(y_true, y_pred): 26 | return T.sqr(T.log(T.clip(y_pred, epsilon, np.inf) + 1.) - T.log(T.clip(y_true, epsilon, np.inf) + 1.)).mean(axis=-1) 27 | 28 | 29 | def squared_hinge(y_true, y_pred): 30 | return T.sqr(T.maximum(1. - y_true * y_pred, 0.)).mean(axis=-1) 31 | 32 | 33 | def hinge(y_true, y_pred): 34 | return T.maximum(1. 
- y_true * y_pred, 0.).mean(axis=-1) 35 | 36 | 37 | def categorical_crossentropy(y_true, y_pred): 38 | '''Expects a binary class matrix instead of a vector of scalar classes 39 | ''' 40 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon) 41 | # scale preds so that the class probas of each sample sum to 1 42 | y_pred /= y_pred.sum(axis=-1, keepdims=True) 43 | cce = T.nnet.categorical_crossentropy(y_pred, y_true) 44 | return cce 45 | 46 | 47 | def binary_crossentropy(y_true, y_pred): 48 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon) 49 | bce = T.nnet.binary_crossentropy(y_pred, y_true).mean(axis=-1) 50 | return bce 51 | 52 | 53 | def poisson_loss(y_true, y_pred): 54 | return T.mean(y_pred - y_true * T.log(y_pred + epsilon), axis=-1) 55 | 56 | #################################################### 57 | # Variational Auto-encoder 58 | 59 | def gaussian_kl_divergence(mean, ln_var): 60 | """Computes the KL-divergence of Gaussian variables from the standard one. 61 | 62 | Given two variable ``mean`` representing :math:`\\mu` and ``ln_var`` 63 | representing :math:`\\log(\\sigma^2)`, this function returns a variable 64 | representing the KL-divergence between the given multi-dimensional Gaussian 65 | :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)` 66 | 67 | .. math:: 68 | 69 | D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)), 70 | 71 | where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2` 72 | and :math:`I` is an identity matrix. 73 | 74 | Args: 75 | mean (~chainer.Variable): A variable representing mean of given 76 | gaussian distribution, :math:`\\mu`. 77 | ln_var (~chainer.Variable): A variable representing logarithm of 78 | variance of given gaussian distribution, :math:`\\log(\\sigma^2)`. 79 | 80 | Returns: 81 | ~chainer.Variable: A variable representing KL-divergence between 82 | given gaussian distribution and the standard gaussian. 
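    For example, ``mean = 0`` with ``ln_var = 0`` (already the standard normal) gives a
    divergence of 0, while ``mean = 1`` with ``ln_var = 0`` gives 0.5 per dimension,
    matching 0.5 * (mu^2 + sigma^2 - log(sigma^2) - 1) as computed below.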
83 | 84 | """ 85 | var = T.exp(ln_var) 86 | return 0.5 * T.sum(mean * mean + var - ln_var - 1, 1) 87 | 88 | 89 | # aliases 90 | mse = MSE = mean_squared_error 91 | mae = MAE = mean_absolute_error 92 | mape = MAPE = mean_absolute_percentage_error 93 | msle = MSLE = mean_squared_logarithmic_error 94 | gkl = GKL = gaussian_kl_divergence 95 | 96 | from emolga.utils.generic_utils import get_from_module 97 | def get(identifier): 98 | return get_from_module(identifier, globals(), 'objective') 99 | -------------------------------------------------------------------------------- /emolga/basic/optimizers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import theano 3 | import sys 4 | 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams 6 | import theano.tensor as T 7 | import logging 8 | 9 | from emolga.utils.theano_utils import shared_zeros, shared_scalar, floatX 10 | from emolga.utils.generic_utils import get_from_module 11 | from six.moves import zip 12 | from copy import copy, deepcopy 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def clip_norm(g, c, n): 18 | if c > 0: 19 | g = T.switch(T.ge(n, c), g * c / n, g) 20 | return g 21 | 22 | 23 | def kl_divergence(p, p_hat): 24 | return p_hat - p + p * T.log(p / p_hat) 25 | 26 | 27 | class Optimizer(object): 28 | def __init__(self, **kwargs): 29 | self.__dict__.update(kwargs) 30 | self.updates = [] 31 | self.save_parm = [] 32 | 33 | def add(self, v): 34 | self.save_parm += [v] 35 | 36 | def get_state(self): 37 | return [u[0].get_value() for u in self.updates] 38 | 39 | def set_state(self, value_list): 40 | assert len(self.updates) == len(value_list) 41 | for u, v in zip(self.updates, value_list): 42 | u[0].set_value(floatX(v)) 43 | 44 | def get_updates(self, params, loss): 45 | raise NotImplementedError 46 | 47 | def get_gradients(self, loss, params): 48 | """ 49 | Consider the situation that gradient is weighted. 50 | """ 51 | if isinstance(loss, list): 52 | grads = T.grad(loss[0], params, consider_constant=loss[1:]) # gradient of loss 53 | else: 54 | grads = T.grad(loss, params) 55 | 56 | if hasattr(self, 'clipnorm') and self.clipnorm > 0: 57 | print 'use gradient clipping!!' 
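# global-norm clipping: take the L2 norm over all gradients and let clip_norm rescale
# each gradient by clipnorm / norm whenever the norm exceeds the threshold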
58 | norm = T.sqrt(sum([T.sum(g ** 2) for g in grads])) 59 | grads = [clip_norm(g, self.clipnorm, norm) for g in grads] 60 | 61 | return grads 62 | 63 | def get_config(self): 64 | return {"name": self.__class__.__name__} 65 | 66 | 67 | class SGD(Optimizer): 68 | 69 | def __init__(self, lr=0.05, momentum=0.9, decay=0.01, nesterov=True, *args, **kwargs): 70 | super(SGD, self).__init__(**kwargs) 71 | self.__dict__.update(locals()) 72 | self.iterations = shared_scalar(0) 73 | self.lr = shared_scalar(lr) 74 | self.momentum = shared_scalar(momentum) 75 | 76 | def get_updates(self, params, loss): 77 | grads = self.get_gradients(loss, params) 78 | lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations)) 79 | self.updates = [(self.iterations, self.iterations + 1.)] 80 | 81 | for p, g in zip(params, grads): 82 | m = shared_zeros(p.get_value().shape) # momentum 83 | v = self.momentum * m - lr * g # velocity 84 | self.updates.append((m, v)) 85 | 86 | if self.nesterov: 87 | new_p = p + self.momentum * v - lr * g 88 | else: 89 | new_p = p + v 90 | 91 | self.updates.append((p, new_p)) # apply constraints 92 | return self.updates 93 | 94 | def get_config(self): 95 | return {"name": self.__class__.__name__, 96 | "lr": float(self.lr.get_value()), 97 | "momentum": float(self.momentum.get_value()), 98 | "decay": float(self.decay.get_value()), 99 | "nesterov": self.nesterov} 100 | 101 | 102 | class RMSprop(Optimizer): 103 | def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs): 104 | super(RMSprop, self).__init__(**kwargs) 105 | self.__dict__.update(locals()) 106 | self.lr = shared_scalar(lr) 107 | self.rho = shared_scalar(rho) 108 | self.iterations = shared_scalar(0) 109 | 110 | def get_updates(self, params, loss): 111 | grads = self.get_gradients(loss, params) 112 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 113 | self.updates = [(self.iterations, self.iterations + 1.)] 114 | 115 | for p, g, a in zip(params, grads, accumulators): 116 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 117 | self.updates.append((a, new_a)) 118 | 119 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 120 | self.updates.append((p, new_p)) # apply constraints 121 | return self.updates 122 | 123 | def get_config(self): 124 | return {"name": self.__class__.__name__, 125 | "lr": float(self.lr.get_value()), 126 | "rho": float(self.rho.get_value()), 127 | "epsilon": self.epsilon} 128 | 129 | 130 | class Adagrad(Optimizer): 131 | def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs): 132 | super(Adagrad, self).__init__(**kwargs) 133 | self.__dict__.update(locals()) 134 | self.lr = shared_scalar(lr) 135 | 136 | def get_updates(self, params, constraints, loss): 137 | grads = self.get_gradients(loss, params) 138 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 139 | self.updates = [] 140 | 141 | for p, g, a, c in zip(params, grads, accumulators, constraints): 142 | new_a = a + g ** 2 # update accumulator 143 | self.updates.append((a, new_a)) 144 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 145 | self.updates.append((p, c(new_p))) # apply constraints 146 | return self.updates 147 | 148 | def get_config(self): 149 | return {"name": self.__class__.__name__, 150 | "lr": float(self.lr.get_value()), 151 | "epsilon": self.epsilon} 152 | 153 | 154 | class Adadelta(Optimizer): 155 | ''' 156 | Reference: http://arxiv.org/abs/1212.5701 157 | ''' 158 | def __init__(self, lr=0.1, rho=0.95, epsilon=1e-6, *args, **kwargs): 159 | super(Adadelta, 
self).__init__(**kwargs) 160 | self.__dict__.update(locals()) 161 | self.lr = shared_scalar(lr) 162 | self.iterations = shared_scalar(0) 163 | 164 | def get_updates(self, params, loss): 165 | grads = self.get_gradients(loss, params) 166 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 167 | delta_accumulators = [shared_zeros(p.get_value().shape) for p in params] 168 | # self.updates = [] 169 | self.updates = [(self.iterations, self.iterations + 1.)] 170 | 171 | for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): 172 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 173 | self.updates.append((a, new_a)) 174 | 175 | # use the new accumulator and the *old* delta_accumulator 176 | update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + 177 | self.epsilon) 178 | 179 | new_p = p - self.lr * update 180 | self.updates.append((p, new_p)) 181 | 182 | # update delta_accumulator 183 | new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2 184 | self.updates.append((d_a, new_d_a)) 185 | return self.updates 186 | 187 | def get_config(self): 188 | return {"name": self.__class__.__name__, 189 | "lr": float(self.lr.get_value()), 190 | "rho": self.rho, 191 | "epsilon": self.epsilon} 192 | 193 | 194 | class Adam(Optimizer): # new Adam is designed for our purpose. 195 | ''' 196 | Reference: http://arxiv.org/abs/1412.6980v8 197 | 198 | Default parameters follow those provided in the original paper. 199 | We add Gaussian Noise to improve the performance. 200 | ''' 201 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, save=False, rng=None, *args, **kwargs): 202 | super(Adam, self).__init__(**kwargs) 203 | self.__dict__.update(locals()) 204 | print locals() 205 | 206 | self.iterations = shared_scalar(0, name='iteration') 207 | self.lr = shared_scalar(lr, name='lr') 208 | self.rng = MRG_RandomStreams(use_cuda=True) 209 | self.noise = [] 210 | self.forget = dict() 211 | self.rng = rng 212 | 213 | self.add(self.iterations) 214 | self.add(self.lr) 215 | 216 | def add_noise(self, param): 217 | if param.name not in self.noise: 218 | logger.info('add gradient noise to {}'.format(param)) 219 | self.noise += [param.name] 220 | 221 | def add_forget(self, param): 222 | if param.name not in self.forget: 223 | logger.info('add forgetting list to {}'.format(param)) 224 | self.forget[param.name] = theano.shared(param.get_value()) 225 | 226 | def get_updates(self, params, loss): 227 | grads = self.get_gradients(loss, params) 228 | self.updates = [(self.iterations, self.iterations + 1.)] 229 | self.pu = [] 230 | 231 | t = self.iterations + 1 232 | lr_t = self.lr * T.sqrt(1 - self.beta_2**t) / (1 - self.beta_1**t) 233 | for p, g in zip(params, grads): 234 | m = theano.shared(p.get_value() * 0., name=p.name + '_m') # zero init of moment 235 | v = theano.shared(p.get_value() * 0., name=p.name + '_v') # zero init of velocity 236 | 237 | self.add(m) 238 | self.add(v) 239 | 240 | # g_noise = self.rng.normal(g.shape, 0, T.sqrt(0.005 * t ** (-0.55)), dtype='float32') 241 | 242 | # if p.name in self.noise: 243 | # g_deviated = g + g_noise 244 | # else: 245 | # g_deviated = g 246 | 247 | g_deviated = g # + g_noise 248 | m_t = (self.beta_1 * m) + (1 - self.beta_1) * g_deviated 249 | v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g_deviated**2) 250 | u_t = -lr_t * m_t / (T.sqrt(v_t) + self.epsilon) 251 | p_t = p + u_t 252 | 253 | # # memory reformatting! 
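# (disabled experiment: blend each updated parameter with the snapshot registered via add_forget before applying it)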
254 | # if p.name in self.forget: 255 | # p_t = (1 - p_mem) * p_t + p_mem * self.forget[p.name] 256 | # p_s = (1 - p_fgt) * p_t + p_fgt * self.forget[p.name] 257 | # self.updates.append((self.forget[p.name], p_s)) 258 | 259 | self.updates.append((m, m_t)) 260 | self.updates.append((v, v_t)) 261 | self.updates.append((p, p_t)) # apply constraints 262 | self.pu.append((p, p_t - p)) 263 | 264 | if self.save: 265 | return self.updates, self.pu 266 | return self.updates 267 | 268 | # aliases 269 | sgd = SGD 270 | rmsprop = RMSprop 271 | adagrad = Adagrad 272 | adadelta = Adadelta 273 | adam = Adam 274 | 275 | 276 | def get(identifier, kwargs=None): 277 | return get_from_module(identifier, globals(), 'optimizer', instantiate=True, 278 | kwargs=kwargs) 279 | -------------------------------------------------------------------------------- /emolga/config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | import os 3 | import os.path as path 4 | 5 | def setup_ptb2(): 6 | # pretraining setting up. 7 | # get the lm_config. 8 | 9 | config = dict() 10 | config['on_unused_input'] = 'ignore' 11 | config['seed'] = 3030029828 12 | config['level'] = 'DEBUG' 13 | 14 | # config['model'] = 'RNNLM' 15 | # config['model'] = 'VAE' 16 | # config['model'] = 'RNNLM' #'Helmholtz' 17 | config['model'] = 'HarX' 18 | config['highway'] = False 19 | config['use_noise'] = False 20 | 21 | config['optimizer'] = 'adam' #'adadelta' 22 | # config['lr'] = 0.1 23 | 24 | # config['optimizer'] = 'sgd' 25 | 26 | # dataset 27 | config['path'] = path.realpath(path.curdir) + '/' # '/home/thoma/Work/Dial-DRL/' 28 | config['vocabulary_set'] = config['path'] + 'dataset/ptbcorpus/voc.pkl' 29 | config['dataset'] = config['path'] + 'dataset/ptbcorpus/data_train.pkl' 30 | config['dataset_valid'] = config['path'] + 'dataset/ptbcorpus/data_valid.pkl' 31 | config['dataset_test'] = config['path'] + 'dataset/ptbcorpus/data_test.pkl' 32 | # output hdf5 file place. 
33 | config['path_h5'] = config['path'] + 'H5' 34 | if not os.path.exists(config['path_h5']): 35 | os.mkdir(config['path_h5']) 36 | 37 | # output log place 38 | config['path_log'] = config['path'] + 'Logs' 39 | if not os.path.exists(config['path_log']): 40 | os.mkdir(config['path_log']) 41 | 42 | # size 43 | config['batch_size'] = 20 44 | config['eval_batch_size'] = 20 45 | config['mode'] = 'RNN' # NTM 46 | config['binary'] = False 47 | 48 | # Encoder: dimension 49 | config['enc_embedd_dim'] = 300 50 | config['enc_hidden_dim'] = 300 51 | config['enc_contxt_dim'] = 350 52 | config['encoder'] = 'RNN' 53 | config['pooling'] = False 54 | 55 | # Encoder: Model 56 | config['bidirectional'] = False # True 57 | config['decposterior'] = True 58 | config['enc_use_contxt'] = False 59 | 60 | # Agent: dimension 61 | config['action_dim'] = 50 62 | config['output_dim'] = 300 63 | 64 | # Decoder: dimension 65 | config['dec_embedd_dim'] = 300 66 | config['dec_hidden_dim'] = 300 67 | config['dec_contxt_dim'] = 300 68 | 69 | # Decoder: Model 70 | config['shared_embed'] = False 71 | config['use_input'] = False 72 | config['bias_code'] = False # True 73 | config['dec_use_contxt'] = True 74 | config['deep_out'] = False 75 | config['deep_out_activ'] = 'tanh' # maxout2 76 | config['bigram_predict'] = False 77 | config['context_predict'] = True # False 78 | config['leaky_predict'] = False # True 79 | config['dropout'] = 0.3 80 | 81 | # Decoder: sampling 82 | config['max_len'] = 88 # 15 83 | config['sample_beam'] = 10 84 | config['sample_stoch'] = False 85 | config['sample_argmax'] = False 86 | 87 | # Auto-Encoder 88 | config['nonlinear_A'] = True 89 | config['nonlinear_B'] = False 90 | 91 | # VAE/Helmholtz: Model 92 | config['repeats'] = 10 93 | config['eval_repeats'] = 10 94 | config['eval_N'] = 10 95 | 96 | config['variant_control'] = False 97 | config['factor'] = 10. 98 | config['mult_q'] = 10. 99 | 100 | print 'setup ok.' 
101 | return config 102 | 103 | 104 | -------------------------------------------------------------------------------- /emolga/config_variant.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | from config import setup_ptb2 3 | setup = setup_ptb2 4 | 5 | """ 6 | This file is for small variant fix on original 7 | """ 8 | 9 | 10 | def setup_bienc(config=None): 11 | if config is None: 12 | config = setup() 13 | print 'make some modification' 14 | 15 | config['bidirectional'] = True 16 | config['decposterior'] = False 17 | return config 18 | 19 | 20 | def setup_dim(config=None): 21 | if config is None: 22 | config = setup() 23 | print 'make some modification' 24 | 25 | config['enc_embedd_dim'] = 300 26 | config['enc_hidden_dim'] = 300 27 | config['action_dim'] = 100 28 | 29 | config['dec_embedd_dim'] = 300 30 | config['dec_hidden_dim'] = 300 31 | config['dec_contxt_dim'] = 300 32 | return config 33 | 34 | 35 | def setup_rep(config=None): 36 | if config is None: 37 | config = setup() 38 | print 'make some modification' 39 | 40 | config['repeats'] = 5 41 | return config 42 | 43 | 44 | def setup_opt(config=None): 45 | if config is None: 46 | config = setup() 47 | print 'make some modification' 48 | 49 | config['optimizer'] = 'Adam' 50 | return config -------------------------------------------------------------------------------- /emolga/dataset/build_dataset.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | import numpy as np 3 | import numpy.random as rng 4 | import cPickle 5 | import pprint 6 | import sys 7 | 8 | from collections import OrderedDict 9 | from fuel import datasets 10 | from fuel import transformers 11 | from fuel import schemes 12 | from fuel import streams 13 | 14 | 15 | def serialize_to_file(obj, path, protocol=cPickle.HIGHEST_PROTOCOL): 16 | f = open(path, 'wb') 17 | cPickle.dump(obj, f, protocol=protocol) 18 | f.close() 19 | 20 | 21 | def show_txt(array, path): 22 | f = open(path, 'w') 23 | for line in array: 24 | f.write(' '.join(line) + '\n') 25 | 26 | f.close() 27 | 28 | 29 | def divide_dataset(dataset, test_size, max_size): 30 | train_set = dict() 31 | test_set = dict() 32 | 33 | for w in dataset: 34 | train_set[w] = dataset[w][test_size:max_size].astype('int32') 35 | test_set[w] = dataset[w][:test_size].astype('int32') 36 | 37 | return train_set, test_set 38 | 39 | 40 | def deserialize_from_file(path): 41 | f = open(path, 'rb') 42 | obj = cPickle.load(f) 43 | f.close() 44 | return obj 45 | 46 | 47 | def build_fuel(data): 48 | # create fuel dataset. 
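# wrap the list of index sequences in a Fuel IndexableDataset and attach a shuffled
# example scheme so every epoch iterates the samples in a new order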
49 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('data', data)])) 50 | dataset.example_iteration_scheme \ 51 | = schemes.ShuffledExampleScheme(dataset.num_examples) 52 | return dataset, len(data) 53 | 54 | 55 | def obtain_stream(dataset, batch_size, size=1): 56 | if size == 1: 57 | data_stream = dataset.get_example_stream() 58 | data_stream = transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size)) 59 | 60 | # add padding and masks to the dataset 61 | data_stream = transformers.Padding(data_stream, mask_sources=('data')) 62 | return data_stream 63 | else: 64 | data_streams = [dataset.get_example_stream() for _ in xrange(size)] 65 | data_streams = [transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size)) 66 | for data_stream in data_streams] 67 | data_streams = [transformers.Padding(data_stream, mask_sources=('data')) for data_stream in data_streams] 68 | return data_streams 69 | 70 | def build_ptb(): 71 | path = './ptbcorpus/' 72 | print path 73 | # make the dataset and vocabulary 74 | X_train = [l.split() for l in open(path + 'ptb.train.txt').readlines()] 75 | X_test = [l.split() for l in open(path + 'ptb.test.txt').readlines()] 76 | X_valid = [l.split() for l in open(path + 'ptb.valid.txt').readlines()] 77 | 78 | X = X_train + X_test + X_valid 79 | idx2word = dict(enumerate(set([w for l in X for w in l]), 1)) 80 | idx2word[0] = '' 81 | word2idx = {v: k for k, v in idx2word.items()} 82 | ixwords_train = [[word2idx[w] for w in l] for l in X_train] 83 | ixwords_test = [[word2idx[w] for w in l] for l in X_test] 84 | ixwords_valid = [[word2idx[w] for w in l] for l in X_valid] 85 | ixwords_tv = [[word2idx[w] for w in l] for l in (X_train + X_valid)] 86 | 87 | max_len = max([len(w) for w in X_train]) 88 | print max_len 89 | # serialization: 90 | # serialize_to_file(ixwords_train, path + 'data_train.pkl') 91 | # serialize_to_file(ixwords_test, path + 'data_test.pkl') 92 | # serialize_to_file(ixwords_valid, path + 'data_valid.pkl') 93 | # serialize_to_file(ixwords_tv, path + 'data_tv.pkl') 94 | # serialize_to_file([idx2word, word2idx], path + 'voc.pkl') 95 | # show_txt(X, 'data.txt') 96 | print 'save done.' 97 | 98 | 99 | def filter_unk(X, min_freq=5): 100 | voc = dict() 101 | for l in X: 102 | for w in l: 103 | if w not in voc: 104 | voc[w] = 1 105 | else: 106 | voc[w] += 1 107 | 108 | word2idx = dict() 109 | word2idx[''] = 0 110 | id2word = dict() 111 | id2word[0] = '' 112 | 113 | at = 1 114 | for w in voc: 115 | if voc[w] > min_freq: 116 | word2idx[w] = at 117 | id2word[at] = w 118 | at += 1 119 | 120 | word2idx[''] = at 121 | id2word[at] = '' 122 | return word2idx, id2word 123 | 124 | 125 | def build_msr(): 126 | # path = '/home/thoma/Work/Dial-DRL/dataset/MSRSCC/' 127 | path = '/Users/jiataogu/Work/Dial-DRL/dataset/MSRSCC/' 128 | print path 129 | 130 | X = [l.split() for l in open(path + 'train.txt').readlines()] 131 | word2idx, idx2word = filter_unk(X, min_freq=5) 132 | print 'vocabulary size={0}. 
{1} samples'.format(len(word2idx), len(X)) 133 | 134 | mean_len = np.mean([len(w) for w in X]) 135 | print 'mean len = {}'.format(mean_len) 136 | 137 | ixwords = [[word2idx[w] 138 | if w in word2idx 139 | else word2idx[''] 140 | for w in l] for l in X] 141 | print ixwords[0] 142 | # serialization: 143 | serialize_to_file(ixwords, path + 'data_train.pkl') 144 | 145 | 146 | if __name__ == '__main__': 147 | build_msr() 148 | # build_ptb() 149 | # build_dataset() 150 | # game = GuessOrder(size=8) 151 | # q = 'Is there any number smaller de than 6 in the last 3 numbers ?' 152 | # print game.easy_parse(q) 153 | 154 | -------------------------------------------------------------------------------- /emolga/layers/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'yinpengcheng' 2 | -------------------------------------------------------------------------------- /emolga/layers/attention.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | from .core import * 3 | """ 4 | Attention Model. 5 | <::: Two kinds of attention models ::::> 6 | -- Linear Transformation 7 | -- Inner Product 8 | """ 9 | 10 | 11 | class Attention(Layer): 12 | def __init__(self, target_dim, source_dim, hidden_dim, 13 | init='glorot_uniform', name='attention', 14 | coverage=False, max_len=50, 15 | shared=False): 16 | 17 | super(Attention, self).__init__() 18 | self.init = initializations.get(init) 19 | self.softmax = activations.get('softmax') 20 | self.tanh = activations.get('tanh') 21 | self.target_dim = target_dim 22 | self.source_dim = source_dim 23 | self.hidden_dim = hidden_dim 24 | self.max_len = max_len 25 | self.coverage = coverage 26 | 27 | if coverage: 28 | print 'Use Coverage Trick!' 29 | 30 | self.Wa = self.init((self.target_dim, self.hidden_dim)) 31 | self.Ua = self.init((self.source_dim, self.hidden_dim)) 32 | self.va = self.init((self.hidden_dim, 1)) 33 | 34 | self.Wa.name, self.Ua.name, self.va.name = \ 35 | '{}_Wa'.format(name), '{}_Ua'.format(name), '{}_va'.format(name) 36 | self.params = [self.Wa, self.Ua, self.va] 37 | if coverage: 38 | self.Ca = self.init((1, self.hidden_dim)) 39 | self.Ca.name = '{}_Ca'.format(name) 40 | self.params += [self.Ca] 41 | 42 | def __call__(self, X, S, 43 | Smask=None, 44 | return_log=False, 45 | Cov=None): 46 | assert X.ndim + 1 == S.ndim, 'source should be one more dimension than target.' 47 | # X is the key: (nb_samples, x_dim) 48 | # S is the source (nb_samples, maxlen_s, ctx_dim) 49 | # Cov is the coverage vector (nb_samples, maxlen_s) 50 | 51 | if X.ndim == 1: 52 | X = X[None, :] 53 | S = S[None, :, :] 54 | if not Smask: 55 | Smask = Smask[None, :] 56 | 57 | Eng = dot(X[:, None, :], self.Wa) + dot(S, self.Ua) # (nb_samples, source_num, hidden_dims) 58 | Eng = self.tanh(Eng) 59 | # location aware: 60 | if self.coverage: 61 | Eng += dot(Cov[:, :, None], self.Ca) # (nb_samples, source_num, hidden_dims) 62 | 63 | Eng = dot(Eng, self.va) 64 | Eng = Eng[:, :, 0] # ? (nb_samples, source_num) 65 | 66 | if Smask is not None: 67 | # I want to use mask! 
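# masked softmax computed in log space: logSumExp normalises the energies over the
# unmasked source positions, and multiplying by Smask zeroes out the padded ones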
68 | EngSum = logSumExp(Eng, axis=1, mask=Smask) 69 | if return_log: 70 | return (Eng - EngSum) * Smask 71 | else: 72 | return T.exp(Eng - EngSum) * Smask 73 | else: 74 | if return_log: 75 | return T.log(self.softmax(Eng)) 76 | else: 77 | return self.softmax(Eng) 78 | 79 | 80 | class CosineAttention(Layer): 81 | def __init__(self, target_dim, source_dim, 82 | init='glorot_uniform', 83 | use_pipe=True, 84 | name='attention'): 85 | 86 | super(CosineAttention, self).__init__() 87 | self.init = initializations.get(init) 88 | self.softmax = activations.get('softmax') 89 | self.softplus = activations.get('softplus') 90 | self.tanh = activations.get('tanh') 91 | self.use_pipe = use_pipe 92 | 93 | self.target_dim = target_dim 94 | self.source_dim = source_dim 95 | 96 | # pipe 97 | if self.use_pipe: 98 | self.W_key = Dense(self.target_dim, self.source_dim, name='W_key') 99 | else: 100 | assert target_dim == source_dim 101 | self.W_key = Identity(name='W_key') 102 | self._add(self.W_key) 103 | 104 | # sharpen 105 | # self.W_beta = Dense(self.target_dim, 1, name='W_beta') 106 | # dio-sharpen 107 | # self.W_beta = Dense(self.target_dim, self.source_dim, name='W_beta') 108 | # self._add(self.W_beta) 109 | 110 | # self.gamma = self.init((source_dim, )) 111 | # self.gamma = self.init((target_dim, source_dim)) 112 | # self.gamma.name = 'o_gamma' 113 | # self.params += [self.gamma] 114 | 115 | def __call__(self, X, S, Smask=None, return_log=False): 116 | assert X.ndim + 1 == S.ndim, 'source should be one more dimension than target.' 117 | 118 | if X.ndim == 1: 119 | X = X[None, :] 120 | S = S[None, :, :] 121 | if not Smask: 122 | Smask = Smask[None, :] 123 | 124 | key = self.W_key(X) # (nb_samples, source_dim) 125 | # beta = self.softplus(self.W_beta(X)) # (nb_samples, source_dim) 126 | 127 | Eng = dot_2d(key, S) #, g=self.gamma) 128 | # Eng = cosine_sim2d(key, S) # (nb_samples, source_num) 129 | # Eng = T.repeat(beta, Eng.shape[1], axis=1) * Eng 130 | 131 | if Smask is not None: 132 | # I want to use mask! 133 | EngSum = logSumExp(Eng, axis=1, mask=Smask) 134 | if return_log: 135 | return (Eng - EngSum) * Smask 136 | else: 137 | return T.exp(Eng - EngSum) * Smask 138 | else: 139 | if return_log: 140 | return T.log(self.softmax(Eng)) 141 | else: 142 | return self.softmax(Eng) 143 | 144 | -------------------------------------------------------------------------------- /emolga/layers/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from emolga.utils.theano_utils import * 4 | import emolga.basic.initializations as initializations 5 | import emolga.basic.activations as activations 6 | 7 | 8 | class Layer(object): 9 | def __init__(self): 10 | self.params = [] 11 | self.layers = [] 12 | self.monitor = {} 13 | self.watchlist = [] 14 | 15 | def init_updates(self): 16 | self.updates = [] 17 | 18 | def _monitoring(self): 19 | # add monitoring variables 20 | for l in self.layers: 21 | for v in l.monitor: 22 | name = v + '@' + l.name 23 | print name 24 | self.monitor[name] = l.monitor[v] 25 | 26 | def __call__(self, X, *args, **kwargs): 27 | return X 28 | 29 | def _add(self, layer): 30 | if layer: 31 | self.layers.append(layer) 32 | self.params += layer.params 33 | 34 | def supports_masked_input(self): 35 | ''' Whether or not this layer respects the output mask of its previous layer in its calculations. 
If you try 36 | to attach a layer that does *not* support masked_input to a layer that gives a non-None output_mask() that is 37 | an error''' 38 | return False 39 | 40 | def get_output_mask(self, train=None): 41 | ''' 42 | For some models (such as RNNs) you want a way of being able to mark some output data-points as 43 | "masked", so they are not used in future calculations. In such a model, get_output_mask() should return a mask 44 | of one less dimension than get_output() (so if get_output is (nb_samples, nb_timesteps, nb_dimensions), then the mask 45 | is (nb_samples, nb_timesteps), with a one for every unmasked datapoint, and a zero for every masked one. 46 | 47 | If there is *no* masking then it shall return None. For instance if you attach an Activation layer (they support masking) 48 | to a layer with an output_mask, then that Activation shall also have an output_mask. If you attach it to a layer with no 49 | such mask, then the Activation's get_output_mask shall return None. 50 | 51 | Some emolga have an output_mask even if their input is unmasked, notably Embedding which can turn the entry "0" into 52 | a mask. 53 | ''' 54 | return None 55 | 56 | def set_weights(self, weights): 57 | for p, w in zip(self.params, weights): 58 | if p.eval().shape != w.shape: 59 | raise Exception("Layer shape %s not compatible with weight shape %s." % (p.eval().shape, w.shape)) 60 | p.set_value(floatX(w)) 61 | 62 | def get_weights(self): 63 | weights = [] 64 | for p in self.params: 65 | weights.append(p.get_value()) 66 | return weights 67 | 68 | def get_params(self): 69 | return self.params 70 | 71 | def set_name(self, name): 72 | for i in range(len(self.params)): 73 | if self.params[i].name is None: 74 | self.params[i].name = '%s_p%d' % (name, i) 75 | else: 76 | self.params[i].name = name + '_' + self.params[i].name 77 | self.name = name 78 | 79 | 80 | class MaskedLayer(Layer): 81 | ''' 82 | If your layer trivially supports masking (by simply copying the input mask to the output), then subclass MaskedLayer 83 | instead of Layer, and make sure that you incorporate the input mask into your calculation of get_output() 84 | ''' 85 | def supports_masked_input(self): 86 | return True 87 | 88 | 89 | class Identity(Layer): 90 | def __init__(self, name='Identity'): 91 | super(Identity, self).__init__() 92 | if name is not None: 93 | self.set_name(name) 94 | 95 | def __call__(self, X): 96 | return X 97 | 98 | 99 | class Dense(Layer): 100 | def __init__(self, input_dim, output_dim, init='glorot_uniform', activation='tanh', name='Dense', 101 | learn_bias=True, negative_bias=False): 102 | 103 | super(Dense, self).__init__() 104 | self.init = initializations.get(init) 105 | self.activation = activations.get(activation) 106 | self.input_dim = input_dim 107 | self.output_dim = output_dim 108 | self.linear = (activation == 'linear') 109 | 110 | # self.input = T.matrix() 111 | self.W = self.init((self.input_dim, self.output_dim)) 112 | if not negative_bias: 113 | self.b = shared_zeros((self.output_dim)) 114 | else: 115 | self.b = shared_ones((self.output_dim)) 116 | 117 | self.learn_bias = learn_bias 118 | if self.learn_bias: 119 | self.params = [self.W, self.b] 120 | else: 121 | self.params = [self.W] 122 | 123 | if name is not None: 124 | self.set_name(name) 125 | 126 | def set_name(self, name): 127 | self.W.name = '%s_W' % name 128 | self.b.name = '%s_b' % name 129 | 130 | def __call__(self, X): 131 | output = self.activation(T.dot(X, self.W) + 4. 
* self.b) 132 | return output 133 | 134 | def reverse(self, Y): 135 | assert self.linear 136 | 137 | output = T.dot((Y - self.b), self.W.T) 138 | return output 139 | 140 | 141 | class Dense2(Layer): 142 | def __init__(self, input_dim1, input_dim2, output_dim, init='glorot_uniform', activation='tanh', name='Dense', learn_bias=True): 143 | 144 | super(Dense2, self).__init__() 145 | self.init = initializations.get(init) 146 | self.activation = activations.get(activation) 147 | self.input_dim1 = input_dim1 148 | self.input_dim2 = input_dim2 149 | self.output_dim = output_dim 150 | self.linear = (activation == 'linear') 151 | 152 | # self.input = T.matrix() 153 | 154 | self.W1 = self.init((self.input_dim1, self.output_dim)) 155 | self.W2 = self.init((self.input_dim2, self.output_dim)) 156 | self.b = shared_zeros((self.output_dim)) 157 | 158 | self.learn_bias = learn_bias 159 | if self.learn_bias: 160 | self.params = [self.W1, self.W2, self.b] 161 | else: 162 | self.params = [self.W1, self.W2] 163 | 164 | if name is not None: 165 | self.set_name(name) 166 | 167 | def set_name(self, name): 168 | self.W1.name = '%s_W1' % name 169 | self.W2.name = '%s_W2' % name 170 | self.b.name = '%s_b' % name 171 | 172 | def __call__(self, X1, X2): 173 | output = self.activation(T.dot(X1, self.W1) + T.dot(X2, self.W2) + self.b) 174 | return output 175 | 176 | 177 | class Constant(Layer): 178 | def __init__(self, input_dim, output_dim, init=None, activation='tanh', name='Bias'): 179 | 180 | super(Constant, self).__init__() 181 | assert input_dim == output_dim, 'Bias Layer needs to have the same input/output nodes.' 182 | 183 | self.init = initializations.get(init) 184 | self.activation = activations.get(activation) 185 | self.input_dim = input_dim 186 | self.output_dim = output_dim 187 | 188 | self.b = shared_zeros(self.output_dim) 189 | self.params = [self.b] 190 | 191 | if name is not None: 192 | self.set_name(name) 193 | 194 | def set_name(self, name): 195 | self.b.name = '%s_b' % name 196 | 197 | def __call__(self, X=None): 198 | output = self.activation(self.b) 199 | if X: 200 | L = X.shape[0] 201 | output = T.extra_ops.repeat(output[None, :], L, axis=0) 202 | return output 203 | 204 | 205 | class MemoryLinear(Layer): 206 | def __init__(self, input_dim, input_wdth, init='glorot_uniform', 207 | activation='tanh', name='Bias', has_input=True): 208 | super(MemoryLinear, self).__init__() 209 | 210 | self.init = initializations.get(init) 211 | self.activation = activations.get(activation) 212 | self.input_dim = input_dim 213 | self.input_wdth = input_wdth 214 | 215 | self.b = self.init((self.input_dim, self.input_wdth)) 216 | self.params = [self.b] 217 | 218 | if has_input: 219 | self.P = self.init((self.input_dim, self.input_wdth)) 220 | self.params += [self.P] 221 | 222 | if name is not None: 223 | self.set_name(name) 224 | 225 | def __call__(self, X=None): 226 | out = self.b[None, :, :] 227 | if X: 228 | out += self.P[None, :, :] * X 229 | return self.activation(out) 230 | 231 | 232 | class Dropout(MaskedLayer): 233 | """ 234 | Hinton's dropout. 235 | """ 236 | def __init__(self, rng=None, p=1., name=None): 237 | super(Dropout, self).__init__() 238 | self.p = p 239 | self.rng = rng 240 | 241 | def __call__(self, X, train=True): 242 | if self.p > 0.: 243 | retain_prob = 1. 
- self.p 244 | if train: 245 | X *= self.rng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) 246 | else: 247 | X *= retain_prob 248 | return X 249 | 250 | 251 | class Activation(MaskedLayer): 252 | """ 253 | Apply an activation function to an output. 254 | """ 255 | def __init__(self, activation): 256 | super(Activation, self).__init__() 257 | self.activation = activations.get(activation) 258 | 259 | def __call__(self, X): 260 | return self.activation(X) 261 | 262 | -------------------------------------------------------------------------------- /emolga/layers/embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .core import Layer 4 | from emolga.utils.theano_utils import * 5 | import emolga.basic.initializations as initializations 6 | 7 | 8 | class Embedding(Layer): 9 | ''' 10 | Turn positive integers (indexes) into denses vectors of fixed size. 11 | eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]] 12 | 13 | @input_dim: size of vocabulary (highest input integer + 1) 14 | @out_dim: size of dense representation 15 | ''' 16 | 17 | def __init__(self, input_dim, output_dim, init='uniform', name=None): 18 | 19 | super(Embedding, self).__init__() 20 | self.init = initializations.get(init) 21 | self.input_dim = input_dim 22 | self.output_dim = output_dim 23 | 24 | self.W = self.init((self.input_dim, self.output_dim)) 25 | 26 | self.params = [self.W] 27 | 28 | if name is not None: 29 | self.set_name(name) 30 | 31 | def get_output_mask(self, X): 32 | return T.ones_like(X) * (1 - T.eq(X, 0)) 33 | 34 | def __call__(self, X, mask_zero=False, context=None): 35 | if context is None: 36 | out = self.W[X] 37 | else: 38 | assert context.ndim == 3 39 | flag = False 40 | if X.ndim == 1: 41 | flag = True 42 | X = X[:, None] 43 | 44 | b_size = context.shape[0] 45 | 46 | EMB = T.repeat(self.W[None, :, :], b_size, axis=0) 47 | EMB = T.concatenate([EMB, context], axis=1) 48 | 49 | m_size = EMB.shape[1] 50 | e_size = EMB.shape[2] 51 | maxlen = X.shape[1] 52 | 53 | EMB = EMB.reshape((b_size * m_size, e_size)) 54 | Z = (T.arange(b_size)[:, None] * m_size + X).reshape((b_size * maxlen,)) 55 | out = EMB[Z] # (b_size * maxlen, e_size) 56 | 57 | if not flag: 58 | out = out.reshape((b_size, maxlen, e_size)) 59 | else: 60 | out = out.reshape((b_size, e_size)) 61 | 62 | if mask_zero: 63 | return out, T.cast(self.get_output_mask(X), dtype='float32') 64 | else: 65 | return out 66 | 67 | 68 | class Zero(Layer): 69 | def __call__(self, X): 70 | out = T.zeros(X.shape) 71 | return out 72 | 73 | 74 | class Bias(Layer): 75 | def __call__(self, X): 76 | tmp = X.flatten() 77 | tmp = tmp.dimshuffle(0, 'x') 78 | return tmp 79 | -------------------------------------------------------------------------------- /emolga/models/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | -------------------------------------------------------------------------------- /emolga/models/core.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | import theano 3 | import logging 4 | import deepdish as dd 5 | 6 | from emolga.dataset.build_dataset import serialize_to_file, deserialize_from_file 7 | from emolga.utils.theano_utils import floatX 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class Model(object): 13 | def __init__(self): 14 | self.layers = [] 15 | self.params = [] 16 | self.monitor = {} 17 | self.watchlist 
= [] 18 | 19 | def _add(self, layer): 20 | if layer: 21 | self.layers.append(layer) 22 | self.params += layer.params 23 | 24 | def _monitoring(self): 25 | # add monitoring variables 26 | for l in self.layers: 27 | for v in l.monitor: 28 | name = v + '@' + l.name 29 | print name 30 | self.monitor[name] = l.monitor[v] 31 | 32 | def compile_monitoring(self, inputs, updates=None): 33 | logger.info('compile monitoring') 34 | for i, v in enumerate(self.monitor): 35 | self.watchlist.append(v) 36 | logger.info('monitoring [{0}]: {1}'.format(i, v)) 37 | 38 | self.watch = theano.function(inputs, 39 | [self.monitor[v] for v in self.watchlist], 40 | updates=updates 41 | ) 42 | logger.info('done.') 43 | 44 | def set_weights(self, weights): 45 | if hasattr(self, 'save_parm'): 46 | params = self.params + self.save_parm 47 | else: 48 | params = self.params 49 | 50 | for p, w in zip(params, weights): 51 | print p.name 52 | if p.eval().shape != w.shape: 53 | raise Exception("Layer shape %s not compatible with weight shape %s." % (p.eval().shape, w.shape)) 54 | p.set_value(floatX(w)) 55 | 56 | def get_weights(self): 57 | weights = [] 58 | for p in self.params: 59 | weights.append(p.get_value()) 60 | 61 | if hasattr(self, 'save_parm'): 62 | for v in self.save_parm: 63 | weights.append(v.get_value()) 64 | 65 | return weights 66 | 67 | def set_name(self, name): 68 | for i in range(len(self.params)): 69 | if self.params[i].name is None: 70 | self.params[i].name = '%s_p%d' % (name, i) 71 | else: 72 | self.params[i].name = name + '@' + self.params[i].name 73 | self.name = name 74 | 75 | def save(self, filename): 76 | if hasattr(self, 'save_parm'): 77 | params = self.params + self.save_parm 78 | else: 79 | params = self.params 80 | ps = 'save: <\n' 81 | for p in params: 82 | ps += '{0}: {1}\n'.format(p.name, p.eval().shape) 83 | ps += '> to ... {}'.format(filename) 84 | logger.info(ps) 85 | 86 | # hdf5 module seems works abnormal !! 87 | # dd.io.save(filename, self.get_weights()) 88 | serialize_to_file(self.get_weights(), filename) 89 | 90 | def load(self, filename): 91 | logger.info('load the weights.') 92 | 93 | # hdf5 module seems works abnormal !! 
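# fall back to the cPickle-based deserialize_from_file helper instead of deepdish's HDF5 loader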
94 | # weights = dd.io.load(filename) 95 | weights = deserialize_from_file(filename) 96 | print len(weights) 97 | self.set_weights(weights) 98 | -------------------------------------------------------------------------------- /emolga/models/core.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/CopyNet/dba24d58d505abaa169e05ee762987f37f41f566/emolga/models/core.pyc -------------------------------------------------------------------------------- /emolga/test_lm.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | 3 | import logging 4 | 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 6 | 7 | from emolga.models.encdec import RNNLM, AutoEncoder 8 | from emolga.models.variational import Helmholtz, VAE, HarX, THarX, NVTM 9 | # from models.ntm_encdec import RNNLM, AutoEncoder, Helmholtz, BinaryHelmholtz 10 | from emolga.utils.generic_utils import * 11 | from emolga.dataset.build_dataset import deserialize_from_file, build_fuel, obtain_stream 12 | from emolga.config import setup_ptbz, setup_ptb2 13 | from emolga.config_variant import * 14 | 15 | setup = setup_bienc 16 | 17 | 18 | def init_logging(logfile): 19 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 20 | datefmt='%m/%d/%Y %H:%M:%S' ) 21 | fh = logging.FileHandler(logfile) 22 | # ch = logging.StreamHandler() 23 | 24 | fh.setFormatter(formatter) 25 | # ch.setFormatter(formatter) 26 | # fh.setLevel(logging.INFO) 27 | # ch.setLevel(logging.INFO) 28 | # logging.getLogger().addHandler(ch) 29 | logging.getLogger().addHandler(fh) 30 | logging.getLogger().setLevel(logging.INFO) 31 | 32 | return logging 33 | 34 | # prepare logging. 35 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 36 | config = setup() # load settings. 37 | for w in config: 38 | print '{0}={1}'.format(w, config[w]) 39 | 40 | logger = init_logging(config['path_log'] + '/emolga.RHM.id={}.log'.format(tmark)) 41 | n_rng = np.random.RandomState(config['seed']) 42 | np.random.seed(config['seed']) 43 | rng = RandomStreams(n_rng.randint(2 ** 30)) 44 | 45 | logger.info('Start!') 46 | 47 | # load the dataset and build a fuel-dataset. 48 | idx2word, word2idx = deserialize_from_file(config['vocabulary_set']) 49 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 50 | config['dec_voc_size'] = config['enc_voc_size'] 51 | logger.info('build dataset done. vocabulary size = {0}/ batch size = {1}'.format( 52 | config['dec_voc_size'], config['batch_size'])) 53 | 54 | # training & valid & tesing set. 55 | train_set, train_size = build_fuel(deserialize_from_file(config['dataset'])) 56 | valid_set, valid_size = build_fuel(deserialize_from_file(config['dataset_test'])) # use test set for a try 57 | 58 | # weiget save. 59 | savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark) 60 | 61 | # build the agent 62 | if config['model'] == 'RNNLM': 63 | agent = RNNLM(config, n_rng, rng, mode=config['mode']) 64 | elif config['model'] == 'HarX': 65 | agent = THarX(config, n_rng, rng, mode=config['mode']) 66 | elif config['model'] == 'Helmholtz': 67 | agent = Helmholtz(config, n_rng, rng, mode=config['mode']) 68 | else: 69 | raise NotImplementedError 70 | 71 | agent.build_() 72 | agent.compile_('train') 73 | print 'compile ok' 74 | 75 | # learning to speak language. 
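# main loop: `epochs` passes over the training stream, each followed by an evaluation
# pass on the held-out stream and a weight checkpoint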
76 | count = 1000 77 | echo = 0 78 | epochs = 50 79 | while echo < epochs: 80 | echo += 1 81 | loss = [] 82 | correct = 0 83 | scans = 0 84 | 85 | # visualization the embedding weights. 86 | # if echo > 1: 87 | # plt.figure(3) 88 | # visualize_(plt.subplot(111), agent.decoder.Embed.get_params()[0].get_value(), name='encoder embedding', 89 | # text=idx2word) 90 | # plt.show() 91 | 92 | # if not config['use_noise']: 93 | 94 | # training 95 | train_batches = obtain_stream(train_set, config['batch_size']).get_epoch_iterator(as_dict=True) 96 | valid_batches = obtain_stream(valid_set, config['eval_batch_size']).get_epoch_iterator(as_dict=True) 97 | 98 | def prepare_batch(batch): 99 | data = batch['data'].astype('int32') 100 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 101 | 102 | def cut_zeros(data): 103 | for k in range(data.shape[1] - 1, 0, -1): 104 | data_col = data[:, k].sum() 105 | if data_col > 0: 106 | return data[:, : k + 2] 107 | return data 108 | data = cut_zeros(data) 109 | return data 110 | 111 | # training 112 | logger.info('Epoch = {} -> Training Set Learning...'.format(echo)) 113 | progbar = Progbar(train_size / config['batch_size']) 114 | for it, batch in enumerate(train_batches): 115 | # get data 116 | data = prepare_batch(batch) 117 | if config['model'] == 'RNNLM' or config['model'] == 'AutoEncoder': 118 | loss.append(agent.train_(data, config['repeats'])) 119 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 120 | elif config['model'] == 'Helmholtz' or 'HarX': 121 | loss.append(agent.train_(data, config['repeats'])) 122 | weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()]) 123 | progbar.update(it, [('lossPa', loss[-1][0]), ('lossPxa', loss[-1][1]), ('lossQ', loss[-1][2]), 124 | ('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('L1', weightss)]) 125 | 126 | """ 127 | watch = agent.watch(data) 128 | print '.' 129 | pprint(watch[0][0]) 130 | pprint(watch[2][0]) 131 | # pprint(watch[2][0]) 132 | sys.exit(111) 133 | """ 134 | 135 | # if it % 100 == 50: 136 | # sys.exit(-1) 137 | # # print '.' 138 | # # print 'encoded = {}'.format(encoded[11]) 139 | # # print 'mean = {}'.format(mean[11]) 140 | # # print 'std = {}'.format(std[11]) 141 | # 142 | # # watch = agent.watch(data) 143 | # # print '.' 144 | # # print 'train memory {}'.format(watch[0][0]) 145 | # 146 | # for kk in xrange(5): 147 | # # sample a sentence. 148 | # # action = agent.action_sampler() 149 | # # context = agent.context_trans(action) 150 | # if config['model'] is 'AutoEncoder': 151 | # source = data[kk][None, :] 152 | # truth = ' '.join(print_sample(idx2word, source[0].tolist())[:-1]) 153 | # print '\ntruth: {}'.format(truth) 154 | # context = agent.memorize(source) 155 | # sample, score = agent.generate_(context, max_len=data.shape[1]) 156 | # else: 157 | # sample, score = agent.generate_(max_len=data.shape[1]) 158 | # 159 | # if sample[-1] is not 0: 160 | # sample += [0] # fix the end. 161 | # question = ' '.join(print_sample(idx2word, sample)[:-1]) 162 | # print '\nsample: {}'.format(question) 163 | # print 'PPL: {}'.format(score) 164 | # scans += 1.0 165 | 166 | print ' .' 
167 | logger.info('Epoch = {0} finished.'.format(echo)) 168 | 169 | # validation 170 | logger.info('Epoch = {} -> Vadlidation Set Evaluation...'.format(echo)) 171 | progbar = Progbar(valid_size / config['batch_size']) 172 | for it, batch in enumerate(valid_batches): 173 | # get data 174 | data = prepare_batch(batch) 175 | if config['model'] == 'Helmholtz' or 'HarX': 176 | loss.append(agent.evaluate_(data)) 177 | progbar.update(it, [('NLL', loss[-1][0]), ('perplexity', np.log(loss[-1][1]))]) 178 | else: 179 | raise NotImplementedError 180 | 181 | print ' .' 182 | # save the weights. 183 | agent.save(config['path_h5'] + '/emolga.RHM.id={0}.epoch={1}.pkl'.format(tmark, echo)) 184 | 185 | # logger.info('Learning percentage: {}'.format(correct / scans)) 186 | 187 | 188 | # inference test 189 | # batches = data_stream.get_epoch_iterator(as_dict=True) 190 | # for it, batch in enumerate(batches): 191 | # data = batch['data'].astype('int32') 192 | # data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 193 | # mean, std = agent.inference_(data) 194 | # print mean 195 | # break 196 | # print count -------------------------------------------------------------------------------- /emolga/test_nvtm.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | 3 | import logging 4 | 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 6 | 7 | from emolga.models.encdec import RNNLM, AutoEncoder 8 | from emolga.models.variational import Helmholtz, VAE, HarX, THarX, NVTM 9 | # from models.ntm_encdec import RNNLM, AutoEncoder, Helmholtz, BinaryHelmholtz 10 | from emolga.utils.generic_utils import * 11 | from emolga.dataset.build_dataset import deserialize_from_file, build_fuel, obtain_stream 12 | from emolga.config import setup_ptbz, setup_ptb2 13 | from emolga.config_variant import * 14 | 15 | setup = setup_bienc 16 | 17 | 18 | def init_logging(logfile): 19 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 20 | datefmt='%m/%d/%Y %H:%M:%S' ) 21 | fh = logging.FileHandler(logfile) 22 | # ch = logging.StreamHandler() 23 | 24 | fh.setFormatter(formatter) 25 | # ch.setFormatter(formatter) 26 | # fh.setLevel(logging.INFO) 27 | # ch.setLevel(logging.INFO) 28 | # logging.getLogger().addHandler(ch) 29 | logging.getLogger().addHandler(fh) 30 | logging.getLogger().setLevel(logging.INFO) 31 | 32 | return logging 33 | 34 | # prepare logging. 35 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 36 | config = setup() # load settings. 37 | for w in config: 38 | print '{0}={1}'.format(w, config[w]) 39 | 40 | logger = init_logging(config['path_log'] + '/emolga.RHM.id={}.log'.format(tmark)) 41 | n_rng = np.random.RandomState(config['seed']) 42 | np.random.seed(config['seed']) 43 | rng = RandomStreams(n_rng.randint(2 ** 30)) 44 | 45 | logger.info('Start!') 46 | 47 | # load the dataset and build a fuel-dataset. 48 | idx2word, word2idx = deserialize_from_file(config['vocabulary_set']) 49 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 50 | config['dec_voc_size'] = config['enc_voc_size'] 51 | logger.info('build dataset done. vocabulary size = {0}/ batch size = {1}'.format( 52 | config['dec_voc_size'], config['batch_size'])) 53 | 54 | # training & valid & tesing set. 
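# build_fuel returns (fuel_dataset, num_examples); the sizes are used below to scale the per-epoch Progbar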
55 | train_set, train_size = build_fuel(deserialize_from_file(config['dataset'])) 56 | valid_set, valid_size = build_fuel(deserialize_from_file(config['dataset_test'])) # use test set for a try 57 | 58 | # weiget save. 59 | savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark) 60 | 61 | # build the agent 62 | if config['model'] == 'RNNLM': 63 | agent = RNNLM(config, n_rng, rng, mode=config['mode']) 64 | elif config['model'] == 'HarX': 65 | agent = NVTM(config, n_rng, rng, mode=config['mode']) 66 | elif config['model'] == 'Helmholtz': 67 | agent = Helmholtz(config, n_rng, rng, mode=config['mode']) 68 | else: 69 | raise NotImplementedError 70 | 71 | agent.build_() 72 | agent.compile_('train') 73 | print 'compile ok' 74 | 75 | # learning to speak language. 76 | count = 1000 77 | echo = 0 78 | epochs = 50 79 | while echo < epochs: 80 | echo += 1 81 | loss = [] 82 | correct = 0 83 | scans = 0 84 | 85 | # visualization the embedding weights. 86 | # if echo > 1: 87 | # plt.figure(3) 88 | # visualize_(plt.subplot(111), agent.decoder.Embed.get_params()[0].get_value(), name='encoder embedding', 89 | # text=idx2word) 90 | # plt.show() 91 | 92 | # if not config['use_noise']: 93 | 94 | # training 95 | train_batches = obtain_stream(train_set, config['batch_size']).get_epoch_iterator(as_dict=True) 96 | valid_batches = obtain_stream(valid_set, config['eval_batch_size']).get_epoch_iterator(as_dict=True) 97 | 98 | def prepare_batch(batch): 99 | data = batch['data'].astype('int32') 100 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 101 | 102 | def cut_zeros(data): 103 | for k in range(data.shape[1] - 1, 0, -1): 104 | data_col = data[:, k].sum() 105 | if data_col > 0: 106 | return data[:, : k + 2] 107 | return data 108 | data = cut_zeros(data) 109 | return data 110 | 111 | # training 112 | logger.info('Epoch = {} -> Training Set Learning...'.format(echo)) 113 | progbar = Progbar(train_size / config['batch_size']) 114 | for it, batch in enumerate(train_batches): 115 | # get data 116 | data = prepare_batch(batch) 117 | if config['model'] == 'RNNLM' or config['model'] == 'AutoEncoder': 118 | loss.append(agent.train_(data, config['repeats'])) 119 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 120 | elif config['model'] == 'Helmholtz' or 'HarX': 121 | loss.append(agent.train_(data, 1)) 122 | weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()]) 123 | progbar.update(it, [('lossL', loss[-1][0]), ('lossP', loss[-1][1]), ('lossQ', loss[-1][2]), 124 | ('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('Baseline', loss[-1][5])]) 125 | 126 | """ 127 | watch = agent.watch(data) 128 | print '.' 129 | pprint(watch[0][0]) 130 | pprint(watch[2][0]) 131 | # pprint(watch[2][0]) 132 | sys.exit(111) 133 | """ 134 | 135 | # if it % 100 == 50: 136 | # sys.exit(-1) 137 | # # print '.' 138 | # # print 'encoded = {}'.format(encoded[11]) 139 | # # print 'mean = {}'.format(mean[11]) 140 | # # print 'std = {}'.format(std[11]) 141 | # 142 | # # watch = agent.watch(data) 143 | # # print '.' 144 | # # print 'train memory {}'.format(watch[0][0]) 145 | # 146 | # for kk in xrange(5): 147 | # # sample a sentence. 
148 | # # action = agent.action_sampler() 149 | # # context = agent.context_trans(action) 150 | # if config['model'] is 'AutoEncoder': 151 | # source = data[kk][None, :] 152 | # truth = ' '.join(print_sample(idx2word, source[0].tolist())[:-1]) 153 | # print '\ntruth: {}'.format(truth) 154 | # context = agent.memorize(source) 155 | # sample, score = agent.generate_(context, max_len=data.shape[1]) 156 | # else: 157 | # sample, score = agent.generate_(max_len=data.shape[1]) 158 | # 159 | # if sample[-1] is not 0: 160 | # sample += [0] # fix the end. 161 | # question = ' '.join(print_sample(idx2word, sample)[:-1]) 162 | # print '\nsample: {}'.format(question) 163 | # print 'PPL: {}'.format(score) 164 | # scans += 1.0 165 | 166 | print ' .' 167 | logger.info('Epoch = {0} finished.'.format(echo)) 168 | loss = zip(*loss) 169 | logger.info('LossL: {0}, LossP: {1}, LossQ: {2}, PPL: {3}, NLL: {4}, Baseline: {5}'.format( 170 | np.mean(loss[0]), np.mean(loss[1]), np.mean(loss[2]), np.mean(loss[3]), np.mean(loss[4]), np.mean(loss[5]) 171 | )) 172 | 173 | # validation 174 | loss = [] 175 | logger.info('Epoch = {} -> Validation Set Evaluation...'.format(echo)) 176 | progbar = Progbar(valid_size / config['batch_size']) 177 | for it, batch in enumerate(valid_batches): 178 | # get data 179 | data = prepare_batch(batch) 180 | if config['model'] == 'Helmholtz' or config['model'] == 'HarX': 181 | # loss.append(agent.evaluate_(data)) 182 | loss.append(agent.explore_(data, 10)) 183 | weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()]) 184 | progbar.update(it, [('lossL', loss[-1][0]), ('lossP', loss[-1][1]), ('lossQ', loss[-1][2]), 185 | ('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('Baseline', loss[-1][5])]) 186 | else: 187 | raise NotImplementedError 188 | 189 | print ' .' 190 | loss = zip(*loss) 191 | logger.info('LossL: {0}, LossP: {1}, LossQ: {2}, PPL: {3}, NLL: {4}, Baseline: {5}'.format( 192 | np.mean(loss[0]), np.mean(loss[1]), np.mean(loss[2]), np.mean(loss[3]), np.mean(loss[4]), np.mean(loss[5]) 193 | )) 194 | # save the weights.
195 | agent.save(config['path_h5'] + '/emolga.RHM.id={0}.epoch={1}.pkl'.format(tmark, echo)) 196 | 197 | # logger.info('Learning percentage: {}'.format(correct / scans)) 198 | 199 | 200 | # inference test 201 | # batches = data_stream.get_epoch_iterator(as_dict=True) 202 | # for it, batch in enumerate(batches): 203 | # data = batch['data'].astype('int32') 204 | # data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 205 | # mean, std = agent.inference_(data) 206 | # print mean 207 | # break 208 | # print count -------------------------------------------------------------------------------- /emolga/utils/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'yinpengcheng' 2 | -------------------------------------------------------------------------------- /emolga/utils/generic_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from matplotlib.ticker import FuncFormatter 3 | import numpy as np 4 | import time 5 | import sys 6 | import six 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | def get_from_module(identifier, module_params, module_name, instantiate=False, kwargs=None): 11 | if isinstance(identifier, six.string_types): 12 | res = module_params.get(identifier) 13 | if not res: 14 | raise Exception('Invalid ' + str(module_name) + ': ' + str(identifier)) 15 | if instantiate and not kwargs: 16 | return res() 17 | elif instantiate and kwargs: 18 | return res(**kwargs) 19 | else: 20 | return res 21 | return identifier 22 | 23 | 24 | def make_tuple(*args): 25 | return args 26 | 27 | 28 | def printv(v, prefix=''): 29 | if type(v) == dict: 30 | if 'name' in v: 31 | print(prefix + '#' + v['name']) 32 | del v['name'] 33 | prefix += '...' 34 | for nk, nv in v.items(): 35 | if type(nv) in [dict, list]: 36 | print(prefix + nk + ':') 37 | printv(nv, prefix) 38 | else: 39 | print(prefix + nk + ':' + str(nv)) 40 | elif type(v) == list: 41 | prefix += '...' 42 | for i, nv in enumerate(v): 43 | print(prefix + '#' + str(i)) 44 | printv(nv, prefix) 45 | else: 46 | prefix += '...' 47 | print(prefix + str(v)) 48 | 49 | 50 | def make_batches(size, batch_size): 51 | nb_batch = int(np.ceil(size/float(batch_size))) 52 | return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)] 53 | 54 | 55 | def slice_X(X, start=None, stop=None): 56 | if type(X) == list: 57 | if hasattr(start, '__len__'): 58 | return [x[start] for x in X] 59 | else: 60 | return [x[start:stop] for x in X] 61 | else: 62 | if hasattr(start, '__len__'): 63 | return X[start] 64 | else: 65 | return X[start:stop] 66 | 67 | 68 | class Progbar(object): 69 | def __init__(self, target, width=30, verbose=1): 70 | ''' 71 | @param target: total number of steps expected 72 | ''' 73 | self.width = width 74 | self.target = target 75 | self.sum_values = {} 76 | self.unique_values = [] 77 | self.start = time.time() 78 | self.total_width = 0 79 | self.seen_so_far = 0 80 | self.verbose = verbose 81 | 82 | def update(self, current, values=[]): 83 | ''' 84 | @param current: index of current step 85 | @param values: list of tuples (name, value_for_last_step). 86 | The progress bar will display averages for these values. 
87 | ''' 88 | for k, v in values: 89 | if k not in self.sum_values: 90 | self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far] 91 | self.unique_values.append(k) 92 | else: 93 | self.sum_values[k][0] += v * (current - self.seen_so_far) 94 | self.sum_values[k][1] += (current - self.seen_so_far) 95 | self.seen_so_far = current 96 | 97 | now = time.time() 98 | if self.verbose == 1: 99 | prev_total_width = self.total_width 100 | sys.stdout.write("\b" * prev_total_width) 101 | sys.stdout.write("\r") 102 | 103 | numdigits = int(np.floor(np.log10(self.target))) + 1 104 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 105 | bar = barstr % (current, self.target) 106 | prog = float(current)/self.target 107 | prog_width = int(self.width*prog) 108 | if prog_width > 0: 109 | bar += ('.'*(prog_width-1)) 110 | if current < self.target: 111 | bar += '(-w-)' 112 | else: 113 | bar += '(-v-)!!' 114 | bar += ('~' * (self.width-prog_width)) 115 | bar += ']' 116 | sys.stdout.write(bar) 117 | self.total_width = len(bar) 118 | 119 | if current: 120 | time_per_unit = (now - self.start) / current 121 | else: 122 | time_per_unit = 0 123 | eta = time_per_unit*(self.target - current) 124 | info = '' 125 | if current < self.target: 126 | info += ' - ETA: %ds' % eta 127 | else: 128 | info += ' - %ds' % (now - self.start) 129 | for k in self.unique_values: 130 | if k == 'perplexity' or k == 'PPL': 131 | info += ' - %s: %.4f' % (k, np.exp(self.sum_values[k][0] / max(1, self.sum_values[k][1]))) 132 | else: 133 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 134 | 135 | self.total_width += len(info) 136 | if prev_total_width > self.total_width: 137 | info += ((prev_total_width-self.total_width) * " ") 138 | 139 | sys.stdout.write(info) 140 | sys.stdout.flush() 141 | 142 | if current >= self.target: 143 | sys.stdout.write("\n") 144 | 145 | if self.verbose == 2: 146 | if current >= self.target: 147 | info = '%ds' % (now - self.start) 148 | for k in self.unique_values: 149 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 150 | sys.stdout.write(info + "\n") 151 | 152 | def add(self, n, values=[]): 153 | self.update(self.seen_so_far + n, values) 154 | 155 | def clear(self): 156 | self.sum_values = {} 157 | self.unique_values = [] 158 | self.total_width = 0 159 | self.seen_so_far = 0 160 | 161 | 162 | def print_sample(idx2word, idx): 163 | def cut_eol(words): 164 | for i, word in enumerate(words): 165 | if words[i] == '': 166 | return words[:i + 1] 167 | raise Exception("No end-of-line found") 168 | 169 | return cut_eol(map(lambda w_idx : idx2word[w_idx], idx)) 170 | 171 | 172 | def visualize_(subplots, data, w=None, h=None, name=None, 173 | display='on', size=10, text=None, normal=True, 174 | grid=False): 175 | fig, ax = subplots 176 | if data.ndim == 1: 177 | if w and h: 178 | # vector visualization 179 | assert w * h == np.prod(data.shape) 180 | data = data.reshape((w, h)) 181 | else: 182 | L = data.shape[0] 183 | w = int(np.sqrt(L)) 184 | while L % w > 0: 185 | w -= 1 186 | h = L / w 187 | assert w * h == np.prod(data.shape) 188 | data = data.reshape((w, h)) 189 | else: 190 | w = data.shape[0] 191 | h = data.shape[1] 192 | 193 | if not size: 194 | size = 30 / np.sqrt(w * h) 195 | 196 | print data.shape 197 | 198 | major_ticks = np.arange(0, h, 1) 199 | ax.set_xticks(major_ticks) 200 | ax.set_xlim(0, h) 201 | major_ticks = np.arange(0, w, 1) 202 | ax.set_ylim(w, -1) 203 | ax.set_yticks(major_ticks) 204 | 
ax.set_aspect('equal') 205 | if grid: 206 | pass 207 | ax.grid(which='both') 208 | # ax.axis('equal') 209 | if normal: 210 | cax = ax.imshow(data, cmap=plt.cm.pink, interpolation='nearest', 211 | vmax=1.0, vmin=0.0, aspect='auto') 212 | else: 213 | cax = ax.imshow(data, cmap=plt.cm.bone, interpolation='nearest', aspect='auto') 214 | 215 | if name: 216 | ax.set_title(name) 217 | else: 218 | ax.set_title('sample.') 219 | import matplotlib.ticker as ticker 220 | 221 | # ax.xaxis.set_ticks(np.arange(0, h, 1.)) 222 | # ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f')) 223 | # ax.yaxis.set_ticks(np.arange(0, w, 1.)) 224 | # ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f')) 225 | 226 | # ax.set_xticks(np.linspace(0, 1, h)) 227 | # ax.set_yticks(np.linspace(0, 1, w)) 228 | # Move left and bottom spines outward by 10 points 229 | # ax.spines['left'].set_position(('outward', size)) 230 | # ax.spines['bottom'].set_position(('outward', size)) 231 | # # Hide the right and top spines 232 | # ax.spines['right'].set_visible(False) 233 | # ax.spines['top'].set_visible(False) 234 | # # Only show ticks on the left and bottom spines 235 | # ax.yaxis.set_ticks_position('left') 236 | # ax.xaxis.set_ticks_position('bottom') 237 | 238 | if text: 239 | ax.set_yticks(np.linspace(0, 1, 33) * size * 3.2) 240 | ax.set_yticklabels([text[s] for s in xrange(33)]) 241 | # cbar = fig.colorbar(cax) 242 | 243 | if display == 'on': 244 | plt.show() 245 | else: 246 | return ax 247 | 248 | 249 | def vis_Gaussian(subplot, mean, std, name=None, display='off', size=10): 250 | ax = subplot 251 | data = np.random.normal(size=(2, 10000)) 252 | data[0] = data[0] * std[0] + mean[0] 253 | data[1] = data[1] * std[1] + mean[1] 254 | 255 | ax.scatter(data[0].tolist(), data[1].tolist(), 'r.') 256 | if display == 'on': 257 | plt.show() 258 | else: 259 | return ax -------------------------------------------------------------------------------- /emolga/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import h5py 3 | import numpy as np 4 | import cPickle 5 | from collections import defaultdict 6 | 7 | 8 | class HDF5Matrix(): 9 | refs = defaultdict(int) 10 | 11 | def __init__(self, datapath, dataset, start, end, normalizer=None): 12 | if datapath not in list(self.refs.keys()): 13 | f = h5py.File(datapath) 14 | self.refs[datapath] = f 15 | else: 16 | f = self.refs[datapath] 17 | self.start = start 18 | self.end = end 19 | self.data = f[dataset] 20 | self.normalizer = normalizer 21 | 22 | def __len__(self): 23 | return self.end - self.start 24 | 25 | def __getitem__(self, key): 26 | if isinstance(key, slice): 27 | if key.stop + self.start <= self.end: 28 | idx = slice(key.start+self.start, key.stop + self.start) 29 | else: 30 | raise IndexError 31 | elif isinstance(key, int): 32 | if key + self.start < self.end: 33 | idx = key+self.start 34 | else: 35 | raise IndexError 36 | elif isinstance(key, np.ndarray): 37 | if np.max(key) + self.start < self.end: 38 | idx = (self.start + key).tolist() 39 | else: 40 | raise IndexError 41 | elif isinstance(key, list): 42 | if max(key) + self.start < self.end: 43 | idx = [x + self.start for x in key] 44 | else: 45 | raise IndexError 46 | if self.normalizer is not None: 47 | return self.normalizer(self.data[idx]) 48 | else: 49 | return self.data[idx] 50 | 51 | @property 52 | def shape(self): 53 | return tuple([self.end - self.start, self.data.shape[1]]) 54 | 55 | 56 | def 
save_array(array, name): 57 | import tables 58 | f = tables.open_file(name, 'w') 59 | atom = tables.Atom.from_dtype(array.dtype) 60 | ds = f.createCArray(f.root, 'data', atom, array.shape) 61 | ds[:] = array 62 | f.close() 63 | 64 | 65 | def load_array(name): 66 | import tables 67 | f = tables.open_file(name) 68 | array = f.root.data 69 | a = np.empty(shape=array.shape, dtype=array.dtype) 70 | a[:] = array[:] 71 | f.close() 72 | return a 73 | 74 | 75 | def save_config(): 76 | pass 77 | 78 | 79 | def load_config(): 80 | pass -------------------------------------------------------------------------------- /emolga/utils/np_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import scipy as sp 4 | from six.moves import range 5 | from six.moves import zip 6 | 7 | 8 | def to_categorical(y, nb_classes=None): 9 | '''Convert class vector (integers from 0 to nb_classes) 10 | to binary class matrix, for use with categorical_crossentropy 11 | ''' 12 | y = np.asarray(y, dtype='int32') 13 | if not nb_classes: 14 | nb_classes = np.max(y)+1 15 | Y = np.zeros((len(y), nb_classes)) 16 | for i in range(len(y)): 17 | Y[i, y[i]] = 1. 18 | return Y 19 | 20 | 21 | def normalize(a, axis=-1, order=2): 22 | l2 = np.atleast_1d(np.linalg.norm(a, order, axis)) 23 | l2[l2 == 0] = 1 24 | return a / np.expand_dims(l2, axis) 25 | 26 | 27 | def binary_logloss(p, y): 28 | epsilon = 1e-15 29 | p = sp.maximum(epsilon, p) 30 | p = sp.minimum(1-epsilon, p) 31 | res = sum(y * sp.log(p) + sp.subtract(1, y) * sp.log(sp.subtract(1, p))) 32 | res *= -1.0/len(y) 33 | return res 34 | 35 | 36 | def multiclass_logloss(P, Y): 37 | score = 0. 38 | npreds = [P[i][Y[i]-1] for i in range(len(Y))] 39 | score = -(1. / len(Y)) * np.sum(np.log(npreds)) 40 | return score 41 | 42 | 43 | def accuracy(p, y): 44 | return np.mean([a == b for a, b in zip(p, y)]) 45 | 46 | 47 | def probas_to_classes(y_pred): 48 | if len(y_pred.shape) > 1 and y_pred.shape[1] > 1: 49 | return categorical_probas_to_classes(y_pred) 50 | return np.array([1 if p > 0.5 else 0 for p in y_pred]) 51 | 52 | 53 | def categorical_probas_to_classes(p): 54 | return np.argmax(p, axis=1) 55 | -------------------------------------------------------------------------------- /emolga/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def get_test_data(nb_train=1000, nb_test=500, input_shape=(10,), output_shape=(2,), 5 | classification=True, nb_class=2): 6 | ''' 7 | classification=True overrides output_shape 8 | (i.e. output_shape is set to (1,)) and the output 9 | consists in integers in [0, nb_class-1]. 10 | 11 | Otherwise: float output with shape output_shape. 
12 | ''' 13 | nb_sample = nb_train + nb_test 14 | if classification: 15 | y = np.random.randint(0, nb_class, size=(nb_sample, 1)) 16 | X = np.zeros((nb_sample,) + input_shape) 17 | for i in range(nb_sample): 18 | X[i] = np.random.normal(loc=y[i], scale=1.0, size=input_shape) 19 | else: 20 | y_loc = np.random.random((nb_sample,)) 21 | X = np.zeros((nb_sample,) + input_shape) 22 | y = np.zeros((nb_sample,) + output_shape) 23 | for i in range(nb_sample): 24 | X[i] = np.random.normal(loc=y_loc[i], scale=1.0, size=input_shape) 25 | y[i] = np.random.normal(loc=y_loc[i], scale=1.0, size=output_shape) 26 | 27 | return (X[:nb_train], y[:nb_train]), (X[nb_train:], y[nb_train:]) 28 | -------------------------------------------------------------------------------- /emolga/utils/theano_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from theano import gof 4 | from theano.tensor import basic as tensor 5 | import numpy as np 6 | import theano 7 | import theano.tensor as T 8 | 9 | 10 | def floatX(X): 11 | return np.asarray(X, dtype=theano.config.floatX) 12 | 13 | 14 | def sharedX(X, dtype=theano.config.floatX, name=None): 15 | return theano.shared(np.asarray(X, dtype=dtype), name=name) 16 | 17 | 18 | def shared_zeros(shape, dtype=theano.config.floatX, name=None): 19 | return sharedX(np.zeros(shape), dtype=dtype, name=name) 20 | 21 | 22 | def shared_scalar(val=0., dtype=theano.config.floatX, name=None): 23 | return theano.shared(np.cast[dtype](val), name=name) 24 | 25 | 26 | def shared_ones(shape, dtype=theano.config.floatX, name=None): 27 | return sharedX(np.ones(shape), dtype=dtype, name=name) 28 | 29 | 30 | def alloc_zeros_matrix(*dims): 31 | return T.alloc(np.cast[theano.config.floatX](0.), *dims) 32 | 33 | 34 | def alloc_ones_matrix(*dims): 35 | return T.alloc(np.cast[theano.config.floatX](1.), *dims) 36 | 37 | 38 | def ndim_tensor(ndim): 39 | if ndim == 1: 40 | return T.vector() 41 | elif ndim == 2: 42 | return T.matrix() 43 | elif ndim == 3: 44 | return T.tensor3() 45 | elif ndim == 4: 46 | return T.tensor4() 47 | return T.matrix() 48 | 49 | 50 | # get int32 tensor 51 | def ndim_itensor(ndim, name=None): 52 | if ndim == 2: 53 | return T.imatrix(name) 54 | elif ndim == 3: 55 | return T.itensor3(name) 56 | elif ndim == 4: 57 | return T.itensor4(name) 58 | return T.imatrix(name) 59 | 60 | 61 | # dot-product 62 | def dot(inp, matrix, bias=None): 63 | """ 64 | Decide the right type of dot product depending on the input 65 | arguments 66 | """ 67 | if 'int' in inp.dtype and inp.ndim == 2: 68 | return matrix[inp.flatten()] 69 | elif 'int' in inp.dtype: 70 | return matrix[inp] 71 | elif 'float' in inp.dtype and inp.ndim == 3: 72 | shape0 = inp.shape[0] 73 | shape1 = inp.shape[1] 74 | shape2 = inp.shape[2] 75 | if bias: 76 | return (T.dot(inp.reshape((shape0 * shape1, shape2)), matrix) + bias).reshape((shape0, shape1, matrix.shape[1])) 77 | else: 78 | return T.dot(inp.reshape((shape0 * shape1, shape2)), matrix).reshape((shape0, shape1, matrix.shape[1])) 79 | else: 80 | if bias: 81 | return T.dot(inp, matrix) + bias 82 | else: 83 | return T.dot(inp, matrix) 84 | 85 | 86 | # Numerically stable log(sum(exp(A))). Can also be used in softmax function. 87 | def logSumExp(x, axis=None, mask=None, status='theano', c=None, err=1e-7): 88 | """ 89 | Numerically stable log(sum(exp(A))). Can also be used in softmax function. 90 | c is the additional input when it doesn't require masking but x need. 
91 | 92 | """ 93 | if status == 'theano': 94 | J = T 95 | else: 96 | J = np 97 | 98 | if c is None: 99 | x_max = J.max(x, axis=axis, keepdims=True) 100 | else: 101 | x_max = J.max(J.concatenate([c, x], axis=-1), axis=axis, keepdims=True) 102 | 103 | if c is None: 104 | if not mask: 105 | l_t = J.sum(J.exp(x - x_max), axis=axis, keepdims=True) 106 | 107 | else: 108 | l_t = J.sum(J.exp(x - x_max) * mask, axis=axis, keepdims=True) 109 | else: 110 | if not mask: 111 | l_t = J.sum(J.exp(x - x_max), axis=axis, keepdims=True) + \ 112 | J.sum(J.exp(c - x_max), axis=axis, keepdims=True) 113 | else: 114 | l_t = J.sum(J.exp(x - x_max) * mask, axis=axis, keepdims=True) + \ 115 | J.sum(J.exp(c - x_max), axis=axis, keepdims=True) 116 | 117 | x_t = J.log(J.maximum(l_t, err)) + x_max 118 | return x_t 119 | 120 | 121 | def softmax(x): 122 | return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) 123 | 124 | 125 | def masked_softmax(x, mask, err=1e-9): 126 | assert x.ndim == 2, 'support two-dimension' 127 | weights = softmax(x) 128 | weights *= mask 129 | weights = weights / (T.sum(weights, axis=-1)[:, None] + err) * mask 130 | return weights 131 | 132 | 133 | def cosine_sim(k, M): 134 | k_unit = k / (T.sqrt(T.sum(k**2)) + 1e-5) 135 | # T.patternbroadcast(k_unit.reshape((1,k_unit.shape[0])),(True,False)) 136 | k_unit = k_unit.dimshuffle(('x', 0)) 137 | k_unit.name = "k_unit" 138 | M_lengths = T.sqrt(T.sum(M**2, axis=1)).dimshuffle((0, 'x')) 139 | M_unit = M / (M_lengths + 1e-5) 140 | M_unit.name = "M_unit" 141 | return T.sum(k_unit * M_unit, axis=1) 142 | 143 | 144 | def cosine_sim2d(k, M): 145 | # k: (nb_samples, memory_width) 146 | # M: (nb_samples, memory_dim, memory_width) 147 | 148 | # norms of keys and memories 149 | k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5 # (nb_samples,) 150 | M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5 # (nb_samples, memory_dim,) 151 | 152 | k = k[:, None, :] # (nb_samples, 1, memory_width) 153 | k_norm = k_norm[:, None] # (nb_samples, 1) 154 | 155 | sim = T.sum(k * M, axis=2) # (nb_samples, memory_dim,) 156 | sim /= k_norm * M_norm # (nb_samples, memory_dim,) 157 | return sim 158 | 159 | 160 | def dot_2d(k, M, b=None, g=None): 161 | # k: (nb_samples, memory_width) 162 | # M: (nb_samples, memory_dim, memory_width) 163 | 164 | # norms of keys and memories 165 | # k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5 # (nb_samples,) 166 | # M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5 # (nb_samples, memory_dim,) 167 | 168 | k = k[:, None, :] # (nb_samples, 1, memory_width) 169 | value = k * M 170 | if b is not None: 171 | b = b[:, None, :] 172 | value *= b # (nb_samples, memory_dim,) 173 | 174 | if g is not None: 175 | g = g[None, None, :] 176 | value *= g 177 | 178 | sim = T.sum(value, axis=2) 179 | return sim 180 | 181 | 182 | def shift_convolve(weight, shift, shift_conv): 183 | shift = shift.dimshuffle((0, 'x')) 184 | return T.sum(shift * weight[shift_conv], axis=0) 185 | 186 | 187 | def shift_convolve2d(weight, shift, shift_conv): 188 | return T.sum(shift[:, :, None] * weight[:, shift_conv], axis=1) 189 | -------------------------------------------------------------------------------- /emolga/voc.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/CopyNet/dba24d58d505abaa169e05ee762987f37f41f566/emolga/voc.pkl -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- 1 | 
__author__ = 'jiataogu' -------------------------------------------------------------------------------- /experiments/bst_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'jiataogu' 3 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 4 | import numpy.random as n_rng 5 | 6 | 7 | class BSTnode(object): 8 | """ 9 | Representation of a node in a binary search tree. 10 | Has a left child, right child, and key value, and stores its subtree size. 11 | """ 12 | def __init__(self, parent, t): 13 | """Create a new leaf with key t.""" 14 | self.key = t 15 | self.parent = parent 16 | self.left = None 17 | self.right = None 18 | self.size = 1 19 | 20 | def update_stats(self): 21 | """Updates this node's size based on its children's sizes.""" 22 | self.size = (0 if self.left is None else self.left.size) + (0 if self.right is None else self.right.size) 23 | 24 | def insert(self, t, NodeType): 25 | """Insert key t into the subtree rooted at this node (updating subtree size).""" 26 | self.size += 1 27 | if t < self.key: 28 | if self.left is None: 29 | self.left = NodeType(self, t) 30 | return self.left 31 | else: 32 | return self.left.insert(t, NodeType) 33 | elif t > self.key: 34 | if self.right is None: 35 | self.right = NodeType(self, t) 36 | return self.right 37 | else: 38 | return self.right.insert(t, NodeType) 39 | else: 40 | return self 41 | 42 | def find(self, t): 43 | """Return the node for key t if it is in this tree, or None otherwise.""" 44 | if t == self.key: 45 | return self 46 | elif t < self.key: 47 | if self.left is None: 48 | return None 49 | else: 50 | return self.left.find(t) 51 | else: 52 | if self.right is None: 53 | return None 54 | else: 55 | return self.right.find(t) 56 | 57 | def rank(self, t): 58 | """Return the number of keys <= t in the subtree rooted at this node.""" 59 | left_size = 0 if self.left is None else self.left.size 60 | if t == self.key: 61 | return left_size + 1 62 | elif t < self.key: 63 | if self.left is None: 64 | return 0 65 | else: 66 | return self.left.rank(t) 67 | else: 68 | if self.right is None: 69 | return left_size + 1 70 | else: 71 | return self.right.rank(t) + left_size + 1 72 | 73 | def minimum(self): 74 | """Returns the node with the smallest key in the subtree rooted by this node.""" 75 | current = self 76 | while current.left is not None: 77 | current = current.left 78 | return current 79 | 80 | 81 | def successor(self): 82 | """Returns the node with the smallest key larger than this node's key, or None if this has the largest key in the tree.""" 83 | if self.right is not None: 84 | return self.right.minimum() 85 | current = self 86 | while current.parent is not None and current.parent.right is current: 87 | current = current.parent 88 | return current.parent 89 | 90 | def delete(self): 91 | """"Delete this node from the tree.""" 92 | if self.left is None or self.right is None: 93 | if self is self.parent.left: 94 | self.parent.left = self.left or self.right 95 | if self.parent.left is not None: 96 | self.parent.left.parent = self.parent 97 | else: 98 | self.parent.right = self.left or self.right 99 | if self.parent.right is not None: 100 | self.parent.right.parent = self.parent 101 | current = self.parent 102 | while current.key is not None: 103 | current.update_stats() 104 | current = current.parent 105 | return self 106 | else: 107 | s = self.successor() 108 | self.key, s.key = s.key, self.key 109 | return s.delete() 110 | 111 | def 
check(self, lokey, hikey): 112 | """Checks that the subtree rooted at t is a valid BST and all keys are between (lokey, hikey).""" 113 | if lokey is not None and self.key <= lokey: 114 | raise "BST RI violation" 115 | if hikey is not None and self.key >= hikey: 116 | raise "BST RI violation" 117 | if self.left is not None: 118 | if self.left.parent is not self: 119 | raise "BST RI violation" 120 | self.left.check(lokey, self.key) 121 | if self.right is not None: 122 | if self.right.parent is not self: 123 | raise "BST RI violation" 124 | self.right.check(self.key, hikey) 125 | if self.size != 1 + (0 if self.left is None else self.left.size) + (0 if self.right is None else self.right.size): 126 | raise "BST RI violation" 127 | 128 | def __repr__(self): 129 | return "" 130 | 131 | 132 | class BST(object): 133 | """ 134 | Simple binary search tree implementation, augmented with subtree sizes. 135 | This BST supports insert, find, and delete-min operations. 136 | Each tree contains some (possibly 0) BSTnode objects, representing nodes, 137 | and a pointer to the root. 138 | """ 139 | 140 | def __init__(self, NodeType=BSTnode): 141 | self.root = None 142 | self.NodeType = NodeType 143 | self.psroot = self.NodeType(None, None) 144 | 145 | def reroot(self): 146 | self.root = self.psroot.left 147 | 148 | def insert(self, t): 149 | """Insert key t into this BST, modifying it in-place.""" 150 | if self.root is None: 151 | self.psroot.left = self.NodeType(self.psroot, t) 152 | self.reroot() 153 | return self.root 154 | else: 155 | return self.root.insert(t, self.NodeType) 156 | 157 | def find(self, t): 158 | """Return the node for key t if is in the tree, or None otherwise.""" 159 | if self.root is None: 160 | return None 161 | else: 162 | return self.root.find(t) 163 | 164 | def rank(self, t): 165 | """The number of keys <= t in the tree.""" 166 | if self.root is None: 167 | return 0 168 | else: 169 | return self.root.rank(t) 170 | 171 | def delete(self, t): 172 | """Delete the node for key t if it is in the tree.""" 173 | node = self.find(t) 174 | deleted = self.root.delete() 175 | self.reroot() 176 | return deleted 177 | 178 | def check(self): 179 | if self.root is not None: 180 | self.root.check(None, None) 181 | 182 | def __str__(self): 183 | if self.root is None: 184 | return '' 185 | 186 | def nested(node): 187 | if node is None: 188 | return '0' 189 | head = str(node.key) 190 | left = nested(node.left) 191 | right = nested(node.right) 192 | 193 | if left == '0' and right == '0': 194 | return head 195 | else: 196 | return ' '.join(['(', head, left, right, ')']) 197 | 198 | return nested(self.root) 199 | 200 | # def recurse(node): 201 | # if node is None: 202 | # return [], 0, 0 203 | # label = str(node.key) 204 | # left_lines, left_pos, left_width = recurse(node.left) 205 | # right_lines, right_pos, right_width = recurse(node.right) 206 | # middle = max(right_pos + left_width - left_pos + 1, len(label), 2) 207 | # pos = left_pos + middle // 2 208 | # width = left_pos + middle + right_width - right_pos 209 | # while len(left_lines) < len(right_lines): 210 | # left_lines.append(' ' * left_width) 211 | # while len(right_lines) < len(left_lines): 212 | # right_lines.append(' ' * right_width) 213 | # if (middle - len(label)) % 2 == 1 and node.parent is not None and \ 214 | # node is node.parent.left and len(label) < middle: 215 | # label += '.' 
216 | # label = label.center(middle, '.') 217 | # if label[0] == '.': label = ' ' + label[1:] 218 | # if label[-1] == '.': label = label[:-1] + ' ' 219 | # lines = [' ' * left_pos + label + ' ' * (right_width - right_pos), 220 | # ' ' * left_pos + '/' + ' ' * (middle-2) + 221 | # '\\' + ' ' * (right_width - right_pos)] + \ 222 | # [left_line + ' ' * (width - left_width - right_width) + 223 | # right_line 224 | # for left_line, right_line in zip(left_lines, right_lines)] 225 | # return lines, pos, width 226 | # return '\n'.join(recurse(self.root) [0]) 227 | 228 | test1 = range(0, 100, 10) 229 | test2 = [31, 41, 59, 26, 53, 58, 97, 93, 23] 230 | test3 = "algorithms" 231 | 232 | 233 | def printsizes(node): 234 | if node is None: 235 | print "node is nil" 236 | else: 237 | print "node", node.key, "has a subtree of size", node.size 238 | 239 | 240 | def test(args=None, BSTtype=BST): 241 | import random, sys 242 | random.seed(19920206) 243 | if not args: 244 | args = sys.argv[1:] 245 | if not args: 246 | print 'usage: %s ' % \ 247 | sys.argv[0] 248 | sys.exit() 249 | elif len(args) == 1: 250 | items = (random.randrange(100) for i in xrange(int(args[0]))) 251 | else: 252 | items = [int(i) for i in args] 253 | 254 | tree = BSTtype() 255 | source = [] 256 | for item in items: 257 | tree.insert(item) 258 | source += [str(item)] 259 | print ' '.join(source) 260 | print tree 261 | 262 | 263 | def generate(): 264 | import random, sys 265 | random.seed(19920206) 266 | 267 | Lmin = 2 ** 2 - 1 268 | Lmax = 2 ** 4 - 1 269 | Xnum = 1000000 270 | voc = 26 271 | 272 | wfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'w') 273 | for id in xrange(Xnum): 274 | tree = BST() 275 | items = (random.randrange(voc) for i in 276 | xrange(random.randint(Lmin, Lmax))) 277 | source = [] 278 | for item in items: 279 | item = chr(item + 65) 280 | tree.insert(item) 281 | source += [str(item)] 282 | source = ' '.join(source) 283 | target = str(tree) 284 | line = '{0} -> {1}'.format(source, target) 285 | wfile.write(line + '\n') 286 | if id % 10000 == 0: 287 | print id 288 | 289 | 290 | def obtain_dataset(): 291 | rfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'r') 292 | line = rfile.readline() 293 | 294 | word2idx = dict() 295 | word2idx[''] = 0 296 | word2idx[''] = 1 297 | pairs = [] 298 | at = 2 299 | lines = 0 300 | while line: 301 | lines += 1 302 | line = line.strip() 303 | source, target = line.split('->') 304 | source = source.split() 305 | target = target.split() 306 | 307 | for w in source: 308 | if w not in word2idx: 309 | word2idx[w] = at 310 | at += 1 311 | for w in target: 312 | if w not in word2idx: 313 | word2idx[w] = at 314 | at += 1 315 | pairs.append((source, target)) 316 | if lines % 20000 == 0: 317 | print lines 318 | line = rfile.readline() 319 | 320 | idx2word = dict() 321 | for v, k in word2idx.items(): 322 | idx2word[k] = v 323 | 324 | Lmax = len(idx2word) 325 | print 'read dataset ok.' 
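generate() above writes one training pair per line in the form "insertion order -> tree string", where the tree string comes from BST.__str__ earlier in this file: a pre-order "( key left right )" nesting with '0' standing in for a missing child and bare keys for leaves. A small self-contained sketch of that serialization (it re-implements the insert/format logic with plain dicts rather than reusing the classes above):

    def insert(node, key):
        # plain dict-based BST insert; duplicates are ignored, as in BSTnode.insert
        if node is None:
            return {'key': key, 'left': None, 'right': None}
        if key < node['key']:
            node['left'] = insert(node['left'], key)
        elif key > node['key']:
            node['right'] = insert(node['right'], key)
        return node

    def nested(node):
        # same format as BST.__str__: '0' for a missing child, bare key for a leaf
        if node is None:
            return '0'
        left, right = nested(node['left']), nested(node['right'])
        if left == '0' and right == '0':
            return str(node['key'])
        return ' '.join(['(', str(node['key']), left, right, ')'])

    root = None
    for item in ['C', 'A', 'B']:
        root = insert(root, item)
    print(nested(root))   # ( C ( A 0 B ) 0 )  -> the line written would be "C A B -> ( C ( A 0 B ) 0 )"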
326 | print Lmax 327 | for i in xrange(Lmax): 328 | print idx2word[i] 329 | 330 | def build_data(data): 331 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[]) 332 | for pair in data: 333 | source, target = pair 334 | A = [word2idx[w] for w in source] 335 | B = [word2idx[w] for w in target] 336 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32') 337 | C = [0 if w not in source else source.index(w) + Lmax for w in target] 338 | 339 | instance['text'] += [source] 340 | instance['summary'] += [target] 341 | instance['source'] += [A] 342 | instance['target'] += [B] 343 | # instance['cc_matrix'] += [C] 344 | instance['target_c'] += [C] 345 | 346 | print instance['target'][5000] 347 | print instance['target_c'][5000] 348 | return instance 349 | 350 | train_set = build_data(pairs[100000:]) 351 | test_set = build_data(pairs[:100000]) 352 | serialize_to_file([train_set, test_set, idx2word, word2idx], 353 | '/home/thoma/Work/Dial-DRL/dataset/BST_1M.data.pkl') 354 | 355 | 356 | if __name__ == '__main__': 357 | generate() 358 | obtain_dataset() -------------------------------------------------------------------------------- /experiments/bst_vest.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | This is the implementation of Copy-NET 4 | We start from the basic Seq2seq framework for a auto-encoder. 5 | """ 6 | import logging 7 | import time 8 | import numpy as np 9 | import sys 10 | import copy 11 | 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | from experiments.config import setup_lcsts, setup_weibo, setup_syn, setup_bst 14 | from emolga.utils.generic_utils import * 15 | from emolga.models.covc_encdec import NRM 16 | from emolga.models.encdec import NRM as NRM0 17 | from emolga.dataset.build_dataset import deserialize_from_file 18 | from collections import OrderedDict 19 | from fuel import datasets 20 | from fuel import transformers 21 | from fuel import schemes 22 | 23 | # setup = setup_lcsts 24 | # setup = setup_syn 25 | setup = setup_bst 26 | 27 | 28 | def init_logging(logfile): 29 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 30 | datefmt='%m/%d/%Y %H:%M:%S' ) 31 | fh = logging.FileHandler(logfile) 32 | # ch = logging.StreamHandler() 33 | 34 | fh.setFormatter(formatter) 35 | # ch.setFormatter(formatter) 36 | # fh.setLevel(logging.INFO) 37 | # ch.setLevel(logging.INFO) 38 | # logging.getLogger().addHandler(ch) 39 | logging.getLogger().addHandler(fh) 40 | logging.getLogger().setLevel(logging.INFO) 41 | 42 | return logging 43 | 44 | # prepare logging. 45 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 46 | config = setup() # load settings. 
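build_data() in bst_dataset.py above re-encodes each target token that also occurs in the source as its first source position plus Lmax (the vocabulary size), so any id >= Lmax means "copy from that source position", while 0 marks tokens that cannot be copied. A toy illustration (the value of Lmax and the token lists are made up):

    Lmax = 30                      # assumed vocabulary size for this toy example
    source = ['G', 'C', 'K']
    target = ['C', 'Z', 'G']
    target_c = [0 if w not in source else source.index(w) + Lmax for w in target]
    print(target_c)                # [31, 0, 30]: 'C' copies position 1, 'Z' is not copiable, 'G' copies position 0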
47 | for w in config: 48 | print '{0}={1}'.format(w, config[w]) 49 | 50 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 51 | n_rng = np.random.RandomState(config['seed']) 52 | np.random.seed(config['seed']) 53 | rng = RandomStreams(n_rng.randint(2 ** 30)) 54 | logger.info('Start!') 55 | 56 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 57 | 58 | if config['voc_size'] == -1: # not use unk 59 | config['enc_voc_size'] = len(word2idx) 60 | config['dec_voc_size'] = config['enc_voc_size'] 61 | else: 62 | config['enc_voc_size'] = config['voc_size'] 63 | config['dec_voc_size'] = config['enc_voc_size'] 64 | 65 | samples = len(train_set['source']) 66 | logger.info('build dataset done. ' + 67 | 'dataset size: {} ||'.format(samples) + 68 | 'vocabulary size = {0}/ batch size = {1}'.format( 69 | config['dec_voc_size'], config['batch_size'])) 70 | 71 | 72 | def build_data(data): 73 | # create fuel dataset. 74 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 75 | ('target', data['target']), 76 | ('target_c', data['target_c']), 77 | ])) 78 | dataset.example_iteration_scheme \ 79 | = schemes.ShuffledExampleScheme(dataset.num_examples) 80 | return dataset 81 | 82 | 83 | train_data = build_data(train_set) 84 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 85 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 86 | 87 | # train_data_plain = zip(*(train_set['source'], train_set['target'])) 88 | # test_data_plain = zip(*(test_set['source'], test_set['target'])) 89 | 90 | train_size = len(train_data_plain) 91 | test_size = len(test_data_plain) 92 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 93 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 94 | logger.info('load the data ok.') 95 | notrain = False 96 | 97 | # build the agent 98 | if config['copynet']: 99 | agent = NRM(config, n_rng, rng, mode=config['mode'], 100 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 101 | else: 102 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 103 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 104 | 105 | agent.build_() 106 | if notrain: 107 | agent.compile_('display') 108 | else: 109 | agent.compile_('all') 110 | print 'compile ok.' 
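build_data() above wraps the index lists in a Fuel IndexableDataset with a shuffled example scheme, and output_stream() (defined just below) batches it with a ConstantScheme and zero-pads the sequences via the Padding transformer, which also emits *_mask sources. A minimal sketch of that pipeline on toy data, assuming a working Fuel install and following the source names used in this script:

    from collections import OrderedDict
    from fuel import datasets
    from fuel import schemes
    from fuel import transformers

    toy = OrderedDict([('source', [[3, 5, 2], [7, 1]]),
                       ('target', [[3, 5], [7, 1, 4]])])
    dataset = datasets.IndexableDataset(indexables=toy)
    dataset.example_iteration_scheme = schemes.ShuffledExampleScheme(dataset.num_examples)

    stream = transformers.Batch(dataset.get_example_stream(),
                                iteration_scheme=schemes.ConstantScheme(2))
    stream = transformers.Padding(stream, mask_sources=('source', 'target'))
    batch = next(stream.get_epoch_iterator(as_dict=True))
    print(sorted(batch.keys()))     # ['source', 'source_mask', 'target', 'target_mask']
    print(batch['source'].shape)    # (2, 3): both sequences zero-padded to the longest one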
111 | 112 | # load the model 113 | # agent.load(config['path_h5'] + 114 | # '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format('20160229-105153', 1, config['modelname'])) 115 | 116 | echo = 0 117 | epochs = 10 118 | skip = -1 # 25000 119 | if echo > 0: 120 | tmark = '20160229-105153' # '20160227-013418' # copynet multi-source model 121 | agent.load(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname'])) 122 | 123 | while echo < epochs: 124 | echo += 1 125 | loss = [] 126 | 127 | def output_stream(dataset, batch_size, size=1): 128 | data_stream = dataset.get_example_stream() 129 | data_stream = transformers.Batch(data_stream, 130 | iteration_scheme=schemes.ConstantScheme(batch_size)) 131 | 132 | # add padding and masks to the dataset 133 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 134 | return data_stream 135 | 136 | def prepare_batch(batch, mask, fix_len=None): 137 | data = batch[mask].astype('int32') 138 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 139 | 140 | def cut_zeros(data, fix_len=None): 141 | if fix_len is not None: 142 | return data[:, : fix_len] 143 | for k in range(data.shape[1] - 1, 0, -1): 144 | data_col = data[:, k].sum() 145 | if data_col > 0: 146 | return data[:, : k + 2] 147 | return data 148 | data = cut_zeros(data, fix_len) 149 | return data 150 | 151 | def cc_martix(source, target): 152 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 153 | for k in xrange(source.shape[0]): 154 | for j in xrange(target.shape[1]): 155 | for i in xrange(source.shape[1]): 156 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 157 | cc[k][j][i] = 1. 158 | return cc 159 | 160 | def unk_filter(data): 161 | if config['voc_size'] == -1: 162 | return copy.copy(data) 163 | else: 164 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 165 | data = copy.copy(data * mask + (1 - mask)) 166 | return data 167 | 168 | # training 169 | if not notrain: 170 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 171 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 172 | progbar = Progbar(train_size / config['batch_size']) 173 | for it, batch in enumerate(train_batches): 174 | 175 | # skip some iterations 176 | if echo == 1 and it < skip: 177 | continue 178 | 179 | # obtain data 180 | data_s = prepare_batch(batch, 'source') 181 | data_t = prepare_batch(batch, 'target') 182 | if config['copynet']: 183 | data_c = cc_martix(data_s, data_t) 184 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 185 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 186 | else: 187 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 188 | 189 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 190 | 191 | if it % 200 == 0: 192 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 193 | logger.info('generating [training set] samples') 194 | for _ in xrange(5): 195 | idx = int(np.floor(n_rng.rand() * train_size)) 196 | train_s, train_t = train_data_plain[idx] 197 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 198 | np.asarray(train_t, dtype='int32'), 199 | idx2word, 200 | np.asarray(unk_filter(train_s), dtype='int32')) 201 | print '*' * 50 202 | 203 | logger.info('generating [testing set] samples') 204 | for _ in xrange(5): 205 | idx = int(np.floor(n_rng.rand() * test_size)) 206 | test_s, test_t = 
test_data_plain[idx] 207 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 208 | np.asarray(test_t, dtype='int32'), 209 | idx2word, 210 | np.asarray(unk_filter(test_s), dtype='int32')) 211 | print '*' * 50 212 | 213 | # save the weights. 214 | if it % 5000 == 0: 215 | agent.save(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname'])) 216 | 217 | if (it % 5000 == 0) and it > 0: 218 | print 'testing accuracy !!' 219 | 220 | def analysis_(data_plain, t_idx, mode='Training'): 221 | progbar_tr = Progbar(2000) 222 | print '\n' + '__' * 50 223 | cpy, cpy_pos = 0, 0 224 | for it, idx in enumerate(t_idx): 225 | train_s, train_t = data_plain[idx] 226 | c = float(agent.analyse_(np.asarray(train_s, dtype='int32'), 227 | np.asarray(train_t, dtype='int32'), 228 | idx2word)) 229 | # copy mode 230 | cpy += 1 231 | cpy_pos += c 232 | progbar_tr.update(it + 1, [('Copy', cpy_pos)]) 233 | logger.info('\n{0} Accuracy:' + 234 | '\t{1}/{2} = {3}%'.format(mode, cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 235 | print '==' * 50 236 | 237 | # analysis_(train_data_plain, tr_idx, 'Training') 238 | analysis_(test_data_plain, ts_idx, 'Testing') 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- /experiments/config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/CopyNet/dba24d58d505abaa169e05ee762987f37f41f566/experiments/config.pyc -------------------------------------------------------------------------------- /experiments/copynet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | 9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 10 | from experiments.config import setup 11 | from emolga.utils.generic_utils import * 12 | from emolga.models.encdec import * 13 | from emolga.dataset.build_dataset import deserialize_from_file 14 | from collections import OrderedDict 15 | from fuel import datasets 16 | from fuel import transformers 17 | from fuel import schemes 18 | 19 | 20 | def init_logging(logfile): 21 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 22 | datefmt='%m/%d/%Y %H:%M:%S' ) 23 | fh = logging.FileHandler(logfile) 24 | # ch = logging.StreamHandler() 25 | 26 | fh.setFormatter(formatter) 27 | # ch.setFormatter(formatter) 28 | # fh.setLevel(logging.INFO) 29 | # ch.setLevel(logging.INFO) 30 | # logging.getLogger().addHandler(ch) 31 | logging.getLogger().addHandler(fh) 32 | logging.getLogger().setLevel(logging.INFO) 33 | 34 | return logging 35 | 36 | # prepare logging. 37 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 38 | config = setup() # load settings. 
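cc_martix() in bst_vest.py above (original spelling kept) materialises the copy supervision used by the CopyNet variant: for every sample k it builds a (target length x source length) 0/1 matrix whose entry (j, i) is 1 exactly when target token j equals the non-padding source token at position i. A toy numpy sketch of the same computation:

    import numpy as np

    def cc_martix(source, target):
        # cc[k, j, i] = 1 when target token j of sample k equals non-padding source token i
        cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
        for k in range(source.shape[0]):
            for j in range(target.shape[1]):
                for i in range(source.shape[1]):
                    if source[k, i] == target[k, j] and source[k, i] > 0:
                        cc[k, j, i] = 1.
        return cc

    src = np.array([[4, 9, 4, 0]], dtype='int32')   # one sample, zero-padded
    trg = np.array([[9, 4, 0]], dtype='int32')
    print(cc_martix(src, trg)[0])
    # [[0. 1. 0. 0.]   token 9 is copiable from source position 1
    #  [1. 0. 1. 0.]   token 4 occurs at source positions 0 and 2
    #  [0. 0. 0. 0.]]  the padding id 0 is never marked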
39 | for w in config: 40 | print '{0}={1}'.format(w, config[w]) 41 | 42 | logger = init_logging(config['path_log'] + '/experiments.Copy.id={}.log'.format(tmark)) 43 | n_rng = np.random.RandomState(config['seed']) 44 | np.random.seed(config['seed']) 45 | rng = RandomStreams(n_rng.randint(2 ** 30)) 46 | logger.info('Start!') 47 | 48 | idx2word, word2idx, idx2word_o, word2idx_o \ 49 | = deserialize_from_file(config['voc']) 50 | idx2word_o[0] = '' 51 | word2idx_o[''] = 0 52 | 53 | source, target, origin = deserialize_from_file(config['dataset']) 54 | samlpes = len(source) 55 | 56 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 57 | config['dec_voc_size'] = config['enc_voc_size'] 58 | logger.info('build dataset done. ' + 59 | 'dataset size: {} ||'.format(samlpes) + 60 | 'vocabulary size = {0}/ batch size = {1}'.format( 61 | config['dec_voc_size'], config['batch_size'])) 62 | 63 | 64 | def build_data(source, target): 65 | # create fuel dataset. 66 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', source), ('target', target)])) 67 | dataset.example_iteration_scheme \ 68 | = schemes.ShuffledExampleScheme(dataset.num_examples) 69 | return dataset, len(source) 70 | 71 | 72 | train_data, train_size = build_data(source[int(0.2 * samlpes):], target[int(0.2 * samlpes):]) 73 | train_data_plain = zip(*(source[int(0.2 * samlpes):], target[int(0.2 * samlpes):], origin[int(0.2 * samlpes):])) 74 | test_data_plain = zip(*(source[:int(0.2 * samlpes)], target[:int(0.2 * samlpes)], origin[:int(0.2 * samlpes)])) 75 | test_size = len(test_data_plain) 76 | logger.info('load the data ok.') 77 | 78 | # build the agent 79 | agent = NRM(config, n_rng, rng, mode=config['mode'], 80 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 81 | agent.build_() 82 | agent.compile_('all') 83 | print 'compile ok.' 
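The vocabulary sizing above takes the largest index in word2idx plus one, so every index the encoder/decoder can encounter maps to a valid embedding row. A quick toy check of that invariant (token names are made up; this uses max over values, which has the same effect as the max(zip(*items)[1]) idiom above):

    word2idx_toy = {'pad': 0, 'unk': 1, 'mary': 2, 'garden': 3}
    enc_voc_size = max(word2idx_toy.values()) + 1
    assert all(idx < enc_voc_size for idx in word2idx_toy.values())
    print(enc_voc_size)   # 4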
84 | 85 | echo = 0 86 | epochs = 10 87 | while echo < epochs: 88 | echo += 1 89 | loss = [] 90 | 91 | def output_stream(dataset, batch_size, size=1): 92 | data_stream = dataset.get_example_stream() 93 | data_stream = transformers.Batch(data_stream, 94 | iteration_scheme=schemes.ConstantScheme(batch_size)) 95 | 96 | # add padding and masks to the dataset 97 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 98 | return data_stream 99 | 100 | def prepare_batch(batch, mask): 101 | data = batch[mask].astype('int32') 102 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 103 | 104 | def cut_zeros(data): 105 | for k in range(data.shape[1] - 1, 0, -1): 106 | data_col = data[:, k].sum() 107 | if data_col > 0: 108 | return data[:, : k + 2] 109 | return data 110 | data = cut_zeros(data) 111 | return data 112 | 113 | # training 114 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 115 | logger.info('Epoch = {} -> Training Set Learning...'.format(echo)) 116 | progbar = Progbar(train_size / config['batch_size']) 117 | for it, batch in enumerate(train_batches): 118 | # obtain data 119 | data_s, data_t = prepare_batch(batch, 'source'), prepare_batch(batch, 'target') 120 | loss += [agent.train_(data_s, data_t)] 121 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 122 | 123 | if it % 500 == 0: 124 | logger.info('generating [training set] samples') 125 | for _ in xrange(5): 126 | idx = int(np.floor(n_rng.rand() * train_size)) 127 | train_s, train_t, train_o = train_data_plain[idx] 128 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 129 | np.asarray(train_t, dtype='int32'), 130 | idx2word, np.asarray(train_o, dtype='int32'), idx2word_o) 131 | print '*' * 50 132 | 133 | logger.info('generating [testing set] samples') 134 | for _ in xrange(5): 135 | idx = int(np.floor(n_rng.rand() * test_size)) 136 | test_s, test_t, test_o = test_data_plain[idx] 137 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 138 | np.asarray(test_t, dtype='int32'), 139 | idx2word, np.asarray(test_o, dtype='int32'), idx2word_o) 140 | print '*' * 50 141 | -------------------------------------------------------------------------------- /experiments/copynet_input.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | import time 4 | import numpy as np 5 | import sys 6 | import copy 7 | 8 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 9 | from experiments.config import setup_lcsts 10 | from emolga.utils.generic_utils import * 11 | from emolga.models.covc_encdec import NRM 12 | from emolga.models.encdec import NRM as NRM0 13 | from emolga.dataset.build_dataset import deserialize_from_file 14 | from collections import OrderedDict 15 | from fuel import datasets 16 | from fuel import transformers 17 | from fuel import schemes 18 | 19 | setup = setup_lcsts 20 | 21 | 22 | def init_logging(logfile): 23 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 24 | datefmt='%m/%d/%Y %H:%M:%S' ) 25 | fh = logging.FileHandler(logfile) 26 | # ch = logging.StreamHandler() 27 | 28 | fh.setFormatter(formatter) 29 | # ch.setFormatter(formatter) 30 | # fh.setLevel(logging.INFO) 31 | # ch.setLevel(logging.INFO) 32 | # logging.getLogger().addHandler(ch) 33 | logging.getLogger().addHandler(fh) 34 | logging.getLogger().setLevel(logging.INFO) 35 | return logging 36 | 37 | # prepare 
logging. 38 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 39 | config = setup() # load settings. 40 | for w in config: 41 | print '{0}={1}'.format(w, config[w]) 42 | 43 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 44 | n_rng = np.random.RandomState(config['seed']) 45 | np.random.seed(config['seed']) 46 | rng = RandomStreams(n_rng.randint(2 ** 30)) 47 | logger.info('Start!') 48 | 49 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 50 | if config['voc_size'] == -1: # not use unk 51 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 52 | config['dec_voc_size'] = config['enc_voc_size'] 53 | else: 54 | config['enc_voc_size'] = config['voc_size'] 55 | config['dec_voc_size'] = config['enc_voc_size'] 56 | 57 | samples = len(train_set['source']) 58 | 59 | logger.info('build dataset done. ' + 60 | 'dataset size: {} ||'.format(samples) + 61 | 'vocabulary size = {0}/ batch size = {1}'.format( 62 | config['dec_voc_size'], config['batch_size'])) 63 | 64 | 65 | def unk_filter(data): 66 | if config['voc_size'] == -1: 67 | return copy.copy(data) 68 | else: 69 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 70 | data = copy.copy(data * mask + (1 - mask)) 71 | return data 72 | 73 | source = '临近 岁末 , 新 基金 发行 步入 旺季 , 11 月份 以来 单周 新基 ' + \ 74 | '发行 数 始终保持 35 只 以上 的 高位 , 仅 11 月 25 日 一天 , ' + \ 75 | '就 有 12 只 基金 同时 发售 。 国内 首只 公募 对冲 混合型 基金 — 嘉实 绝对 收益 策略 ' + \ 76 | '定期 混合 基金 自 发行 首日 便 备受 各界 青睐 , 每日 认购 均 能 达到 上 亿' 77 | target = '首只 公募 对冲 基金 每日 吸金 上 亿' 78 | 79 | test_s = [word2idx[w.decode('utf-8')] for w in source.split()] 80 | test_t = [word2idx[w.decode('utf-8')] for w in target.split()] 81 | 82 | logger.info('load the data ok.') 83 | 84 | logger.info('Evaluate CopyNet') 85 | echo = 9 86 | tmark = '20160226-164053' # '20160221-025049' # copy-net model [no unk] 87 | config['copynet'] = True 88 | agent = NRM(config, n_rng, rng, mode=config['mode'], 89 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 90 | agent.build_() 91 | agent.compile_('display') 92 | agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 93 | logger.info('generating [testing set] samples') 94 | 95 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 96 | np.asarray(test_t, dtype='int32'), 97 | idx2word, np.asarray(unk_filter(test_s), dtype='int32')) 98 | logger.info('Complete!') -------------------------------------------------------------------------------- /experiments/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocess the bAbI datset. 
3 | """ 4 | import logging 5 | import os 6 | import sys 7 | import numpy.random as n_rng 8 | from emolga.dataset.build_dataset import serialize_to_file 9 | 10 | data_path = './dataset/bAbI/en-10k/' 11 | data = [] 12 | n_rng.seed(19920206) 13 | 14 | for p, folders, docs in os.walk(data_path): 15 | for doc in docs: 16 | with open(os.path.join(p, doc)) as f: 17 | l = f.readline() 18 | while l: 19 | l = l.strip().lower() 20 | l = l[l.find(' ') + 1:] 21 | if len(l.split('\t')) == 1: 22 | data += [l[:-1].split()] 23 | l = f.readline() 24 | 25 | idx2word = dict(enumerate(set([w for l in data for w in l]), 1)) 26 | word2idx = {v: k for k, v in idx2word.items()} 27 | 28 | persons = [1, 6, 24, 37, 38, 47, 60, 61, 73, 74, 90, 94, 107, 110, 114] 29 | colors = [3, 20, 34, 48, 99, 121] 30 | shapes = [11, 15, 27, 99] 31 | 32 | 33 | def repeat_name(l): 34 | ll = [] 35 | for word in l: 36 | if word2idx[word] in persons: 37 | k = n_rng.randint(5) + 1 38 | ll += [idx2word[persons[i]] for i in n_rng.randint(len(persons), size=k).tolist()] 39 | elif word2idx[word] in colors: 40 | k = n_rng.randint(5) + 1 41 | ll += [idx2word[colors[i]] for i in n_rng.randint(len(colors), size=k).tolist()] 42 | elif word2idx[word] in shapes: 43 | k = n_rng.randint(5) + 1 44 | ll += [idx2word[shapes[i]] for i in n_rng.randint(len(shapes), size=k).tolist()] 45 | else: 46 | ll += [word] 47 | return ll 48 | 49 | data_rep = [repeat_name(l) for l in data] 50 | origin = [[word2idx[w] for w in l] for l in data_rep] 51 | 52 | def replace(word): 53 | if word2idx[word] in [1, 6, 24, 37, 38, 47, 60, 61, 73, 74, 90, 94, 107, 110, 114]: 54 | return '' 55 | elif word2idx[word] in [3, 20, 34, 48, 99, 121]: 56 | return '' 57 | elif word2idx[word] in [11, 15, 27, 99]: 58 | return '' 59 | else: 60 | return word 61 | 62 | # prepare the vocabulary 63 | data_clean = [[replace(w) for w in l] for l in data_rep] 64 | idx2word2 = dict(enumerate(set([w for l in data_clean for w in l]), 1)) 65 | idx2word2[0] = '' 66 | word2idx2 = {v: k for k, v in idx2word2.items()} 67 | Lmax = len(idx2word2) 68 | 69 | for k in xrange(len(idx2word2)): 70 | print k, '\t', idx2word2[k] 71 | print 'Max: {}'.format(Lmax) 72 | 73 | serialize_to_file([idx2word2, word2idx2, idx2word, word2idx], './dataset/bAbI/voc-b.pkl') 74 | 75 | # get ready for the dataset. 
76 | source = [[word2idx2[w] for w in l] for l in data_clean] 77 | target = [[word2idx2[w] if w not in ['', '', ''] 78 | else it + Lmax 79 | for it, w in enumerate(l)] for l in data_clean] 80 | 81 | 82 | def print_str(data): 83 | for d in data: 84 | print ' '.join(str(w) for w in d) 85 | 86 | 87 | print_str(data[10000: 10005]) 88 | print_str(data_rep[10000: 10005]) 89 | print_str(data_clean[10000: 10005]) 90 | print_str(source[10000: 10005]) 91 | print_str(target[10000: 10005]) 92 | 93 | serialize_to_file([source, target, origin], './dataset/bAbI/dataset-b.pkl') 94 | -------------------------------------------------------------------------------- /experiments/lcsts_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import chardet 3 | import sys 4 | import numpy as np 5 | import jieba as jb 6 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 7 | 8 | word2idx = dict() 9 | wordfreq = dict() 10 | word2idx[''] = 0 11 | word2idx[''] = 1 12 | 13 | segment = False # True 14 | 15 | # training set 16 | pairs = [] 17 | f = open('./dataset/LCSTS/PART_I/PART_full.txt', 'r') 18 | line = f.readline().strip() 19 | at = 2 20 | lines = 0 21 | while line: 22 | if line == '': 23 | summary = f.readline().strip().decode('utf-8') 24 | if segment: 25 | summary = [w for w in jb.cut(summary)] 26 | 27 | for w in summary: 28 | if w not in wordfreq: 29 | wordfreq[w] = 1 30 | else: 31 | wordfreq[w] += 1 32 | # if w not in word2idx: 33 | # word2idx[w] = at 34 | # at += 1 35 | 36 | f.readline() 37 | f.readline() 38 | text = f.readline().strip().decode('utf-8') 39 | if segment: 40 | text = [w for w in jb.cut(text)] 41 | for w in text: 42 | if w not in wordfreq: 43 | wordfreq[w] = 1 44 | else: 45 | wordfreq[w] += 1 46 | # if w not in word2idx: 47 | # word2idx[w] = at 48 | # at += 1 49 | 50 | pair = (text, summary) 51 | pairs.append(pair) 52 | lines += 1 53 | if lines % 20000 == 0: 54 | print lines 55 | line = f.readline().strip() 56 | 57 | # testing set 58 | tests = [] 59 | f = open('./dataset/LCSTS/PART_II/PART_II.txt', 'r') 60 | line = f.readline().strip() 61 | lines = 0 62 | while line: 63 | if line == '': 64 | summary = f.readline().strip().decode('utf-8') 65 | if segment: 66 | summary = [w for w in jb.cut(summary)] 67 | 68 | for w in summary: 69 | if w not in wordfreq: 70 | wordfreq[w] = 1 71 | else: 72 | wordfreq[w] += 1 73 | # if w not in word2idx: 74 | # word2idx[w] = at 75 | # at += 1 76 | 77 | f.readline() 78 | f.readline() 79 | text = f.readline().strip().decode('utf-8') 80 | if segment: 81 | text = [w for w in jb.cut(text)] 82 | for w in text: 83 | if w not in wordfreq: 84 | wordfreq[w] = 1 85 | else: 86 | wordfreq[w] += 1 87 | # if w not in word2idx: 88 | # word2idx[w] = at 89 | # at += 1 90 | 91 | pair = (text, summary) 92 | tests.append(pair) 93 | lines += 1 94 | if lines % 20000 == 0: 95 | print lines 96 | line = f.readline().strip() 97 | 98 | print len(pairs), len(tests) 99 | 100 | # sort the vocabulary 101 | wordfreq = sorted(wordfreq.items(), key=lambda a:a[1], reverse=True) 102 | for w in wordfreq: 103 | word2idx[w[0]] = at 104 | at += 1 105 | 106 | idx2word = {k: v for v, k in word2idx.items()} 107 | Lmax = len(idx2word) 108 | print 'read dataset ok.' 
109 | print Lmax 110 | for i in xrange(Lmax): 111 | print idx2word[i].encode('utf-8') 112 | 113 | # use character-based model [on] 114 | # use word-based model [off] 115 | 116 | 117 | def build_data(data): 118 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[]) 119 | for pair in data: 120 | source, target = pair 121 | A = [word2idx[w] for w in source] 122 | B = [word2idx[w] for w in target] 123 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32') 124 | C = [0 if w not in source else source.index(w) + Lmax for w in target] 125 | 126 | instance['text'] += [source] 127 | instance['summary'] += [target] 128 | instance['source'] += [A] 129 | instance['target'] += [B] 130 | # instance['cc_matrix'] += [C] 131 | instance['target_c'] += [C] 132 | 133 | print instance['target'][5000] 134 | print instance['target_c'][5000] 135 | return instance 136 | 137 | 138 | train_set = build_data(pairs) 139 | test_set = build_data(tests) 140 | serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/lcsts_data-char-full.pkl') 141 | -------------------------------------------------------------------------------- /experiments/lcsts_rouge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluation using ROUGE for LCSTS dataset. 3 | """ 4 | # load the testing set. 5 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 6 | import jieba as jb 7 | import logging 8 | import copy 9 | from pyrouge import Rouge155 10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 11 | from experiments.config import setup_lcsts, setup_weibo, setup_syn 12 | from emolga.utils.generic_utils import * 13 | from emolga.models.covc_encdec import NRM 14 | from emolga.models.encdec import NRM as NRM0 15 | from emolga.dataset.build_dataset import deserialize_from_file 16 | from collections import OrderedDict 17 | from fuel import datasets 18 | from fuel import transformers 19 | from fuel import schemes 20 | from pprint import pprint 21 | setup = setup_lcsts 22 | 23 | 24 | def build_evaluation(train_set, segment): 25 | _, _, idx2word, word2idx = deserialize_from_file(train_set) 26 | pairs = [] 27 | f = open('./dataset/LCSTS/PART_III/PART_III.txt', 'r') 28 | line = f.readline().strip() 29 | lines = 0 30 | segment = segment 31 | while line: 32 | if '' in line: 33 | score = int(line[13]) 34 | if score >= 3: 35 | f.readline() 36 | summary = f.readline().strip().decode('utf-8') 37 | if segment: 38 | summary = [w for w in jb.cut(summary)] 39 | target = [] 40 | for w in summary: 41 | if w not in word2idx: 42 | word2idx[w] = len(word2idx) 43 | idx2word[len(idx2word)] = w 44 | target += [word2idx[w]] 45 | 46 | f.readline() 47 | f.readline() 48 | text = f.readline().strip().decode('utf-8') 49 | if segment: 50 | text = [w for w in jb.cut(text)] 51 | source = [] 52 | for w in text: 53 | if w not in word2idx: 54 | word2idx[w] = len(word2idx) 55 | idx2word[len(idx2word)] = w 56 | source += [word2idx[w]] 57 | 58 | pair = (text, summary, score, source, target) 59 | pairs.append(pair) 60 | lines += 1 61 | if lines % 1000 == 0: 62 | print lines 63 | line = f.readline().strip() 64 | print 'lines={}'.format(len(pairs)) 65 | return pairs, word2idx, idx2word 66 | 67 | # words, wwi, wiw = build_evaluation('./dataset/lcsts_data-word-full.pkl', True) 68 | # chars, cwi, ciw = build_evaluation('./dataset/lcsts_data-char-full.pkl', False) 69 | # 70 | # serialize_to_file([words, chars, [wwi, wiw], [cwi, ciw]], 
'./dataset/lcsts_evaluate_data.pkl') 71 | 72 | 73 | def init_logging(logfile): 74 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 75 | datefmt='%m/%d/%Y %H:%M:%S' ) 76 | fh = logging.FileHandler(logfile) 77 | # ch = logging.StreamHandler() 78 | 79 | fh.setFormatter(formatter) 80 | # ch.setFormatter(formatter) 81 | # fh.setLevel(logging.INFO) 82 | # ch.setLevel(logging.INFO) 83 | # logging.getLogger().addHandler(ch) 84 | logging.getLogger().addHandler(fh) 85 | logging.getLogger().setLevel(logging.INFO) 86 | 87 | return logging 88 | 89 | 90 | # prepare logging. 91 | config = setup() # load settings. 92 | for w in config: 93 | print '{0}={1}'.format(w, config[w]) 94 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 95 | logger = init_logging(config['path_log'] + '/experiments.LCSTS.Eval.id={}.log'.format(tmark)) 96 | n_rng = np.random.RandomState(config['seed']) 97 | np.random.seed(config['seed']) 98 | rng = RandomStreams(n_rng.randint(2 ** 30)) 99 | logger.info('Start!') 100 | 101 | segment = config['segment'] 102 | word_set, char_set, word_voc, char_voc = deserialize_from_file('./dataset/lcsts_evaluate_data.pkl') 103 | 104 | if segment: 105 | eval_set = word_set 106 | word2idx, idx2word = word_voc 107 | else: 108 | eval_set = char_set 109 | word2idx, idx2word = char_voc 110 | 111 | if config['voc_size'] == -1: # not use unk 112 | config['enc_voc_size'] = len(word2idx) 113 | config['dec_voc_size'] = config['enc_voc_size'] 114 | else: 115 | config['enc_voc_size'] = config['voc_size'] 116 | config['dec_voc_size'] = config['enc_voc_size'] 117 | 118 | samples = len(eval_set) 119 | logger.info('build dataset done. ' + 120 | 'dataset size: {} ||'.format(samples) + 121 | 'vocabulary size = {0}/ batch size = {1}'.format( 122 | config['dec_voc_size'], config['batch_size'])) 123 | logger.info('load the data ok.') 124 | 125 | # build the agent 126 | if config['copynet']: 127 | agent = NRM(config, n_rng, rng, mode=config['mode'], 128 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 129 | else: 130 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 131 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 132 | 133 | agent.build_() 134 | agent.compile_('display') 135 | print 'compile ok.' 
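# Illustrative note on the evaluation loop further below: both the reference and the generated summary are
# re-tokenised character by character, each character id wrapped as a pseudo-word 't<id>', so that pyrouge
# (which expects whitespace-separated tokens) effectively reports character-level ROUGE; the per-sample
# scores are summed and a running average is printed. A minimal sketch of that wrapping (toy mapping only):
def to_char_tokens(text, char2idx):
    # e.g. to_char_tokens(u'abc', {u'a': 5, u'b': 9, u'c': 2}) -> 't5 t9 t2'
    return ' '.join('t{}'.format(char2idx[c]) for c in text)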
136 | 137 | # load the model 138 | agent.load(config['trained_model']) 139 | 140 | 141 | def unk_filter(data): 142 | if config['voc_size'] == -1: 143 | return copy.copy(data) 144 | else: 145 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 146 | data = copy.copy(data * mask + (1 - mask)) 147 | return data 148 | 149 | rouge = Rouge155(n_words=40) 150 | evalsets = {'rouge_1_f_score': 'R1', 151 | 'rouge_2_f_score': 'R2', 152 | 'rouge_3_f_score': 'R3', 153 | 'rouge_4_f_score': 'R4', 154 | 'rouge_l_f_score': 'RL', 155 | 'rouge_su4_f_score': 'RSU4'} 156 | scores = dict() 157 | for id, sample in enumerate(eval_set): 158 | text, summary, score, source, target = sample 159 | v = agent.evaluate_(np.asarray(source, dtype='int32'), 160 | np.asarray(target, dtype='int32'), 161 | idx2word, 162 | np.asarray(unk_filter(source), dtype='int32')).decode('utf-8').split('\n') 163 | 164 | print 'ID = {} ||'.format(id) + '*' * 50 165 | ref = ' '.join(['t{}'.format(char_voc[0][u]) for u in ''.join([w for w in v[2][9:].split()])]) 166 | sym = ' '.join(['t{}'.format(char_voc[0][u]) for u in ''.join([w for w in v[3][9:].split()])]) 167 | 168 | sssss = rouge.score_summary(sym, {'A': ref}) 169 | 170 | for si in sssss: 171 | if si not in scores: 172 | scores[si] = sssss[si] 173 | else: 174 | scores[si] += sssss[si] 175 | 176 | for e in evalsets: 177 | print '{0}: {1}'.format(evalsets[e], scores[e] / (id + 1)), 178 | print './.' 179 | 180 | # average 181 | for si in scores: 182 | scores[si] /= float(len(eval_set)) 183 | -------------------------------------------------------------------------------- /experiments/lcsts_sample.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | setup = setup_lcsts 23 | 24 | 25 | def init_logging(logfile): 26 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 27 | datefmt='%m/%d/%Y %H:%M:%S' ) 28 | fh = logging.FileHandler(logfile) 29 | # ch = logging.StreamHandler() 30 | 31 | fh.setFormatter(formatter) 32 | # ch.setFormatter(formatter) 33 | # fh.setLevel(logging.INFO) 34 | # ch.setLevel(logging.INFO) 35 | # logging.getLogger().addHandler(ch) 36 | logging.getLogger().addHandler(fh) 37 | logging.getLogger().setLevel(logging.INFO) 38 | return logging 39 | 40 | # prepare logging. 41 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 42 | config = setup() # load settings. 
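# Illustrative note on the unk handling used by this and the following scripts: config['voc_size'] == -1
# keeps the full vocabulary, otherwise every index at or beyond voc_size is mapped to 1 (assumed here to be
# the unknown-word slot reserved by the dataset builders); the original, untruncated sequence is still
# passed to evaluate_ alongside the unk-filtered one, presumably so copied words keep their surface form.
# A minimal numeric sketch of what unk_filter (defined below) computes:
import numpy as np
voc_size_toy = 5
seq = np.asarray([2, 4, 9, 3, 7], dtype='int32')       # 9 and 7 fall outside the truncated vocabulary
mask = np.less(seq, voc_size_toy).astype('int32')      # -> [1, 1, 0, 1, 0]
filtered = seq * mask + (1 - mask)                     # -> [2, 4, 1, 3, 1]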
43 | for w in config: 44 | print '{0}={1}'.format(w, config[w]) 45 | 46 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 47 | n_rng = np.random.RandomState(config['seed']) 48 | np.random.seed(config['seed']) 49 | rng = RandomStreams(n_rng.randint(2 ** 30)) 50 | logger.info('Start!') 51 | 52 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 53 | if config['voc_size'] == -1: # not use unk 54 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 55 | config['dec_voc_size'] = config['enc_voc_size'] 56 | else: 57 | config['enc_voc_size'] = config['voc_size'] 58 | config['dec_voc_size'] = config['enc_voc_size'] 59 | 60 | samples = len(train_set['source']) 61 | 62 | logger.info('build dataset done. ' + 63 | 'dataset size: {} ||'.format(samples) + 64 | 'vocabulary size = {0}/ batch size = {1}'.format( 65 | config['dec_voc_size'], config['batch_size'])) 66 | 67 | 68 | def build_data(data): 69 | # create fuel dataset. 70 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 71 | ('target', data['target']), 72 | ('target_c', data['target_c']), 73 | ])) 74 | dataset.example_iteration_scheme \ 75 | = schemes.ShuffledExampleScheme(dataset.num_examples) 76 | return dataset 77 | 78 | 79 | def unk_filter(data): 80 | if config['voc_size'] == -1: 81 | return copy.copy(data) 82 | else: 83 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 84 | data = copy.copy(data * mask + (1 - mask)) 85 | return data 86 | 87 | 88 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 89 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 90 | train_size = len(train_data_plain) 91 | test_size = len(test_data_plain) 92 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 93 | ts_idx = n_rng.permutation(test_size)[:100].tolist() 94 | 95 | logger.info('load the data ok.') 96 | 97 | # logger.info('Evaluate Enc-Dec') 98 | # log_gen = open(config['path_log'] + '/experiments.CopyLCSTS.generate_{}.log'.format(0), 'w') 99 | # config['copynet'] = True 100 | # echo = 10 101 | # tmark = '20160224-185023' # '20160221-171853' # enc-dec model [no unk] 102 | # agent = NRM(config, n_rng, rng, mode=config['mode'], 103 | # use_attention=True, copynet=config['copynet'], identity=config['identity']) 104 | # agent.build_() 105 | # agent.compile_('display') 106 | # agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 107 | # logger.info('generating [testing set] samples') 108 | # for idx in ts_idx: 109 | # # idx = int(np.floor(n_rng.rand() * test_size)) 110 | # test_s, test_t = test_data_plain[idx] 111 | # v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 112 | # np.asarray(test_t, dtype='int32'), 113 | # idx2word) 114 | # log_gen.write(v) 115 | # log_gen.write('*' * 50 + '\n') 116 | # log_gen.close() 117 | 118 | logger.info('Evaluate CopyNet') 119 | echo = 6 120 | tmark = '20160224-185023' # '20160221-025049' # copy-net model [no unk] 121 | log_cp = open(config['path_logX'] + '/experiments.copy_{0}_{1}.log'.format(tmark, echo), 'w') 122 | config['copynet'] = True 123 | agent = NRM(config, n_rng, rng, mode=config['mode'], 124 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 125 | agent.build_() 126 | agent.compile_('display') 127 | agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 128 | logger.info('generating [testing set] samples') 129 | for idx in 
ts_idx: 130 | # idx = int(np.floor(n_rng.rand() * test_size)) 131 | test_s, test_t = test_data_plain[idx] 132 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 133 | np.asarray(test_t, dtype='int32'), 134 | idx2word, np.asarray(unk_filter(test_s), dtype='int32')) 135 | log_cp.write(v) 136 | log_cp.write('*' * 50 + '\n') 137 | log_cp.close() 138 | logger.info('Complete!') -------------------------------------------------------------------------------- /experiments/lcsts_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | 10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 11 | from experiments.config import setup_lcsts 12 | from emolga.utils.generic_utils import * 13 | from emolga.models.cooc_encdec import NRM 14 | from emolga.models.encdec import NRM as NRM0 15 | from emolga.dataset.build_dataset import deserialize_from_file 16 | from collections import OrderedDict 17 | from fuel import datasets 18 | from fuel import transformers 19 | from fuel import schemes 20 | 21 | setup = setup_lcsts 22 | 23 | 24 | def init_logging(logfile): 25 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 26 | datefmt='%m/%d/%Y %H:%M:%S' ) 27 | fh = logging.FileHandler(logfile) 28 | # ch = logging.StreamHandler() 29 | 30 | fh.setFormatter(formatter) 31 | # ch.setFormatter(formatter) 32 | # fh.setLevel(logging.INFO) 33 | # ch.setLevel(logging.INFO) 34 | # logging.getLogger().addHandler(ch) 35 | logging.getLogger().addHandler(fh) 36 | logging.getLogger().setLevel(logging.INFO) 37 | 38 | return logging 39 | 40 | # prepare logging. 41 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 42 | config = setup() # load settings. 43 | for w in config: 44 | print '{0}={1}'.format(w, config[w]) 45 | 46 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 47 | n_rng = np.random.RandomState(config['seed']) 48 | np.random.seed(config['seed']) 49 | rng = RandomStreams(n_rng.randint(2 ** 30)) 50 | logger.info('Start!') 51 | 52 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 53 | 54 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 55 | config['dec_voc_size'] = config['enc_voc_size'] 56 | samples = len(train_set['source']) 57 | 58 | logger.info('build dataset done. ' + 59 | 'dataset size: {} ||'.format(samples) + 60 | 'vocabulary size = {0}/ batch size = {1}'.format( 61 | config['dec_voc_size'], config['batch_size'])) 62 | 63 | 64 | def build_data(data): 65 | # create fuel dataset. 
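# Illustrative note on the data pipeline defined below: build_data wraps source/target/target_c into a fuel
# IndexableDataset with a shuffled example scheme; output_stream later groups examples into fixed-size
# batches and transformers.Padding adds zero padding plus *_mask arrays; prepare_batch then appends an
# all-zero end-of-sequence column and trims superfluous padding columns. A condensed sketch of the wiring
# (batch size 32 is illustrative; the other names come from this file):
#   stream = transformers.Batch(build_data(train_set).get_example_stream(),
#                               iteration_scheme=schemes.ConstantScheme(32))
#   stream = transformers.Padding(stream, mask_sources=('source', 'target', 'target_c'))
#   for batch in stream.get_epoch_iterator(as_dict=True):
#       data_s = prepare_batch(batch, 'source')          # padded int32 matrix, ready for agent.train_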
66 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 67 | ('target', data['target']), 68 | ('target_c', data['target_c']), 69 | ])) 70 | dataset.example_iteration_scheme \ 71 | = schemes.ShuffledExampleScheme(dataset.num_examples) 72 | return dataset 73 | 74 | 75 | train_data = build_data(train_set) 76 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 77 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 78 | 79 | train_size = len(train_data_plain) 80 | test_size = len(test_data_plain) 81 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 82 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 83 | logger.info('load the data ok.') 84 | 85 | # build the agent 86 | if config['copynet']: 87 | agent = NRM(config, n_rng, rng, mode=config['mode'], 88 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 89 | else: 90 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 91 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 92 | 93 | agent.build_() 94 | agent.compile_('all') 95 | print 'compile ok.' 96 | 97 | echo = 2 98 | epochs = 10 99 | if echo > 0: 100 | tmark = '20160217-232113' 101 | agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 102 | 103 | while echo < epochs: 104 | echo += 1 105 | loss = [] 106 | 107 | def output_stream(dataset, batch_size, size=1): 108 | data_stream = dataset.get_example_stream() 109 | data_stream = transformers.Batch(data_stream, 110 | iteration_scheme=schemes.ConstantScheme(batch_size)) 111 | 112 | # add padding and masks to the dataset 113 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c')) 114 | return data_stream 115 | 116 | def prepare_batch(batch, mask, fix_len=None): 117 | data = batch[mask].astype('int32') 118 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 119 | 120 | def cut_zeros(data, fix_len=None): 121 | if fix_len is not None: 122 | return data[:, : fix_len] 123 | for k in range(data.shape[1] - 1, 0, -1): 124 | data_col = data[:, k].sum() 125 | if data_col > 0: 126 | return data[:, : k + 2] 127 | return data 128 | data = cut_zeros(data, fix_len) 129 | return data 130 | 131 | # training 132 | notrain = False 133 | if not notrain: 134 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 135 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 136 | progbar = Progbar(train_size / config['batch_size']) 137 | for it, batch in enumerate(train_batches): 138 | # obtain data 139 | data_s = prepare_batch(batch, 'source') 140 | data_t = prepare_batch(batch, 'target') 141 | data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 142 | 143 | if config['copynet']: 144 | loss += [agent.train_(data_s, data_t, data_c)] 145 | else: 146 | loss += [agent.train_(data_s, data_t)] 147 | 148 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 149 | 150 | if it % 200 == 0: 151 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 152 | logger.info('generating [training set] samples') 153 | for _ in xrange(5): 154 | idx = int(np.floor(n_rng.rand() * train_size)) 155 | train_s, train_t = train_data_plain[idx] 156 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 157 | np.asarray(train_t, dtype='int32'), 158 | idx2word) 159 | print '*' * 50 160 | 161 | logger.info('generating [testing set] samples') 162 | for _ in 
xrange(5): 163 | idx = int(np.floor(n_rng.rand() * test_size)) 164 | test_s, test_t = test_data_plain[idx] 165 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 166 | np.asarray(test_t, dtype='int32'), 167 | idx2word) 168 | print '*' * 50 169 | 170 | # save the weights. 171 | agent.save(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 172 | 173 | # # test accuracy 174 | # progbar_tr = Progbar(2000) 175 | # 176 | # print '\n' + '__' * 50 177 | # gen, gen_pos = 0, 0 178 | # cpy, cpy_pos = 0, 0 179 | # for it, idx in enumerate(tr_idx): 180 | # train_s, train_t = train_data_plain[idx] 181 | # 182 | # c = agent.analyse_(np.asarray(train_s, dtype='int32'), 183 | # np.asarray(train_t, dtype='int32'), 184 | # idx2word) 185 | # if c[1] == 0: 186 | # # generation mode 187 | # gen += 1 188 | # gen_pos += c[0] 189 | # else: 190 | # # copy mode 191 | # cpy += 1 192 | # cpy_pos += c[0] 193 | # 194 | # progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 195 | # 196 | # logger.info('\nTraining Accuracy:' + 197 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 198 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy))) 199 | # 200 | # progbar_ts = Progbar(2000) 201 | # print '\n' + '__' * 50 202 | # gen, gen_pos = 0, 0 203 | # cpy, cpy_pos = 0, 0 204 | # for it, idx in enumerate(ts_idx): 205 | # test_s, test_t = test_data_plain[idx] 206 | # c = agent.analyse_(np.asarray(test_s, dtype='int32'), 207 | # np.asarray(test_t, dtype='int32'), 208 | # idx2word) 209 | # if c[1] == 0: 210 | # # generation mode 211 | # gen += 1 212 | # gen_pos += c[0] 213 | # else: 214 | # # copy mode 215 | # cpy += 1 216 | # cpy_pos += c[0] 217 | # 218 | # progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 219 | # 220 | # logger.info('\nTesting Accuracy:' + 221 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 222 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy))) 223 | -------------------------------------------------------------------------------- /experiments/lcsts_vest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 
4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts, setup_weibo, setup_syn 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | setup = setup_lcsts 23 | # setup = setup_syn 24 | 25 | 26 | def init_logging(logfile): 27 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 28 | datefmt='%m/%d/%Y %H:%M:%S' ) 29 | fh = logging.FileHandler(logfile) 30 | # ch = logging.StreamHandler() 31 | 32 | fh.setFormatter(formatter) 33 | # ch.setFormatter(formatter) 34 | # fh.setLevel(logging.INFO) 35 | # ch.setLevel(logging.INFO) 36 | # logging.getLogger().addHandler(ch) 37 | logging.getLogger().addHandler(fh) 38 | logging.getLogger().setLevel(logging.INFO) 39 | 40 | return logging 41 | 42 | # prepare logging. 43 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 44 | config = setup() # load settings. 45 | for w in config: 46 | print '{0}={1}'.format(w, config[w]) 47 | 48 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTSXXX.id={}.log'.format(tmark)) 49 | n_rng = np.random.RandomState(config['seed']) 50 | np.random.seed(config['seed']) 51 | rng = RandomStreams(n_rng.randint(2 ** 30)) 52 | logger.info('Start!') 53 | 54 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 55 | 56 | if config['voc_size'] == -1: # not use unk 57 | config['enc_voc_size'] = len(word2idx) 58 | config['dec_voc_size'] = config['enc_voc_size'] 59 | else: 60 | config['enc_voc_size'] = config['voc_size'] 61 | config['dec_voc_size'] = config['enc_voc_size'] 62 | 63 | samples = len(train_set['source']) 64 | logger.info('build dataset done. ' + 65 | 'dataset size: {} ||'.format(samples) + 66 | 'vocabulary size = {0}/ batch size = {1}'.format( 67 | config['dec_voc_size'], config['batch_size'])) 68 | 69 | 70 | def build_data(data): 71 | # create fuel dataset. 
72 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 73 | ('target', data['target']), 74 | ('target_c', data['target_c']), 75 | ])) 76 | dataset.example_iteration_scheme \ 77 | = schemes.ShuffledExampleScheme(dataset.num_examples) 78 | return dataset 79 | 80 | 81 | train_data = build_data(train_set) 82 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 83 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 84 | 85 | # train_data_plain = zip(*(train_set['source'], train_set['target'])) 86 | # test_data_plain = zip(*(test_set['source'], test_set['target'])) 87 | 88 | train_size = len(train_data_plain) 89 | test_size = len(test_data_plain) 90 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 91 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 92 | logger.info('load the data ok.') 93 | 94 | # build the agent 95 | if config['copynet']: 96 | agent = NRM(config, n_rng, rng, mode=config['mode'], 97 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 98 | else: 99 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 100 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 101 | 102 | 103 | echo = 0 104 | epochs = 10 105 | if echo > 0: 106 | tmark = '20160221-025049' # copynet multi-source model 107 | agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.pkl'.format(tmark, echo)) 108 | 109 | # recover from Nan 110 | tmark = '20160307-135907' # '20160301-105813' 111 | skip = 26000 112 | sep = 1 113 | 114 | # tmark = '20160301-114653' 115 | # skip = 14000 116 | agent.build_(0.001, 26000) 117 | agent.compile_('all') 118 | print 'compile ok.' 119 | agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(tmark, sep, skip)) 120 | 121 | 122 | while echo < epochs: 123 | echo += 1 124 | loss = [] 125 | 126 | def output_stream(dataset, batch_size, size=1): 127 | data_stream = dataset.get_example_stream() 128 | data_stream = transformers.Batch(data_stream, 129 | iteration_scheme=schemes.ConstantScheme(batch_size)) 130 | 131 | # add padding and masks to the dataset 132 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 133 | return data_stream 134 | 135 | def prepare_batch(batch, mask, fix_len=None): 136 | data = batch[mask].astype('int32') 137 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 138 | 139 | def cut_zeros(data, fix_len=None): 140 | if fix_len is not None: 141 | return data[:, : fix_len] 142 | for k in range(data.shape[1] - 1, 0, -1): 143 | data_col = data[:, k].sum() 144 | if data_col > 0: 145 | return data[:, : k + 2] 146 | return data 147 | data = cut_zeros(data, fix_len) 148 | return data 149 | 150 | def cc_martix(source, target): 151 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 152 | for k in xrange(source.shape[0]): 153 | for j in xrange(target.shape[1]): 154 | for i in xrange(source.shape[1]): 155 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 156 | cc[k][j][i] = 1. 
157 | return cc 158 | 159 | def unk_filter(data): 160 | if config['voc_size'] == -1: 161 | return copy.copy(data) 162 | else: 163 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 164 | data = copy.copy(data * mask + (1 - mask)) 165 | return data 166 | 167 | # training 168 | notrain = False 169 | if not notrain: 170 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 171 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 172 | progbar = Progbar(train_size / config['batch_size']) 173 | for it, batch in enumerate(train_batches): 174 | if (echo < sep): 175 | continue 176 | 177 | if (echo == sep) and (skip > it): 178 | if it % 200 == 0: 179 | print it 180 | continue 181 | 182 | # obtain data 183 | data_s = prepare_batch(batch, 'source') 184 | data_t = prepare_batch(batch, 'target') 185 | if config['copynet']: 186 | data_c = cc_martix(data_s, data_t) 187 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 188 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 189 | else: 190 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 191 | 192 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 193 | 194 | if it % 200 == 0: 195 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 196 | logger.info('generating [training set] samples') 197 | for _ in xrange(5): 198 | idx = int(np.floor(n_rng.rand() * train_size)) 199 | train_s, train_t = train_data_plain[idx] 200 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 201 | np.asarray(train_t, dtype='int32'), 202 | idx2word, 203 | np.asarray(unk_filter(train_s), dtype='int32')) 204 | print '*' * 50 205 | 206 | logger.info('generating [testing set] samples') 207 | for _ in xrange(5): 208 | idx = int(np.floor(n_rng.rand() * test_size)) 209 | test_s, test_t = test_data_plain[idx] 210 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 211 | np.asarray(test_t, dtype='int32'), 212 | idx2word, 213 | np.asarray(unk_filter(test_s), dtype='int32')) 214 | print '*' * 50 215 | 216 | # save the weights. 
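# Illustrative note: the save that follows runs every 2000 minibatches and stamps both the epoch and the
# iteration into the file name; together with the sep/skip counters checked at the top of the batch loop,
# this is what lets the script recover from a NaN by reloading an iteration-level checkpoint and
# fast-forwarding over the batches already seen. A minimal sketch of the resume condition (toy position):
sep_toy, skip_toy = 1, 26000              # the values hard-coded above after the NaN recovery
echo_toy, it_toy = 1, 14000               # an example position inside epoch 1
already_seen = (echo_toy < sep_toy) or (echo_toy == sep_toy and it_toy < skip_toy)   # -> True, batch skipped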
217 | if it % 2000 == 0: 218 | agent.save(config['path_h5'] + 219 | '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format( 220 | tmark, echo, it)) 221 | 222 | # # test accuracy 223 | test = False 224 | if test: 225 | progbar_tr = Progbar(2000) 226 | 227 | print '\n' + '__' * 50 228 | cpy, cpy_pos = 0, 0 229 | for it, idx in enumerate(tr_idx): 230 | train_s, train_t = train_data_plain[idx] 231 | 232 | c = agent.analyse_(np.asarray(train_s, dtype='int32'), 233 | np.asarray(train_t, dtype='int32'), 234 | idx2word) 235 | # copy mode 236 | cpy += 1 237 | cpy_pos += c 238 | progbar_tr.update(it + 1, [('Copy', cpy_pos)]) 239 | 240 | logger.info('\nTraining Accuracy:' + 241 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 242 | 243 | progbar_ts = Progbar(2000) 244 | print '\n' + '__' * 50 245 | 246 | cpy, cpy_pos = 0, 0 247 | for it, idx in enumerate(ts_idx): 248 | test_s, test_t = test_data_plain[idx] 249 | c = agent.analyse_(np.asarray(test_s, dtype='int32'), 250 | np.asarray(test_t, dtype='int32'), 251 | idx2word) 252 | cpy += 1 253 | cpy_pos += c 254 | progbar_ts.update(it + 1, [('Copy', cpy_pos)]) 255 | 256 | logger.info('\nTesting Accuracy:' + 257 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 258 | -------------------------------------------------------------------------------- /experiments/lcsts_vest_new.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts, setup_weibo, setup_syn 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | setup = setup_lcsts 23 | # setup = setup_syn 24 | 25 | 26 | def init_logging(logfile): 27 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 28 | datefmt='%m/%d/%Y %H:%M:%S' ) 29 | fh = logging.FileHandler(logfile) 30 | # ch = logging.StreamHandler() 31 | 32 | fh.setFormatter(formatter) 33 | # ch.setFormatter(formatter) 34 | # fh.setLevel(logging.INFO) 35 | # ch.setLevel(logging.INFO) 36 | # logging.getLogger().addHandler(ch) 37 | logging.getLogger().addHandler(fh) 38 | logging.getLogger().setLevel(logging.INFO) 39 | 40 | return logging 41 | 42 | # prepare logging. 43 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 44 | config = setup() # load settings. 
45 | for w in config: 46 | print '{0}={1}'.format(w, config[w]) 47 | 48 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTSXXX.id={}.log'.format(tmark)) 49 | n_rng = np.random.RandomState(config['seed']) 50 | np.random.seed(config['seed']) 51 | rng = RandomStreams(n_rng.randint(2 ** 30)) 52 | logger.info('Start!') 53 | 54 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 55 | 56 | if config['voc_size'] == -1: # not use unk 57 | config['enc_voc_size'] = len(word2idx) 58 | config['dec_voc_size'] = config['enc_voc_size'] 59 | else: 60 | config['enc_voc_size'] = config['voc_size'] 61 | config['dec_voc_size'] = config['enc_voc_size'] 62 | 63 | samples = len(train_set['source']) 64 | logger.info('build dataset done. ' + 65 | 'dataset size: {} ||'.format(samples) + 66 | 'vocabulary size = {0}/ batch size = {1}'.format( 67 | config['dec_voc_size'], config['batch_size'])) 68 | 69 | 70 | def build_data(data): 71 | # create fuel dataset. 72 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 73 | ('target', data['target']), 74 | ('target_c', data['target_c']), 75 | ])) 76 | dataset.example_iteration_scheme \ 77 | = schemes.ShuffledExampleScheme(dataset.num_examples) 78 | return dataset 79 | 80 | 81 | train_data = build_data(train_set) 82 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 83 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 84 | 85 | # train_data_plain = zip(*(train_set['source'], train_set['target'])) 86 | # test_data_plain = zip(*(test_set['source'], test_set['target'])) 87 | 88 | train_size = len(train_data_plain) 89 | test_size = len(test_data_plain) 90 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 91 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 92 | logger.info('load the data ok.') 93 | 94 | # build the agent 95 | if config['copynet']: 96 | agent = NRM(config, n_rng, rng, mode=config['mode'], 97 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 98 | else: 99 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 100 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 101 | 102 | agent.build_() 103 | agent.compile_('all') 104 | print 'compile ok.' 
105 | 106 | echo = 0 107 | epochs = 10 108 | if echo > 0: 109 | tmark = '20160221-025049' # copynet multi-source model 110 | agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.pkl'.format(tmark, echo)) 111 | 112 | # recover from Nan 113 | # tmark = '20160301-105813' 114 | skip = -1 #100000 115 | sep = -1 #2 116 | 117 | # tmark = '20160301-114653' 118 | # skip = 14000 119 | # agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(tmark, sep, skip)) 120 | 121 | 122 | while echo < epochs: 123 | echo += 1 124 | loss = [] 125 | 126 | def output_stream(dataset, batch_size, size=1): 127 | data_stream = dataset.get_example_stream() 128 | data_stream = transformers.Batch(data_stream, 129 | iteration_scheme=schemes.ConstantScheme(batch_size)) 130 | 131 | # add padding and masks to the dataset 132 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 133 | return data_stream 134 | 135 | def prepare_batch(batch, mask, fix_len=None): 136 | data = batch[mask].astype('int32') 137 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 138 | 139 | def cut_zeros(data, fix_len=None): 140 | if fix_len is not None: 141 | return data[:, : fix_len] 142 | for k in range(data.shape[1] - 1, 0, -1): 143 | data_col = data[:, k].sum() 144 | if data_col > 0: 145 | return data[:, : k + 2] 146 | return data 147 | data = cut_zeros(data, fix_len) 148 | return data 149 | 150 | def cc_martix(source, target): 151 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 152 | for k in xrange(source.shape[0]): 153 | for j in xrange(target.shape[1]): 154 | for i in xrange(source.shape[1]): 155 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 156 | cc[k][j][i] = 1. 
157 | return cc 158 | 159 | def unk_filter(data): 160 | if config['voc_size'] == -1: 161 | return copy.copy(data) 162 | else: 163 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 164 | data = copy.copy(data * mask + (1 - mask)) 165 | return data 166 | 167 | # training 168 | notrain = False 169 | if not notrain: 170 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 171 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 172 | progbar = Progbar(train_size / config['batch_size']) 173 | for it, batch in enumerate(train_batches): 174 | if (echo < sep): 175 | continue 176 | 177 | if (echo == sep) and (skip > it): 178 | if it % 200 == 0: 179 | print it 180 | continue 181 | 182 | # obtain data 183 | data_s = prepare_batch(batch, 'source') 184 | data_t = prepare_batch(batch, 'target') 185 | if config['copynet']: 186 | data_c = cc_martix(data_s, data_t) 187 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 188 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 189 | else: 190 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 191 | 192 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 193 | 194 | if it % 200 == 0: 195 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 196 | logger.info('generating [training set] samples') 197 | for _ in xrange(5): 198 | idx = int(np.floor(n_rng.rand() * train_size)) 199 | train_s, train_t = train_data_plain[idx] 200 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 201 | np.asarray(train_t, dtype='int32'), 202 | idx2word, 203 | np.asarray(unk_filter(train_s), dtype='int32')) 204 | print '*' * 50 205 | 206 | logger.info('generating [testing set] samples') 207 | for _ in xrange(5): 208 | idx = int(np.floor(n_rng.rand() * test_size)) 209 | test_s, test_t = test_data_plain[idx] 210 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 211 | np.asarray(test_t, dtype='int32'), 212 | idx2word, 213 | np.asarray(unk_filter(test_s), dtype='int32')) 214 | print '*' * 50 215 | 216 | # save the weights. 
217 | if it % 2000 == 0: 218 | agent.save(config['path_h5'] + 219 | '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format( 220 | tmark, echo, it)) 221 | 222 | # # test accuracy 223 | test = False 224 | if test: 225 | progbar_tr = Progbar(2000) 226 | 227 | print '\n' + '__' * 50 228 | cpy, cpy_pos = 0, 0 229 | for it, idx in enumerate(tr_idx): 230 | train_s, train_t = train_data_plain[idx] 231 | 232 | c = agent.analyse_(np.asarray(train_s, dtype='int32'), 233 | np.asarray(train_t, dtype='int32'), 234 | idx2word) 235 | # copy mode 236 | cpy += 1 237 | cpy_pos += c 238 | progbar_tr.update(it + 1, [('Copy', cpy_pos)]) 239 | 240 | logger.info('\nTraining Accuracy:' + 241 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 242 | 243 | progbar_ts = Progbar(2000) 244 | print '\n' + '__' * 50 245 | 246 | cpy, cpy_pos = 0, 0 247 | for it, idx in enumerate(ts_idx): 248 | test_s, test_t = test_data_plain[idx] 249 | c = agent.analyse_(np.asarray(test_s, dtype='int32'), 250 | np.asarray(test_t, dtype='int32'), 251 | idx2word) 252 | cpy += 1 253 | cpy_pos += c 254 | progbar_ts.update(it + 1, [('Copy', cpy_pos)]) 255 | 256 | logger.info('\nTesting Accuracy:' + 257 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 258 | -------------------------------------------------------------------------------- /experiments/movie_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 3 | import string 4 | import random 5 | import sys 6 | 7 | random.seed(19920206) 8 | word2idx = dict() 9 | wordfreq = dict() 10 | word2idx[''] = 0 11 | word2idx[''] = 1 12 | word2freq = dict() 13 | 14 | 15 | def mark(line): 16 | tmp_line = '' 17 | for c in line: 18 | if c in string.punctuation: 19 | if c is not "'": 20 | tmp_line += ' ' + c + ' ' 21 | else: 22 | tmp_line += ' ' + c 23 | else: 24 | tmp_line += c 25 | tmp_line = tmp_line.lower() 26 | words = [w for w in tmp_line.split() if len(w) > 0] 27 | for w in words: 28 | if w not in word2freq: 29 | word2freq[w] = 1 30 | else: 31 | word2freq[w] += 1 32 | return words 33 | 34 | 35 | fline = open('./dataset/cornell_movie/movie_lines.txt', 'r') 36 | sets = [w.split('+++$+++') for w in fline.read().split('\n')] 37 | lines = {w[0].strip(): mark(w[-1].strip()) for w in sets} 38 | # 39 | # for w in lines: 40 | # if len(lines[w]) == 0: 41 | # print w 42 | 43 | fline.close() 44 | print 'read lines ok' 45 | fconv = open('./dataset/cornell_movie/movie_conversations.txt', 'r') 46 | 47 | turns = [] 48 | convs = fconv.readline() 49 | while convs: 50 | turn = eval(convs.split('+++$+++')[-1].strip()) 51 | turns += zip(turn[:-1], turn[1:]) 52 | convs = fconv.readline() 53 | 54 | pairs = [(lines[a], lines[b]) for a, b in turns 55 | if len(lines[a]) > 0 and len(lines[b]) > 0] 56 | 57 | # shuffle! 58 | random.shuffle(pairs) 59 | 60 | word2freq = sorted(word2freq.items(), key=lambda a: a[1], reverse=True) 61 | for at, w in enumerate(word2freq): 62 | word2idx[w[0]] = at + 2 63 | 64 | idx2word = {k: v for v, k in word2idx.items()} 65 | print idx2word[1], idx2word[2] 66 | 67 | Lmax = len(idx2word) 68 | # for i in xrange(Lmax): 69 | # print idx2word[i] 70 | print 'read dataset ok.' 
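# Illustrative note on build_data below: each (utterance, reply) pair is stored both as plain index
# sequences and as target_c, where a reply token gets 0 if it has to be generated from the vocabulary and
# first-source-position + Lmax if it also occurs in the utterance and can be copied. A toy sketch
# (Lmax_toy and the words are invented):
Lmax_toy = 100
source_toy = ['where', 'is', 'bob', '?']
target_toy = ['bob', 'is', 'gone']
target_c_toy = [0 if w not in source_toy else source_toy.index(w) + Lmax_toy for w in target_toy]
# -> [102, 101, 0]: 'bob' copies source position 2, 'is' position 1, 'gone' must be generated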
71 | print Lmax 72 | print pairs[0] 73 | 74 | 75 | def build_data(data): 76 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[]) 77 | print len(data) 78 | for pair in data: 79 | source, target = pair 80 | A = [word2idx[w] for w in source] 81 | B = [word2idx[w] for w in target] 82 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32') 83 | C = [0 if w not in source else source.index(w) + Lmax for w in target] 84 | 85 | instance['text'] += [source] 86 | instance['summary'] += [target] 87 | instance['source'] += [A] 88 | instance['target'] += [B] 89 | # instance['cc_matrix'] += [C] 90 | instance['target_c'] += [C] 91 | 92 | print instance['source'][4000] 93 | print instance['target'][4000] 94 | print instance['target_c'][4000] 95 | return instance 96 | 97 | 98 | train_set = build_data(pairs[10000:]) 99 | test_set = build_data(pairs[:10000]) 100 | serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/movie_dialogue_data.pkl') 101 | -------------------------------------------------------------------------------- /experiments/syn_vest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts, setup_weibo, setup_syn, setup_bst 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | # setup = setup_lcsts 23 | setup = setup_syn 24 | # setup = setup_bst 25 | 26 | 27 | def init_logging(logfile): 28 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 29 | datefmt='%m/%d/%Y %H:%M:%S' ) 30 | fh = logging.FileHandler(logfile) 31 | # ch = logging.StreamHandler() 32 | 33 | fh.setFormatter(formatter) 34 | # ch.setFormatter(formatter) 35 | # fh.setLevel(logging.INFO) 36 | # ch.setLevel(logging.INFO) 37 | # logging.getLogger().addHandler(ch) 38 | logging.getLogger().addHandler(fh) 39 | logging.getLogger().setLevel(logging.INFO) 40 | 41 | return logging 42 | 43 | # prepare logging. 44 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 45 | config = setup() # load settings. 46 | for w in config: 47 | print '{0}={1}'.format(w, config[w]) 48 | 49 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 50 | n_rng = np.random.RandomState(config['seed']) 51 | np.random.seed(config['seed']) 52 | rng = RandomStreams(n_rng.randint(2 ** 30)) 53 | logger.info('Start!') 54 | 55 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 56 | 57 | if config['voc_size'] == -1: # not use unk 58 | config['enc_voc_size'] = len(word2idx) 59 | config['dec_voc_size'] = config['enc_voc_size'] 60 | else: 61 | config['enc_voc_size'] = config['voc_size'] 62 | config['dec_voc_size'] = config['enc_voc_size'] 63 | 64 | samples = len(train_set['source']) 65 | logger.info('build dataset done. 
' + 66 | 'dataset size: {} ||'.format(samples) + 67 | 'vocabulary size = {0}/ batch size = {1}'.format( 68 | config['dec_voc_size'], config['batch_size'])) 69 | 70 | 71 | def build_data(data): 72 | # create fuel dataset. 73 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 74 | ('target', data['target']), 75 | ('target_c', data['target_c']), 76 | ])) 77 | dataset.example_iteration_scheme \ 78 | = schemes.ShuffledExampleScheme(dataset.num_examples) 79 | return dataset 80 | 81 | 82 | train_data = build_data(train_set) 83 | train_data_plain = zip(*(train_set['source'], train_set['target'], train_set['rule'])) 84 | test_data_plain = zip(*(test_set['source'], test_set['target'], test_set['rule'])) 85 | 86 | # train_data_plain = zip(*(train_set['source'], train_set['target'])) 87 | # test_data_plain = zip(*(test_set['source'], test_set['target'])) 88 | 89 | train_size = len(train_data_plain) 90 | test_size = len(test_data_plain) 91 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 92 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 93 | logger.info('load the data ok.') 94 | config['copynet'] = True # False 95 | notrain = True 96 | 97 | # build the agent 98 | if config['copynet']: 99 | agent = NRM(config, n_rng, rng, mode=config['mode'], 100 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 101 | else: 102 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 103 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 104 | 105 | agent.build_() 106 | if notrain: 107 | agent.compile_('display') 108 | else: 109 | agent.compile_('all') 110 | print 'compile ok.' 111 | 112 | echo = 6 113 | epochs = 10 114 | if echo > 0: 115 | tmark = '20160227-013418' # copynet multi-source model 116 | agent.load(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname'])) 117 | 118 | while echo < epochs: 119 | echo += 1 120 | loss = [] 121 | 122 | def output_stream(dataset, batch_size, size=1): 123 | data_stream = dataset.get_example_stream() 124 | data_stream = transformers.Batch(data_stream, 125 | iteration_scheme=schemes.ConstantScheme(batch_size)) 126 | 127 | # add padding and masks to the dataset 128 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 129 | return data_stream 130 | 131 | def prepare_batch(batch, mask, fix_len=None): 132 | data = batch[mask].astype('int32') 133 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 134 | 135 | def cut_zeros(data, fix_len=None): 136 | if fix_len is not None: 137 | return data[:, : fix_len] 138 | for k in range(data.shape[1] - 1, 0, -1): 139 | data_col = data[:, k].sum() 140 | if data_col > 0: 141 | return data[:, : k + 2] 142 | return data 143 | data = cut_zeros(data, fix_len) 144 | return data 145 | 146 | def cc_martix(source, target): 147 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 148 | for k in xrange(source.shape[0]): 149 | for j in xrange(target.shape[1]): 150 | for i in xrange(source.shape[1]): 151 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 152 | cc[k][j][i] = 1. 
153 | return cc 154 | 155 | def unk_filter(data): 156 | if config['voc_size'] == -1: 157 | return copy.copy(data) 158 | else: 159 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 160 | data = copy.copy(data * mask + (1 - mask)) 161 | return data 162 | 163 | # training 164 | if not notrain: 165 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 166 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 167 | progbar = Progbar(train_size / config['batch_size']) 168 | for it, batch in enumerate(train_batches): 169 | # obtain data 170 | data_s = prepare_batch(batch, 'source') 171 | data_t = prepare_batch(batch, 'target') 172 | if config['copynet']: 173 | data_c = cc_martix(data_s, data_t) 174 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 175 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 176 | else: 177 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 178 | 179 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 180 | 181 | if it % 200 == 0: 182 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 183 | logger.info('generating [training set] samples') 184 | for _ in xrange(5): 185 | idx = int(np.floor(n_rng.rand() * train_size)) 186 | train_s, train_t = train_data_plain[idx] 187 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 188 | np.asarray(train_t, dtype='int32'), 189 | idx2word, 190 | np.asarray(unk_filter(train_s), dtype='int32')) 191 | print '*' * 50 192 | 193 | logger.info('generating [testing set] samples') 194 | for _ in xrange(5): 195 | idx = int(np.floor(n_rng.rand() * test_size)) 196 | test_s, test_t = test_data_plain[idx] 197 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 198 | np.asarray(test_t, dtype='int32'), 199 | idx2word, 200 | np.asarray(unk_filter(test_s), dtype='int32')) 201 | print '*' * 50 202 | 203 | # save the weights. 
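# Illustrative note: with notrain set to True further up and test set to True below, this script is used
# mainly for analysis. judge_rule keeps only the out-of-vocabulary symbols of each rule string so that
# samples can be grouped by the pattern they contain, and analysis_ reports copy accuracy per group.
# A toy sketch of the grouping idea (invented vocabulary):
word2idx_toy = {'abc': 1, 'def': 2}                                      # in-vocabulary words
def judge_rule_toy(rule):
    return ''.join(w for w in rule.split() if w not in word2idx_toy)     # keep only the OOV symbols
# judge_rule_toy('abc X def') == 'X' -> samples sharing the same OOV pattern are scored together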
204 | agent.save(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname'])) 205 | 206 | # # test accuracy 207 | def judge_rule(rule): 208 | rule = rule.split() 209 | fine = '' 210 | for w in rule: 211 | if w not in word2idx: 212 | fine += w 213 | return fine 214 | 215 | test = True 216 | if test: 217 | def analysis_(data_plain, mode='Training'): 218 | progbar_tr = Progbar(2000) 219 | print '\n' + '__' * 50 220 | cpy, cpy_pos = 0, 0 221 | types = dict() 222 | for it, idx in enumerate(tr_idx): 223 | train_s, train_t, rule = data_plain[idx] 224 | t = judge_rule(rule) 225 | c = float(agent.analyse_(np.asarray(train_s, dtype='int32'), 226 | np.asarray(train_t, dtype='int32'), 227 | idx2word)) 228 | # copy mode 229 | cpy += 1 230 | cpy_pos += c 231 | if t not in types: 232 | types[t] = {} 233 | types[t][0] = c 234 | types[t][1] = 1 235 | else: 236 | types[t][0] += c 237 | types[t][1] += 1 238 | 239 | progbar_tr.update(it + 1, [('Copy', cpy_pos)]) 240 | logger.info('\n{0} Accuracy:' + 241 | '\t{1}/{2} = {3}%'.format(mode, cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 242 | print '==' * 50 243 | for t in types: 244 | print 'Type: {0}: {1}/{2}={3}%'.format(t, int(types[t][0]), types[t][1], 245 | 100 * types[t][0] / float(types[t][1])) 246 | 247 | 248 | # analysis_(train_data_plain, 'Training') 249 | analysis_(test_data_plain, 'Testing') 250 | -------------------------------------------------------------------------------- /experiments/syntest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | 9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 10 | from experiments.config import setup_syn 11 | from emolga.utils.generic_utils import * 12 | from emolga.models.cooc_encdec import NRM 13 | from emolga.models.encdec import NRM as NRM0 14 | from emolga.dataset.build_dataset import deserialize_from_file 15 | from collections import OrderedDict 16 | from fuel import datasets 17 | from fuel import transformers 18 | from fuel import schemes 19 | 20 | setup = setup_syn 21 | 22 | 23 | def init_logging(logfile): 24 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 25 | datefmt='%m/%d/%Y %H:%M:%S' ) 26 | fh = logging.FileHandler(logfile) 27 | # ch = logging.StreamHandler() 28 | 29 | fh.setFormatter(formatter) 30 | # ch.setFormatter(formatter) 31 | # fh.setLevel(logging.INFO) 32 | # ch.setLevel(logging.INFO) 33 | # logging.getLogger().addHandler(ch) 34 | logging.getLogger().addHandler(fh) 35 | logging.getLogger().setLevel(logging.INFO) 36 | 37 | return logging 38 | 39 | # prepare logging. 40 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 41 | config = setup() # load settings. 42 | for w in config: 43 | print '{0}={1}'.format(w, config[w]) 44 | 45 | logger = init_logging(config['path_log'] + '/experiments.Copy.id={}.log'.format(tmark)) 46 | n_rng = np.random.RandomState(config['seed']) 47 | np.random.seed(config['seed']) 48 | rng = RandomStreams(n_rng.randint(2 ** 30)) 49 | logger.info('Start!') 50 | 51 | # the vocabulary 52 | tmp = [chr(x) for x in range(48, 58)] # '1', ... 
, '9', '0' 53 | voc = [tmp[a] + tmp[b] + tmp[c] 54 | for c in xrange(10) 55 | for b in xrange(10) 56 | for a in xrange(10)] 57 | word2idx = {voc[k]: k + 1 for k in xrange(len(voc))} 58 | word2idx[''] = 0 59 | idx2word = {word2idx[w]: w for w in word2idx} 60 | voc = [''] + voc 61 | 62 | train_set, test_set = deserialize_from_file(config['dataset']) 63 | 64 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 65 | config['dec_voc_size'] = config['enc_voc_size'] 66 | samples = len(train_set['source']) 67 | 68 | logger.info('build dataset done. ' + 69 | 'dataset size: {} ||'.format(samples) + 70 | 'vocabulary size = {0}/ batch size = {1}'.format( 71 | config['dec_voc_size'], config['batch_size'])) 72 | 73 | 74 | def build_data(data): 75 | # create fuel dataset. 76 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 77 | ('target', data['target']), 78 | ('target_c', data['target_c']), 79 | ('rule_id', data['rule_id']), 80 | ('rule', data['rule'])])) 81 | dataset.example_iteration_scheme \ 82 | = schemes.ShuffledExampleScheme(dataset.num_examples) 83 | return dataset 84 | 85 | 86 | train_data = build_data(train_set) 87 | train_data_plain = zip(*(train_set['source'], train_set['target'], train_set['rule_id'], train_set['rule'])) 88 | test_data_plain = zip(*(test_set['source'], test_set['target'], test_set['rule_id'], test_set['rule'])) 89 | 90 | train_size = len(train_data_plain) 91 | test_size = len(test_data_plain) 92 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 93 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 94 | logger.info('load the data ok.') 95 | 96 | # build the agent 97 | if config['copynet']: 98 | agent = NRM(config, n_rng, rng, mode=config['mode'], 99 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 100 | else: 101 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 102 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 103 | 104 | agent.build_() 105 | agent.compile_('all') 106 | print 'compile ok.' 
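# Illustrative note recapping the synthetic vocabulary built above: the 1000 three-character digit strings
# are enumerated with the first character varying fastest, and index 0 is reserved for the empty /
# end-of-sequence marker. A runnable sketch:
tmp_toy = [chr(x) for x in range(48, 58)]            # '0' .. '9'
voc_toy = [tmp_toy[a] + tmp_toy[b] + tmp_toy[c]
           for c in range(10) for b in range(10) for a in range(10)]
# voc_toy[0] == '000', voc_toy[1] == '100', len(voc_toy) == 1000; word2idx maps voc[k] -> k + 1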
107 | 108 | echo = 3 109 | epochs = 4 110 | if echo > 0: 111 | tmark = '20160216-152155' 112 | agent.load(config['path_h5'] + '/experiments.Copy.id={0}.epoch={1}.pkl'.format(tmark, echo)) 113 | 114 | while echo < epochs: 115 | echo += 1 116 | loss = [] 117 | 118 | def output_stream(dataset, batch_size, size=1): 119 | data_stream = dataset.get_example_stream() 120 | data_stream = transformers.Batch(data_stream, 121 | iteration_scheme=schemes.ConstantScheme(batch_size)) 122 | 123 | # add padding and masks to the dataset 124 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c')) 125 | return data_stream 126 | 127 | def prepare_batch(batch, mask, fix_len=None): 128 | data = batch[mask].astype('int32') 129 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 130 | 131 | def cut_zeros(data, fix_len=None): 132 | if fix_len is not None: 133 | return data[:, : fix_len] 134 | for k in range(data.shape[1] - 1, 0, -1): 135 | data_col = data[:, k].sum() 136 | if data_col > 0: 137 | return data[:, : k + 2] 138 | return data 139 | data = cut_zeros(data, fix_len) 140 | return data 141 | 142 | # training 143 | notrain = True 144 | if not notrain: 145 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 146 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 147 | progbar = Progbar(train_size / config['batch_size']) 148 | for it, batch in enumerate(train_batches): 149 | # obtain data 150 | data_s = prepare_batch(batch, 'source') 151 | data_t = prepare_batch(batch, 'target') 152 | data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 153 | 154 | if config['copynet']: 155 | loss += [agent.train_(data_s, data_t, data_c)] 156 | else: 157 | loss += [agent.train_(data_s, data_t)] 158 | 159 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 160 | 161 | if it % 200 == 0: 162 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 163 | logger.info('generating [training set] samples') 164 | for _ in xrange(5): 165 | idx = int(np.floor(n_rng.rand() * train_size)) 166 | train_s, train_t, _, _ = train_data_plain[idx] 167 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 168 | np.asarray(train_t, dtype='int32'), 169 | idx2word) 170 | print '*' * 50 171 | 172 | logger.info('generating [testing set] samples') 173 | for _ in xrange(5): 174 | idx = int(np.floor(n_rng.rand() * test_size)) 175 | test_s, test_t, _, _ = test_data_plain[idx] 176 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 177 | np.asarray(test_t, dtype='int32'), 178 | idx2word) 179 | print '*' * 50 180 | 181 | # save the weights. 
182 | agent.save(config['path_h5'] + '/experiments.Copy.id={0}.epoch={1}.pkl'.format(tmark, echo)) 183 | 184 | # test accuracy 185 | progbar_tr = Progbar(2000) 186 | 187 | print '\n' + '__' * 50 188 | gen, gen_pos = 0, 0 189 | cpy, cpy_pos = 0, 0 190 | grs, crs = [], [] 191 | for it, idx in enumerate(tr_idx): 192 | train_s, train_t, rid, rule = train_data_plain[idx] 193 | 194 | c = agent.analyse_(np.asarray(train_s, dtype='int32'), 195 | np.asarray(train_t, dtype='int32'), 196 | idx2word) 197 | if c[1] == 0: 198 | # generation mode 199 | gen += 1 200 | gen_pos += c[0] 201 | if c[0] == 1: 202 | grs += [rule] 203 | else: 204 | # copy mode 205 | cpy += 1 206 | cpy_pos += c[0] 207 | if c[0] == 1: 208 | crs += [rule] 209 | 210 | progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 211 | grs = set(grs) 212 | crs = set(crs) 213 | irs = set.intersection(grs, crs) 214 | 215 | logger.info('\nTraining Accuracy:' + 216 | '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 217 | '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)) + 218 | '\tGene-Rule: {0}, Copy-Rule: {1}, Intersection: {2}'.format(len(grs), len(crs), len(irs))) 219 | 220 | print 'Generate Mode:' 221 | for r in grs: 222 | print r 223 | 224 | print 'Copy Mode:' 225 | for r in crs: 226 | print r 227 | 228 | print 'Interaction:' 229 | for r in irs: 230 | print r 231 | 232 | progbar_ts = Progbar(2000) 233 | print '\n' + '__' * 50 234 | gen, gen_pos = 0, 0 235 | cpy, cpy_pos = 0, 0 236 | grs, crs = [], [] 237 | for it, idx in enumerate(ts_idx): 238 | test_s, test_t, rid, rule = test_data_plain[idx] 239 | c = agent.analyse_(np.asarray(test_s, dtype='int32'), 240 | np.asarray(test_t, dtype='int32'), 241 | idx2word) 242 | if c[1] == 0: 243 | # generation mode 244 | gen += 1 245 | gen_pos += c[0] 246 | grs += [rule] 247 | else: 248 | # copy mode 249 | cpy += 1 250 | cpy_pos += c[0] 251 | crs += [rule] 252 | 253 | progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 254 | grs = set(grs) 255 | crs = set(crs) 256 | irs = set.intersection(grs, crs) 257 | 258 | logger.info('\nTesting Accuracy:' + 259 | '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 260 | '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)) + 261 | '\tGene-Rule: {0}, Copy-Rule: {1}, Intersection: {2}'.format(len(grs), len(crs), len(irs))) 262 | 263 | print 'Generate Mode:' 264 | for r in grs: 265 | print r 266 | 267 | print 'Copy Mode:' 268 | for r in crs: 269 | print r 270 | 271 | print 'Interaction:' 272 | for r in irs: 273 | print r -------------------------------------------------------------------------------- /experiments/synthetic.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 3 | import numpy.random as n_rng 4 | 5 | n_rng.seed(19920206) 6 | # the vocabulary 7 | tmp = [chr(x) for x in range(48, 58)] # '1', ... 
, '9', '0' 8 | voc = [tmp[a] + tmp[b] + tmp[c] 9 | for c in xrange(10) 10 | for b in xrange(10) 11 | for a in xrange(10)] 12 | word2idx = {voc[k]: k + 2 for k in xrange(len(voc))} 13 | word2idx[''] = 0 14 | word2idx[''] = 1 15 | idx2word = {word2idx[w]: w for w in word2idx} 16 | voc = ['', ''] + voc 17 | 18 | # word2idx['X'] = len(voc) 19 | # idx2word[len(voc)] = 'X' 20 | # voc += ['X'] 21 | # 22 | # word2idx['Y'] = len(voc) 23 | # idx2word[len(voc)] = 'Y' 24 | # voc += ['Y'] 25 | # print word2idx['X'], word2idx['Y'] 26 | 27 | # load the dataset 28 | Rules, _ = deserialize_from_file('/home/thoma/Work/Dial-DRL/dataset/rules.rnd.n10k.pkl') 29 | num = 200 30 | repeats = 100 31 | maxleg = 15 32 | Lmax = len(idx2word) 33 | rules = dict(source=Rules['source'][:num], 34 | target=Rules['target'][:num]) 35 | 36 | 37 | def ftr(v): 38 | if v < 10: 39 | return '00' + str(v) 40 | elif v < 100: 41 | return '0' + str(v) 42 | else: 43 | return str(v) 44 | 45 | 46 | def build_instance(): 47 | instance = dict(x=[], y=[], source=[], target=[], target_c=[], rule_id=[], rule=[]) 48 | for k in xrange(num): 49 | source = rules['source'][k] 50 | target = rules['target'][k] 51 | 52 | for j in xrange(repeats): 53 | X = n_rng.randint(1000, size= n_rng.randint(maxleg) + 1) 54 | Y = n_rng.randint(1000, size= n_rng.randint(maxleg) + 1) 55 | S = [] 56 | T = [] 57 | for w in source: 58 | if w is 'X': 59 | S += [ftr(v) for v in X] 60 | elif w is 'Y': 61 | S += [ftr(v) for v in Y] 62 | else: 63 | S += [w] 64 | 65 | for w in target: 66 | if w is 'X': 67 | T += [ftr(v) for v in X] 68 | elif w is 'Y': 69 | T += [ftr(v) for v in Y] 70 | else: 71 | T += [w] 72 | 73 | A = [word2idx[w] for w in S] 74 | B = [word2idx[w] for w in T] 75 | C = [0 if w not in S else S.index(w) + Lmax for w in T] 76 | 77 | instance['x'] += [S] 78 | instance['y'] += [T] 79 | instance['source'] += [A] 80 | instance['target'] += [B] 81 | instance['target_c'] += [C] 82 | 83 | instance['rule_id'] += [k] 84 | instance['rule'] += [' '.join(source) + ' -> ' + ' '.join(target)] 85 | 86 | return instance 87 | 88 | train_set = build_instance() 89 | print 'build ok.' 90 | test_set = build_instance() 91 | print 'build ok.' 
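To make the copy-indicator construction above concrete, here is a small worked example with made-up rule words and digit tokens: target_c stores 0 for a target token that does not occur in the source sequence S, and its first source position shifted by Lmax otherwise, so copy positions never collide with ordinary vocabulary indices.

# Worked example (hypothetical S and T) of the line
#   C = [0 if w not in S else S.index(w) + Lmax for w in T]
S = ['abc', '007', '123']      # source words after substituting X
T = ['007', '123', 'zzz']      # target words
Lmax = 1002                    # len(idx2word): 1000 digit tokens + 2 reserved indices
C = [0 if w not in S else S.index(w) + Lmax for w in T]
# C == [1003, 1004, 0]: '007' is copied from source position 1, '123' from position 2,
# and 'zzz' cannot be copied so it falls back to generation (0).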
92 | 93 | serialize_to_file([train_set, test_set, idx2word, word2idx], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data_c.pkl') 94 | # serialize_to_file([train_set, test_set], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data.pkl') 95 | -------------------------------------------------------------------------------- /experiments/weibo_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 3 | 4 | word2idx = dict() 5 | wordfreq = dict() 6 | word2idx[''] = 0 7 | word2idx[''] = 1 8 | 9 | # segment = False # True 10 | 11 | # training set 12 | pairs = [] 13 | f = open('./dataset/weibo/co-occur.txt', 'r') 14 | line = f.readline().strip().decode('utf-8') 15 | at = 2 16 | lines = 0 17 | while line: 18 | post = line # f.readline().strip().decode('utf-8') 19 | post= [w.strip() for w in post.split() if len(w.strip()) > 0] 20 | 21 | # if segment: 22 | # summary = [w for w in jb.cut(summary)] 23 | 24 | for w in post: 25 | if w not in wordfreq: 26 | wordfreq[w] = 1 27 | else: 28 | wordfreq[w] += 1 29 | # if w not in word2idx: 30 | # word2idx[w] = at 31 | # at += 1 32 | 33 | text = f.readline().strip().decode('utf-8') 34 | text = [w.strip() for w in text.split() if len(w.strip()) > 0] 35 | # if segment: 36 | # text = [w for w in jb.cut(text)] 37 | for w in text: 38 | if w not in wordfreq: 39 | wordfreq[w] = 1 40 | else: 41 | wordfreq[w] += 1 42 | # if w not in word2idx: 43 | # word2idx[w] = at 44 | # at += 1 45 | 46 | pair = (post, text) 47 | pairs.append(pair) 48 | lines += 1 49 | if lines % 20000 == 0: 50 | print lines 51 | 52 | f.readline() 53 | line = f.readline().strip().decode('utf-8') 54 | 55 | 56 | # sort the vocabulary 57 | wordfreq = sorted(wordfreq.items(), key=lambda a:a[1], reverse=True) 58 | for w in wordfreq: 59 | word2idx[w[0]] = at 60 | at += 1 61 | 62 | idx2word = dict() 63 | for v, k in word2idx.items(): 64 | idx2word[k] = v 65 | Lmax = len(idx2word) 66 | print 'read dataset ok.' 67 | print Lmax 68 | for i in xrange(Lmax): 69 | print idx2word[i].encode('utf-8') 70 | 71 | # use character-based model [on] 72 | # use word-based model [off] 73 | 74 | 75 | def build_data(data): 76 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[]) 77 | for pair in data: 78 | source, target = pair 79 | A = [word2idx[w] for w in source] 80 | B = [word2idx[w] for w in target] 81 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32') 82 | C = [0 if w not in source else source.index(w) + Lmax for w in target] 83 | 84 | instance['text'] += [source] 85 | instance['summary'] += [target] 86 | instance['source'] += [A] 87 | instance['target'] += [B] 88 | # instance['cc_matrix'] += [C] 89 | instance['target_c'] += [C] 90 | 91 | print instance['target'][5000] 92 | print instance['target_c'][5000] 93 | return instance 94 | 95 | 96 | train_set = build_data(pairs[10000:]) 97 | test_set = build_data(pairs[:10000]) 98 | serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/weibo_data-word-cooc.pkl') 99 | -------------------------------------------------------------------------------- /experiments/weibo_vest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 
4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts, setup_weibo 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | setup = setup_weibo 23 | 24 | 25 | def init_logging(logfile): 26 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 27 | datefmt='%m/%d/%Y %H:%M:%S' ) 28 | fh = logging.FileHandler(logfile) 29 | # ch = logging.StreamHandler() 30 | 31 | fh.setFormatter(formatter) 32 | # ch.setFormatter(formatter) 33 | # fh.setLevel(logging.INFO) 34 | # ch.setLevel(logging.INFO) 35 | # logging.getLogger().addHandler(ch) 36 | logging.getLogger().addHandler(fh) 37 | logging.getLogger().setLevel(logging.INFO) 38 | 39 | return logging 40 | 41 | # prepare logging. 42 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 43 | config = setup() # load settings. 44 | for w in config: 45 | print '{0}={1}'.format(w, config[w]) 46 | 47 | logger = init_logging(config['path_log'] + '/experiments.CopyWeibo.id={}.log'.format(tmark)) 48 | n_rng = np.random.RandomState(config['seed']) 49 | np.random.seed(config['seed']) 50 | rng = RandomStreams(n_rng.randint(2 ** 30)) 51 | logger.info('Start!') 52 | 53 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 54 | 55 | if config['voc_size'] == -1: # not use unk 56 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 57 | config['dec_voc_size'] = config['enc_voc_size'] 58 | else: 59 | config['enc_voc_size'] = config['voc_size'] 60 | config['dec_voc_size'] = config['enc_voc_size'] 61 | 62 | samples = len(train_set['source']) 63 | logger.info('build dataset done. ' + 64 | 'dataset size: {} ||'.format(samples) + 65 | 'vocabulary size = {0}/ batch size = {1}'.format( 66 | config['dec_voc_size'], config['batch_size'])) 67 | 68 | 69 | def build_data(data): 70 | # create fuel dataset. 71 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 72 | ('target', data['target']), 73 | ('target_c', data['target_c']), 74 | ])) 75 | dataset.example_iteration_scheme \ 76 | = schemes.ShuffledExampleScheme(dataset.num_examples) 77 | return dataset 78 | 79 | 80 | train_data = build_data(train_set) 81 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 82 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 83 | 84 | train_size = len(train_data_plain) 85 | test_size = len(test_data_plain) 86 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 87 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 88 | logger.info('load the data ok.') 89 | notrain = False 90 | 91 | # build the agent 92 | if config['copynet']: 93 | agent = NRM(config, n_rng, rng, mode=config['mode'], 94 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 95 | else: 96 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 97 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 98 | 99 | agent.build_() 100 | if notrain: 101 | agent.compile_('display') 102 | else: 103 | agent.compile_('all') 104 | print 'compile ok.' 
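Before the training loop starts, note how the voc_size cap chosen above interacts with the data: when config['voc_size'] is not -1, every word index at or above the cap is collapsed to 1 (the second reserved entry of word2idx) by the unk_filter defined inside the epoch loop below. A toy NumPy sketch of that masking trick, with a hypothetical cap value:

# Sketch of the out-of-vocabulary masking used by unk_filter below
# (toy cap; the real value comes from config['voc_size']).
import numpy as np

voc_size = 5
data = np.asarray([[2, 7, 3, 0],
                   [9, 1, 4, 0]], dtype='int32')
mask = np.less(data, voc_size).astype('int32')   # 1 where the index is in-vocabulary
filtered = data * mask + (1 - mask)              # out-of-range indices collapse to 1
# filtered == [[2, 1, 3, 0],
#              [1, 1, 4, 0]]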
105 | 106 | echo = 0 107 | epochs = 10 108 | if echo > 0: 109 | tmark = '20160227-164324' # copynet multi-source model 110 | agent.load(config['path_h5'] + '/experiments.CopyWeibo.id={0}.epoch={1}.pkl'.format(tmark, echo)) 111 | 112 | while echo < epochs: 113 | echo += 1 114 | loss = [] 115 | 116 | def output_stream(dataset, batch_size, size=1): 117 | data_stream = dataset.get_example_stream() 118 | data_stream = transformers.Batch(data_stream, 119 | iteration_scheme=schemes.ConstantScheme(batch_size)) 120 | 121 | # add padding and masks to the dataset 122 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 123 | return data_stream 124 | 125 | def prepare_batch(batch, mask, fix_len=None): 126 | data = batch[mask].astype('int32') 127 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 128 | 129 | def cut_zeros(data, fix_len=None): 130 | if fix_len is not None: 131 | return data[:, : fix_len] 132 | for k in range(data.shape[1] - 1, 0, -1): 133 | data_col = data[:, k].sum() 134 | if data_col > 0: 135 | return data[:, : k + 2] 136 | return data 137 | data = cut_zeros(data, fix_len) 138 | return data 139 | 140 | def cc_martix(source, target): 141 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 142 | for k in xrange(source.shape[0]): 143 | for j in xrange(target.shape[1]): 144 | for i in xrange(source.shape[1]): 145 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 146 | cc[k][j][i] = 1. 147 | return cc 148 | 149 | def unk_filter(data): 150 | if config['voc_size'] == -1: 151 | return copy.copy(data) 152 | else: 153 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 154 | data = copy.copy(data * mask + (1 - mask)) 155 | return data 156 | 157 | # training 158 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 159 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 160 | progbar = Progbar(train_size / config['batch_size']) 161 | for it, batch in enumerate(train_batches): 162 | # obtain data 163 | if not notrain: 164 | data_s = prepare_batch(batch, 'source') 165 | data_t = prepare_batch(batch, 'target') 166 | if config['copynet']: 167 | data_c = cc_martix(data_s, data_t) 168 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 169 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 170 | else: 171 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 172 | 173 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 174 | 175 | if it % 200 == 0: 176 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 177 | logger.info('generating [training set] samples') 178 | for _ in xrange(5): 179 | idx = int(np.floor(n_rng.rand() * train_size)) 180 | train_s, train_t = train_data_plain[idx] 181 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 182 | np.asarray(train_t, dtype='int32'), 183 | idx2word, 184 | np.asarray(unk_filter(train_s), dtype='int32'), 185 | encode=config['utf-8']) 186 | print '*' * 50 187 | 188 | logger.info('generating [testing set] samples') 189 | for _ in xrange(5): 190 | idx = int(np.floor(n_rng.rand() * test_size)) 191 | test_s, test_t = test_data_plain[idx] 192 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 193 | np.asarray(test_t, dtype='int32'), 194 | idx2word, 195 | np.asarray(unk_filter(test_s), dtype='int32'), 196 | encode=config['utf-8']) 197 | print '*' * 50 198 | 199 | if it % 10000 == 0: 200 | # save the 
weights. 201 | agent.save(config['path_h5'] + '/experiments.CopyWeibo.id={0}.epoch={1}.pkl'.format(tmark, echo)) 202 | 203 | # # test accuracy 204 | # progbar_tr = Progbar(2000) 205 | # 206 | # print '\n' + '__' * 50 207 | # gen, gen_pos = 0, 0 208 | # cpy, cpy_pos = 0, 0 209 | # for it, idx in enumerate(tr_idx): 210 | # train_s, train_t = train_data_plain[idx] 211 | # 212 | # c = agent.analyse_(np.asarray(train_s, dtype='int32'), 213 | # np.asarray(train_t, dtype='int32'), 214 | # idx2word) 215 | # if c[1] == 0: 216 | # # generation mode 217 | # gen += 1 218 | # gen_pos += c[0] 219 | # else: 220 | # # copy mode 221 | # cpy += 1 222 | # cpy_pos += c[0] 223 | # 224 | # progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 225 | # 226 | # logger.info('\nTraining Accuracy:' + 227 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 228 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy))) 229 | # 230 | # progbar_ts = Progbar(2000) 231 | # print '\n' + '__' * 50 232 | # gen, gen_pos = 0, 0 233 | # cpy, cpy_pos = 0, 0 234 | # for it, idx in enumerate(ts_idx): 235 | # test_s, test_t = test_data_plain[idx] 236 | # c = agent.analyse_(np.asarray(test_s, dtype='int32'), 237 | # np.asarray(test_t, dtype='int32'), 238 | # idx2word) 239 | # if c[1] == 0: 240 | # # generation mode 241 | # gen += 1 242 | # gen_pos += c[0] 243 | # else: 244 | # # copy mode 245 | # cpy += 1 246 | # cpy_pos += c[0] 247 | # 248 | # progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 249 | # 250 | # logger.info('\nTesting Accuracy:' + 251 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 252 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy))) 253 | --------------------------------------------------------------------------------
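For completeness, the commented-out evaluation block above follows the same recipe as the live one in syntest.py: agent.analyse_ returns a pair whose second element is non-zero when the example was resolved in copy mode and whose first element is the correctness score, and per-mode accuracy is the summed score over the count of examples seen in that mode. A standalone sketch of that tally, assuming the analyse_ results have already been collected as (score, mode_flag) pairs:

# Sketch only, not part of the repository: per-mode accuracy from (score, mode_flag)
# pairs, where mode_flag == 0 marks generation mode and anything else copy mode.
def tally(results):
    gen = gen_pos = 0
    cpy = cpy_pos = 0
    for score, mode_flag in results:
        if mode_flag == 0:
            gen += 1
            gen_pos += score
        else:
            cpy += 1
            cpy_pos += score
    gen_acc = 100.0 * gen_pos / gen if gen else 0.0
    cpy_acc = 100.0 * cpy_pos / cpy if cpy else 0.0
    return gen_acc, cpy_acc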