├── .idea
│   └── vcs.xml
├── LICENSE
├── README.md
├── emolga
│   ├── __init__.py
│   ├── basic
│   │   ├── __init__.py
│   │   ├── activations.py
│   │   ├── initializations.py
│   │   ├── objectives.py
│   │   └── optimizers.py
│   ├── config.py
│   ├── config_variant.py
│   ├── dataset
│   │   └── build_dataset.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── core.py
│   │   ├── embeddings.py
│   │   ├── gridlstm.py
│   │   ├── ntm_minibatch.py
│   │   └── recurrent.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── core.py
│   │   ├── core.pyc
│   │   ├── covc_encdec.py
│   │   ├── encdec.py
│   │   ├── ntm_encdec.py
│   │   ├── pointers.py
│   │   └── variational.py
│   ├── run.py
│   ├── test_lm.py
│   ├── test_nvtm.py
│   ├── test_run.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── generic_utils.py
│   │   ├── io_utils.py
│   │   ├── np_utils.py
│   │   ├── test_utils.py
│   │   └── theano_utils.py
│   └── voc.pkl
└── experiments
    ├── __init__.py
    ├── bst_dataset.py
    ├── bst_vest.py
    ├── config.py
    ├── config.pyc
    ├── copynet.py
    ├── copynet_input.py
    ├── dataset.py
    ├── lcsts_dataset.py
    ├── lcsts_rouge.py
    ├── lcsts_sample.py
    ├── lcsts_test.py
    ├── lcsts_vest.py
    ├── lcsts_vest_new.py
    ├── movie_dataset.py
    ├── syn_vest.py
    ├── syntest.py
    ├── synthetic.py
    ├── weibo_dataset.py
    └── weibo_vest.py
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Jiatao Gu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CopyNet 2 | incorporating copying mechanism in sequence-to-sequence learning 3 | -------------------------------------------------------------------------------- /emolga/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'yinpengcheng' 2 | -------------------------------------------------------------------------------- /emolga/basic/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | -------------------------------------------------------------------------------- /emolga/basic/activations.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def softmax(x): 5 | return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) 6 | 7 | 8 | def vector_softmax(x): 9 | return T.nnet.softmax(x.reshape((1, x.shape[0])))[0] 10 | 11 | 12 | def time_distributed_softmax(x): 13 | import warnings 14 | warnings.warn("time_distributed_softmax is deprecated. Just use softmax!", DeprecationWarning) 15 | return softmax(x) 16 | 17 | 18 | def softplus(x): 19 | return T.nnet.softplus(x) 20 | 21 | 22 | def relu(x): 23 | return T.nnet.relu(x) 24 | 25 | 26 | def tanh(x): 27 | return T.tanh(x) 28 | 29 | 30 | def sigmoid(x): 31 | return T.nnet.sigmoid(x) 32 | 33 | 34 | def hard_sigmoid(x): 35 | return T.nnet.hard_sigmoid(x) 36 | 37 | 38 | def linear(x): 39 | ''' 40 | The function returns the variable that is passed in, so all types work 41 | ''' 42 | return x 43 | 44 | 45 | def maxout2(x): 46 | shape = x.shape 47 | if x.ndim == 1: 48 | shape1 = T.cast(shape[0] / 2, 'int64') 49 | shape2 = T.cast(2, 'int64') 50 | x = x.reshape([shape1, shape2]) 51 | x = x.max(1) 52 | elif x.ndim == 2: 53 | shape1 = T.cast(shape[1] / 2, 'int64') 54 | shape2 = T.cast(2, 'int64') 55 | x = x.reshape([shape[0], shape1, shape2]) 56 | x = x.max(2) 57 | elif x.ndim == 3: 58 | shape1 = T.cast(shape[2] / 2, 'int64') 59 | shape2 = T.cast(2, 'int64') 60 | x = x.reshape([shape[0], shape[1], shape1, shape2]) 61 | x = x.max(3) 62 | return x 63 | 64 | 65 | from emolga.utils.generic_utils import get_from_module 66 | 67 | 68 | def get(identifier): 69 | return get_from_module(identifier, globals(), 'activation function') 70 | -------------------------------------------------------------------------------- /emolga/basic/initializations.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | import numpy as np 4 | 5 | from emolga.utils.theano_utils import sharedX, shared_zeros, shared_ones 6 | 7 | 8 | def get_fans(shape): 9 | if isinstance(shape, int): 10 | shape = (1, shape) 11 | fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:]) 12 | fan_out = shape[1] if len(shape) == 2 else shape[0] 13 | return fan_in, fan_out 14 | 15 | 16 | def uniform(shape, scale=0.1): 17 | return sharedX(np.random.uniform(low=-scale, high=scale, size=shape)) 18 | 19 | 20 | def normal(shape, scale=0.05): 21 | return sharedX(np.random.randn(*shape) * scale) 22 | 23 | 24 | def lecun_uniform(shape): 25 | ''' Reference: LeCun 98, Efficient Backprop 26 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 27 | ''' 28 | fan_in, fan_out = get_fans(shape) 29 | scale = np.sqrt(3. 
/ fan_in) 30 | return uniform(shape, scale) 31 | 32 | 33 | def glorot_normal(shape): 34 | ''' Reference: Glorot & Bengio, AISTATS 2010 35 | ''' 36 | fan_in, fan_out = get_fans(shape) 37 | s = np.sqrt(2. / (fan_in + fan_out)) 38 | return normal(shape, s) 39 | 40 | 41 | def glorot_uniform(shape): 42 | fan_in, fan_out = get_fans(shape) 43 | s = np.sqrt(6. / (fan_in + fan_out)) 44 | return uniform(shape, s) 45 | 46 | 47 | def he_normal(shape): 48 | ''' Reference: He et al., http://arxiv.org/abs/1502.01852 49 | ''' 50 | fan_in, fan_out = get_fans(shape) 51 | s = np.sqrt(2. / fan_in) 52 | return normal(shape, s) 53 | 54 | 55 | def he_uniform(shape): 56 | fan_in, fan_out = get_fans(shape) 57 | s = np.sqrt(6. / fan_in) 58 | return uniform(shape, s) 59 | 60 | 61 | def orthogonal(shape, scale=1.1): 62 | ''' From Lasagne 63 | ''' 64 | flat_shape = (shape[0], np.prod(shape[1:])) 65 | a = np.random.normal(0.0, 1.0, flat_shape) 66 | u, _, v = np.linalg.svd(a, full_matrices=False) 67 | # pick the one with the correct shape 68 | q = u if u.shape == flat_shape else v 69 | q = q.reshape(shape) 70 | return sharedX(scale * q[:shape[0], :shape[1]]) 71 | 72 | 73 | def identity(shape, scale=1): 74 | if len(shape) != 2 or shape[0] != shape[1]: 75 | raise Exception("Identity matrix initialization can only be used for 2D square matrices") 76 | else: 77 | return sharedX(scale * np.identity(shape[0])) 78 | 79 | 80 | def zero(shape): 81 | return shared_zeros(shape) 82 | 83 | 84 | def one(shape): 85 | return shared_ones(shape) 86 | 87 | from emolga.utils.generic_utils import get_from_module 88 | def get(identifier): 89 | return get_from_module(identifier, globals(), 'initialization') 90 | -------------------------------------------------------------------------------- /emolga/basic/objectives.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import theano 3 | import theano.tensor as T 4 | import numpy as np 5 | from six.moves import range 6 | 7 | if theano.config.floatX == 'float64': 8 | epsilon = 1.0e-9 9 | else: 10 | epsilon = 1.0e-7 11 | 12 | 13 | def mean_squared_error(y_true, y_pred): 14 | return T.sqr(y_pred - y_true).mean(axis=-1) 15 | 16 | 17 | def mean_absolute_error(y_true, y_pred): 18 | return T.abs_(y_pred - y_true).mean(axis=-1) 19 | 20 | 21 | def mean_absolute_percentage_error(y_true, y_pred): 22 | return T.abs_((y_true - y_pred) / T.clip(T.abs_(y_true), epsilon, np.inf)).mean(axis=-1) * 100. 23 | 24 | 25 | def mean_squared_logarithmic_error(y_true, y_pred): 26 | return T.sqr(T.log(T.clip(y_pred, epsilon, np.inf) + 1.) - T.log(T.clip(y_true, epsilon, np.inf) + 1.)).mean(axis=-1) 27 | 28 | 29 | def squared_hinge(y_true, y_pred): 30 | return T.sqr(T.maximum(1. - y_true * y_pred, 0.)).mean(axis=-1) 31 | 32 | 33 | def hinge(y_true, y_pred): 34 | return T.maximum(1. 
- y_true * y_pred, 0.).mean(axis=-1) 35 | 36 | 37 | def categorical_crossentropy(y_true, y_pred): 38 | '''Expects a binary class matrix instead of a vector of scalar classes 39 | ''' 40 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon) 41 | # scale preds so that the class probas of each sample sum to 1 42 | y_pred /= y_pred.sum(axis=-1, keepdims=True) 43 | cce = T.nnet.categorical_crossentropy(y_pred, y_true) 44 | return cce 45 | 46 | 47 | def binary_crossentropy(y_true, y_pred): 48 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon) 49 | bce = T.nnet.binary_crossentropy(y_pred, y_true).mean(axis=-1) 50 | return bce 51 | 52 | 53 | def poisson_loss(y_true, y_pred): 54 | return T.mean(y_pred - y_true * T.log(y_pred + epsilon), axis=-1) 55 | 56 | #################################################### 57 | # Variational Auto-encoder 58 | 59 | def gaussian_kl_divergence(mean, ln_var): 60 | """Computes the KL-divergence of Gaussian variables from the standard one. 61 | 62 | Given two variable ``mean`` representing :math:`\\mu` and ``ln_var`` 63 | representing :math:`\\log(\\sigma^2)`, this function returns a variable 64 | representing the KL-divergence between the given multi-dimensional Gaussian 65 | :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)` 66 | 67 | .. math:: 68 | 69 | D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)), 70 | 71 | where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2` 72 | and :math:`I` is an identity matrix. 73 | 74 | Args: 75 | mean (~chainer.Variable): A variable representing mean of given 76 | gaussian distribution, :math:`\\mu`. 77 | ln_var (~chainer.Variable): A variable representing logarithm of 78 | variance of given gaussian distribution, :math:`\\log(\\sigma^2)`. 79 | 80 | Returns: 81 | ~chainer.Variable: A variable representing KL-divergence between 82 | given gaussian distribution and the standard gaussian. 
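    For example, ``mean = 0`` with ``ln_var = 0`` (already the standard normal) gives a
    divergence of 0, while ``mean = 1`` with ``ln_var = 0`` gives 0.5 per dimension,
    matching 0.5 * (mu^2 + sigma^2 - log(sigma^2) - 1) as computed below.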
83 | 84 | """ 85 | var = T.exp(ln_var) 86 | return 0.5 * T.sum(mean * mean + var - ln_var - 1, 1) 87 | 88 | 89 | # aliases 90 | mse = MSE = mean_squared_error 91 | mae = MAE = mean_absolute_error 92 | mape = MAPE = mean_absolute_percentage_error 93 | msle = MSLE = mean_squared_logarithmic_error 94 | gkl = GKL = gaussian_kl_divergence 95 | 96 | from emolga.utils.generic_utils import get_from_module 97 | def get(identifier): 98 | return get_from_module(identifier, globals(), 'objective') 99 | -------------------------------------------------------------------------------- /emolga/basic/optimizers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import theano 3 | import sys 4 | 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams 6 | import theano.tensor as T 7 | import logging 8 | 9 | from emolga.utils.theano_utils import shared_zeros, shared_scalar, floatX 10 | from emolga.utils.generic_utils import get_from_module 11 | from six.moves import zip 12 | from copy import copy, deepcopy 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def clip_norm(g, c, n): 18 | if c > 0: 19 | g = T.switch(T.ge(n, c), g * c / n, g) 20 | return g 21 | 22 | 23 | def kl_divergence(p, p_hat): 24 | return p_hat - p + p * T.log(p / p_hat) 25 | 26 | 27 | class Optimizer(object): 28 | def __init__(self, **kwargs): 29 | self.__dict__.update(kwargs) 30 | self.updates = [] 31 | self.save_parm = [] 32 | 33 | def add(self, v): 34 | self.save_parm += [v] 35 | 36 | def get_state(self): 37 | return [u[0].get_value() for u in self.updates] 38 | 39 | def set_state(self, value_list): 40 | assert len(self.updates) == len(value_list) 41 | for u, v in zip(self.updates, value_list): 42 | u[0].set_value(floatX(v)) 43 | 44 | def get_updates(self, params, loss): 45 | raise NotImplementedError 46 | 47 | def get_gradients(self, loss, params): 48 | """ 49 | Consider the situation that gradient is weighted. 50 | """ 51 | if isinstance(loss, list): 52 | grads = T.grad(loss[0], params, consider_constant=loss[1:]) # gradient of loss 53 | else: 54 | grads = T.grad(loss, params) 55 | 56 | if hasattr(self, 'clipnorm') and self.clipnorm > 0: 57 | print 'use gradient clipping!!' 
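# global-norm clipping: take the L2 norm over all gradients and let clip_norm rescale
# each gradient by clipnorm / norm whenever the norm exceeds the threshold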
58 | norm = T.sqrt(sum([T.sum(g ** 2) for g in grads])) 59 | grads = [clip_norm(g, self.clipnorm, norm) for g in grads] 60 | 61 | return grads 62 | 63 | def get_config(self): 64 | return {"name": self.__class__.__name__} 65 | 66 | 67 | class SGD(Optimizer): 68 | 69 | def __init__(self, lr=0.05, momentum=0.9, decay=0.01, nesterov=True, *args, **kwargs): 70 | super(SGD, self).__init__(**kwargs) 71 | self.__dict__.update(locals()) 72 | self.iterations = shared_scalar(0) 73 | self.lr = shared_scalar(lr) 74 | self.momentum = shared_scalar(momentum) 75 | 76 | def get_updates(self, params, loss): 77 | grads = self.get_gradients(loss, params) 78 | lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations)) 79 | self.updates = [(self.iterations, self.iterations + 1.)] 80 | 81 | for p, g in zip(params, grads): 82 | m = shared_zeros(p.get_value().shape) # momentum 83 | v = self.momentum * m - lr * g # velocity 84 | self.updates.append((m, v)) 85 | 86 | if self.nesterov: 87 | new_p = p + self.momentum * v - lr * g 88 | else: 89 | new_p = p + v 90 | 91 | self.updates.append((p, new_p)) # apply constraints 92 | return self.updates 93 | 94 | def get_config(self): 95 | return {"name": self.__class__.__name__, 96 | "lr": float(self.lr.get_value()), 97 | "momentum": float(self.momentum.get_value()), 98 | "decay": float(self.decay.get_value()), 99 | "nesterov": self.nesterov} 100 | 101 | 102 | class RMSprop(Optimizer): 103 | def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs): 104 | super(RMSprop, self).__init__(**kwargs) 105 | self.__dict__.update(locals()) 106 | self.lr = shared_scalar(lr) 107 | self.rho = shared_scalar(rho) 108 | self.iterations = shared_scalar(0) 109 | 110 | def get_updates(self, params, loss): 111 | grads = self.get_gradients(loss, params) 112 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 113 | self.updates = [(self.iterations, self.iterations + 1.)] 114 | 115 | for p, g, a in zip(params, grads, accumulators): 116 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 117 | self.updates.append((a, new_a)) 118 | 119 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 120 | self.updates.append((p, new_p)) # apply constraints 121 | return self.updates 122 | 123 | def get_config(self): 124 | return {"name": self.__class__.__name__, 125 | "lr": float(self.lr.get_value()), 126 | "rho": float(self.rho.get_value()), 127 | "epsilon": self.epsilon} 128 | 129 | 130 | class Adagrad(Optimizer): 131 | def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs): 132 | super(Adagrad, self).__init__(**kwargs) 133 | self.__dict__.update(locals()) 134 | self.lr = shared_scalar(lr) 135 | 136 | def get_updates(self, params, constraints, loss): 137 | grads = self.get_gradients(loss, params) 138 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 139 | self.updates = [] 140 | 141 | for p, g, a, c in zip(params, grads, accumulators, constraints): 142 | new_a = a + g ** 2 # update accumulator 143 | self.updates.append((a, new_a)) 144 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 145 | self.updates.append((p, c(new_p))) # apply constraints 146 | return self.updates 147 | 148 | def get_config(self): 149 | return {"name": self.__class__.__name__, 150 | "lr": float(self.lr.get_value()), 151 | "epsilon": self.epsilon} 152 | 153 | 154 | class Adadelta(Optimizer): 155 | ''' 156 | Reference: http://arxiv.org/abs/1212.5701 157 | ''' 158 | def __init__(self, lr=0.1, rho=0.95, epsilon=1e-6, *args, **kwargs): 159 | super(Adadelta, 
self).__init__(**kwargs) 160 | self.__dict__.update(locals()) 161 | self.lr = shared_scalar(lr) 162 | self.iterations = shared_scalar(0) 163 | 164 | def get_updates(self, params, loss): 165 | grads = self.get_gradients(loss, params) 166 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 167 | delta_accumulators = [shared_zeros(p.get_value().shape) for p in params] 168 | # self.updates = [] 169 | self.updates = [(self.iterations, self.iterations + 1.)] 170 | 171 | for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): 172 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 173 | self.updates.append((a, new_a)) 174 | 175 | # use the new accumulator and the *old* delta_accumulator 176 | update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + 177 | self.epsilon) 178 | 179 | new_p = p - self.lr * update 180 | self.updates.append((p, new_p)) 181 | 182 | # update delta_accumulator 183 | new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2 184 | self.updates.append((d_a, new_d_a)) 185 | return self.updates 186 | 187 | def get_config(self): 188 | return {"name": self.__class__.__name__, 189 | "lr": float(self.lr.get_value()), 190 | "rho": self.rho, 191 | "epsilon": self.epsilon} 192 | 193 | 194 | class Adam(Optimizer): # new Adam is designed for our purpose. 195 | ''' 196 | Reference: http://arxiv.org/abs/1412.6980v8 197 | 198 | Default parameters follow those provided in the original paper. 199 | We add Gaussian Noise to improve the performance. 200 | ''' 201 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, save=False, rng=None, *args, **kwargs): 202 | super(Adam, self).__init__(**kwargs) 203 | self.__dict__.update(locals()) 204 | print locals() 205 | 206 | self.iterations = shared_scalar(0, name='iteration') 207 | self.lr = shared_scalar(lr, name='lr') 208 | self.rng = MRG_RandomStreams(use_cuda=True) 209 | self.noise = [] 210 | self.forget = dict() 211 | self.rng = rng 212 | 213 | self.add(self.iterations) 214 | self.add(self.lr) 215 | 216 | def add_noise(self, param): 217 | if param.name not in self.noise: 218 | logger.info('add gradient noise to {}'.format(param)) 219 | self.noise += [param.name] 220 | 221 | def add_forget(self, param): 222 | if param.name not in self.forget: 223 | logger.info('add forgetting list to {}'.format(param)) 224 | self.forget[param.name] = theano.shared(param.get_value()) 225 | 226 | def get_updates(self, params, loss): 227 | grads = self.get_gradients(loss, params) 228 | self.updates = [(self.iterations, self.iterations + 1.)] 229 | self.pu = [] 230 | 231 | t = self.iterations + 1 232 | lr_t = self.lr * T.sqrt(1 - self.beta_2**t) / (1 - self.beta_1**t) 233 | for p, g in zip(params, grads): 234 | m = theano.shared(p.get_value() * 0., name=p.name + '_m') # zero init of moment 235 | v = theano.shared(p.get_value() * 0., name=p.name + '_v') # zero init of velocity 236 | 237 | self.add(m) 238 | self.add(v) 239 | 240 | # g_noise = self.rng.normal(g.shape, 0, T.sqrt(0.005 * t ** (-0.55)), dtype='float32') 241 | 242 | # if p.name in self.noise: 243 | # g_deviated = g + g_noise 244 | # else: 245 | # g_deviated = g 246 | 247 | g_deviated = g # + g_noise 248 | m_t = (self.beta_1 * m) + (1 - self.beta_1) * g_deviated 249 | v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g_deviated**2) 250 | u_t = -lr_t * m_t / (T.sqrt(v_t) + self.epsilon) 251 | p_t = p + u_t 252 | 253 | # # memory reformatting! 
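# (disabled experiment: blend each updated parameter with the snapshot registered via add_forget before applying it)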
254 | # if p.name in self.forget: 255 | # p_t = (1 - p_mem) * p_t + p_mem * self.forget[p.name] 256 | # p_s = (1 - p_fgt) * p_t + p_fgt * self.forget[p.name] 257 | # self.updates.append((self.forget[p.name], p_s)) 258 | 259 | self.updates.append((m, m_t)) 260 | self.updates.append((v, v_t)) 261 | self.updates.append((p, p_t)) # apply constraints 262 | self.pu.append((p, p_t - p)) 263 | 264 | if self.save: 265 | return self.updates, self.pu 266 | return self.updates 267 | 268 | # aliases 269 | sgd = SGD 270 | rmsprop = RMSprop 271 | adagrad = Adagrad 272 | adadelta = Adadelta 273 | adam = Adam 274 | 275 | 276 | def get(identifier, kwargs=None): 277 | return get_from_module(identifier, globals(), 'optimizer', instantiate=True, 278 | kwargs=kwargs) 279 | -------------------------------------------------------------------------------- /emolga/config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | import os 3 | import os.path as path 4 | 5 | def setup_ptb2(): 6 | # pretraining setting up. 7 | # get the lm_config. 8 | 9 | config = dict() 10 | config['on_unused_input'] = 'ignore' 11 | config['seed'] = 3030029828 12 | config['level'] = 'DEBUG' 13 | 14 | # config['model'] = 'RNNLM' 15 | # config['model'] = 'VAE' 16 | # config['model'] = 'RNNLM' #'Helmholtz' 17 | config['model'] = 'HarX' 18 | config['highway'] = False 19 | config['use_noise'] = False 20 | 21 | config['optimizer'] = 'adam' #'adadelta' 22 | # config['lr'] = 0.1 23 | 24 | # config['optimizer'] = 'sgd' 25 | 26 | # dataset 27 | config['path'] = path.realpath(path.curdir) + '/' # '/home/thoma/Work/Dial-DRL/' 28 | config['vocabulary_set'] = config['path'] + 'dataset/ptbcorpus/voc.pkl' 29 | config['dataset'] = config['path'] + 'dataset/ptbcorpus/data_train.pkl' 30 | config['dataset_valid'] = config['path'] + 'dataset/ptbcorpus/data_valid.pkl' 31 | config['dataset_test'] = config['path'] + 'dataset/ptbcorpus/data_test.pkl' 32 | # output hdf5 file place. 
33 | config['path_h5'] = config['path'] + 'H5' 34 | if not os.path.exists(config['path_h5']): 35 | os.mkdir(config['path_h5']) 36 | 37 | # output log place 38 | config['path_log'] = config['path'] + 'Logs' 39 | if not os.path.exists(config['path_log']): 40 | os.mkdir(config['path_log']) 41 | 42 | # size 43 | config['batch_size'] = 20 44 | config['eval_batch_size'] = 20 45 | config['mode'] = 'RNN' # NTM 46 | config['binary'] = False 47 | 48 | # Encoder: dimension 49 | config['enc_embedd_dim'] = 300 50 | config['enc_hidden_dim'] = 300 51 | config['enc_contxt_dim'] = 350 52 | config['encoder'] = 'RNN' 53 | config['pooling'] = False 54 | 55 | # Encoder: Model 56 | config['bidirectional'] = False # True 57 | config['decposterior'] = True 58 | config['enc_use_contxt'] = False 59 | 60 | # Agent: dimension 61 | config['action_dim'] = 50 62 | config['output_dim'] = 300 63 | 64 | # Decoder: dimension 65 | config['dec_embedd_dim'] = 300 66 | config['dec_hidden_dim'] = 300 67 | config['dec_contxt_dim'] = 300 68 | 69 | # Decoder: Model 70 | config['shared_embed'] = False 71 | config['use_input'] = False 72 | config['bias_code'] = False # True 73 | config['dec_use_contxt'] = True 74 | config['deep_out'] = False 75 | config['deep_out_activ'] = 'tanh' # maxout2 76 | config['bigram_predict'] = False 77 | config['context_predict'] = True # False 78 | config['leaky_predict'] = False # True 79 | config['dropout'] = 0.3 80 | 81 | # Decoder: sampling 82 | config['max_len'] = 88 # 15 83 | config['sample_beam'] = 10 84 | config['sample_stoch'] = False 85 | config['sample_argmax'] = False 86 | 87 | # Auto-Encoder 88 | config['nonlinear_A'] = True 89 | config['nonlinear_B'] = False 90 | 91 | # VAE/Helmholtz: Model 92 | config['repeats'] = 10 93 | config['eval_repeats'] = 10 94 | config['eval_N'] = 10 95 | 96 | config['variant_control'] = False 97 | config['factor'] = 10. 98 | config['mult_q'] = 10. 99 | 100 | print 'setup ok.' 
101 | return config 102 | 103 | 104 | -------------------------------------------------------------------------------- /emolga/config_variant.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | from config import setup_ptb2 3 | setup = setup_ptb2 4 | 5 | """ 6 | This file is for small variant fix on original 7 | """ 8 | 9 | 10 | def setup_bienc(config=None): 11 | if config is None: 12 | config = setup() 13 | print 'make some modification' 14 | 15 | config['bidirectional'] = True 16 | config['decposterior'] = False 17 | return config 18 | 19 | 20 | def setup_dim(config=None): 21 | if config is None: 22 | config = setup() 23 | print 'make some modification' 24 | 25 | config['enc_embedd_dim'] = 300 26 | config['enc_hidden_dim'] = 300 27 | config['action_dim'] = 100 28 | 29 | config['dec_embedd_dim'] = 300 30 | config['dec_hidden_dim'] = 300 31 | config['dec_contxt_dim'] = 300 32 | return config 33 | 34 | 35 | def setup_rep(config=None): 36 | if config is None: 37 | config = setup() 38 | print 'make some modification' 39 | 40 | config['repeats'] = 5 41 | return config 42 | 43 | 44 | def setup_opt(config=None): 45 | if config is None: 46 | config = setup() 47 | print 'make some modification' 48 | 49 | config['optimizer'] = 'Adam' 50 | return config -------------------------------------------------------------------------------- /emolga/dataset/build_dataset.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | import numpy as np 3 | import numpy.random as rng 4 | import cPickle 5 | import pprint 6 | import sys 7 | 8 | from collections import OrderedDict 9 | from fuel import datasets 10 | from fuel import transformers 11 | from fuel import schemes 12 | from fuel import streams 13 | 14 | 15 | def serialize_to_file(obj, path, protocol=cPickle.HIGHEST_PROTOCOL): 16 | f = open(path, 'wb') 17 | cPickle.dump(obj, f, protocol=protocol) 18 | f.close() 19 | 20 | 21 | def show_txt(array, path): 22 | f = open(path, 'w') 23 | for line in array: 24 | f.write(' '.join(line) + '\n') 25 | 26 | f.close() 27 | 28 | 29 | def divide_dataset(dataset, test_size, max_size): 30 | train_set = dict() 31 | test_set = dict() 32 | 33 | for w in dataset: 34 | train_set[w] = dataset[w][test_size:max_size].astype('int32') 35 | test_set[w] = dataset[w][:test_size].astype('int32') 36 | 37 | return train_set, test_set 38 | 39 | 40 | def deserialize_from_file(path): 41 | f = open(path, 'rb') 42 | obj = cPickle.load(f) 43 | f.close() 44 | return obj 45 | 46 | 47 | def build_fuel(data): 48 | # create fuel dataset. 
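# wrap the list of index sequences in a Fuel IndexableDataset and attach a shuffled
# example scheme so every epoch iterates the samples in a new order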
49 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('data', data)])) 50 | dataset.example_iteration_scheme \ 51 | = schemes.ShuffledExampleScheme(dataset.num_examples) 52 | return dataset, len(data) 53 | 54 | 55 | def obtain_stream(dataset, batch_size, size=1): 56 | if size == 1: 57 | data_stream = dataset.get_example_stream() 58 | data_stream = transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size)) 59 | 60 | # add padding and masks to the dataset 61 | data_stream = transformers.Padding(data_stream, mask_sources=('data')) 62 | return data_stream 63 | else: 64 | data_streams = [dataset.get_example_stream() for _ in xrange(size)] 65 | data_streams = [transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size)) 66 | for data_stream in data_streams] 67 | data_streams = [transformers.Padding(data_stream, mask_sources=('data')) for data_stream in data_streams] 68 | return data_streams 69 | 70 | def build_ptb(): 71 | path = './ptbcorpus/' 72 | print path 73 | # make the dataset and vocabulary 74 | X_train = [l.split() for l in open(path + 'ptb.train.txt').readlines()] 75 | X_test = [l.split() for l in open(path + 'ptb.test.txt').readlines()] 76 | X_valid = [l.split() for l in open(path + 'ptb.valid.txt').readlines()] 77 | 78 | X = X_train + X_test + X_valid 79 | idx2word = dict(enumerate(set([w for l in X for w in l]), 1)) 80 | idx2word[0] = '' 81 | word2idx = {v: k for k, v in idx2word.items()} 82 | ixwords_train = [[word2idx[w] for w in l] for l in X_train] 83 | ixwords_test = [[word2idx[w] for w in l] for l in X_test] 84 | ixwords_valid = [[word2idx[w] for w in l] for l in X_valid] 85 | ixwords_tv = [[word2idx[w] for w in l] for l in (X_train + X_valid)] 86 | 87 | max_len = max([len(w) for w in X_train]) 88 | print max_len 89 | # serialization: 90 | # serialize_to_file(ixwords_train, path + 'data_train.pkl') 91 | # serialize_to_file(ixwords_test, path + 'data_test.pkl') 92 | # serialize_to_file(ixwords_valid, path + 'data_valid.pkl') 93 | # serialize_to_file(ixwords_tv, path + 'data_tv.pkl') 94 | # serialize_to_file([idx2word, word2idx], path + 'voc.pkl') 95 | # show_txt(X, 'data.txt') 96 | print 'save done.' 97 | 98 | 99 | def filter_unk(X, min_freq=5): 100 | voc = dict() 101 | for l in X: 102 | for w in l: 103 | if w not in voc: 104 | voc[w] = 1 105 | else: 106 | voc[w] += 1 107 | 108 | word2idx = dict() 109 | word2idx[''] = 0 110 | id2word = dict() 111 | id2word[0] = '' 112 | 113 | at = 1 114 | for w in voc: 115 | if voc[w] > min_freq: 116 | word2idx[w] = at 117 | id2word[at] = w 118 | at += 1 119 | 120 | word2idx[''] = at 121 | id2word[at] = '' 122 | return word2idx, id2word 123 | 124 | 125 | def build_msr(): 126 | # path = '/home/thoma/Work/Dial-DRL/dataset/MSRSCC/' 127 | path = '/Users/jiataogu/Work/Dial-DRL/dataset/MSRSCC/' 128 | print path 129 | 130 | X = [l.split() for l in open(path + 'train.txt').readlines()] 131 | word2idx, idx2word = filter_unk(X, min_freq=5) 132 | print 'vocabulary size={0}. 
{1} samples'.format(len(word2idx), len(X)) 133 | 134 | mean_len = np.mean([len(w) for w in X]) 135 | print 'mean len = {}'.format(mean_len) 136 | 137 | ixwords = [[word2idx[w] 138 | if w in word2idx 139 | else word2idx[''] 140 | for w in l] for l in X] 141 | print ixwords[0] 142 | # serialization: 143 | serialize_to_file(ixwords, path + 'data_train.pkl') 144 | 145 | 146 | if __name__ == '__main__': 147 | build_msr() 148 | # build_ptb() 149 | # build_dataset() 150 | # game = GuessOrder(size=8) 151 | # q = 'Is there any number smaller de than 6 in the last 3 numbers ?' 152 | # print game.easy_parse(q) 153 | 154 | -------------------------------------------------------------------------------- /emolga/layers/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'yinpengcheng' 2 | -------------------------------------------------------------------------------- /emolga/layers/attention.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | from .core import * 3 | """ 4 | Attention Model. 5 | <::: Two kinds of attention models ::::> 6 | -- Linear Transformation 7 | -- Inner Product 8 | """ 9 | 10 | 11 | class Attention(Layer): 12 | def __init__(self, target_dim, source_dim, hidden_dim, 13 | init='glorot_uniform', name='attention', 14 | coverage=False, max_len=50, 15 | shared=False): 16 | 17 | super(Attention, self).__init__() 18 | self.init = initializations.get(init) 19 | self.softmax = activations.get('softmax') 20 | self.tanh = activations.get('tanh') 21 | self.target_dim = target_dim 22 | self.source_dim = source_dim 23 | self.hidden_dim = hidden_dim 24 | self.max_len = max_len 25 | self.coverage = coverage 26 | 27 | if coverage: 28 | print 'Use Coverage Trick!' 29 | 30 | self.Wa = self.init((self.target_dim, self.hidden_dim)) 31 | self.Ua = self.init((self.source_dim, self.hidden_dim)) 32 | self.va = self.init((self.hidden_dim, 1)) 33 | 34 | self.Wa.name, self.Ua.name, self.va.name = \ 35 | '{}_Wa'.format(name), '{}_Ua'.format(name), '{}_va'.format(name) 36 | self.params = [self.Wa, self.Ua, self.va] 37 | if coverage: 38 | self.Ca = self.init((1, self.hidden_dim)) 39 | self.Ca.name = '{}_Ca'.format(name) 40 | self.params += [self.Ca] 41 | 42 | def __call__(self, X, S, 43 | Smask=None, 44 | return_log=False, 45 | Cov=None): 46 | assert X.ndim + 1 == S.ndim, 'source should be one more dimension than target.' 47 | # X is the key: (nb_samples, x_dim) 48 | # S is the source (nb_samples, maxlen_s, ctx_dim) 49 | # Cov is the coverage vector (nb_samples, maxlen_s) 50 | 51 | if X.ndim == 1: 52 | X = X[None, :] 53 | S = S[None, :, :] 54 | if not Smask: 55 | Smask = Smask[None, :] 56 | 57 | Eng = dot(X[:, None, :], self.Wa) + dot(S, self.Ua) # (nb_samples, source_num, hidden_dims) 58 | Eng = self.tanh(Eng) 59 | # location aware: 60 | if self.coverage: 61 | Eng += dot(Cov[:, :, None], self.Ca) # (nb_samples, source_num, hidden_dims) 62 | 63 | Eng = dot(Eng, self.va) 64 | Eng = Eng[:, :, 0] # ? (nb_samples, source_num) 65 | 66 | if Smask is not None: 67 | # I want to use mask! 
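# masked softmax computed in log space: logSumExp normalises the energies over the
# unmasked source positions, and multiplying by Smask zeroes out the padded ones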
68 | EngSum = logSumExp(Eng, axis=1, mask=Smask) 69 | if return_log: 70 | return (Eng - EngSum) * Smask 71 | else: 72 | return T.exp(Eng - EngSum) * Smask 73 | else: 74 | if return_log: 75 | return T.log(self.softmax(Eng)) 76 | else: 77 | return self.softmax(Eng) 78 | 79 | 80 | class CosineAttention(Layer): 81 | def __init__(self, target_dim, source_dim, 82 | init='glorot_uniform', 83 | use_pipe=True, 84 | name='attention'): 85 | 86 | super(CosineAttention, self).__init__() 87 | self.init = initializations.get(init) 88 | self.softmax = activations.get('softmax') 89 | self.softplus = activations.get('softplus') 90 | self.tanh = activations.get('tanh') 91 | self.use_pipe = use_pipe 92 | 93 | self.target_dim = target_dim 94 | self.source_dim = source_dim 95 | 96 | # pipe 97 | if self.use_pipe: 98 | self.W_key = Dense(self.target_dim, self.source_dim, name='W_key') 99 | else: 100 | assert target_dim == source_dim 101 | self.W_key = Identity(name='W_key') 102 | self._add(self.W_key) 103 | 104 | # sharpen 105 | # self.W_beta = Dense(self.target_dim, 1, name='W_beta') 106 | # dio-sharpen 107 | # self.W_beta = Dense(self.target_dim, self.source_dim, name='W_beta') 108 | # self._add(self.W_beta) 109 | 110 | # self.gamma = self.init((source_dim, )) 111 | # self.gamma = self.init((target_dim, source_dim)) 112 | # self.gamma.name = 'o_gamma' 113 | # self.params += [self.gamma] 114 | 115 | def __call__(self, X, S, Smask=None, return_log=False): 116 | assert X.ndim + 1 == S.ndim, 'source should be one more dimension than target.' 117 | 118 | if X.ndim == 1: 119 | X = X[None, :] 120 | S = S[None, :, :] 121 | if not Smask: 122 | Smask = Smask[None, :] 123 | 124 | key = self.W_key(X) # (nb_samples, source_dim) 125 | # beta = self.softplus(self.W_beta(X)) # (nb_samples, source_dim) 126 | 127 | Eng = dot_2d(key, S) #, g=self.gamma) 128 | # Eng = cosine_sim2d(key, S) # (nb_samples, source_num) 129 | # Eng = T.repeat(beta, Eng.shape[1], axis=1) * Eng 130 | 131 | if Smask is not None: 132 | # I want to use mask! 133 | EngSum = logSumExp(Eng, axis=1, mask=Smask) 134 | if return_log: 135 | return (Eng - EngSum) * Smask 136 | else: 137 | return T.exp(Eng - EngSum) * Smask 138 | else: 139 | if return_log: 140 | return T.log(self.softmax(Eng)) 141 | else: 142 | return self.softmax(Eng) 143 | 144 | -------------------------------------------------------------------------------- /emolga/layers/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from emolga.utils.theano_utils import * 4 | import emolga.basic.initializations as initializations 5 | import emolga.basic.activations as activations 6 | 7 | 8 | class Layer(object): 9 | def __init__(self): 10 | self.params = [] 11 | self.layers = [] 12 | self.monitor = {} 13 | self.watchlist = [] 14 | 15 | def init_updates(self): 16 | self.updates = [] 17 | 18 | def _monitoring(self): 19 | # add monitoring variables 20 | for l in self.layers: 21 | for v in l.monitor: 22 | name = v + '@' + l.name 23 | print name 24 | self.monitor[name] = l.monitor[v] 25 | 26 | def __call__(self, X, *args, **kwargs): 27 | return X 28 | 29 | def _add(self, layer): 30 | if layer: 31 | self.layers.append(layer) 32 | self.params += layer.params 33 | 34 | def supports_masked_input(self): 35 | ''' Whether or not this layer respects the output mask of its previous layer in its calculations. 
If you try 36 | to attach a layer that does *not* support masked_input to a layer that gives a non-None output_mask() that is 37 | an error''' 38 | return False 39 | 40 | def get_output_mask(self, train=None): 41 | ''' 42 | For some models (such as RNNs) you want a way of being able to mark some output data-points as 43 | "masked", so they are not used in future calculations. In such a model, get_output_mask() should return a mask 44 | of one less dimension than get_output() (so if get_output is (nb_samples, nb_timesteps, nb_dimensions), then the mask 45 | is (nb_samples, nb_timesteps), with a one for every unmasked datapoint, and a zero for every masked one. 46 | 47 | If there is *no* masking then it shall return None. For instance if you attach an Activation layer (they support masking) 48 | to a layer with an output_mask, then that Activation shall also have an output_mask. If you attach it to a layer with no 49 | such mask, then the Activation's get_output_mask shall return None. 50 | 51 | Some emolga have an output_mask even if their input is unmasked, notably Embedding which can turn the entry "0" into 52 | a mask. 53 | ''' 54 | return None 55 | 56 | def set_weights(self, weights): 57 | for p, w in zip(self.params, weights): 58 | if p.eval().shape != w.shape: 59 | raise Exception("Layer shape %s not compatible with weight shape %s." % (p.eval().shape, w.shape)) 60 | p.set_value(floatX(w)) 61 | 62 | def get_weights(self): 63 | weights = [] 64 | for p in self.params: 65 | weights.append(p.get_value()) 66 | return weights 67 | 68 | def get_params(self): 69 | return self.params 70 | 71 | def set_name(self, name): 72 | for i in range(len(self.params)): 73 | if self.params[i].name is None: 74 | self.params[i].name = '%s_p%d' % (name, i) 75 | else: 76 | self.params[i].name = name + '_' + self.params[i].name 77 | self.name = name 78 | 79 | 80 | class MaskedLayer(Layer): 81 | ''' 82 | If your layer trivially supports masking (by simply copying the input mask to the output), then subclass MaskedLayer 83 | instead of Layer, and make sure that you incorporate the input mask into your calculation of get_output() 84 | ''' 85 | def supports_masked_input(self): 86 | return True 87 | 88 | 89 | class Identity(Layer): 90 | def __init__(self, name='Identity'): 91 | super(Identity, self).__init__() 92 | if name is not None: 93 | self.set_name(name) 94 | 95 | def __call__(self, X): 96 | return X 97 | 98 | 99 | class Dense(Layer): 100 | def __init__(self, input_dim, output_dim, init='glorot_uniform', activation='tanh', name='Dense', 101 | learn_bias=True, negative_bias=False): 102 | 103 | super(Dense, self).__init__() 104 | self.init = initializations.get(init) 105 | self.activation = activations.get(activation) 106 | self.input_dim = input_dim 107 | self.output_dim = output_dim 108 | self.linear = (activation == 'linear') 109 | 110 | # self.input = T.matrix() 111 | self.W = self.init((self.input_dim, self.output_dim)) 112 | if not negative_bias: 113 | self.b = shared_zeros((self.output_dim)) 114 | else: 115 | self.b = shared_ones((self.output_dim)) 116 | 117 | self.learn_bias = learn_bias 118 | if self.learn_bias: 119 | self.params = [self.W, self.b] 120 | else: 121 | self.params = [self.W] 122 | 123 | if name is not None: 124 | self.set_name(name) 125 | 126 | def set_name(self, name): 127 | self.W.name = '%s_W' % name 128 | self.b.name = '%s_b' % name 129 | 130 | def __call__(self, X): 131 | output = self.activation(T.dot(X, self.W) + 4. 
* self.b) 132 | return output 133 | 134 | def reverse(self, Y): 135 | assert self.linear 136 | 137 | output = T.dot((Y - self.b), self.W.T) 138 | return output 139 | 140 | 141 | class Dense2(Layer): 142 | def __init__(self, input_dim1, input_dim2, output_dim, init='glorot_uniform', activation='tanh', name='Dense', learn_bias=True): 143 | 144 | super(Dense2, self).__init__() 145 | self.init = initializations.get(init) 146 | self.activation = activations.get(activation) 147 | self.input_dim1 = input_dim1 148 | self.input_dim2 = input_dim2 149 | self.output_dim = output_dim 150 | self.linear = (activation == 'linear') 151 | 152 | # self.input = T.matrix() 153 | 154 | self.W1 = self.init((self.input_dim1, self.output_dim)) 155 | self.W2 = self.init((self.input_dim2, self.output_dim)) 156 | self.b = shared_zeros((self.output_dim)) 157 | 158 | self.learn_bias = learn_bias 159 | if self.learn_bias: 160 | self.params = [self.W1, self.W2, self.b] 161 | else: 162 | self.params = [self.W1, self.W2] 163 | 164 | if name is not None: 165 | self.set_name(name) 166 | 167 | def set_name(self, name): 168 | self.W1.name = '%s_W1' % name 169 | self.W2.name = '%s_W2' % name 170 | self.b.name = '%s_b' % name 171 | 172 | def __call__(self, X1, X2): 173 | output = self.activation(T.dot(X1, self.W1) + T.dot(X2, self.W2) + self.b) 174 | return output 175 | 176 | 177 | class Constant(Layer): 178 | def __init__(self, input_dim, output_dim, init=None, activation='tanh', name='Bias'): 179 | 180 | super(Constant, self).__init__() 181 | assert input_dim == output_dim, 'Bias Layer needs to have the same input/output nodes.' 182 | 183 | self.init = initializations.get(init) 184 | self.activation = activations.get(activation) 185 | self.input_dim = input_dim 186 | self.output_dim = output_dim 187 | 188 | self.b = shared_zeros(self.output_dim) 189 | self.params = [self.b] 190 | 191 | if name is not None: 192 | self.set_name(name) 193 | 194 | def set_name(self, name): 195 | self.b.name = '%s_b' % name 196 | 197 | def __call__(self, X=None): 198 | output = self.activation(self.b) 199 | if X: 200 | L = X.shape[0] 201 | output = T.extra_ops.repeat(output[None, :], L, axis=0) 202 | return output 203 | 204 | 205 | class MemoryLinear(Layer): 206 | def __init__(self, input_dim, input_wdth, init='glorot_uniform', 207 | activation='tanh', name='Bias', has_input=True): 208 | super(MemoryLinear, self).__init__() 209 | 210 | self.init = initializations.get(init) 211 | self.activation = activations.get(activation) 212 | self.input_dim = input_dim 213 | self.input_wdth = input_wdth 214 | 215 | self.b = self.init((self.input_dim, self.input_wdth)) 216 | self.params = [self.b] 217 | 218 | if has_input: 219 | self.P = self.init((self.input_dim, self.input_wdth)) 220 | self.params += [self.P] 221 | 222 | if name is not None: 223 | self.set_name(name) 224 | 225 | def __call__(self, X=None): 226 | out = self.b[None, :, :] 227 | if X: 228 | out += self.P[None, :, :] * X 229 | return self.activation(out) 230 | 231 | 232 | class Dropout(MaskedLayer): 233 | """ 234 | Hinton's dropout. 235 | """ 236 | def __init__(self, rng=None, p=1., name=None): 237 | super(Dropout, self).__init__() 238 | self.p = p 239 | self.rng = rng 240 | 241 | def __call__(self, X, train=True): 242 | if self.p > 0.: 243 | retain_prob = 1. 
- self.p 244 | if train: 245 | X *= self.rng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) 246 | else: 247 | X *= retain_prob 248 | return X 249 | 250 | 251 | class Activation(MaskedLayer): 252 | """ 253 | Apply an activation function to an output. 254 | """ 255 | def __init__(self, activation): 256 | super(Activation, self).__init__() 257 | self.activation = activations.get(activation) 258 | 259 | def __call__(self, X): 260 | return self.activation(X) 261 | 262 | -------------------------------------------------------------------------------- /emolga/layers/embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .core import Layer 4 | from emolga.utils.theano_utils import * 5 | import emolga.basic.initializations as initializations 6 | 7 | 8 | class Embedding(Layer): 9 | ''' 10 | Turn positive integers (indexes) into denses vectors of fixed size. 11 | eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]] 12 | 13 | @input_dim: size of vocabulary (highest input integer + 1) 14 | @out_dim: size of dense representation 15 | ''' 16 | 17 | def __init__(self, input_dim, output_dim, init='uniform', name=None): 18 | 19 | super(Embedding, self).__init__() 20 | self.init = initializations.get(init) 21 | self.input_dim = input_dim 22 | self.output_dim = output_dim 23 | 24 | self.W = self.init((self.input_dim, self.output_dim)) 25 | 26 | self.params = [self.W] 27 | 28 | if name is not None: 29 | self.set_name(name) 30 | 31 | def get_output_mask(self, X): 32 | return T.ones_like(X) * (1 - T.eq(X, 0)) 33 | 34 | def __call__(self, X, mask_zero=False, context=None): 35 | if context is None: 36 | out = self.W[X] 37 | else: 38 | assert context.ndim == 3 39 | flag = False 40 | if X.ndim == 1: 41 | flag = True 42 | X = X[:, None] 43 | 44 | b_size = context.shape[0] 45 | 46 | EMB = T.repeat(self.W[None, :, :], b_size, axis=0) 47 | EMB = T.concatenate([EMB, context], axis=1) 48 | 49 | m_size = EMB.shape[1] 50 | e_size = EMB.shape[2] 51 | maxlen = X.shape[1] 52 | 53 | EMB = EMB.reshape((b_size * m_size, e_size)) 54 | Z = (T.arange(b_size)[:, None] * m_size + X).reshape((b_size * maxlen,)) 55 | out = EMB[Z] # (b_size * maxlen, e_size) 56 | 57 | if not flag: 58 | out = out.reshape((b_size, maxlen, e_size)) 59 | else: 60 | out = out.reshape((b_size, e_size)) 61 | 62 | if mask_zero: 63 | return out, T.cast(self.get_output_mask(X), dtype='float32') 64 | else: 65 | return out 66 | 67 | 68 | class Zero(Layer): 69 | def __call__(self, X): 70 | out = T.zeros(X.shape) 71 | return out 72 | 73 | 74 | class Bias(Layer): 75 | def __call__(self, X): 76 | tmp = X.flatten() 77 | tmp = tmp.dimshuffle(0, 'x') 78 | return tmp 79 | -------------------------------------------------------------------------------- /emolga/models/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | -------------------------------------------------------------------------------- /emolga/models/core.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | import theano 3 | import logging 4 | import deepdish as dd 5 | 6 | from emolga.dataset.build_dataset import serialize_to_file, deserialize_from_file 7 | from emolga.utils.theano_utils import floatX 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class Model(object): 13 | def __init__(self): 14 | self.layers = [] 15 | self.params = [] 16 | self.monitor = {} 17 | self.watchlist 
= [] 18 | 19 | def _add(self, layer): 20 | if layer: 21 | self.layers.append(layer) 22 | self.params += layer.params 23 | 24 | def _monitoring(self): 25 | # add monitoring variables 26 | for l in self.layers: 27 | for v in l.monitor: 28 | name = v + '@' + l.name 29 | print name 30 | self.monitor[name] = l.monitor[v] 31 | 32 | def compile_monitoring(self, inputs, updates=None): 33 | logger.info('compile monitoring') 34 | for i, v in enumerate(self.monitor): 35 | self.watchlist.append(v) 36 | logger.info('monitoring [{0}]: {1}'.format(i, v)) 37 | 38 | self.watch = theano.function(inputs, 39 | [self.monitor[v] for v in self.watchlist], 40 | updates=updates 41 | ) 42 | logger.info('done.') 43 | 44 | def set_weights(self, weights): 45 | if hasattr(self, 'save_parm'): 46 | params = self.params + self.save_parm 47 | else: 48 | params = self.params 49 | 50 | for p, w in zip(params, weights): 51 | print p.name 52 | if p.eval().shape != w.shape: 53 | raise Exception("Layer shape %s not compatible with weight shape %s." % (p.eval().shape, w.shape)) 54 | p.set_value(floatX(w)) 55 | 56 | def get_weights(self): 57 | weights = [] 58 | for p in self.params: 59 | weights.append(p.get_value()) 60 | 61 | if hasattr(self, 'save_parm'): 62 | for v in self.save_parm: 63 | weights.append(v.get_value()) 64 | 65 | return weights 66 | 67 | def set_name(self, name): 68 | for i in range(len(self.params)): 69 | if self.params[i].name is None: 70 | self.params[i].name = '%s_p%d' % (name, i) 71 | else: 72 | self.params[i].name = name + '@' + self.params[i].name 73 | self.name = name 74 | 75 | def save(self, filename): 76 | if hasattr(self, 'save_parm'): 77 | params = self.params + self.save_parm 78 | else: 79 | params = self.params 80 | ps = 'save: <\n' 81 | for p in params: 82 | ps += '{0}: {1}\n'.format(p.name, p.eval().shape) 83 | ps += '> to ... {}'.format(filename) 84 | logger.info(ps) 85 | 86 | # hdf5 module seems works abnormal !! 87 | # dd.io.save(filename, self.get_weights()) 88 | serialize_to_file(self.get_weights(), filename) 89 | 90 | def load(self, filename): 91 | logger.info('load the weights.') 92 | 93 | # hdf5 module seems works abnormal !! 
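# fall back to the cPickle-based deserialize_from_file helper instead of deepdish's HDF5 loader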
94 | # weights = dd.io.load(filename) 95 | weights = deserialize_from_file(filename) 96 | print len(weights) 97 | self.set_weights(weights) 98 | -------------------------------------------------------------------------------- /emolga/models/core.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/CopyNet/dba24d58d505abaa169e05ee762987f37f41f566/emolga/models/core.pyc -------------------------------------------------------------------------------- /emolga/test_lm.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | 3 | import logging 4 | 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 6 | 7 | from emolga.models.encdec import RNNLM, AutoEncoder 8 | from emolga.models.variational import Helmholtz, VAE, HarX, THarX, NVTM 9 | # from models.ntm_encdec import RNNLM, AutoEncoder, Helmholtz, BinaryHelmholtz 10 | from emolga.utils.generic_utils import * 11 | from emolga.dataset.build_dataset import deserialize_from_file, build_fuel, obtain_stream 12 | from emolga.config import setup_ptbz, setup_ptb2 13 | from emolga.config_variant import * 14 | 15 | setup = setup_bienc 16 | 17 | 18 | def init_logging(logfile): 19 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 20 | datefmt='%m/%d/%Y %H:%M:%S' ) 21 | fh = logging.FileHandler(logfile) 22 | # ch = logging.StreamHandler() 23 | 24 | fh.setFormatter(formatter) 25 | # ch.setFormatter(formatter) 26 | # fh.setLevel(logging.INFO) 27 | # ch.setLevel(logging.INFO) 28 | # logging.getLogger().addHandler(ch) 29 | logging.getLogger().addHandler(fh) 30 | logging.getLogger().setLevel(logging.INFO) 31 | 32 | return logging 33 | 34 | # prepare logging. 35 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 36 | config = setup() # load settings. 37 | for w in config: 38 | print '{0}={1}'.format(w, config[w]) 39 | 40 | logger = init_logging(config['path_log'] + '/emolga.RHM.id={}.log'.format(tmark)) 41 | n_rng = np.random.RandomState(config['seed']) 42 | np.random.seed(config['seed']) 43 | rng = RandomStreams(n_rng.randint(2 ** 30)) 44 | 45 | logger.info('Start!') 46 | 47 | # load the dataset and build a fuel-dataset. 48 | idx2word, word2idx = deserialize_from_file(config['vocabulary_set']) 49 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 50 | config['dec_voc_size'] = config['enc_voc_size'] 51 | logger.info('build dataset done. vocabulary size = {0}/ batch size = {1}'.format( 52 | config['dec_voc_size'], config['batch_size'])) 53 | 54 | # training & valid & tesing set. 55 | train_set, train_size = build_fuel(deserialize_from_file(config['dataset'])) 56 | valid_set, valid_size = build_fuel(deserialize_from_file(config['dataset_test'])) # use test set for a try 57 | 58 | # weiget save. 59 | savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark) 60 | 61 | # build the agent 62 | if config['model'] == 'RNNLM': 63 | agent = RNNLM(config, n_rng, rng, mode=config['mode']) 64 | elif config['model'] == 'HarX': 65 | agent = THarX(config, n_rng, rng, mode=config['mode']) 66 | elif config['model'] == 'Helmholtz': 67 | agent = Helmholtz(config, n_rng, rng, mode=config['mode']) 68 | else: 69 | raise NotImplementedError 70 | 71 | agent.build_() 72 | agent.compile_('train') 73 | print 'compile ok' 74 | 75 | # learning to speak language. 
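# main loop: `epochs` passes over the training stream, each followed by an evaluation
# pass on the held-out stream and a weight checkpoint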
76 | count = 1000 77 | echo = 0 78 | epochs = 50 79 | while echo < epochs: 80 | echo += 1 81 | loss = [] 82 | correct = 0 83 | scans = 0 84 | 85 | # visualization the embedding weights. 86 | # if echo > 1: 87 | # plt.figure(3) 88 | # visualize_(plt.subplot(111), agent.decoder.Embed.get_params()[0].get_value(), name='encoder embedding', 89 | # text=idx2word) 90 | # plt.show() 91 | 92 | # if not config['use_noise']: 93 | 94 | # training 95 | train_batches = obtain_stream(train_set, config['batch_size']).get_epoch_iterator(as_dict=True) 96 | valid_batches = obtain_stream(valid_set, config['eval_batch_size']).get_epoch_iterator(as_dict=True) 97 | 98 | def prepare_batch(batch): 99 | data = batch['data'].astype('int32') 100 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 101 | 102 | def cut_zeros(data): 103 | for k in range(data.shape[1] - 1, 0, -1): 104 | data_col = data[:, k].sum() 105 | if data_col > 0: 106 | return data[:, : k + 2] 107 | return data 108 | data = cut_zeros(data) 109 | return data 110 | 111 | # training 112 | logger.info('Epoch = {} -> Training Set Learning...'.format(echo)) 113 | progbar = Progbar(train_size / config['batch_size']) 114 | for it, batch in enumerate(train_batches): 115 | # get data 116 | data = prepare_batch(batch) 117 | if config['model'] == 'RNNLM' or config['model'] == 'AutoEncoder': 118 | loss.append(agent.train_(data, config['repeats'])) 119 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 120 | elif config['model'] == 'Helmholtz' or 'HarX': 121 | loss.append(agent.train_(data, config['repeats'])) 122 | weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()]) 123 | progbar.update(it, [('lossPa', loss[-1][0]), ('lossPxa', loss[-1][1]), ('lossQ', loss[-1][2]), 124 | ('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('L1', weightss)]) 125 | 126 | """ 127 | watch = agent.watch(data) 128 | print '.' 129 | pprint(watch[0][0]) 130 | pprint(watch[2][0]) 131 | # pprint(watch[2][0]) 132 | sys.exit(111) 133 | """ 134 | 135 | # if it % 100 == 50: 136 | # sys.exit(-1) 137 | # # print '.' 138 | # # print 'encoded = {}'.format(encoded[11]) 139 | # # print 'mean = {}'.format(mean[11]) 140 | # # print 'std = {}'.format(std[11]) 141 | # 142 | # # watch = agent.watch(data) 143 | # # print '.' 144 | # # print 'train memory {}'.format(watch[0][0]) 145 | # 146 | # for kk in xrange(5): 147 | # # sample a sentence. 148 | # # action = agent.action_sampler() 149 | # # context = agent.context_trans(action) 150 | # if config['model'] is 'AutoEncoder': 151 | # source = data[kk][None, :] 152 | # truth = ' '.join(print_sample(idx2word, source[0].tolist())[:-1]) 153 | # print '\ntruth: {}'.format(truth) 154 | # context = agent.memorize(source) 155 | # sample, score = agent.generate_(context, max_len=data.shape[1]) 156 | # else: 157 | # sample, score = agent.generate_(max_len=data.shape[1]) 158 | # 159 | # if sample[-1] is not 0: 160 | # sample += [0] # fix the end. 161 | # question = ' '.join(print_sample(idx2word, sample)[:-1]) 162 | # print '\nsample: {}'.format(question) 163 | # print 'PPL: {}'.format(score) 164 | # scans += 1.0 165 | 166 | print ' .' 
167 | logger.info('Epoch = {0} finished.'.format(echo)) 168 | 169 | # validation 170 | logger.info('Epoch = {} -> Vadlidation Set Evaluation...'.format(echo)) 171 | progbar = Progbar(valid_size / config['batch_size']) 172 | for it, batch in enumerate(valid_batches): 173 | # get data 174 | data = prepare_batch(batch) 175 | if config['model'] == 'Helmholtz' or 'HarX': 176 | loss.append(agent.evaluate_(data)) 177 | progbar.update(it, [('NLL', loss[-1][0]), ('perplexity', np.log(loss[-1][1]))]) 178 | else: 179 | raise NotImplementedError 180 | 181 | print ' .' 182 | # save the weights. 183 | agent.save(config['path_h5'] + '/emolga.RHM.id={0}.epoch={1}.pkl'.format(tmark, echo)) 184 | 185 | # logger.info('Learning percentage: {}'.format(correct / scans)) 186 | 187 | 188 | # inference test 189 | # batches = data_stream.get_epoch_iterator(as_dict=True) 190 | # for it, batch in enumerate(batches): 191 | # data = batch['data'].astype('int32') 192 | # data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 193 | # mean, std = agent.inference_(data) 194 | # print mean 195 | # break 196 | # print count -------------------------------------------------------------------------------- /emolga/test_nvtm.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | 3 | import logging 4 | 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 6 | 7 | from emolga.models.encdec import RNNLM, AutoEncoder 8 | from emolga.models.variational import Helmholtz, VAE, HarX, THarX, NVTM 9 | # from models.ntm_encdec import RNNLM, AutoEncoder, Helmholtz, BinaryHelmholtz 10 | from emolga.utils.generic_utils import * 11 | from emolga.dataset.build_dataset import deserialize_from_file, build_fuel, obtain_stream 12 | from emolga.config import setup_ptbz, setup_ptb2 13 | from emolga.config_variant import * 14 | 15 | setup = setup_bienc 16 | 17 | 18 | def init_logging(logfile): 19 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 20 | datefmt='%m/%d/%Y %H:%M:%S' ) 21 | fh = logging.FileHandler(logfile) 22 | # ch = logging.StreamHandler() 23 | 24 | fh.setFormatter(formatter) 25 | # ch.setFormatter(formatter) 26 | # fh.setLevel(logging.INFO) 27 | # ch.setLevel(logging.INFO) 28 | # logging.getLogger().addHandler(ch) 29 | logging.getLogger().addHandler(fh) 30 | logging.getLogger().setLevel(logging.INFO) 31 | 32 | return logging 33 | 34 | # prepare logging. 35 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 36 | config = setup() # load settings. 37 | for w in config: 38 | print '{0}={1}'.format(w, config[w]) 39 | 40 | logger = init_logging(config['path_log'] + '/emolga.RHM.id={}.log'.format(tmark)) 41 | n_rng = np.random.RandomState(config['seed']) 42 | np.random.seed(config['seed']) 43 | rng = RandomStreams(n_rng.randint(2 ** 30)) 44 | 45 | logger.info('Start!') 46 | 47 | # load the dataset and build a fuel-dataset. 48 | idx2word, word2idx = deserialize_from_file(config['vocabulary_set']) 49 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 50 | config['dec_voc_size'] = config['enc_voc_size'] 51 | logger.info('build dataset done. vocabulary size = {0}/ batch size = {1}'.format( 52 | config['dec_voc_size'], config['batch_size'])) 53 | 54 | # training & valid & tesing set. 
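# build_fuel returns (fuel_dataset, num_examples); the sizes are used below to scale the per-epoch Progbar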
55 | train_set, train_size = build_fuel(deserialize_from_file(config['dataset'])) 56 | valid_set, valid_size = build_fuel(deserialize_from_file(config['dataset_test'])) # use test set for a try 57 | 58 | # weiget save. 59 | savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark) 60 | 61 | # build the agent 62 | if config['model'] == 'RNNLM': 63 | agent = RNNLM(config, n_rng, rng, mode=config['mode']) 64 | elif config['model'] == 'HarX': 65 | agent = NVTM(config, n_rng, rng, mode=config['mode']) 66 | elif config['model'] == 'Helmholtz': 67 | agent = Helmholtz(config, n_rng, rng, mode=config['mode']) 68 | else: 69 | raise NotImplementedError 70 | 71 | agent.build_() 72 | agent.compile_('train') 73 | print 'compile ok' 74 | 75 | # learning to speak language. 76 | count = 1000 77 | echo = 0 78 | epochs = 50 79 | while echo < epochs: 80 | echo += 1 81 | loss = [] 82 | correct = 0 83 | scans = 0 84 | 85 | # visualization the embedding weights. 86 | # if echo > 1: 87 | # plt.figure(3) 88 | # visualize_(plt.subplot(111), agent.decoder.Embed.get_params()[0].get_value(), name='encoder embedding', 89 | # text=idx2word) 90 | # plt.show() 91 | 92 | # if not config['use_noise']: 93 | 94 | # training 95 | train_batches = obtain_stream(train_set, config['batch_size']).get_epoch_iterator(as_dict=True) 96 | valid_batches = obtain_stream(valid_set, config['eval_batch_size']).get_epoch_iterator(as_dict=True) 97 | 98 | def prepare_batch(batch): 99 | data = batch['data'].astype('int32') 100 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 101 | 102 | def cut_zeros(data): 103 | for k in range(data.shape[1] - 1, 0, -1): 104 | data_col = data[:, k].sum() 105 | if data_col > 0: 106 | return data[:, : k + 2] 107 | return data 108 | data = cut_zeros(data) 109 | return data 110 | 111 | # training 112 | logger.info('Epoch = {} -> Training Set Learning...'.format(echo)) 113 | progbar = Progbar(train_size / config['batch_size']) 114 | for it, batch in enumerate(train_batches): 115 | # get data 116 | data = prepare_batch(batch) 117 | if config['model'] == 'RNNLM' or config['model'] == 'AutoEncoder': 118 | loss.append(agent.train_(data, config['repeats'])) 119 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 120 | elif config['model'] == 'Helmholtz' or 'HarX': 121 | loss.append(agent.train_(data, 1)) 122 | weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()]) 123 | progbar.update(it, [('lossL', loss[-1][0]), ('lossP', loss[-1][1]), ('lossQ', loss[-1][2]), 124 | ('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('Baseline', loss[-1][5])]) 125 | 126 | """ 127 | watch = agent.watch(data) 128 | print '.' 129 | pprint(watch[0][0]) 130 | pprint(watch[2][0]) 131 | # pprint(watch[2][0]) 132 | sys.exit(111) 133 | """ 134 | 135 | # if it % 100 == 50: 136 | # sys.exit(-1) 137 | # # print '.' 138 | # # print 'encoded = {}'.format(encoded[11]) 139 | # # print 'mean = {}'.format(mean[11]) 140 | # # print 'std = {}'.format(std[11]) 141 | # 142 | # # watch = agent.watch(data) 143 | # # print '.' 144 | # # print 'train memory {}'.format(watch[0][0]) 145 | # 146 | # for kk in xrange(5): 147 | # # sample a sentence. 
148 | # # action = agent.action_sampler() 149 | # # context = agent.context_trans(action) 150 | # if config['model'] is 'AutoEncoder': 151 | # source = data[kk][None, :] 152 | # truth = ' '.join(print_sample(idx2word, source[0].tolist())[:-1]) 153 | # print '\ntruth: {}'.format(truth) 154 | # context = agent.memorize(source) 155 | # sample, score = agent.generate_(context, max_len=data.shape[1]) 156 | # else: 157 | # sample, score = agent.generate_(max_len=data.shape[1]) 158 | # 159 | # if sample[-1] is not 0: 160 | # sample += [0] # fix the end. 161 | # question = ' '.join(print_sample(idx2word, sample)[:-1]) 162 | # print '\nsample: {}'.format(question) 163 | # print 'PPL: {}'.format(score) 164 | # scans += 1.0 165 | 166 | print ' .' 167 | logger.info('Epoch = {0} finished.'.format(echo)) 168 | loss = zip(*loss) 169 | logger.info('LossL: {0}, LossP: {1}, LossQ: {2}, PPL: {3}, NLL: {4}, Baseline: {5}'.format( 170 | np.mean(loss[0]), np.mean(loss[1]), np.mean(loss[2]), np.mean(loss[3]), np.mean(loss[4]), np.mean(loss[5]) 171 | )) 172 | 173 | # validation 174 | loss = [] 175 | logger.info('Epoch = {} -> Validation Set Evaluation...'.format(echo)) 176 | progbar = Progbar(valid_size / config['batch_size']) 177 | for it, batch in enumerate(valid_batches): 178 | # get data 179 | data = prepare_batch(batch) 180 | if config['model'] == 'Helmholtz' or config['model'] == 'HarX': 181 | # loss.append(agent.evaluate_(data)) 182 | loss.append(agent.explore_(data, 10)) 183 | weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()]) 184 | progbar.update(it, [('lossL', loss[-1][0]), ('lossP', loss[-1][1]), ('lossQ', loss[-1][2]), 185 | ('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('Baseline', loss[-1][5])]) 186 | else: 187 | raise NotImplementedError 188 | 189 | print ' .' 190 | loss = zip(*loss) 191 | logger.info('LossL: {0}, LossP: {1}, LossQ: {2}, PPL: {3}, NLL: {4}, Baseline: {5}'.format( 192 | np.mean(loss[0]), np.mean(loss[1]), np.mean(loss[2]), np.mean(loss[3]), np.mean(loss[4]), np.mean(loss[5]) 193 | )) 194 | # save the weights.
195 | agent.save(config['path_h5'] + '/emolga.RHM.id={0}.epoch={1}.pkl'.format(tmark, echo)) 196 | 197 | # logger.info('Learning percentage: {}'.format(correct / scans)) 198 | 199 | 200 | # inference test 201 | # batches = data_stream.get_epoch_iterator(as_dict=True) 202 | # for it, batch in enumerate(batches): 203 | # data = batch['data'].astype('int32') 204 | # data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 205 | # mean, std = agent.inference_(data) 206 | # print mean 207 | # break 208 | # print count -------------------------------------------------------------------------------- /emolga/utils/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'yinpengcheng' 2 | -------------------------------------------------------------------------------- /emolga/utils/generic_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from matplotlib.ticker import FuncFormatter 3 | import numpy as np 4 | import time 5 | import sys 6 | import six 7 | import matplotlib.pyplot as plt 8 | import matplotlib 9 | 10 | def get_from_module(identifier, module_params, module_name, instantiate=False, kwargs=None): 11 | if isinstance(identifier, six.string_types): 12 | res = module_params.get(identifier) 13 | if not res: 14 | raise Exception('Invalid ' + str(module_name) + ': ' + str(identifier)) 15 | if instantiate and not kwargs: 16 | return res() 17 | elif instantiate and kwargs: 18 | return res(**kwargs) 19 | else: 20 | return res 21 | return identifier 22 | 23 | 24 | def make_tuple(*args): 25 | return args 26 | 27 | 28 | def printv(v, prefix=''): 29 | if type(v) == dict: 30 | if 'name' in v: 31 | print(prefix + '#' + v['name']) 32 | del v['name'] 33 | prefix += '...' 34 | for nk, nv in v.items(): 35 | if type(nv) in [dict, list]: 36 | print(prefix + nk + ':') 37 | printv(nv, prefix) 38 | else: 39 | print(prefix + nk + ':' + str(nv)) 40 | elif type(v) == list: 41 | prefix += '...' 42 | for i, nv in enumerate(v): 43 | print(prefix + '#' + str(i)) 44 | printv(nv, prefix) 45 | else: 46 | prefix += '...' 47 | print(prefix + str(v)) 48 | 49 | 50 | def make_batches(size, batch_size): 51 | nb_batch = int(np.ceil(size/float(batch_size))) 52 | return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)] 53 | 54 | 55 | def slice_X(X, start=None, stop=None): 56 | if type(X) == list: 57 | if hasattr(start, '__len__'): 58 | return [x[start] for x in X] 59 | else: 60 | return [x[start:stop] for x in X] 61 | else: 62 | if hasattr(start, '__len__'): 63 | return X[start] 64 | else: 65 | return X[start:stop] 66 | 67 | 68 | class Progbar(object): 69 | def __init__(self, target, width=30, verbose=1): 70 | ''' 71 | @param target: total number of steps expected 72 | ''' 73 | self.width = width 74 | self.target = target 75 | self.sum_values = {} 76 | self.unique_values = [] 77 | self.start = time.time() 78 | self.total_width = 0 79 | self.seen_so_far = 0 80 | self.verbose = verbose 81 | 82 | def update(self, current, values=[]): 83 | ''' 84 | @param current: index of current step 85 | @param values: list of tuples (name, value_for_last_step). 86 | The progress bar will display averages for these values. 
87 | ''' 88 | for k, v in values: 89 | if k not in self.sum_values: 90 | self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far] 91 | self.unique_values.append(k) 92 | else: 93 | self.sum_values[k][0] += v * (current - self.seen_so_far) 94 | self.sum_values[k][1] += (current - self.seen_so_far) 95 | self.seen_so_far = current 96 | 97 | now = time.time() 98 | if self.verbose == 1: 99 | prev_total_width = self.total_width 100 | sys.stdout.write("\b" * prev_total_width) 101 | sys.stdout.write("\r") 102 | 103 | numdigits = int(np.floor(np.log10(self.target))) + 1 104 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 105 | bar = barstr % (current, self.target) 106 | prog = float(current)/self.target 107 | prog_width = int(self.width*prog) 108 | if prog_width > 0: 109 | bar += ('.'*(prog_width-1)) 110 | if current < self.target: 111 | bar += '(-w-)' 112 | else: 113 | bar += '(-v-)!!' 114 | bar += ('~' * (self.width-prog_width)) 115 | bar += ']' 116 | sys.stdout.write(bar) 117 | self.total_width = len(bar) 118 | 119 | if current: 120 | time_per_unit = (now - self.start) / current 121 | else: 122 | time_per_unit = 0 123 | eta = time_per_unit*(self.target - current) 124 | info = '' 125 | if current < self.target: 126 | info += ' - ETA: %ds' % eta 127 | else: 128 | info += ' - %ds' % (now - self.start) 129 | for k in self.unique_values: 130 | if k == 'perplexity' or k == 'PPL': 131 | info += ' - %s: %.4f' % (k, np.exp(self.sum_values[k][0] / max(1, self.sum_values[k][1]))) 132 | else: 133 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 134 | 135 | self.total_width += len(info) 136 | if prev_total_width > self.total_width: 137 | info += ((prev_total_width-self.total_width) * " ") 138 | 139 | sys.stdout.write(info) 140 | sys.stdout.flush() 141 | 142 | if current >= self.target: 143 | sys.stdout.write("\n") 144 | 145 | if self.verbose == 2: 146 | if current >= self.target: 147 | info = '%ds' % (now - self.start) 148 | for k in self.unique_values: 149 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 150 | sys.stdout.write(info + "\n") 151 | 152 | def add(self, n, values=[]): 153 | self.update(self.seen_so_far + n, values) 154 | 155 | def clear(self): 156 | self.sum_values = {} 157 | self.unique_values = [] 158 | self.total_width = 0 159 | self.seen_so_far = 0 160 | 161 | 162 | def print_sample(idx2word, idx): 163 | def cut_eol(words): 164 | for i, word in enumerate(words): 165 | if words[i] == '': 166 | return words[:i + 1] 167 | raise Exception("No end-of-line found") 168 | 169 | return cut_eol(map(lambda w_idx : idx2word[w_idx], idx)) 170 | 171 | 172 | def visualize_(subplots, data, w=None, h=None, name=None, 173 | display='on', size=10, text=None, normal=True, 174 | grid=False): 175 | fig, ax = subplots 176 | if data.ndim == 1: 177 | if w and h: 178 | # vector visualization 179 | assert w * h == np.prod(data.shape) 180 | data = data.reshape((w, h)) 181 | else: 182 | L = data.shape[0] 183 | w = int(np.sqrt(L)) 184 | while L % w > 0: 185 | w -= 1 186 | h = L / w 187 | assert w * h == np.prod(data.shape) 188 | data = data.reshape((w, h)) 189 | else: 190 | w = data.shape[0] 191 | h = data.shape[1] 192 | 193 | if not size: 194 | size = 30 / np.sqrt(w * h) 195 | 196 | print data.shape 197 | 198 | major_ticks = np.arange(0, h, 1) 199 | ax.set_xticks(major_ticks) 200 | ax.set_xlim(0, h) 201 | major_ticks = np.arange(0, w, 1) 202 | ax.set_ylim(w, -1) 203 | ax.set_yticks(major_ticks) 204 | 
ax.set_aspect('equal') 205 | if grid: 206 | pass 207 | ax.grid(which='both') 208 | # ax.axis('equal') 209 | if normal: 210 | cax = ax.imshow(data, cmap=plt.cm.pink, interpolation='nearest', 211 | vmax=1.0, vmin=0.0, aspect='auto') 212 | else: 213 | cax = ax.imshow(data, cmap=plt.cm.bone, interpolation='nearest', aspect='auto') 214 | 215 | if name: 216 | ax.set_title(name) 217 | else: 218 | ax.set_title('sample.') 219 | import matplotlib.ticker as ticker 220 | 221 | # ax.xaxis.set_ticks(np.arange(0, h, 1.)) 222 | # ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f')) 223 | # ax.yaxis.set_ticks(np.arange(0, w, 1.)) 224 | # ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f')) 225 | 226 | # ax.set_xticks(np.linspace(0, 1, h)) 227 | # ax.set_yticks(np.linspace(0, 1, w)) 228 | # Move left and bottom spines outward by 10 points 229 | # ax.spines['left'].set_position(('outward', size)) 230 | # ax.spines['bottom'].set_position(('outward', size)) 231 | # # Hide the right and top spines 232 | # ax.spines['right'].set_visible(False) 233 | # ax.spines['top'].set_visible(False) 234 | # # Only show ticks on the left and bottom spines 235 | # ax.yaxis.set_ticks_position('left') 236 | # ax.xaxis.set_ticks_position('bottom') 237 | 238 | if text: 239 | ax.set_yticks(np.linspace(0, 1, 33) * size * 3.2) 240 | ax.set_yticklabels([text[s] for s in xrange(33)]) 241 | # cbar = fig.colorbar(cax) 242 | 243 | if display == 'on': 244 | plt.show() 245 | else: 246 | return ax 247 | 248 | 249 | def vis_Gaussian(subplot, mean, std, name=None, display='off', size=10): 250 | ax = subplot 251 | data = np.random.normal(size=(2, 10000)) 252 | data[0] = data[0] * std[0] + mean[0] 253 | data[1] = data[1] * std[1] + mean[1] 254 | 255 | ax.scatter(data[0].tolist(), data[1].tolist(), 'r.') 256 | if display == 'on': 257 | plt.show() 258 | else: 259 | return ax -------------------------------------------------------------------------------- /emolga/utils/io_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import h5py 3 | import numpy as np 4 | import cPickle 5 | from collections import defaultdict 6 | 7 | 8 | class HDF5Matrix(): 9 | refs = defaultdict(int) 10 | 11 | def __init__(self, datapath, dataset, start, end, normalizer=None): 12 | if datapath not in list(self.refs.keys()): 13 | f = h5py.File(datapath) 14 | self.refs[datapath] = f 15 | else: 16 | f = self.refs[datapath] 17 | self.start = start 18 | self.end = end 19 | self.data = f[dataset] 20 | self.normalizer = normalizer 21 | 22 | def __len__(self): 23 | return self.end - self.start 24 | 25 | def __getitem__(self, key): 26 | if isinstance(key, slice): 27 | if key.stop + self.start <= self.end: 28 | idx = slice(key.start+self.start, key.stop + self.start) 29 | else: 30 | raise IndexError 31 | elif isinstance(key, int): 32 | if key + self.start < self.end: 33 | idx = key+self.start 34 | else: 35 | raise IndexError 36 | elif isinstance(key, np.ndarray): 37 | if np.max(key) + self.start < self.end: 38 | idx = (self.start + key).tolist() 39 | else: 40 | raise IndexError 41 | elif isinstance(key, list): 42 | if max(key) + self.start < self.end: 43 | idx = [x + self.start for x in key] 44 | else: 45 | raise IndexError 46 | if self.normalizer is not None: 47 | return self.normalizer(self.data[idx]) 48 | else: 49 | return self.data[idx] 50 | 51 | @property 52 | def shape(self): 53 | return tuple([self.end - self.start, self.data.shape[1]]) 54 | 55 | 56 | def 
save_array(array, name): 57 | import tables 58 | f = tables.open_file(name, 'w') 59 | atom = tables.Atom.from_dtype(array.dtype) 60 | ds = f.createCArray(f.root, 'data', atom, array.shape) 61 | ds[:] = array 62 | f.close() 63 | 64 | 65 | def load_array(name): 66 | import tables 67 | f = tables.open_file(name) 68 | array = f.root.data 69 | a = np.empty(shape=array.shape, dtype=array.dtype) 70 | a[:] = array[:] 71 | f.close() 72 | return a 73 | 74 | 75 | def save_config(): 76 | pass 77 | 78 | 79 | def load_config(): 80 | pass -------------------------------------------------------------------------------- /emolga/utils/np_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import scipy as sp 4 | from six.moves import range 5 | from six.moves import zip 6 | 7 | 8 | def to_categorical(y, nb_classes=None): 9 | '''Convert class vector (integers from 0 to nb_classes) 10 | to binary class matrix, for use with categorical_crossentropy 11 | ''' 12 | y = np.asarray(y, dtype='int32') 13 | if not nb_classes: 14 | nb_classes = np.max(y)+1 15 | Y = np.zeros((len(y), nb_classes)) 16 | for i in range(len(y)): 17 | Y[i, y[i]] = 1. 18 | return Y 19 | 20 | 21 | def normalize(a, axis=-1, order=2): 22 | l2 = np.atleast_1d(np.linalg.norm(a, order, axis)) 23 | l2[l2 == 0] = 1 24 | return a / np.expand_dims(l2, axis) 25 | 26 | 27 | def binary_logloss(p, y): 28 | epsilon = 1e-15 29 | p = sp.maximum(epsilon, p) 30 | p = sp.minimum(1-epsilon, p) 31 | res = sum(y * sp.log(p) + sp.subtract(1, y) * sp.log(sp.subtract(1, p))) 32 | res *= -1.0/len(y) 33 | return res 34 | 35 | 36 | def multiclass_logloss(P, Y): 37 | score = 0. 38 | npreds = [P[i][Y[i]-1] for i in range(len(Y))] 39 | score = -(1. / len(Y)) * np.sum(np.log(npreds)) 40 | return score 41 | 42 | 43 | def accuracy(p, y): 44 | return np.mean([a == b for a, b in zip(p, y)]) 45 | 46 | 47 | def probas_to_classes(y_pred): 48 | if len(y_pred.shape) > 1 and y_pred.shape[1] > 1: 49 | return categorical_probas_to_classes(y_pred) 50 | return np.array([1 if p > 0.5 else 0 for p in y_pred]) 51 | 52 | 53 | def categorical_probas_to_classes(p): 54 | return np.argmax(p, axis=1) 55 | -------------------------------------------------------------------------------- /emolga/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def get_test_data(nb_train=1000, nb_test=500, input_shape=(10,), output_shape=(2,), 5 | classification=True, nb_class=2): 6 | ''' 7 | classification=True overrides output_shape 8 | (i.e. output_shape is set to (1,)) and the output 9 | consists in integers in [0, nb_class-1]. 10 | 11 | Otherwise: float output with shape output_shape. 
12 | ''' 13 | nb_sample = nb_train + nb_test 14 | if classification: 15 | y = np.random.randint(0, nb_class, size=(nb_sample, 1)) 16 | X = np.zeros((nb_sample,) + input_shape) 17 | for i in range(nb_sample): 18 | X[i] = np.random.normal(loc=y[i], scale=1.0, size=input_shape) 19 | else: 20 | y_loc = np.random.random((nb_sample,)) 21 | X = np.zeros((nb_sample,) + input_shape) 22 | y = np.zeros((nb_sample,) + output_shape) 23 | for i in range(nb_sample): 24 | X[i] = np.random.normal(loc=y_loc[i], scale=1.0, size=input_shape) 25 | y[i] = np.random.normal(loc=y_loc[i], scale=1.0, size=output_shape) 26 | 27 | return (X[:nb_train], y[:nb_train]), (X[nb_train:], y[nb_train:]) 28 | -------------------------------------------------------------------------------- /emolga/utils/theano_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from theano import gof 4 | from theano.tensor import basic as tensor 5 | import numpy as np 6 | import theano 7 | import theano.tensor as T 8 | 9 | 10 | def floatX(X): 11 | return np.asarray(X, dtype=theano.config.floatX) 12 | 13 | 14 | def sharedX(X, dtype=theano.config.floatX, name=None): 15 | return theano.shared(np.asarray(X, dtype=dtype), name=name) 16 | 17 | 18 | def shared_zeros(shape, dtype=theano.config.floatX, name=None): 19 | return sharedX(np.zeros(shape), dtype=dtype, name=name) 20 | 21 | 22 | def shared_scalar(val=0., dtype=theano.config.floatX, name=None): 23 | return theano.shared(np.cast[dtype](val), name=name) 24 | 25 | 26 | def shared_ones(shape, dtype=theano.config.floatX, name=None): 27 | return sharedX(np.ones(shape), dtype=dtype, name=name) 28 | 29 | 30 | def alloc_zeros_matrix(*dims): 31 | return T.alloc(np.cast[theano.config.floatX](0.), *dims) 32 | 33 | 34 | def alloc_ones_matrix(*dims): 35 | return T.alloc(np.cast[theano.config.floatX](1.), *dims) 36 | 37 | 38 | def ndim_tensor(ndim): 39 | if ndim == 1: 40 | return T.vector() 41 | elif ndim == 2: 42 | return T.matrix() 43 | elif ndim == 3: 44 | return T.tensor3() 45 | elif ndim == 4: 46 | return T.tensor4() 47 | return T.matrix() 48 | 49 | 50 | # get int32 tensor 51 | def ndim_itensor(ndim, name=None): 52 | if ndim == 2: 53 | return T.imatrix(name) 54 | elif ndim == 3: 55 | return T.itensor3(name) 56 | elif ndim == 4: 57 | return T.itensor4(name) 58 | return T.imatrix(name) 59 | 60 | 61 | # dot-product 62 | def dot(inp, matrix, bias=None): 63 | """ 64 | Decide the right type of dot product depending on the input 65 | arguments 66 | """ 67 | if 'int' in inp.dtype and inp.ndim == 2: 68 | return matrix[inp.flatten()] 69 | elif 'int' in inp.dtype: 70 | return matrix[inp] 71 | elif 'float' in inp.dtype and inp.ndim == 3: 72 | shape0 = inp.shape[0] 73 | shape1 = inp.shape[1] 74 | shape2 = inp.shape[2] 75 | if bias: 76 | return (T.dot(inp.reshape((shape0 * shape1, shape2)), matrix) + bias).reshape((shape0, shape1, matrix.shape[1])) 77 | else: 78 | return T.dot(inp.reshape((shape0 * shape1, shape2)), matrix).reshape((shape0, shape1, matrix.shape[1])) 79 | else: 80 | if bias: 81 | return T.dot(inp, matrix) + bias 82 | else: 83 | return T.dot(inp, matrix) 84 | 85 | 86 | # Numerically stable log(sum(exp(A))). Can also be used in softmax function. 87 | def logSumExp(x, axis=None, mask=None, status='theano', c=None, err=1e-7): 88 | """ 89 | Numerically stable log(sum(exp(A))). Can also be used in softmax function. 90 | c is the additional input when it doesn't require masking but x need. 
91 | 92 | """ 93 | if status == 'theano': 94 | J = T 95 | else: 96 | J = np 97 | 98 | if c is None: 99 | x_max = J.max(x, axis=axis, keepdims=True) 100 | else: 101 | x_max = J.max(J.concatenate([c, x], axis=-1), axis=axis, keepdims=True) 102 | 103 | if c is None: 104 | if not mask: 105 | l_t = J.sum(J.exp(x - x_max), axis=axis, keepdims=True) 106 | 107 | else: 108 | l_t = J.sum(J.exp(x - x_max) * mask, axis=axis, keepdims=True) 109 | else: 110 | if not mask: 111 | l_t = J.sum(J.exp(x - x_max), axis=axis, keepdims=True) + \ 112 | J.sum(J.exp(c - x_max), axis=axis, keepdims=True) 113 | else: 114 | l_t = J.sum(J.exp(x - x_max) * mask, axis=axis, keepdims=True) + \ 115 | J.sum(J.exp(c - x_max), axis=axis, keepdims=True) 116 | 117 | x_t = J.log(J.maximum(l_t, err)) + x_max 118 | return x_t 119 | 120 | 121 | def softmax(x): 122 | return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) 123 | 124 | 125 | def masked_softmax(x, mask, err=1e-9): 126 | assert x.ndim == 2, 'support two-dimension' 127 | weights = softmax(x) 128 | weights *= mask 129 | weights = weights / (T.sum(weights, axis=-1)[:, None] + err) * mask 130 | return weights 131 | 132 | 133 | def cosine_sim(k, M): 134 | k_unit = k / (T.sqrt(T.sum(k**2)) + 1e-5) 135 | # T.patternbroadcast(k_unit.reshape((1,k_unit.shape[0])),(True,False)) 136 | k_unit = k_unit.dimshuffle(('x', 0)) 137 | k_unit.name = "k_unit" 138 | M_lengths = T.sqrt(T.sum(M**2, axis=1)).dimshuffle((0, 'x')) 139 | M_unit = M / (M_lengths + 1e-5) 140 | M_unit.name = "M_unit" 141 | return T.sum(k_unit * M_unit, axis=1) 142 | 143 | 144 | def cosine_sim2d(k, M): 145 | # k: (nb_samples, memory_width) 146 | # M: (nb_samples, memory_dim, memory_width) 147 | 148 | # norms of keys and memories 149 | k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5 # (nb_samples,) 150 | M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5 # (nb_samples, memory_dim,) 151 | 152 | k = k[:, None, :] # (nb_samples, 1, memory_width) 153 | k_norm = k_norm[:, None] # (nb_samples, 1) 154 | 155 | sim = T.sum(k * M, axis=2) # (nb_samples, memory_dim,) 156 | sim /= k_norm * M_norm # (nb_samples, memory_dim,) 157 | return sim 158 | 159 | 160 | def dot_2d(k, M, b=None, g=None): 161 | # k: (nb_samples, memory_width) 162 | # M: (nb_samples, memory_dim, memory_width) 163 | 164 | # norms of keys and memories 165 | # k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5 # (nb_samples,) 166 | # M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5 # (nb_samples, memory_dim,) 167 | 168 | k = k[:, None, :] # (nb_samples, 1, memory_width) 169 | value = k * M 170 | if b is not None: 171 | b = b[:, None, :] 172 | value *= b # (nb_samples, memory_dim,) 173 | 174 | if g is not None: 175 | g = g[None, None, :] 176 | value *= g 177 | 178 | sim = T.sum(value, axis=2) 179 | return sim 180 | 181 | 182 | def shift_convolve(weight, shift, shift_conv): 183 | shift = shift.dimshuffle((0, 'x')) 184 | return T.sum(shift * weight[shift_conv], axis=0) 185 | 186 | 187 | def shift_convolve2d(weight, shift, shift_conv): 188 | return T.sum(shift[:, :, None] * weight[:, shift_conv], axis=1) 189 | -------------------------------------------------------------------------------- /emolga/voc.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/CopyNet/dba24d58d505abaa169e05ee762987f37f41f566/emolga/voc.pkl -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- 1 | 
__author__ = 'jiataogu' -------------------------------------------------------------------------------- /experiments/bst_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'jiataogu' 3 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 4 | import numpy.random as n_rng 5 | 6 | 7 | class BSTnode(object): 8 | """ 9 | Representation of a node in a binary search tree. 10 | Has a left child, right child, and key value, and stores its subtree size. 11 | """ 12 | def __init__(self, parent, t): 13 | """Create a new leaf with key t.""" 14 | self.key = t 15 | self.parent = parent 16 | self.left = None 17 | self.right = None 18 | self.size = 1 19 | 20 | def update_stats(self): 21 | """Updates this node's size based on its children's sizes.""" 22 | self.size = (0 if self.left is None else self.left.size) + (0 if self.right is None else self.right.size) 23 | 24 | def insert(self, t, NodeType): 25 | """Insert key t into the subtree rooted at this node (updating subtree size).""" 26 | self.size += 1 27 | if t < self.key: 28 | if self.left is None: 29 | self.left = NodeType(self, t) 30 | return self.left 31 | else: 32 | return self.left.insert(t, NodeType) 33 | elif t > self.key: 34 | if self.right is None: 35 | self.right = NodeType(self, t) 36 | return self.right 37 | else: 38 | return self.right.insert(t, NodeType) 39 | else: 40 | return self 41 | 42 | def find(self, t): 43 | """Return the node for key t if it is in this tree, or None otherwise.""" 44 | if t == self.key: 45 | return self 46 | elif t < self.key: 47 | if self.left is None: 48 | return None 49 | else: 50 | return self.left.find(t) 51 | else: 52 | if self.right is None: 53 | return None 54 | else: 55 | return self.right.find(t) 56 | 57 | def rank(self, t): 58 | """Return the number of keys <= t in the subtree rooted at this node.""" 59 | left_size = 0 if self.left is None else self.left.size 60 | if t == self.key: 61 | return left_size + 1 62 | elif t < self.key: 63 | if self.left is None: 64 | return 0 65 | else: 66 | return self.left.rank(t) 67 | else: 68 | if self.right is None: 69 | return left_size + 1 70 | else: 71 | return self.right.rank(t) + left_size + 1 72 | 73 | def minimum(self): 74 | """Returns the node with the smallest key in the subtree rooted by this node.""" 75 | current = self 76 | while current.left is not None: 77 | current = current.left 78 | return current 79 | 80 | 81 | def successor(self): 82 | """Returns the node with the smallest key larger than this node's key, or None if this has the largest key in the tree.""" 83 | if self.right is not None: 84 | return self.right.minimum() 85 | current = self 86 | while current.parent is not None and current.parent.right is current: 87 | current = current.parent 88 | return current.parent 89 | 90 | def delete(self): 91 | """"Delete this node from the tree.""" 92 | if self.left is None or self.right is None: 93 | if self is self.parent.left: 94 | self.parent.left = self.left or self.right 95 | if self.parent.left is not None: 96 | self.parent.left.parent = self.parent 97 | else: 98 | self.parent.right = self.left or self.right 99 | if self.parent.right is not None: 100 | self.parent.right.parent = self.parent 101 | current = self.parent 102 | while current.key is not None: 103 | current.update_stats() 104 | current = current.parent 105 | return self 106 | else: 107 | s = self.successor() 108 | self.key, s.key = s.key, self.key 109 | return s.delete() 110 | 111 | def 
check(self, lokey, hikey): 112 | """Checks that the subtree rooted at t is a valid BST and all keys are between (lokey, hikey).""" 113 | if lokey is not None and self.key <= lokey: 114 | raise "BST RI violation" 115 | if hikey is not None and self.key >= hikey: 116 | raise "BST RI violation" 117 | if self.left is not None: 118 | if self.left.parent is not self: 119 | raise "BST RI violation" 120 | self.left.check(lokey, self.key) 121 | if self.right is not None: 122 | if self.right.parent is not self: 123 | raise "BST RI violation" 124 | self.right.check(self.key, hikey) 125 | if self.size != 1 + (0 if self.left is None else self.left.size) + (0 if self.right is None else self.right.size): 126 | raise "BST RI violation" 127 | 128 | def __repr__(self): 129 | return "" 130 | 131 | 132 | class BST(object): 133 | """ 134 | Simple binary search tree implementation, augmented with subtree sizes. 135 | This BST supports insert, find, and delete-min operations. 136 | Each tree contains some (possibly 0) BSTnode objects, representing nodes, 137 | and a pointer to the root. 138 | """ 139 | 140 | def __init__(self, NodeType=BSTnode): 141 | self.root = None 142 | self.NodeType = NodeType 143 | self.psroot = self.NodeType(None, None) 144 | 145 | def reroot(self): 146 | self.root = self.psroot.left 147 | 148 | def insert(self, t): 149 | """Insert key t into this BST, modifying it in-place.""" 150 | if self.root is None: 151 | self.psroot.left = self.NodeType(self.psroot, t) 152 | self.reroot() 153 | return self.root 154 | else: 155 | return self.root.insert(t, self.NodeType) 156 | 157 | def find(self, t): 158 | """Return the node for key t if is in the tree, or None otherwise.""" 159 | if self.root is None: 160 | return None 161 | else: 162 | return self.root.find(t) 163 | 164 | def rank(self, t): 165 | """The number of keys <= t in the tree.""" 166 | if self.root is None: 167 | return 0 168 | else: 169 | return self.root.rank(t) 170 | 171 | def delete(self, t): 172 | """Delete the node for key t if it is in the tree.""" 173 | node = self.find(t) 174 | deleted = self.root.delete() 175 | self.reroot() 176 | return deleted 177 | 178 | def check(self): 179 | if self.root is not None: 180 | self.root.check(None, None) 181 | 182 | def __str__(self): 183 | if self.root is None: 184 | return '' 185 | 186 | def nested(node): 187 | if node is None: 188 | return '0' 189 | head = str(node.key) 190 | left = nested(node.left) 191 | right = nested(node.right) 192 | 193 | if left == '0' and right == '0': 194 | return head 195 | else: 196 | return ' '.join(['(', head, left, right, ')']) 197 | 198 | return nested(self.root) 199 | 200 | # def recurse(node): 201 | # if node is None: 202 | # return [], 0, 0 203 | # label = str(node.key) 204 | # left_lines, left_pos, left_width = recurse(node.left) 205 | # right_lines, right_pos, right_width = recurse(node.right) 206 | # middle = max(right_pos + left_width - left_pos + 1, len(label), 2) 207 | # pos = left_pos + middle // 2 208 | # width = left_pos + middle + right_width - right_pos 209 | # while len(left_lines) < len(right_lines): 210 | # left_lines.append(' ' * left_width) 211 | # while len(right_lines) < len(left_lines): 212 | # right_lines.append(' ' * right_width) 213 | # if (middle - len(label)) % 2 == 1 and node.parent is not None and \ 214 | # node is node.parent.left and len(label) < middle: 215 | # label += '.' 
216 | # label = label.center(middle, '.') 217 | # if label[0] == '.': label = ' ' + label[1:] 218 | # if label[-1] == '.': label = label[:-1] + ' ' 219 | # lines = [' ' * left_pos + label + ' ' * (right_width - right_pos), 220 | # ' ' * left_pos + '/' + ' ' * (middle-2) + 221 | # '\\' + ' ' * (right_width - right_pos)] + \ 222 | # [left_line + ' ' * (width - left_width - right_width) + 223 | # right_line 224 | # for left_line, right_line in zip(left_lines, right_lines)] 225 | # return lines, pos, width 226 | # return '\n'.join(recurse(self.root) [0]) 227 | 228 | test1 = range(0, 100, 10) 229 | test2 = [31, 41, 59, 26, 53, 58, 97, 93, 23] 230 | test3 = "algorithms" 231 | 232 | 233 | def printsizes(node): 234 | if node is None: 235 | print "node is nil" 236 | else: 237 | print "node", node.key, "has a subtree of size", node.size 238 | 239 | 240 | def test(args=None, BSTtype=BST): 241 | import random, sys 242 | random.seed(19920206) 243 | if not args: 244 | args = sys.argv[1:] 245 | if not args: 246 | print 'usage: %s ' % \ 247 | sys.argv[0] 248 | sys.exit() 249 | elif len(args) == 1: 250 | items = (random.randrange(100) for i in xrange(int(args[0]))) 251 | else: 252 | items = [int(i) for i in args] 253 | 254 | tree = BSTtype() 255 | source = [] 256 | for item in items: 257 | tree.insert(item) 258 | source += [str(item)] 259 | print ' '.join(source) 260 | print tree 261 | 262 | 263 | def generate(): 264 | import random, sys 265 | random.seed(19920206) 266 | 267 | Lmin = 2 ** 2 - 1 268 | Lmax = 2 ** 4 - 1 269 | Xnum = 1000000 270 | voc = 26 271 | 272 | wfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'w') 273 | for id in xrange(Xnum): 274 | tree = BST() 275 | items = (random.randrange(voc) for i in 276 | xrange(random.randint(Lmin, Lmax))) 277 | source = [] 278 | for item in items: 279 | item = chr(item + 65) 280 | tree.insert(item) 281 | source += [str(item)] 282 | source = ' '.join(source) 283 | target = str(tree) 284 | line = '{0} -> {1}'.format(source, target) 285 | wfile.write(line + '\n') 286 | if id % 10000 == 0: 287 | print id 288 | 289 | 290 | def obtain_dataset(): 291 | rfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'r') 292 | line = rfile.readline() 293 | 294 | word2idx = dict() 295 | word2idx[''] = 0 296 | word2idx[''] = 1 297 | pairs = [] 298 | at = 2 299 | lines = 0 300 | while line: 301 | lines += 1 302 | line = line.strip() 303 | source, target = line.split('->') 304 | source = source.split() 305 | target = target.split() 306 | 307 | for w in source: 308 | if w not in word2idx: 309 | word2idx[w] = at 310 | at += 1 311 | for w in target: 312 | if w not in word2idx: 313 | word2idx[w] = at 314 | at += 1 315 | pairs.append((source, target)) 316 | if lines % 20000 == 0: 317 | print lines 318 | line = rfile.readline() 319 | 320 | idx2word = dict() 321 | for v, k in word2idx.items(): 322 | idx2word[k] = v 323 | 324 | Lmax = len(idx2word) 325 | print 'read dataset ok.' 
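generate() above writes one training pair per line in the form "insertion order -> tree string", where the tree string comes from BST.__str__ earlier in this file: a pre-order "( key left right )" nesting with '0' standing in for a missing child and bare keys for leaves. A small self-contained sketch of that serialization (it re-implements the insert/format logic with plain dicts rather than reusing the classes above):

    def insert(node, key):
        # plain dict-based BST insert; duplicates are ignored, as in BSTnode.insert
        if node is None:
            return {'key': key, 'left': None, 'right': None}
        if key < node['key']:
            node['left'] = insert(node['left'], key)
        elif key > node['key']:
            node['right'] = insert(node['right'], key)
        return node

    def nested(node):
        # same format as BST.__str__: '0' for a missing child, bare key for a leaf
        if node is None:
            return '0'
        left, right = nested(node['left']), nested(node['right'])
        if left == '0' and right == '0':
            return str(node['key'])
        return ' '.join(['(', str(node['key']), left, right, ')'])

    root = None
    for item in ['C', 'A', 'B']:
        root = insert(root, item)
    print(nested(root))   # ( C ( A 0 B ) 0 )  -> the line written would be "C A B -> ( C ( A 0 B ) 0 )"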
326 | print Lmax 327 | for i in xrange(Lmax): 328 | print idx2word[i] 329 | 330 | def build_data(data): 331 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[]) 332 | for pair in data: 333 | source, target = pair 334 | A = [word2idx[w] for w in source] 335 | B = [word2idx[w] for w in target] 336 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32') 337 | C = [0 if w not in source else source.index(w) + Lmax for w in target] 338 | 339 | instance['text'] += [source] 340 | instance['summary'] += [target] 341 | instance['source'] += [A] 342 | instance['target'] += [B] 343 | # instance['cc_matrix'] += [C] 344 | instance['target_c'] += [C] 345 | 346 | print instance['target'][5000] 347 | print instance['target_c'][5000] 348 | return instance 349 | 350 | train_set = build_data(pairs[100000:]) 351 | test_set = build_data(pairs[:100000]) 352 | serialize_to_file([train_set, test_set, idx2word, word2idx], 353 | '/home/thoma/Work/Dial-DRL/dataset/BST_1M.data.pkl') 354 | 355 | 356 | if __name__ == '__main__': 357 | generate() 358 | obtain_dataset() -------------------------------------------------------------------------------- /experiments/bst_vest.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ 3 | This is the implementation of Copy-NET 4 | We start from the basic Seq2seq framework for a auto-encoder. 5 | """ 6 | import logging 7 | import time 8 | import numpy as np 9 | import sys 10 | import copy 11 | 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | from experiments.config import setup_lcsts, setup_weibo, setup_syn, setup_bst 14 | from emolga.utils.generic_utils import * 15 | from emolga.models.covc_encdec import NRM 16 | from emolga.models.encdec import NRM as NRM0 17 | from emolga.dataset.build_dataset import deserialize_from_file 18 | from collections import OrderedDict 19 | from fuel import datasets 20 | from fuel import transformers 21 | from fuel import schemes 22 | 23 | # setup = setup_lcsts 24 | # setup = setup_syn 25 | setup = setup_bst 26 | 27 | 28 | def init_logging(logfile): 29 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 30 | datefmt='%m/%d/%Y %H:%M:%S' ) 31 | fh = logging.FileHandler(logfile) 32 | # ch = logging.StreamHandler() 33 | 34 | fh.setFormatter(formatter) 35 | # ch.setFormatter(formatter) 36 | # fh.setLevel(logging.INFO) 37 | # ch.setLevel(logging.INFO) 38 | # logging.getLogger().addHandler(ch) 39 | logging.getLogger().addHandler(fh) 40 | logging.getLogger().setLevel(logging.INFO) 41 | 42 | return logging 43 | 44 | # prepare logging. 45 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 46 | config = setup() # load settings. 
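build_data() in bst_dataset.py above re-encodes each target token that also occurs in the source as its first source position plus Lmax (the vocabulary size), so any id >= Lmax means "copy from that source position", while 0 marks tokens that cannot be copied. A toy illustration (the value of Lmax and the token lists are made up):

    Lmax = 30                      # assumed vocabulary size for this toy example
    source = ['G', 'C', 'K']
    target = ['C', 'Z', 'G']
    target_c = [0 if w not in source else source.index(w) + Lmax for w in target]
    print(target_c)                # [31, 0, 30]: 'C' copies position 1, 'Z' is not copiable, 'G' copies position 0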
47 | for w in config: 48 | print '{0}={1}'.format(w, config[w]) 49 | 50 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 51 | n_rng = np.random.RandomState(config['seed']) 52 | np.random.seed(config['seed']) 53 | rng = RandomStreams(n_rng.randint(2 ** 30)) 54 | logger.info('Start!') 55 | 56 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 57 | 58 | if config['voc_size'] == -1: # not use unk 59 | config['enc_voc_size'] = len(word2idx) 60 | config['dec_voc_size'] = config['enc_voc_size'] 61 | else: 62 | config['enc_voc_size'] = config['voc_size'] 63 | config['dec_voc_size'] = config['enc_voc_size'] 64 | 65 | samples = len(train_set['source']) 66 | logger.info('build dataset done. ' + 67 | 'dataset size: {} ||'.format(samples) + 68 | 'vocabulary size = {0}/ batch size = {1}'.format( 69 | config['dec_voc_size'], config['batch_size'])) 70 | 71 | 72 | def build_data(data): 73 | # create fuel dataset. 74 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 75 | ('target', data['target']), 76 | ('target_c', data['target_c']), 77 | ])) 78 | dataset.example_iteration_scheme \ 79 | = schemes.ShuffledExampleScheme(dataset.num_examples) 80 | return dataset 81 | 82 | 83 | train_data = build_data(train_set) 84 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 85 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 86 | 87 | # train_data_plain = zip(*(train_set['source'], train_set['target'])) 88 | # test_data_plain = zip(*(test_set['source'], test_set['target'])) 89 | 90 | train_size = len(train_data_plain) 91 | test_size = len(test_data_plain) 92 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 93 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 94 | logger.info('load the data ok.') 95 | notrain = False 96 | 97 | # build the agent 98 | if config['copynet']: 99 | agent = NRM(config, n_rng, rng, mode=config['mode'], 100 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 101 | else: 102 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 103 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 104 | 105 | agent.build_() 106 | if notrain: 107 | agent.compile_('display') 108 | else: 109 | agent.compile_('all') 110 | print 'compile ok.' 
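build_data() above wraps the index lists in a Fuel IndexableDataset with a shuffled example scheme, and output_stream() (defined just below) batches it with a ConstantScheme and zero-pads the sequences via the Padding transformer, which also emits *_mask sources. A minimal sketch of that pipeline on toy data, assuming a working Fuel install and following the source names used in this script:

    from collections import OrderedDict
    from fuel import datasets
    from fuel import schemes
    from fuel import transformers

    toy = OrderedDict([('source', [[3, 5, 2], [7, 1]]),
                       ('target', [[3, 5], [7, 1, 4]])])
    dataset = datasets.IndexableDataset(indexables=toy)
    dataset.example_iteration_scheme = schemes.ShuffledExampleScheme(dataset.num_examples)

    stream = transformers.Batch(dataset.get_example_stream(),
                                iteration_scheme=schemes.ConstantScheme(2))
    stream = transformers.Padding(stream, mask_sources=('source', 'target'))
    batch = next(stream.get_epoch_iterator(as_dict=True))
    print(sorted(batch.keys()))     # ['source', 'source_mask', 'target', 'target_mask']
    print(batch['source'].shape)    # (2, 3): both sequences zero-padded to the longest one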
111 | 112 | # load the model 113 | # agent.load(config['path_h5'] + 114 | # '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format('20160229-105153', 1, config['modelname'])) 115 | 116 | echo = 0 117 | epochs = 10 118 | skip = -1 # 25000 119 | if echo > 0: 120 | tmark = '20160229-105153' # '20160227-013418' # copynet multi-source model 121 | agent.load(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname'])) 122 | 123 | while echo < epochs: 124 | echo += 1 125 | loss = [] 126 | 127 | def output_stream(dataset, batch_size, size=1): 128 | data_stream = dataset.get_example_stream() 129 | data_stream = transformers.Batch(data_stream, 130 | iteration_scheme=schemes.ConstantScheme(batch_size)) 131 | 132 | # add padding and masks to the dataset 133 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 134 | return data_stream 135 | 136 | def prepare_batch(batch, mask, fix_len=None): 137 | data = batch[mask].astype('int32') 138 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 139 | 140 | def cut_zeros(data, fix_len=None): 141 | if fix_len is not None: 142 | return data[:, : fix_len] 143 | for k in range(data.shape[1] - 1, 0, -1): 144 | data_col = data[:, k].sum() 145 | if data_col > 0: 146 | return data[:, : k + 2] 147 | return data 148 | data = cut_zeros(data, fix_len) 149 | return data 150 | 151 | def cc_martix(source, target): 152 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 153 | for k in xrange(source.shape[0]): 154 | for j in xrange(target.shape[1]): 155 | for i in xrange(source.shape[1]): 156 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 157 | cc[k][j][i] = 1. 158 | return cc 159 | 160 | def unk_filter(data): 161 | if config['voc_size'] == -1: 162 | return copy.copy(data) 163 | else: 164 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 165 | data = copy.copy(data * mask + (1 - mask)) 166 | return data 167 | 168 | # training 169 | if not notrain: 170 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 171 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 172 | progbar = Progbar(train_size / config['batch_size']) 173 | for it, batch in enumerate(train_batches): 174 | 175 | # skip some iterations 176 | if echo == 1 and it < skip: 177 | continue 178 | 179 | # obtain data 180 | data_s = prepare_batch(batch, 'source') 181 | data_t = prepare_batch(batch, 'target') 182 | if config['copynet']: 183 | data_c = cc_martix(data_s, data_t) 184 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 185 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 186 | else: 187 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 188 | 189 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 190 | 191 | if it % 200 == 0: 192 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 193 | logger.info('generating [training set] samples') 194 | for _ in xrange(5): 195 | idx = int(np.floor(n_rng.rand() * train_size)) 196 | train_s, train_t = train_data_plain[idx] 197 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 198 | np.asarray(train_t, dtype='int32'), 199 | idx2word, 200 | np.asarray(unk_filter(train_s), dtype='int32')) 201 | print '*' * 50 202 | 203 | logger.info('generating [testing set] samples') 204 | for _ in xrange(5): 205 | idx = int(np.floor(n_rng.rand() * test_size)) 206 | test_s, test_t = 
test_data_plain[idx] 207 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 208 | np.asarray(test_t, dtype='int32'), 209 | idx2word, 210 | np.asarray(unk_filter(test_s), dtype='int32')) 211 | print '*' * 50 212 | 213 | # save the weights. 214 | if it % 5000 == 0: 215 | agent.save(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname'])) 216 | 217 | if (it % 5000 == 0) and it > 0: 218 | print 'testing accuracy !!' 219 | 220 | def analysis_(data_plain, t_idx, mode='Training'): 221 | progbar_tr = Progbar(2000) 222 | print '\n' + '__' * 50 223 | cpy, cpy_pos = 0, 0 224 | for it, idx in enumerate(t_idx): 225 | train_s, train_t = data_plain[idx] 226 | c = float(agent.analyse_(np.asarray(train_s, dtype='int32'), 227 | np.asarray(train_t, dtype='int32'), 228 | idx2word)) 229 | # copy mode 230 | cpy += 1 231 | cpy_pos += c 232 | progbar_tr.update(it + 1, [('Copy', cpy_pos)]) 233 | logger.info('\n{0} Accuracy:' + 234 | '\t{1}/{2} = {3}%'.format(mode, cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 235 | print '==' * 50 236 | 237 | # analysis_(train_data_plain, tr_idx, 'Training') 238 | analysis_(test_data_plain, ts_idx, 'Testing') 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- /experiments/config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MultiPath/CopyNet/dba24d58d505abaa169e05ee762987f37f41f566/experiments/config.pyc -------------------------------------------------------------------------------- /experiments/copynet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | 9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 10 | from experiments.config import setup 11 | from emolga.utils.generic_utils import * 12 | from emolga.models.encdec import * 13 | from emolga.dataset.build_dataset import deserialize_from_file 14 | from collections import OrderedDict 15 | from fuel import datasets 16 | from fuel import transformers 17 | from fuel import schemes 18 | 19 | 20 | def init_logging(logfile): 21 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 22 | datefmt='%m/%d/%Y %H:%M:%S' ) 23 | fh = logging.FileHandler(logfile) 24 | # ch = logging.StreamHandler() 25 | 26 | fh.setFormatter(formatter) 27 | # ch.setFormatter(formatter) 28 | # fh.setLevel(logging.INFO) 29 | # ch.setLevel(logging.INFO) 30 | # logging.getLogger().addHandler(ch) 31 | logging.getLogger().addHandler(fh) 32 | logging.getLogger().setLevel(logging.INFO) 33 | 34 | return logging 35 | 36 | # prepare logging. 37 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 38 | config = setup() # load settings. 
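cc_martix() in bst_vest.py above (original spelling kept) materialises the copy supervision used by the CopyNet variant: for every sample k it builds a (target length x source length) 0/1 matrix whose entry (j, i) is 1 exactly when target token j equals the non-padding source token at position i. A toy numpy sketch of the same computation:

    import numpy as np

    def cc_martix(source, target):
        # cc[k, j, i] = 1 when target token j of sample k equals non-padding source token i
        cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
        for k in range(source.shape[0]):
            for j in range(target.shape[1]):
                for i in range(source.shape[1]):
                    if source[k, i] == target[k, j] and source[k, i] > 0:
                        cc[k, j, i] = 1.
        return cc

    src = np.array([[4, 9, 4, 0]], dtype='int32')   # one sample, zero-padded
    trg = np.array([[9, 4, 0]], dtype='int32')
    print(cc_martix(src, trg)[0])
    # [[0. 1. 0. 0.]   token 9 is copiable from source position 1
    #  [1. 0. 1. 0.]   token 4 occurs at source positions 0 and 2
    #  [0. 0. 0. 0.]]  the padding id 0 is never marked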
39 | for w in config: 40 | print '{0}={1}'.format(w, config[w]) 41 | 42 | logger = init_logging(config['path_log'] + '/experiments.Copy.id={}.log'.format(tmark)) 43 | n_rng = np.random.RandomState(config['seed']) 44 | np.random.seed(config['seed']) 45 | rng = RandomStreams(n_rng.randint(2 ** 30)) 46 | logger.info('Start!') 47 | 48 | idx2word, word2idx, idx2word_o, word2idx_o \ 49 | = deserialize_from_file(config['voc']) 50 | idx2word_o[0] = '' 51 | word2idx_o[''] = 0 52 | 53 | source, target, origin = deserialize_from_file(config['dataset']) 54 | samlpes = len(source) 55 | 56 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 57 | config['dec_voc_size'] = config['enc_voc_size'] 58 | logger.info('build dataset done. ' + 59 | 'dataset size: {} ||'.format(samlpes) + 60 | 'vocabulary size = {0}/ batch size = {1}'.format( 61 | config['dec_voc_size'], config['batch_size'])) 62 | 63 | 64 | def build_data(source, target): 65 | # create fuel dataset. 66 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', source), ('target', target)])) 67 | dataset.example_iteration_scheme \ 68 | = schemes.ShuffledExampleScheme(dataset.num_examples) 69 | return dataset, len(source) 70 | 71 | 72 | train_data, train_size = build_data(source[int(0.2 * samlpes):], target[int(0.2 * samlpes):]) 73 | train_data_plain = zip(*(source[int(0.2 * samlpes):], target[int(0.2 * samlpes):], origin[int(0.2 * samlpes):])) 74 | test_data_plain = zip(*(source[:int(0.2 * samlpes)], target[:int(0.2 * samlpes)], origin[:int(0.2 * samlpes)])) 75 | test_size = len(test_data_plain) 76 | logger.info('load the data ok.') 77 | 78 | # build the agent 79 | agent = NRM(config, n_rng, rng, mode=config['mode'], 80 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 81 | agent.build_() 82 | agent.compile_('all') 83 | print 'compile ok.' 
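The vocabulary sizing above takes the largest index in word2idx plus one, so every index the encoder/decoder can encounter maps to a valid embedding row. A quick toy check of that invariant (token names are made up; this uses max over values, which has the same effect as the max(zip(*items)[1]) idiom above):

    word2idx_toy = {'pad': 0, 'unk': 1, 'mary': 2, 'garden': 3}
    enc_voc_size = max(word2idx_toy.values()) + 1
    assert all(idx < enc_voc_size for idx in word2idx_toy.values())
    print(enc_voc_size)   # 4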
84 | 85 | echo = 0 86 | epochs = 10 87 | while echo < epochs: 88 | echo += 1 89 | loss = [] 90 | 91 | def output_stream(dataset, batch_size, size=1): 92 | data_stream = dataset.get_example_stream() 93 | data_stream = transformers.Batch(data_stream, 94 | iteration_scheme=schemes.ConstantScheme(batch_size)) 95 | 96 | # add padding and masks to the dataset 97 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 98 | return data_stream 99 | 100 | def prepare_batch(batch, mask): 101 | data = batch[mask].astype('int32') 102 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 103 | 104 | def cut_zeros(data): 105 | for k in range(data.shape[1] - 1, 0, -1): 106 | data_col = data[:, k].sum() 107 | if data_col > 0: 108 | return data[:, : k + 2] 109 | return data 110 | data = cut_zeros(data) 111 | return data 112 | 113 | # training 114 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 115 | logger.info('Epoch = {} -> Training Set Learning...'.format(echo)) 116 | progbar = Progbar(train_size / config['batch_size']) 117 | for it, batch in enumerate(train_batches): 118 | # obtain data 119 | data_s, data_t = prepare_batch(batch, 'source'), prepare_batch(batch, 'target') 120 | loss += [agent.train_(data_s, data_t)] 121 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 122 | 123 | if it % 500 == 0: 124 | logger.info('generating [training set] samples') 125 | for _ in xrange(5): 126 | idx = int(np.floor(n_rng.rand() * train_size)) 127 | train_s, train_t, train_o = train_data_plain[idx] 128 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 129 | np.asarray(train_t, dtype='int32'), 130 | idx2word, np.asarray(train_o, dtype='int32'), idx2word_o) 131 | print '*' * 50 132 | 133 | logger.info('generating [testing set] samples') 134 | for _ in xrange(5): 135 | idx = int(np.floor(n_rng.rand() * test_size)) 136 | test_s, test_t, test_o = test_data_plain[idx] 137 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 138 | np.asarray(test_t, dtype='int32'), 139 | idx2word, np.asarray(test_o, dtype='int32'), idx2word_o) 140 | print '*' * 50 141 | -------------------------------------------------------------------------------- /experiments/copynet_input.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | import time 4 | import numpy as np 5 | import sys 6 | import copy 7 | 8 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 9 | from experiments.config import setup_lcsts 10 | from emolga.utils.generic_utils import * 11 | from emolga.models.covc_encdec import NRM 12 | from emolga.models.encdec import NRM as NRM0 13 | from emolga.dataset.build_dataset import deserialize_from_file 14 | from collections import OrderedDict 15 | from fuel import datasets 16 | from fuel import transformers 17 | from fuel import schemes 18 | 19 | setup = setup_lcsts 20 | 21 | 22 | def init_logging(logfile): 23 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 24 | datefmt='%m/%d/%Y %H:%M:%S' ) 25 | fh = logging.FileHandler(logfile) 26 | # ch = logging.StreamHandler() 27 | 28 | fh.setFormatter(formatter) 29 | # ch.setFormatter(formatter) 30 | # fh.setLevel(logging.INFO) 31 | # ch.setLevel(logging.INFO) 32 | # logging.getLogger().addHandler(ch) 33 | logging.getLogger().addHandler(fh) 34 | logging.getLogger().setLevel(logging.INFO) 35 | return logging 36 | 37 | # prepare 
logging. 38 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 39 | config = setup() # load settings. 40 | for w in config: 41 | print '{0}={1}'.format(w, config[w]) 42 | 43 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 44 | n_rng = np.random.RandomState(config['seed']) 45 | np.random.seed(config['seed']) 46 | rng = RandomStreams(n_rng.randint(2 ** 30)) 47 | logger.info('Start!') 48 | 49 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 50 | if config['voc_size'] == -1: # not use unk 51 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 52 | config['dec_voc_size'] = config['enc_voc_size'] 53 | else: 54 | config['enc_voc_size'] = config['voc_size'] 55 | config['dec_voc_size'] = config['enc_voc_size'] 56 | 57 | samples = len(train_set['source']) 58 | 59 | logger.info('build dataset done. ' + 60 | 'dataset size: {} ||'.format(samples) + 61 | 'vocabulary size = {0}/ batch size = {1}'.format( 62 | config['dec_voc_size'], config['batch_size'])) 63 | 64 | 65 | def unk_filter(data): 66 | if config['voc_size'] == -1: 67 | return copy.copy(data) 68 | else: 69 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 70 | data = copy.copy(data * mask + (1 - mask)) 71 | return data 72 | 73 | source = '临近 岁末 , 新 基金 发行 步入 旺季 , 11 月份 以来 单周 新基 ' + \ 74 | '发行 数 始终保持 35 只 以上 的 高位 , 仅 11 月 25 日 一天 , ' + \ 75 | '就 有 12 只 基金 同时 发售 。 国内 首只 公募 对冲 混合型 基金 — 嘉实 绝对 收益 策略 ' + \ 76 | '定期 混合 基金 自 发行 首日 便 备受 各界 青睐 , 每日 认购 均 能 达到 上 亿' 77 | target = '首只 公募 对冲 基金 每日 吸金 上 亿' 78 | 79 | test_s = [word2idx[w.decode('utf-8')] for w in source.split()] 80 | test_t = [word2idx[w.decode('utf-8')] for w in target.split()] 81 | 82 | logger.info('load the data ok.') 83 | 84 | logger.info('Evaluate CopyNet') 85 | echo = 9 86 | tmark = '20160226-164053' # '20160221-025049' # copy-net model [no unk] 87 | config['copynet'] = True 88 | agent = NRM(config, n_rng, rng, mode=config['mode'], 89 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 90 | agent.build_() 91 | agent.compile_('display') 92 | agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 93 | logger.info('generating [testing set] samples') 94 | 95 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 96 | np.asarray(test_t, dtype='int32'), 97 | idx2word, np.asarray(unk_filter(test_s), dtype='int32')) 98 | logger.info('Complete!') -------------------------------------------------------------------------------- /experiments/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocess the bAbI datset. 
3 | """ 4 | import logging 5 | import os 6 | import sys 7 | import numpy.random as n_rng 8 | from emolga.dataset.build_dataset import serialize_to_file 9 | 10 | data_path = './dataset/bAbI/en-10k/' 11 | data = [] 12 | n_rng.seed(19920206) 13 | 14 | for p, folders, docs in os.walk(data_path): 15 | for doc in docs: 16 | with open(os.path.join(p, doc)) as f: 17 | l = f.readline() 18 | while l: 19 | l = l.strip().lower() 20 | l = l[l.find(' ') + 1:] 21 | if len(l.split('\t')) == 1: 22 | data += [l[:-1].split()] 23 | l = f.readline() 24 | 25 | idx2word = dict(enumerate(set([w for l in data for w in l]), 1)) 26 | word2idx = {v: k for k, v in idx2word.items()} 27 | 28 | persons = [1, 6, 24, 37, 38, 47, 60, 61, 73, 74, 90, 94, 107, 110, 114] 29 | colors = [3, 20, 34, 48, 99, 121] 30 | shapes = [11, 15, 27, 99] 31 | 32 | 33 | def repeat_name(l): 34 | ll = [] 35 | for word in l: 36 | if word2idx[word] in persons: 37 | k = n_rng.randint(5) + 1 38 | ll += [idx2word[persons[i]] for i in n_rng.randint(len(persons), size=k).tolist()] 39 | elif word2idx[word] in colors: 40 | k = n_rng.randint(5) + 1 41 | ll += [idx2word[colors[i]] for i in n_rng.randint(len(colors), size=k).tolist()] 42 | elif word2idx[word] in shapes: 43 | k = n_rng.randint(5) + 1 44 | ll += [idx2word[shapes[i]] for i in n_rng.randint(len(shapes), size=k).tolist()] 45 | else: 46 | ll += [word] 47 | return ll 48 | 49 | data_rep = [repeat_name(l) for l in data] 50 | origin = [[word2idx[w] for w in l] for l in data_rep] 51 | 52 | def replace(word): 53 | if word2idx[word] in [1, 6, 24, 37, 38, 47, 60, 61, 73, 74, 90, 94, 107, 110, 114]: 54 | return '' 55 | elif word2idx[word] in [3, 20, 34, 48, 99, 121]: 56 | return '' 57 | elif word2idx[word] in [11, 15, 27, 99]: 58 | return '' 59 | else: 60 | return word 61 | 62 | # prepare the vocabulary 63 | data_clean = [[replace(w) for w in l] for l in data_rep] 64 | idx2word2 = dict(enumerate(set([w for l in data_clean for w in l]), 1)) 65 | idx2word2[0] = '' 66 | word2idx2 = {v: k for k, v in idx2word2.items()} 67 | Lmax = len(idx2word2) 68 | 69 | for k in xrange(len(idx2word2)): 70 | print k, '\t', idx2word2[k] 71 | print 'Max: {}'.format(Lmax) 72 | 73 | serialize_to_file([idx2word2, word2idx2, idx2word, word2idx], './dataset/bAbI/voc-b.pkl') 74 | 75 | # get ready for the dataset. 
76 | source = [[word2idx2[w] for w in l] for l in data_clean] 77 | target = [[word2idx2[w] if w not in ['', '', ''] 78 | else it + Lmax 79 | for it, w in enumerate(l)] for l in data_clean] 80 | 81 | 82 | def print_str(data): 83 | for d in data: 84 | print ' '.join(str(w) for w in d) 85 | 86 | 87 | print_str(data[10000: 10005]) 88 | print_str(data_rep[10000: 10005]) 89 | print_str(data_clean[10000: 10005]) 90 | print_str(source[10000: 10005]) 91 | print_str(target[10000: 10005]) 92 | 93 | serialize_to_file([source, target, origin], './dataset/bAbI/dataset-b.pkl') 94 | -------------------------------------------------------------------------------- /experiments/lcsts_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import chardet 3 | import sys 4 | import numpy as np 5 | import jieba as jb 6 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 7 | 8 | word2idx = dict() 9 | wordfreq = dict() 10 | word2idx[''] = 0 11 | word2idx[''] = 1 12 | 13 | segment = False # True 14 | 15 | # training set 16 | pairs = [] 17 | f = open('./dataset/LCSTS/PART_I/PART_full.txt', 'r') 18 | line = f.readline().strip() 19 | at = 2 20 | lines = 0 21 | while line: 22 | if line == '': 23 | summary = f.readline().strip().decode('utf-8') 24 | if segment: 25 | summary = [w for w in jb.cut(summary)] 26 | 27 | for w in summary: 28 | if w not in wordfreq: 29 | wordfreq[w] = 1 30 | else: 31 | wordfreq[w] += 1 32 | # if w not in word2idx: 33 | # word2idx[w] = at 34 | # at += 1 35 | 36 | f.readline() 37 | f.readline() 38 | text = f.readline().strip().decode('utf-8') 39 | if segment: 40 | text = [w for w in jb.cut(text)] 41 | for w in text: 42 | if w not in wordfreq: 43 | wordfreq[w] = 1 44 | else: 45 | wordfreq[w] += 1 46 | # if w not in word2idx: 47 | # word2idx[w] = at 48 | # at += 1 49 | 50 | pair = (text, summary) 51 | pairs.append(pair) 52 | lines += 1 53 | if lines % 20000 == 0: 54 | print lines 55 | line = f.readline().strip() 56 | 57 | # testing set 58 | tests = [] 59 | f = open('./dataset/LCSTS/PART_II/PART_II.txt', 'r') 60 | line = f.readline().strip() 61 | lines = 0 62 | while line: 63 | if line == '': 64 | summary = f.readline().strip().decode('utf-8') 65 | if segment: 66 | summary = [w for w in jb.cut(summary)] 67 | 68 | for w in summary: 69 | if w not in wordfreq: 70 | wordfreq[w] = 1 71 | else: 72 | wordfreq[w] += 1 73 | # if w not in word2idx: 74 | # word2idx[w] = at 75 | # at += 1 76 | 77 | f.readline() 78 | f.readline() 79 | text = f.readline().strip().decode('utf-8') 80 | if segment: 81 | text = [w for w in jb.cut(text)] 82 | for w in text: 83 | if w not in wordfreq: 84 | wordfreq[w] = 1 85 | else: 86 | wordfreq[w] += 1 87 | # if w not in word2idx: 88 | # word2idx[w] = at 89 | # at += 1 90 | 91 | pair = (text, summary) 92 | tests.append(pair) 93 | lines += 1 94 | if lines % 20000 == 0: 95 | print lines 96 | line = f.readline().strip() 97 | 98 | print len(pairs), len(tests) 99 | 100 | # sort the vocabulary 101 | wordfreq = sorted(wordfreq.items(), key=lambda a:a[1], reverse=True) 102 | for w in wordfreq: 103 | word2idx[w[0]] = at 104 | at += 1 105 | 106 | idx2word = {k: v for v, k in word2idx.items()} 107 | Lmax = len(idx2word) 108 | print 'read dataset ok.' 
109 | print Lmax 110 | for i in xrange(Lmax): 111 | print idx2word[i].encode('utf-8') 112 | 113 | # use character-based model [on] 114 | # use word-based model [off] 115 | 116 | 117 | def build_data(data): 118 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[]) 119 | for pair in data: 120 | source, target = pair 121 | A = [word2idx[w] for w in source] 122 | B = [word2idx[w] for w in target] 123 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32') 124 | C = [0 if w not in source else source.index(w) + Lmax for w in target] 125 | 126 | instance['text'] += [source] 127 | instance['summary'] += [target] 128 | instance['source'] += [A] 129 | instance['target'] += [B] 130 | # instance['cc_matrix'] += [C] 131 | instance['target_c'] += [C] 132 | 133 | print instance['target'][5000] 134 | print instance['target_c'][5000] 135 | return instance 136 | 137 | 138 | train_set = build_data(pairs) 139 | test_set = build_data(tests) 140 | serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/lcsts_data-char-full.pkl') 141 | -------------------------------------------------------------------------------- /experiments/lcsts_rouge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluation using ROUGE for LCSTS dataset. 3 | """ 4 | # load the testing set. 5 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 6 | import jieba as jb 7 | import logging 8 | import copy 9 | from pyrouge import Rouge155 10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 11 | from experiments.config import setup_lcsts, setup_weibo, setup_syn 12 | from emolga.utils.generic_utils import * 13 | from emolga.models.covc_encdec import NRM 14 | from emolga.models.encdec import NRM as NRM0 15 | from emolga.dataset.build_dataset import deserialize_from_file 16 | from collections import OrderedDict 17 | from fuel import datasets 18 | from fuel import transformers 19 | from fuel import schemes 20 | from pprint import pprint 21 | setup = setup_lcsts 22 | 23 | 24 | def build_evaluation(train_set, segment): 25 | _, _, idx2word, word2idx = deserialize_from_file(train_set) 26 | pairs = [] 27 | f = open('./dataset/LCSTS/PART_III/PART_III.txt', 'r') 28 | line = f.readline().strip() 29 | lines = 0 30 | segment = segment 31 | while line: 32 | if '' in line: 33 | score = int(line[13]) 34 | if score >= 3: 35 | f.readline() 36 | summary = f.readline().strip().decode('utf-8') 37 | if segment: 38 | summary = [w for w in jb.cut(summary)] 39 | target = [] 40 | for w in summary: 41 | if w not in word2idx: 42 | word2idx[w] = len(word2idx) 43 | idx2word[len(idx2word)] = w 44 | target += [word2idx[w]] 45 | 46 | f.readline() 47 | f.readline() 48 | text = f.readline().strip().decode('utf-8') 49 | if segment: 50 | text = [w for w in jb.cut(text)] 51 | source = [] 52 | for w in text: 53 | if w not in word2idx: 54 | word2idx[w] = len(word2idx) 55 | idx2word[len(idx2word)] = w 56 | source += [word2idx[w]] 57 | 58 | pair = (text, summary, score, source, target) 59 | pairs.append(pair) 60 | lines += 1 61 | if lines % 1000 == 0: 62 | print lines 63 | line = f.readline().strip() 64 | print 'lines={}'.format(len(pairs)) 65 | return pairs, word2idx, idx2word 66 | 67 | # words, wwi, wiw = build_evaluation('./dataset/lcsts_data-word-full.pkl', True) 68 | # chars, cwi, ciw = build_evaluation('./dataset/lcsts_data-char-full.pkl', False) 69 | # 70 | # serialize_to_file([words, chars, [wwi, wiw], [cwi, ciw]], 
'./dataset/lcsts_evaluate_data.pkl') 71 | 72 | 73 | def init_logging(logfile): 74 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 75 | datefmt='%m/%d/%Y %H:%M:%S' ) 76 | fh = logging.FileHandler(logfile) 77 | # ch = logging.StreamHandler() 78 | 79 | fh.setFormatter(formatter) 80 | # ch.setFormatter(formatter) 81 | # fh.setLevel(logging.INFO) 82 | # ch.setLevel(logging.INFO) 83 | # logging.getLogger().addHandler(ch) 84 | logging.getLogger().addHandler(fh) 85 | logging.getLogger().setLevel(logging.INFO) 86 | 87 | return logging 88 | 89 | 90 | # prepare logging. 91 | config = setup() # load settings. 92 | for w in config: 93 | print '{0}={1}'.format(w, config[w]) 94 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 95 | logger = init_logging(config['path_log'] + '/experiments.LCSTS.Eval.id={}.log'.format(tmark)) 96 | n_rng = np.random.RandomState(config['seed']) 97 | np.random.seed(config['seed']) 98 | rng = RandomStreams(n_rng.randint(2 ** 30)) 99 | logger.info('Start!') 100 | 101 | segment = config['segment'] 102 | word_set, char_set, word_voc, char_voc = deserialize_from_file('./dataset/lcsts_evaluate_data.pkl') 103 | 104 | if segment: 105 | eval_set = word_set 106 | word2idx, idx2word = word_voc 107 | else: 108 | eval_set = char_set 109 | word2idx, idx2word = char_voc 110 | 111 | if config['voc_size'] == -1: # not use unk 112 | config['enc_voc_size'] = len(word2idx) 113 | config['dec_voc_size'] = config['enc_voc_size'] 114 | else: 115 | config['enc_voc_size'] = config['voc_size'] 116 | config['dec_voc_size'] = config['enc_voc_size'] 117 | 118 | samples = len(eval_set) 119 | logger.info('build dataset done. ' + 120 | 'dataset size: {} ||'.format(samples) + 121 | 'vocabulary size = {0}/ batch size = {1}'.format( 122 | config['dec_voc_size'], config['batch_size'])) 123 | logger.info('load the data ok.') 124 | 125 | # build the agent 126 | if config['copynet']: 127 | agent = NRM(config, n_rng, rng, mode=config['mode'], 128 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 129 | else: 130 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 131 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 132 | 133 | agent.build_() 134 | agent.compile_('display') 135 | print 'compile ok.' 
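# Illustrative note on the evaluation loop further below: both the reference and the generated summary are
# re-tokenised character by character, each character id wrapped as a pseudo-word 't<id>', so that pyrouge
# (which expects whitespace-separated tokens) effectively reports character-level ROUGE; the per-sample
# scores are summed and a running average is printed. A minimal sketch of that wrapping (toy mapping only):
def to_char_tokens(text, char2idx):
    # e.g. to_char_tokens(u'abc', {u'a': 5, u'b': 9, u'c': 2}) -> 't5 t9 t2'
    return ' '.join('t{}'.format(char2idx[c]) for c in text)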
136 | 137 | # load the model 138 | agent.load(config['trained_model']) 139 | 140 | 141 | def unk_filter(data): 142 | if config['voc_size'] == -1: 143 | return copy.copy(data) 144 | else: 145 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 146 | data = copy.copy(data * mask + (1 - mask)) 147 | return data 148 | 149 | rouge = Rouge155(n_words=40) 150 | evalsets = {'rouge_1_f_score': 'R1', 151 | 'rouge_2_f_score': 'R2', 152 | 'rouge_3_f_score': 'R3', 153 | 'rouge_4_f_score': 'R4', 154 | 'rouge_l_f_score': 'RL', 155 | 'rouge_su4_f_score': 'RSU4'} 156 | scores = dict() 157 | for id, sample in enumerate(eval_set): 158 | text, summary, score, source, target = sample 159 | v = agent.evaluate_(np.asarray(source, dtype='int32'), 160 | np.asarray(target, dtype='int32'), 161 | idx2word, 162 | np.asarray(unk_filter(source), dtype='int32')).decode('utf-8').split('\n') 163 | 164 | print 'ID = {} ||'.format(id) + '*' * 50 165 | ref = ' '.join(['t{}'.format(char_voc[0][u]) for u in ''.join([w for w in v[2][9:].split()])]) 166 | sym = ' '.join(['t{}'.format(char_voc[0][u]) for u in ''.join([w for w in v[3][9:].split()])]) 167 | 168 | sssss = rouge.score_summary(sym, {'A': ref}) 169 | 170 | for si in sssss: 171 | if si not in scores: 172 | scores[si] = sssss[si] 173 | else: 174 | scores[si] += sssss[si] 175 | 176 | for e in evalsets: 177 | print '{0}: {1}'.format(evalsets[e], scores[e] / (id + 1)), 178 | print './.' 179 | 180 | # average 181 | for si in scores: 182 | scores[si] /= float(len(eval_set)) 183 | -------------------------------------------------------------------------------- /experiments/lcsts_sample.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | setup = setup_lcsts 23 | 24 | 25 | def init_logging(logfile): 26 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 27 | datefmt='%m/%d/%Y %H:%M:%S' ) 28 | fh = logging.FileHandler(logfile) 29 | # ch = logging.StreamHandler() 30 | 31 | fh.setFormatter(formatter) 32 | # ch.setFormatter(formatter) 33 | # fh.setLevel(logging.INFO) 34 | # ch.setLevel(logging.INFO) 35 | # logging.getLogger().addHandler(ch) 36 | logging.getLogger().addHandler(fh) 37 | logging.getLogger().setLevel(logging.INFO) 38 | return logging 39 | 40 | # prepare logging. 41 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 42 | config = setup() # load settings. 
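# Illustrative note on the unk handling used by this and the following scripts: config['voc_size'] == -1
# keeps the full vocabulary, otherwise every index at or beyond voc_size is mapped to 1 (assumed here to be
# the unknown-word slot reserved by the dataset builders); the original, untruncated sequence is still
# passed to evaluate_ alongside the unk-filtered one, presumably so copied words keep their surface form.
# A minimal numeric sketch of what unk_filter (defined below) computes:
import numpy as np
voc_size_toy = 5
seq = np.asarray([2, 4, 9, 3, 7], dtype='int32')       # 9 and 7 fall outside the truncated vocabulary
mask = np.less(seq, voc_size_toy).astype('int32')      # -> [1, 1, 0, 1, 0]
filtered = seq * mask + (1 - mask)                     # -> [2, 4, 1, 3, 1]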
43 | for w in config: 44 | print '{0}={1}'.format(w, config[w]) 45 | 46 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 47 | n_rng = np.random.RandomState(config['seed']) 48 | np.random.seed(config['seed']) 49 | rng = RandomStreams(n_rng.randint(2 ** 30)) 50 | logger.info('Start!') 51 | 52 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 53 | if config['voc_size'] == -1: # not use unk 54 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 55 | config['dec_voc_size'] = config['enc_voc_size'] 56 | else: 57 | config['enc_voc_size'] = config['voc_size'] 58 | config['dec_voc_size'] = config['enc_voc_size'] 59 | 60 | samples = len(train_set['source']) 61 | 62 | logger.info('build dataset done. ' + 63 | 'dataset size: {} ||'.format(samples) + 64 | 'vocabulary size = {0}/ batch size = {1}'.format( 65 | config['dec_voc_size'], config['batch_size'])) 66 | 67 | 68 | def build_data(data): 69 | # create fuel dataset. 70 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 71 | ('target', data['target']), 72 | ('target_c', data['target_c']), 73 | ])) 74 | dataset.example_iteration_scheme \ 75 | = schemes.ShuffledExampleScheme(dataset.num_examples) 76 | return dataset 77 | 78 | 79 | def unk_filter(data): 80 | if config['voc_size'] == -1: 81 | return copy.copy(data) 82 | else: 83 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 84 | data = copy.copy(data * mask + (1 - mask)) 85 | return data 86 | 87 | 88 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 89 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 90 | train_size = len(train_data_plain) 91 | test_size = len(test_data_plain) 92 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 93 | ts_idx = n_rng.permutation(test_size)[:100].tolist() 94 | 95 | logger.info('load the data ok.') 96 | 97 | # logger.info('Evaluate Enc-Dec') 98 | # log_gen = open(config['path_log'] + '/experiments.CopyLCSTS.generate_{}.log'.format(0), 'w') 99 | # config['copynet'] = True 100 | # echo = 10 101 | # tmark = '20160224-185023' # '20160221-171853' # enc-dec model [no unk] 102 | # agent = NRM(config, n_rng, rng, mode=config['mode'], 103 | # use_attention=True, copynet=config['copynet'], identity=config['identity']) 104 | # agent.build_() 105 | # agent.compile_('display') 106 | # agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 107 | # logger.info('generating [testing set] samples') 108 | # for idx in ts_idx: 109 | # # idx = int(np.floor(n_rng.rand() * test_size)) 110 | # test_s, test_t = test_data_plain[idx] 111 | # v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 112 | # np.asarray(test_t, dtype='int32'), 113 | # idx2word) 114 | # log_gen.write(v) 115 | # log_gen.write('*' * 50 + '\n') 116 | # log_gen.close() 117 | 118 | logger.info('Evaluate CopyNet') 119 | echo = 6 120 | tmark = '20160224-185023' # '20160221-025049' # copy-net model [no unk] 121 | log_cp = open(config['path_logX'] + '/experiments.copy_{0}_{1}.log'.format(tmark, echo), 'w') 122 | config['copynet'] = True 123 | agent = NRM(config, n_rng, rng, mode=config['mode'], 124 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 125 | agent.build_() 126 | agent.compile_('display') 127 | agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 128 | logger.info('generating [testing set] samples') 129 | for idx in 
ts_idx: 130 | # idx = int(np.floor(n_rng.rand() * test_size)) 131 | test_s, test_t = test_data_plain[idx] 132 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 133 | np.asarray(test_t, dtype='int32'), 134 | idx2word, np.asarray(unk_filter(test_s), dtype='int32')) 135 | log_cp.write(v) 136 | log_cp.write('*' * 50 + '\n') 137 | log_cp.close() 138 | logger.info('Complete!') -------------------------------------------------------------------------------- /experiments/lcsts_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | 10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 11 | from experiments.config import setup_lcsts 12 | from emolga.utils.generic_utils import * 13 | from emolga.models.cooc_encdec import NRM 14 | from emolga.models.encdec import NRM as NRM0 15 | from emolga.dataset.build_dataset import deserialize_from_file 16 | from collections import OrderedDict 17 | from fuel import datasets 18 | from fuel import transformers 19 | from fuel import schemes 20 | 21 | setup = setup_lcsts 22 | 23 | 24 | def init_logging(logfile): 25 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 26 | datefmt='%m/%d/%Y %H:%M:%S' ) 27 | fh = logging.FileHandler(logfile) 28 | # ch = logging.StreamHandler() 29 | 30 | fh.setFormatter(formatter) 31 | # ch.setFormatter(formatter) 32 | # fh.setLevel(logging.INFO) 33 | # ch.setLevel(logging.INFO) 34 | # logging.getLogger().addHandler(ch) 35 | logging.getLogger().addHandler(fh) 36 | logging.getLogger().setLevel(logging.INFO) 37 | 38 | return logging 39 | 40 | # prepare logging. 41 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 42 | config = setup() # load settings. 43 | for w in config: 44 | print '{0}={1}'.format(w, config[w]) 45 | 46 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 47 | n_rng = np.random.RandomState(config['seed']) 48 | np.random.seed(config['seed']) 49 | rng = RandomStreams(n_rng.randint(2 ** 30)) 50 | logger.info('Start!') 51 | 52 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 53 | 54 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 55 | config['dec_voc_size'] = config['enc_voc_size'] 56 | samples = len(train_set['source']) 57 | 58 | logger.info('build dataset done. ' + 59 | 'dataset size: {} ||'.format(samples) + 60 | 'vocabulary size = {0}/ batch size = {1}'.format( 61 | config['dec_voc_size'], config['batch_size'])) 62 | 63 | 64 | def build_data(data): 65 | # create fuel dataset. 
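# Illustrative note on the data pipeline defined below: build_data wraps source/target/target_c into a fuel
# IndexableDataset with a shuffled example scheme; output_stream later groups examples into fixed-size
# batches and transformers.Padding adds zero padding plus *_mask arrays; prepare_batch then appends an
# all-zero end-of-sequence column and trims superfluous padding columns. A condensed sketch of the wiring
# (batch size 32 is illustrative; the other names come from this file):
#   stream = transformers.Batch(build_data(train_set).get_example_stream(),
#                               iteration_scheme=schemes.ConstantScheme(32))
#   stream = transformers.Padding(stream, mask_sources=('source', 'target', 'target_c'))
#   for batch in stream.get_epoch_iterator(as_dict=True):
#       data_s = prepare_batch(batch, 'source')          # padded int32 matrix, ready for agent.train_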
66 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 67 | ('target', data['target']), 68 | ('target_c', data['target_c']), 69 | ])) 70 | dataset.example_iteration_scheme \ 71 | = schemes.ShuffledExampleScheme(dataset.num_examples) 72 | return dataset 73 | 74 | 75 | train_data = build_data(train_set) 76 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 77 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 78 | 79 | train_size = len(train_data_plain) 80 | test_size = len(test_data_plain) 81 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 82 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 83 | logger.info('load the data ok.') 84 | 85 | # build the agent 86 | if config['copynet']: 87 | agent = NRM(config, n_rng, rng, mode=config['mode'], 88 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 89 | else: 90 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 91 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 92 | 93 | agent.build_() 94 | agent.compile_('all') 95 | print 'compile ok.' 96 | 97 | echo = 2 98 | epochs = 10 99 | if echo > 0: 100 | tmark = '20160217-232113' 101 | agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 102 | 103 | while echo < epochs: 104 | echo += 1 105 | loss = [] 106 | 107 | def output_stream(dataset, batch_size, size=1): 108 | data_stream = dataset.get_example_stream() 109 | data_stream = transformers.Batch(data_stream, 110 | iteration_scheme=schemes.ConstantScheme(batch_size)) 111 | 112 | # add padding and masks to the dataset 113 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c')) 114 | return data_stream 115 | 116 | def prepare_batch(batch, mask, fix_len=None): 117 | data = batch[mask].astype('int32') 118 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 119 | 120 | def cut_zeros(data, fix_len=None): 121 | if fix_len is not None: 122 | return data[:, : fix_len] 123 | for k in range(data.shape[1] - 1, 0, -1): 124 | data_col = data[:, k].sum() 125 | if data_col > 0: 126 | return data[:, : k + 2] 127 | return data 128 | data = cut_zeros(data, fix_len) 129 | return data 130 | 131 | # training 132 | notrain = False 133 | if not notrain: 134 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 135 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 136 | progbar = Progbar(train_size / config['batch_size']) 137 | for it, batch in enumerate(train_batches): 138 | # obtain data 139 | data_s = prepare_batch(batch, 'source') 140 | data_t = prepare_batch(batch, 'target') 141 | data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 142 | 143 | if config['copynet']: 144 | loss += [agent.train_(data_s, data_t, data_c)] 145 | else: 146 | loss += [agent.train_(data_s, data_t)] 147 | 148 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 149 | 150 | if it % 200 == 0: 151 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 152 | logger.info('generating [training set] samples') 153 | for _ in xrange(5): 154 | idx = int(np.floor(n_rng.rand() * train_size)) 155 | train_s, train_t = train_data_plain[idx] 156 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 157 | np.asarray(train_t, dtype='int32'), 158 | idx2word) 159 | print '*' * 50 160 | 161 | logger.info('generating [testing set] samples') 162 | for _ in 
xrange(5): 163 | idx = int(np.floor(n_rng.rand() * test_size)) 164 | test_s, test_t = test_data_plain[idx] 165 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 166 | np.asarray(test_t, dtype='int32'), 167 | idx2word) 168 | print '*' * 50 169 | 170 | # save the weights. 171 | agent.save(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo)) 172 | 173 | # # test accuracy 174 | # progbar_tr = Progbar(2000) 175 | # 176 | # print '\n' + '__' * 50 177 | # gen, gen_pos = 0, 0 178 | # cpy, cpy_pos = 0, 0 179 | # for it, idx in enumerate(tr_idx): 180 | # train_s, train_t = train_data_plain[idx] 181 | # 182 | # c = agent.analyse_(np.asarray(train_s, dtype='int32'), 183 | # np.asarray(train_t, dtype='int32'), 184 | # idx2word) 185 | # if c[1] == 0: 186 | # # generation mode 187 | # gen += 1 188 | # gen_pos += c[0] 189 | # else: 190 | # # copy mode 191 | # cpy += 1 192 | # cpy_pos += c[0] 193 | # 194 | # progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 195 | # 196 | # logger.info('\nTraining Accuracy:' + 197 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 198 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy))) 199 | # 200 | # progbar_ts = Progbar(2000) 201 | # print '\n' + '__' * 50 202 | # gen, gen_pos = 0, 0 203 | # cpy, cpy_pos = 0, 0 204 | # for it, idx in enumerate(ts_idx): 205 | # test_s, test_t = test_data_plain[idx] 206 | # c = agent.analyse_(np.asarray(test_s, dtype='int32'), 207 | # np.asarray(test_t, dtype='int32'), 208 | # idx2word) 209 | # if c[1] == 0: 210 | # # generation mode 211 | # gen += 1 212 | # gen_pos += c[0] 213 | # else: 214 | # # copy mode 215 | # cpy += 1 216 | # cpy_pos += c[0] 217 | # 218 | # progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 219 | # 220 | # logger.info('\nTesting Accuracy:' + 221 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 222 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy))) 223 | -------------------------------------------------------------------------------- /experiments/lcsts_vest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 
4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts, setup_weibo, setup_syn 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | setup = setup_lcsts 23 | # setup = setup_syn 24 | 25 | 26 | def init_logging(logfile): 27 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 28 | datefmt='%m/%d/%Y %H:%M:%S' ) 29 | fh = logging.FileHandler(logfile) 30 | # ch = logging.StreamHandler() 31 | 32 | fh.setFormatter(formatter) 33 | # ch.setFormatter(formatter) 34 | # fh.setLevel(logging.INFO) 35 | # ch.setLevel(logging.INFO) 36 | # logging.getLogger().addHandler(ch) 37 | logging.getLogger().addHandler(fh) 38 | logging.getLogger().setLevel(logging.INFO) 39 | 40 | return logging 41 | 42 | # prepare logging. 43 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 44 | config = setup() # load settings. 45 | for w in config: 46 | print '{0}={1}'.format(w, config[w]) 47 | 48 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTSXXX.id={}.log'.format(tmark)) 49 | n_rng = np.random.RandomState(config['seed']) 50 | np.random.seed(config['seed']) 51 | rng = RandomStreams(n_rng.randint(2 ** 30)) 52 | logger.info('Start!') 53 | 54 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 55 | 56 | if config['voc_size'] == -1: # not use unk 57 | config['enc_voc_size'] = len(word2idx) 58 | config['dec_voc_size'] = config['enc_voc_size'] 59 | else: 60 | config['enc_voc_size'] = config['voc_size'] 61 | config['dec_voc_size'] = config['enc_voc_size'] 62 | 63 | samples = len(train_set['source']) 64 | logger.info('build dataset done. ' + 65 | 'dataset size: {} ||'.format(samples) + 66 | 'vocabulary size = {0}/ batch size = {1}'.format( 67 | config['dec_voc_size'], config['batch_size'])) 68 | 69 | 70 | def build_data(data): 71 | # create fuel dataset. 
72 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 73 | ('target', data['target']), 74 | ('target_c', data['target_c']), 75 | ])) 76 | dataset.example_iteration_scheme \ 77 | = schemes.ShuffledExampleScheme(dataset.num_examples) 78 | return dataset 79 | 80 | 81 | train_data = build_data(train_set) 82 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 83 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 84 | 85 | # train_data_plain = zip(*(train_set['source'], train_set['target'])) 86 | # test_data_plain = zip(*(test_set['source'], test_set['target'])) 87 | 88 | train_size = len(train_data_plain) 89 | test_size = len(test_data_plain) 90 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 91 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 92 | logger.info('load the data ok.') 93 | 94 | # build the agent 95 | if config['copynet']: 96 | agent = NRM(config, n_rng, rng, mode=config['mode'], 97 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 98 | else: 99 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 100 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 101 | 102 | 103 | echo = 0 104 | epochs = 10 105 | if echo > 0: 106 | tmark = '20160221-025049' # copynet multi-source model 107 | agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.pkl'.format(tmark, echo)) 108 | 109 | # recover from Nan 110 | tmark = '20160307-135907' # '20160301-105813' 111 | skip = 26000 112 | sep = 1 113 | 114 | # tmark = '20160301-114653' 115 | # skip = 14000 116 | agent.build_(0.001, 26000) 117 | agent.compile_('all') 118 | print 'compile ok.' 119 | agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(tmark, sep, skip)) 120 | 121 | 122 | while echo < epochs: 123 | echo += 1 124 | loss = [] 125 | 126 | def output_stream(dataset, batch_size, size=1): 127 | data_stream = dataset.get_example_stream() 128 | data_stream = transformers.Batch(data_stream, 129 | iteration_scheme=schemes.ConstantScheme(batch_size)) 130 | 131 | # add padding and masks to the dataset 132 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 133 | return data_stream 134 | 135 | def prepare_batch(batch, mask, fix_len=None): 136 | data = batch[mask].astype('int32') 137 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 138 | 139 | def cut_zeros(data, fix_len=None): 140 | if fix_len is not None: 141 | return data[:, : fix_len] 142 | for k in range(data.shape[1] - 1, 0, -1): 143 | data_col = data[:, k].sum() 144 | if data_col > 0: 145 | return data[:, : k + 2] 146 | return data 147 | data = cut_zeros(data, fix_len) 148 | return data 149 | 150 | def cc_martix(source, target): 151 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 152 | for k in xrange(source.shape[0]): 153 | for j in xrange(target.shape[1]): 154 | for i in xrange(source.shape[1]): 155 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 156 | cc[k][j][i] = 1. 
157 | return cc 158 | 159 | def unk_filter(data): 160 | if config['voc_size'] == -1: 161 | return copy.copy(data) 162 | else: 163 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 164 | data = copy.copy(data * mask + (1 - mask)) 165 | return data 166 | 167 | # training 168 | notrain = False 169 | if not notrain: 170 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 171 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 172 | progbar = Progbar(train_size / config['batch_size']) 173 | for it, batch in enumerate(train_batches): 174 | if (echo < sep): 175 | continue 176 | 177 | if (echo == sep) and (skip > it): 178 | if it % 200 == 0: 179 | print it 180 | continue 181 | 182 | # obtain data 183 | data_s = prepare_batch(batch, 'source') 184 | data_t = prepare_batch(batch, 'target') 185 | if config['copynet']: 186 | data_c = cc_martix(data_s, data_t) 187 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 188 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 189 | else: 190 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 191 | 192 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 193 | 194 | if it % 200 == 0: 195 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 196 | logger.info('generating [training set] samples') 197 | for _ in xrange(5): 198 | idx = int(np.floor(n_rng.rand() * train_size)) 199 | train_s, train_t = train_data_plain[idx] 200 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 201 | np.asarray(train_t, dtype='int32'), 202 | idx2word, 203 | np.asarray(unk_filter(train_s), dtype='int32')) 204 | print '*' * 50 205 | 206 | logger.info('generating [testing set] samples') 207 | for _ in xrange(5): 208 | idx = int(np.floor(n_rng.rand() * test_size)) 209 | test_s, test_t = test_data_plain[idx] 210 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 211 | np.asarray(test_t, dtype='int32'), 212 | idx2word, 213 | np.asarray(unk_filter(test_s), dtype='int32')) 214 | print '*' * 50 215 | 216 | # save the weights. 
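# Illustrative note: the save that follows runs every 2000 minibatches and stamps both the epoch and the
# iteration into the file name; together with the sep/skip counters checked at the top of the batch loop,
# this is what lets the script recover from a NaN by reloading an iteration-level checkpoint and
# fast-forwarding over the batches already seen. A minimal sketch of the resume condition (toy position):
sep_toy, skip_toy = 1, 26000              # the values hard-coded above after the NaN recovery
echo_toy, it_toy = 1, 14000               # an example position inside epoch 1
already_seen = (echo_toy < sep_toy) or (echo_toy == sep_toy and it_toy < skip_toy)   # -> True, batch skipped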
217 | if it % 2000 == 0: 218 | agent.save(config['path_h5'] + 219 | '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format( 220 | tmark, echo, it)) 221 | 222 | # # test accuracy 223 | test = False 224 | if test: 225 | progbar_tr = Progbar(2000) 226 | 227 | print '\n' + '__' * 50 228 | cpy, cpy_pos = 0, 0 229 | for it, idx in enumerate(tr_idx): 230 | train_s, train_t = train_data_plain[idx] 231 | 232 | c = agent.analyse_(np.asarray(train_s, dtype='int32'), 233 | np.asarray(train_t, dtype='int32'), 234 | idx2word) 235 | # copy mode 236 | cpy += 1 237 | cpy_pos += c 238 | progbar_tr.update(it + 1, [('Copy', cpy_pos)]) 239 | 240 | logger.info('\nTraining Accuracy:' + 241 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 242 | 243 | progbar_ts = Progbar(2000) 244 | print '\n' + '__' * 50 245 | 246 | cpy, cpy_pos = 0, 0 247 | for it, idx in enumerate(ts_idx): 248 | test_s, test_t = test_data_plain[idx] 249 | c = agent.analyse_(np.asarray(test_s, dtype='int32'), 250 | np.asarray(test_t, dtype='int32'), 251 | idx2word) 252 | cpy += 1 253 | cpy_pos += c 254 | progbar_ts.update(it + 1, [('Copy', cpy_pos)]) 255 | 256 | logger.info('\nTesting Accuracy:' + 257 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 258 | -------------------------------------------------------------------------------- /experiments/lcsts_vest_new.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts, setup_weibo, setup_syn 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | setup = setup_lcsts 23 | # setup = setup_syn 24 | 25 | 26 | def init_logging(logfile): 27 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 28 | datefmt='%m/%d/%Y %H:%M:%S' ) 29 | fh = logging.FileHandler(logfile) 30 | # ch = logging.StreamHandler() 31 | 32 | fh.setFormatter(formatter) 33 | # ch.setFormatter(formatter) 34 | # fh.setLevel(logging.INFO) 35 | # ch.setLevel(logging.INFO) 36 | # logging.getLogger().addHandler(ch) 37 | logging.getLogger().addHandler(fh) 38 | logging.getLogger().setLevel(logging.INFO) 39 | 40 | return logging 41 | 42 | # prepare logging. 43 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 44 | config = setup() # load settings. 
45 | for w in config: 46 | print '{0}={1}'.format(w, config[w]) 47 | 48 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTSXXX.id={}.log'.format(tmark)) 49 | n_rng = np.random.RandomState(config['seed']) 50 | np.random.seed(config['seed']) 51 | rng = RandomStreams(n_rng.randint(2 ** 30)) 52 | logger.info('Start!') 53 | 54 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 55 | 56 | if config['voc_size'] == -1: # not use unk 57 | config['enc_voc_size'] = len(word2idx) 58 | config['dec_voc_size'] = config['enc_voc_size'] 59 | else: 60 | config['enc_voc_size'] = config['voc_size'] 61 | config['dec_voc_size'] = config['enc_voc_size'] 62 | 63 | samples = len(train_set['source']) 64 | logger.info('build dataset done. ' + 65 | 'dataset size: {} ||'.format(samples) + 66 | 'vocabulary size = {0}/ batch size = {1}'.format( 67 | config['dec_voc_size'], config['batch_size'])) 68 | 69 | 70 | def build_data(data): 71 | # create fuel dataset. 72 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 73 | ('target', data['target']), 74 | ('target_c', data['target_c']), 75 | ])) 76 | dataset.example_iteration_scheme \ 77 | = schemes.ShuffledExampleScheme(dataset.num_examples) 78 | return dataset 79 | 80 | 81 | train_data = build_data(train_set) 82 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 83 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 84 | 85 | # train_data_plain = zip(*(train_set['source'], train_set['target'])) 86 | # test_data_plain = zip(*(test_set['source'], test_set['target'])) 87 | 88 | train_size = len(train_data_plain) 89 | test_size = len(test_data_plain) 90 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 91 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 92 | logger.info('load the data ok.') 93 | 94 | # build the agent 95 | if config['copynet']: 96 | agent = NRM(config, n_rng, rng, mode=config['mode'], 97 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 98 | else: 99 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 100 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 101 | 102 | agent.build_() 103 | agent.compile_('all') 104 | print 'compile ok.' 
105 | 106 | echo = 0 107 | epochs = 10 108 | if echo > 0: 109 | tmark = '20160221-025049' # copynet multi-source model 110 | agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.pkl'.format(tmark, echo)) 111 | 112 | # recover from Nan 113 | # tmark = '20160301-105813' 114 | skip = -1 #100000 115 | sep = -1 #2 116 | 117 | # tmark = '20160301-114653' 118 | # skip = 14000 119 | # agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(tmark, sep, skip)) 120 | 121 | 122 | while echo < epochs: 123 | echo += 1 124 | loss = [] 125 | 126 | def output_stream(dataset, batch_size, size=1): 127 | data_stream = dataset.get_example_stream() 128 | data_stream = transformers.Batch(data_stream, 129 | iteration_scheme=schemes.ConstantScheme(batch_size)) 130 | 131 | # add padding and masks to the dataset 132 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 133 | return data_stream 134 | 135 | def prepare_batch(batch, mask, fix_len=None): 136 | data = batch[mask].astype('int32') 137 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 138 | 139 | def cut_zeros(data, fix_len=None): 140 | if fix_len is not None: 141 | return data[:, : fix_len] 142 | for k in range(data.shape[1] - 1, 0, -1): 143 | data_col = data[:, k].sum() 144 | if data_col > 0: 145 | return data[:, : k + 2] 146 | return data 147 | data = cut_zeros(data, fix_len) 148 | return data 149 | 150 | def cc_martix(source, target): 151 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 152 | for k in xrange(source.shape[0]): 153 | for j in xrange(target.shape[1]): 154 | for i in xrange(source.shape[1]): 155 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 156 | cc[k][j][i] = 1. 
157 | return cc 158 | 159 | def unk_filter(data): 160 | if config['voc_size'] == -1: 161 | return copy.copy(data) 162 | else: 163 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 164 | data = copy.copy(data * mask + (1 - mask)) 165 | return data 166 | 167 | # training 168 | notrain = False 169 | if not notrain: 170 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 171 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 172 | progbar = Progbar(train_size / config['batch_size']) 173 | for it, batch in enumerate(train_batches): 174 | if (echo < sep): 175 | continue 176 | 177 | if (echo == sep) and (skip > it): 178 | if it % 200 == 0: 179 | print it 180 | continue 181 | 182 | # obtain data 183 | data_s = prepare_batch(batch, 'source') 184 | data_t = prepare_batch(batch, 'target') 185 | if config['copynet']: 186 | data_c = cc_martix(data_s, data_t) 187 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 188 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 189 | else: 190 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 191 | 192 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 193 | 194 | if it % 200 == 0: 195 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 196 | logger.info('generating [training set] samples') 197 | for _ in xrange(5): 198 | idx = int(np.floor(n_rng.rand() * train_size)) 199 | train_s, train_t = train_data_plain[idx] 200 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 201 | np.asarray(train_t, dtype='int32'), 202 | idx2word, 203 | np.asarray(unk_filter(train_s), dtype='int32')) 204 | print '*' * 50 205 | 206 | logger.info('generating [testing set] samples') 207 | for _ in xrange(5): 208 | idx = int(np.floor(n_rng.rand() * test_size)) 209 | test_s, test_t = test_data_plain[idx] 210 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 211 | np.asarray(test_t, dtype='int32'), 212 | idx2word, 213 | np.asarray(unk_filter(test_s), dtype='int32')) 214 | print '*' * 50 215 | 216 | # save the weights. 
217 | if it % 2000 == 0: 218 | agent.save(config['path_h5'] + 219 | '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format( 220 | tmark, echo, it)) 221 | 222 | # # test accuracy 223 | test = False 224 | if test: 225 | progbar_tr = Progbar(2000) 226 | 227 | print '\n' + '__' * 50 228 | cpy, cpy_pos = 0, 0 229 | for it, idx in enumerate(tr_idx): 230 | train_s, train_t = train_data_plain[idx] 231 | 232 | c = agent.analyse_(np.asarray(train_s, dtype='int32'), 233 | np.asarray(train_t, dtype='int32'), 234 | idx2word) 235 | # copy mode 236 | cpy += 1 237 | cpy_pos += c 238 | progbar_tr.update(it + 1, [('Copy', cpy_pos)]) 239 | 240 | logger.info('\nTraining Accuracy:' + 241 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 242 | 243 | progbar_ts = Progbar(2000) 244 | print '\n' + '__' * 50 245 | 246 | cpy, cpy_pos = 0, 0 247 | for it, idx in enumerate(ts_idx): 248 | test_s, test_t = test_data_plain[idx] 249 | c = agent.analyse_(np.asarray(test_s, dtype='int32'), 250 | np.asarray(test_t, dtype='int32'), 251 | idx2word) 252 | cpy += 1 253 | cpy_pos += c 254 | progbar_ts.update(it + 1, [('Copy', cpy_pos)]) 255 | 256 | logger.info('\nTesting Accuracy:' + 257 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 258 | -------------------------------------------------------------------------------- /experiments/movie_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 3 | import string 4 | import random 5 | import sys 6 | 7 | random.seed(19920206) 8 | word2idx = dict() 9 | wordfreq = dict() 10 | word2idx[''] = 0 11 | word2idx[''] = 1 12 | word2freq = dict() 13 | 14 | 15 | def mark(line): 16 | tmp_line = '' 17 | for c in line: 18 | if c in string.punctuation: 19 | if c is not "'": 20 | tmp_line += ' ' + c + ' ' 21 | else: 22 | tmp_line += ' ' + c 23 | else: 24 | tmp_line += c 25 | tmp_line = tmp_line.lower() 26 | words = [w for w in tmp_line.split() if len(w) > 0] 27 | for w in words: 28 | if w not in word2freq: 29 | word2freq[w] = 1 30 | else: 31 | word2freq[w] += 1 32 | return words 33 | 34 | 35 | fline = open('./dataset/cornell_movie/movie_lines.txt', 'r') 36 | sets = [w.split('+++$+++') for w in fline.read().split('\n')] 37 | lines = {w[0].strip(): mark(w[-1].strip()) for w in sets} 38 | # 39 | # for w in lines: 40 | # if len(lines[w]) == 0: 41 | # print w 42 | 43 | fline.close() 44 | print 'read lines ok' 45 | fconv = open('./dataset/cornell_movie/movie_conversations.txt', 'r') 46 | 47 | turns = [] 48 | convs = fconv.readline() 49 | while convs: 50 | turn = eval(convs.split('+++$+++')[-1].strip()) 51 | turns += zip(turn[:-1], turn[1:]) 52 | convs = fconv.readline() 53 | 54 | pairs = [(lines[a], lines[b]) for a, b in turns 55 | if len(lines[a]) > 0 and len(lines[b]) > 0] 56 | 57 | # shuffle! 58 | random.shuffle(pairs) 59 | 60 | word2freq = sorted(word2freq.items(), key=lambda a: a[1], reverse=True) 61 | for at, w in enumerate(word2freq): 62 | word2idx[w[0]] = at + 2 63 | 64 | idx2word = {k: v for v, k in word2idx.items()} 65 | print idx2word[1], idx2word[2] 66 | 67 | Lmax = len(idx2word) 68 | # for i in xrange(Lmax): 69 | # print idx2word[i] 70 | print 'read dataset ok.' 
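# Illustrative note on build_data below: each (utterance, reply) pair is stored both as plain index
# sequences and as target_c, where a reply token gets 0 if it has to be generated from the vocabulary and
# first-source-position + Lmax if it also occurs in the utterance and can be copied. A toy sketch
# (Lmax_toy and the words are invented):
Lmax_toy = 100
source_toy = ['where', 'is', 'bob', '?']
target_toy = ['bob', 'is', 'gone']
target_c_toy = [0 if w not in source_toy else source_toy.index(w) + Lmax_toy for w in target_toy]
# -> [102, 101, 0]: 'bob' copies source position 2, 'is' position 1, 'gone' must be generated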
71 | print Lmax 72 | print pairs[0] 73 | 74 | 75 | def build_data(data): 76 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[]) 77 | print len(data) 78 | for pair in data: 79 | source, target = pair 80 | A = [word2idx[w] for w in source] 81 | B = [word2idx[w] for w in target] 82 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32') 83 | C = [0 if w not in source else source.index(w) + Lmax for w in target] 84 | 85 | instance['text'] += [source] 86 | instance['summary'] += [target] 87 | instance['source'] += [A] 88 | instance['target'] += [B] 89 | # instance['cc_matrix'] += [C] 90 | instance['target_c'] += [C] 91 | 92 | print instance['source'][4000] 93 | print instance['target'][4000] 94 | print instance['target_c'][4000] 95 | return instance 96 | 97 | 98 | train_set = build_data(pairs[10000:]) 99 | test_set = build_data(pairs[:10000]) 100 | serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/movie_dialogue_data.pkl') 101 | -------------------------------------------------------------------------------- /experiments/syn_vest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts, setup_weibo, setup_syn, setup_bst 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | # setup = setup_lcsts 23 | setup = setup_syn 24 | # setup = setup_bst 25 | 26 | 27 | def init_logging(logfile): 28 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 29 | datefmt='%m/%d/%Y %H:%M:%S' ) 30 | fh = logging.FileHandler(logfile) 31 | # ch = logging.StreamHandler() 32 | 33 | fh.setFormatter(formatter) 34 | # ch.setFormatter(formatter) 35 | # fh.setLevel(logging.INFO) 36 | # ch.setLevel(logging.INFO) 37 | # logging.getLogger().addHandler(ch) 38 | logging.getLogger().addHandler(fh) 39 | logging.getLogger().setLevel(logging.INFO) 40 | 41 | return logging 42 | 43 | # prepare logging. 44 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 45 | config = setup() # load settings. 46 | for w in config: 47 | print '{0}={1}'.format(w, config[w]) 48 | 49 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark)) 50 | n_rng = np.random.RandomState(config['seed']) 51 | np.random.seed(config['seed']) 52 | rng = RandomStreams(n_rng.randint(2 ** 30)) 53 | logger.info('Start!') 54 | 55 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 56 | 57 | if config['voc_size'] == -1: # not use unk 58 | config['enc_voc_size'] = len(word2idx) 59 | config['dec_voc_size'] = config['enc_voc_size'] 60 | else: 61 | config['enc_voc_size'] = config['voc_size'] 62 | config['dec_voc_size'] = config['enc_voc_size'] 63 | 64 | samples = len(train_set['source']) 65 | logger.info('build dataset done. 
' + 66 | 'dataset size: {} ||'.format(samples) + 67 | 'vocabulary size = {0}/ batch size = {1}'.format( 68 | config['dec_voc_size'], config['batch_size'])) 69 | 70 | 71 | def build_data(data): 72 | # create fuel dataset. 73 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 74 | ('target', data['target']), 75 | ('target_c', data['target_c']), 76 | ])) 77 | dataset.example_iteration_scheme \ 78 | = schemes.ShuffledExampleScheme(dataset.num_examples) 79 | return dataset 80 | 81 | 82 | train_data = build_data(train_set) 83 | train_data_plain = zip(*(train_set['source'], train_set['target'], train_set['rule'])) 84 | test_data_plain = zip(*(test_set['source'], test_set['target'], test_set['rule'])) 85 | 86 | # train_data_plain = zip(*(train_set['source'], train_set['target'])) 87 | # test_data_plain = zip(*(test_set['source'], test_set['target'])) 88 | 89 | train_size = len(train_data_plain) 90 | test_size = len(test_data_plain) 91 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 92 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 93 | logger.info('load the data ok.') 94 | config['copynet'] = True # False 95 | notrain = True 96 | 97 | # build the agent 98 | if config['copynet']: 99 | agent = NRM(config, n_rng, rng, mode=config['mode'], 100 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 101 | else: 102 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 103 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 104 | 105 | agent.build_() 106 | if notrain: 107 | agent.compile_('display') 108 | else: 109 | agent.compile_('all') 110 | print 'compile ok.' 111 | 112 | echo = 6 113 | epochs = 10 114 | if echo > 0: 115 | tmark = '20160227-013418' # copynet multi-source model 116 | agent.load(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname'])) 117 | 118 | while echo < epochs: 119 | echo += 1 120 | loss = [] 121 | 122 | def output_stream(dataset, batch_size, size=1): 123 | data_stream = dataset.get_example_stream() 124 | data_stream = transformers.Batch(data_stream, 125 | iteration_scheme=schemes.ConstantScheme(batch_size)) 126 | 127 | # add padding and masks to the dataset 128 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 129 | return data_stream 130 | 131 | def prepare_batch(batch, mask, fix_len=None): 132 | data = batch[mask].astype('int32') 133 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 134 | 135 | def cut_zeros(data, fix_len=None): 136 | if fix_len is not None: 137 | return data[:, : fix_len] 138 | for k in range(data.shape[1] - 1, 0, -1): 139 | data_col = data[:, k].sum() 140 | if data_col > 0: 141 | return data[:, : k + 2] 142 | return data 143 | data = cut_zeros(data, fix_len) 144 | return data 145 | 146 | def cc_martix(source, target): 147 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 148 | for k in xrange(source.shape[0]): 149 | for j in xrange(target.shape[1]): 150 | for i in xrange(source.shape[1]): 151 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 152 | cc[k][j][i] = 1. 
153 | return cc 154 | 155 | def unk_filter(data): 156 | if config['voc_size'] == -1: 157 | return copy.copy(data) 158 | else: 159 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 160 | data = copy.copy(data * mask + (1 - mask)) 161 | return data 162 | 163 | # training 164 | if not notrain: 165 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 166 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 167 | progbar = Progbar(train_size / config['batch_size']) 168 | for it, batch in enumerate(train_batches): 169 | # obtain data 170 | data_s = prepare_batch(batch, 'source') 171 | data_t = prepare_batch(batch, 'target') 172 | if config['copynet']: 173 | data_c = cc_martix(data_s, data_t) 174 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 175 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 176 | else: 177 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 178 | 179 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 180 | 181 | if it % 200 == 0: 182 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 183 | logger.info('generating [training set] samples') 184 | for _ in xrange(5): 185 | idx = int(np.floor(n_rng.rand() * train_size)) 186 | train_s, train_t = train_data_plain[idx] 187 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 188 | np.asarray(train_t, dtype='int32'), 189 | idx2word, 190 | np.asarray(unk_filter(train_s), dtype='int32')) 191 | print '*' * 50 192 | 193 | logger.info('generating [testing set] samples') 194 | for _ in xrange(5): 195 | idx = int(np.floor(n_rng.rand() * test_size)) 196 | test_s, test_t = test_data_plain[idx] 197 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 198 | np.asarray(test_t, dtype='int32'), 199 | idx2word, 200 | np.asarray(unk_filter(test_s), dtype='int32')) 201 | print '*' * 50 202 | 203 | # save the weights. 
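# Illustrative note: with notrain set to True further up and test set to True below, this script is used
# mainly for analysis. judge_rule keeps only the out-of-vocabulary symbols of each rule string so that
# samples can be grouped by the pattern they contain, and analysis_ reports copy accuracy per group.
# A toy sketch of the grouping idea (invented vocabulary):
word2idx_toy = {'abc': 1, 'def': 2}                                      # in-vocabulary words
def judge_rule_toy(rule):
    return ''.join(w for w in rule.split() if w not in word2idx_toy)     # keep only the OOV symbols
# judge_rule_toy('abc X def') == 'X' -> samples sharing the same OOV pattern are scored together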
204 | agent.save(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname'])) 205 | 206 | # # test accuracy 207 | def judge_rule(rule): 208 | rule = rule.split() 209 | fine = '' 210 | for w in rule: 211 | if w not in word2idx: 212 | fine += w 213 | return fine 214 | 215 | test = True 216 | if test: 217 | def analysis_(data_plain, mode='Training'): 218 | progbar_tr = Progbar(2000) 219 | print '\n' + '__' * 50 220 | cpy, cpy_pos = 0, 0 221 | types = dict() 222 | for it, idx in enumerate(tr_idx): 223 | train_s, train_t, rule = data_plain[idx] 224 | t = judge_rule(rule) 225 | c = float(agent.analyse_(np.asarray(train_s, dtype='int32'), 226 | np.asarray(train_t, dtype='int32'), 227 | idx2word)) 228 | # copy mode 229 | cpy += 1 230 | cpy_pos += c 231 | if t not in types: 232 | types[t] = {} 233 | types[t][0] = c 234 | types[t][1] = 1 235 | else: 236 | types[t][0] += c 237 | types[t][1] += 1 238 | 239 | progbar_tr.update(it + 1, [('Copy', cpy_pos)]) 240 | logger.info('\n{0} Accuracy:' + 241 | '\t{1}/{2} = {3}%'.format(mode, cpy_pos, cpy, 100 * cpy_pos / float(cpy))) 242 | print '==' * 50 243 | for t in types: 244 | print 'Type: {0}: {1}/{2}={3}%'.format(t, int(types[t][0]), types[t][1], 245 | 100 * types[t][0] / float(types[t][1])) 246 | 247 | 248 | # analysis_(train_data_plain, 'Training') 249 | analysis_(test_data_plain, 'Testing') 250 | -------------------------------------------------------------------------------- /experiments/syntest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | 9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 10 | from experiments.config import setup_syn 11 | from emolga.utils.generic_utils import * 12 | from emolga.models.cooc_encdec import NRM 13 | from emolga.models.encdec import NRM as NRM0 14 | from emolga.dataset.build_dataset import deserialize_from_file 15 | from collections import OrderedDict 16 | from fuel import datasets 17 | from fuel import transformers 18 | from fuel import schemes 19 | 20 | setup = setup_syn 21 | 22 | 23 | def init_logging(logfile): 24 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 25 | datefmt='%m/%d/%Y %H:%M:%S' ) 26 | fh = logging.FileHandler(logfile) 27 | # ch = logging.StreamHandler() 28 | 29 | fh.setFormatter(formatter) 30 | # ch.setFormatter(formatter) 31 | # fh.setLevel(logging.INFO) 32 | # ch.setLevel(logging.INFO) 33 | # logging.getLogger().addHandler(ch) 34 | logging.getLogger().addHandler(fh) 35 | logging.getLogger().setLevel(logging.INFO) 36 | 37 | return logging 38 | 39 | # prepare logging. 40 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 41 | config = setup() # load settings. 42 | for w in config: 43 | print '{0}={1}'.format(w, config[w]) 44 | 45 | logger = init_logging(config['path_log'] + '/experiments.Copy.id={}.log'.format(tmark)) 46 | n_rng = np.random.RandomState(config['seed']) 47 | np.random.seed(config['seed']) 48 | rng = RandomStreams(n_rng.randint(2 ** 30)) 49 | logger.info('Start!') 50 | 51 | # the vocabulary 52 | tmp = [chr(x) for x in range(48, 58)] # '1', ... 
, '9', '0' 53 | voc = [tmp[a] + tmp[b] + tmp[c] 54 | for c in xrange(10) 55 | for b in xrange(10) 56 | for a in xrange(10)] 57 | word2idx = {voc[k]: k + 1 for k in xrange(len(voc))} 58 | word2idx[''] = 0 59 | idx2word = {word2idx[w]: w for w in word2idx} 60 | voc = [''] + voc 61 | 62 | train_set, test_set = deserialize_from_file(config['dataset']) 63 | 64 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 65 | config['dec_voc_size'] = config['enc_voc_size'] 66 | samples = len(train_set['source']) 67 | 68 | logger.info('build dataset done. ' + 69 | 'dataset size: {} ||'.format(samples) + 70 | 'vocabulary size = {0}/ batch size = {1}'.format( 71 | config['dec_voc_size'], config['batch_size'])) 72 | 73 | 74 | def build_data(data): 75 | # create fuel dataset. 76 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 77 | ('target', data['target']), 78 | ('target_c', data['target_c']), 79 | ('rule_id', data['rule_id']), 80 | ('rule', data['rule'])])) 81 | dataset.example_iteration_scheme \ 82 | = schemes.ShuffledExampleScheme(dataset.num_examples) 83 | return dataset 84 | 85 | 86 | train_data = build_data(train_set) 87 | train_data_plain = zip(*(train_set['source'], train_set['target'], train_set['rule_id'], train_set['rule'])) 88 | test_data_plain = zip(*(test_set['source'], test_set['target'], test_set['rule_id'], test_set['rule'])) 89 | 90 | train_size = len(train_data_plain) 91 | test_size = len(test_data_plain) 92 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 93 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 94 | logger.info('load the data ok.') 95 | 96 | # build the agent 97 | if config['copynet']: 98 | agent = NRM(config, n_rng, rng, mode=config['mode'], 99 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 100 | else: 101 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 102 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 103 | 104 | agent.build_() 105 | agent.compile_('all') 106 | print 'compile ok.' 
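# Illustrative note recapping the synthetic vocabulary built above: the 1000 three-character digit strings
# are enumerated with the first character varying fastest, and index 0 is reserved for the empty /
# end-of-sequence marker. A runnable sketch:
tmp_toy = [chr(x) for x in range(48, 58)]            # '0' .. '9'
voc_toy = [tmp_toy[a] + tmp_toy[b] + tmp_toy[c]
           for c in range(10) for b in range(10) for a in range(10)]
# voc_toy[0] == '000', voc_toy[1] == '100', len(voc_toy) == 1000; word2idx maps voc[k] -> k + 1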
107 | 108 | echo = 3 109 | epochs = 4 110 | if echo > 0: 111 | tmark = '20160216-152155' 112 | agent.load(config['path_h5'] + '/experiments.Copy.id={0}.epoch={1}.pkl'.format(tmark, echo)) 113 | 114 | while echo < epochs: 115 | echo += 1 116 | loss = [] 117 | 118 | def output_stream(dataset, batch_size, size=1): 119 | data_stream = dataset.get_example_stream() 120 | data_stream = transformers.Batch(data_stream, 121 | iteration_scheme=schemes.ConstantScheme(batch_size)) 122 | 123 | # add padding and masks to the dataset 124 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c')) 125 | return data_stream 126 | 127 | def prepare_batch(batch, mask, fix_len=None): 128 | data = batch[mask].astype('int32') 129 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 130 | 131 | def cut_zeros(data, fix_len=None): 132 | if fix_len is not None: 133 | return data[:, : fix_len] 134 | for k in range(data.shape[1] - 1, 0, -1): 135 | data_col = data[:, k].sum() 136 | if data_col > 0: 137 | return data[:, : k + 2] 138 | return data 139 | data = cut_zeros(data, fix_len) 140 | return data 141 | 142 | # training 143 | notrain = True 144 | if not notrain: 145 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 146 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 147 | progbar = Progbar(train_size / config['batch_size']) 148 | for it, batch in enumerate(train_batches): 149 | # obtain data 150 | data_s = prepare_batch(batch, 'source') 151 | data_t = prepare_batch(batch, 'target') 152 | data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 153 | 154 | if config['copynet']: 155 | loss += [agent.train_(data_s, data_t, data_c)] 156 | else: 157 | loss += [agent.train_(data_s, data_t)] 158 | 159 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 160 | 161 | if it % 200 == 0: 162 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 163 | logger.info('generating [training set] samples') 164 | for _ in xrange(5): 165 | idx = int(np.floor(n_rng.rand() * train_size)) 166 | train_s, train_t, _, _ = train_data_plain[idx] 167 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 168 | np.asarray(train_t, dtype='int32'), 169 | idx2word) 170 | print '*' * 50 171 | 172 | logger.info('generating [testing set] samples') 173 | for _ in xrange(5): 174 | idx = int(np.floor(n_rng.rand() * test_size)) 175 | test_s, test_t, _, _ = test_data_plain[idx] 176 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 177 | np.asarray(test_t, dtype='int32'), 178 | idx2word) 179 | print '*' * 50 180 | 181 | # save the weights. 
182 | agent.save(config['path_h5'] + '/experiments.Copy.id={0}.epoch={1}.pkl'.format(tmark, echo)) 183 | 184 | # test accuracy 185 | progbar_tr = Progbar(2000) 186 | 187 | print '\n' + '__' * 50 188 | gen, gen_pos = 0, 0 189 | cpy, cpy_pos = 0, 0 190 | grs, crs = [], [] 191 | for it, idx in enumerate(tr_idx): 192 | train_s, train_t, rid, rule = train_data_plain[idx] 193 | 194 | c = agent.analyse_(np.asarray(train_s, dtype='int32'), 195 | np.asarray(train_t, dtype='int32'), 196 | idx2word) 197 | if c[1] == 0: 198 | # generation mode 199 | gen += 1 200 | gen_pos += c[0] 201 | if c[0] == 1: 202 | grs += [rule] 203 | else: 204 | # copy mode 205 | cpy += 1 206 | cpy_pos += c[0] 207 | if c[0] == 1: 208 | crs += [rule] 209 | 210 | progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 211 | grs = set(grs) 212 | crs = set(crs) 213 | irs = set.intersection(grs, crs) 214 | 215 | logger.info('\nTraining Accuracy:' + 216 | '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 217 | '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)) + 218 | '\tGene-Rule: {0}, Copy-Rule: {1}, Intersection: {2}'.format(len(grs), len(crs), len(irs))) 219 | 220 | print 'Generate Mode:' 221 | for r in grs: 222 | print r 223 | 224 | print 'Copy Mode:' 225 | for r in crs: 226 | print r 227 | 228 | print 'Interaction:' 229 | for r in irs: 230 | print r 231 | 232 | progbar_ts = Progbar(2000) 233 | print '\n' + '__' * 50 234 | gen, gen_pos = 0, 0 235 | cpy, cpy_pos = 0, 0 236 | grs, crs = [], [] 237 | for it, idx in enumerate(ts_idx): 238 | test_s, test_t, rid, rule = test_data_plain[idx] 239 | c = agent.analyse_(np.asarray(test_s, dtype='int32'), 240 | np.asarray(test_t, dtype='int32'), 241 | idx2word) 242 | if c[1] == 0: 243 | # generation mode 244 | gen += 1 245 | gen_pos += c[0] 246 | grs += [rule] 247 | else: 248 | # copy mode 249 | cpy += 1 250 | cpy_pos += c[0] 251 | crs += [rule] 252 | 253 | progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 254 | grs = set(grs) 255 | crs = set(crs) 256 | irs = set.intersection(grs, crs) 257 | 258 | logger.info('\nTesting Accuracy:' + 259 | '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 260 | '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)) + 261 | '\tGene-Rule: {0}, Copy-Rule: {1}, Intersection: {2}'.format(len(grs), len(crs), len(irs))) 262 | 263 | print 'Generate Mode:' 264 | for r in grs: 265 | print r 266 | 267 | print 'Copy Mode:' 268 | for r in crs: 269 | print r 270 | 271 | print 'Interaction:' 272 | for r in irs: 273 | print r -------------------------------------------------------------------------------- /experiments/synthetic.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jiataogu' 2 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 3 | import numpy.random as n_rng 4 | 5 | n_rng.seed(19920206) 6 | # the vocabulary 7 | tmp = [chr(x) for x in range(48, 58)] # '1', ... 
, '9', '0' 8 | voc = [tmp[a] + tmp[b] + tmp[c] 9 | for c in xrange(10) 10 | for b in xrange(10) 11 | for a in xrange(10)] 12 | word2idx = {voc[k]: k + 2 for k in xrange(len(voc))} 13 | word2idx[''] = 0 14 | word2idx[''] = 1 15 | idx2word = {word2idx[w]: w for w in word2idx} 16 | voc = ['', ''] + voc 17 | 18 | # word2idx['X'] = len(voc) 19 | # idx2word[len(voc)] = 'X' 20 | # voc += ['X'] 21 | # 22 | # word2idx['Y'] = len(voc) 23 | # idx2word[len(voc)] = 'Y' 24 | # voc += ['Y'] 25 | # print word2idx['X'], word2idx['Y'] 26 | 27 | # load the dataset 28 | Rules, _ = deserialize_from_file('/home/thoma/Work/Dial-DRL/dataset/rules.rnd.n10k.pkl') 29 | num = 200 30 | repeats = 100 31 | maxleg = 15 32 | Lmax = len(idx2word) 33 | rules = dict(source=Rules['source'][:num], 34 | target=Rules['target'][:num]) 35 | 36 | 37 | def ftr(v): 38 | if v < 10: 39 | return '00' + str(v) 40 | elif v < 100: 41 | return '0' + str(v) 42 | else: 43 | return str(v) 44 | 45 | 46 | def build_instance(): 47 | instance = dict(x=[], y=[], source=[], target=[], target_c=[], rule_id=[], rule=[]) 48 | for k in xrange(num): 49 | source = rules['source'][k] 50 | target = rules['target'][k] 51 | 52 | for j in xrange(repeats): 53 | X = n_rng.randint(1000, size= n_rng.randint(maxleg) + 1) 54 | Y = n_rng.randint(1000, size= n_rng.randint(maxleg) + 1) 55 | S = [] 56 | T = [] 57 | for w in source: 58 | if w is 'X': 59 | S += [ftr(v) for v in X] 60 | elif w is 'Y': 61 | S += [ftr(v) for v in Y] 62 | else: 63 | S += [w] 64 | 65 | for w in target: 66 | if w is 'X': 67 | T += [ftr(v) for v in X] 68 | elif w is 'Y': 69 | T += [ftr(v) for v in Y] 70 | else: 71 | T += [w] 72 | 73 | A = [word2idx[w] for w in S] 74 | B = [word2idx[w] for w in T] 75 | C = [0 if w not in S else S.index(w) + Lmax for w in T] 76 | 77 | instance['x'] += [S] 78 | instance['y'] += [T] 79 | instance['source'] += [A] 80 | instance['target'] += [B] 81 | instance['target_c'] += [C] 82 | 83 | instance['rule_id'] += [k] 84 | instance['rule'] += [' '.join(source) + ' -> ' + ' '.join(target)] 85 | 86 | return instance 87 | 88 | train_set = build_instance() 89 | print 'build ok.' 90 | test_set = build_instance() 91 | print 'build ok.' 
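To make the copy-indicator construction above concrete, here is a small worked example with made-up rule words and digit tokens: target_c stores 0 for a target token that does not occur in the source sequence S, and its first source position shifted by Lmax otherwise, so copy positions never collide with ordinary vocabulary indices.

# Worked example (hypothetical S and T) of the line
#   C = [0 if w not in S else S.index(w) + Lmax for w in T]
S = ['abc', '007', '123']      # source words after substituting X
T = ['007', '123', 'zzz']      # target words
Lmax = 1002                    # len(idx2word): 1000 digit tokens + 2 reserved indices
C = [0 if w not in S else S.index(w) + Lmax for w in T]
# C == [1003, 1004, 0]: '007' is copied from source position 1, '123' from position 2,
# and 'zzz' cannot be copied so it falls back to generation (0).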
92 | 93 | serialize_to_file([train_set, test_set, idx2word, word2idx], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data_c.pkl') 94 | # serialize_to_file([train_set, test_set], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data.pkl') 95 | -------------------------------------------------------------------------------- /experiments/weibo_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file 3 | 4 | word2idx = dict() 5 | wordfreq = dict() 6 | word2idx[''] = 0 7 | word2idx[''] = 1 8 | 9 | # segment = False # True 10 | 11 | # training set 12 | pairs = [] 13 | f = open('./dataset/weibo/co-occur.txt', 'r') 14 | line = f.readline().strip().decode('utf-8') 15 | at = 2 16 | lines = 0 17 | while line: 18 | post = line # f.readline().strip().decode('utf-8') 19 | post= [w.strip() for w in post.split() if len(w.strip()) > 0] 20 | 21 | # if segment: 22 | # summary = [w for w in jb.cut(summary)] 23 | 24 | for w in post: 25 | if w not in wordfreq: 26 | wordfreq[w] = 1 27 | else: 28 | wordfreq[w] += 1 29 | # if w not in word2idx: 30 | # word2idx[w] = at 31 | # at += 1 32 | 33 | text = f.readline().strip().decode('utf-8') 34 | text = [w.strip() for w in text.split() if len(w.strip()) > 0] 35 | # if segment: 36 | # text = [w for w in jb.cut(text)] 37 | for w in text: 38 | if w not in wordfreq: 39 | wordfreq[w] = 1 40 | else: 41 | wordfreq[w] += 1 42 | # if w not in word2idx: 43 | # word2idx[w] = at 44 | # at += 1 45 | 46 | pair = (post, text) 47 | pairs.append(pair) 48 | lines += 1 49 | if lines % 20000 == 0: 50 | print lines 51 | 52 | f.readline() 53 | line = f.readline().strip().decode('utf-8') 54 | 55 | 56 | # sort the vocabulary 57 | wordfreq = sorted(wordfreq.items(), key=lambda a:a[1], reverse=True) 58 | for w in wordfreq: 59 | word2idx[w[0]] = at 60 | at += 1 61 | 62 | idx2word = dict() 63 | for v, k in word2idx.items(): 64 | idx2word[k] = v 65 | Lmax = len(idx2word) 66 | print 'read dataset ok.' 67 | print Lmax 68 | for i in xrange(Lmax): 69 | print idx2word[i].encode('utf-8') 70 | 71 | # use character-based model [on] 72 | # use word-based model [off] 73 | 74 | 75 | def build_data(data): 76 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[]) 77 | for pair in data: 78 | source, target = pair 79 | A = [word2idx[w] for w in source] 80 | B = [word2idx[w] for w in target] 81 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32') 82 | C = [0 if w not in source else source.index(w) + Lmax for w in target] 83 | 84 | instance['text'] += [source] 85 | instance['summary'] += [target] 86 | instance['source'] += [A] 87 | instance['target'] += [B] 88 | # instance['cc_matrix'] += [C] 89 | instance['target_c'] += [C] 90 | 91 | print instance['target'][5000] 92 | print instance['target_c'][5000] 93 | return instance 94 | 95 | 96 | train_set = build_data(pairs[10000:]) 97 | test_set = build_data(pairs[:10000]) 98 | serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/weibo_data-word-cooc.pkl') 99 | -------------------------------------------------------------------------------- /experiments/weibo_vest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the implementation of Copy-NET 3 | We start from the basic Seq2seq framework for a auto-encoder. 
4 | """ 5 | import logging 6 | import time 7 | import numpy as np 8 | import sys 9 | import copy 10 | 11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 12 | from experiments.config import setup_lcsts, setup_weibo 13 | from emolga.utils.generic_utils import * 14 | from emolga.models.covc_encdec import NRM 15 | from emolga.models.encdec import NRM as NRM0 16 | from emolga.dataset.build_dataset import deserialize_from_file 17 | from collections import OrderedDict 18 | from fuel import datasets 19 | from fuel import transformers 20 | from fuel import schemes 21 | 22 | setup = setup_weibo 23 | 24 | 25 | def init_logging(logfile): 26 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s', 27 | datefmt='%m/%d/%Y %H:%M:%S' ) 28 | fh = logging.FileHandler(logfile) 29 | # ch = logging.StreamHandler() 30 | 31 | fh.setFormatter(formatter) 32 | # ch.setFormatter(formatter) 33 | # fh.setLevel(logging.INFO) 34 | # ch.setLevel(logging.INFO) 35 | # logging.getLogger().addHandler(ch) 36 | logging.getLogger().addHandler(fh) 37 | logging.getLogger().setLevel(logging.INFO) 38 | 39 | return logging 40 | 41 | # prepare logging. 42 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time())) 43 | config = setup() # load settings. 44 | for w in config: 45 | print '{0}={1}'.format(w, config[w]) 46 | 47 | logger = init_logging(config['path_log'] + '/experiments.CopyWeibo.id={}.log'.format(tmark)) 48 | n_rng = np.random.RandomState(config['seed']) 49 | np.random.seed(config['seed']) 50 | rng = RandomStreams(n_rng.randint(2 ** 30)) 51 | logger.info('Start!') 52 | 53 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset']) 54 | 55 | if config['voc_size'] == -1: # not use unk 56 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1 57 | config['dec_voc_size'] = config['enc_voc_size'] 58 | else: 59 | config['enc_voc_size'] = config['voc_size'] 60 | config['dec_voc_size'] = config['enc_voc_size'] 61 | 62 | samples = len(train_set['source']) 63 | logger.info('build dataset done. ' + 64 | 'dataset size: {} ||'.format(samples) + 65 | 'vocabulary size = {0}/ batch size = {1}'.format( 66 | config['dec_voc_size'], config['batch_size'])) 67 | 68 | 69 | def build_data(data): 70 | # create fuel dataset. 71 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']), 72 | ('target', data['target']), 73 | ('target_c', data['target_c']), 74 | ])) 75 | dataset.example_iteration_scheme \ 76 | = schemes.ShuffledExampleScheme(dataset.num_examples) 77 | return dataset 78 | 79 | 80 | train_data = build_data(train_set) 81 | train_data_plain = zip(*(train_set['source'], train_set['target'])) 82 | test_data_plain = zip(*(test_set['source'], test_set['target'])) 83 | 84 | train_size = len(train_data_plain) 85 | test_size = len(test_data_plain) 86 | tr_idx = n_rng.permutation(train_size)[:2000].tolist() 87 | ts_idx = n_rng.permutation(test_size )[:2000].tolist() 88 | logger.info('load the data ok.') 89 | notrain = False 90 | 91 | # build the agent 92 | if config['copynet']: 93 | agent = NRM(config, n_rng, rng, mode=config['mode'], 94 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 95 | else: 96 | agent = NRM0(config, n_rng, rng, mode=config['mode'], 97 | use_attention=True, copynet=config['copynet'], identity=config['identity']) 98 | 99 | agent.build_() 100 | if notrain: 101 | agent.compile_('display') 102 | else: 103 | agent.compile_('all') 104 | print 'compile ok.' 
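Before the training loop starts, note how the voc_size cap chosen above interacts with the data: when config['voc_size'] is not -1, every word index at or above the cap is collapsed to 1 (the second reserved entry of word2idx) by the unk_filter defined inside the epoch loop below. A toy NumPy sketch of that masking trick, with a hypothetical cap value:

# Sketch of the out-of-vocabulary masking used by unk_filter below
# (toy cap; the real value comes from config['voc_size']).
import numpy as np

voc_size = 5
data = np.asarray([[2, 7, 3, 0],
                   [9, 1, 4, 0]], dtype='int32')
mask = np.less(data, voc_size).astype('int32')   # 1 where the index is in-vocabulary
filtered = data * mask + (1 - mask)              # out-of-range indices collapse to 1
# filtered == [[2, 1, 3, 0],
#              [1, 1, 4, 0]]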
105 | 106 | echo = 0 107 | epochs = 10 108 | if echo > 0: 109 | tmark = '20160227-164324' # copynet multi-source model 110 | agent.load(config['path_h5'] + '/experiments.CopyWeibo.id={0}.epoch={1}.pkl'.format(tmark, echo)) 111 | 112 | while echo < epochs: 113 | echo += 1 114 | loss = [] 115 | 116 | def output_stream(dataset, batch_size, size=1): 117 | data_stream = dataset.get_example_stream() 118 | data_stream = transformers.Batch(data_stream, 119 | iteration_scheme=schemes.ConstantScheme(batch_size)) 120 | 121 | # add padding and masks to the dataset 122 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target')) 123 | return data_stream 124 | 125 | def prepare_batch(batch, mask, fix_len=None): 126 | data = batch[mask].astype('int32') 127 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1) 128 | 129 | def cut_zeros(data, fix_len=None): 130 | if fix_len is not None: 131 | return data[:, : fix_len] 132 | for k in range(data.shape[1] - 1, 0, -1): 133 | data_col = data[:, k].sum() 134 | if data_col > 0: 135 | return data[:, : k + 2] 136 | return data 137 | data = cut_zeros(data, fix_len) 138 | return data 139 | 140 | def cc_martix(source, target): 141 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32') 142 | for k in xrange(source.shape[0]): 143 | for j in xrange(target.shape[1]): 144 | for i in xrange(source.shape[1]): 145 | if (source[k, i] == target[k, j]) and (source[k, i] > 0): 146 | cc[k][j][i] = 1. 147 | return cc 148 | 149 | def unk_filter(data): 150 | if config['voc_size'] == -1: 151 | return copy.copy(data) 152 | else: 153 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32') 154 | data = copy.copy(data * mask + (1 - mask)) 155 | return data 156 | 157 | # training 158 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True) 159 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo)) 160 | progbar = Progbar(train_size / config['batch_size']) 161 | for it, batch in enumerate(train_batches): 162 | # obtain data 163 | if not notrain: 164 | data_s = prepare_batch(batch, 'source') 165 | data_t = prepare_batch(batch, 'target') 166 | if config['copynet']: 167 | data_c = cc_martix(data_s, data_t) 168 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1]) 169 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)] 170 | else: 171 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))] 172 | 173 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])]) 174 | 175 | if it % 200 == 0: 176 | logger.info('Echo={} Evaluation Sampling.'.format(it)) 177 | logger.info('generating [training set] samples') 178 | for _ in xrange(5): 179 | idx = int(np.floor(n_rng.rand() * train_size)) 180 | train_s, train_t = train_data_plain[idx] 181 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'), 182 | np.asarray(train_t, dtype='int32'), 183 | idx2word, 184 | np.asarray(unk_filter(train_s), dtype='int32'), 185 | encode=config['utf-8']) 186 | print '*' * 50 187 | 188 | logger.info('generating [testing set] samples') 189 | for _ in xrange(5): 190 | idx = int(np.floor(n_rng.rand() * test_size)) 191 | test_s, test_t = test_data_plain[idx] 192 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'), 193 | np.asarray(test_t, dtype='int32'), 194 | idx2word, 195 | np.asarray(unk_filter(test_s), dtype='int32'), 196 | encode=config['utf-8']) 197 | print '*' * 50 198 | 199 | if it % 10000 == 0: 200 | # save the 
weights. 201 | agent.save(config['path_h5'] + '/experiments.CopyWeibo.id={0}.epoch={1}.pkl'.format(tmark, echo)) 202 | 203 | # # test accuracy 204 | # progbar_tr = Progbar(2000) 205 | # 206 | # print '\n' + '__' * 50 207 | # gen, gen_pos = 0, 0 208 | # cpy, cpy_pos = 0, 0 209 | # for it, idx in enumerate(tr_idx): 210 | # train_s, train_t = train_data_plain[idx] 211 | # 212 | # c = agent.analyse_(np.asarray(train_s, dtype='int32'), 213 | # np.asarray(train_t, dtype='int32'), 214 | # idx2word) 215 | # if c[1] == 0: 216 | # # generation mode 217 | # gen += 1 218 | # gen_pos += c[0] 219 | # else: 220 | # # copy mode 221 | # cpy += 1 222 | # cpy_pos += c[0] 223 | # 224 | # progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 225 | # 226 | # logger.info('\nTraining Accuracy:' + 227 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 228 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy))) 229 | # 230 | # progbar_ts = Progbar(2000) 231 | # print '\n' + '__' * 50 232 | # gen, gen_pos = 0, 0 233 | # cpy, cpy_pos = 0, 0 234 | # for it, idx in enumerate(ts_idx): 235 | # test_s, test_t = test_data_plain[idx] 236 | # c = agent.analyse_(np.asarray(test_s, dtype='int32'), 237 | # np.asarray(test_t, dtype='int32'), 238 | # idx2word) 239 | # if c[1] == 0: 240 | # # generation mode 241 | # gen += 1 242 | # gen_pos += c[0] 243 | # else: 244 | # # copy mode 245 | # cpy += 1 246 | # cpy_pos += c[0] 247 | # 248 | # progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)]) 249 | # 250 | # logger.info('\nTesting Accuracy:' + 251 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) + 252 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy))) 253 | --------------------------------------------------------------------------------
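For completeness, the commented-out evaluation block above follows the same recipe as the live one in syntest.py: agent.analyse_ returns a pair whose second element is non-zero when the example was resolved in copy mode and whose first element is the correctness score, and per-mode accuracy is the summed score over the count of examples seen in that mode. A standalone sketch of that tally, assuming the analyse_ results have already been collected as (score, mode_flag) pairs:

# Sketch only, not part of the repository: per-mode accuracy from (score, mode_flag)
# pairs, where mode_flag == 0 marks generation mode and anything else copy mode.
def tally(results):
    gen = gen_pos = 0
    cpy = cpy_pos = 0
    for score, mode_flag in results:
        if mode_flag == 0:
            gen += 1
            gen_pos += score
        else:
            cpy += 1
            cpy_pos += score
    gen_acc = 100.0 * gen_pos / gen if gen else 0.0
    cpy_acc = 100.0 * cpy_pos / cpy if cpy else 0.0
    return gen_acc, cpy_acc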