├── .idea
│   └── vcs.xml
├── LICENSE
├── README.md
├── emolga
│   ├── __init__.py
│   ├── basic
│   │   ├── __init__.py
│   │   ├── activations.py
│   │   ├── initializations.py
│   │   ├── objectives.py
│   │   └── optimizers.py
│   ├── config.py
│   ├── config_variant.py
│   ├── dataset
│   │   └── build_dataset.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── core.py
│   │   ├── embeddings.py
│   │   ├── gridlstm.py
│   │   ├── ntm_minibatch.py
│   │   └── recurrent.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── core.py
│   │   ├── core.pyc
│   │   ├── covc_encdec.py
│   │   ├── encdec.py
│   │   ├── ntm_encdec.py
│   │   ├── pointers.py
│   │   └── variational.py
│   ├── run.py
│   ├── test_lm.py
│   ├── test_nvtm.py
│   ├── test_run.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── generic_utils.py
│   │   ├── io_utils.py
│   │   ├── np_utils.py
│   │   ├── test_utils.py
│   │   └── theano_utils.py
│   └── voc.pkl
└── experiments
    ├── __init__.py
    ├── bst_dataset.py
    ├── bst_vest.py
    ├── config.py
    ├── config.pyc
    ├── copynet.py
    ├── copynet_input.py
    ├── dataset.py
    ├── lcsts_dataset.py
    ├── lcsts_rouge.py
    ├── lcsts_sample.py
    ├── lcsts_test.py
    ├── lcsts_vest.py
    ├── lcsts_vest_new.py
    ├── movie_dataset.py
    ├── syn_vest.py
    ├── syntest.py
    ├── synthetic.py
    ├── weibo_dataset.py
    └── weibo_vest.py
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Jiatao Gu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CopyNet
2 | Incorporating Copying Mechanism in Sequence-to-Sequence Learning
3 |
--------------------------------------------------------------------------------
/emolga/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'yinpengcheng'
2 |
--------------------------------------------------------------------------------
/emolga/basic/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 |
--------------------------------------------------------------------------------
/emolga/basic/activations.py:
--------------------------------------------------------------------------------
1 | import theano.tensor as T
2 |
3 |
4 | def softmax(x):
5 | return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)
6 |
7 |
8 | def vector_softmax(x):
9 | return T.nnet.softmax(x.reshape((1, x.shape[0])))[0]
10 |
11 |
12 | def time_distributed_softmax(x):
13 | import warnings
14 | warnings.warn("time_distributed_softmax is deprecated. Just use softmax!", DeprecationWarning)
15 | return softmax(x)
16 |
17 |
18 | def softplus(x):
19 | return T.nnet.softplus(x)
20 |
21 |
22 | def relu(x):
23 | return T.nnet.relu(x)
24 |
25 |
26 | def tanh(x):
27 | return T.tanh(x)
28 |
29 |
30 | def sigmoid(x):
31 | return T.nnet.sigmoid(x)
32 |
33 |
34 | def hard_sigmoid(x):
35 | return T.nnet.hard_sigmoid(x)
36 |
37 |
38 | def linear(x):
39 | '''
40 | The function returns the variable that is passed in, so all types work
41 | '''
42 | return x
43 |
44 |
45 | def maxout2(x):
46 | shape = x.shape
47 | if x.ndim == 1:
48 | shape1 = T.cast(shape[0] / 2, 'int64')
49 | shape2 = T.cast(2, 'int64')
50 | x = x.reshape([shape1, shape2])
51 | x = x.max(1)
52 | elif x.ndim == 2:
53 | shape1 = T.cast(shape[1] / 2, 'int64')
54 | shape2 = T.cast(2, 'int64')
55 | x = x.reshape([shape[0], shape1, shape2])
56 | x = x.max(2)
57 | elif x.ndim == 3:
58 | shape1 = T.cast(shape[2] / 2, 'int64')
59 | shape2 = T.cast(2, 'int64')
60 | x = x.reshape([shape[0], shape[1], shape1, shape2])
61 | x = x.max(3)
62 | return x
63 |
64 |
65 | from emolga.utils.generic_utils import get_from_module
66 |
67 |
68 | def get(identifier):
69 | return get_from_module(identifier, globals(), 'activation function')
70 |
--------------------------------------------------------------------------------
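A minimal sketch (not part of the repository) of what maxout2 does to a 2-D input: adjacent units along the last axis are paired and the larger of each pair is kept, halving that axis. It assumes Theano and NumPy are installed and emolga is importable.

import numpy as np
import theano
import theano.tensor as T
from emolga.basic.activations import maxout2

# pair the last axis in twos and keep the maximum of each pair
x = T.matrix('x')
f = theano.function([x], maxout2(x))
inp = np.array([[1., 5., 2., 3.]], dtype=theano.config.floatX)
print f(inp)  # -> [[ 5.  3.]], i.e. max(1, 5) and max(2, 3)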
/emolga/basic/initializations.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import theano.tensor as T
3 | import numpy as np
4 |
5 | from emolga.utils.theano_utils import sharedX, shared_zeros, shared_ones
6 |
7 |
8 | def get_fans(shape):
9 | if isinstance(shape, int):
10 | shape = (1, shape)
11 | fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:])
12 | fan_out = shape[1] if len(shape) == 2 else shape[0]
13 | return fan_in, fan_out
14 |
15 |
16 | def uniform(shape, scale=0.1):
17 | return sharedX(np.random.uniform(low=-scale, high=scale, size=shape))
18 |
19 |
20 | def normal(shape, scale=0.05):
21 | return sharedX(np.random.randn(*shape) * scale)
22 |
23 |
24 | def lecun_uniform(shape):
25 | ''' Reference: LeCun 98, Efficient Backprop
26 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
27 | '''
28 | fan_in, fan_out = get_fans(shape)
29 | scale = np.sqrt(3. / fan_in)
30 | return uniform(shape, scale)
31 |
32 |
33 | def glorot_normal(shape):
34 | ''' Reference: Glorot & Bengio, AISTATS 2010
35 | '''
36 | fan_in, fan_out = get_fans(shape)
37 | s = np.sqrt(2. / (fan_in + fan_out))
38 | return normal(shape, s)
39 |
40 |
41 | def glorot_uniform(shape):
42 | fan_in, fan_out = get_fans(shape)
43 | s = np.sqrt(6. / (fan_in + fan_out))
44 | return uniform(shape, s)
45 |
46 |
47 | def he_normal(shape):
48 | ''' Reference: He et al., http://arxiv.org/abs/1502.01852
49 | '''
50 | fan_in, fan_out = get_fans(shape)
51 | s = np.sqrt(2. / fan_in)
52 | return normal(shape, s)
53 |
54 |
55 | def he_uniform(shape):
56 | fan_in, fan_out = get_fans(shape)
57 | s = np.sqrt(6. / fan_in)
58 | return uniform(shape, s)
59 |
60 |
61 | def orthogonal(shape, scale=1.1):
62 | ''' From Lasagne
63 | '''
64 | flat_shape = (shape[0], np.prod(shape[1:]))
65 | a = np.random.normal(0.0, 1.0, flat_shape)
66 | u, _, v = np.linalg.svd(a, full_matrices=False)
67 | # pick the one with the correct shape
68 | q = u if u.shape == flat_shape else v
69 | q = q.reshape(shape)
70 | return sharedX(scale * q[:shape[0], :shape[1]])
71 |
72 |
73 | def identity(shape, scale=1):
74 | if len(shape) != 2 or shape[0] != shape[1]:
75 | raise Exception("Identity matrix initialization can only be used for 2D square matrices")
76 | else:
77 | return sharedX(scale * np.identity(shape[0]))
78 |
79 |
80 | def zero(shape):
81 | return shared_zeros(shape)
82 |
83 |
84 | def one(shape):
85 | return shared_ones(shape)
86 |
87 | from emolga.utils.generic_utils import get_from_module
88 | def get(identifier):
89 | return get_from_module(identifier, globals(), 'initialization')
90 |
--------------------------------------------------------------------------------
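A small check (not part of the repository) of the Glorot scaling above: for a 2-D shape, fan_in and fan_out are simply the two dimensions, and glorot_uniform samples from U(-s, s) with s = sqrt(6 / (fan_in + fan_out)), so a 300x300 matrix is bounded by 0.1.

import numpy as np
from emolga.basic.initializations import get_fans, glorot_uniform

fan_in, fan_out = get_fans((300, 300))   # -> (300, 300)
W = glorot_uniform((300, 300))           # a Theano shared variable
w = W.get_value()
assert abs(w).max() <= np.sqrt(6. / (fan_in + fan_out))  # bound = 0.1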
/emolga/basic/objectives.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import theano
3 | import theano.tensor as T
4 | import numpy as np
5 | from six.moves import range
6 |
7 | if theano.config.floatX == 'float64':
8 | epsilon = 1.0e-9
9 | else:
10 | epsilon = 1.0e-7
11 |
12 |
13 | def mean_squared_error(y_true, y_pred):
14 | return T.sqr(y_pred - y_true).mean(axis=-1)
15 |
16 |
17 | def mean_absolute_error(y_true, y_pred):
18 | return T.abs_(y_pred - y_true).mean(axis=-1)
19 |
20 |
21 | def mean_absolute_percentage_error(y_true, y_pred):
22 | return T.abs_((y_true - y_pred) / T.clip(T.abs_(y_true), epsilon, np.inf)).mean(axis=-1) * 100.
23 |
24 |
25 | def mean_squared_logarithmic_error(y_true, y_pred):
26 | return T.sqr(T.log(T.clip(y_pred, epsilon, np.inf) + 1.) - T.log(T.clip(y_true, epsilon, np.inf) + 1.)).mean(axis=-1)
27 |
28 |
29 | def squared_hinge(y_true, y_pred):
30 | return T.sqr(T.maximum(1. - y_true * y_pred, 0.)).mean(axis=-1)
31 |
32 |
33 | def hinge(y_true, y_pred):
34 | return T.maximum(1. - y_true * y_pred, 0.).mean(axis=-1)
35 |
36 |
37 | def categorical_crossentropy(y_true, y_pred):
38 | '''Expects a binary class matrix instead of a vector of scalar classes
39 | '''
40 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
41 | # scale preds so that the class probas of each sample sum to 1
42 | y_pred /= y_pred.sum(axis=-1, keepdims=True)
43 | cce = T.nnet.categorical_crossentropy(y_pred, y_true)
44 | return cce
45 |
46 |
47 | def binary_crossentropy(y_true, y_pred):
48 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
49 | bce = T.nnet.binary_crossentropy(y_pred, y_true).mean(axis=-1)
50 | return bce
51 |
52 |
53 | def poisson_loss(y_true, y_pred):
54 | return T.mean(y_pred - y_true * T.log(y_pred + epsilon), axis=-1)
55 |
56 | ####################################################
57 | # Variational Auto-encoder
58 |
59 | def gaussian_kl_divergence(mean, ln_var):
60 | """Computes the KL-divergence of Gaussian variables from the standard one.
61 |
62 | Given two variable ``mean`` representing :math:`\\mu` and ``ln_var``
63 | representing :math:`\\log(\\sigma^2)`, this function returns a variable
64 | representing the KL-divergence between the given multi-dimensional Gaussian
65 | :math:`N(\\mu, S)` and the standard Gaussian :math:`N(0, I)`
66 |
67 | .. math::
68 |
69 | D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)),
70 |
71 | where :math:`S` is a diagonal matrix such that :math:`S_{ii} = \\sigma_i^2`
72 | and :math:`I` is an identity matrix.
73 |
74 | Args:
75 | mean (~chainer.Variable): A variable representing mean of given
76 | gaussian distribution, :math:`\\mu`.
77 | ln_var (~chainer.Variable): A variable representing logarithm of
78 | variance of given gaussian distribution, :math:`\\log(\\sigma^2)`.
79 |
80 | Returns:
81 | ~chainer.Variable: A variable representing KL-divergence between
82 | given gaussian distribution and the standard gaussian.
83 |
84 | """
85 | var = T.exp(ln_var)
86 | return 0.5 * T.sum(mean * mean + var - ln_var - 1, 1)
87 |
88 |
89 | # aliases
90 | mse = MSE = mean_squared_error
91 | mae = MAE = mean_absolute_error
92 | mape = MAPE = mean_absolute_percentage_error
93 | msle = MSLE = mean_squared_logarithmic_error
94 | gkl = GKL = gaussian_kl_divergence
95 |
96 | from emolga.utils.generic_utils import get_from_module
97 | def get(identifier):
98 | return get_from_module(identifier, globals(), 'objective')
99 |
--------------------------------------------------------------------------------
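A numeric sanity check (not part of the repository) of gaussian_kl_divergence: with zero mean and zero log-variance the KL term vanishes, and shifting the mean to 1 in each of three dimensions contributes 0.5 per dimension.

import numpy as np
import theano
import theano.tensor as T
from emolga.basic.objectives import gaussian_kl_divergence

mean, ln_var = T.matrix('mean'), T.matrix('ln_var')
kl = theano.function([mean, ln_var], gaussian_kl_divergence(mean, ln_var))

zeros = np.zeros((2, 3), dtype=theano.config.floatX)
ones  = np.ones((2, 3), dtype=theano.config.floatX)
print kl(zeros, zeros)  # -> [ 0.  0.]
print kl(ones, zeros)   # -> [ 1.5  1.5], i.e. 0.5 * (1 + 1 - 0 - 1) per dimension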
/emolga/basic/optimizers.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import theano
3 | import sys
4 |
5 | from theano.sandbox.rng_mrg import MRG_RandomStreams
6 | import theano.tensor as T
7 | import logging
8 |
9 | from emolga.utils.theano_utils import shared_zeros, shared_scalar, floatX
10 | from emolga.utils.generic_utils import get_from_module
11 | from six.moves import zip
12 | from copy import copy, deepcopy
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | def clip_norm(g, c, n):
18 | if c > 0:
19 | g = T.switch(T.ge(n, c), g * c / n, g)
20 | return g
21 |
22 |
23 | def kl_divergence(p, p_hat):
24 | return p_hat - p + p * T.log(p / p_hat)
25 |
26 |
27 | class Optimizer(object):
28 | def __init__(self, **kwargs):
29 | self.__dict__.update(kwargs)
30 | self.updates = []
31 | self.save_parm = []
32 |
33 | def add(self, v):
34 | self.save_parm += [v]
35 |
36 | def get_state(self):
37 | return [u[0].get_value() for u in self.updates]
38 |
39 | def set_state(self, value_list):
40 | assert len(self.updates) == len(value_list)
41 | for u, v in zip(self.updates, value_list):
42 | u[0].set_value(floatX(v))
43 |
44 | def get_updates(self, params, loss):
45 | raise NotImplementedError
46 |
47 | def get_gradients(self, loss, params):
48 | """
49 | Consider the situation that gradient is weighted.
50 | """
51 | if isinstance(loss, list):
52 | grads = T.grad(loss[0], params, consider_constant=loss[1:]) # gradient of loss
53 | else:
54 | grads = T.grad(loss, params)
55 |
56 | if hasattr(self, 'clipnorm') and self.clipnorm > 0:
57 | print 'use gradient clipping!!'
58 | norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
59 | grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
60 |
61 | return grads
62 |
63 | def get_config(self):
64 | return {"name": self.__class__.__name__}
65 |
66 |
67 | class SGD(Optimizer):
68 |
69 | def __init__(self, lr=0.05, momentum=0.9, decay=0.01, nesterov=True, *args, **kwargs):
70 | super(SGD, self).__init__(**kwargs)
71 | self.__dict__.update(locals())
72 | self.iterations = shared_scalar(0)
73 | self.lr = shared_scalar(lr)
74 | self.momentum = shared_scalar(momentum)
75 |
76 | def get_updates(self, params, loss):
77 | grads = self.get_gradients(loss, params)
78 | lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
79 | self.updates = [(self.iterations, self.iterations + 1.)]
80 |
81 | for p, g in zip(params, grads):
82 | m = shared_zeros(p.get_value().shape) # momentum
83 | v = self.momentum * m - lr * g # velocity
84 | self.updates.append((m, v))
85 |
86 | if self.nesterov:
87 | new_p = p + self.momentum * v - lr * g
88 | else:
89 | new_p = p + v
90 |
91 | self.updates.append((p, new_p)) # apply constraints
92 | return self.updates
93 |
94 | def get_config(self):
95 | return {"name": self.__class__.__name__,
96 | "lr": float(self.lr.get_value()),
97 | "momentum": float(self.momentum.get_value()),
98 | "decay": self.decay,
99 | "nesterov": self.nesterov}
100 |
101 |
102 | class RMSprop(Optimizer):
103 | def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
104 | super(RMSprop, self).__init__(**kwargs)
105 | self.__dict__.update(locals())
106 | self.lr = shared_scalar(lr)
107 | self.rho = shared_scalar(rho)
108 | self.iterations = shared_scalar(0)
109 |
110 | def get_updates(self, params, loss):
111 | grads = self.get_gradients(loss, params)
112 | accumulators = [shared_zeros(p.get_value().shape) for p in params]
113 | self.updates = [(self.iterations, self.iterations + 1.)]
114 |
115 | for p, g, a in zip(params, grads, accumulators):
116 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator
117 | self.updates.append((a, new_a))
118 |
119 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
120 | self.updates.append((p, new_p)) # apply constraints
121 | return self.updates
122 |
123 | def get_config(self):
124 | return {"name": self.__class__.__name__,
125 | "lr": float(self.lr.get_value()),
126 | "rho": float(self.rho.get_value()),
127 | "epsilon": self.epsilon}
128 |
129 |
130 | class Adagrad(Optimizer):
131 | def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
132 | super(Adagrad, self).__init__(**kwargs)
133 | self.__dict__.update(locals())
134 | self.lr = shared_scalar(lr)
135 |
136 | def get_updates(self, params, constraints, loss):
137 | grads = self.get_gradients(loss, params)
138 | accumulators = [shared_zeros(p.get_value().shape) for p in params]
139 | self.updates = []
140 |
141 | for p, g, a, c in zip(params, grads, accumulators, constraints):
142 | new_a = a + g ** 2 # update accumulator
143 | self.updates.append((a, new_a))
144 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon)
145 | self.updates.append((p, c(new_p))) # apply constraints
146 | return self.updates
147 |
148 | def get_config(self):
149 | return {"name": self.__class__.__name__,
150 | "lr": float(self.lr.get_value()),
151 | "epsilon": self.epsilon}
152 |
153 |
154 | class Adadelta(Optimizer):
155 | '''
156 | Reference: http://arxiv.org/abs/1212.5701
157 | '''
158 | def __init__(self, lr=0.1, rho=0.95, epsilon=1e-6, *args, **kwargs):
159 | super(Adadelta, self).__init__(**kwargs)
160 | self.__dict__.update(locals())
161 | self.lr = shared_scalar(lr)
162 | self.iterations = shared_scalar(0)
163 |
164 | def get_updates(self, params, loss):
165 | grads = self.get_gradients(loss, params)
166 | accumulators = [shared_zeros(p.get_value().shape) for p in params]
167 | delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
168 | # self.updates = []
169 | self.updates = [(self.iterations, self.iterations + 1.)]
170 |
171 | for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
172 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator
173 | self.updates.append((a, new_a))
174 |
175 | # use the new accumulator and the *old* delta_accumulator
176 | update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a +
177 | self.epsilon)
178 |
179 | new_p = p - self.lr * update
180 | self.updates.append((p, new_p))
181 |
182 | # update delta_accumulator
183 | new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2
184 | self.updates.append((d_a, new_d_a))
185 | return self.updates
186 |
187 | def get_config(self):
188 | return {"name": self.__class__.__name__,
189 | "lr": float(self.lr.get_value()),
190 | "rho": self.rho,
191 | "epsilon": self.epsilon}
192 |
193 |
194 | class Adam(Optimizer): # new Adam is designed for our purpose.
195 | '''
196 | Reference: http://arxiv.org/abs/1412.6980v8
197 |
198 | Default parameters follow those provided in the original paper.
199 | We add Gaussian Noise to improve the performance.
200 | '''
201 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, save=False, rng=None, *args, **kwargs):
202 | super(Adam, self).__init__(**kwargs)
203 | self.__dict__.update(locals())
204 | print locals()
205 |
206 | self.iterations = shared_scalar(0, name='iteration')
207 | self.lr = shared_scalar(lr, name='lr')
208 | self.rng = MRG_RandomStreams(use_cuda=True)
209 | self.noise = []
210 | self.forget = dict()
211 | self.rng = rng
212 |
213 | self.add(self.iterations)
214 | self.add(self.lr)
215 |
216 | def add_noise(self, param):
217 | if param.name not in self.noise:
218 | logger.info('add gradient noise to {}'.format(param))
219 | self.noise += [param.name]
220 |
221 | def add_forget(self, param):
222 | if param.name not in self.forget:
223 | logger.info('add forgetting list to {}'.format(param))
224 | self.forget[param.name] = theano.shared(param.get_value())
225 |
226 | def get_updates(self, params, loss):
227 | grads = self.get_gradients(loss, params)
228 | self.updates = [(self.iterations, self.iterations + 1.)]
229 | self.pu = []
230 |
231 | t = self.iterations + 1
232 | lr_t = self.lr * T.sqrt(1 - self.beta_2**t) / (1 - self.beta_1**t)
233 | for p, g in zip(params, grads):
234 | m = theano.shared(p.get_value() * 0., name=p.name + '_m') # zero init of moment
235 | v = theano.shared(p.get_value() * 0., name=p.name + '_v') # zero init of velocity
236 |
237 | self.add(m)
238 | self.add(v)
239 |
240 | # g_noise = self.rng.normal(g.shape, 0, T.sqrt(0.005 * t ** (-0.55)), dtype='float32')
241 |
242 | # if p.name in self.noise:
243 | # g_deviated = g + g_noise
244 | # else:
245 | # g_deviated = g
246 |
247 | g_deviated = g # + g_noise
248 | m_t = (self.beta_1 * m) + (1 - self.beta_1) * g_deviated
249 | v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g_deviated**2)
250 | u_t = -lr_t * m_t / (T.sqrt(v_t) + self.epsilon)
251 | p_t = p + u_t
252 |
253 | # # memory reformatting!
254 | # if p.name in self.forget:
255 | # p_t = (1 - p_mem) * p_t + p_mem * self.forget[p.name]
256 | # p_s = (1 - p_fgt) * p_t + p_fgt * self.forget[p.name]
257 | # self.updates.append((self.forget[p.name], p_s))
258 |
259 | self.updates.append((m, m_t))
260 | self.updates.append((v, v_t))
261 | self.updates.append((p, p_t)) # apply constraints
262 | self.pu.append((p, p_t - p))
263 |
264 | if self.save:
265 | return self.updates, self.pu
266 | return self.updates
267 |
268 | # aliases
269 | sgd = SGD
270 | rmsprop = RMSprop
271 | adagrad = Adagrad
272 | adadelta = Adadelta
273 | adam = Adam
274 |
275 |
276 | def get(identifier, kwargs=None):
277 | return get_from_module(identifier, globals(), 'optimizer', instantiate=True,
278 | kwargs=kwargs)
279 |
--------------------------------------------------------------------------------
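A minimal sketch (not part of the repository) of how these optimizers are wired into a Theano graph: get_updates returns (shared, new_value) pairs that theano.function applies after every call. The quadratic toy loss and the parameter name below are illustrative only.

import numpy as np
import theano
import theano.tensor as T
from emolga.basic import optimizers

# one shared parameter vector and a toy quadratic loss
w = theano.shared(np.zeros(2, dtype=theano.config.floatX), name='w')
target = T.vector('target')
loss = T.sum((w - target) ** 2)

opt = optimizers.get('rmsprop')                     # resolved via get_from_module
train = theano.function([target], loss,
                        updates=opt.get_updates([w], loss))

goal = np.asarray([1., -2.], dtype=theano.config.floatX)
for _ in range(500):
    train(goal)
print w.get_value()  # has moved from [ 0.  0.] toward [ 1. -2.]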
/emolga/config.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 | import os
3 | import os.path as path
4 |
5 | def setup_ptb2():
6 | # pretraining setting up.
7 | # get the lm_config.
8 |
9 | config = dict()
10 | config['on_unused_input'] = 'ignore'
11 | config['seed'] = 3030029828
12 | config['level'] = 'DEBUG'
13 |
14 | # config['model'] = 'RNNLM'
15 | # config['model'] = 'VAE'
16 | # config['model'] = 'RNNLM' #'Helmholtz'
17 | config['model'] = 'HarX'
18 | config['highway'] = False
19 | config['use_noise'] = False
20 |
21 | config['optimizer'] = 'adam' #'adadelta'
22 | # config['lr'] = 0.1
23 |
24 | # config['optimizer'] = 'sgd'
25 |
26 | # dataset
27 | config['path'] = path.realpath(path.curdir) + '/' # '/home/thoma/Work/Dial-DRL/'
28 | config['vocabulary_set'] = config['path'] + 'dataset/ptbcorpus/voc.pkl'
29 | config['dataset'] = config['path'] + 'dataset/ptbcorpus/data_train.pkl'
30 | config['dataset_valid'] = config['path'] + 'dataset/ptbcorpus/data_valid.pkl'
31 | config['dataset_test'] = config['path'] + 'dataset/ptbcorpus/data_test.pkl'
32 | # output hdf5 file place.
33 | config['path_h5'] = config['path'] + 'H5'
34 | if not os.path.exists(config['path_h5']):
35 | os.mkdir(config['path_h5'])
36 |
37 | # output log place
38 | config['path_log'] = config['path'] + 'Logs'
39 | if not os.path.exists(config['path_log']):
40 | os.mkdir(config['path_log'])
41 |
42 | # size
43 | config['batch_size'] = 20
44 | config['eval_batch_size'] = 20
45 | config['mode'] = 'RNN' # NTM
46 | config['binary'] = False
47 |
48 | # Encoder: dimension
49 | config['enc_embedd_dim'] = 300
50 | config['enc_hidden_dim'] = 300
51 | config['enc_contxt_dim'] = 350
52 | config['encoder'] = 'RNN'
53 | config['pooling'] = False
54 |
55 | # Encoder: Model
56 | config['bidirectional'] = False # True
57 | config['decposterior'] = True
58 | config['enc_use_contxt'] = False
59 |
60 | # Agent: dimension
61 | config['action_dim'] = 50
62 | config['output_dim'] = 300
63 |
64 | # Decoder: dimension
65 | config['dec_embedd_dim'] = 300
66 | config['dec_hidden_dim'] = 300
67 | config['dec_contxt_dim'] = 300
68 |
69 | # Decoder: Model
70 | config['shared_embed'] = False
71 | config['use_input'] = False
72 | config['bias_code'] = False # True
73 | config['dec_use_contxt'] = True
74 | config['deep_out'] = False
75 | config['deep_out_activ'] = 'tanh' # maxout2
76 | config['bigram_predict'] = False
77 | config['context_predict'] = True # False
78 | config['leaky_predict'] = False # True
79 | config['dropout'] = 0.3
80 |
81 | # Decoder: sampling
82 | config['max_len'] = 88 # 15
83 | config['sample_beam'] = 10
84 | config['sample_stoch'] = False
85 | config['sample_argmax'] = False
86 |
87 | # Auto-Encoder
88 | config['nonlinear_A'] = True
89 | config['nonlinear_B'] = False
90 |
91 | # VAE/Helmholtz: Model
92 | config['repeats'] = 10
93 | config['eval_repeats'] = 10
94 | config['eval_N'] = 10
95 |
96 | config['variant_control'] = False
97 | config['factor'] = 10.
98 | config['mult_q'] = 10.
99 |
100 | print 'setup ok.'
101 | return config
102 |
103 |
104 |
--------------------------------------------------------------------------------
/emolga/config_variant.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 | from config import setup_ptb2
3 | setup = setup_ptb2
4 |
5 | """
6 | This file is for small variant fix on original
7 | """
8 |
9 |
10 | def setup_bienc(config=None):
11 | if config is None:
12 | config = setup()
13 | print 'make some modification'
14 |
15 | config['bidirectional'] = True
16 | config['decposterior'] = False
17 | return config
18 |
19 |
20 | def setup_dim(config=None):
21 | if config is None:
22 | config = setup()
23 | print 'make some modification'
24 |
25 | config['enc_embedd_dim'] = 300
26 | config['enc_hidden_dim'] = 300
27 | config['action_dim'] = 100
28 |
29 | config['dec_embedd_dim'] = 300
30 | config['dec_hidden_dim'] = 300
31 | config['dec_contxt_dim'] = 300
32 | return config
33 |
34 |
35 | def setup_rep(config=None):
36 | if config is None:
37 | config = setup()
38 | print 'make some modification'
39 |
40 | config['repeats'] = 5
41 | return config
42 |
43 |
44 | def setup_opt(config=None):
45 | if config is None:
46 | config = setup()
47 | print 'make some modification'
48 |
49 | config['optimizer'] = 'Adam'
50 | return config
--------------------------------------------------------------------------------
/emolga/dataset/build_dataset.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 | import numpy as np
3 | import numpy.random as rng
4 | import cPickle
5 | import pprint
6 | import sys
7 |
8 | from collections import OrderedDict
9 | from fuel import datasets
10 | from fuel import transformers
11 | from fuel import schemes
12 | from fuel import streams
13 |
14 |
15 | def serialize_to_file(obj, path, protocol=cPickle.HIGHEST_PROTOCOL):
16 | f = open(path, 'wb')
17 | cPickle.dump(obj, f, protocol=protocol)
18 | f.close()
19 |
20 |
21 | def show_txt(array, path):
22 | f = open(path, 'w')
23 | for line in array:
24 | f.write(' '.join(line) + '\n')
25 |
26 | f.close()
27 |
28 |
29 | def divide_dataset(dataset, test_size, max_size):
30 | train_set = dict()
31 | test_set = dict()
32 |
33 | for w in dataset:
34 | train_set[w] = dataset[w][test_size:max_size].astype('int32')
35 | test_set[w] = dataset[w][:test_size].astype('int32')
36 |
37 | return train_set, test_set
38 |
39 |
40 | def deserialize_from_file(path):
41 | f = open(path, 'rb')
42 | obj = cPickle.load(f)
43 | f.close()
44 | return obj
45 |
46 |
47 | def build_fuel(data):
48 | # create fuel dataset.
49 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('data', data)]))
50 | dataset.example_iteration_scheme \
51 | = schemes.ShuffledExampleScheme(dataset.num_examples)
52 | return dataset, len(data)
53 |
54 |
55 | def obtain_stream(dataset, batch_size, size=1):
56 | if size == 1:
57 | data_stream = dataset.get_example_stream()
58 | data_stream = transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size))
59 |
60 | # add padding and masks to the dataset
61 | data_stream = transformers.Padding(data_stream, mask_sources=('data',))
62 | return data_stream
63 | else:
64 | data_streams = [dataset.get_example_stream() for _ in xrange(size)]
65 | data_streams = [transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size))
66 | for data_stream in data_streams]
67 | data_streams = [transformers.Padding(data_stream, mask_sources=('data',)) for data_stream in data_streams]
68 | return data_streams
69 |
70 | def build_ptb():
71 | path = './ptbcorpus/'
72 | print path
73 | # make the dataset and vocabulary
74 | X_train = [l.split() for l in open(path + 'ptb.train.txt').readlines()]
75 | X_test = [l.split() for l in open(path + 'ptb.test.txt').readlines()]
76 | X_valid = [l.split() for l in open(path + 'ptb.valid.txt').readlines()]
77 |
78 | X = X_train + X_test + X_valid
79 | idx2word = dict(enumerate(set([w for l in X for w in l]), 1))
80 | idx2word[0] = ''
81 | word2idx = {v: k for k, v in idx2word.items()}
82 | ixwords_train = [[word2idx[w] for w in l] for l in X_train]
83 | ixwords_test = [[word2idx[w] for w in l] for l in X_test]
84 | ixwords_valid = [[word2idx[w] for w in l] for l in X_valid]
85 | ixwords_tv = [[word2idx[w] for w in l] for l in (X_train + X_valid)]
86 |
87 | max_len = max([len(w) for w in X_train])
88 | print max_len
89 | # serialization:
90 | # serialize_to_file(ixwords_train, path + 'data_train.pkl')
91 | # serialize_to_file(ixwords_test, path + 'data_test.pkl')
92 | # serialize_to_file(ixwords_valid, path + 'data_valid.pkl')
93 | # serialize_to_file(ixwords_tv, path + 'data_tv.pkl')
94 | # serialize_to_file([idx2word, word2idx], path + 'voc.pkl')
95 | # show_txt(X, 'data.txt')
96 | print 'save done.'
97 |
98 |
99 | def filter_unk(X, min_freq=5):
100 | voc = dict()
101 | for l in X:
102 | for w in l:
103 | if w not in voc:
104 | voc[w] = 1
105 | else:
106 | voc[w] += 1
107 |
108 | word2idx = dict()
109 | word2idx[''] = 0
110 | id2word = dict()
111 | id2word[0] = ''
112 |
113 | at = 1
114 | for w in voc:
115 | if voc[w] > min_freq:
116 | word2idx[w] = at
117 | id2word[at] = w
118 | at += 1
119 |
120 | word2idx['<unk>'] = at
121 | id2word[at] = '<unk>'
122 | return word2idx, id2word
123 |
124 |
125 | def build_msr():
126 | # path = '/home/thoma/Work/Dial-DRL/dataset/MSRSCC/'
127 | path = '/Users/jiataogu/Work/Dial-DRL/dataset/MSRSCC/'
128 | print path
129 |
130 | X = [l.split() for l in open(path + 'train.txt').readlines()]
131 | word2idx, idx2word = filter_unk(X, min_freq=5)
132 | print 'vocabulary size={0}. {1} samples'.format(len(word2idx), len(X))
133 |
134 | mean_len = np.mean([len(w) for w in X])
135 | print 'mean len = {}'.format(mean_len)
136 |
137 | ixwords = [[word2idx[w]
138 | if w in word2idx
139 | else word2idx['<unk>']
140 | for w in l] for l in X]
141 | print ixwords[0]
142 | # serialization:
143 | serialize_to_file(ixwords, path + 'data_train.pkl')
144 |
145 |
146 | if __name__ == '__main__':
147 | build_msr()
148 | # build_ptb()
149 | # build_dataset()
150 | # game = GuessOrder(size=8)
151 | # q = 'Is there any number smaller de than 6 in the last 3 numbers ?'
152 | # print game.easy_parse(q)
153 |
154 |
--------------------------------------------------------------------------------
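A short sketch (not part of the repository) of how the helpers above fit together: build_fuel wraps a list of index sequences into a shuffled Fuel dataset, and obtain_stream batches it and adds zero padding plus a mask, which is exactly the form the training scripts below consume. The toy corpus is illustrative.

from emolga.dataset.build_dataset import build_fuel, obtain_stream

# toy corpus: three sentences already mapped to word indices
corpus = [[1, 2, 3], [4, 5], [6]]
dataset, size = build_fuel(corpus)
stream = obtain_stream(dataset, batch_size=2)

for batch in stream.get_epoch_iterator(as_dict=True):
    x = batch['data']        # (batch, maxlen) zero-padded index matrix
    m = batch['data_mask']   # (batch, maxlen) 1 for real tokens, 0 for padding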
/emolga/layers/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'yinpengcheng'
2 |
--------------------------------------------------------------------------------
/emolga/layers/attention.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 | from .core import *
3 | """
4 | Attention Model.
5 | <::: Two kinds of attention models ::::>
6 | -- Linear Transformation
7 | -- Inner Product
8 | """
9 |
10 |
11 | class Attention(Layer):
12 | def __init__(self, target_dim, source_dim, hidden_dim,
13 | init='glorot_uniform', name='attention',
14 | coverage=False, max_len=50,
15 | shared=False):
16 |
17 | super(Attention, self).__init__()
18 | self.init = initializations.get(init)
19 | self.softmax = activations.get('softmax')
20 | self.tanh = activations.get('tanh')
21 | self.target_dim = target_dim
22 | self.source_dim = source_dim
23 | self.hidden_dim = hidden_dim
24 | self.max_len = max_len
25 | self.coverage = coverage
26 |
27 | if coverage:
28 | print 'Use Coverage Trick!'
29 |
30 | self.Wa = self.init((self.target_dim, self.hidden_dim))
31 | self.Ua = self.init((self.source_dim, self.hidden_dim))
32 | self.va = self.init((self.hidden_dim, 1))
33 |
34 | self.Wa.name, self.Ua.name, self.va.name = \
35 | '{}_Wa'.format(name), '{}_Ua'.format(name), '{}_va'.format(name)
36 | self.params = [self.Wa, self.Ua, self.va]
37 | if coverage:
38 | self.Ca = self.init((1, self.hidden_dim))
39 | self.Ca.name = '{}_Ca'.format(name)
40 | self.params += [self.Ca]
41 |
42 | def __call__(self, X, S,
43 | Smask=None,
44 | return_log=False,
45 | Cov=None):
46 | assert X.ndim + 1 == S.ndim, 'source should be one more dimension than target.'
47 | # X is the key: (nb_samples, x_dim)
48 | # S is the source (nb_samples, maxlen_s, ctx_dim)
49 | # Cov is the coverage vector (nb_samples, maxlen_s)
50 |
51 | if X.ndim == 1:
52 | X = X[None, :]
53 | S = S[None, :, :]
54 | if Smask is not None:
55 | Smask = Smask[None, :]
56 |
57 | Eng = dot(X[:, None, :], self.Wa) + dot(S, self.Ua) # (nb_samples, source_num, hidden_dims)
58 | Eng = self.tanh(Eng)
59 | # location aware:
60 | if self.coverage:
61 | Eng += dot(Cov[:, :, None], self.Ca) # (nb_samples, source_num, hidden_dims)
62 |
63 | Eng = dot(Eng, self.va)
64 | Eng = Eng[:, :, 0] # ? (nb_samples, source_num)
65 |
66 | if Smask is not None:
67 | # I want to use mask!
68 | EngSum = logSumExp(Eng, axis=1, mask=Smask)
69 | if return_log:
70 | return (Eng - EngSum) * Smask
71 | else:
72 | return T.exp(Eng - EngSum) * Smask
73 | else:
74 | if return_log:
75 | return T.log(self.softmax(Eng))
76 | else:
77 | return self.softmax(Eng)
78 |
79 |
80 | class CosineAttention(Layer):
81 | def __init__(self, target_dim, source_dim,
82 | init='glorot_uniform',
83 | use_pipe=True,
84 | name='attention'):
85 |
86 | super(CosineAttention, self).__init__()
87 | self.init = initializations.get(init)
88 | self.softmax = activations.get('softmax')
89 | self.softplus = activations.get('softplus')
90 | self.tanh = activations.get('tanh')
91 | self.use_pipe = use_pipe
92 |
93 | self.target_dim = target_dim
94 | self.source_dim = source_dim
95 |
96 | # pipe
97 | if self.use_pipe:
98 | self.W_key = Dense(self.target_dim, self.source_dim, name='W_key')
99 | else:
100 | assert target_dim == source_dim
101 | self.W_key = Identity(name='W_key')
102 | self._add(self.W_key)
103 |
104 | # sharpen
105 | # self.W_beta = Dense(self.target_dim, 1, name='W_beta')
106 | # dio-sharpen
107 | # self.W_beta = Dense(self.target_dim, self.source_dim, name='W_beta')
108 | # self._add(self.W_beta)
109 |
110 | # self.gamma = self.init((source_dim, ))
111 | # self.gamma = self.init((target_dim, source_dim))
112 | # self.gamma.name = 'o_gamma'
113 | # self.params += [self.gamma]
114 |
115 | def __call__(self, X, S, Smask=None, return_log=False):
116 | assert X.ndim + 1 == S.ndim, 'source should be one more dimension than target.'
117 |
118 | if X.ndim == 1:
119 | X = X[None, :]
120 | S = S[None, :, :]
121 | if Smask is not None:
122 | Smask = Smask[None, :]
123 |
124 | key = self.W_key(X) # (nb_samples, source_dim)
125 | # beta = self.softplus(self.W_beta(X)) # (nb_samples, source_dim)
126 |
127 | Eng = dot_2d(key, S) #, g=self.gamma)
128 | # Eng = cosine_sim2d(key, S) # (nb_samples, source_num)
129 | # Eng = T.repeat(beta, Eng.shape[1], axis=1) * Eng
130 |
131 | if Smask is not None:
132 | # I want to use mask!
133 | EngSum = logSumExp(Eng, axis=1, mask=Smask)
134 | if return_log:
135 | return (Eng - EngSum) * Smask
136 | else:
137 | return T.exp(Eng - EngSum) * Smask
138 | else:
139 | if return_log:
140 | return T.log(self.softmax(Eng))
141 | else:
142 | return self.softmax(Eng)
143 |
144 |
--------------------------------------------------------------------------------
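A symbolic shape sketch (not part of the repository) of the Attention layer above, assuming the dot/logSumExp helpers imported from emolga.utils.theano_utils are available: X holds one decoder state per sample, S the encoder annotations, and the call yields one normalized weight per source position. The dimensions are illustrative.

import theano.tensor as T
from emolga.layers.attention import Attention

attn = Attention(target_dim=300, source_dim=600, hidden_dim=200)

X     = T.matrix('X')       # (nb_samples, 300)          decoder state
S     = T.tensor3('S')      # (nb_samples, maxlen, 600)  encoder annotations
Smask = T.matrix('Smask')   # (nb_samples, maxlen)       1 for real tokens, 0 for padding

probs = attn(X, S, Smask=Smask)  # (nb_samples, maxlen); each row sums to 1 over unmasked positions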
/emolga/layers/core.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from emolga.utils.theano_utils import *
4 | import emolga.basic.initializations as initializations
5 | import emolga.basic.activations as activations
6 |
7 |
8 | class Layer(object):
9 | def __init__(self):
10 | self.params = []
11 | self.layers = []
12 | self.monitor = {}
13 | self.watchlist = []
14 |
15 | def init_updates(self):
16 | self.updates = []
17 |
18 | def _monitoring(self):
19 | # add monitoring variables
20 | for l in self.layers:
21 | for v in l.monitor:
22 | name = v + '@' + l.name
23 | print name
24 | self.monitor[name] = l.monitor[v]
25 |
26 | def __call__(self, X, *args, **kwargs):
27 | return X
28 |
29 | def _add(self, layer):
30 | if layer:
31 | self.layers.append(layer)
32 | self.params += layer.params
33 |
34 | def supports_masked_input(self):
35 | ''' Whether or not this layer respects the output mask of its previous layer in its calculations. If you try
36 | to attach a layer that does *not* support masked_input to a layer that gives a non-None output_mask(), that is
37 | an error.'''
38 | return False
39 |
40 | def get_output_mask(self, train=None):
41 | '''
42 | For some models (such as RNNs) you want a way of being able to mark some output data-points as
43 | "masked", so they are not used in future calculations. In such a model, get_output_mask() should return a mask
44 | of one less dimension than get_output() (so if get_output is (nb_samples, nb_timesteps, nb_dimensions), then the mask
45 | is (nb_samples, nb_timesteps), with a one for every unmasked datapoint, and a zero for every masked one.
46 |
47 | If there is *no* masking then it shall return None. For instance if you attach an Activation layer (they support masking)
48 | to a layer with an output_mask, then that Activation shall also have an output_mask. If you attach it to a layer with no
49 | such mask, then the Activation's get_output_mask shall return None.
50 |
51 | Some emolga layers have an output_mask even if their input is unmasked, notably Embedding which can turn the entry "0" into
52 | a mask.
53 | '''
54 | return None
55 |
56 | def set_weights(self, weights):
57 | for p, w in zip(self.params, weights):
58 | if p.eval().shape != w.shape:
59 | raise Exception("Layer shape %s not compatible with weight shape %s." % (p.eval().shape, w.shape))
60 | p.set_value(floatX(w))
61 |
62 | def get_weights(self):
63 | weights = []
64 | for p in self.params:
65 | weights.append(p.get_value())
66 | return weights
67 |
68 | def get_params(self):
69 | return self.params
70 |
71 | def set_name(self, name):
72 | for i in range(len(self.params)):
73 | if self.params[i].name is None:
74 | self.params[i].name = '%s_p%d' % (name, i)
75 | else:
76 | self.params[i].name = name + '_' + self.params[i].name
77 | self.name = name
78 |
79 |
80 | class MaskedLayer(Layer):
81 | '''
82 | If your layer trivially supports masking (by simply copying the input mask to the output), then subclass MaskedLayer
83 | instead of Layer, and make sure that you incorporate the input mask into your calculation of get_output()
84 | '''
85 | def supports_masked_input(self):
86 | return True
87 |
88 |
89 | class Identity(Layer):
90 | def __init__(self, name='Identity'):
91 | super(Identity, self).__init__()
92 | if name is not None:
93 | self.set_name(name)
94 |
95 | def __call__(self, X):
96 | return X
97 |
98 |
99 | class Dense(Layer):
100 | def __init__(self, input_dim, output_dim, init='glorot_uniform', activation='tanh', name='Dense',
101 | learn_bias=True, negative_bias=False):
102 |
103 | super(Dense, self).__init__()
104 | self.init = initializations.get(init)
105 | self.activation = activations.get(activation)
106 | self.input_dim = input_dim
107 | self.output_dim = output_dim
108 | self.linear = (activation == 'linear')
109 |
110 | # self.input = T.matrix()
111 | self.W = self.init((self.input_dim, self.output_dim))
112 | if not negative_bias:
113 | self.b = shared_zeros((self.output_dim))
114 | else:
115 | self.b = shared_ones((self.output_dim))
116 |
117 | self.learn_bias = learn_bias
118 | if self.learn_bias:
119 | self.params = [self.W, self.b]
120 | else:
121 | self.params = [self.W]
122 |
123 | if name is not None:
124 | self.set_name(name)
125 |
126 | def set_name(self, name):
127 | self.W.name = '%s_W' % name
128 | self.b.name = '%s_b' % name
129 |
130 | def __call__(self, X):
131 | output = self.activation(T.dot(X, self.W) + 4. * self.b)
132 | return output
133 |
134 | def reverse(self, Y):
135 | assert self.linear
136 |
137 | output = T.dot((Y - self.b), self.W.T)
138 | return output
139 |
140 |
141 | class Dense2(Layer):
142 | def __init__(self, input_dim1, input_dim2, output_dim, init='glorot_uniform', activation='tanh', name='Dense', learn_bias=True):
143 |
144 | super(Dense2, self).__init__()
145 | self.init = initializations.get(init)
146 | self.activation = activations.get(activation)
147 | self.input_dim1 = input_dim1
148 | self.input_dim2 = input_dim2
149 | self.output_dim = output_dim
150 | self.linear = (activation == 'linear')
151 |
152 | # self.input = T.matrix()
153 |
154 | self.W1 = self.init((self.input_dim1, self.output_dim))
155 | self.W2 = self.init((self.input_dim2, self.output_dim))
156 | self.b = shared_zeros((self.output_dim))
157 |
158 | self.learn_bias = learn_bias
159 | if self.learn_bias:
160 | self.params = [self.W1, self.W2, self.b]
161 | else:
162 | self.params = [self.W1, self.W2]
163 |
164 | if name is not None:
165 | self.set_name(name)
166 |
167 | def set_name(self, name):
168 | self.W1.name = '%s_W1' % name
169 | self.W2.name = '%s_W2' % name
170 | self.b.name = '%s_b' % name
171 |
172 | def __call__(self, X1, X2):
173 | output = self.activation(T.dot(X1, self.W1) + T.dot(X2, self.W2) + self.b)
174 | return output
175 |
176 |
177 | class Constant(Layer):
178 | def __init__(self, input_dim, output_dim, init=None, activation='tanh', name='Bias'):
179 |
180 | super(Constant, self).__init__()
181 | assert input_dim == output_dim, 'Bias Layer needs to have the same input/output nodes.'
182 |
183 | self.init = initializations.get(init)
184 | self.activation = activations.get(activation)
185 | self.input_dim = input_dim
186 | self.output_dim = output_dim
187 |
188 | self.b = shared_zeros(self.output_dim)
189 | self.params = [self.b]
190 |
191 | if name is not None:
192 | self.set_name(name)
193 |
194 | def set_name(self, name):
195 | self.b.name = '%s_b' % name
196 |
197 | def __call__(self, X=None):
198 | output = self.activation(self.b)
199 | if X is not None:
200 | L = X.shape[0]
201 | output = T.extra_ops.repeat(output[None, :], L, axis=0)
202 | return output
203 |
204 |
205 | class MemoryLinear(Layer):
206 | def __init__(self, input_dim, input_wdth, init='glorot_uniform',
207 | activation='tanh', name='Bias', has_input=True):
208 | super(MemoryLinear, self).__init__()
209 |
210 | self.init = initializations.get(init)
211 | self.activation = activations.get(activation)
212 | self.input_dim = input_dim
213 | self.input_wdth = input_wdth
214 |
215 | self.b = self.init((self.input_dim, self.input_wdth))
216 | self.params = [self.b]
217 |
218 | if has_input:
219 | self.P = self.init((self.input_dim, self.input_wdth))
220 | self.params += [self.P]
221 |
222 | if name is not None:
223 | self.set_name(name)
224 |
225 | def __call__(self, X=None):
226 | out = self.b[None, :, :]
227 | if X is not None:
228 | out += self.P[None, :, :] * X
229 | return self.activation(out)
230 |
231 |
232 | class Dropout(MaskedLayer):
233 | """
234 | Hinton's dropout.
235 | """
236 | def __init__(self, rng=None, p=1., name=None):
237 | super(Dropout, self).__init__()
238 | self.p = p
239 | self.rng = rng
240 |
241 | def __call__(self, X, train=True):
242 | if self.p > 0.:
243 | retain_prob = 1. - self.p
244 | if train:
245 | X *= self.rng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
246 | else:
247 | X *= retain_prob
248 | return X
249 |
250 |
251 | class Activation(MaskedLayer):
252 | """
253 | Apply an activation function to an output.
254 | """
255 | def __init__(self, activation):
256 | super(Activation, self).__init__()
257 | self.activation = activations.get(activation)
258 |
259 | def __call__(self, X):
260 | return self.activation(X)
261 |
262 |
--------------------------------------------------------------------------------
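A short symbolic sketch (not part of the repository) of the Dense and Dropout layers above: Dropout's p is the drop probability, so units are zeroed with probability p at training time and the surviving scale 1 - p is applied at test time. Layer names and sizes are illustrative.

import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams
from emolga.layers.core import Dense, Dropout

rng = MRG_RandomStreams(123)
proj = Dense(input_dim=4, output_dim=3, activation='tanh', name='proj')
drop = Dropout(rng=rng, p=0.3)

X = T.matrix('X')                       # (nb_samples, 4)
H_train = drop(proj(X), train=True)     # units zeroed with probability 0.3
H_test  = drop(proj(X), train=False)    # deterministic, scaled by 0.7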
/emolga/layers/embeddings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .core import Layer
4 | from emolga.utils.theano_utils import *
5 | import emolga.basic.initializations as initializations
6 |
7 |
8 | class Embedding(Layer):
9 | '''
10 | Turn positive integers (indexes) into dense vectors of fixed size.
11 | eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
12 |
13 | @input_dim: size of vocabulary (highest input integer + 1)
14 | @output_dim: size of dense representation
15 | '''
16 |
17 | def __init__(self, input_dim, output_dim, init='uniform', name=None):
18 |
19 | super(Embedding, self).__init__()
20 | self.init = initializations.get(init)
21 | self.input_dim = input_dim
22 | self.output_dim = output_dim
23 |
24 | self.W = self.init((self.input_dim, self.output_dim))
25 |
26 | self.params = [self.W]
27 |
28 | if name is not None:
29 | self.set_name(name)
30 |
31 | def get_output_mask(self, X):
32 | return T.ones_like(X) * (1 - T.eq(X, 0))
33 |
34 | def __call__(self, X, mask_zero=False, context=None):
35 | if context is None:
36 | out = self.W[X]
37 | else:
38 | assert context.ndim == 3
39 | flag = False
40 | if X.ndim == 1:
41 | flag = True
42 | X = X[:, None]
43 |
44 | b_size = context.shape[0]
45 |
46 | EMB = T.repeat(self.W[None, :, :], b_size, axis=0)
47 | EMB = T.concatenate([EMB, context], axis=1)
48 |
49 | m_size = EMB.shape[1]
50 | e_size = EMB.shape[2]
51 | maxlen = X.shape[1]
52 |
53 | EMB = EMB.reshape((b_size * m_size, e_size))
54 | Z = (T.arange(b_size)[:, None] * m_size + X).reshape((b_size * maxlen,))
55 | out = EMB[Z] # (b_size * maxlen, e_size)
56 |
57 | if not flag:
58 | out = out.reshape((b_size, maxlen, e_size))
59 | else:
60 | out = out.reshape((b_size, e_size))
61 |
62 | if mask_zero:
63 | return out, T.cast(self.get_output_mask(X), dtype='float32')
64 | else:
65 | return out
66 |
67 |
68 | class Zero(Layer):
69 | def __call__(self, X):
70 | out = T.zeros(X.shape)
71 | return out
72 |
73 |
74 | class Bias(Layer):
75 | def __call__(self, X):
76 | tmp = X.flatten()
77 | tmp = tmp.dimshuffle(0, 'x')
78 | return tmp
79 |
--------------------------------------------------------------------------------
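A symbolic sketch (not part of the repository) of the context branch of Embedding above: each sample's context rows are appended after the shared vocabulary table, so an index below input_dim looks up an ordinary word vector while index input_dim + j selects that sample's j-th context row, which lets tokens copied from the source be embedded even when they fall outside the vocabulary. Sizes are illustrative.

import theano.tensor as T
from emolga.layers.embeddings import Embedding

emb = Embedding(input_dim=5000, output_dim=300, name='dec_embed')

X   = T.imatrix('X')     # (nb_samples, maxlen) indices in [0, 5000 + maxlen_src)
ctx = T.tensor3('ctx')   # (nb_samples, maxlen_src, 300) per-sample source vectors

out, mask = emb(X, mask_zero=True, context=ctx)
# out:  (nb_samples, maxlen, 300); an index of 5000 + j returns ctx[:, j, :]
# mask: 1 wherever X is non-zero, so index 0 still acts as padding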
/emolga/models/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 |
--------------------------------------------------------------------------------
/emolga/models/core.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 | import theano
3 | import logging
4 | import deepdish as dd
5 |
6 | from emolga.dataset.build_dataset import serialize_to_file, deserialize_from_file
7 | from emolga.utils.theano_utils import floatX
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | class Model(object):
13 | def __init__(self):
14 | self.layers = []
15 | self.params = []
16 | self.monitor = {}
17 | self.watchlist = []
18 |
19 | def _add(self, layer):
20 | if layer:
21 | self.layers.append(layer)
22 | self.params += layer.params
23 |
24 | def _monitoring(self):
25 | # add monitoring variables
26 | for l in self.layers:
27 | for v in l.monitor:
28 | name = v + '@' + l.name
29 | print name
30 | self.monitor[name] = l.monitor[v]
31 |
32 | def compile_monitoring(self, inputs, updates=None):
33 | logger.info('compile monitoring')
34 | for i, v in enumerate(self.monitor):
35 | self.watchlist.append(v)
36 | logger.info('monitoring [{0}]: {1}'.format(i, v))
37 |
38 | self.watch = theano.function(inputs,
39 | [self.monitor[v] for v in self.watchlist],
40 | updates=updates
41 | )
42 | logger.info('done.')
43 |
44 | def set_weights(self, weights):
45 | if hasattr(self, 'save_parm'):
46 | params = self.params + self.save_parm
47 | else:
48 | params = self.params
49 |
50 | for p, w in zip(params, weights):
51 | print p.name
52 | if p.eval().shape != w.shape:
53 | raise Exception("Layer shape %s not compatible with weight shape %s." % (p.eval().shape, w.shape))
54 | p.set_value(floatX(w))
55 |
56 | def get_weights(self):
57 | weights = []
58 | for p in self.params:
59 | weights.append(p.get_value())
60 |
61 | if hasattr(self, 'save_parm'):
62 | for v in self.save_parm:
63 | weights.append(v.get_value())
64 |
65 | return weights
66 |
67 | def set_name(self, name):
68 | for i in range(len(self.params)):
69 | if self.params[i].name is None:
70 | self.params[i].name = '%s_p%d' % (name, i)
71 | else:
72 | self.params[i].name = name + '@' + self.params[i].name
73 | self.name = name
74 |
75 | def save(self, filename):
76 | if hasattr(self, 'save_parm'):
77 | params = self.params + self.save_parm
78 | else:
79 | params = self.params
80 | ps = 'save: <\n'
81 | for p in params:
82 | ps += '{0}: {1}\n'.format(p.name, p.eval().shape)
83 | ps += '> to ... {}'.format(filename)
84 | logger.info(ps)
85 |
86 | # hdf5 module seems to work abnormally !!
87 | # dd.io.save(filename, self.get_weights())
88 | serialize_to_file(self.get_weights(), filename)
89 |
90 | def load(self, filename):
91 | logger.info('load the weights.')
92 |
93 | # hdf5 module seems to work abnormally !!
94 | # weights = dd.io.load(filename)
95 | weights = deserialize_from_file(filename)
96 | print len(weights)
97 | self.set_weights(weights)
98 |
--------------------------------------------------------------------------------
/emolga/models/core.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MultiPath/CopyNet/dba24d58d505abaa169e05ee762987f37f41f566/emolga/models/core.pyc
--------------------------------------------------------------------------------
/emolga/test_lm.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 |
3 | import logging
4 |
5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
6 |
7 | from emolga.models.encdec import RNNLM, AutoEncoder
8 | from emolga.models.variational import Helmholtz, VAE, HarX, THarX, NVTM
9 | # from models.ntm_encdec import RNNLM, AutoEncoder, Helmholtz, BinaryHelmholtz
10 | from emolga.utils.generic_utils import *
11 | from emolga.dataset.build_dataset import deserialize_from_file, build_fuel, obtain_stream
12 | from emolga.config import setup_ptb2
13 | from emolga.config_variant import *
14 |
15 | setup = setup_bienc
16 |
17 |
18 | def init_logging(logfile):
19 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
20 | datefmt='%m/%d/%Y %H:%M:%S' )
21 | fh = logging.FileHandler(logfile)
22 | # ch = logging.StreamHandler()
23 |
24 | fh.setFormatter(formatter)
25 | # ch.setFormatter(formatter)
26 | # fh.setLevel(logging.INFO)
27 | # ch.setLevel(logging.INFO)
28 | # logging.getLogger().addHandler(ch)
29 | logging.getLogger().addHandler(fh)
30 | logging.getLogger().setLevel(logging.INFO)
31 |
32 | return logging
33 |
34 | # prepare logging.
35 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
36 | config = setup() # load settings.
37 | for w in config:
38 | print '{0}={1}'.format(w, config[w])
39 |
40 | logger = init_logging(config['path_log'] + '/emolga.RHM.id={}.log'.format(tmark))
41 | n_rng = np.random.RandomState(config['seed'])
42 | np.random.seed(config['seed'])
43 | rng = RandomStreams(n_rng.randint(2 ** 30))
44 |
45 | logger.info('Start!')
46 |
47 | # load the dataset and build a fuel-dataset.
48 | idx2word, word2idx = deserialize_from_file(config['vocabulary_set'])
49 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
50 | config['dec_voc_size'] = config['enc_voc_size']
51 | logger.info('build dataset done. vocabulary size = {0}/ batch size = {1}'.format(
52 | config['dec_voc_size'], config['batch_size']))
53 |
54 | # training & valid & testing set.
55 | train_set, train_size = build_fuel(deserialize_from_file(config['dataset']))
56 | valid_set, valid_size = build_fuel(deserialize_from_file(config['dataset_test'])) # use test set for a try
57 |
58 | # weight save.
59 | savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark)
60 |
61 | # build the agent
62 | if config['model'] == 'RNNLM':
63 | agent = RNNLM(config, n_rng, rng, mode=config['mode'])
64 | elif config['model'] == 'HarX':
65 | agent = THarX(config, n_rng, rng, mode=config['mode'])
66 | elif config['model'] == 'Helmholtz':
67 | agent = Helmholtz(config, n_rng, rng, mode=config['mode'])
68 | else:
69 | raise NotImplementedError
70 |
71 | agent.build_()
72 | agent.compile_('train')
73 | print 'compile ok'
74 |
75 | # learning to speak language.
76 | count = 1000
77 | echo = 0
78 | epochs = 50
79 | while echo < epochs:
80 | echo += 1
81 | loss = []
82 | correct = 0
83 | scans = 0
84 |
85 | # visualize the embedding weights.
86 | # if echo > 1:
87 | # plt.figure(3)
88 | # visualize_(plt.subplot(111), agent.decoder.Embed.get_params()[0].get_value(), name='encoder embedding',
89 | # text=idx2word)
90 | # plt.show()
91 |
92 | # if not config['use_noise']:
93 |
94 | # training
95 | train_batches = obtain_stream(train_set, config['batch_size']).get_epoch_iterator(as_dict=True)
96 | valid_batches = obtain_stream(valid_set, config['eval_batch_size']).get_epoch_iterator(as_dict=True)
97 |
98 | def prepare_batch(batch):
99 | data = batch['data'].astype('int32')
100 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
101 |
102 | def cut_zeros(data):
103 | for k in range(data.shape[1] - 1, 0, -1):
104 | data_col = data[:, k].sum()
105 | if data_col > 0:
106 | return data[:, : k + 2]
107 | return data
108 | data = cut_zeros(data)
109 | return data
110 |
111 | # training
112 | logger.info('Epoch = {} -> Training Set Learning...'.format(echo))
113 | progbar = Progbar(train_size / config['batch_size'])
114 | for it, batch in enumerate(train_batches):
115 | # get data
116 | data = prepare_batch(batch)
117 | if config['model'] == 'RNNLM' or config['model'] == 'AutoEncoder':
118 | loss.append(agent.train_(data, config['repeats']))
119 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
120 | elif config['model'] in ('Helmholtz', 'HarX'):
121 | loss.append(agent.train_(data, config['repeats']))
122 | weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()])
123 | progbar.update(it, [('lossPa', loss[-1][0]), ('lossPxa', loss[-1][1]), ('lossQ', loss[-1][2]),
124 | ('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('L1', weightss)])
125 |
126 | """
127 | watch = agent.watch(data)
128 | print '.'
129 | pprint(watch[0][0])
130 | pprint(watch[2][0])
131 | # pprint(watch[2][0])
132 | sys.exit(111)
133 | """
134 |
135 | # if it % 100 == 50:
136 | # sys.exit(-1)
137 | # # print '.'
138 | # # print 'encoded = {}'.format(encoded[11])
139 | # # print 'mean = {}'.format(mean[11])
140 | # # print 'std = {}'.format(std[11])
141 | #
142 | # # watch = agent.watch(data)
143 | # # print '.'
144 | # # print 'train memory {}'.format(watch[0][0])
145 | #
146 | # for kk in xrange(5):
147 | # # sample a sentence.
148 | # # action = agent.action_sampler()
149 | # # context = agent.context_trans(action)
150 | # if config['model'] is 'AutoEncoder':
151 | # source = data[kk][None, :]
152 | # truth = ' '.join(print_sample(idx2word, source[0].tolist())[:-1])
153 | # print '\ntruth: {}'.format(truth)
154 | # context = agent.memorize(source)
155 | # sample, score = agent.generate_(context, max_len=data.shape[1])
156 | # else:
157 | # sample, score = agent.generate_(max_len=data.shape[1])
158 | #
159 | # if sample[-1] is not 0:
160 | # sample += [0] # fix the end.
161 | # question = ' '.join(print_sample(idx2word, sample)[:-1])
162 | # print '\nsample: {}'.format(question)
163 | # print 'PPL: {}'.format(score)
164 | # scans += 1.0
165 |
166 | print ' .'
167 | logger.info('Epoch = {0} finished.'.format(echo))
168 |
169 | # validation
170 | logger.info('Epoch = {} -> Validation Set Evaluation...'.format(echo))
171 | progbar = Progbar(valid_size / config['batch_size'])
172 | for it, batch in enumerate(valid_batches):
173 | # get data
174 | data = prepare_batch(batch)
175 | if config['model'] in ('Helmholtz', 'HarX'):
176 | loss.append(agent.evaluate_(data))
177 | progbar.update(it, [('NLL', loss[-1][0]), ('perplexity', np.log(loss[-1][1]))])
178 | else:
179 | raise NotImplementedError
180 |
181 | print ' .'
182 | # save the weights.
183 | agent.save(config['path_h5'] + '/emolga.RHM.id={0}.epoch={1}.pkl'.format(tmark, echo))
184 |
185 | # logger.info('Learning percentage: {}'.format(correct / scans))
186 |
187 |
188 | # inference test
189 | # batches = data_stream.get_epoch_iterator(as_dict=True)
190 | # for it, batch in enumerate(batches):
191 | # data = batch['data'].astype('int32')
192 | # data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
193 | # mean, std = agent.inference_(data)
194 | # print mean
195 | # break
196 | # print count
--------------------------------------------------------------------------------
/emolga/test_nvtm.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 |
3 | import logging
4 |
5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
6 |
7 | from emolga.models.encdec import RNNLM, AutoEncoder
8 | from emolga.models.variational import Helmholtz, VAE, HarX, THarX, NVTM
9 | # from models.ntm_encdec import RNNLM, AutoEncoder, Helmholtz, BinaryHelmholtz
10 | from emolga.utils.generic_utils import *
11 | from emolga.dataset.build_dataset import deserialize_from_file, build_fuel, obtain_stream
12 | from emolga.config import setup_ptbz, setup_ptb2
13 | from emolga.config_variant import *
14 |
15 | setup = setup_bienc
16 |
17 |
18 | def init_logging(logfile):
19 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
20 | datefmt='%m/%d/%Y %H:%M:%S' )
21 | fh = logging.FileHandler(logfile)
22 | # ch = logging.StreamHandler()
23 |
24 | fh.setFormatter(formatter)
25 | # ch.setFormatter(formatter)
26 | # fh.setLevel(logging.INFO)
27 | # ch.setLevel(logging.INFO)
28 | # logging.getLogger().addHandler(ch)
29 | logging.getLogger().addHandler(fh)
30 | logging.getLogger().setLevel(logging.INFO)
31 |
32 | return logging
33 |
34 | # prepare logging.
35 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
36 | config = setup() # load settings.
37 | for w in config:
38 | print '{0}={1}'.format(w, config[w])
39 |
40 | logger = init_logging(config['path_log'] + '/emolga.RHM.id={}.log'.format(tmark))
41 | n_rng = np.random.RandomState(config['seed'])
42 | np.random.seed(config['seed'])
43 | rng = RandomStreams(n_rng.randint(2 ** 30))
44 |
45 | logger.info('Start!')
46 |
47 | # load the dataset and build a fuel-dataset.
48 | idx2word, word2idx = deserialize_from_file(config['vocabulary_set'])
49 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
50 | config['dec_voc_size'] = config['enc_voc_size']
51 | logger.info('build dataset done. vocabulary size = {0}/ batch size = {1}'.format(
52 | config['dec_voc_size'], config['batch_size']))
53 |
54 | # training & valid & testing set.
55 | train_set, train_size = build_fuel(deserialize_from_file(config['dataset']))
56 | valid_set, valid_size = build_fuel(deserialize_from_file(config['dataset_test'])) # use test set for a try
57 |
58 | # weight save path.
59 | savefile = config['path_h5'] + '/emolga.RHM.id={}.h5'.format(tmark)
60 |
61 | # build the agent
62 | if config['model'] == 'RNNLM':
63 | agent = RNNLM(config, n_rng, rng, mode=config['mode'])
64 | elif config['model'] == 'HarX':
65 | agent = NVTM(config, n_rng, rng, mode=config['mode'])
66 | elif config['model'] == 'Helmholtz':
67 | agent = Helmholtz(config, n_rng, rng, mode=config['mode'])
68 | else:
69 | raise NotImplementedError
70 |
71 | agent.build_()
72 | agent.compile_('train')
73 | print 'compile ok'
74 |
75 | # learning to speak language.
76 | count = 1000
77 | echo = 0
78 | epochs = 50
79 | while echo < epochs:
80 | echo += 1
81 | loss = []
82 | correct = 0
83 | scans = 0
84 |
85 | # visualize the embedding weights.
86 | # if echo > 1:
87 | # plt.figure(3)
88 | # visualize_(plt.subplot(111), agent.decoder.Embed.get_params()[0].get_value(), name='encoder embedding',
89 | # text=idx2word)
90 | # plt.show()
91 |
92 | # if not config['use_noise']:
93 |
94 | # training
95 | train_batches = obtain_stream(train_set, config['batch_size']).get_epoch_iterator(as_dict=True)
96 | valid_batches = obtain_stream(valid_set, config['eval_batch_size']).get_epoch_iterator(as_dict=True)
97 |
98 | def prepare_batch(batch):
99 | data = batch['data'].astype('int32')
100 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
101 |
102 | def cut_zeros(data):
103 | for k in range(data.shape[1] - 1, 0, -1):
104 | data_col = data[:, k].sum()
105 | if data_col > 0:
106 | return data[:, : k + 2]
107 | return data
108 | data = cut_zeros(data)
109 | return data
110 |
111 | # training
112 | logger.info('Epoch = {} -> Training Set Learning...'.format(echo))
113 | progbar = Progbar(train_size / config['batch_size'])
114 | for it, batch in enumerate(train_batches):
115 | # get data
116 | data = prepare_batch(batch)
117 | if config['model'] == 'RNNLM' or config['model'] == 'AutoEncoder':
118 | loss.append(agent.train_(data, config['repeats']))
119 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
120 | elif config['model'] == 'Helmholtz' or config['model'] == 'HarX':
121 | loss.append(agent.train_(data, 1))
122 | weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()])
123 | progbar.update(it, [('lossL', loss[-1][0]), ('lossP', loss[-1][1]), ('lossQ', loss[-1][2]),
124 | ('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('Baseline', loss[-1][5])])
125 |
126 | """
127 | watch = agent.watch(data)
128 | print '.'
129 | pprint(watch[0][0])
130 | pprint(watch[2][0])
131 | # pprint(watch[2][0])
132 | sys.exit(111)
133 | """
134 |
135 | # if it % 100 == 50:
136 | # sys.exit(-1)
137 | # # print '.'
138 | # # print 'encoded = {}'.format(encoded[11])
139 | # # print 'mean = {}'.format(mean[11])
140 | # # print 'std = {}'.format(std[11])
141 | #
142 | # # watch = agent.watch(data)
143 | # # print '.'
144 | # # print 'train memory {}'.format(watch[0][0])
145 | #
146 | # for kk in xrange(5):
147 | # # sample a sentence.
148 | # # action = agent.action_sampler()
149 | # # context = agent.context_trans(action)
150 | # if config['model'] is 'AutoEncoder':
151 | # source = data[kk][None, :]
152 | # truth = ' '.join(print_sample(idx2word, source[0].tolist())[:-1])
153 | # print '\ntruth: {}'.format(truth)
154 | # context = agent.memorize(source)
155 | # sample, score = agent.generate_(context, max_len=data.shape[1])
156 | # else:
157 | # sample, score = agent.generate_(max_len=data.shape[1])
158 | #
159 | # if sample[-1] is not 0:
160 | # sample += [0] # fix the end.
161 | # question = ' '.join(print_sample(idx2word, sample)[:-1])
162 | # print '\nsample: {}'.format(question)
163 | # print 'PPL: {}'.format(score)
164 | # scans += 1.0
165 |
166 | print ' .'
167 | logger.info('Epoch = {0} finished.'.format(echo))
168 | loss = zip(*loss)
169 | logger.info('LossL: {0}, LossP: {1}, LossQ: {2}, PPL: {3}, NLL: {4}, Baseline: {5}'.format(
170 | np.mean(loss[0]), np.mean(loss[1]), np.mean(loss[2]), np.mean(loss[3]), np.mean(loss[4]), np.mean(loss[5])
171 | ))
172 |
173 | # validation
174 | loss = []
175 | logger.info('Epoch = {} -> Validation Set Evaluation...'.format(echo))
176 | progbar = Progbar(valid_size / config['batch_size'])
177 | for it, batch in enumerate(valid_batches):
178 | # get data
179 | data = prepare_batch(batch)
180 | if config['model'] == 'Helmholtz' or config['model'] == 'HarX':
181 | # loss.append(agent.evaluate_(data))
182 | loss.append(agent.explore_(data, 10))
183 | weightss = np.sum([np.sum(abs(w)) for w in agent.get_weights()])
184 | progbar.update(it, [('lossL', loss[-1][0]), ('lossP', loss[-1][1]), ('lossQ', loss[-1][2]),
185 | ('perplexity', np.log(loss[-1][3])), ('NLL', loss[-1][4]), ('Baseline', loss[-1][5])])
186 | else:
187 | raise NotImplementedError
188 |
189 | print ' .'
190 | loss = zip(*loss)
191 | logger.info('LossL: {0}, LossP: {1}, LossQ: {2}, PPL: {3}, NLL: {4}, Baseline: {5}'.format(
192 | np.mean(loss[0]), np.mean(loss[1]), np.mean(loss[2]), np.mean(loss[3]), np.mean(loss[4]), np.mean(loss[5])
193 | ))
194 | # save the weights.
195 | agent.save(config['path_h5'] + '/emolga.RHM.id={0}.epoch={1}.pkl'.format(tmark, echo))
196 |
197 | # logger.info('Learning percentage: {}'.format(correct / scans))
198 |
199 |
200 | # inference test
201 | # batches = data_stream.get_epoch_iterator(as_dict=True)
202 | # for it, batch in enumerate(batches):
203 | # data = batch['data'].astype('int32')
204 | # data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
205 | # mean, std = agent.inference_(data)
206 | # print mean
207 | # break
208 | # print count
--------------------------------------------------------------------------------
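Editor's note: the prepare_batch helper in test_nvtm.py above pads every batch with one extra zero column and then trims trailing all-zero padding. A minimal sketch (not part of the repository) of what its inner cut_zeros does:

import numpy as np

def cut_zeros(data):
    # same logic as in prepare_batch above: scan columns from the right and
    # keep everything up to the last non-empty column plus one zero column
    for k in range(data.shape[1] - 1, 0, -1):
        if data[:, k].sum() > 0:
            return data[:, : k + 2]
    return data

batch = np.array([[4, 7, 2, 0, 0, 0],
                  [5, 3, 0, 0, 0, 0]], dtype='int32')
print(cut_zeros(batch).shape)   # (2, 4): columns 4-5 are dropped, one zero column is kept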
/emolga/utils/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'yinpengcheng'
2 |
--------------------------------------------------------------------------------
/emolga/utils/generic_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from matplotlib.ticker import FuncFormatter
3 | import numpy as np
4 | import time
5 | import sys
6 | import six
7 | import matplotlib.pyplot as plt
8 | import matplotlib
9 |
10 | def get_from_module(identifier, module_params, module_name, instantiate=False, kwargs=None):
11 | if isinstance(identifier, six.string_types):
12 | res = module_params.get(identifier)
13 | if not res:
14 | raise Exception('Invalid ' + str(module_name) + ': ' + str(identifier))
15 | if instantiate and not kwargs:
16 | return res()
17 | elif instantiate and kwargs:
18 | return res(**kwargs)
19 | else:
20 | return res
21 | return identifier
22 |
23 |
24 | def make_tuple(*args):
25 | return args
26 |
27 |
28 | def printv(v, prefix=''):
29 | if type(v) == dict:
30 | if 'name' in v:
31 | print(prefix + '#' + v['name'])
32 | del v['name']
33 | prefix += '...'
34 | for nk, nv in v.items():
35 | if type(nv) in [dict, list]:
36 | print(prefix + nk + ':')
37 | printv(nv, prefix)
38 | else:
39 | print(prefix + nk + ':' + str(nv))
40 | elif type(v) == list:
41 | prefix += '...'
42 | for i, nv in enumerate(v):
43 | print(prefix + '#' + str(i))
44 | printv(nv, prefix)
45 | else:
46 | prefix += '...'
47 | print(prefix + str(v))
48 |
49 |
50 | def make_batches(size, batch_size):
51 | nb_batch = int(np.ceil(size/float(batch_size)))
52 | return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)]
53 |
54 |
55 | def slice_X(X, start=None, stop=None):
56 | if type(X) == list:
57 | if hasattr(start, '__len__'):
58 | return [x[start] for x in X]
59 | else:
60 | return [x[start:stop] for x in X]
61 | else:
62 | if hasattr(start, '__len__'):
63 | return X[start]
64 | else:
65 | return X[start:stop]
66 |
67 |
68 | class Progbar(object):
69 | def __init__(self, target, width=30, verbose=1):
70 | '''
71 | @param target: total number of steps expected
72 | '''
73 | self.width = width
74 | self.target = target
75 | self.sum_values = {}
76 | self.unique_values = []
77 | self.start = time.time()
78 | self.total_width = 0
79 | self.seen_so_far = 0
80 | self.verbose = verbose
81 |
82 | def update(self, current, values=[]):
83 | '''
84 | @param current: index of current step
85 | @param values: list of tuples (name, value_for_last_step).
86 | The progress bar will display averages for these values.
87 | '''
88 | for k, v in values:
89 | if k not in self.sum_values:
90 | self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far]
91 | self.unique_values.append(k)
92 | else:
93 | self.sum_values[k][0] += v * (current - self.seen_so_far)
94 | self.sum_values[k][1] += (current - self.seen_so_far)
95 | self.seen_so_far = current
96 |
97 | now = time.time()
98 | if self.verbose == 1:
99 | prev_total_width = self.total_width
100 | sys.stdout.write("\b" * prev_total_width)
101 | sys.stdout.write("\r")
102 |
103 | numdigits = int(np.floor(np.log10(self.target))) + 1
104 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
105 | bar = barstr % (current, self.target)
106 | prog = float(current)/self.target
107 | prog_width = int(self.width*prog)
108 | if prog_width > 0:
109 | bar += ('.'*(prog_width-1))
110 | if current < self.target:
111 | bar += '(-w-)'
112 | else:
113 | bar += '(-v-)!!'
114 | bar += ('~' * (self.width-prog_width))
115 | bar += ']'
116 | sys.stdout.write(bar)
117 | self.total_width = len(bar)
118 |
119 | if current:
120 | time_per_unit = (now - self.start) / current
121 | else:
122 | time_per_unit = 0
123 | eta = time_per_unit*(self.target - current)
124 | info = ''
125 | if current < self.target:
126 | info += ' - ETA: %ds' % eta
127 | else:
128 | info += ' - %ds' % (now - self.start)
129 | for k in self.unique_values:
130 | if k == 'perplexity' or k == 'PPL':
131 | info += ' - %s: %.4f' % (k, np.exp(self.sum_values[k][0] / max(1, self.sum_values[k][1])))
132 | else:
133 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
134 |
135 | self.total_width += len(info)
136 | if prev_total_width > self.total_width:
137 | info += ((prev_total_width-self.total_width) * " ")
138 |
139 | sys.stdout.write(info)
140 | sys.stdout.flush()
141 |
142 | if current >= self.target:
143 | sys.stdout.write("\n")
144 |
145 | if self.verbose == 2:
146 | if current >= self.target:
147 | info = '%ds' % (now - self.start)
148 | for k in self.unique_values:
149 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
150 | sys.stdout.write(info + "\n")
151 |
152 | def add(self, n, values=[]):
153 | self.update(self.seen_so_far + n, values)
154 |
155 | def clear(self):
156 | self.sum_values = {}
157 | self.unique_values = []
158 | self.total_width = 0
159 | self.seen_so_far = 0
160 |
161 |
162 | def print_sample(idx2word, idx):
163 | def cut_eol(words):
164 | for i, word in enumerate(words):
165 | if words[i] == '<eol>':
166 | return words[:i + 1]
167 | raise Exception("No end-of-line found")
168 |
169 | return cut_eol(map(lambda w_idx : idx2word[w_idx], idx))
170 |
171 |
172 | def visualize_(subplots, data, w=None, h=None, name=None,
173 | display='on', size=10, text=None, normal=True,
174 | grid=False):
175 | fig, ax = subplots
176 | if data.ndim == 1:
177 | if w and h:
178 | # vector visualization
179 | assert w * h == np.prod(data.shape)
180 | data = data.reshape((w, h))
181 | else:
182 | L = data.shape[0]
183 | w = int(np.sqrt(L))
184 | while L % w > 0:
185 | w -= 1
186 | h = L / w
187 | assert w * h == np.prod(data.shape)
188 | data = data.reshape((w, h))
189 | else:
190 | w = data.shape[0]
191 | h = data.shape[1]
192 |
193 | if not size:
194 | size = 30 / np.sqrt(w * h)
195 |
196 | print data.shape
197 |
198 | major_ticks = np.arange(0, h, 1)
199 | ax.set_xticks(major_ticks)
200 | ax.set_xlim(0, h)
201 | major_ticks = np.arange(0, w, 1)
202 | ax.set_ylim(w, -1)
203 | ax.set_yticks(major_ticks)
204 | ax.set_aspect('equal')
205 | if grid:
206 | pass
207 | ax.grid(which='both')
208 | # ax.axis('equal')
209 | if normal:
210 | cax = ax.imshow(data, cmap=plt.cm.pink, interpolation='nearest',
211 | vmax=1.0, vmin=0.0, aspect='auto')
212 | else:
213 | cax = ax.imshow(data, cmap=plt.cm.bone, interpolation='nearest', aspect='auto')
214 |
215 | if name:
216 | ax.set_title(name)
217 | else:
218 | ax.set_title('sample.')
219 | import matplotlib.ticker as ticker
220 |
221 | # ax.xaxis.set_ticks(np.arange(0, h, 1.))
222 | # ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f'))
223 | # ax.yaxis.set_ticks(np.arange(0, w, 1.))
224 | # ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.1f'))
225 |
226 | # ax.set_xticks(np.linspace(0, 1, h))
227 | # ax.set_yticks(np.linspace(0, 1, w))
228 | # Move left and bottom spines outward by 10 points
229 | # ax.spines['left'].set_position(('outward', size))
230 | # ax.spines['bottom'].set_position(('outward', size))
231 | # # Hide the right and top spines
232 | # ax.spines['right'].set_visible(False)
233 | # ax.spines['top'].set_visible(False)
234 | # # Only show ticks on the left and bottom spines
235 | # ax.yaxis.set_ticks_position('left')
236 | # ax.xaxis.set_ticks_position('bottom')
237 |
238 | if text:
239 | ax.set_yticks(np.linspace(0, 1, 33) * size * 3.2)
240 | ax.set_yticklabels([text[s] for s in xrange(33)])
241 | # cbar = fig.colorbar(cax)
242 |
243 | if display == 'on':
244 | plt.show()
245 | else:
246 | return ax
247 |
248 |
249 | def vis_Gaussian(subplot, mean, std, name=None, display='off', size=10):
250 | ax = subplot
251 | data = np.random.normal(size=(2, 10000))
252 | data[0] = data[0] * std[0] + mean[0]
253 | data[1] = data[1] * std[1] + mean[1]
254 |
255 | ax.scatter(data[0].tolist(), data[1].tolist(), c='r', marker='.')
256 | if display == 'on':
257 | plt.show()
258 | else:
259 | return ax
--------------------------------------------------------------------------------
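Editor's note: a brief usage sketch (not from the repository) for the Progbar class defined above. target is the expected number of steps, update() receives (name, value) pairs whose running averages are displayed, and any key named 'perplexity' or 'PPL' is exponentiated before display.

import time
from emolga.utils.generic_utils import Progbar

progbar = Progbar(target=10)
for step in range(1, 11):
    time.sleep(0.01)                                  # stand-in for one training step
    progbar.update(step, [('loss', 1.0 / step), ('PPL', 2.0)])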
/emolga/utils/io_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import h5py
3 | import numpy as np
4 | import cPickle
5 | from collections import defaultdict
6 |
7 |
8 | class HDF5Matrix():
9 | refs = defaultdict(int)
10 |
11 | def __init__(self, datapath, dataset, start, end, normalizer=None):
12 | if datapath not in list(self.refs.keys()):
13 | f = h5py.File(datapath)
14 | self.refs[datapath] = f
15 | else:
16 | f = self.refs[datapath]
17 | self.start = start
18 | self.end = end
19 | self.data = f[dataset]
20 | self.normalizer = normalizer
21 |
22 | def __len__(self):
23 | return self.end - self.start
24 |
25 | def __getitem__(self, key):
26 | if isinstance(key, slice):
27 | if key.stop + self.start <= self.end:
28 | idx = slice(key.start+self.start, key.stop + self.start)
29 | else:
30 | raise IndexError
31 | elif isinstance(key, int):
32 | if key + self.start < self.end:
33 | idx = key+self.start
34 | else:
35 | raise IndexError
36 | elif isinstance(key, np.ndarray):
37 | if np.max(key) + self.start < self.end:
38 | idx = (self.start + key).tolist()
39 | else:
40 | raise IndexError
41 | elif isinstance(key, list):
42 | if max(key) + self.start < self.end:
43 | idx = [x + self.start for x in key]
44 | else:
45 | raise IndexError
46 | if self.normalizer is not None:
47 | return self.normalizer(self.data[idx])
48 | else:
49 | return self.data[idx]
50 |
51 | @property
52 | def shape(self):
53 | return tuple([self.end - self.start, self.data.shape[1]])
54 |
55 |
56 | def save_array(array, name):
57 | import tables
58 | f = tables.open_file(name, 'w')
59 | atom = tables.Atom.from_dtype(array.dtype)
60 | ds = f.createCArray(f.root, 'data', atom, array.shape)
61 | ds[:] = array
62 | f.close()
63 |
64 |
65 | def load_array(name):
66 | import tables
67 | f = tables.open_file(name)
68 | array = f.root.data
69 | a = np.empty(shape=array.shape, dtype=array.dtype)
70 | a[:] = array[:]
71 | f.close()
72 | return a
73 |
74 |
75 | def save_config():
76 | pass
77 |
78 |
79 | def load_config():
80 | pass
--------------------------------------------------------------------------------
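Editor's note: a minimal sketch (not part of the repository) of HDF5Matrix above; the file name 'toy.h5' and dataset key 'features' are made up for illustration. The wrapper exposes rows start..end-1 of an HDF5 dataset without loading the whole array into memory.

import h5py
import numpy as np
from emolga.utils.io_utils import HDF5Matrix

with h5py.File('toy.h5', 'w') as f:                       # build a toy file first
    f.create_dataset('features', data=np.arange(20).reshape(10, 2))

train = HDF5Matrix('toy.h5', 'features', start=0, end=8)
print(len(train))        # 8 rows visible through this view
print(train.shape)       # (8, 2)
print(train[0:2])        # rows 0..1 of the underlying dataset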
/emolga/utils/np_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import numpy as np
3 | import scipy as sp
4 | from six.moves import range
5 | from six.moves import zip
6 |
7 |
8 | def to_categorical(y, nb_classes=None):
9 | '''Convert class vector (integers from 0 to nb_classes)
10 | to binary class matrix, for use with categorical_crossentropy
11 | '''
12 | y = np.asarray(y, dtype='int32')
13 | if not nb_classes:
14 | nb_classes = np.max(y)+1
15 | Y = np.zeros((len(y), nb_classes))
16 | for i in range(len(y)):
17 | Y[i, y[i]] = 1.
18 | return Y
19 |
20 |
21 | def normalize(a, axis=-1, order=2):
22 | l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
23 | l2[l2 == 0] = 1
24 | return a / np.expand_dims(l2, axis)
25 |
26 |
27 | def binary_logloss(p, y):
28 | epsilon = 1e-15
29 | p = sp.maximum(epsilon, p)
30 | p = sp.minimum(1-epsilon, p)
31 | res = sum(y * sp.log(p) + sp.subtract(1, y) * sp.log(sp.subtract(1, p)))
32 | res *= -1.0/len(y)
33 | return res
34 |
35 |
36 | def multiclass_logloss(P, Y):
37 | score = 0.
38 | npreds = [P[i][Y[i]-1] for i in range(len(Y))]
39 | score = -(1. / len(Y)) * np.sum(np.log(npreds))
40 | return score
41 |
42 |
43 | def accuracy(p, y):
44 | return np.mean([a == b for a, b in zip(p, y)])
45 |
46 |
47 | def probas_to_classes(y_pred):
48 | if len(y_pred.shape) > 1 and y_pred.shape[1] > 1:
49 | return categorical_probas_to_classes(y_pred)
50 | return np.array([1 if p > 0.5 else 0 for p in y_pred])
51 |
52 |
53 | def categorical_probas_to_classes(p):
54 | return np.argmax(p, axis=1)
55 |
--------------------------------------------------------------------------------
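Editor's note: a small worked example (not from the repository) of the helpers above. to_categorical builds a one-hot matrix, and probas_to_classes maps rows of class probabilities back to integer labels.

import numpy as np
from emolga.utils.np_utils import to_categorical, probas_to_classes

y = [0, 2, 1, 2]
Y = to_categorical(y, nb_classes=3)
# Y == [[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]
assert (probas_to_classes(Y) == np.asarray(y)).all()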
/emolga/utils/test_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def get_test_data(nb_train=1000, nb_test=500, input_shape=(10,), output_shape=(2,),
5 | classification=True, nb_class=2):
6 | '''
7 | classification=True overrides output_shape
8 | (i.e. output_shape is set to (1,)) and the output
9 | consists of integers in [0, nb_class-1].
10 |
11 | Otherwise: float output with shape output_shape.
12 | '''
13 | nb_sample = nb_train + nb_test
14 | if classification:
15 | y = np.random.randint(0, nb_class, size=(nb_sample, 1))
16 | X = np.zeros((nb_sample,) + input_shape)
17 | for i in range(nb_sample):
18 | X[i] = np.random.normal(loc=y[i], scale=1.0, size=input_shape)
19 | else:
20 | y_loc = np.random.random((nb_sample,))
21 | X = np.zeros((nb_sample,) + input_shape)
22 | y = np.zeros((nb_sample,) + output_shape)
23 | for i in range(nb_sample):
24 | X[i] = np.random.normal(loc=y_loc[i], scale=1.0, size=input_shape)
25 | y[i] = np.random.normal(loc=y_loc[i], scale=1.0, size=output_shape)
26 |
27 | return (X[:nb_train], y[:nb_train]), (X[nb_train:], y[nb_train:])
28 |
--------------------------------------------------------------------------------
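Editor's note: a quick sketch (not from the repository) of get_test_data above. With classification=True each input row is sampled from a Gaussian centred on its integer label, giving an easily separable toy problem.

from emolga.utils.test_utils import get_test_data

(X_train, y_train), (X_test, y_test) = get_test_data(
    nb_train=100, nb_test=20, input_shape=(5,), nb_class=3)
print(X_train.shape)   # (100, 5)
print(y_train.shape)   # (100, 1)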
/emolga/utils/theano_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | from theano import gof
4 | from theano.tensor import basic as tensor
5 | import numpy as np
6 | import theano
7 | import theano.tensor as T
8 |
9 |
10 | def floatX(X):
11 | return np.asarray(X, dtype=theano.config.floatX)
12 |
13 |
14 | def sharedX(X, dtype=theano.config.floatX, name=None):
15 | return theano.shared(np.asarray(X, dtype=dtype), name=name)
16 |
17 |
18 | def shared_zeros(shape, dtype=theano.config.floatX, name=None):
19 | return sharedX(np.zeros(shape), dtype=dtype, name=name)
20 |
21 |
22 | def shared_scalar(val=0., dtype=theano.config.floatX, name=None):
23 | return theano.shared(np.cast[dtype](val), name=name)
24 |
25 |
26 | def shared_ones(shape, dtype=theano.config.floatX, name=None):
27 | return sharedX(np.ones(shape), dtype=dtype, name=name)
28 |
29 |
30 | def alloc_zeros_matrix(*dims):
31 | return T.alloc(np.cast[theano.config.floatX](0.), *dims)
32 |
33 |
34 | def alloc_ones_matrix(*dims):
35 | return T.alloc(np.cast[theano.config.floatX](1.), *dims)
36 |
37 |
38 | def ndim_tensor(ndim):
39 | if ndim == 1:
40 | return T.vector()
41 | elif ndim == 2:
42 | return T.matrix()
43 | elif ndim == 3:
44 | return T.tensor3()
45 | elif ndim == 4:
46 | return T.tensor4()
47 | return T.matrix()
48 |
49 |
50 | # get int32 tensor
51 | def ndim_itensor(ndim, name=None):
52 | if ndim == 2:
53 | return T.imatrix(name)
54 | elif ndim == 3:
55 | return T.itensor3(name)
56 | elif ndim == 4:
57 | return T.itensor4(name)
58 | return T.imatrix(name)
59 |
60 |
61 | # dot-product
62 | def dot(inp, matrix, bias=None):
63 | """
64 | Decide the right type of dot product depending on the input
65 | arguments
66 | """
67 | if 'int' in inp.dtype and inp.ndim == 2:
68 | return matrix[inp.flatten()]
69 | elif 'int' in inp.dtype:
70 | return matrix[inp]
71 | elif 'float' in inp.dtype and inp.ndim == 3:
72 | shape0 = inp.shape[0]
73 | shape1 = inp.shape[1]
74 | shape2 = inp.shape[2]
75 | if bias:
76 | return (T.dot(inp.reshape((shape0 * shape1, shape2)), matrix) + bias).reshape((shape0, shape1, matrix.shape[1]))
77 | else:
78 | return T.dot(inp.reshape((shape0 * shape1, shape2)), matrix).reshape((shape0, shape1, matrix.shape[1]))
79 | else:
80 | if bias:
81 | return T.dot(inp, matrix) + bias
82 | else:
83 | return T.dot(inp, matrix)
84 |
85 |
86 | # Numerically stable log(sum(exp(A))). Can also be used in softmax function.
87 | def logSumExp(x, axis=None, mask=None, status='theano', c=None, err=1e-7):
88 | """
89 | Numerically stable log(sum(exp(A))). Can also be used in softmax function.
90 | c is an additional input that does not require masking even when x does.
91 |
92 | """
93 | if status == 'theano':
94 | J = T
95 | else:
96 | J = np
97 |
98 | if c is None:
99 | x_max = J.max(x, axis=axis, keepdims=True)
100 | else:
101 | x_max = J.max(J.concatenate([c, x], axis=-1), axis=axis, keepdims=True)
102 |
103 | if c is None:
104 | if not mask:
105 | l_t = J.sum(J.exp(x - x_max), axis=axis, keepdims=True)
106 |
107 | else:
108 | l_t = J.sum(J.exp(x - x_max) * mask, axis=axis, keepdims=True)
109 | else:
110 | if not mask:
111 | l_t = J.sum(J.exp(x - x_max), axis=axis, keepdims=True) + \
112 | J.sum(J.exp(c - x_max), axis=axis, keepdims=True)
113 | else:
114 | l_t = J.sum(J.exp(x - x_max) * mask, axis=axis, keepdims=True) + \
115 | J.sum(J.exp(c - x_max), axis=axis, keepdims=True)
116 |
117 | x_t = J.log(J.maximum(l_t, err)) + x_max
118 | return x_t
119 |
120 |
121 | def softmax(x):
122 | return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)
123 |
124 |
125 | def masked_softmax(x, mask, err=1e-9):
126 | assert x.ndim == 2, 'support two-dimension'
127 | weights = softmax(x)
128 | weights *= mask
129 | weights = weights / (T.sum(weights, axis=-1)[:, None] + err) * mask
130 | return weights
131 |
132 |
133 | def cosine_sim(k, M):
134 | k_unit = k / (T.sqrt(T.sum(k**2)) + 1e-5)
135 | # T.patternbroadcast(k_unit.reshape((1,k_unit.shape[0])),(True,False))
136 | k_unit = k_unit.dimshuffle(('x', 0))
137 | k_unit.name = "k_unit"
138 | M_lengths = T.sqrt(T.sum(M**2, axis=1)).dimshuffle((0, 'x'))
139 | M_unit = M / (M_lengths + 1e-5)
140 | M_unit.name = "M_unit"
141 | return T.sum(k_unit * M_unit, axis=1)
142 |
143 |
144 | def cosine_sim2d(k, M):
145 | # k: (nb_samples, memory_width)
146 | # M: (nb_samples, memory_dim, memory_width)
147 |
148 | # norms of keys and memories
149 | k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5 # (nb_samples,)
150 | M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5 # (nb_samples, memory_dim,)
151 |
152 | k = k[:, None, :] # (nb_samples, 1, memory_width)
153 | k_norm = k_norm[:, None] # (nb_samples, 1)
154 |
155 | sim = T.sum(k * M, axis=2) # (nb_samples, memory_dim,)
156 | sim /= k_norm * M_norm # (nb_samples, memory_dim,)
157 | return sim
158 |
159 |
160 | def dot_2d(k, M, b=None, g=None):
161 | # k: (nb_samples, memory_width)
162 | # M: (nb_samples, memory_dim, memory_width)
163 |
164 | # norms of keys and memories
165 | # k_norm = T.sqrt(T.sum(T.sqr(k), 1)) + 1e-5 # (nb_samples,)
166 | # M_norm = T.sqrt(T.sum(T.sqr(M), 2)) + 1e-5 # (nb_samples, memory_dim,)
167 |
168 | k = k[:, None, :] # (nb_samples, 1, memory_width)
169 | value = k * M
170 | if b is not None:
171 | b = b[:, None, :]
172 | value *= b # (nb_samples, memory_dim,)
173 |
174 | if g is not None:
175 | g = g[None, None, :]
176 | value *= g
177 |
178 | sim = T.sum(value, axis=2)
179 | return sim
180 |
181 |
182 | def shift_convolve(weight, shift, shift_conv):
183 | shift = shift.dimshuffle((0, 'x'))
184 | return T.sum(shift * weight[shift_conv], axis=0)
185 |
186 |
187 | def shift_convolve2d(weight, shift, shift_conv):
188 | return T.sum(shift[:, :, None] * weight[:, shift_conv], axis=1)
189 |
--------------------------------------------------------------------------------
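Editor's note: a short check (not part of the repository) of the numpy branch of logSumExp above. Passing any status other than 'theano' switches the helper to numpy, and the max-subtraction keeps the result finite where a naive log-sum-exp overflows.

import numpy as np
from emolga.utils.theano_utils import logSumExp

x = np.array([[1000.0, 1000.5, 999.0]])
naive = np.log(np.sum(np.exp(x), axis=-1))         # exp() overflows -> inf
stable = logSumExp(x, axis=-1, status='numpy')     # ~1001.1, stays finite
print(naive)
print(stable)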
/emolga/voc.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MultiPath/CopyNet/dba24d58d505abaa169e05ee762987f37f41f566/emolga/voc.pkl
--------------------------------------------------------------------------------
/experiments/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
--------------------------------------------------------------------------------
/experiments/bst_dataset.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | __author__ = 'jiataogu'
3 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
4 | import numpy.random as n_rng
5 |
6 |
7 | class BSTnode(object):
8 | """
9 | Representation of a node in a binary search tree.
10 | Has a left child, right child, and key value, and stores its subtree size.
11 | """
12 | def __init__(self, parent, t):
13 | """Create a new leaf with key t."""
14 | self.key = t
15 | self.parent = parent
16 | self.left = None
17 | self.right = None
18 | self.size = 1
19 |
20 | def update_stats(self):
21 | """Updates this node's size based on its children's sizes."""
22 | self.size = (0 if self.left is None else self.left.size) + (0 if self.right is None else self.right.size)
23 |
24 | def insert(self, t, NodeType):
25 | """Insert key t into the subtree rooted at this node (updating subtree size)."""
26 | self.size += 1
27 | if t < self.key:
28 | if self.left is None:
29 | self.left = NodeType(self, t)
30 | return self.left
31 | else:
32 | return self.left.insert(t, NodeType)
33 | elif t > self.key:
34 | if self.right is None:
35 | self.right = NodeType(self, t)
36 | return self.right
37 | else:
38 | return self.right.insert(t, NodeType)
39 | else:
40 | return self
41 |
42 | def find(self, t):
43 | """Return the node for key t if it is in this tree, or None otherwise."""
44 | if t == self.key:
45 | return self
46 | elif t < self.key:
47 | if self.left is None:
48 | return None
49 | else:
50 | return self.left.find(t)
51 | else:
52 | if self.right is None:
53 | return None
54 | else:
55 | return self.right.find(t)
56 |
57 | def rank(self, t):
58 | """Return the number of keys <= t in the subtree rooted at this node."""
59 | left_size = 0 if self.left is None else self.left.size
60 | if t == self.key:
61 | return left_size + 1
62 | elif t < self.key:
63 | if self.left is None:
64 | return 0
65 | else:
66 | return self.left.rank(t)
67 | else:
68 | if self.right is None:
69 | return left_size + 1
70 | else:
71 | return self.right.rank(t) + left_size + 1
72 |
73 | def minimum(self):
74 | """Returns the node with the smallest key in the subtree rooted by this node."""
75 | current = self
76 | while current.left is not None:
77 | current = current.left
78 | return current
79 |
80 |
81 | def successor(self):
82 | """Returns the node with the smallest key larger than this node's key, or None if this has the largest key in the tree."""
83 | if self.right is not None:
84 | return self.right.minimum()
85 | current = self
86 | while current.parent is not None and current.parent.right is current:
87 | current = current.parent
88 | return current.parent
89 |
90 | def delete(self):
91 | """"Delete this node from the tree."""
92 | if self.left is None or self.right is None:
93 | if self is self.parent.left:
94 | self.parent.left = self.left or self.right
95 | if self.parent.left is not None:
96 | self.parent.left.parent = self.parent
97 | else:
98 | self.parent.right = self.left or self.right
99 | if self.parent.right is not None:
100 | self.parent.right.parent = self.parent
101 | current = self.parent
102 | while current.key is not None:
103 | current.update_stats()
104 | current = current.parent
105 | return self
106 | else:
107 | s = self.successor()
108 | self.key, s.key = s.key, self.key
109 | return s.delete()
110 |
111 | def check(self, lokey, hikey):
112 | """Checks that the subtree rooted at t is a valid BST and all keys are between (lokey, hikey)."""
113 | if lokey is not None and self.key <= lokey:
114 | raise Exception("BST RI violation")
115 | if hikey is not None and self.key >= hikey:
116 | raise Exception("BST RI violation")
117 | if self.left is not None:
118 | if self.left.parent is not self:
119 | raise Exception("BST RI violation")
120 | self.left.check(lokey, self.key)
121 | if self.right is not None:
122 | if self.right.parent is not self:
123 | raise Exception("BST RI violation")
124 | self.right.check(self.key, hikey)
125 | if self.size != 1 + (0 if self.left is None else self.left.size) + (0 if self.right is None else self.right.size):
126 | raise Exception("BST RI violation")
127 |
128 | def __repr__(self):
129 | return ""
130 |
131 |
132 | class BST(object):
133 | """
134 | Simple binary search tree implementation, augmented with subtree sizes.
135 | This BST supports insert, find, and delete-min operations.
136 | Each tree contains some (possibly 0) BSTnode objects, representing nodes,
137 | and a pointer to the root.
138 | """
139 |
140 | def __init__(self, NodeType=BSTnode):
141 | self.root = None
142 | self.NodeType = NodeType
143 | self.psroot = self.NodeType(None, None)
144 |
145 | def reroot(self):
146 | self.root = self.psroot.left
147 |
148 | def insert(self, t):
149 | """Insert key t into this BST, modifying it in-place."""
150 | if self.root is None:
151 | self.psroot.left = self.NodeType(self.psroot, t)
152 | self.reroot()
153 | return self.root
154 | else:
155 | return self.root.insert(t, self.NodeType)
156 |
157 | def find(self, t):
158 | """Return the node for key t if is in the tree, or None otherwise."""
159 | if self.root is None:
160 | return None
161 | else:
162 | return self.root.find(t)
163 |
164 | def rank(self, t):
165 | """The number of keys <= t in the tree."""
166 | if self.root is None:
167 | return 0
168 | else:
169 | return self.root.rank(t)
170 |
171 | def delete(self, t):
172 | """Delete the node for key t if it is in the tree."""
173 | node = self.find(t)
174 | deleted = node.delete()
175 | self.reroot()
176 | return deleted
177 |
178 | def check(self):
179 | if self.root is not None:
180 | self.root.check(None, None)
181 |
182 | def __str__(self):
183 | if self.root is None:
184 | return ''
185 |
186 | def nested(node):
187 | if node is None:
188 | return '0'
189 | head = str(node.key)
190 | left = nested(node.left)
191 | right = nested(node.right)
192 |
193 | if left == '0' and right == '0':
194 | return head
195 | else:
196 | return ' '.join(['(', head, left, right, ')'])
197 |
198 | return nested(self.root)
199 |
200 | # def recurse(node):
201 | # if node is None:
202 | # return [], 0, 0
203 | # label = str(node.key)
204 | # left_lines, left_pos, left_width = recurse(node.left)
205 | # right_lines, right_pos, right_width = recurse(node.right)
206 | # middle = max(right_pos + left_width - left_pos + 1, len(label), 2)
207 | # pos = left_pos + middle // 2
208 | # width = left_pos + middle + right_width - right_pos
209 | # while len(left_lines) < len(right_lines):
210 | # left_lines.append(' ' * left_width)
211 | # while len(right_lines) < len(left_lines):
212 | # right_lines.append(' ' * right_width)
213 | # if (middle - len(label)) % 2 == 1 and node.parent is not None and \
214 | # node is node.parent.left and len(label) < middle:
215 | # label += '.'
216 | # label = label.center(middle, '.')
217 | # if label[0] == '.': label = ' ' + label[1:]
218 | # if label[-1] == '.': label = label[:-1] + ' '
219 | # lines = [' ' * left_pos + label + ' ' * (right_width - right_pos),
220 | # ' ' * left_pos + '/' + ' ' * (middle-2) +
221 | # '\\' + ' ' * (right_width - right_pos)] + \
222 | # [left_line + ' ' * (width - left_width - right_width) +
223 | # right_line
224 | # for left_line, right_line in zip(left_lines, right_lines)]
225 | # return lines, pos, width
226 | # return '\n'.join(recurse(self.root) [0])
227 |
228 | test1 = range(0, 100, 10)
229 | test2 = [31, 41, 59, 26, 53, 58, 97, 93, 23]
230 | test3 = "algorithms"
231 |
232 |
233 | def printsizes(node):
234 | if node is None:
235 | print "node is nil"
236 | else:
237 | print "node", node.key, "has a subtree of size", node.size
238 |
239 |
240 | def test(args=None, BSTtype=BST):
241 | import random, sys
242 | random.seed(19920206)
243 | if not args:
244 | args = sys.argv[1:]
245 | if not args:
246 | print 'usage: %s ' % \
247 | sys.argv[0]
248 | sys.exit()
249 | elif len(args) == 1:
250 | items = (random.randrange(100) for i in xrange(int(args[0])))
251 | else:
252 | items = [int(i) for i in args]
253 |
254 | tree = BSTtype()
255 | source = []
256 | for item in items:
257 | tree.insert(item)
258 | source += [str(item)]
259 | print ' '.join(source)
260 | print tree
261 |
262 |
263 | def generate():
264 | import random, sys
265 | random.seed(19920206)
266 |
267 | Lmin = 2 ** 2 - 1
268 | Lmax = 2 ** 4 - 1
269 | Xnum = 1000000
270 | voc = 26
271 |
272 | wfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'w')
273 | for id in xrange(Xnum):
274 | tree = BST()
275 | items = (random.randrange(voc) for i in
276 | xrange(random.randint(Lmin, Lmax)))
277 | source = []
278 | for item in items:
279 | item = chr(item + 65)
280 | tree.insert(item)
281 | source += [str(item)]
282 | source = ' '.join(source)
283 | target = str(tree)
284 | line = '{0} -> {1}'.format(source, target)
285 | wfile.write(line + '\n')
286 | if id % 10000 == 0:
287 | print id
288 |
289 |
290 | def obtain_dataset():
291 | rfile = open('/home/thoma/Work/Dial-DRL/dataset/BST_1M.txt', 'r')
292 | line = rfile.readline()
293 |
294 | word2idx = dict()
295 | word2idx['<eol>'] = 0
296 | word2idx['<unk>'] = 1
297 | pairs = []
298 | at = 2
299 | lines = 0
300 | while line:
301 | lines += 1
302 | line = line.strip()
303 | source, target = line.split('->')
304 | source = source.split()
305 | target = target.split()
306 |
307 | for w in source:
308 | if w not in word2idx:
309 | word2idx[w] = at
310 | at += 1
311 | for w in target:
312 | if w not in word2idx:
313 | word2idx[w] = at
314 | at += 1
315 | pairs.append((source, target))
316 | if lines % 20000 == 0:
317 | print lines
318 | line = rfile.readline()
319 |
320 | idx2word = dict()
321 | for v, k in word2idx.items():
322 | idx2word[k] = v
323 |
324 | Lmax = len(idx2word)
325 | print 'read dataset ok.'
326 | print Lmax
327 | for i in xrange(Lmax):
328 | print idx2word[i]
329 |
330 | def build_data(data):
331 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
332 | for pair in data:
333 | source, target = pair
334 | A = [word2idx[w] for w in source]
335 | B = [word2idx[w] for w in target]
336 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
337 | C = [0 if w not in source else source.index(w) + Lmax for w in target]
338 |
339 | instance['text'] += [source]
340 | instance['summary'] += [target]
341 | instance['source'] += [A]
342 | instance['target'] += [B]
343 | # instance['cc_matrix'] += [C]
344 | instance['target_c'] += [C]
345 |
346 | print instance['target'][5000]
347 | print instance['target_c'][5000]
348 | return instance
349 |
350 | train_set = build_data(pairs[100000:])
351 | test_set = build_data(pairs[:100000])
352 | serialize_to_file([train_set, test_set, idx2word, word2idx],
353 | '/home/thoma/Work/Dial-DRL/dataset/BST_1M.data.pkl')
354 |
355 |
356 | if __name__ == '__main__':
357 | generate()
358 | obtain_dataset()
--------------------------------------------------------------------------------
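Editor's note: a tiny usage sketch (not part of the repository) of the BST class above, mirroring what generate() writes out. Keys are inserted one by one, and str(tree) yields the nested '( key left right )' encoding, with 0 standing for an empty child and leaves printed bare.

from experiments.bst_dataset import BST

tree = BST()
for ch in ['D', 'B', 'F', 'A']:
    tree.insert(ch)
print(str(tree))    # ( D ( B A 0 ) F )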
/experiments/bst_vest.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """
3 | This is the implementation of Copy-NET
4 | We start from the basic Seq2seq framework for an auto-encoder.
5 | """
6 | import logging
7 | import time
8 | import numpy as np
9 | import sys
10 | import copy
11 |
12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
13 | from experiments.config import setup_lcsts, setup_weibo, setup_syn, setup_bst
14 | from emolga.utils.generic_utils import *
15 | from emolga.models.covc_encdec import NRM
16 | from emolga.models.encdec import NRM as NRM0
17 | from emolga.dataset.build_dataset import deserialize_from_file
18 | from collections import OrderedDict
19 | from fuel import datasets
20 | from fuel import transformers
21 | from fuel import schemes
22 |
23 | # setup = setup_lcsts
24 | # setup = setup_syn
25 | setup = setup_bst
26 |
27 |
28 | def init_logging(logfile):
29 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
30 | datefmt='%m/%d/%Y %H:%M:%S' )
31 | fh = logging.FileHandler(logfile)
32 | # ch = logging.StreamHandler()
33 |
34 | fh.setFormatter(formatter)
35 | # ch.setFormatter(formatter)
36 | # fh.setLevel(logging.INFO)
37 | # ch.setLevel(logging.INFO)
38 | # logging.getLogger().addHandler(ch)
39 | logging.getLogger().addHandler(fh)
40 | logging.getLogger().setLevel(logging.INFO)
41 |
42 | return logging
43 |
44 | # prepare logging.
45 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
46 | config = setup() # load settings.
47 | for w in config:
48 | print '{0}={1}'.format(w, config[w])
49 |
50 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
51 | n_rng = np.random.RandomState(config['seed'])
52 | np.random.seed(config['seed'])
53 | rng = RandomStreams(n_rng.randint(2 ** 30))
54 | logger.info('Start!')
55 |
56 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
57 |
58 | if config['voc_size'] == -1: # do not use unk
59 | config['enc_voc_size'] = len(word2idx)
60 | config['dec_voc_size'] = config['enc_voc_size']
61 | else:
62 | config['enc_voc_size'] = config['voc_size']
63 | config['dec_voc_size'] = config['enc_voc_size']
64 |
65 | samples = len(train_set['source'])
66 | logger.info('build dataset done. ' +
67 | 'dataset size: {} ||'.format(samples) +
68 | 'vocabulary size = {0}/ batch size = {1}'.format(
69 | config['dec_voc_size'], config['batch_size']))
70 |
71 |
72 | def build_data(data):
73 | # create fuel dataset.
74 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
75 | ('target', data['target']),
76 | ('target_c', data['target_c']),
77 | ]))
78 | dataset.example_iteration_scheme \
79 | = schemes.ShuffledExampleScheme(dataset.num_examples)
80 | return dataset
81 |
82 |
83 | train_data = build_data(train_set)
84 | train_data_plain = zip(*(train_set['source'], train_set['target']))
85 | test_data_plain = zip(*(test_set['source'], test_set['target']))
86 |
87 | # train_data_plain = zip(*(train_set['source'], train_set['target']))
88 | # test_data_plain = zip(*(test_set['source'], test_set['target']))
89 |
90 | train_size = len(train_data_plain)
91 | test_size = len(test_data_plain)
92 | tr_idx = n_rng.permutation(train_size)[:2000].tolist()
93 | ts_idx = n_rng.permutation(test_size )[:2000].tolist()
94 | logger.info('load the data ok.')
95 | notrain = False
96 |
97 | # build the agent
98 | if config['copynet']:
99 | agent = NRM(config, n_rng, rng, mode=config['mode'],
100 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
101 | else:
102 | agent = NRM0(config, n_rng, rng, mode=config['mode'],
103 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
104 |
105 | agent.build_()
106 | if notrain:
107 | agent.compile_('display')
108 | else:
109 | agent.compile_('all')
110 | print 'compile ok.'
111 |
112 | # load the model
113 | # agent.load(config['path_h5'] +
114 | # '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format('20160229-105153', 1, config['modelname']))
115 |
116 | echo = 0
117 | epochs = 10
118 | skip = -1 # 25000
119 | if echo > 0:
120 | tmark = '20160229-105153' # '20160227-013418' # copynet multi-source model
121 | agent.load(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname']))
122 |
123 | while echo < epochs:
124 | echo += 1
125 | loss = []
126 |
127 | def output_stream(dataset, batch_size, size=1):
128 | data_stream = dataset.get_example_stream()
129 | data_stream = transformers.Batch(data_stream,
130 | iteration_scheme=schemes.ConstantScheme(batch_size))
131 |
132 | # add padding and masks to the dataset
133 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
134 | return data_stream
135 |
136 | def prepare_batch(batch, mask, fix_len=None):
137 | data = batch[mask].astype('int32')
138 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
139 |
140 | def cut_zeros(data, fix_len=None):
141 | if fix_len is not None:
142 | return data[:, : fix_len]
143 | for k in range(data.shape[1] - 1, 0, -1):
144 | data_col = data[:, k].sum()
145 | if data_col > 0:
146 | return data[:, : k + 2]
147 | return data
148 | data = cut_zeros(data, fix_len)
149 | return data
150 |
151 | def cc_martix(source, target):
152 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
153 | for k in xrange(source.shape[0]):
154 | for j in xrange(target.shape[1]):
155 | for i in xrange(source.shape[1]):
156 | if (source[k, i] == target[k, j]) and (source[k, i] > 0):
157 | cc[k][j][i] = 1.
158 | return cc
159 |
160 | def unk_filter(data):
161 | if config['voc_size'] == -1:
162 | return copy.copy(data)
163 | else:
164 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
165 | data = copy.copy(data * mask + (1 - mask))
166 | return data
167 |
168 | # training
169 | if not notrain:
170 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
171 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
172 | progbar = Progbar(train_size / config['batch_size'])
173 | for it, batch in enumerate(train_batches):
174 |
175 | # skip some iterations
176 | if echo == 1 and it < skip:
177 | continue
178 |
179 | # obtain data
180 | data_s = prepare_batch(batch, 'source')
181 | data_t = prepare_batch(batch, 'target')
182 | if config['copynet']:
183 | data_c = cc_martix(data_s, data_t)
184 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
185 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
186 | else:
187 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
188 |
189 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
190 |
191 | if it % 200 == 0:
191 | logger.info('Iteration = {}: evaluation sampling.'.format(it))
193 | logger.info('generating [training set] samples')
194 | for _ in xrange(5):
195 | idx = int(np.floor(n_rng.rand() * train_size))
196 | train_s, train_t = train_data_plain[idx]
197 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
198 | np.asarray(train_t, dtype='int32'),
199 | idx2word,
200 | np.asarray(unk_filter(train_s), dtype='int32'))
201 | print '*' * 50
202 |
203 | logger.info('generating [testing set] samples')
204 | for _ in xrange(5):
205 | idx = int(np.floor(n_rng.rand() * test_size))
206 | test_s, test_t = test_data_plain[idx]
207 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
208 | np.asarray(test_t, dtype='int32'),
209 | idx2word,
210 | np.asarray(unk_filter(test_s), dtype='int32'))
211 | print '*' * 50
212 |
213 | # save the weights.
214 | if it % 5000 == 0:
215 | agent.save(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname']))
216 |
217 | if (it % 5000 == 0) and it > 0:
218 | print 'testing accuracy !!'
219 |
220 | def analysis_(data_plain, t_idx, mode='Training'):
221 | progbar_tr = Progbar(2000)
222 | print '\n' + '__' * 50
223 | cpy, cpy_pos = 0, 0
224 | for it, idx in enumerate(t_idx):
225 | train_s, train_t = data_plain[idx]
226 | c = float(agent.analyse_(np.asarray(train_s, dtype='int32'),
227 | np.asarray(train_t, dtype='int32'),
228 | idx2word))
229 | # copy mode
230 | cpy += 1
231 | cpy_pos += c
232 | progbar_tr.update(it + 1, [('Copy', cpy_pos)])
233 | logger.info('\n{0} Accuracy:\t{1}/{2} = {3}%'.format(
234 | mode, cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
235 | print '==' * 50
236 |
237 | # analysis_(train_data_plain, tr_idx, 'Training')
238 | analysis_(test_data_plain, ts_idx, 'Testing')
239 |
240 |
241 |
242 |
--------------------------------------------------------------------------------
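Editor's note: a small worked example (not from the repository) of the cc_martix helper defined inside the training loop above. For every batch row it marks, at each target position, which source positions carry the same non-padding token; this matrix is the extra supervision fed to the copy mode when config['copynet'] is on.

import numpy as np

def cc_martix(source, target):
    # same construction as in the training loop above
    cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
    for k in range(source.shape[0]):
        for j in range(target.shape[1]):
            for i in range(source.shape[1]):
                if (source[k, i] == target[k, j]) and (source[k, i] > 0):
                    cc[k][j][i] = 1.
    return cc

source = np.array([[4, 7, 4, 0]], dtype='int32')   # token 4 appears twice
target = np.array([[7, 4, 0]], dtype='int32')
print(cc_martix(source, target)[0])
# [[0. 1. 0. 0.]    target token 7 can be copied from source position 1
#  [1. 0. 1. 0.]    target token 4 can be copied from positions 0 and 2
#  [0. 0. 0. 0.]]   padding (0) is never marked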
/experiments/config.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MultiPath/CopyNet/dba24d58d505abaa169e05ee762987f37f41f566/experiments/config.pyc
--------------------------------------------------------------------------------
/experiments/copynet.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the implementation of Copy-NET
3 | We start from the basic Seq2seq framework for an auto-encoder.
4 | """
5 | import logging
6 | import time
7 | import numpy as np
8 |
9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
10 | from experiments.config import setup
11 | from emolga.utils.generic_utils import *
12 | from emolga.models.encdec import *
13 | from emolga.dataset.build_dataset import deserialize_from_file
14 | from collections import OrderedDict
15 | from fuel import datasets
16 | from fuel import transformers
17 | from fuel import schemes
18 |
19 |
20 | def init_logging(logfile):
21 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
22 | datefmt='%m/%d/%Y %H:%M:%S' )
23 | fh = logging.FileHandler(logfile)
24 | # ch = logging.StreamHandler()
25 |
26 | fh.setFormatter(formatter)
27 | # ch.setFormatter(formatter)
28 | # fh.setLevel(logging.INFO)
29 | # ch.setLevel(logging.INFO)
30 | # logging.getLogger().addHandler(ch)
31 | logging.getLogger().addHandler(fh)
32 | logging.getLogger().setLevel(logging.INFO)
33 |
34 | return logging
35 |
36 | # prepare logging.
37 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
38 | config = setup() # load settings.
39 | for w in config:
40 | print '{0}={1}'.format(w, config[w])
41 |
42 | logger = init_logging(config['path_log'] + '/experiments.Copy.id={}.log'.format(tmark))
43 | n_rng = np.random.RandomState(config['seed'])
44 | np.random.seed(config['seed'])
45 | rng = RandomStreams(n_rng.randint(2 ** 30))
46 | logger.info('Start!')
47 |
48 | idx2word, word2idx, idx2word_o, word2idx_o \
49 | = deserialize_from_file(config['voc'])
50 | idx2word_o[0] = '<eol>'
51 | word2idx_o['<eol>'] = 0
52 |
53 | source, target, origin = deserialize_from_file(config['dataset'])
54 | samples = len(source)
55 |
56 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
57 | config['dec_voc_size'] = config['enc_voc_size']
58 | logger.info('build dataset done. ' +
59 | 'dataset size: {} ||'.format(samples) +
60 | 'vocabulary size = {0}/ batch size = {1}'.format(
61 | config['dec_voc_size'], config['batch_size']))
62 |
63 |
64 | def build_data(source, target):
65 | # create fuel dataset.
66 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', source), ('target', target)]))
67 | dataset.example_iteration_scheme \
68 | = schemes.ShuffledExampleScheme(dataset.num_examples)
69 | return dataset, len(source)
70 |
71 |
72 | train_data, train_size = build_data(source[int(0.2 * samples):], target[int(0.2 * samples):])
73 | train_data_plain = zip(*(source[int(0.2 * samples):], target[int(0.2 * samples):], origin[int(0.2 * samples):]))
74 | test_data_plain = zip(*(source[:int(0.2 * samples)], target[:int(0.2 * samples)], origin[:int(0.2 * samples)]))
75 | test_size = len(test_data_plain)
76 | logger.info('load the data ok.')
77 |
78 | # build the agent
79 | agent = NRM(config, n_rng, rng, mode=config['mode'],
80 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
81 | agent.build_()
82 | agent.compile_('all')
83 | print 'compile ok.'
84 |
85 | echo = 0
86 | epochs = 10
87 | while echo < epochs:
88 | echo += 1
89 | loss = []
90 |
91 | def output_stream(dataset, batch_size, size=1):
92 | data_stream = dataset.get_example_stream()
93 | data_stream = transformers.Batch(data_stream,
94 | iteration_scheme=schemes.ConstantScheme(batch_size))
95 |
96 | # add padding and masks to the dataset
97 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
98 | return data_stream
99 |
100 | def prepare_batch(batch, mask):
101 | data = batch[mask].astype('int32')
102 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
103 |
104 | def cut_zeros(data):
105 | for k in range(data.shape[1] - 1, 0, -1):
106 | data_col = data[:, k].sum()
107 | if data_col > 0:
108 | return data[:, : k + 2]
109 | return data
110 | data = cut_zeros(data)
111 | return data
112 |
113 | # training
114 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
115 | logger.info('Epoch = {} -> Training Set Learning...'.format(echo))
116 | progbar = Progbar(train_size / config['batch_size'])
117 | for it, batch in enumerate(train_batches):
118 | # obtain data
119 | data_s, data_t = prepare_batch(batch, 'source'), prepare_batch(batch, 'target')
120 | loss += [agent.train_(data_s, data_t)]
121 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
122 |
123 | if it % 500 == 0:
124 | logger.info('generating [training set] samples')
125 | for _ in xrange(5):
126 | idx = int(np.floor(n_rng.rand() * train_size))
127 | train_s, train_t, train_o = train_data_plain[idx]
128 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
129 | np.asarray(train_t, dtype='int32'),
130 | idx2word, np.asarray(train_o, dtype='int32'), idx2word_o)
131 | print '*' * 50
132 |
133 | logger.info('generating [testing set] samples')
134 | for _ in xrange(5):
135 | idx = int(np.floor(n_rng.rand() * test_size))
136 | test_s, test_t, test_o = test_data_plain[idx]
137 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
138 | np.asarray(test_t, dtype='int32'),
139 | idx2word, np.asarray(test_o, dtype='int32'), idx2word_o)
140 | print '*' * 50
141 |
--------------------------------------------------------------------------------
/experiments/copynet_input.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | import time
4 | import numpy as np
5 | import sys
6 | import copy
7 |
8 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
9 | from experiments.config import setup_lcsts
10 | from emolga.utils.generic_utils import *
11 | from emolga.models.covc_encdec import NRM
12 | from emolga.models.encdec import NRM as NRM0
13 | from emolga.dataset.build_dataset import deserialize_from_file
14 | from collections import OrderedDict
15 | from fuel import datasets
16 | from fuel import transformers
17 | from fuel import schemes
18 |
19 | setup = setup_lcsts
20 |
21 |
22 | def init_logging(logfile):
23 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
24 | datefmt='%m/%d/%Y %H:%M:%S' )
25 | fh = logging.FileHandler(logfile)
26 | # ch = logging.StreamHandler()
27 |
28 | fh.setFormatter(formatter)
29 | # ch.setFormatter(formatter)
30 | # fh.setLevel(logging.INFO)
31 | # ch.setLevel(logging.INFO)
32 | # logging.getLogger().addHandler(ch)
33 | logging.getLogger().addHandler(fh)
34 | logging.getLogger().setLevel(logging.INFO)
35 | return logging
36 |
37 | # prepare logging.
38 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
39 | config = setup() # load settings.
40 | for w in config:
41 | print '{0}={1}'.format(w, config[w])
42 |
43 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
44 | n_rng = np.random.RandomState(config['seed'])
45 | np.random.seed(config['seed'])
46 | rng = RandomStreams(n_rng.randint(2 ** 30))
47 | logger.info('Start!')
48 |
49 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
50 | if config['voc_size'] == -1: # do not use unk
51 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
52 | config['dec_voc_size'] = config['enc_voc_size']
53 | else:
54 | config['enc_voc_size'] = config['voc_size']
55 | config['dec_voc_size'] = config['enc_voc_size']
56 |
57 | samples = len(train_set['source'])
58 |
59 | logger.info('build dataset done. ' +
60 | 'dataset size: {} ||'.format(samples) +
61 | 'vocabulary size = {0}/ batch size = {1}'.format(
62 | config['dec_voc_size'], config['batch_size']))
63 |
64 |
65 | def unk_filter(data):
66 | if config['voc_size'] == -1:
67 | return copy.copy(data)
68 | else:
69 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
70 | data = copy.copy(data * mask + (1 - mask))
71 | return data
72 |
73 | source = '临近 岁末 , 新 基金 发行 步入 旺季 , 11 月份 以来 单周 新基 ' + \
74 | '发行 数 始终保持 35 只 以上 的 高位 , 仅 11 月 25 日 一天 , ' + \
75 | '就 有 12 只 基金 同时 发售 。 国内 首只 公募 对冲 混合型 基金 — 嘉实 绝对 收益 策略 ' + \
76 | '定期 混合 基金 自 发行 首日 便 备受 各界 青睐 , 每日 认购 均 能 达到 上 亿'
77 | target = '首只 公募 对冲 基金 每日 吸金 上 亿'
78 |
79 | test_s = [word2idx[w.decode('utf-8')] for w in source.split()]
80 | test_t = [word2idx[w.decode('utf-8')] for w in target.split()]
81 |
82 | logger.info('load the data ok.')
83 |
84 | logger.info('Evaluate CopyNet')
85 | echo = 9
86 | tmark = '20160226-164053' # '20160221-025049' # copy-net model [no unk]
87 | config['copynet'] = True
88 | agent = NRM(config, n_rng, rng, mode=config['mode'],
89 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
90 | agent.build_()
91 | agent.compile_('display')
92 | agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
93 | logger.info('generating [testing set] samples')
94 |
95 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
96 | np.asarray(test_t, dtype='int32'),
97 | idx2word, np.asarray(unk_filter(test_s), dtype='int32'))
98 | logger.info('Complete!')
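# A minimal sketch of the unk_filter logic above, using a hypothetical cutoff of
# 3000 in place of config['voc_size']: word ids at or above the cutoff collapse
# to 1 (the unk index), everything below passes through unchanged.
demo_cutoff = 3000
demo_ids = np.asarray([5, 42, 3501, 7], dtype='int32')
demo_mask = (np.less(demo_ids, demo_cutoff)).astype(dtype='int32')
demo_filtered = demo_ids * demo_mask + (1 - demo_mask)
# demo_filtered == [5, 42, 1, 7]: only the out-of-vocabulary id 3501 is replaced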
--------------------------------------------------------------------------------
/experiments/dataset.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocess the bAbI dataset.
3 | """
4 | import logging
5 | import os
6 | import sys
7 | import numpy.random as n_rng
8 | from emolga.dataset.build_dataset import serialize_to_file
9 |
10 | data_path = './dataset/bAbI/en-10k/'
11 | data = []
12 | n_rng.seed(19920206)
13 |
14 | for p, folders, docs in os.walk(data_path):
15 | for doc in docs:
16 | with open(os.path.join(p, doc)) as f:
17 | l = f.readline()
18 | while l:
19 | l = l.strip().lower()
20 | l = l[l.find(' ') + 1:]
21 | if len(l.split('\t')) == 1:
22 | data += [l[:-1].split()]
23 | l = f.readline()
24 |
25 | idx2word = dict(enumerate(set([w for l in data for w in l]), 1))
26 | word2idx = {v: k for k, v in idx2word.items()}
27 |
28 | persons = [1, 6, 24, 37, 38, 47, 60, 61, 73, 74, 90, 94, 107, 110, 114]
29 | colors = [3, 20, 34, 48, 99, 121]
30 | shapes = [11, 15, 27, 99]
31 |
32 |
33 | def repeat_name(l):
34 | ll = []
35 | for word in l:
36 | if word2idx[word] in persons:
37 | k = n_rng.randint(5) + 1
38 | ll += [idx2word[persons[i]] for i in n_rng.randint(len(persons), size=k).tolist()]
39 | elif word2idx[word] in colors:
40 | k = n_rng.randint(5) + 1
41 | ll += [idx2word[colors[i]] for i in n_rng.randint(len(colors), size=k).tolist()]
42 | elif word2idx[word] in shapes:
43 | k = n_rng.randint(5) + 1
44 | ll += [idx2word[shapes[i]] for i in n_rng.randint(len(shapes), size=k).tolist()]
45 | else:
46 | ll += [word]
47 | return ll
48 |
49 | data_rep = [repeat_name(l) for l in data]
50 | origin = [[word2idx[w] for w in l] for l in data_rep]
51 |
52 | def replace(word):
53 | if word2idx[word] in [1, 6, 24, 37, 38, 47, 60, 61, 73, 74, 90, 94, 107, 110, 114]:
54 | return '<person>'
55 | elif word2idx[word] in [3, 20, 34, 48, 99, 121]:
56 | return '<color>'
57 | elif word2idx[word] in [11, 15, 27, 99]:
58 | return '<shape>'
59 | else:
60 | return word
61 |
62 | # prepare the vocabulary
63 | data_clean = [[replace(w) for w in l] for l in data_rep]
64 | idx2word2 = dict(enumerate(set([w for l in data_clean for w in l]), 1))
65 | idx2word2[0] = '<eol>'
66 | word2idx2 = {v: k for k, v in idx2word2.items()}
67 | Lmax = len(idx2word2)
68 |
69 | for k in xrange(len(idx2word2)):
70 | print k, '\t', idx2word2[k]
71 | print 'Max: {}'.format(Lmax)
72 |
73 | serialize_to_file([idx2word2, word2idx2, idx2word, word2idx], './dataset/bAbI/voc-b.pkl')
74 |
75 | # get ready for the dataset.
76 | source = [[word2idx2[w] for w in l] for l in data_clean]
77 | target = [[word2idx2[w] if w not in ['<person>', '<color>', '<shape>']
78 | else it + Lmax
79 | for it, w in enumerate(l)] for l in data_clean]
80 |
81 |
82 | def print_str(data):
83 | for d in data:
84 | print ' '.join(str(w) for w in d)
85 |
86 |
87 | print_str(data[10000: 10005])
88 | print_str(data_rep[10000: 10005])
89 | print_str(data_clean[10000: 10005])
90 | print_str(source[10000: 10005])
91 | print_str(target[10000: 10005])
92 |
93 | serialize_to_file([source, target, origin], './dataset/bAbI/dataset-b.pkl')
94 |
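# A toy sketch of the repeat_name transformation above, using a hypothetical
# three-name category in place of the persons/colors/shapes index lists: each
# entity token is replaced by 1-5 tokens drawn uniformly at random from its own
# category, which is the material the copy mechanism later has to reproduce.
toy_persons = ['mary', 'john', 'sandra']

def toy_repeat(word):
    # mirrors the per-token branch of repeat_name for a single category
    if word in toy_persons:
        k = n_rng.randint(5) + 1
        return [toy_persons[i] for i in n_rng.randint(len(toy_persons), size=k).tolist()]
    return [word]

print sum([toy_repeat(w) for w in ['where', 'is', 'mary']], [])
# e.g. ['where', 'is', 'john', 'mary', 'sandra']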
--------------------------------------------------------------------------------
/experiments/lcsts_dataset.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import chardet
3 | import sys
4 | import numpy as np
5 | import jieba as jb
6 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
7 |
8 | word2idx = dict()
9 | wordfreq = dict()
10 | word2idx['<eol>'] = 0
11 | word2idx['<unk>'] = 1
12 |
13 | segment = False # True
14 |
15 | # training set
16 | pairs = []
17 | f = open('./dataset/LCSTS/PART_I/PART_full.txt', 'r')
18 | line = f.readline().strip()
19 | at = 2
20 | lines = 0
21 | while line:
22 | if line == '<summary>':
23 | summary = f.readline().strip().decode('utf-8')
24 | if segment:
25 | summary = [w for w in jb.cut(summary)]
26 |
27 | for w in summary:
28 | if w not in wordfreq:
29 | wordfreq[w] = 1
30 | else:
31 | wordfreq[w] += 1
32 | # if w not in word2idx:
33 | # word2idx[w] = at
34 | # at += 1
35 |
36 | f.readline()
37 | f.readline()
38 | text = f.readline().strip().decode('utf-8')
39 | if segment:
40 | text = [w for w in jb.cut(text)]
41 | for w in text:
42 | if w not in wordfreq:
43 | wordfreq[w] = 1
44 | else:
45 | wordfreq[w] += 1
46 | # if w not in word2idx:
47 | # word2idx[w] = at
48 | # at += 1
49 |
50 | pair = (text, summary)
51 | pairs.append(pair)
52 | lines += 1
53 | if lines % 20000 == 0:
54 | print lines
55 | line = f.readline().strip()
56 |
57 | # testing set
58 | tests = []
59 | f = open('./dataset/LCSTS/PART_II/PART_II.txt', 'r')
60 | line = f.readline().strip()
61 | lines = 0
62 | while line:
63 | if line == '<summary>':
64 | summary = f.readline().strip().decode('utf-8')
65 | if segment:
66 | summary = [w for w in jb.cut(summary)]
67 |
68 | for w in summary:
69 | if w not in wordfreq:
70 | wordfreq[w] = 1
71 | else:
72 | wordfreq[w] += 1
73 | # if w not in word2idx:
74 | # word2idx[w] = at
75 | # at += 1
76 |
77 | f.readline()
78 | f.readline()
79 | text = f.readline().strip().decode('utf-8')
80 | if segment:
81 | text = [w for w in jb.cut(text)]
82 | for w in text:
83 | if w not in wordfreq:
84 | wordfreq[w] = 1
85 | else:
86 | wordfreq[w] += 1
87 | # if w not in word2idx:
88 | # word2idx[w] = at
89 | # at += 1
90 |
91 | pair = (text, summary)
92 | tests.append(pair)
93 | lines += 1
94 | if lines % 20000 == 0:
95 | print lines
96 | line = f.readline().strip()
97 |
98 | print len(pairs), len(tests)
99 |
100 | # sort the vocabulary
101 | wordfreq = sorted(wordfreq.items(), key=lambda a:a[1], reverse=True)
102 | for w in wordfreq:
103 | word2idx[w[0]] = at
104 | at += 1
105 |
106 | idx2word = {k: v for v, k in word2idx.items()}
107 | Lmax = len(idx2word)
108 | print 'read dataset ok.'
109 | print Lmax
110 | for i in xrange(Lmax):
111 | print idx2word[i].encode('utf-8')
112 |
113 | # use character-based model [on]
114 | # use word-based model [off]
115 |
116 |
117 | def build_data(data):
118 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
119 | for pair in data:
120 | source, target = pair
121 | A = [word2idx[w] for w in source]
122 | B = [word2idx[w] for w in target]
123 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
124 | C = [0 if w not in source else source.index(w) + Lmax for w in target]
125 |
126 | instance['text'] += [source]
127 | instance['summary'] += [target]
128 | instance['source'] += [A]
129 | instance['target'] += [B]
130 | # instance['cc_matrix'] += [C]
131 | instance['target_c'] += [C]
132 |
133 | print instance['target'][5000]
134 | print instance['target_c'][5000]
135 | return instance
136 |
137 |
138 | train_set = build_data(pairs)
139 | test_set = build_data(tests)
140 | serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/lcsts_data-char-full.pkl')
141 |
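# A toy example of the target_c encoding built in build_data above, with a
# hypothetical Lmax of 10 standing in for the real vocabulary size: target
# tokens that also occur in the source are encoded as (source position + Lmax),
# and tokens that cannot be copied stay 0.
toy_Lmax = 10
toy_source = [u'新', u'基', u'金']
toy_target = [u'金', u'涨', u'新']
toy_target_c = [0 if w not in toy_source else toy_source.index(w) + toy_Lmax
                for w in toy_target]
# toy_target_c == [12, 0, 10]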
--------------------------------------------------------------------------------
/experiments/lcsts_rouge.py:
--------------------------------------------------------------------------------
1 | """
2 | Evaluation using ROUGE for the LCSTS dataset.
3 | """
4 | # load the testing set.
5 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
6 | import jieba as jb
7 | import logging
8 | import copy
9 | from pyrouge import Rouge155
10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
11 | from experiments.config import setup_lcsts, setup_weibo, setup_syn
12 | from emolga.utils.generic_utils import *
13 | from emolga.models.covc_encdec import NRM
14 | from emolga.models.encdec import NRM as NRM0
15 | from emolga.dataset.build_dataset import deserialize_from_file
16 | from collections import OrderedDict
17 | from fuel import datasets
18 | from fuel import transformers
19 | from fuel import schemes
20 | from pprint import pprint
21 | setup = setup_lcsts
22 |
23 |
24 | def build_evaluation(train_set, segment):
25 | _, _, idx2word, word2idx = deserialize_from_file(train_set)
26 | pairs = []
27 | f = open('./dataset/LCSTS/PART_III/PART_III.txt', 'r')
28 | line = f.readline().strip()
29 | lines = 0
30 | segment = segment
31 | while line:
32 | if '<human_label>' in line:
33 | score = int(line[13])
34 | if score >= 3:
35 | f.readline()
36 | summary = f.readline().strip().decode('utf-8')
37 | if segment:
38 | summary = [w for w in jb.cut(summary)]
39 | target = []
40 | for w in summary:
41 | if w not in word2idx:
42 | word2idx[w] = len(word2idx)
43 | idx2word[len(idx2word)] = w
44 | target += [word2idx[w]]
45 |
46 | f.readline()
47 | f.readline()
48 | text = f.readline().strip().decode('utf-8')
49 | if segment:
50 | text = [w for w in jb.cut(text)]
51 | source = []
52 | for w in text:
53 | if w not in word2idx:
54 | word2idx[w] = len(word2idx)
55 | idx2word[len(idx2word)] = w
56 | source += [word2idx[w]]
57 |
58 | pair = (text, summary, score, source, target)
59 | pairs.append(pair)
60 | lines += 1
61 | if lines % 1000 == 0:
62 | print lines
63 | line = f.readline().strip()
64 | print 'lines={}'.format(len(pairs))
65 | return pairs, word2idx, idx2word
66 |
67 | # words, wwi, wiw = build_evaluation('./dataset/lcsts_data-word-full.pkl', True)
68 | # chars, cwi, ciw = build_evaluation('./dataset/lcsts_data-char-full.pkl', False)
69 | #
70 | # serialize_to_file([words, chars, [wwi, wiw], [cwi, ciw]], './dataset/lcsts_evaluate_data.pkl')
71 |
72 |
73 | def init_logging(logfile):
74 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
75 | datefmt='%m/%d/%Y %H:%M:%S' )
76 | fh = logging.FileHandler(logfile)
77 | # ch = logging.StreamHandler()
78 |
79 | fh.setFormatter(formatter)
80 | # ch.setFormatter(formatter)
81 | # fh.setLevel(logging.INFO)
82 | # ch.setLevel(logging.INFO)
83 | # logging.getLogger().addHandler(ch)
84 | logging.getLogger().addHandler(fh)
85 | logging.getLogger().setLevel(logging.INFO)
86 |
87 | return logging
88 |
89 |
90 | # prepare logging.
91 | config = setup() # load settings.
92 | for w in config:
93 | print '{0}={1}'.format(w, config[w])
94 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
95 | logger = init_logging(config['path_log'] + '/experiments.LCSTS.Eval.id={}.log'.format(tmark))
96 | n_rng = np.random.RandomState(config['seed'])
97 | np.random.seed(config['seed'])
98 | rng = RandomStreams(n_rng.randint(2 ** 30))
99 | logger.info('Start!')
100 |
101 | segment = config['segment']
102 | word_set, char_set, word_voc, char_voc = deserialize_from_file('./dataset/lcsts_evaluate_data.pkl')
103 |
104 | if segment:
105 | eval_set = word_set
106 | word2idx, idx2word = word_voc
107 | else:
108 | eval_set = char_set
109 | word2idx, idx2word = char_voc
110 |
111 | if config['voc_size'] == -1: # do not use unk; keep the full vocabulary
112 | config['enc_voc_size'] = len(word2idx)
113 | config['dec_voc_size'] = config['enc_voc_size']
114 | else:
115 | config['enc_voc_size'] = config['voc_size']
116 | config['dec_voc_size'] = config['enc_voc_size']
117 |
118 | samples = len(eval_set)
119 | logger.info('build dataset done. ' +
120 | 'dataset size: {} ||'.format(samples) +
121 | 'vocabulary size = {0}/ batch size = {1}'.format(
122 | config['dec_voc_size'], config['batch_size']))
123 | logger.info('load the data ok.')
124 |
125 | # build the agent
126 | if config['copynet']:
127 | agent = NRM(config, n_rng, rng, mode=config['mode'],
128 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
129 | else:
130 | agent = NRM0(config, n_rng, rng, mode=config['mode'],
131 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
132 |
133 | agent.build_()
134 | agent.compile_('display')
135 | print 'compile ok.'
136 |
137 | # load the model
138 | agent.load(config['trained_model'])
139 |
140 |
141 | def unk_filter(data):
142 | if config['voc_size'] == -1:
143 | return copy.copy(data)
144 | else:
145 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
146 | data = copy.copy(data * mask + (1 - mask))
147 | return data
148 |
149 | rouge = Rouge155(n_words=40)
150 | evalsets = {'rouge_1_f_score': 'R1',
151 | 'rouge_2_f_score': 'R2',
152 | 'rouge_3_f_score': 'R3',
153 | 'rouge_4_f_score': 'R4',
154 | 'rouge_l_f_score': 'RL',
155 | 'rouge_su4_f_score': 'RSU4'}
156 | scores = dict()
157 | for id, sample in enumerate(eval_set):
158 | text, summary, score, source, target = sample
159 | v = agent.evaluate_(np.asarray(source, dtype='int32'),
160 | np.asarray(target, dtype='int32'),
161 | idx2word,
162 | np.asarray(unk_filter(source), dtype='int32')).decode('utf-8').split('\n')
163 |
164 | print 'ID = {} ||'.format(id) + '*' * 50
165 | ref = ' '.join(['t{}'.format(char_voc[0][u]) for u in ''.join([w for w in v[2][9:].split()])])
166 | sym = ' '.join(['t{}'.format(char_voc[0][u]) for u in ''.join([w for w in v[3][9:].split()])])
167 |
168 | sssss = rouge.score_summary(sym, {'A': ref})
169 |
170 | for si in sssss:
171 | if si not in scores:
172 | scores[si] = sssss[si]
173 | else:
174 | scores[si] += sssss[si]
175 |
176 | for e in evalsets:
177 | print '{0}: {1}'.format(evalsets[e], scores[e] / (id + 1)),
178 | print './.'
179 |
180 | # average
181 | for si in scores:
182 | scores[si] /= float(len(eval_set))
183 |
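# The ' '.join constructions above turn a character sequence into
# whitespace-separated pseudo-tokens ('t' followed by the character id) so that
# ROUGE, which splits on spaces, scores the Chinese output character by
# character. A toy sketch with a hypothetical two-entry character vocabulary
# standing in for char_voc[0]:
toy_char2idx = {u'a': 104, u'b': 87}
toy_tokens = ' '.join(['t{}'.format(toy_char2idx[u]) for u in u'ab'])
# toy_tokens == 't104 t87'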
--------------------------------------------------------------------------------
/experiments/lcsts_sample.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the implementation of Copy-NET
3 | We start from the basic Seq2seq framework for a auto-encoder.
4 | """
5 | import logging
6 | import time
7 | import numpy as np
8 | import sys
9 | import copy
10 |
11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
12 | from experiments.config import setup_lcsts
13 | from emolga.utils.generic_utils import *
14 | from emolga.models.covc_encdec import NRM
15 | from emolga.models.encdec import NRM as NRM0
16 | from emolga.dataset.build_dataset import deserialize_from_file
17 | from collections import OrderedDict
18 | from fuel import datasets
19 | from fuel import transformers
20 | from fuel import schemes
21 |
22 | setup = setup_lcsts
23 |
24 |
25 | def init_logging(logfile):
26 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
27 | datefmt='%m/%d/%Y %H:%M:%S' )
28 | fh = logging.FileHandler(logfile)
29 | # ch = logging.StreamHandler()
30 |
31 | fh.setFormatter(formatter)
32 | # ch.setFormatter(formatter)
33 | # fh.setLevel(logging.INFO)
34 | # ch.setLevel(logging.INFO)
35 | # logging.getLogger().addHandler(ch)
36 | logging.getLogger().addHandler(fh)
37 | logging.getLogger().setLevel(logging.INFO)
38 | return logging
39 |
40 | # prepare logging.
41 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
42 | config = setup() # load settings.
43 | for w in config:
44 | print '{0}={1}'.format(w, config[w])
45 |
46 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
47 | n_rng = np.random.RandomState(config['seed'])
48 | np.random.seed(config['seed'])
49 | rng = RandomStreams(n_rng.randint(2 ** 30))
50 | logger.info('Start!')
51 |
52 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
53 | if config['voc_size'] == -1: # do not use unk; keep the full vocabulary
54 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
55 | config['dec_voc_size'] = config['enc_voc_size']
56 | else:
57 | config['enc_voc_size'] = config['voc_size']
58 | config['dec_voc_size'] = config['enc_voc_size']
59 |
60 | samples = len(train_set['source'])
61 |
62 | logger.info('build dataset done. ' +
63 | 'dataset size: {} ||'.format(samples) +
64 | 'vocabulary size = {0}/ batch size = {1}'.format(
65 | config['dec_voc_size'], config['batch_size']))
66 |
67 |
68 | def build_data(data):
69 | # create fuel dataset.
70 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
71 | ('target', data['target']),
72 | ('target_c', data['target_c']),
73 | ]))
74 | dataset.example_iteration_scheme \
75 | = schemes.ShuffledExampleScheme(dataset.num_examples)
76 | return dataset
77 |
78 |
79 | def unk_filter(data):
80 | if config['voc_size'] == -1:
81 | return copy.copy(data)
82 | else:
83 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
84 | data = copy.copy(data * mask + (1 - mask))
85 | return data
86 |
87 |
88 | train_data_plain = zip(*(train_set['source'], train_set['target']))
89 | test_data_plain = zip(*(test_set['source'], test_set['target']))
90 | train_size = len(train_data_plain)
91 | test_size = len(test_data_plain)
92 | tr_idx = n_rng.permutation(train_size)[:2000].tolist()
93 | ts_idx = n_rng.permutation(test_size)[:100].tolist()
94 |
95 | logger.info('load the data ok.')
96 |
97 | # logger.info('Evaluate Enc-Dec')
98 | # log_gen = open(config['path_log'] + '/experiments.CopyLCSTS.generate_{}.log'.format(0), 'w')
99 | # config['copynet'] = True
100 | # echo = 10
101 | # tmark = '20160224-185023' # '20160221-171853' # enc-dec model [no unk]
102 | # agent = NRM(config, n_rng, rng, mode=config['mode'],
103 | # use_attention=True, copynet=config['copynet'], identity=config['identity'])
104 | # agent.build_()
105 | # agent.compile_('display')
106 | # agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
107 | # logger.info('generating [testing set] samples')
108 | # for idx in ts_idx:
109 | # # idx = int(np.floor(n_rng.rand() * test_size))
110 | # test_s, test_t = test_data_plain[idx]
111 | # v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
112 | # np.asarray(test_t, dtype='int32'),
113 | # idx2word)
114 | # log_gen.write(v)
115 | # log_gen.write('*' * 50 + '\n')
116 | # log_gen.close()
117 |
118 | logger.info('Evaluate CopyNet')
119 | echo = 6
120 | tmark = '20160224-185023' # '20160221-025049' # copy-net model [no unk]
121 | log_cp = open(config['path_logX'] + '/experiments.copy_{0}_{1}.log'.format(tmark, echo), 'w')
122 | config['copynet'] = True
123 | agent = NRM(config, n_rng, rng, mode=config['mode'],
124 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
125 | agent.build_()
126 | agent.compile_('display')
127 | agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
128 | logger.info('generating [testing set] samples')
129 | for idx in ts_idx:
130 | # idx = int(np.floor(n_rng.rand() * test_size))
131 | test_s, test_t = test_data_plain[idx]
132 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
133 | np.asarray(test_t, dtype='int32'),
134 | idx2word, np.asarray(unk_filter(test_s), dtype='int32'))
135 | log_cp.write(v)
136 | log_cp.write('*' * 50 + '\n')
137 | log_cp.close()
138 | logger.info('Complete!')
--------------------------------------------------------------------------------
/experiments/lcsts_test.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the implementation of Copy-NET
3 | We start from the basic Seq2seq framework for an auto-encoder.
4 | """
5 | import logging
6 | import time
7 | import numpy as np
8 | import sys
9 |
10 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
11 | from experiments.config import setup_lcsts
12 | from emolga.utils.generic_utils import *
13 | from emolga.models.covc_encdec import NRM
14 | from emolga.models.encdec import NRM as NRM0
15 | from emolga.dataset.build_dataset import deserialize_from_file
16 | from collections import OrderedDict
17 | from fuel import datasets
18 | from fuel import transformers
19 | from fuel import schemes
20 |
21 | setup = setup_lcsts
22 |
23 |
24 | def init_logging(logfile):
25 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
26 | datefmt='%m/%d/%Y %H:%M:%S' )
27 | fh = logging.FileHandler(logfile)
28 | # ch = logging.StreamHandler()
29 |
30 | fh.setFormatter(formatter)
31 | # ch.setFormatter(formatter)
32 | # fh.setLevel(logging.INFO)
33 | # ch.setLevel(logging.INFO)
34 | # logging.getLogger().addHandler(ch)
35 | logging.getLogger().addHandler(fh)
36 | logging.getLogger().setLevel(logging.INFO)
37 |
38 | return logging
39 |
40 | # prepare logging.
41 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
42 | config = setup() # load settings.
43 | for w in config:
44 | print '{0}={1}'.format(w, config[w])
45 |
46 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
47 | n_rng = np.random.RandomState(config['seed'])
48 | np.random.seed(config['seed'])
49 | rng = RandomStreams(n_rng.randint(2 ** 30))
50 | logger.info('Start!')
51 |
52 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
53 |
54 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
55 | config['dec_voc_size'] = config['enc_voc_size']
56 | samples = len(train_set['source'])
57 |
58 | logger.info('build dataset done. ' +
59 | 'dataset size: {} ||'.format(samples) +
60 | 'vocabulary size = {0}/ batch size = {1}'.format(
61 | config['dec_voc_size'], config['batch_size']))
62 |
63 |
64 | def build_data(data):
65 | # create fuel dataset.
66 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
67 | ('target', data['target']),
68 | ('target_c', data['target_c']),
69 | ]))
70 | dataset.example_iteration_scheme \
71 | = schemes.ShuffledExampleScheme(dataset.num_examples)
72 | return dataset
73 |
74 |
75 | train_data = build_data(train_set)
76 | train_data_plain = zip(*(train_set['source'], train_set['target']))
77 | test_data_plain = zip(*(test_set['source'], test_set['target']))
78 |
79 | train_size = len(train_data_plain)
80 | test_size = len(test_data_plain)
81 | tr_idx = n_rng.permutation(train_size)[:2000].tolist()
82 | ts_idx = n_rng.permutation(test_size )[:2000].tolist()
83 | logger.info('load the data ok.')
84 |
85 | # build the agent
86 | if config['copynet']:
87 | agent = NRM(config, n_rng, rng, mode=config['mode'],
88 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
89 | else:
90 | agent = NRM0(config, n_rng, rng, mode=config['mode'],
91 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
92 |
93 | agent.build_()
94 | agent.compile_('all')
95 | print 'compile ok.'
96 |
97 | echo = 2
98 | epochs = 10
99 | if echo > 0:
100 | tmark = '20160217-232113'
101 | agent.load(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
102 |
103 | while echo < epochs:
104 | echo += 1
105 | loss = []
106 |
107 | def output_stream(dataset, batch_size, size=1):
108 | data_stream = dataset.get_example_stream()
109 | data_stream = transformers.Batch(data_stream,
110 | iteration_scheme=schemes.ConstantScheme(batch_size))
111 |
112 | # add padding and masks to the dataset
113 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c'))
114 | return data_stream
115 |
116 | def prepare_batch(batch, mask, fix_len=None):
117 | data = batch[mask].astype('int32')
118 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
119 |
120 | def cut_zeros(data, fix_len=None):
121 | if fix_len is not None:
122 | return data[:, : fix_len]
123 | for k in range(data.shape[1] - 1, 0, -1):
124 | data_col = data[:, k].sum()
125 | if data_col > 0:
126 | return data[:, : k + 2]
127 | return data
128 | data = cut_zeros(data, fix_len)
129 | return data
130 |
131 | # training
132 | notrain = False
133 | if not notrain:
134 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
135 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
136 | progbar = Progbar(train_size / config['batch_size'])
137 | for it, batch in enumerate(train_batches):
138 | # obtain data
139 | data_s = prepare_batch(batch, 'source')
140 | data_t = prepare_batch(batch, 'target')
141 | data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
142 |
143 | if config['copynet']:
144 | loss += [agent.train_(data_s, data_t, data_c)]
145 | else:
146 | loss += [agent.train_(data_s, data_t)]
147 |
148 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
149 |
150 | if it % 200 == 0:
151 | logger.info('Echo={} Evaluation Sampling.'.format(it))
152 | logger.info('generating [training set] samples')
153 | for _ in xrange(5):
154 | idx = int(np.floor(n_rng.rand() * train_size))
155 | train_s, train_t = train_data_plain[idx]
156 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
157 | np.asarray(train_t, dtype='int32'),
158 | idx2word)
159 | print '*' * 50
160 |
161 | logger.info('generating [testing set] samples')
162 | for _ in xrange(5):
163 | idx = int(np.floor(n_rng.rand() * test_size))
164 | test_s, test_t = test_data_plain[idx]
165 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
166 | np.asarray(test_t, dtype='int32'),
167 | idx2word)
168 | print '*' * 50
169 |
170 | # save the weights.
171 | agent.save(config['path_h5'] + '/experiments.CopyLCSTS.id={0}.epoch={1}.pkl'.format(tmark, echo))
172 |
173 | # # test accuracy
174 | # progbar_tr = Progbar(2000)
175 | #
176 | # print '\n' + '__' * 50
177 | # gen, gen_pos = 0, 0
178 | # cpy, cpy_pos = 0, 0
179 | # for it, idx in enumerate(tr_idx):
180 | # train_s, train_t = train_data_plain[idx]
181 | #
182 | # c = agent.analyse_(np.asarray(train_s, dtype='int32'),
183 | # np.asarray(train_t, dtype='int32'),
184 | # idx2word)
185 | # if c[1] == 0:
186 | # # generation mode
187 | # gen += 1
188 | # gen_pos += c[0]
189 | # else:
190 | # # copy mode
191 | # cpy += 1
192 | # cpy_pos += c[0]
193 | #
194 | # progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
195 | #
196 | # logger.info('\nTraining Accuracy:' +
197 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
198 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)))
199 | #
200 | # progbar_ts = Progbar(2000)
201 | # print '\n' + '__' * 50
202 | # gen, gen_pos = 0, 0
203 | # cpy, cpy_pos = 0, 0
204 | # for it, idx in enumerate(ts_idx):
205 | # test_s, test_t = test_data_plain[idx]
206 | # c = agent.analyse_(np.asarray(test_s, dtype='int32'),
207 | # np.asarray(test_t, dtype='int32'),
208 | # idx2word)
209 | # if c[1] == 0:
210 | # # generation mode
211 | # gen += 1
212 | # gen_pos += c[0]
213 | # else:
214 | # # copy mode
215 | # cpy += 1
216 | # cpy_pos += c[0]
217 | #
218 | # progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
219 | #
220 | # logger.info('\nTesting Accuracy:' +
221 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
222 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)))
223 |
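# A minimal sketch of the cut_zeros trimming inside prepare_batch above: scan
# from the right for the last column that still contains a non-zero id and keep
# one trailing all-zero column as the end-of-sequence marker.
toy_batch = np.asarray([[4, 7, 0, 0, 0],
                        [3, 0, 0, 0, 0]], dtype='int32')
for k in range(toy_batch.shape[1] - 1, 0, -1):
    if toy_batch[:, k].sum() > 0:
        toy_trimmed = toy_batch[:, : k + 2]
        break
# toy_trimmed has shape (2, 3); its rows are [4, 7, 0] and [3, 0, 0]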
--------------------------------------------------------------------------------
/experiments/lcsts_vest.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the implementation of Copy-NET
3 | We start from the basic Seq2seq framework for an auto-encoder.
4 | """
5 | import logging
6 | import time
7 | import numpy as np
8 | import sys
9 | import copy
10 |
11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
12 | from experiments.config import setup_lcsts, setup_weibo, setup_syn
13 | from emolga.utils.generic_utils import *
14 | from emolga.models.covc_encdec import NRM
15 | from emolga.models.encdec import NRM as NRM0
16 | from emolga.dataset.build_dataset import deserialize_from_file
17 | from collections import OrderedDict
18 | from fuel import datasets
19 | from fuel import transformers
20 | from fuel import schemes
21 |
22 | setup = setup_lcsts
23 | # setup = setup_syn
24 |
25 |
26 | def init_logging(logfile):
27 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
28 | datefmt='%m/%d/%Y %H:%M:%S' )
29 | fh = logging.FileHandler(logfile)
30 | # ch = logging.StreamHandler()
31 |
32 | fh.setFormatter(formatter)
33 | # ch.setFormatter(formatter)
34 | # fh.setLevel(logging.INFO)
35 | # ch.setLevel(logging.INFO)
36 | # logging.getLogger().addHandler(ch)
37 | logging.getLogger().addHandler(fh)
38 | logging.getLogger().setLevel(logging.INFO)
39 |
40 | return logging
41 |
42 | # prepare logging.
43 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
44 | config = setup() # load settings.
45 | for w in config:
46 | print '{0}={1}'.format(w, config[w])
47 |
48 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTSXXX.id={}.log'.format(tmark))
49 | n_rng = np.random.RandomState(config['seed'])
50 | np.random.seed(config['seed'])
51 | rng = RandomStreams(n_rng.randint(2 ** 30))
52 | logger.info('Start!')
53 |
54 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
55 |
56 | if config['voc_size'] == -1: # do not use unk; keep the full vocabulary
57 | config['enc_voc_size'] = len(word2idx)
58 | config['dec_voc_size'] = config['enc_voc_size']
59 | else:
60 | config['enc_voc_size'] = config['voc_size']
61 | config['dec_voc_size'] = config['enc_voc_size']
62 |
63 | samples = len(train_set['source'])
64 | logger.info('build dataset done. ' +
65 | 'dataset size: {} ||'.format(samples) +
66 | 'vocabulary size = {0}/ batch size = {1}'.format(
67 | config['dec_voc_size'], config['batch_size']))
68 |
69 |
70 | def build_data(data):
71 | # create fuel dataset.
72 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
73 | ('target', data['target']),
74 | ('target_c', data['target_c']),
75 | ]))
76 | dataset.example_iteration_scheme \
77 | = schemes.ShuffledExampleScheme(dataset.num_examples)
78 | return dataset
79 |
80 |
81 | train_data = build_data(train_set)
82 | train_data_plain = zip(*(train_set['source'], train_set['target']))
83 | test_data_plain = zip(*(test_set['source'], test_set['target']))
84 |
85 | # train_data_plain = zip(*(train_set['source'], train_set['target']))
86 | # test_data_plain = zip(*(test_set['source'], test_set['target']))
87 |
88 | train_size = len(train_data_plain)
89 | test_size = len(test_data_plain)
90 | tr_idx = n_rng.permutation(train_size)[:2000].tolist()
91 | ts_idx = n_rng.permutation(test_size )[:2000].tolist()
92 | logger.info('load the data ok.')
93 |
94 | # build the agent
95 | if config['copynet']:
96 | agent = NRM(config, n_rng, rng, mode=config['mode'],
97 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
98 | else:
99 | agent = NRM0(config, n_rng, rng, mode=config['mode'],
100 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
101 |
102 |
103 | echo = 0
104 | epochs = 10
105 | if echo > 0:
106 | tmark = '20160221-025049' # copynet multi-source model
107 | agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.pkl'.format(tmark, echo))
108 |
109 | # recover from NaN: reload the last good checkpoint and fast-forward past batches already seen
110 | tmark = '20160307-135907' # '20160301-105813'
111 | skip = 26000
112 | sep = 1
113 |
114 | # tmark = '20160301-114653'
115 | # skip = 14000
116 | agent.build_(0.001, 26000)
117 | agent.compile_('all')
118 | print 'compile ok.'
119 | agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(tmark, sep, skip))
120 |
121 |
122 | while echo < epochs:
123 | echo += 1
124 | loss = []
125 |
126 | def output_stream(dataset, batch_size, size=1):
127 | data_stream = dataset.get_example_stream()
128 | data_stream = transformers.Batch(data_stream,
129 | iteration_scheme=schemes.ConstantScheme(batch_size))
130 |
131 | # add padding and masks to the dataset
132 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
133 | return data_stream
134 |
135 | def prepare_batch(batch, mask, fix_len=None):
136 | data = batch[mask].astype('int32')
137 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
138 |
139 | def cut_zeros(data, fix_len=None):
140 | if fix_len is not None:
141 | return data[:, : fix_len]
142 | for k in range(data.shape[1] - 1, 0, -1):
143 | data_col = data[:, k].sum()
144 | if data_col > 0:
145 | return data[:, : k + 2]
146 | return data
147 | data = cut_zeros(data, fix_len)
148 | return data
149 |
150 | def cc_martix(source, target):
151 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
152 | for k in xrange(source.shape[0]):
153 | for j in xrange(target.shape[1]):
154 | for i in xrange(source.shape[1]):
155 | if (source[k, i] == target[k, j]) and (source[k, i] > 0):
156 | cc[k][j][i] = 1.
157 | return cc
158 |
159 | def unk_filter(data):
160 | if config['voc_size'] == -1:
161 | return copy.copy(data)
162 | else:
163 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
164 | data = copy.copy(data * mask + (1 - mask))
165 | return data
166 |
167 | # training
168 | notrain = False
169 | if not notrain:
170 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
171 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
172 | progbar = Progbar(train_size / config['batch_size'])
173 | for it, batch in enumerate(train_batches):
174 | if (echo < sep):
175 | continue
176 |
177 | if (echo == sep) and (skip > it):
178 | if it % 200 == 0:
179 | print it
180 | continue
181 |
182 | # obtain data
183 | data_s = prepare_batch(batch, 'source')
184 | data_t = prepare_batch(batch, 'target')
185 | if config['copynet']:
186 | data_c = cc_martix(data_s, data_t)
187 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
188 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
189 | else:
190 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
191 |
192 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
193 |
194 | if it % 200 == 0:
195 | logger.info('Echo={} Evaluation Sampling.'.format(it))
196 | logger.info('generating [training set] samples')
197 | for _ in xrange(5):
198 | idx = int(np.floor(n_rng.rand() * train_size))
199 | train_s, train_t = train_data_plain[idx]
200 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
201 | np.asarray(train_t, dtype='int32'),
202 | idx2word,
203 | np.asarray(unk_filter(train_s), dtype='int32'))
204 | print '*' * 50
205 |
206 | logger.info('generating [testing set] samples')
207 | for _ in xrange(5):
208 | idx = int(np.floor(n_rng.rand() * test_size))
209 | test_s, test_t = test_data_plain[idx]
210 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
211 | np.asarray(test_t, dtype='int32'),
212 | idx2word,
213 | np.asarray(unk_filter(test_s), dtype='int32'))
214 | print '*' * 50
215 |
216 | # save the weights.
217 | if it % 2000 == 0:
218 | agent.save(config['path_h5'] +
219 | '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(
220 | tmark, echo, it))
221 |
222 | # # test accuracy
223 | test = False
224 | if test:
225 | progbar_tr = Progbar(2000)
226 |
227 | print '\n' + '__' * 50
228 | cpy, cpy_pos = 0, 0
229 | for it, idx in enumerate(tr_idx):
230 | train_s, train_t = train_data_plain[idx]
231 |
232 | c = agent.analyse_(np.asarray(train_s, dtype='int32'),
233 | np.asarray(train_t, dtype='int32'),
234 | idx2word)
235 | # copy mode
236 | cpy += 1
237 | cpy_pos += c
238 | progbar_tr.update(it + 1, [('Copy', cpy_pos)])
239 |
240 | logger.info('\nTraining Accuracy:' +
241 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
242 |
243 | progbar_ts = Progbar(2000)
244 | print '\n' + '__' * 50
245 |
246 | cpy, cpy_pos = 0, 0
247 | for it, idx in enumerate(ts_idx):
248 | test_s, test_t = test_data_plain[idx]
249 | c = agent.analyse_(np.asarray(test_s, dtype='int32'),
250 | np.asarray(test_t, dtype='int32'),
251 | idx2word)
252 | cpy += 1
253 | cpy_pos += c
254 | progbar_ts.update(it + 1, [('Copy', cpy_pos)])
255 |
256 | logger.info('\nTesting Accuracy:' +
257 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
258 |
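# cc_martix above builds, for every batch element, a (target length x source
# length) indicator of exact id matches while skipping padding (id 0). Below is
# a hedged sketch of an equivalent vectorized construction via numpy
# broadcasting; it is not the code used for the reported experiments, only an
# illustration of the same semantics.
def cc_matrix_broadcast(source, target):
    # (batch, 1, S) compared with (batch, T, 1) broadcasts to (batch, T, S)
    eq = (source[:, None, :] == target[:, :, None])
    return (eq * (source[:, None, :] > 0)).astype('float32')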
--------------------------------------------------------------------------------
/experiments/lcsts_vest_new.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the implementation of Copy-NET
3 | We start from the basic Seq2seq framework for an auto-encoder.
4 | """
5 | import logging
6 | import time
7 | import numpy as np
8 | import sys
9 | import copy
10 |
11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
12 | from experiments.config import setup_lcsts, setup_weibo, setup_syn
13 | from emolga.utils.generic_utils import *
14 | from emolga.models.covc_encdec import NRM
15 | from emolga.models.encdec import NRM as NRM0
16 | from emolga.dataset.build_dataset import deserialize_from_file
17 | from collections import OrderedDict
18 | from fuel import datasets
19 | from fuel import transformers
20 | from fuel import schemes
21 |
22 | setup = setup_lcsts
23 | # setup = setup_syn
24 |
25 |
26 | def init_logging(logfile):
27 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
28 | datefmt='%m/%d/%Y %H:%M:%S' )
29 | fh = logging.FileHandler(logfile)
30 | # ch = logging.StreamHandler()
31 |
32 | fh.setFormatter(formatter)
33 | # ch.setFormatter(formatter)
34 | # fh.setLevel(logging.INFO)
35 | # ch.setLevel(logging.INFO)
36 | # logging.getLogger().addHandler(ch)
37 | logging.getLogger().addHandler(fh)
38 | logging.getLogger().setLevel(logging.INFO)
39 |
40 | return logging
41 |
42 | # prepare logging.
43 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
44 | config = setup() # load settings.
45 | for w in config:
46 | print '{0}={1}'.format(w, config[w])
47 |
48 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTSXXX.id={}.log'.format(tmark))
49 | n_rng = np.random.RandomState(config['seed'])
50 | np.random.seed(config['seed'])
51 | rng = RandomStreams(n_rng.randint(2 ** 30))
52 | logger.info('Start!')
53 |
54 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
55 |
56 | if config['voc_size'] == -1: # do not use unk; keep the full vocabulary
57 | config['enc_voc_size'] = len(word2idx)
58 | config['dec_voc_size'] = config['enc_voc_size']
59 | else:
60 | config['enc_voc_size'] = config['voc_size']
61 | config['dec_voc_size'] = config['enc_voc_size']
62 |
63 | samples = len(train_set['source'])
64 | logger.info('build dataset done. ' +
65 | 'dataset size: {} ||'.format(samples) +
66 | 'vocabulary size = {0}/ batch size = {1}'.format(
67 | config['dec_voc_size'], config['batch_size']))
68 |
69 |
70 | def build_data(data):
71 | # create fuel dataset.
72 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
73 | ('target', data['target']),
74 | ('target_c', data['target_c']),
75 | ]))
76 | dataset.example_iteration_scheme \
77 | = schemes.ShuffledExampleScheme(dataset.num_examples)
78 | return dataset
79 |
80 |
81 | train_data = build_data(train_set)
82 | train_data_plain = zip(*(train_set['source'], train_set['target']))
83 | test_data_plain = zip(*(test_set['source'], test_set['target']))
84 |
85 | # train_data_plain = zip(*(train_set['source'], train_set['target']))
86 | # test_data_plain = zip(*(test_set['source'], test_set['target']))
87 |
88 | train_size = len(train_data_plain)
89 | test_size = len(test_data_plain)
90 | tr_idx = n_rng.permutation(train_size)[:2000].tolist()
91 | ts_idx = n_rng.permutation(test_size )[:2000].tolist()
92 | logger.info('load the data ok.')
93 |
94 | # build the agent
95 | if config['copynet']:
96 | agent = NRM(config, n_rng, rng, mode=config['mode'],
97 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
98 | else:
99 | agent = NRM0(config, n_rng, rng, mode=config['mode'],
100 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
101 |
102 | agent.build_()
103 | agent.compile_('all')
104 | print 'compile ok.'
105 |
106 | echo = 0
107 | epochs = 10
108 | if echo > 0:
109 | tmark = '20160221-025049' # copynet multi-source model
110 | agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.pkl'.format(tmark, echo))
111 |
112 | # recover from NaN: reload a checkpoint and fast-forward past batches already seen (disabled here: skip = sep = -1)
113 | # tmark = '20160301-105813'
114 | skip = -1 #100000
115 | sep = -1 #2
116 |
117 | # tmark = '20160301-114653'
118 | # skip = 14000
119 | # agent.load(config['path_h5'] + '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(tmark, sep, skip))
120 |
121 |
122 | while echo < epochs:
123 | echo += 1
124 | loss = []
125 |
126 | def output_stream(dataset, batch_size, size=1):
127 | data_stream = dataset.get_example_stream()
128 | data_stream = transformers.Batch(data_stream,
129 | iteration_scheme=schemes.ConstantScheme(batch_size))
130 |
131 | # add padding and masks to the dataset
132 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
133 | return data_stream
134 |
135 | def prepare_batch(batch, mask, fix_len=None):
136 | data = batch[mask].astype('int32')
137 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
138 |
139 | def cut_zeros(data, fix_len=None):
140 | if fix_len is not None:
141 | return data[:, : fix_len]
142 | for k in range(data.shape[1] - 1, 0, -1):
143 | data_col = data[:, k].sum()
144 | if data_col > 0:
145 | return data[:, : k + 2]
146 | return data
147 | data = cut_zeros(data, fix_len)
148 | return data
149 |
150 | def cc_martix(source, target):
151 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
152 | for k in xrange(source.shape[0]):
153 | for j in xrange(target.shape[1]):
154 | for i in xrange(source.shape[1]):
155 | if (source[k, i] == target[k, j]) and (source[k, i] > 0):
156 | cc[k][j][i] = 1.
157 | return cc
158 |
159 | def unk_filter(data):
160 | if config['voc_size'] == -1:
161 | return copy.copy(data)
162 | else:
163 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
164 | data = copy.copy(data * mask + (1 - mask))
165 | return data
166 |
167 | # training
168 | notrain = False
169 | if not notrain:
170 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
171 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
172 | progbar = Progbar(train_size / config['batch_size'])
173 | for it, batch in enumerate(train_batches):
174 | if (echo < sep):
175 | continue
176 |
177 | if (echo == sep) and (skip > it):
178 | if it % 200 == 0:
179 | print it
180 | continue
181 |
182 | # obtain data
183 | data_s = prepare_batch(batch, 'source')
184 | data_t = prepare_batch(batch, 'target')
185 | if config['copynet']:
186 | data_c = cc_martix(data_s, data_t)
187 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
188 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
189 | else:
190 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
191 |
192 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
193 |
194 | if it % 200 == 0:
195 | logger.info('Echo={} Evaluation Sampling.'.format(it))
196 | logger.info('generating [training set] samples')
197 | for _ in xrange(5):
198 | idx = int(np.floor(n_rng.rand() * train_size))
199 | train_s, train_t = train_data_plain[idx]
200 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
201 | np.asarray(train_t, dtype='int32'),
202 | idx2word,
203 | np.asarray(unk_filter(train_s), dtype='int32'))
204 | print '*' * 50
205 |
206 | logger.info('generating [testing set] samples')
207 | for _ in xrange(5):
208 | idx = int(np.floor(n_rng.rand() * test_size))
209 | test_s, test_t = test_data_plain[idx]
210 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
211 | np.asarray(test_t, dtype='int32'),
212 | idx2word,
213 | np.asarray(unk_filter(test_s), dtype='int32'))
214 | print '*' * 50
215 |
216 | # save the weights.
217 | if it % 2000 == 0:
218 | agent.save(config['path_h5'] +
219 | '/experiments.CopyLCSTSXXX.id={0}.epoch={1}.iter={2}.pkl'.format(
220 | tmark, echo, it))
221 |
222 | # # test accuracy
223 | test = False
224 | if test:
225 | progbar_tr = Progbar(2000)
226 |
227 | print '\n' + '__' * 50
228 | cpy, cpy_pos = 0, 0
229 | for it, idx in enumerate(tr_idx):
230 | train_s, train_t = train_data_plain[idx]
231 |
232 | c = agent.analyse_(np.asarray(train_s, dtype='int32'),
233 | np.asarray(train_t, dtype='int32'),
234 | idx2word)
235 | # copy mode
236 | cpy += 1
237 | cpy_pos += c
238 | progbar_tr.update(it + 1, [('Copy', cpy_pos)])
239 |
240 | logger.info('\nTraining Accuracy:' +
241 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
242 |
243 | progbar_ts = Progbar(2000)
244 | print '\n' + '__' * 50
245 |
246 | cpy, cpy_pos = 0, 0
247 | for it, idx in enumerate(ts_idx):
248 | test_s, test_t = test_data_plain[idx]
249 | c = agent.analyse_(np.asarray(test_s, dtype='int32'),
250 | np.asarray(test_t, dtype='int32'),
251 | idx2word)
252 | cpy += 1
253 | cpy_pos += c
254 | progbar_ts.update(it + 1, [('Copy', cpy_pos)])
255 |
256 | logger.info('\nTesting Accuracy:' +
257 | '\t{0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
258 |
--------------------------------------------------------------------------------
/experiments/movie_dataset.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
3 | import string
4 | import random
5 | import sys
6 |
7 | random.seed(19920206)
8 | word2idx = dict()
9 | wordfreq = dict()
10 | word2idx['<eol>'] = 0
11 | word2idx['<unk>'] = 1
12 | word2freq = dict()
13 |
14 |
15 | def mark(line):
16 | tmp_line = ''
17 | for c in line:
18 | if c in string.punctuation:
19 | if c != "'":
20 | tmp_line += ' ' + c + ' '
21 | else:
22 | tmp_line += ' ' + c
23 | else:
24 | tmp_line += c
25 | tmp_line = tmp_line.lower()
26 | words = [w for w in tmp_line.split() if len(w) > 0]
27 | for w in words:
28 | if w not in word2freq:
29 | word2freq[w] = 1
30 | else:
31 | word2freq[w] += 1
32 | return words
33 |
34 |
35 | fline = open('./dataset/cornell_movie/movie_lines.txt', 'r')
36 | sets = [w.split('+++$+++') for w in fline.read().split('\n')]
37 | lines = {w[0].strip(): mark(w[-1].strip()) for w in sets}
38 | #
39 | # for w in lines:
40 | # if len(lines[w]) == 0:
41 | # print w
42 |
43 | fline.close()
44 | print 'read lines ok'
45 | fconv = open('./dataset/cornell_movie/movie_conversations.txt', 'r')
46 |
47 | turns = []
48 | convs = fconv.readline()
49 | while convs:
50 | turn = eval(convs.split('+++$+++')[-1].strip())
51 | turns += zip(turn[:-1], turn[1:])
52 | convs = fconv.readline()
53 |
54 | pairs = [(lines[a], lines[b]) for a, b in turns
55 | if len(lines[a]) > 0 and len(lines[b]) > 0]
56 |
57 | # shuffle!
58 | random.shuffle(pairs)
59 |
60 | word2freq = sorted(word2freq.items(), key=lambda a: a[1], reverse=True)
61 | for at, w in enumerate(word2freq):
62 | word2idx[w[0]] = at + 2
63 |
64 | idx2word = {k: v for v, k in word2idx.items()}
65 | print idx2word[1], idx2word[2]
66 |
67 | Lmax = len(idx2word)
68 | # for i in xrange(Lmax):
69 | # print idx2word[i]
70 | print 'read dataset ok.'
71 | print Lmax
72 | print pairs[0]
73 |
74 |
75 | def build_data(data):
76 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
77 | print len(data)
78 | for pair in data:
79 | source, target = pair
80 | A = [word2idx[w] for w in source]
81 | B = [word2idx[w] for w in target]
82 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
83 | C = [0 if w not in source else source.index(w) + Lmax for w in target]
84 |
85 | instance['text'] += [source]
86 | instance['summary'] += [target]
87 | instance['source'] += [A]
88 | instance['target'] += [B]
89 | # instance['cc_matrix'] += [C]
90 | instance['target_c'] += [C]
91 |
92 | print instance['source'][4000]
93 | print instance['target'][4000]
94 | print instance['target_c'][4000]
95 | return instance
96 |
97 |
98 | train_set = build_data(pairs[10000:])
99 | test_set = build_data(pairs[:10000])
100 | serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/movie_dialogue_data.pkl')
101 |
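# A quick illustration of the tokenization mark() above performs: punctuation
# becomes its own token, except an apostrophe, which only receives a leading
# space, so contractions come out as two tokens. A standalone toy copy is used
# here so the module-level word2freq counts are left untouched.
def toy_mark(line):
    out = ''
    for c in line:
        if c in string.punctuation:
            out += ' ' + c + ' ' if c != "'" else ' ' + c
        else:
            out += c
    return out.lower().split()

print toy_mark("Don't panic, Arthur!")
# ['don', "'t", 'panic', ',', 'arthur', '!']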
--------------------------------------------------------------------------------
/experiments/syn_vest.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the implementation of Copy-NET
3 | We start from the basic Seq2seq framework for an auto-encoder.
4 | """
5 | import logging
6 | import time
7 | import numpy as np
8 | import sys
9 | import copy
10 |
11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
12 | from experiments.config import setup_lcsts, setup_weibo, setup_syn, setup_bst
13 | from emolga.utils.generic_utils import *
14 | from emolga.models.covc_encdec import NRM
15 | from emolga.models.encdec import NRM as NRM0
16 | from emolga.dataset.build_dataset import deserialize_from_file
17 | from collections import OrderedDict
18 | from fuel import datasets
19 | from fuel import transformers
20 | from fuel import schemes
21 |
22 | # setup = setup_lcsts
23 | setup = setup_syn
24 | # setup = setup_bst
25 |
26 |
27 | def init_logging(logfile):
28 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
29 | datefmt='%m/%d/%Y %H:%M:%S' )
30 | fh = logging.FileHandler(logfile)
31 | # ch = logging.StreamHandler()
32 |
33 | fh.setFormatter(formatter)
34 | # ch.setFormatter(formatter)
35 | # fh.setLevel(logging.INFO)
36 | # ch.setLevel(logging.INFO)
37 | # logging.getLogger().addHandler(ch)
38 | logging.getLogger().addHandler(fh)
39 | logging.getLogger().setLevel(logging.INFO)
40 |
41 | return logging
42 |
43 | # prepare logging.
44 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
45 | config = setup() # load settings.
46 | for w in config:
47 | print '{0}={1}'.format(w, config[w])
48 |
49 | logger = init_logging(config['path_log'] + '/experiments.CopyLCSTS.id={}.log'.format(tmark))
50 | n_rng = np.random.RandomState(config['seed'])
51 | np.random.seed(config['seed'])
52 | rng = RandomStreams(n_rng.randint(2 ** 30))
53 | logger.info('Start!')
54 |
55 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
56 |
57 | if config['voc_size'] == -1: # do not use unk; keep the full vocabulary
58 | config['enc_voc_size'] = len(word2idx)
59 | config['dec_voc_size'] = config['enc_voc_size']
60 | else:
61 | config['enc_voc_size'] = config['voc_size']
62 | config['dec_voc_size'] = config['enc_voc_size']
63 |
64 | samples = len(train_set['source'])
65 | logger.info('build dataset done. ' +
66 | 'dataset size: {} ||'.format(samples) +
67 | 'vocabulary size = {0}/ batch size = {1}'.format(
68 | config['dec_voc_size'], config['batch_size']))
69 |
70 |
71 | def build_data(data):
72 | # create fuel dataset.
73 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
74 | ('target', data['target']),
75 | ('target_c', data['target_c']),
76 | ]))
77 | dataset.example_iteration_scheme \
78 | = schemes.ShuffledExampleScheme(dataset.num_examples)
79 | return dataset
80 |
81 |
82 | train_data = build_data(train_set)
83 | train_data_plain = zip(*(train_set['source'], train_set['target'], train_set['rule']))
84 | test_data_plain = zip(*(test_set['source'], test_set['target'], test_set['rule']))
85 |
86 | # train_data_plain = zip(*(train_set['source'], train_set['target']))
87 | # test_data_plain = zip(*(test_set['source'], test_set['target']))
88 |
89 | train_size = len(train_data_plain)
90 | test_size = len(test_data_plain)
91 | tr_idx = n_rng.permutation(train_size)[:2000].tolist()
92 | ts_idx = n_rng.permutation(test_size )[:2000].tolist()
93 | logger.info('load the data ok.')
94 | config['copynet'] = True # False
95 | notrain = True
96 |
97 | # build the agent
98 | if config['copynet']:
99 | agent = NRM(config, n_rng, rng, mode=config['mode'],
100 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
101 | else:
102 | agent = NRM0(config, n_rng, rng, mode=config['mode'],
103 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
104 |
105 | agent.build_()
106 | if notrain:
107 | agent.compile_('display')
108 | else:
109 | agent.compile_('all')
110 | print 'compile ok.'
111 |
112 | echo = 6
113 | epochs = 10
114 | if echo > 0:
115 | tmark = '20160227-013418' # copynet multi-source model
116 | agent.load(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname']))
117 |
118 | while echo < epochs:
119 | echo += 1
120 | loss = []
121 |
122 | def output_stream(dataset, batch_size, size=1):
123 | data_stream = dataset.get_example_stream()
124 | data_stream = transformers.Batch(data_stream,
125 | iteration_scheme=schemes.ConstantScheme(batch_size))
126 |
127 | # add padding and masks to the dataset
128 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
129 | return data_stream
130 |
131 | def prepare_batch(batch, mask, fix_len=None):
132 | data = batch[mask].astype('int32')
133 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
134 |
135 | def cut_zeros(data, fix_len=None):
136 | if fix_len is not None:
137 | return data[:, : fix_len]
138 | for k in range(data.shape[1] - 1, 0, -1):
139 | data_col = data[:, k].sum()
140 | if data_col > 0:
141 | return data[:, : k + 2]
142 | return data
143 | data = cut_zeros(data, fix_len)
144 | return data
145 |
146 | def cc_martix(source, target):
147 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
148 | for k in xrange(source.shape[0]):
149 | for j in xrange(target.shape[1]):
150 | for i in xrange(source.shape[1]):
151 | if (source[k, i] == target[k, j]) and (source[k, i] > 0):
152 | cc[k][j][i] = 1.
153 | return cc
154 |
155 | def unk_filter(data):
156 | if config['voc_size'] == -1:
157 | return copy.copy(data)
158 | else:
159 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
160 | data = copy.copy(data * mask + (1 - mask))
161 | return data
162 |
163 | # training
164 | if not notrain:
165 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
166 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
167 | progbar = Progbar(train_size / config['batch_size'])
168 | for it, batch in enumerate(train_batches):
169 | # obtain data
170 | data_s = prepare_batch(batch, 'source')
171 | data_t = prepare_batch(batch, 'target')
172 | if config['copynet']:
173 | data_c = cc_martix(data_s, data_t)
174 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
175 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
176 | else:
177 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
178 |
179 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
180 |
181 | if it % 200 == 0:
182 | logger.info('Echo={} Evaluation Sampling.'.format(it))
183 | logger.info('generating [training set] samples')
184 | for _ in xrange(5):
185 | idx = int(np.floor(n_rng.rand() * train_size))
186 | train_s, train_t = train_data_plain[idx]
187 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
188 | np.asarray(train_t, dtype='int32'),
189 | idx2word,
190 | np.asarray(unk_filter(train_s), dtype='int32'))
191 | print '*' * 50
192 |
193 | logger.info('generating [testing set] samples')
194 | for _ in xrange(5):
195 | idx = int(np.floor(n_rng.rand() * test_size))
196 | test_s, test_t = test_data_plain[idx]
197 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
198 | np.asarray(test_t, dtype='int32'),
199 | idx2word,
200 | np.asarray(unk_filter(test_s), dtype='int32'))
201 | print '*' * 50
202 |
203 | # save the weights.
204 | agent.save(config['path_h5'] + '/experiments.Copy{2}.id={0}.epoch={1}.pkl'.format(tmark, echo, config['modelname']))
205 |
206 | # # test accuracy
207 | def judge_rule(rule):
208 | rule = rule.split()
209 | fine = ''
210 | for w in rule:
211 | if w not in word2idx:
212 | fine += w
213 | return fine
214 |
215 | test = True
216 | if test:
217 | def analysis_(data_plain, idxs, mode='Training'):
218 | progbar_tr = Progbar(2000)
219 | print '\n' + '__' * 50
220 | cpy, cpy_pos = 0, 0
221 | types = dict()
222 | for it, idx in enumerate(idxs):
223 | train_s, train_t, rule = data_plain[idx]
224 | t = judge_rule(rule)
225 | c = float(agent.analyse_(np.asarray(train_s, dtype='int32'),
226 | np.asarray(train_t, dtype='int32'),
227 | idx2word))
228 | # copy mode
229 | cpy += 1
230 | cpy_pos += c
231 | if t not in types:
232 | types[t] = {}
233 | types[t][0] = c
234 | types[t][1] = 1
235 | else:
236 | types[t][0] += c
237 | types[t][1] += 1
238 |
239 | progbar_tr.update(it + 1, [('Copy', cpy_pos)])
240 | logger.info(('\n{0} Accuracy:' +
241 | '\t{1}/{2} = {3}%').format(mode, cpy_pos, cpy, 100 * cpy_pos / float(cpy)))
242 |             print '==' * 50
243 |             for t in types:
244 |                 print 'Type: {0}: {1}/{2}={3}%'.format(t, int(types[t][0]), types[t][1],
245 |                                                        100 * types[t][0] / float(types[t][1]))
246 |
247 |
248 |         # analysis_(train_data_plain, 'Training')
249 |         analysis_(test_data_plain, 'Testing')
250 |
--------------------------------------------------------------------------------
/experiments/syntest.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the implementation of Copy-NET
3 | We start from the basic Seq2seq framework for an auto-encoder.
4 | """
5 | import logging
6 | import time
7 | import numpy as np
8 |
9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
10 | from experiments.config import setup_syn
11 | from emolga.utils.generic_utils import *
12 | from emolga.models.cooc_encdec import NRM
13 | from emolga.models.encdec import NRM as NRM0
14 | from emolga.dataset.build_dataset import deserialize_from_file
15 | from collections import OrderedDict
16 | from fuel import datasets
17 | from fuel import transformers
18 | from fuel import schemes
19 |
20 | setup = setup_syn
21 |
22 |
23 | def init_logging(logfile):
24 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
25 | datefmt='%m/%d/%Y %H:%M:%S' )
26 | fh = logging.FileHandler(logfile)
27 | # ch = logging.StreamHandler()
28 |
29 | fh.setFormatter(formatter)
30 | # ch.setFormatter(formatter)
31 | # fh.setLevel(logging.INFO)
32 | # ch.setLevel(logging.INFO)
33 | # logging.getLogger().addHandler(ch)
34 | logging.getLogger().addHandler(fh)
35 | logging.getLogger().setLevel(logging.INFO)
36 |
37 | return logging
38 |
39 | # prepare logging.
40 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
41 | config = setup() # load settings.
42 | for w in config:
43 | print '{0}={1}'.format(w, config[w])
44 |
45 | logger = init_logging(config['path_log'] + '/experiments.Copy.id={}.log'.format(tmark))
46 | n_rng = np.random.RandomState(config['seed'])
47 | np.random.seed(config['seed'])
48 | rng = RandomStreams(n_rng.randint(2 ** 30))
49 | logger.info('Start!')
50 |
51 | # the vocabulary
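# the token inventory is every three-digit string from '000' to '999' (1000 symbols,
# with the first digit varying fastest); word2idx maps each symbol to k + 1, keeping
# index 0 for the padding / end-of-sequence token.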
52 | tmp = [chr(x) for x in range(48, 58)]  # '0', '1', ..., '9'
53 | voc = [tmp[a] + tmp[b] + tmp[c]
54 | for c in xrange(10)
55 | for b in xrange(10)
56 | for a in xrange(10)]
57 | word2idx = {voc[k]: k + 1 for k in xrange(len(voc))}
58 | word2idx['<eol>'] = 0
59 | idx2word = {word2idx[w]: w for w in word2idx}
60 | voc = ['<eol>'] + voc
61 |
62 | train_set, test_set = deserialize_from_file(config['dataset'])
63 |
64 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
65 | config['dec_voc_size'] = config['enc_voc_size']
66 | samples = len(train_set['source'])
67 |
68 | logger.info('build dataset done. ' +
69 | 'dataset size: {} ||'.format(samples) +
70 | 'vocabulary size = {0}/ batch size = {1}'.format(
71 | config['dec_voc_size'], config['batch_size']))
72 |
73 |
74 | def build_data(data):
75 | # create fuel dataset.
76 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
77 | ('target', data['target']),
78 | ('target_c', data['target_c']),
79 | ('rule_id', data['rule_id']),
80 | ('rule', data['rule'])]))
81 | dataset.example_iteration_scheme \
82 | = schemes.ShuffledExampleScheme(dataset.num_examples)
83 | return dataset
84 |
85 |
86 | train_data = build_data(train_set)
87 | train_data_plain = zip(*(train_set['source'], train_set['target'], train_set['rule_id'], train_set['rule']))
88 | test_data_plain = zip(*(test_set['source'], test_set['target'], test_set['rule_id'], test_set['rule']))
89 |
90 | train_size = len(train_data_plain)
91 | test_size = len(test_data_plain)
92 | tr_idx = n_rng.permutation(train_size)[:2000].tolist()
93 | ts_idx = n_rng.permutation(test_size )[:2000].tolist()
94 | logger.info('load the data ok.')
95 |
96 | # build the agent
97 | if config['copynet']:
98 | agent = NRM(config, n_rng, rng, mode=config['mode'],
99 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
100 | else:
101 | agent = NRM0(config, n_rng, rng, mode=config['mode'],
102 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
103 |
104 | agent.build_()
105 | agent.compile_('all')
106 | print 'compile ok.'
107 |
108 | echo = 3
109 | epochs = 4
110 | if echo > 0:
111 | tmark = '20160216-152155'
112 | agent.load(config['path_h5'] + '/experiments.Copy.id={0}.epoch={1}.pkl'.format(tmark, echo))
113 |
114 | while echo < epochs:
115 | echo += 1
116 | loss = []
117 |
118 | def output_stream(dataset, batch_size, size=1):
119 | data_stream = dataset.get_example_stream()
120 | data_stream = transformers.Batch(data_stream,
121 | iteration_scheme=schemes.ConstantScheme(batch_size))
122 |
123 | # add padding and masks to the dataset
124 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c'))
125 | return data_stream
126 |
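    # prepare_batch appends one extra all-zero column to the padded Fuel batch and
    # cut_zeros then trims trailing all-zero padding columns, keeping a single zero
    # column (presumably as the end-of-sequence marker), e.g.
    #   [[3, 4, 0, 0, 0], [5, 0, 0, 0, 0]] -> [[3, 4, 0], [5, 0, 0]]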
127 | def prepare_batch(batch, mask, fix_len=None):
128 | data = batch[mask].astype('int32')
129 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
130 |
131 | def cut_zeros(data, fix_len=None):
132 | if fix_len is not None:
133 | return data[:, : fix_len]
134 | for k in range(data.shape[1] - 1, 0, -1):
135 | data_col = data[:, k].sum()
136 | if data_col > 0:
137 | return data[:, : k + 2]
138 | return data
139 | data = cut_zeros(data, fix_len)
140 | return data
141 |
142 | # training
143 | notrain = True
144 | if not notrain:
145 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
146 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
147 | progbar = Progbar(train_size / config['batch_size'])
148 | for it, batch in enumerate(train_batches):
149 | # obtain data
150 | data_s = prepare_batch(batch, 'source')
151 | data_t = prepare_batch(batch, 'target')
152 | data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
153 |
154 | if config['copynet']:
155 | loss += [agent.train_(data_s, data_t, data_c)]
156 | else:
157 | loss += [agent.train_(data_s, data_t)]
158 |
159 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
160 |
161 | if it % 200 == 0:
162 | logger.info('Echo={} Evaluation Sampling.'.format(it))
163 | logger.info('generating [training set] samples')
164 | for _ in xrange(5):
165 | idx = int(np.floor(n_rng.rand() * train_size))
166 | train_s, train_t, _, _ = train_data_plain[idx]
167 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
168 | np.asarray(train_t, dtype='int32'),
169 | idx2word)
170 | print '*' * 50
171 |
172 | logger.info('generating [testing set] samples')
173 | for _ in xrange(5):
174 | idx = int(np.floor(n_rng.rand() * test_size))
175 | test_s, test_t, _, _ = test_data_plain[idx]
176 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
177 | np.asarray(test_t, dtype='int32'),
178 | idx2word)
179 | print '*' * 50
180 |
181 | # save the weights.
182 | agent.save(config['path_h5'] + '/experiments.Copy.id={0}.epoch={1}.pkl'.format(tmark, echo))
183 |
184 | # test accuracy
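    # the loops below assume analyse_ returns a pair: c[0] is 1/0 for whether the
    # model reproduced the target, and c[1] flags whether copying was needed
    # (c[1] == 0 means the example is solvable in pure generation mode).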
185 | progbar_tr = Progbar(2000)
186 |
187 | print '\n' + '__' * 50
188 | gen, gen_pos = 0, 0
189 | cpy, cpy_pos = 0, 0
190 | grs, crs = [], []
191 | for it, idx in enumerate(tr_idx):
192 | train_s, train_t, rid, rule = train_data_plain[idx]
193 |
194 | c = agent.analyse_(np.asarray(train_s, dtype='int32'),
195 | np.asarray(train_t, dtype='int32'),
196 | idx2word)
197 | if c[1] == 0:
198 | # generation mode
199 | gen += 1
200 | gen_pos += c[0]
201 | if c[0] == 1:
202 | grs += [rule]
203 | else:
204 | # copy mode
205 | cpy += 1
206 | cpy_pos += c[0]
207 | if c[0] == 1:
208 | crs += [rule]
209 |
210 | progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
211 | grs = set(grs)
212 | crs = set(crs)
213 | irs = set.intersection(grs, crs)
214 |
215 | logger.info('\nTraining Accuracy:' +
216 | '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
217 | '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)) +
218 | '\tGene-Rule: {0}, Copy-Rule: {1}, Intersection: {2}'.format(len(grs), len(crs), len(irs)))
219 |
220 | print 'Generate Mode:'
221 | for r in grs:
222 | print r
223 |
224 | print 'Copy Mode:'
225 | for r in crs:
226 | print r
227 |
228 |     print 'Intersection:'
229 | for r in irs:
230 | print r
231 |
232 | progbar_ts = Progbar(2000)
233 | print '\n' + '__' * 50
234 | gen, gen_pos = 0, 0
235 | cpy, cpy_pos = 0, 0
236 | grs, crs = [], []
237 | for it, idx in enumerate(ts_idx):
238 | test_s, test_t, rid, rule = test_data_plain[idx]
239 | c = agent.analyse_(np.asarray(test_s, dtype='int32'),
240 | np.asarray(test_t, dtype='int32'),
241 | idx2word)
242 | if c[1] == 0:
243 | # generation mode
244 | gen += 1
245 | gen_pos += c[0]
246 | grs += [rule]
247 | else:
248 | # copy mode
249 | cpy += 1
250 | cpy_pos += c[0]
251 | crs += [rule]
252 |
253 | progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
254 | grs = set(grs)
255 | crs = set(crs)
256 | irs = set.intersection(grs, crs)
257 |
258 | logger.info('\nTesting Accuracy:' +
259 | '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
260 | '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)) +
261 | '\tGene-Rule: {0}, Copy-Rule: {1}, Intersection: {2}'.format(len(grs), len(crs), len(irs)))
262 |
263 | print 'Generate Mode:'
264 | for r in grs:
265 | print r
266 |
267 | print 'Copy Mode:'
268 | for r in crs:
269 | print r
270 |
271 |     print 'Intersection:'
272 | for r in irs:
273 | print r
--------------------------------------------------------------------------------
/experiments/synthetic.py:
--------------------------------------------------------------------------------
1 | __author__ = 'jiataogu'
2 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
3 | import numpy.random as n_rng
4 |
5 | n_rng.seed(19920206)
6 | # the vocabulary
7 | tmp = [chr(x) for x in range(48, 58)]  # '0', '1', ..., '9'
8 | voc = [tmp[a] + tmp[b] + tmp[c]
9 | for c in xrange(10)
10 | for b in xrange(10)
11 | for a in xrange(10)]
12 | word2idx = {voc[k]: k + 2 for k in xrange(len(voc))}
13 | word2idx['<eol>'] = 0
14 | word2idx['<unk>'] = 1
15 | idx2word = {word2idx[w]: w for w in word2idx}
16 | voc = ['<eol>', '<unk>'] + voc
17 |
18 | # word2idx['X'] = len(voc)
19 | # idx2word[len(voc)] = 'X'
20 | # voc += ['X']
21 | #
22 | # word2idx['Y'] = len(voc)
23 | # idx2word[len(voc)] = 'Y'
24 | # voc += ['Y']
25 | # print word2idx['X'], word2idx['Y']
26 |
27 | # load the dataset
28 | Rules, _ = deserialize_from_file('/home/thoma/Work/Dial-DRL/dataset/rules.rnd.n10k.pkl')
29 | num = 200
30 | repeats = 100
31 | maxleg = 15
32 | Lmax = len(idx2word)
33 | rules = dict(source=Rules['source'][:num],
34 | target=Rules['target'][:num])
35 |
36 |
37 | def ftr(v):
38 | if v < 10:
39 | return '00' + str(v)
40 | elif v < 100:
41 | return '0' + str(v)
42 | else:
43 | return str(v)
44 |
45 |
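# build_instance expands each of the `num` rules into `repeats` concrete examples:
# the placeholder tokens 'X' and 'Y' in a rule's source/target patterns are replaced
# by fresh random sequences of 1..maxleg three-digit symbols, so the substituted
# spans in the target can, in general, only be recovered by copying from the source.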
46 | def build_instance():
47 | instance = dict(x=[], y=[], source=[], target=[], target_c=[], rule_id=[], rule=[])
48 | for k in xrange(num):
49 | source = rules['source'][k]
50 | target = rules['target'][k]
51 |
52 | for j in xrange(repeats):
53 | X = n_rng.randint(1000, size= n_rng.randint(maxleg) + 1)
54 | Y = n_rng.randint(1000, size= n_rng.randint(maxleg) + 1)
55 | S = []
56 | T = []
57 | for w in source:
58 |                 if w == 'X':
59 |                     S += [ftr(v) for v in X]
60 |                 elif w == 'Y':
61 | S += [ftr(v) for v in Y]
62 | else:
63 | S += [w]
64 |
65 | for w in target:
66 |                 if w == 'X':
67 |                     T += [ftr(v) for v in X]
68 |                 elif w == 'Y':
69 | T += [ftr(v) for v in Y]
70 | else:
71 | T += [w]
72 |
73 | A = [word2idx[w] for w in S]
74 | B = [word2idx[w] for w in T]
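            # copy-pointer encoding of the target: 0 when the word cannot be copied,
            # otherwise (first position of the word in the source) + Lmax, i.e. copy
            # positions occupy an index range just above the ordinary vocabulary.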
75 | C = [0 if w not in S else S.index(w) + Lmax for w in T]
76 |
77 | instance['x'] += [S]
78 | instance['y'] += [T]
79 | instance['source'] += [A]
80 | instance['target'] += [B]
81 | instance['target_c'] += [C]
82 |
83 | instance['rule_id'] += [k]
84 | instance['rule'] += [' '.join(source) + ' -> ' + ' '.join(target)]
85 |
86 | return instance
87 |
88 | train_set = build_instance()
89 | print 'build ok.'
90 | test_set = build_instance()
91 | print 'build ok.'
92 |
93 | serialize_to_file([train_set, test_set, idx2word, word2idx], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data_c.pkl')
94 | # serialize_to_file([train_set, test_set], '/home/thoma/Work/Dial-DRL/dataset/synthetic_data.pkl')
95 |
--------------------------------------------------------------------------------
/experiments/weibo_dataset.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
3 |
4 | word2idx = dict()
5 | wordfreq = dict()
6 | word2idx['<eol>'] = 0
7 | word2idx['<unk>'] = 1
8 |
9 | # segment = False # True
10 |
11 | # training set
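# co-occur.txt is read as repeating triples of lines: a whitespace-tokenised post,
# the paired text line, and a separator line (consumed by the extra f.readline()
# at the bottom of the loop).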
12 | pairs = []
13 | f = open('./dataset/weibo/co-occur.txt', 'r')
14 | line = f.readline().strip().decode('utf-8')
15 | at = 2
16 | lines = 0
17 | while line:
18 | post = line # f.readline().strip().decode('utf-8')
19 |     post = [w.strip() for w in post.split() if len(w.strip()) > 0]
20 |
21 | # if segment:
22 | # summary = [w for w in jb.cut(summary)]
23 |
24 | for w in post:
25 | if w not in wordfreq:
26 | wordfreq[w] = 1
27 | else:
28 | wordfreq[w] += 1
29 | # if w not in word2idx:
30 | # word2idx[w] = at
31 | # at += 1
32 |
33 | text = f.readline().strip().decode('utf-8')
34 | text = [w.strip() for w in text.split() if len(w.strip()) > 0]
35 | # if segment:
36 | # text = [w for w in jb.cut(text)]
37 | for w in text:
38 | if w not in wordfreq:
39 | wordfreq[w] = 1
40 | else:
41 | wordfreq[w] += 1
42 | # if w not in word2idx:
43 | # word2idx[w] = at
44 | # at += 1
45 |
46 | pair = (post, text)
47 | pairs.append(pair)
48 | lines += 1
49 | if lines % 20000 == 0:
50 | print lines
51 |
52 | f.readline()
53 | line = f.readline().strip().decode('utf-8')
54 |
55 |
56 | # sort the vocabulary
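# words are ranked by descending frequency and indexed from 2 (0 and 1 are reserved
# for the special tokens above), so a later voc_size cutoff keeps the most frequent
# words and everything beyond it presumably falls back to <unk> at training time.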
57 | wordfreq = sorted(wordfreq.items(), key=lambda a:a[1], reverse=True)
58 | for w in wordfreq:
59 | word2idx[w[0]] = at
60 | at += 1
61 |
62 | idx2word = dict()
63 | for v, k in word2idx.items():
64 | idx2word[k] = v
65 | Lmax = len(idx2word)
66 | print 'read dataset ok.'
67 | print Lmax
68 | for i in xrange(Lmax):
69 | print idx2word[i].encode('utf-8')
70 |
71 | # use character-based model [on]
72 | # use word-based model [off]
73 |
74 |
75 | def build_data(data):
76 | instance = dict(text=[], summary=[], source=[], target=[], target_c=[])
77 | for pair in data:
78 | source, target = pair
79 | A = [word2idx[w] for w in source]
80 | B = [word2idx[w] for w in target]
81 | # C = np.asarray([[w == l for w in source] for l in target], dtype='float32')
82 | C = [0 if w not in source else source.index(w) + Lmax for w in target]
83 |
84 | instance['text'] += [source]
85 | instance['summary'] += [target]
86 | instance['source'] += [A]
87 | instance['target'] += [B]
88 | # instance['cc_matrix'] += [C]
89 | instance['target_c'] += [C]
90 |
91 | print instance['target'][5000]
92 | print instance['target_c'][5000]
93 | return instance
94 |
95 |
96 | train_set = build_data(pairs[10000:])
97 | test_set = build_data(pairs[:10000])
98 | serialize_to_file([train_set, test_set, idx2word, word2idx], './dataset/weibo_data-word-cooc.pkl')
99 |
--------------------------------------------------------------------------------
/experiments/weibo_vest.py:
--------------------------------------------------------------------------------
1 | """
2 | This is the implementation of Copy-NET
3 | We start from the basic Seq2seq framework for an auto-encoder.
4 | """
5 | import logging
6 | import time
7 | import numpy as np
8 | import sys
9 | import copy
10 |
11 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
12 | from experiments.config import setup_lcsts, setup_weibo
13 | from emolga.utils.generic_utils import *
14 | from emolga.models.covc_encdec import NRM
15 | from emolga.models.encdec import NRM as NRM0
16 | from emolga.dataset.build_dataset import deserialize_from_file
17 | from collections import OrderedDict
18 | from fuel import datasets
19 | from fuel import transformers
20 | from fuel import schemes
21 |
22 | setup = setup_weibo
23 |
24 |
25 | def init_logging(logfile):
26 | formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
27 | datefmt='%m/%d/%Y %H:%M:%S' )
28 | fh = logging.FileHandler(logfile)
29 | # ch = logging.StreamHandler()
30 |
31 | fh.setFormatter(formatter)
32 | # ch.setFormatter(formatter)
33 | # fh.setLevel(logging.INFO)
34 | # ch.setLevel(logging.INFO)
35 | # logging.getLogger().addHandler(ch)
36 | logging.getLogger().addHandler(fh)
37 | logging.getLogger().setLevel(logging.INFO)
38 |
39 | return logging
40 |
41 | # prepare logging.
42 | tmark = time.strftime('%Y%m%d-%H%M%S', time.localtime(time.time()))
43 | config = setup() # load settings.
44 | for w in config:
45 | print '{0}={1}'.format(w, config[w])
46 |
47 | logger = init_logging(config['path_log'] + '/experiments.CopyWeibo.id={}.log'.format(tmark))
48 | n_rng = np.random.RandomState(config['seed'])
49 | np.random.seed(config['seed'])
50 | rng = RandomStreams(n_rng.randint(2 ** 30))
51 | logger.info('Start!')
52 |
53 | train_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])
54 |
55 | if config['voc_size'] == -1:  # do not use unk
56 | config['enc_voc_size'] = max(zip(*word2idx.items())[1]) + 1
57 | config['dec_voc_size'] = config['enc_voc_size']
58 | else:
59 | config['enc_voc_size'] = config['voc_size']
60 | config['dec_voc_size'] = config['enc_voc_size']
61 |
62 | samples = len(train_set['source'])
63 | logger.info('build dataset done. ' +
64 | 'dataset size: {} ||'.format(samples) +
65 | 'vocabulary size = {0}/ batch size = {1}'.format(
66 | config['dec_voc_size'], config['batch_size']))
67 |
68 |
69 | def build_data(data):
70 | # create fuel dataset.
71 | dataset = datasets.IndexableDataset(indexables=OrderedDict([('source', data['source']),
72 | ('target', data['target']),
73 | ('target_c', data['target_c']),
74 | ]))
75 | dataset.example_iteration_scheme \
76 | = schemes.ShuffledExampleScheme(dataset.num_examples)
77 | return dataset
78 |
79 |
80 | train_data = build_data(train_set)
81 | train_data_plain = zip(*(train_set['source'], train_set['target']))
82 | test_data_plain = zip(*(test_set['source'], test_set['target']))
83 |
84 | train_size = len(train_data_plain)
85 | test_size = len(test_data_plain)
86 | tr_idx = n_rng.permutation(train_size)[:2000].tolist()
87 | ts_idx = n_rng.permutation(test_size )[:2000].tolist()
88 | logger.info('load the data ok.')
89 | notrain = False
90 |
91 | # build the agent
92 | if config['copynet']:
93 | agent = NRM(config, n_rng, rng, mode=config['mode'],
94 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
95 | else:
96 | agent = NRM0(config, n_rng, rng, mode=config['mode'],
97 | use_attention=True, copynet=config['copynet'], identity=config['identity'])
98 |
99 | agent.build_()
100 | if notrain:
101 | agent.compile_('display')
102 | else:
103 | agent.compile_('all')
104 | print 'compile ok.'
105 |
106 | echo = 0
107 | epochs = 10
108 | if echo > 0:
109 | tmark = '20160227-164324' # copynet multi-source model
110 | agent.load(config['path_h5'] + '/experiments.CopyWeibo.id={0}.epoch={1}.pkl'.format(tmark, echo))
111 |
112 | while echo < epochs:
113 | echo += 1
114 | loss = []
115 |
116 | def output_stream(dataset, batch_size, size=1):
117 | data_stream = dataset.get_example_stream()
118 | data_stream = transformers.Batch(data_stream,
119 | iteration_scheme=schemes.ConstantScheme(batch_size))
120 |
121 | # add padding and masks to the dataset
122 | data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
123 | return data_stream
124 |
125 | def prepare_batch(batch, mask, fix_len=None):
126 | data = batch[mask].astype('int32')
127 | data = np.concatenate([data, np.zeros((data.shape[0], 1), dtype='int32')], axis=1)
128 |
129 | def cut_zeros(data, fix_len=None):
130 | if fix_len is not None:
131 | return data[:, : fix_len]
132 | for k in range(data.shape[1] - 1, 0, -1):
133 | data_col = data[:, k].sum()
134 | if data_col > 0:
135 | return data[:, : k + 2]
136 | return data
137 | data = cut_zeros(data, fix_len)
138 | return data
139 |
140 | def cc_martix(source, target):
141 | cc = np.zeros((source.shape[0], target.shape[1], source.shape[1]), dtype='float32')
142 | for k in xrange(source.shape[0]):
143 | for j in xrange(target.shape[1]):
144 | for i in xrange(source.shape[1]):
145 | if (source[k, i] == target[k, j]) and (source[k, i] > 0):
146 | cc[k][j][i] = 1.
147 | return cc
148 |
149 | def unk_filter(data):
150 | if config['voc_size'] == -1:
151 | return copy.copy(data)
152 | else:
153 | mask = (np.less(data, config['voc_size'])).astype(dtype='int32')
154 | data = copy.copy(data * mask + (1 - mask))
155 | return data
156 |
157 | # training
158 | train_batches = output_stream(train_data, config['batch_size']).get_epoch_iterator(as_dict=True)
159 | logger.info('\nEpoch = {} -> Training Set Learning...'.format(echo))
160 | progbar = Progbar(train_size / config['batch_size'])
161 | for it, batch in enumerate(train_batches):
162 | # obtain data
163 | if not notrain:
164 | data_s = prepare_batch(batch, 'source')
165 | data_t = prepare_batch(batch, 'target')
166 | if config['copynet']:
167 | data_c = cc_martix(data_s, data_t)
168 | # data_c = prepare_batch(batch, 'target_c', data_t.shape[1])
169 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t), data_c)]
170 | else:
171 | loss += [agent.train_(unk_filter(data_s), unk_filter(data_t))]
172 |
173 | progbar.update(it, [('loss_reg', loss[-1][0]), ('ppl.', loss[-1][1])])
174 |
175 | if it % 200 == 0:
176 | logger.info('Echo={} Evaluation Sampling.'.format(it))
177 | logger.info('generating [training set] samples')
178 | for _ in xrange(5):
179 | idx = int(np.floor(n_rng.rand() * train_size))
180 | train_s, train_t = train_data_plain[idx]
181 | v = agent.evaluate_(np.asarray(train_s, dtype='int32'),
182 | np.asarray(train_t, dtype='int32'),
183 | idx2word,
184 | np.asarray(unk_filter(train_s), dtype='int32'),
185 | encode=config['utf-8'])
186 | print '*' * 50
187 |
188 | logger.info('generating [testing set] samples')
189 | for _ in xrange(5):
190 | idx = int(np.floor(n_rng.rand() * test_size))
191 | test_s, test_t = test_data_plain[idx]
192 | v = agent.evaluate_(np.asarray(test_s, dtype='int32'),
193 | np.asarray(test_t, dtype='int32'),
194 | idx2word,
195 | np.asarray(unk_filter(test_s), dtype='int32'),
196 | encode=config['utf-8'])
197 | print '*' * 50
198 |
199 | if it % 10000 == 0:
200 | # save the weights.
201 | agent.save(config['path_h5'] + '/experiments.CopyWeibo.id={0}.epoch={1}.pkl'.format(tmark, echo))
202 |
203 | # # test accuracy
204 | # progbar_tr = Progbar(2000)
205 | #
206 | # print '\n' + '__' * 50
207 | # gen, gen_pos = 0, 0
208 | # cpy, cpy_pos = 0, 0
209 | # for it, idx in enumerate(tr_idx):
210 | # train_s, train_t = train_data_plain[idx]
211 | #
212 | # c = agent.analyse_(np.asarray(train_s, dtype='int32'),
213 | # np.asarray(train_t, dtype='int32'),
214 | # idx2word)
215 | # if c[1] == 0:
216 | # # generation mode
217 | # gen += 1
218 | # gen_pos += c[0]
219 | # else:
220 | # # copy mode
221 | # cpy += 1
222 | # cpy_pos += c[0]
223 | #
224 | # progbar_tr.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
225 | #
226 | # logger.info('\nTraining Accuracy:' +
227 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
228 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)))
229 | #
230 | # progbar_ts = Progbar(2000)
231 | # print '\n' + '__' * 50
232 | # gen, gen_pos = 0, 0
233 | # cpy, cpy_pos = 0, 0
234 | # for it, idx in enumerate(ts_idx):
235 | # test_s, test_t = test_data_plain[idx]
236 | # c = agent.analyse_(np.asarray(test_s, dtype='int32'),
237 | # np.asarray(test_t, dtype='int32'),
238 | # idx2word)
239 | # if c[1] == 0:
240 | # # generation mode
241 | # gen += 1
242 | # gen_pos += c[0]
243 | # else:
244 | # # copy mode
245 | # cpy += 1
246 | # cpy_pos += c[0]
247 | #
248 | # progbar_ts.update(it + 1, [('Gen', gen_pos), ('Copy', cpy_pos)])
249 | #
250 | # logger.info('\nTesting Accuracy:' +
251 | # '\tGene-Mode: {0}/{1} = {2}%'.format(gen_pos, gen, 100 * gen_pos/float(gen)) +
252 | # '\tCopy-Mode: {0}/{1} = {2}%'.format(cpy_pos, cpy, 100 * cpy_pos/float(cpy)))
253 |
--------------------------------------------------------------------------------