├── README.md
├── dist_conv_net_classes.py
├── dist_conv_net_sentence.py
├── dist_conv_net_sentence_oneFold.py
└── dist_process_data.py

/README.md:
--------------------------------------------------------------------------------
This is the source code for the paper:

Relation Extraction: Perspective from Convolutional Neural Networks

Thien Huu Nguyen and Ralph Grishman, in Proceedings of the NAACL Workshop on Vector Space Modeling for NLP, Denver, Colorado, June 2015.

----------------

Much of this code is modified from: https://github.com/yoonkim/CNN_sentence

This code was written when I started my deep learning journey, so it is not optimal :).

There are two steps to run this code:

* Preprocessing: use the file ```dist_process_data.py```

You will need the ACE 2005 data set in the format required by this file. We cannot include the data in this release due to license issues.

* Train and test the model: use the file ```dist_conv_net_sentence_oneFold.py```

This step takes the output file from step 1 as input.

THE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
--------------------------------------------------------------------------------
/dist_conv_net_classes.py:
--------------------------------------------------------------------------------
"""
Convolutional Neural Network for Relation Extraction

Much of the code is modified from
- https://github.com/yoonkim/CNN_sentence
"""

import numpy
import theano.tensor.shared_randomstreams
import theano
import theano.tensor as T
from theano.tensor.signal import downsample
from theano.tensor.nnet import conv

def ReLU(x):
    y = T.maximum(0.0, x)
    return(y)
def Sigmoid(x):
    y = T.nnet.sigmoid(x)
    return(y)
def Tanh(x):
    y = T.tanh(x)
    return(y)
def Iden(x):
    y = x
    return(y)

class HiddenLayer(object):
    """
    Class for HiddenLayer
    """
    def __init__(self, rng, input, n_in, n_out, activation, W=None, b=None,
                 use_bias=False):

        self.input = input
        self.activation = activation

        if W is None:
            if activation.func_name == "ReLU":
                W_values = numpy.asarray(0.01 * rng.standard_normal(size=(n_in, n_out)), dtype=theano.config.floatX)
            else:
                W_values = numpy.asarray(rng.uniform(low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)),
                                                     size=(n_in, n_out)), dtype=theano.config.floatX)
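            # Added note (not in the original source): ReLU layers are initialized
            # with small Gaussian weights (0.01 * N(0, 1)), while saturating
            # activations use Glorot-style uniform weights in
            # [-sqrt(6 / (n_in + n_out)), +sqrt(6 / (n_in + n_out))].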
            W = theano.shared(value=W_values, name='W')
        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b')

        self.W = W
        self.b = b

        if use_bias:
            lin_output = T.dot(input, self.W) + self.b
        else:
            lin_output = T.dot(input, self.W)

        self.output = (lin_output if activation is None else activation(lin_output))

        # parameters of the model
        if use_bias:
            self.params = [self.W, self.b]
        else:
            self.params = [self.W]

def _dropout_from_layer(rng, layer, p):
    """p is the probability of dropping a unit
    """
    srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(999999))
    # p=1-p because 1's indicate keep and p is prob of dropping
    mask = srng.binomial(n=1, p=1-p, size=layer.shape)
    # The cast is important because
    # int * float32 = float64 which pulls things off the gpu
    output = layer * T.cast(mask, theano.config.floatX)
    return output

class DropoutHiddenLayer(HiddenLayer):
    def __init__(self, rng, input, n_in, n_out,
                 activation, dropout_rate, use_bias, W=None, b=None):
        super(DropoutHiddenLayer, self).__init__(
                rng=rng, input=input, n_in=n_in, n_out=n_out, W=W, b=b,
                activation=activation, use_bias=use_bias)

        self.output = _dropout_from_layer(rng, self.output, p=dropout_rate)

class MLPDropout(object):
    """A multilayer perceptron with dropout"""
    def __init__(self, rng, input, layer_sizes, dropout_rates, activations, use_bias=True):

        #rectified_linear_activation = lambda x: T.maximum(0.0, x)

        # Set up all the hidden layers
        self.weight_matrix_sizes = zip(layer_sizes, layer_sizes[1:])
        self.layers = []
        self.dropout_layers = []
        self.activations = activations
        next_layer_input = input
        #first_layer = True
        # dropout the input
        next_dropout_layer_input = _dropout_from_layer(rng, input, p=dropout_rates[0])
        layer_counter = 0
        for n_in, n_out in self.weight_matrix_sizes[:-1]:
            next_dropout_layer = DropoutHiddenLayer(rng=rng,
                    input=next_dropout_layer_input,
                    activation=activations[layer_counter],
                    n_in=n_in, n_out=n_out, use_bias=use_bias,
                    dropout_rate=dropout_rates[layer_counter])
            self.dropout_layers.append(next_dropout_layer)
            next_dropout_layer_input = next_dropout_layer.output

            # Reuse the parameters from the dropout layer here, in a different
            # path through the graph.
            next_layer = HiddenLayer(rng=rng,
                    input=next_layer_input,
                    activation=activations[layer_counter],
                    # scale the weight matrix W with (1-p)
                    W=next_dropout_layer.W * (1 - dropout_rates[layer_counter]),
                    b=next_dropout_layer.b,
                    n_in=n_in, n_out=n_out,
                    use_bias=use_bias)
            self.layers.append(next_layer)
            next_layer_input = next_layer.output
            #first_layer = False
            layer_counter += 1

        # Set up the output layer
        n_in, n_out = self.weight_matrix_sizes[-1]
        dropout_output_layer = LogisticRegression(
                input=next_dropout_layer_input,
                n_in=n_in, n_out=n_out)
        self.dropout_layers.append(dropout_output_layer)

        # Again, reuse parameters in the dropout output.
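        # Added note (not in the original source): dropout masks are only applied on
        # the dropout path above; this second, non-dropout path shares the same
        # weights but scales them by (1 - p) so that the expected pre-activation at
        # prediction time matches the training-time dropout expectation,
        # since E[mask * x] = (1 - p) * x.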
        output_layer = LogisticRegression(
            input=next_layer_input,
            # scale the weight matrix W with (1-p)
            W=dropout_output_layer.W * (1 - dropout_rates[-1]),
            b=dropout_output_layer.b,
            n_in=n_in, n_out=n_out)
        self.layers.append(output_layer)

        # Use the negative log likelihood of the logistic regression layer as
        # the objective.
        self.dropout_negative_log_likelihood = self.dropout_layers[-1].negative_log_likelihood
        self.dropout_errors = self.dropout_layers[-1].errors
        self.dropout_F1 = self.dropout_layers[-1].F1

        self.negative_log_likelihood = self.layers[-1].negative_log_likelihood
        self.errors = self.layers[-1].errors
        self.F1 = self.layers[-1].F1

        # Grab all the parameters together.
        self.params = [ param for layer in self.dropout_layers for param in layer.params ]

    def predict(self, new_data):
        next_layer_input = new_data
        for i,layer in enumerate(self.layers):
            if i