├── __init__.py ├── .gitignore ├── LICENSE ├── test_bipartite_user_item_reviews.py ├── README.md ├── mlslnn.py ├── json_plus.py ├── test_MLSL.py ├── multi_level_lstm.py ├── dnn.py └── lstm.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea 3 | *~ 4 | *.bak 5 | *.iml 6 | 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | All the files in this repository are released under the BSD 2-clause license below. 2 | Please refer to the individual files for copyright and authorship information. 3 | 4 | Copyright (c) 2015, Camiolog Inc., Luca de Alfaro, Michael Shavlovsky, 5 | Vassilis Polychronopoulos, Rakshit Agrawal, Massimo Di Pierro, Andrej Karpathy. 6 | All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without modification, 9 | are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, 12 | this list of conditions and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, 15 | this list of conditions and the following disclaimer in the documentation 16 | and/or other materials provided with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 20 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS 22 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 28 | THE POSSIBILITY OF SUCH DAMAGE. 
29 | 
--------------------------------------------------------------------------------
/test_bipartite_user_item_reviews.py:
--------------------------------------------------------------------------------
1 | class Item:
2 |     def __init__(self, id, assignment_id = None, inherent = None, max_inherent = None):
3 |         self.id = id
4 |         self.assignment_id = assignment_id
5 |         self.inherent = inherent
6 |         self.max_inherent = max_inherent
7 |         self.reviews = {}
8 | 
9 | class User:
10 |     def __init__(self, name):
11 |         self.name = name
12 |         self.users = set()
13 |         self.reviews = {}
14 | 
15 | class Review:
16 |     def __init__(self, review_id = None, grade = None, extra_informative_feature = None):
17 |         self.review_id = review_id
18 |         self.grade = grade
19 |         self.extra_informative_feature = extra_informative_feature
20 | 
21 | class Graph:
22 |     def __init__(self):
23 |         self.items = set()
24 |         self.users = set()
25 |         self.reviews = {}
26 |         self.items_with_ground_truth = []
27 |         self.user_dict = {}
28 |         self.item_dict = {}
29 | 
30 |     def add_item(self, item_id, inherent = None, max_inherent = None, assignment_id = None):
31 |         item = Item(id = item_id, inherent= inherent, max_inherent= max_inherent, assignment_id= assignment_id)
32 |         self.item_dict[item_id] = item
33 |         self.items = self.items | {item}
34 | 
35 |     def add_user(self, user_name):
36 |         user = User(user_name)
37 |         self.user_dict[user_name] = user
38 |         self.users = self.users | {user}
39 | 
40 |     def get_user(self, user_name):
41 |         return self.user_dict.get(user_name)
42 | 
43 |     def get_item(self, item_id):
44 |         return self.item_dict.get(item_id)
45 | 
46 |     def has_voted(self, user_name, item_id):
47 |         if not user_name in self.user_dict or not item_id in self.item_dict:
48 |             return False
49 |         if (self.get_user(user_name), self.get_item(item_id)) in self.reviews:
50 |             return True
51 |         else:
52 |             return False
53 | 
54 |     def get_no_of_votes(self, item_id):
55 |         if not item_id in self.item_dict:
56 |             return 0
57 |         return len(self.get_item(item_id).reviews)
58 | 
59 |     def add_review(self, user_name, item_id, review, assignment_id = None):
60 |         """
61 |         Adds a review to the graph.
62 |         It inserts the review into the generic dictionary of reviews,
63 |         but also into the item.reviews and user.reviews dictionaries.
64 |         The information is redundant, but this enhances accessibility.
65 |         """
66 |         # If the user name or item id is not in the graph, create the respective object
67 |         if not user_name in self.user_dict:
68 |             self.add_user(user_name)
69 |         if not item_id in self.item_dict:
70 |             self.add_item(item_id, assignment_id= assignment_id)
71 |         # Get the user and item objects that correspond to the user name and item id
72 |         user = self.get_user(user_name)
73 |         item = self.get_item(item_id)
74 |         # add the review to the dictionaries
75 |         item.reviews[user] = review
76 |         user.reviews[item] = review
77 |         self.reviews[(user, item)] = review
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Graph-LSTM
2 | 
3 | This repository contains several pieces of code that are useful for applying machine learning to graphs.
4 | See the [project page](https://sites.google.com/view/ml-on-structures) for the overall project, papers, and data.
5 | 
6 | Many prediction problems can be phrased as inferences over local neighborhoods of graphs. The graph represents the interaction between entities, and the neighborhood of each entity contains information that allows the inferences or predictions.
7 | This project enables the application of machine learning directly to such graph neighborhoods, allowing predictions to be learned from examples, bypassing the step of creating and tuning an inference model or summarizing the neighborhoods via a fixed set of hand-crafted features.
8 | The approach is based on a multi-level architecture built from Long Short-Term Memory neural nets (LSTMs); the LSTMs learn how to summarize the neighborhood from data.
9 | 
10 | ## How it works
11 | 
12 | The code performs predictions for one target graph node at a time.
13 | First, the graph is unfolded from the target node, yielding a tree with the target node as its root at level 0, its neighbors as level-1 children, its neighbors' neighbors as level-2 children, and so forth, up to a desired depth D.
14 | At each tree node v of level 0 <= d < D, a level-(d+1) LSTM is fed sequentially the information from the children of v at level d+1, and produces as output information for v itself.
15 | Thus, we exploit LSTMs' ability to process sequences of any length to process trees of any branching factor.
16 | The top-level LSTM produces an output vector y that summarizes the tree rooted at the target node.
17 | This output vector can then be combined with the features of the target node itself, for instance via a standard neural net, to yield the desired prediction.
18 | The architecture requires training D LSTMs, one per tree level.
19 | The LSTMs learn how to summarize the neighborhood up to radius D from data, avoiding the manual task of synthesizing a fixed set of features.
20 | By dedicating one LSTM to each level, we can tailor the learning (and the LSTM size) to the distance from the target node.
21 | 
22 | ## Code included
23 | 
24 | This repository contains various ML algorithms, which can be used independently or in combination.
25 | 
26 | ### DNN
27 | **dnn.py** provides an implementation of deep neural networks. The input consists of fixed-length feature vectors.
28 | 
29 | ### LSTM
30 | **lstm.py** provides an implementation of LSTMs. The input consists of sequences of fixed-length feature vectors.
31 | 
32 | ### Multi-Level LSTM
33 | **multi_level_lstm.py** provides an implementation of multi-level LSTMs (see the [project page](https://sites.google.com/view/ml-on-structures) for papers and information). The input consists of trees of nodes; each node has a feature vector. The trees can be obtained, among other ways, by unrolling the local neighborhood of a node in a graph.
34 | 
35 | ### MLSLNN
36 | **mlslnn.py** provides a helper class for applying multi-level LSTMs to a graph or tree. The code defined in multi_level_lstm.py summarizes the feature vectors of a tree rooted at v into an output vector y (from the top-level LSTM). The vector y summarizes the features of the children of v (and of the subtrees rooted there), but not of v itself. Thus, it is useful to combine the vector y and the feature vector f(v) of v via a top-level neural network that produces the overall output. The class MLSLNN does this.
37 | 
38 | ## Contributors
39 | 
40 | * [Luca de Alfaro](https://sites.google.com/a/ucsc.edu/luca/)
41 | * Rakshit Agrawal
42 | * Vassilis Polychronopoulos
43 | 
--------------------------------------------------------------------------------
/mlslnn.py:
--------------------------------------------------------------------------------
1 | """
2 | This file builds a wrapper for using MLSL along with
3 | a Neural Network sitting on top of it.
4 | The neural network, for each of its entries,
5 | takes in a feature vector defining inputs, along with a
6 | set of features output by an MLSL lying underneath.
7 | 
8 | 
9 | Authors:
10 | Rakshit Agrawal
11 | Luca de Alfaro
12 | Vassilis Polychronopoulos
13 | Copyright by the authors, 2016.
14 | """
15 | from json_plus import Serializable
16 | from multi_level_lstm import MLSL
17 | from dnn import DNN
18 | import numpy as np
19 | 
20 | 
21 | def softmax(w, t = 1.0):
22 |     e = np.exp(np.array(w) / t)
23 |     dist = e / np.sum(e)
24 |     return dist
25 | 
26 | def get_objective_derivative(output, target, objective):
27 |     if objective == "softmax_classification":
28 |         return output - target
29 | 
30 | class MLSLNN(Serializable):
31 |     """
32 |     This class initializes a neural network
33 |     based on the size of features per entry, along
34 |     with a provided MLSL which generates a certain number of outputs.
35 |     """
36 | 
37 |     def __init__(self):
38 |         pass
39 | 
40 |     def initialize(self, mlsl, nnl, seed=None, weight_range=1.0, outputs_from_mlsl=None, use_softmax=True):
41 |         """
42 |         Initialize an object of this class that binds a new NN on top
43 |         of an existing MLSL object.
44 |         :param mlsl: the underlying (already initialized) MLSL object
45 |         :type mlsl: MLSL
46 |         :param nnl: layer sizes of the neural net; nnl[0] is the number of additional input features
47 |         :type nnl: list
48 |         :param seed: seed for the random initialization of the NN weights
49 |         :type seed: int
50 |         :param weight_range: range used for the initial NN weights
51 |         :type weight_range: float
52 |         :param outputs_from_mlsl: how many MLSL outputs to feed to the NN (defaults to all of them)
53 |         :param use_softmax: whether to apply a softmax to the NN output
54 |         """
55 |         self.mlsl_output_size = outputs_from_mlsl if outputs_from_mlsl else mlsl.output_sizes[0]
56 | 
57 |         # Change the input size of the neural net to the assigned feature size plus the MLSL outputs
58 |         nnl[0] += self.mlsl_output_size
59 | 
60 |         self.outputs_from_mlsl = outputs_from_mlsl
61 | 
62 |         self.mlsl = mlsl
63 |         self.nnet = DNN()
64 |         self.nnet.initialize(nnl=nnl, seed=seed, weight_range=weight_range)
65 |         self.use_softmax = use_softmax
66 | 
67 |     def forward(self, input_to_mlsl, additional_input_to_nn, target):
68 |         """
69 |         This runs a forward pass through the entire model, comprising an MLSL
70 |         followed by a NN.
71 |         :param input_to_mlsl: root InstanceNode of the tree fed to the MLSL
72 |         :type input_to_mlsl: InstanceNode
73 |         :param additional_input_to_nn: feature vector fed directly to the NN
74 |         :type additional_input_to_nn: numpy array
75 |         :return: the output of the NN (after softmax, if enabled)
76 |         :rtype: numpy array
77 |         """
78 |         mlsl_output = self.mlsl.forward_propagation(input_to_mlsl, 0)
79 |         input_to_nn = np.concatenate((mlsl_output[:self.mlsl_output_size], additional_input_to_nn))
80 |         nnet_output = self.nnet.forward(input_to_nn)
81 |         if self.use_softmax:
82 |             nnet_output = softmax(nnet_output)
83 | 
84 |         return nnet_output
85 | 
86 |     def get_objective_derivative(self, output, target):
87 |         if self.use_softmax:
88 |             return output - target
89 |         else:
90 |             raise ValueError
91 | 
92 | 
93 |     def backward(self, loss_deriv, instance_node):
94 | 
95 |         # Run the derivative through the NN first, then through the MLSL
96 | 
97 |         nn_deriv = self.nnet.backward_adadelta(loss_deriv)
98 | 
99 |         deriv = nn_deriv[:self.mlsl_output_size]
100 | 
101 |         self.mlsl._compute_backward_gradients(instance_node, deriv, 0)
102 |         self.mlsl._compute_LSTM_updates(instance_node, 0)
103 |         # updating the weights of the LSTM modules and
104 |         # updating momentum_dW of the LSTM modules with the sums of dWs
105 |         # and the other variables for adadelta;
106 |         # these momentum/adadelta specific updates happen regardless of whether we use steady rate, momentum, or adadelta;
107 |         # if we use a steady rate those variables play no role in the computation of dW
108 |         for d in range(self.mlsl.max_depth):
109 |             self.mlsl.lstm_stack[d].WLSTM += self.mlsl.sum_of_dWs[d] / self.mlsl.number_of_nodes_per_level[d]
110 |             self.mlsl.lstm_stack[d].momentum_dW = self.mlsl.sum_of_dWs[d] / self.mlsl.number_of_nodes_per_level[d]
111 |             self.mlsl.lstm_stack[d].tot_gradient_weight = self.mlsl.sum_tot_gradient_weight[d] / self.mlsl.number_of_nodes_per_level[d]
112 |             self.mlsl.lstm_stack[d].tot_sq_gradient = self.mlsl.sum_tot_sq_gradient[d] / self.mlsl.number_of_nodes_per_level[d]
113 |             self.mlsl.lstm_stack[d].tot_delta_weight = self.mlsl.sum_tot_delta_weight[d] / self.mlsl.number_of_nodes_per_level[d]
114 |             self.mlsl.lstm_stack[d].tot_sq_delta = self.mlsl.sum_tot_sq_delta[d] / self.mlsl.number_of_nodes_per_level[d]
115 | 
116 | 
117 |     def run_through_the_model(self, instance_node, target, additional_input_to_nn):
118 |         self.mlsl._reset_learning_parameters()
119 |         return self.backward(self.get_objective_derivative(self.forward(instance_node, additional_input_to_nn, target), target), instance_node)
120 | 
--------------------------------------------------------------------------------
/json_plus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Copyright 2014 Camiolog Inc.
4 | # Authors: Luca de Alfaro and Massimo Di Pierro
5 | 
6 | import base64
7 | import datetime
8 | import importlib
9 | import json
10 | import numbers
11 | import numpy
12 | import unittest
13 | import collections
14 | 
15 | 
16 | fallback = {}
17 | remapper = {}
18 | 
19 | 
20 | class Storage(dict):
21 |     __getattr__ = dict.__getitem__
22 |     __setattr__ = dict.__setitem__
23 | 
24 | 
25 | def smartcmp(a, b,
26 |              types=(int, long, basestring, float, bool, tuple)):
27 |     is_a_primitive = isinstance(a[1], types)
28 |     is_b_primitive = isinstance(b[1], types)
29 |     if is_a_primitive and not is_b_primitive:
30 |         return -1
31 |     elif not is_a_primitive and is_b_primitive:
32 |         return +1
33 |     else:
34 |         return cmp(a[0], b[0])
35 | 
36 | 
37 | class Serializable(object):
38 | 
39 |     # We mimic a dict.
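    # A minimal usage sketch of the dict-like interface (illustration only,
    # not part of the original file):
    #     s = Serializable()
    #     s.x = 1            # equivalent to s['x'] = 1
    #     'x' in s           # True; keys(), items(), len() all act on __dict__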
40 | def __getitem__(self, key): 41 | return getattr(self, key) 42 | def __setitem__(self, key, value): 43 | setattr(self, key, value) 44 | def __delitem__(self, key): 45 | del self.__dict__[key] 46 | def keys(self): 47 | return self.__dict__.keys() 48 | def items(self): 49 | return self.__dict__.items() 50 | def values(self): 51 | return self.__dict__.values() 52 | def update(self, d): 53 | self.__dict__.update(d) 54 | def __len__(self): 55 | return len(self.__dict__) 56 | def __contains__(self, item): 57 | return item in self.__dict__ 58 | def iteritems(self): 59 | return iter(self.__dict__.items()) 60 | def __repr__(self): 61 | return repr(self.__dict__) 62 | 63 | def get(self, k, d=None): 64 | try: 65 | return getattr(self, k) 66 | except AttributeError: 67 | return d 68 | 69 | def __eq__(self, other): 70 | return hasattr(other, '__dict__') and self.__dict__ == other.__dict__ 71 | 72 | def to_json(self, pack_ndarray=True, tolerant=True, indent=2): 73 | return Serializable.dumps(self, pack_ndarray=pack_ndarray, tolerant=tolerant, indent=indent) 74 | 75 | @staticmethod 76 | def dump(obj, fp, pack_ndarray=True, tolerant=True, indent=2): 77 | return fp.write(Serializable.dumps(obj, pack_ndarray=pack_ndarray, tolerant=tolerant, indent=indent)) 78 | 79 | @staticmethod 80 | def dumps(obj, pack_ndarray=True, tolerant=True, indent=2): 81 | def custom(o): 82 | if isinstance(o, Serializable): 83 | module = o.__class__.__module__.split('campil.')[-1] 84 | # make sure keys are sorted 85 | d = collections.OrderedDict() 86 | d['meta_class'] = '%s.%s' % (module, o.__class__.__name__) 87 | d.update(sorted((item for item in o.__dict__.iteritems() 88 | if not item[0].startswith('_')), smartcmp)) 89 | return d 90 | elif isinstance(o, datetime.datetime): 91 | d = {'meta_class': 'datetime.datetime', 92 | 'date': o.isoformat()} 93 | return d 94 | elif isinstance(o, set): 95 | d = {'meta_class': 'set', 96 | 'set': list(o)} 97 | return d 98 | elif isinstance(o, file): 99 | return '' % o.name 100 | 101 | elif pack_ndarray and isinstance(o, numpy.matrix): 102 | # This catches both numpy arrays, and CamArray. 103 | d = {'meta_class': 'numpy.matrix', 104 | 'dtype': str(o.dtype), 105 | 'shape': o.shape, 106 | 'data': base64.b64encode(o.tostring())} 107 | return d 108 | 109 | elif pack_ndarray and isinstance(o, numpy.ndarray): 110 | # This catches both numpy arrays, and CamArray. 111 | d = {'meta_class': 'numpy.ndarray', 112 | 'dtype': str(o.dtype), 113 | 'shape': o.shape, 114 | 'data': base64.b64encode(o.tostring())} 115 | return d 116 | 117 | # We try to preserve numpy numbers. 
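        # A numpy scalar such as numpy.float32(1.5) (example value) is encoded as a
        # dict with meta_class 'numpy.number', its dtype string, and the base64 of
        # its raw bytes, so the exact dtype survives a dumps/loads round trip; the
        # 'numpy.number' branch of the decoding hook below rebuilds it with frombuffer.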
118 | elif type(o).__module__ == numpy.__name__ and isinstance(o, numbers.Real): 119 | d = {'meta_class': 'numpy.number', 120 | 'dtype': str(o.dtype), 121 | 'data': base64.b64encode(o.tostring()) 122 | } 123 | return d 124 | 125 | # Normal Python types are unchanged 126 | elif isinstance(o, (int, long, basestring, float, bool, list, tuple)): 127 | return o 128 | # except dictionaries which are sorted 129 | elif isinstance(o, dict): 130 | d = collections.OrderedDict() 131 | d.update(sorted((item for item in o.iteritems()), smartcmp)) 132 | return d 133 | # These two defaults are catch-all 134 | elif isinstance(o, numbers.Integral): 135 | return int(o) 136 | elif isinstance(o, numbers.Real): 137 | return float(o) 138 | elif isinstance(o, (numpy.bool, numpy.bool_)): 139 | return bool(o) 140 | elif tolerant: 141 | return None 142 | else: 143 | raise ValueError("Cannot encode in json object %r" % o) 144 | return json.dumps(obj, default=custom, indent=indent) 145 | 146 | @staticmethod 147 | def from_json(s, objectify=True, mapper={}): 148 | """Decodes json_plus. 149 | @param s : the string to decode 150 | @param objectify : If True, reconstructs the object hierarchy. 151 | @param mapper : 152 | - If a dictonary, then the key classes are replaced by the value classes in the 153 | decoding. 154 | - If a class, then all objects that are not dates or numpy classes are decoded to 155 | this class. 156 | - If None, then all objects that are not dates or numpy classes are decoded to 157 | json_plus.Serializable.""" 158 | def hook(o): 159 | meta_module, meta_class = None, o.get('meta_class') 160 | if meta_class in ('Datetime', 'datetime.datetime'): 161 | # 'Datetime' included for backward compatibility 162 | try: 163 | tmp = datetime.datetime.strptime( 164 | o['date'], '%Y-%m-%dT%H:%M:%S.%f') 165 | except Exception, e: 166 | tmp = datetime.datetime.strptime( 167 | o['date'], '%Y-%m-%dT%H:%M:%S') 168 | return tmp 169 | elif meta_class == 'set': 170 | return set(o['set']) 171 | # Numpy arrays. 172 | elif meta_class == 'numpy.ndarray': 173 | data = base64.b64decode(o['data']) 174 | dtype = o['dtype'] 175 | shape = o['shape'] 176 | v = numpy.frombuffer(data, dtype=dtype) 177 | v = v.reshape(shape) 178 | obj = v.copy() 179 | obj.flags.writeable = True 180 | return obj 181 | elif meta_class == 'numpy.matrix': 182 | data = base64.b64decode(o['data']) 183 | dtype = o['dtype'] 184 | shape = o['shape'] 185 | v = numpy.frombuffer(data, dtype=dtype) 186 | v = v.reshape(shape) 187 | obj = numpy.matrix(v.copy()) 188 | obj.flags.writeable = True 189 | return obj 190 | # Numpy numbers. 191 | elif meta_class == 'numpy.number': 192 | data = base64.b64decode(o['data']) 193 | dtype = o['dtype'] 194 | v = numpy.frombuffer(data, dtype=dtype)[0] 195 | return v 196 | 197 | elif meta_class and '.' 
in meta_class: 198 | # correct for classes that have migrated from one module to another 199 | meta_class = mapper.get(meta_class, meta_class) 200 | meta_class = remapper.get(meta_class, meta_class) 201 | # separate the module name from the actual class name 202 | meta_module, meta_class = meta_class.rsplit('.',1) 203 | 204 | if meta_class is not None: 205 | del o['meta_class'] 206 | if mapper is None: 207 | obj = Serializable() 208 | obj.__dict__.update(o) 209 | o = obj 210 | elif isinstance(mapper, dict): 211 | # this option is for backward compatibility in case a module is not specified 212 | if meta_class in fallback: 213 | meta_module = fallback.get(meta_class) 214 | 215 | if meta_module is not None and objectify: 216 | try: 217 | module = importlib.import_module(meta_module) 218 | cls = getattr(module, meta_class) 219 | obj = cls() 220 | obj.__dict__.update(o) 221 | o = obj 222 | except Exception, e: 223 | # If an object is unknown, restores it as a member 224 | # of this same class. 225 | obj = Serializable() 226 | obj.__dict__.update(o) 227 | o = obj 228 | else: 229 | # Map all to the specified class. 230 | obj = mapper() 231 | obj.__dict__.update(o) 232 | o = obj 233 | elif type(o).__name__ == 'dict': 234 | # For convenience we deserialize dict into Storage. 235 | o = Storage(o) 236 | return o 237 | 238 | return json.loads(s, object_hook=hook) 239 | 240 | @staticmethod 241 | def loads(s): 242 | return Serializable.from_json(s) 243 | 244 | @staticmethod 245 | def load(fp): 246 | return Serializable.loads(fp.read()) 247 | 248 | 249 | loads = Serializable.loads 250 | dumps = Serializable.dumps 251 | 252 | class TestSerializable(unittest.TestCase): 253 | 254 | def test_simple(self): 255 | a = Serializable() 256 | a.x = 1 257 | a.y = 'test' 258 | a.z = 3.14 259 | b = Serializable.from_json(a.to_json()) 260 | self.assertEqual(a, b) 261 | 262 | def test_datetime(self): 263 | a = Serializable() 264 | a.x = datetime.datetime(2015,1,3) 265 | b = Serializable.from_json(a.to_json()) 266 | self.assertEqual(a, b) 267 | 268 | def test_recursive(self): 269 | a = Serializable() 270 | a.x = Serializable() 271 | a.x.y = 'test' 272 | b = Serializable.from_json(a.to_json()) 273 | self.assertEqual(a, b) 274 | 275 | def test_numpy(self): 276 | a = Serializable() 277 | a.x = numpy.array([[1,2,3],[4,5,6]], dtype=numpy.int32) 278 | b = Serializable.from_json(a.to_json(pack_ndarray=True)) 279 | self.assertEqual(numpy.sum(numpy.abs(a.x - b.x)), 0) 280 | 281 | def test_numpy_twice(self): 282 | a = Serializable() 283 | a.x = numpy.array([[1,2,3],[4,5,6]], dtype=numpy.int32) 284 | b = Serializable.from_json(a.to_json(pack_ndarray=True)) 285 | self.assertEqual(numpy.sum(numpy.abs(a.x - b.x)), 0) 286 | c = Serializable.from_json(b.to_json(pack_ndarray=True)) 287 | self.assertEqual(numpy.sum(numpy.abs(a.x - c.x)), 0) 288 | 289 | def test_numpy_direct(self): 290 | a = numpy.array([[1,2,3],[4,5,6]], dtype=numpy.int32) 291 | s = Serializable.dumps(a, pack_ndarray=True) 292 | c = Serializable.from_json(s) 293 | self.assertEqual(numpy.sum(numpy.abs(a - c)), 0) 294 | 295 | def test_float(self): 296 | x = numpy.float16(3.5) 297 | y = Serializable.from_json(Serializable.dumps(x)) 298 | self.assertAlmostEqual(y, x, 2) 299 | 300 | def test_numpy_uint32(self): 301 | x = numpy.uint32(55) 302 | s = Serializable.dumps(x) 303 | y = Serializable.from_json(s) 304 | self.assertEqual(x, y) 305 | self.assertEqual(str(x.dtype), 'uint32') 306 | self.assertEqual(str(y.dtype), 'uint32') 307 | 308 | def test_numpy_float128(self): 309 | x 
= numpy.float128(55.3)
310 |         s = Serializable.dumps(x)
311 |         y = Serializable.from_json(s)
312 |         self.assertAlmostEqual(x, y, 5)
313 |         self.assertEqual(str(x.dtype), 'float128')
314 |         self.assertEqual(str(y.dtype), 'float128')
315 | 
316 |     def test_set(self):
317 |         s = set(['a', 'b', 'c'])
318 |         x = Serializable.dumps(s)
319 |         t = Serializable.loads(x)
320 |         self.assertEqual(s, t)
321 | 
322 |     def test_multiple_dicts(self):
323 |         d = dict(cane=4, gatto=4, uccello=2)
324 |         d1 = Serializable.loads(Serializable.dumps(d))
325 |         d2 = Serializable.loads(Serializable.dumps(d1))
326 |         for k in d.keys():
327 |             self.assertEqual(d.get(k), d2.get(k))
328 |         for k in d2.keys():
329 |             self.assertEqual(d.get(k), d2.get(k))
330 | 
331 |     def test_modifiable(self):
332 |         a = numpy.zeros((10,10))
333 |         b = loads(dumps(a))
334 |         a[2:4, 5:6] = 1
335 |         b[2:4, 5:6] = 1
336 |         self.assertEqual(numpy.sum(numpy.abs(a - b)), 0)
337 | 
338 |     def test_matrices(self):
339 |         a = numpy.matrix(numpy.ones((4,5)))
340 |         b = numpy.matrix(numpy.ones((5, 6)))
341 |         ab = a * b
342 |         # print "Serialization:", dumps(a)
343 |         aa = loads(dumps(a))
344 |         bb = loads(dumps(b))
345 |         # print "Deserialized types:", type(aa), type(bb)
346 |         aabb = aa * bb
347 |         self.assertEqual(numpy.sum(numpy.abs(ab - aabb)), 0)
348 | 
349 | if __name__ == '__main__':
350 |     unittest.main()
351 | 
--------------------------------------------------------------------------------
/test_MLSL.py:
--------------------------------------------------------------------------------
1 | # This code is useful for testing the MLSL package.
2 | 
3 | import test_bipartite_user_item_reviews as bp
4 | import multi_level_lstm as ml
5 | import random
6 | import numpy as np
7 | import unittest
8 | 
9 | def test_model(mlsl_model, test_set):
10 |     guesses = 0
11 |     hits = 0
12 |     found = {}
13 |     missed = {}
14 |     misclassified = {}
15 |     for item in test_set:
16 |         Y = softmax(mlsl_model.forward_propagation(item))
17 |         if Y is None:
18 |             continue
19 |         print Y
20 |         predicted_label = Y.argmax()
21 |         real_label = item.get_label()
22 |         print "Predicted label ", predicted_label, " real label", real_label
23 |         guesses += 1
24 |         hits += 1 if predicted_label == real_label else 0
25 |         if predicted_label == real_label:
26 |             if real_label not in found:
27 |                 found[real_label] = 1
28 |             else:
29 |                 found[real_label] += 1
30 |         if predicted_label != real_label:
31 |             if real_label not in missed:
32 |                 missed[real_label] = 1
33 |             else:
34 |                 missed[real_label] += 1
35 |             if predicted_label not in misclassified:
36 |                 misclassified[predicted_label] = 1
37 |             else:
38 |                 misclassified[predicted_label] += 1
39 |     print "LSTM results"
40 |     print "============================================================="
41 |     print "Predicted correctly ", hits, "over ", guesses, " instances."
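    # The per-label metrics computed below follow the usual definitions:
    #     recall(label)    = found[label] / (found[label] + missed[label])
    #     precision(label) = found[label] / (found[label] + misclassified[label])
    # where missed counts items of this label predicted as something else, and
    # misclassified counts items of other labels predicted as this label.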
42 | recall_list = [] 43 | recall_dict = {} 44 | precision_dict = {} 45 | found_labels = set(found.keys()) 46 | missed_labels = set(missed.keys()) 47 | all_labels = found_labels.union(missed_labels) 48 | for label in all_labels: 49 | no_of_finds = float((0 if label not in found else found[label])) 50 | no_of_missed = float((0 if label not in missed else missed[label])) 51 | no_of_misclassified = float((0 if label not in misclassified else misclassified[label])) 52 | recall = no_of_finds / (no_of_finds + no_of_missed) 53 | precision = no_of_finds / (no_of_finds + no_of_misclassified) 54 | recall_dict[label] = recall 55 | precision_dict[label] = precision 56 | recall_list.append(recall) 57 | avg_recall = np.mean(recall_list) 58 | print "Average recall ", np.mean(recall_list) 59 | if len(all_labels) == 2: # compute F-1 score for binary classification 60 | for label in all_labels: 61 | print "F-1 score for label ", label, " is : ", 62 | print 2 * (precision_dict[label] * recall_dict[label]) / (precision_dict[label] + recall_dict[label]) 63 | return avg_recall 64 | 65 | def create_synthetic_graph_with_informative_extra_feature(no_of_items, no_of_users, no_of_votes, min_grade, max_grade, min_delay, max_delay, threshold, exclude_true_grade_from_random_answers = False, seed = 500): 66 | """ 67 | Creates random graph. 68 | Adds extra feature in review: if value is above threshold, the user performing the review is always truthful. 69 | Intended for testing the MLSL network across multiple levels by feeding extra information selectively to particular levels. 70 | When exclude_true_grade_from_random_answers is True the true grade is excluded from the possible random grades, i.e. the random review grade is alway not the true grade. 71 | 72 | """ 73 | g = bp.Graph() 74 | random.seed(seed) 75 | # add items and their inherent random grades to the graph 76 | # inherent grades are uniformly distributed between min_grade and max_grade 77 | item_ids = [str(i) for i in range(no_of_items)] 78 | for i in item_ids: 79 | g.add_item(i, random.randint(min_grade,max_grade)) 80 | # item_ids is a list that contains the ids of items that are still alive, i.e. 81 | # the ones that have received less than no_of_votes votes. 82 | # Initially before users cast their votes, the list contains all items. 
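    # Sketch of the voting loop below: each user draws a single extra_informative
    # value uniformly in [min_delay, max_delay]; if it exceeds the threshold, all
    # of that user's reviews report the item's inherent grade, otherwise they
    # report a randomly chosen grade.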
83 | user_ids = [str(u) for u in range(no_of_users)] 84 | for u in user_ids: # iteration over users and random picking of items to vote 85 | user_votes = 0 86 | items_complete_with_votes = [] 87 | # shuffling items so that user picks items to vote randomly 88 | random.shuffle(item_ids) 89 | extra_informative = random.randint(min_delay, max_delay) 90 | for i in item_ids: 91 | # if item has over the max number of votes or user has already voted on it 92 | # then continue to next item 93 | if g.get_no_of_votes(i) >= no_of_votes: 94 | items_complete_with_votes.append(i) 95 | continue 96 | if g.has_voted(u,i): 97 | continue 98 | # all clear to create review for item 99 | possible_random_grades = range(min_grade, max_grade + 1) 100 | if exclude_true_grade_from_random_answers: 101 | possible_random_grades.remove(g.get_item(i).inherent) 102 | review = bp.Review( grade = g.get_item(i).inherent if extra_informative>threshold else random.choice(possible_random_grades), extra_informative_feature = extra_informative) 103 | g.add_review(u,i,review) 104 | user_votes += 1 105 | # if item has exceeded max number of votes add it to the list for removal 106 | if g.get_no_of_votes(i) >= no_of_votes: 107 | items_complete_with_votes.append(i) 108 | # if user has cast more than max number of votes, break loop and continue to next user 109 | if user_votes == no_of_votes: 110 | break 111 | # remove items with more than max number of votes from item_ids list to avoid iterating over them in the future (makes execution faster) 112 | for i in items_complete_with_votes: 113 | item_ids.remove(i) 114 | return g 115 | 116 | 117 | def test_for_multiple_layers(print_graph = False, max_depth = 0, informative_features = None): 118 | votes = 3 119 | g = create_synthetic_graph_with_informative_extra_feature(no_of_items = 3000, no_of_users = 3000, 120 | no_of_votes = votes, min_grade = 0, max_grade = 10, 121 | min_delay= 0.0, max_delay = 1000.0, 122 | threshold = 700.0) 123 | random.seed(940) 124 | itemList = list(g.items) 125 | if print_graph: 126 | for u in g.users: 127 | print "User ", u.name, "voted items:" 128 | for i in u.reviews: 129 | print "Item ", i.id, "Inherent grade:", i.inherent, "User grade:", u.reviews[i].grade, "Extra feature: ", u.reviews[i].extra_informative_feature 130 | instance_list = [] 131 | counter = 0 132 | for i in itemList: 133 | new_root = ml.InstanceNode(label = i.inherent) 134 | build_unfolding(0, max_depth, i, new_root, informative_features) 135 | new_root.set_label(i.inherent) 136 | instance_list.append(new_root) 137 | counter +=1 138 | if counter % 200 ==0: 139 | print "Created unfolding for ", counter, "items." 
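    # Configuration sketch for the run below: the 11 base input features per level
    # are a one-hot encoding of the review grade (grades 0..10); for each level
    # marked "include", one extra informative feature is appended to that level's input.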
140 | OUTPUT_SIZES = [11, 2, 2] 141 | INPUT_SIZES = [11 + (1 if informative_features[0] == "include" else 0),11 + (1 if informative_features[1] == "include" else 0), 142 | 11 + (1 if informative_features[2] == "include" else 0)] 143 | LEARNING_RATE_VECTOR = [0.05,0.1, 4.5] 144 | LEARNING_METHOD_VECTOR = ["steady_rate", "steady_rate","steady_rate"] 145 | #LEARNING_METHOD_VECTOR = ["momentum", "momentum", "momentum"] 146 | #LEARNING_METHOD_VECTOR = ["adadelta", "adadelta", "adadelta"] 147 | MOMENTUM_VECTOR = [0.01, 0.01, 0.01] 148 | ADADELTA_VECTOR = [{"learning_factor" : 1.0, "epsilon" : 0.001, "decay" : 0.95}, {"learning_factor" : 1.0, "epsilon" : 0.001, "decay" : 0.95}, {"learning_factor" : 1.0, "epsilon" : 0.001, "decay" : 0.95}] 149 | OBJECTIVE_FUNCTION = "softmax_classification" 150 | mlsl_model = ml.MLSL(max_depth + 1, output_sizes= OUTPUT_SIZES[:max_depth + 1], node_feature_sizes= INPUT_SIZES[:max_depth + 1], learning_rate_vector= LEARNING_RATE_VECTOR[:max_depth + 1], learning_method_vector= LEARNING_METHOD_VECTOR[:max_depth + 1]) 151 | random.shuffle(instance_list) 152 | training_set = instance_list[0:2000] 153 | test_set = instance_list[2000:3000] 154 | print "Training starts for ", max_depth + 1, " levels" 155 | train_model_force_balance(mlsl_model, training_set, num_instances = 50000, 156 | max_depth= max_depth, objective_function= OBJECTIVE_FUNCTION, 157 | learning_rate_vector= LEARNING_RATE_VECTOR, learning_method_vector = LEARNING_METHOD_VECTOR, 158 | momentum_vector= MOMENTUM_VECTOR, adadelta_parameters = ADADELTA_VECTOR) 159 | return test_model(mlsl_model, test_set) 160 | 161 | def build_unfolding(current_depth, max_depth, bipartite_node, tree_node, informative_features = None, parent_user_informative_feature = None): 162 | for c in bipartite_node.reviews: 163 | number_of_features = 12 if informative_features[current_depth] == "include" else 11 164 | feature_vector = np.zeros(number_of_features) 165 | feature_vector[bipartite_node.reviews[c].grade] = 1.0 166 | if current_depth == 0: # nake explicit honesty feature to feed to the 3rd level for 3 level training test 167 | if bipartite_node.inherent == bipartite_node.reviews[c].grade: 168 | honesty = 1.0 169 | else: 170 | honesty = 0.0 171 | extra_feature = bipartite_node.reviews[c].extra_informative_feature / 1000.0 172 | if number_of_features == 12: 173 | feature_vector[11] = extra_feature if current_depth < 2 else parent_user_informative_feature 174 | child_node = ml.InstanceNode(feature_vector = feature_vector.copy()) 175 | if current_depth < max_depth: 176 | build_unfolding(current_depth + 1, max_depth, bipartite_node = c, tree_node= child_node, informative_features = informative_features, 177 | parent_user_informative_feature = honesty if current_depth == 0 else parent_user_informative_feature) 178 | tree_node.children.append(child_node) 179 | 180 | 181 | """ 182 | trains MLSL with stochastic gradient descent 183 | by imposing class balance, i.e. 
shows equal number of examples of all classes to the network during training 184 | """ 185 | def train_model_force_balance(mlsl_model, training_set, num_instances, max_depth, objective_function, learning_rate_vector, learning_method_vector, momentum_vector = None, adadelta_parameters = None): 186 | counter = 0 187 | if num_instances == 0: 188 | return 189 | for item in get_balanced_training_set(training_set, mlsl_model.output_sizes[0]): 190 | if item.get_number_of_children() == 0: 191 | continue 192 | target = np.zeros((1,mlsl_model.output_sizes[0])) 193 | target[0,item.get_label()] = 1.0 194 | mlsl_model._reset_learning_parameters() 195 | Y = softmax(mlsl_model.forward_propagation(item)) 196 | mlsl_model.backward_propagation(item, Y - target) 197 | counter += 1 198 | if counter % 1000 == 0: 199 | print "Training has gone over", counter, " instances.." 200 | if counter == num_instances: 201 | break 202 | 203 | def softmax(w, t = 1.0): 204 | e = np.exp(np.array(w) / t) 205 | dist = e / np.sum(e) 206 | return dist 207 | 208 | """ Generator that returns items from training set 209 | equally balanced among classes""" 210 | def get_balanced_training_set(training_set, no_of_classes): 211 | # make bucket of classes to sample from 212 | buckets = {} 213 | buckets_current_indexes ={} 214 | for i in range(0, no_of_classes): 215 | buckets[i] = [] 216 | buckets_current_indexes[i] = 0 217 | for item in training_set: 218 | category = item.get_label() 219 | buckets[category].append(item) 220 | while True: 221 | for i in range(0,no_of_classes): 222 | if len(buckets[i]) == 0: # if a class has no representatives, continue 223 | continue 224 | if buckets_current_indexes[i] == len(buckets[i]): 225 | buckets_current_indexes[i] = 0 226 | yield buckets[i][buckets_current_indexes[i]] 227 | buckets_current_indexes[i] += 1 228 | 229 | 230 | class SimpleLearningTest(unittest.TestCase): 231 | 232 | def test_graph(self): 233 | # test for 1 level 234 | # by changing 'exclude' to 'includde' we include the informative feature 235 | # and expect performance to improve 236 | first_level_performance = test_for_multiple_layers(print_graph= False, max_depth = 0, informative_features = ["exclude", "NA", "NA"]) 237 | first_level_additional_feature_performance = test_for_multiple_layers(print_graph= False, max_depth = 0, informative_features = ["include","NA","NA"]) 238 | 239 | # test for 2 levels 240 | # the 2 level (max_depth = 1) beats the 1 level as it can learn the informative feature at the second level 241 | second_level_no_additional_performance = test_for_multiple_layers(print_graph= False, max_depth = 1, informative_features = ["exclude", "exclude", "NA"]) 242 | second_level_additional_performance = test_for_multiple_layers(print_graph= False, max_depth = 1, informative_features = ["exclude", "include", "NA"]) 243 | 244 | # test for 3 levels 245 | third_level_additional_performance = test_for_multiple_layers(print_graph= False, max_depth = 2, informative_features = ["exclude", "exclude", "include"]) 246 | 247 | print "\n\n\nAggregate performance comparison, test results" 248 | print "----------------------------------------------" 249 | print "1-MLSL performance, no additional informative feature : ", first_level_performance 250 | print "1-MLSL performance, with additional informative feature :", first_level_additional_feature_performance 251 | print "-----" 252 | print "Additional feature enhances performance -- training OK!" 
if first_level_additional_feature_performance> first_level_performance else "Not OK" 253 | print "-----" 254 | print "2-MLSL performance, no additional informative feature :", second_level_no_additional_performance 255 | print "2-MLSL performance, additional informative feature fed to second level *only* :", second_level_additional_performance 256 | print "-----" 257 | print "Additional feature at second level enhances performance -- second level training OK!" if second_level_additional_performance> second_level_no_additional_performance else "Not OK" 258 | print "-----" 259 | print "3-MLSL performance, additional informative feature on parent user honesty fed to third level *only* :", third_level_additional_performance 260 | print "-----" 261 | print "Additional feature at third level enhances performance -- third level training OK!" if third_level_additional_performance> second_level_no_additional_performance else "Not OK" 262 | print "-----" 263 | self.assertGreater(first_level_additional_feature_performance, first_level_performance,"1 Level training not OK, retrain!") 264 | self.assertGreater(second_level_additional_performance, second_level_no_additional_performance,"2 level training not OK, retrain!") 265 | self.assertGreater(third_level_additional_performance,second_level_no_additional_performance, "3 level training not OK, retrain!") 266 | # If one occasionally gets Not OK results, retrain as the random initiliazation of the weights can sometimes trap the network 267 | 268 | if __name__ == '__main__': 269 | unittest.main() -------------------------------------------------------------------------------- /multi_level_lstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Multi-Level Sequence Learners using LSTMs. 3 | See https://sites.google.com/view/ml-on-structures for overall project page, and papers 4 | that describe these learners. 5 | 6 | Authors: 7 | Rakshit Agrawal 8 | Luca de Alfaro 9 | Vassilis Polychronopoulos 10 | Copyright by the authors, 2016. 11 | """ 12 | 13 | import lstm 14 | import numpy as np 15 | import random 16 | from json_plus import Serializable, Storage 17 | 18 | class UnknownLearningMethod(Exception): 19 | def __init__(self, s): 20 | self.message = s 21 | 22 | 23 | class MLSL(Serializable): 24 | 25 | def __init__(self, max_depth, output_sizes, node_feature_sizes, 26 | learning_rate_vector, learning_method_vector, 27 | shuffle_levels=[], 28 | adadelta_parameters=None, 29 | momentum_vector=None): 30 | """Initializes a multi-level LSTM. 31 | The ML-LSTM has max_depth layers. Layer 0 is the root node. 32 | Layers max_depth - 1 to 0 have LSTMs in them. 33 | Layer max_depth is simply composed of graph nodes, which forward their 34 | features to the LSTMs of level max_depth - 1. 35 | The output of level i consists in the LSTM features computed from the children of i; 36 | it does not contain any features computed from the node at level i itself. 37 | The features of the node at level i will be passed to node at level i-1 along 38 | with the LSTM output. 39 | 40 | @param max_depth: As noted above. 41 | @param node_feature_sizes: How many features are produced by a node, according to its depth. 42 | This can go from 0 to max_depth (included). Be careful: unless e.g. the graph is 43 | bipartite, you need to use the same number throughout. 44 | @param output_sizes: How many features are produced by LSTMs at different depth. This 45 | does not need to be constant. 
46 | @param learning_rate_vector: Vector of learning rates. 47 | @param learning_method_vector: Vector of learning methods. It can be None, in which case 48 | adadelta is used, or it can be a vector consisting of 'adadelta' or 'momentum' 49 | or 'steady_rate' (the latter is not recommended) for each layer. 50 | @param momentum_vector: vector containing momentums for learning. It can be None if 51 | adadelta is used. 52 | @param adadelta_parameters: vector of adadelta parameters. It can be None if momentum 53 | learning is used. 54 | @param shuffle_children: a list (or set) of depths at which shuffling is to occur. 55 | """ 56 | # First, some sanity checks. 57 | assert max_depth > 0 58 | assert len(output_sizes) == max_depth 59 | assert len(node_feature_sizes) == max_depth 60 | assert len(learning_method_vector) == max_depth 61 | assert adadelta_parameters is None or len(adadelta_parameters) == max_depth 62 | assert adadelta_parameters is not None or all(m != 'adadelta' for m in learning_method_vector) 63 | assert momentum_vector is None or len(momentum_vector) == max_depth 64 | assert momentum_vector is not None or all(m == 'steady_rate' for m in learning_method_vector) 65 | #assert [i < max_depth for i in shuffle_levels] 66 | 67 | self.output_sizes = output_sizes 68 | self.node_feature_sizes = node_feature_sizes 69 | self.max_depth = max_depth 70 | self.learning_rate_vector = learning_rate_vector 71 | self.learning_method_vector = learning_method_vector 72 | self.adadelta_parameters = adadelta_parameters 73 | self.momentum_vector = momentum_vector 74 | self.shuffle_levels = shuffle_levels 75 | 76 | # Creates the list of LSTMs, one per level. 77 | self.lstm_stack = [lstm.LSTM() for _ in range(max_depth)] 78 | for l in range(max_depth): 79 | self.lstm_stack[l].initialize( 80 | node_feature_sizes[l] + (0 if l == max_depth - 1 else output_sizes[l + 1]), 81 | output_sizes[l]) 82 | 83 | # we need the following structures, when training with momentum and/or adadelta, 84 | # to keep track of the sum of dW at each level in order to update the momentum_dW 85 | # or the adadelta parameters of the respective LSTM modules. 86 | self.number_of_nodes_per_level = None 87 | self.sum_of_dWs = None 88 | self.sum_tot_sq_gradient = None 89 | self.sum_tot_gradient_weight = None 90 | self.sum_tot_sq_delta = None 91 | self.sum_tot_delta_weight = None 92 | 93 | 94 | def forward_propagation(self, instance_node, instance_depth=0): 95 | """Performs forward propagation through the multi-level LSTM structure. 96 | The node instance_node at depth instance_depth is propagated. 97 | The node should be an object of class InstanceNode.""" 98 | # Prepares for back-propagation. 99 | self._reset_learning_parameters() 100 | input_sequence = np.array([]) 101 | children_sequence = list(instance_node.get_children()) 102 | if len(children_sequence) == 0: 103 | # FIXME We should really have a feature that describes the number of children. 104 | # This loses any data that might be associated with the node itself. 105 | return -100 * np.ones(self.output_sizes[instance_depth]) # no children signifier vector 106 | if instance_depth in self.shuffle_levels: 107 | # Shuffles children order if required. 108 | random.shuffle(children_sequence) 109 | for child_node in children_sequence: 110 | child_node_feature_vector = child_node.get_feature_vector() 111 | assert len(child_node_feature_vector) == self.node_feature_sizes[instance_depth] 112 | # If we are not at the very bottom we need to get input from LSTM at the next level. 
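            # When output from the level below is present, the full input row for a
            # child at depth d has size output_sizes[d + 1] + node_feature_sizes[d];
            # at the bottom level it is just node_feature_sizes[d]. This matches the
            # LSTM input sizes set up in __init__.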
113 | LSTM_output_from_below = np.array([]) 114 | if instance_depth < self.max_depth - 1: 115 | LSTM_output_from_below = self.forward_propagation(child_node, instance_depth=instance_depth + 1).reshape( 116 | self.output_sizes[instance_depth + 1]) # recursive call 117 | # concatenate feature vector and input from LSTM output below 118 | full_feature_vector = np.concatenate((LSTM_output_from_below, child_node_feature_vector)) 119 | # concatenate current feature vector to input sequence for the LSTM 120 | # TODO: This is very confusing; can you change this to use row and column stacking? 121 | input_sequence = np.concatenate((input_sequence, full_feature_vector)) 122 | # forward the input sequence to this depth's LSTM 123 | input_sequence = input_sequence.reshape(len(children_sequence), 1, len(full_feature_vector)) 124 | _, _, Y, cache = self.lstm_stack[instance_depth]._forward(input_sequence) 125 | # We store the state of the LSTM, so we can use it for back-propagation. 126 | instance_node.cache.lstm_cache = cache 127 | # we also need to save the sequence in the same order we used it. 128 | instance_node.children_sequence = children_sequence 129 | return Y 130 | 131 | 132 | def backward_propagation(self, instance_node, derivative, instance_depth=0): 133 | """Performs backward propagation, given a loss derivative for the outputs.""" 134 | # First, we backpropagate through the layers the backward gradient. 135 | self._compute_backward_gradients(instance_node, derivative, instance_depth) 136 | # Second, we compute (but we do not apply) the update at all layers 137 | # of the MLSL. We don't apply it because at every layer, there are in 138 | # general multiple instances of an LSTM, and we will have to add all the 139 | # updates for an LSTM at the same level before applying them. 140 | self._compute_LSTM_updates(instance_node, instance_depth) 141 | # Finally, once the updates have been computed, it applies them 142 | # to all the levels of the LSTM. 143 | self._apply_LSTM_updates() 144 | 145 | 146 | def _reset_learning_parameters(self): 147 | """This function should be called before any learning step.""" 148 | self.number_of_nodes_per_level = [0 for _ in range(self.max_depth + 1)] 149 | self.sum_of_dWs = [0.0 for _ in range(self.max_depth)] 150 | self.sum_tot_sq_gradient = [0.0 for _ in range(self.max_depth)] 151 | self.sum_tot_gradient_weight = [0.0 for _ in range(self.max_depth)] 152 | self.sum_tot_sq_delta = [0.0 for _ in range(self.max_depth)] 153 | self.sum_tot_delta_weight = [0.0 for _ in range(self.max_depth)] 154 | 155 | 156 | def _compute_backward_gradients(self, instance_node, derivative, instance_depth): 157 | """Recursive function to compute the backward gradients at all levels 158 | of the MLSL. 
The gradients are left in instance_node.cache.weight_gradient.""" 159 | dX, g, _, _ = self.lstm_stack[instance_depth].backward_return_vector_no_update( 160 | d = derivative, cache = instance_node.cache.lstm_cache) 161 | instance_node.cache.weight_gradient = g 162 | if instance_depth == self.max_depth: 163 | return 164 | for idx, item in enumerate(instance_node.children_sequence): 165 | if item.cache == {}: 166 | continue 167 | input_derivatives = dX[idx, :, 0:self.output_sizes[instance_depth + 1]] 168 | if instance_depth < self.max_depth: 169 | feature_derivatives = dX[idx, :, self.output_sizes[instance_depth + 1]:] 170 | else: 171 | feature_derivatives = dX[idx, :, :] 172 | instance_node.children_sequence[idx].gradient = feature_derivatives 173 | self._compute_backward_gradients(item, input_derivatives, instance_depth + 1) 174 | 175 | 176 | def _compute_LSTM_updates(self, instance_node, current_depth): 177 | """Computes the update to the LSTM coefficients, recurrently down 178 | the tree of nodes.""" 179 | # First, computes the update for the current node. 180 | method = self.learning_method_vector[current_depth] 181 | if method == "steady_rate": 182 | self._compute_update_LSTM_weights_steady_rate(instance_node, current_depth) 183 | elif method == "momentum": 184 | self._compute_update_LSTM_weights_with_momentum(instance_node, current_depth) 185 | elif method == "adadelta": 186 | self._compute_update_LSTM_weights_adadelta(instance_node, current_depth) 187 | else: 188 | raise UnknownLearningMethod(method) 189 | # Then, recurs down the tree. 190 | if current_depth == self.max_depth - 1: 191 | return 192 | for item in instance_node.children_sequence: 193 | self._compute_LSTM_updates(item, current_depth + 1) 194 | 195 | 196 | def _compute_update_LSTM_weights_steady_rate(self, instance_node, current_depth): 197 | """Computes the LSTM weight update at steady rate.""" 198 | if instance_node.cache is not None: 199 | dW = - self.learning_rate_vector[current_depth] * instance_node.cache.weight_gradient 200 | self.sum_of_dWs[current_depth] += dW 201 | self.number_of_nodes_per_level[current_depth] += 1 202 | 203 | 204 | def _compute_update_LSTM_weights_with_momentum(self, instance_node, current_depth): 205 | """Computes the LSTM weight update using momentum.""" 206 | if instance_node.cache is not None: 207 | if self.lstm_stack[current_depth].momentum_dW is None: # initialize momentum of LSTM to zero 208 | self.lstm_stack[current_depth].momentum_dW = np.zeros(self.lstm_stack[current_depth].WLSTM.shape) 209 | dW = (- self.learning_rate_vector[current_depth] * instance_node.cache.weight_gradient 210 | + self.momentum_vector[current_depth] * self.lstm_stack[current_depth].momentum_dW) 211 | self.lstm_stack[current_depth].WLSTM += dW 212 | self.sum_of_dWs[current_depth] += dW 213 | self.number_of_nodes_per_level[current_depth] += 1 214 | 215 | 216 | def _compute_update_LSTM_weights_adadelta(self, instance_node, current_depth): 217 | """Computes the LSTM weight update using adadelta.""" 218 | # obtain adadelta parameters 219 | decay = self.adadelta_parameters[current_depth]["decay"] 220 | epsilon = self.adadelta_parameters[current_depth]["epsilon"] 221 | learning_factor = self.adadelta_parameters[current_depth]["learning_factor"] 222 | # do the adadelta updates 223 | if instance_node.cache is not None: 224 | instance_node.tot_sq_gradient = (self.lstm_stack[current_depth].tot_sq_gradient * decay 225 | + np.sum(np.square(instance_node.cache.weight_gradient))) 226 | instance_node.tot_gradient_weight = 
self.lstm_stack[current_depth].tot_gradient_weight * decay + 1.0 227 | # Computes the speed. 228 | rms_delta = np.sqrt((self.lstm_stack[current_depth].tot_sq_delta + epsilon) 229 | / (self.lstm_stack[current_depth].tot_delta_weight + epsilon)) 230 | rms_gradient = np.sqrt((instance_node.tot_sq_gradient + epsilon) 231 | / (instance_node.tot_gradient_weight + epsilon)) 232 | s = rms_delta / rms_gradient 233 | # Computes the delta. 234 | delta = s * instance_node.cache.weight_gradient 235 | instance_node.tot_sq_delta = self.lstm_stack[current_depth].tot_sq_delta * decay + np.sum(np.square(delta)) 236 | instance_node.tot_delta_weight = self.lstm_stack[current_depth].tot_delta_weight * decay + 1.0 237 | # Finally, updates the weights. 238 | dW = - delta * learning_factor 239 | self.sum_of_dWs[current_depth] += dW 240 | self.number_of_nodes_per_level[current_depth] += 1 241 | self.sum_tot_sq_gradient[current_depth] += instance_node.tot_sq_gradient 242 | self.sum_tot_gradient_weight[current_depth] += instance_node.tot_gradient_weight 243 | self.sum_tot_sq_delta[current_depth] += instance_node.tot_sq_delta 244 | self.sum_tot_delta_weight[current_depth] += instance_node.tot_delta_weight 245 | 246 | 247 | def _apply_LSTM_updates(self): 248 | """Applies the updates that have been computed to the LSTM.""" 249 | for d in range(self.max_depth): 250 | self.lstm_stack[d].WLSTM += self.sum_of_dWs[d] / self.number_of_nodes_per_level[d] 251 | self.lstm_stack[d].momentum_dW = self.sum_of_dWs[d] / self.number_of_nodes_per_level[d] 252 | self.lstm_stack[d].tot_gradient_weight = self.sum_tot_delta_weight[d] / self.number_of_nodes_per_level[d] 253 | self.lstm_stack[d].tot_sq_gradient = self.sum_tot_sq_gradient[d] / self.number_of_nodes_per_level[d] 254 | self.lstm_stack[d].tot_delta_weight = self.sum_tot_delta_weight[d] / self.number_of_nodes_per_level[d] 255 | self.lstm_stack[d].tot_sq_delta = self.sum_tot_sq_delta[d] / self.number_of_nodes_per_level[d] 256 | 257 | 258 | 259 | # the following class represents nodes of the unfoldings 260 | # the MLSL module understands and can train and test on tree instances that are encoded as objects of this class 261 | 262 | class InstanceNode(Serializable): 263 | """In order to use an MLSL, we need to pass to it a tree (tree, NOT dag) 264 | of these InstanceNode. 265 | At the end of the processing, the gradient attribute of each node 266 | will contain the backpropagation of the loss derivative to the feature 267 | vector of the node itself.""" 268 | def __init__(self, feature_vector = None, label = None, id = None): 269 | self.id = id 270 | self.feature_vector = feature_vector 271 | self.label = label 272 | self.children = [] 273 | self.children_sequence = [] # Stores the specific order by which the items were fed into the LSTM to update weights correctly 274 | # The gradient backpropagated at this node will be left here. 275 | # It can be used for further back-propagation as needed. 276 | self.gradient = None 277 | # Here we store intermediate values useful for the processing. 
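        # In particular, forward_propagation stores the LSTM forward state in
        # cache.lstm_cache, and _compute_backward_gradients later leaves the
        # per-node weight gradient in cache.weight_gradient.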
278 | self.cache = Storage() 279 | 280 | def set_label(self, label): 281 | self.label = label 282 | 283 | def get_number_of_children(self): 284 | return len(self.children) 285 | 286 | def get_label(self): 287 | return self.label 288 | 289 | def get_children(self): 290 | return self.children 291 | 292 | def get_feature_vector(self): 293 | return self.feature_vector -------------------------------------------------------------------------------- /dnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This class implements a neural net with forward and backpropagation. 4 | # No specific loss function is used. Rather, the backpropagation can 5 | # backpropagate any derivative with respect to a loss function, 6 | # and learn accordingly. 7 | 8 | """ 9 | Copyright (c) 2015, Camiolog Inc. 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without modification, 13 | are permitted provided that the following conditions are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright notice, 16 | this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright notice, 19 | this list of conditions and the following disclaimer in the documentation 20 | and/or other materials provided with the distribution. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS 26 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 32 | THE POSSIBILITY OF SUCH DAMAGE. 33 | """ 34 | 35 | # This code has been developed by Luca de Alfaro for Camiolog, Inc., 36 | # and is here released under BSD license. 37 | # The code is derived from http://arctrix.com/nas/python/bpnn.py, 38 | # developed by Neil Schemenauer and placed in the 39 | # public domain. 40 | 41 | 42 | from json_plus import Serializable 43 | import numpy as np 44 | import unittest 45 | 46 | # Type to be used for floats. 47 | FLOAT_TYPE = 'double' 48 | 49 | # Used for weight initialization. 50 | NEURON_OVERLAP = 2.0 51 | 52 | # These nets are between 0..1, hence the choice of sigmoid function. 53 | def sigmoid(x): 54 | return 1.0 / (1.0 + np.exp(-x)) 55 | 56 | # This is the derivative of a sigmoid as a function of the inputs. 57 | def dsigmoid_in(x): 58 | e = np.exp(-x) 59 | return e / ((1.0 + e) ** 2) 60 | 61 | # This is the derivative of a sigmoid as a function of the output. 62 | def dsigmoid_out(y): 63 | return np.multiply(y, (1.0 - y)) 64 | 65 | 66 | class DNN(Serializable): 67 | """This class implements a neural net with inputs between 0..1, 68 | and outputs between 0..1. The net can have a specified number of 69 | neurons in the hidden layer.""" 70 | 71 | def __init__(self, debug=False): 72 | """Do not call; no initialization is done. 
Use the initialize method below.""" 73 | self.debug = debug 74 | 75 | 76 | def initialize(self, nnl, seed=None, weight_range=1.0): 77 | """Produces a new net. 78 | nnl is a list, consisting of the number of values in each layer. 79 | The first element of nnl is the number of inputs, and the last element is 80 | the number of outputs. 81 | weight_range controls the scale of the initial weights; the default of 1.0 82 | is recommended. 83 | The weights are drawn from a uniform distribution and then normalized 84 | following Nguyen-Widrow (see below). 85 | seed is a seed for the random number generator. 86 | """ 87 | # Sanity check. 88 | for n in nnl: 89 | assert n > 0 90 | self.nnl = nnl 91 | self.num_layers = len(nnl) - 1 # Number of layers. 92 | # Private random number generator. 93 | self.random_generator = np.random.RandomState(seed=seed) 94 | # w is a list of numpy arrays. 95 | # Each numpy array contains the matrix of weights from that layer to the next. 96 | # The element w[l][i,j] indicates the weight from element i of layer l 97 | # to element j of layer l + 1. 98 | self.w = [] 99 | # c, old_dw are exactly like w, but store the weight momentums. 100 | # We set them to None initially, as we use them only if needed 101 | # according to the update method. 102 | self.c, self.old_dw = None, None 103 | # These are for AdaDelta. 104 | self.tot_sq_delta, self.tot_sq_gradient = None, None 105 | self.tot_delta_weight, self.tot_gradient_weight = None, None 106 | for n in range(self.num_layers): 107 | # Initializes the weights. 108 | ww = np.matrix(self.random_generator.uniform( 109 | -1.0, +1.0, size=(nnl[n] + 1, nnl[n + 1]))) 110 | # Normalizes according to Nguyen-Widrow; see 111 | # http://web.stanford.edu/class/ee373b/nninitialization.pdf 112 | # Computes the modulus of the weights for each neuron. 113 | wwmod = np.sqrt(np.sum(np.multiply(ww, ww), axis=0)) 114 | # Computes the ideal interval width, actually the reciprocal of the width. 115 | int_width = NEURON_OVERLAP * weight_range * np.power(nnl[n + 1], 1.0 / nnl[n]) 116 | ww_norm = int_width * (ww / wwmod) 117 | self.w.append(ww_norm) 118 | # Creates the matrix b of activations. Again, we store a list, in which b[n] is a 119 | # vector consisting of nnl[n] elements, containing the output activations of layer n. 120 | # Layer 0 consists of the inputs. 121 | # Note that the last element of self.b will always be set to 1, constituting the bias 122 | # for the activation potential of the net. 123 | self.b = [] 124 | for n in range(self.num_layers + 1): 125 | self.b.append(np.matrix(np.ones(nnl[n] + 1, dtype=FLOAT_TYPE))) 126 | x, y = self.b[0].shape 127 | self.input_shape = (x, y - 1) 128 | 129 | 130 | def forward(self, bi): 131 | """Given a vector bi of nnl[0] values, computes the forward-propagation of the network, 132 | returning a vector bo consisting of nnl[-1] values, each between 0 and 1. 133 | This function also sets internally all the activations a and outputs b.""" 134 | bii = np.matrix(bi) 135 | assert bii.shape == self.input_shape, "expected shape: %r actual shape: %r" % (self.input_shape, bii.shape) 136 | self.b[0][0, 0:self.nnl[0]] = bii 137 | # Propagates from layer n to layer n + 1. 138 | for n in range(self.num_layers): 139 | a = self.b[n] * self.w[n] 140 | self.b[n + 1][0, 0:self.nnl[n + 1]] = sigmoid(a) 141 | # The copy statement is necessary, since otherwise the caller could modify the b's 142 | # from the output, and potentially sabotage backpropagation.
143 | return self.b[self.num_layers][0, 0:self.nnl[self.num_layers]].copy() 144 | 145 | 146 | def backward(self, delta): 147 | """Implements backpropagation without updates. 148 | The input is a vector delta, of the same size of the 149 | outputs, giving \partial loss / \partial output. The output is a vector, containing 150 | \partial loss / \partial input for every input, allowing the model to be chained. 151 | NOTE: this function must be called only after the forward step!""" 152 | return self._backward_update(delta, None) 153 | 154 | 155 | def backward_momentum_NM(self, delta, speed=0.5, N=0.5, M=0.3): 156 | """Implements backpropagation. The input is a vector delta, of the same size of the 157 | outputs, giving \partial loss / \partial output. The output is a vector, containing 158 | \partial loss / \partial input for every input, allowing the model to be chained. 159 | NOTE: this function must be called only after the forward step!""" 160 | # Defines the update function. 161 | def update_function(self, layer_idx, d, speed=speed, N=N, M=M): 162 | if self.c is None: 163 | self.c = [np.zeros((self.nnl[n] + 1, self.nnl[n + 1]), dtype=FLOAT_TYPE) 164 | for n in range(self.num_layers)] 165 | # Update. 166 | wd = np.transpose(self.b[layer_idx]) * d 167 | self.w[layer_idx] -= speed * N * wd + M * self.c[layer_idx] 168 | self.c[layer_idx] = wd * speed 169 | return self._backward_update(delta, update_function) 170 | 171 | 172 | def backward_momentum(self, delta, speed=0.1, momentum=0.8): 173 | """Implements backpropagation. The input is a vector delta, of the same size of the 174 | outputs, giving \partial loss / \partial output. The output is a vector, containing 175 | \partial loss / \partial input for every input, allowing the model to be chained. 176 | NOTE: this function must be called only after the forward step!""" 177 | def update_function(self, layer_idx, d, speed=speed, momentum=momentum): 178 | if self.old_dw is None: 179 | self.old_dw = [np.zeros((self.nnl[n] + 1, self.nnl[n + 1]), dtype=FLOAT_TYPE) 180 | for n in range(self.num_layers)] 181 | g = np.transpose(self.b[layer_idx]) * d 182 | dw = speed * g + momentum * self.old_dw[layer_idx] 183 | self.w[layer_idx] -= dw 184 | self.old_dw[layer_idx] = dw 185 | return self._backward_update(delta, update_function) 186 | 187 | 188 | def backward_adadelta(self, delta, learning_factor=1.0, epsilon = 0.1, decay=0.999): 189 | """This performs an adadelta update, see http://arxiv.org/abs/1212.5701 , 190 | where learning_factor indicates how much we should learn from this particular example.""" 191 | def update_function(self, layer_idx, d, epsilon=epsilon): 192 | if self.tot_sq_gradient is None: 193 | self.tot_sq_gradient = [0.0 for n in range(self.num_layers)] 194 | self.tot_sq_delta = [0.0 for n in range(self.num_layers)] 195 | self.tot_delta_weight = [0.0 for n in range(self.num_layers)] 196 | self.tot_gradient_weight = [0.0 for n in range(self.num_layers)] 197 | # Computes the gradient. 198 | g = np.transpose(self.b[layer_idx]) * d 199 | # Updates the gradient average. 200 | self.tot_sq_gradient[layer_idx] = self.tot_sq_gradient[layer_idx] * decay + np.sum(np.square(g)) 201 | self.tot_gradient_weight[layer_idx] = self.tot_gradient_weight[layer_idx] * decay + g.size 202 | # Computes the speed. 
203 | rms_delta = np.sqrt((self.tot_sq_delta[layer_idx] + epsilon) / 204 | (self.tot_delta_weight[layer_idx] + epsilon)) 205 | rms_gradient = np.sqrt((self.tot_sq_gradient[layer_idx] + epsilon) / 206 | (self.tot_gradient_weight[layer_idx] + epsilon)) 207 | s = rms_delta / rms_gradient 208 | # Performs the update. 209 | dx = s * g 210 | self.w[layer_idx] -= dx * learning_factor 211 | # Updates the delta average. 212 | self.tot_sq_delta[layer_idx] = self.tot_sq_delta[layer_idx] * decay + np.sum(np.square(dx)) 213 | self.tot_delta_weight[layer_idx] = self.tot_delta_weight[layer_idx] * decay + dx.size 214 | return self._backward_update(delta, update_function) 215 | 216 | 217 | def _backward_update(self, delta, update_function): 218 | """Implements the backpropagation core. The input is a vector delta, of the same size as the 219 | outputs, giving \partial loss / \partial output. The output is a vector, containing 220 | \partial loss / \partial input for every input, allowing the model to be chained. 221 | Weights are updated only if update_function is not None; 222 | the function update_function is used to carry out the specific update. 223 | NOTE: this function must be called only after the forward step!""" 224 | # First, computes the derivatives wrt a[n], the activation layer. 225 | m = self.nnl[self.num_layers] # True number of outputs 226 | d = np.matrix(np.multiply(delta, 227 | np.multiply(self.b[self.num_layers][0, 0:m], 1.0 - self.b[self.num_layers][0, 0:m]))) 228 | # Then, iteratively for n going from the last layer to the first one: 229 | # - We update the weights leading from n to n + 1 230 | # - We compute d for the layer n. 231 | for n in range(self.num_layers - 1, -1, -1): 232 | m = self.nnl[n] # Number of true outputs at this level. 233 | # We do the weight update first, as it is very slightly faster. 234 | if update_function is not None: 235 | # Weight update. change is \partial loss / \partial weight. 236 | # We use the full b, as change refers also to the activation potentials. 237 | update_function(self, n, d) 238 | # Computing d for the previous layer. dd is \partial loss / \partial b 239 | # This should not include the activation potentials. 240 | dd = d * np.transpose(self.w[n][0:m, :]) 241 | if n > 0: 242 | # Not the first layer. We compute d as \partial loss / \partial a 243 | d = np.multiply(dd, dsigmoid_out(self.b[n][0, 0:m])) 244 | else: 245 | # At the input layer, we just output dd, since the inputs are equivalent 246 | # to b, and what we want is \partial loss / \partial input 247 | d = dd 248 | return d 249 | 250 | 251 | class TestNet(unittest.TestCase): 252 | 253 | def test_backward(self): 254 | myrandom = np.random.RandomState(seed=0) 255 | net = DNN(debug=True) 256 | net.initialize([4, 2, 5, 3], 0) 257 | vi = myrandom.uniform(0.0, 1.0, 4) 258 | vo = net.forward(vi) 259 | # print "Output:", vo 260 | delta = myrandom.uniform(-1.0, 1.0, 3) 261 | d = net.backward_momentum_NM(delta) 262 | # print d 263 | 264 | def test_derivative(self): 265 | myrandom = np.random.RandomState(seed=0) 266 | net = DNN(debug=True) 267 | nnl = [3, 4, 2, 1] 268 | net.initialize(nnl, 0) 269 | # Backpropagates a dloss / dy of 1. 270 | bi = np.matrix(myrandom.uniform(0.0, 1.0, size=nnl[0])) 271 | y0 = net.forward(bi) 272 | # print "y0:", y0 273 | dd = net.backward(1.0) 274 | # Initializes the true derivatives to 0, as a placeholder. 275 | # Computes the true derivatives.
276 | epsilon = 0.001 277 | idx = 0 278 | bi[0, idx] += epsilon 279 | y1 = net.forward(bi) 280 | # print "y1:", y1 281 | # print "diff:", y1 - y0 282 | true_deriv = (y1 - y0) / epsilon 283 | # print "true deriv: ", true_deriv # [0, 0] 284 | # print "computed: ", dd[0, idx] 285 | self.assertAlmostEqual(true_deriv[0, 0], dd[0, idx], 4) 286 | 287 | def test_update_NM(self): 288 | myrandom = np.random.RandomState(seed=0) 289 | net = DNN(debug=False) 290 | nnl = [4, 2, 3, 1] 291 | net.initialize(nnl, 0) 292 | # Backpropagates a dloss / dy of 1. 293 | bi = np.matrix(myrandom.uniform(0.0, 1.0, size=nnl[0])) 294 | y = [] 295 | for i in range(10): 296 | y.append(net.forward(bi)) 297 | net.backward_momentum_NM(1.0) 298 | if i > 0: 299 | self.assertLess(y[i], y[i - 1]) 300 | # print "These must decrease (NM):" 301 | # print y 302 | 303 | def test_update_momentum(self): 304 | myrandom = np.random.RandomState(seed=0) 305 | net = DNN(debug=False) 306 | nnl = [4, 2, 3, 1] 307 | net.initialize(nnl, 0) 308 | # Backpropagates a dloss / dy of 1. 309 | bi = np.matrix(myrandom.uniform(0.0, 1.0, size=nnl[0])) 310 | y = [] 311 | for i in range(10): 312 | y.append(net.forward(bi)) 313 | net.backward_momentum(1.0) 314 | if i > 0: 315 | self.assertLess(y[i], y[i - 1]) 316 | # print "These must decrease (momentum):" 317 | # print y 318 | 319 | def test_update_adadelta(self): 320 | myrandom = np.random.RandomState(seed=0) 321 | net = DNN(debug=False) 322 | nnl = [4, 2, 3, 1] 323 | net.initialize(nnl, 0) 324 | # Backpropagates a dloss / dy of 1. 325 | bi = np.matrix(myrandom.uniform(0.0, 1.0, size=nnl[0])) 326 | y = [] 327 | for i in range(10): 328 | y.append(net.forward(bi)) 329 | net.backward_adadelta(1.0) 330 | if i > 0: 331 | self.assertLess(y[i], y[i - 1]) 332 | # print "These must decrease (adadelta):" 333 | # print y 334 | 335 | class TestInit(unittest.TestCase): 336 | def test_start(self): 337 | net = DNN(debug=True) 338 | nnl = [4, 40, 1] 339 | net.initialize(nnl, 1) 340 | for i in range(5): 341 | fv = np.random.uniform(size=4) 342 | y = net.forward(fv) 343 | # print fv, y 344 | 345 | class TestLearn(unittest.TestCase): 346 | 347 | def test_xor_momentum(self): 348 | import random 349 | N = 4000 350 | pats = [ 351 | (np.array([0, 0]), 0), 352 | (np.array([0, 1]), 1), 353 | (np.array([1, 0]), 1), 354 | (np.array([1, 1]), 0), 355 | ] 356 | for k in range(20): 357 | net = DNN() 358 | net.initialize([2, 16, 1]) 359 | e = np.zeros((N)) 360 | for i in range(N): 361 | x, tgt = random.choice(pats) 362 | y = net.forward(x) 363 | dy = 2.0 * (y - tgt) 364 | e[i] = np.sum((y - tgt) ** 2) 365 | net.backward_momentum(dy) 366 | # print i, ":", e[i] 367 | avg_e = np.average(e[N/2:]) 368 | print "MMT Avg error:", avg_e 369 | self.assertLess(avg_e, 0.01) 370 | 371 | def test_xor_adadelta(self): 372 | import random 373 | N = 4000 374 | pats = [ 375 | (np.array([0, 0]), 0), 376 | (np.array([0, 1]), 1), 377 | (np.array([1, 0]), 1), 378 | (np.array([1, 1]), 0), 379 | ] 380 | for k in range(20): 381 | net = DNN() 382 | net.initialize([2, 16, 1]) 383 | e = np.zeros((N)) 384 | for i in range(N): 385 | x, tgt = random.choice(pats) 386 | y = net.forward(x) 387 | dy = 2.0 * (y - tgt) 388 | e[i] = np.sum((y - tgt) ** 2) 389 | net.backward_adadelta(dy) 390 | # print i, ":", e[i] 391 | avg_e = np.average(e[N/2:]) 392 | print "ADA Avg error:", avg_e 393 | self.assertLess(avg_e, 0.01) 394 | 395 | if __name__ == '__main__': 396 | unittest.main() 397 | 398 | -------------------------------------------------------------------------------- 
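The DNN above is driven by calling forward to obtain the outputs, computing the derivative of a loss with respect to those outputs, and passing that derivative to one of the backward_* methods, exactly as the unit tests do. A minimal usage sketch follows; the layer sizes, the target value, and the squared-error loss are illustrative assumptions, not part of dnn.py.

    # Minimal usage sketch (illustrative values): one input/target pair,
    # squared-error loss, ADADELTA updates.
    import numpy as np
    from dnn import DNN

    net = DNN()
    net.initialize([4, 8, 1], seed=0)     # 4 inputs, 8 hidden neurons, 1 output
    x = np.random.uniform(0.0, 1.0, 4)    # inputs are expected to lie in [0, 1]
    target = 0.25                         # illustrative target, also in [0, 1]
    for _ in range(200):
        y = net.forward(x)                # output matrix of shape (1, 1), values in (0, 1)
        dloss_dy = 2.0 * (y - target)     # derivative of (y - target)**2 w.r.t. y
        net.backward_adadelta(dloss_dy)   # backpropagates and updates the weights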
/lstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is based on 3 | https://gist.github.com/karpathy/587454dc0146a6ae21fc 4 | by Andrej Karpathy. 5 | Luca de Alfaro modified the code to make the 6 | LSTM into objects that can be serialized, and added the learning 7 | methods based on gradient descent with momentum, and adadelta. 8 | Vassilis Polychronopoulos and Rakshit Agrawal then added methods 9 | that facilitate the use of the code for multi-level LSTMs. 10 | """ 11 | 12 | import numpy as np 13 | from json_plus import Serializable 14 | import unittest 15 | 16 | class LSTM(Serializable): 17 | """Class implementing an LSTM.""" 18 | 19 | def __init__(self): 20 | """We need an empty initializer, to be compatible with the Serializable 21 | interface.""" 22 | pass 23 | 24 | def initialize(self, input_size, hidden_size, fancy_forget_bias_init=3): 25 | """ 26 | Initialize parameters of the LSTM (both weights and biases in one matrix). 27 | One might want to have a positive fancy_forget_bias_init number (e.g. maybe even up to 5, in some papers). 28 | In the matrix there are inputs for: 29 | - 1 (bias) 30 | - Input 31 | - Hidden 32 | In the other dimension, there are four outputs, for: 33 | - Input to cell 34 | - Forget 35 | - Output 36 | - Gate 37 | """ 38 | # +1 for the biases, which will be the first row of self.WLSTM 39 | self.input_size = input_size 40 | self.hidden_size = hidden_size 41 | self.WLSTM = np.random.randn(input_size + hidden_size + 1, 4 * hidden_size) / np.sqrt(input_size + hidden_size) 42 | # self.WLSTM[0, :] = 0 # initialize biases to zero 43 | if fancy_forget_bias_init != 0: 44 | # forget gates get a slightly positive bias initially, to encourage them to stay on (i.e., to remember) 45 | # remember that due to the Xavier initialization above, the raw output activations from gates before 46 | # the nonlinearity are zero mean and on the order of standard deviation ~1 47 | self.WLSTM[0, hidden_size:2 * hidden_size] = fancy_forget_bias_init 48 | 49 | # Init parameters for the momentum update method. 50 | self.momentum_dW = None # Delta weights 51 | 52 | # Init parameters for the ADADELTA update method. 53 | self.tot_gradient_weight, self.tot_delta_weight = 0, 0 54 | self.tot_sq_gradient, self.tot_sq_delta = 0, 0 55 | 56 | 57 | def clone(self): 58 | replica = LSTM() 59 | replica.input_size = self.input_size 60 | replica.hidden_size = self.hidden_size 61 | replica.WLSTM = np.copy(self.WLSTM) 62 | replica.momentum_dW = np.copy(self.momentum_dW) 63 | replica.tot_gradient_weight = self.tot_gradient_weight 64 | replica.tot_delta_weight = self.tot_delta_weight 65 | replica.tot_sq_gradient = self.tot_sq_gradient 66 | replica.tot_sq_delta = self.tot_sq_delta 67 | return replica 68 | 69 | 70 | def _forward(self, X, c0=None, h0=None): 71 | """ 72 | Forward-propagates the input X. 73 | The output consists of the vector of all outputs for all time steps, 74 | the final value of the memories, the output of the LSTM at the final step, 75 | and a cache that contains the whole useful state of the LSTM, so that it can be 76 | used later for back-propagation. 77 | The cache value is also stored in the LSTM.
78 | X should be of shape (n,b,input_size), where n = length of sequence, b = batch size 79 | """ 80 | n, b, isz = X.shape 81 | d = self.hidden_size 82 | if c0 is None: c0 = np.zeros((b, d)) 83 | if h0 is None: h0 = np.zeros((b, d)) 84 | assert(isz == self.input_size) 85 | 86 | # Perform the LSTM forward pass with X as the input 87 | m = self.WLSTM.shape[0] # size of x plus h plus bias 88 | Hin = np.zeros((n, b, m)) # input [1, xt, ht-1] to each tick of the LSTM 89 | Hout = np.zeros((n, b, d)) # hidden representation of the LSTM (gated cell content) 90 | IFOG = np.zeros((n, b, d * 4)) # input, forget, output, gate (IFOG) 91 | IFOGf = np.zeros((n, b, d * 4)) # after nonlinearity 92 | C = np.zeros((n, b, d)) # cell content 93 | Ct = np.zeros((n, b, d)) # tanh of cell content 94 | for t in xrange(n): 95 | # concat [x,h] as input to the LSTM 96 | prevh = Hout[t - 1] if t > 0 else h0 # previous cell output. 97 | # assembles cell input. 98 | Hin[t, :, 0] = 1 # bias 99 | Hin[t, :, 1:self.input_size + 1] = X[t] 100 | Hin[t, :, self.input_size + 1:] = prevh 101 | # print "Hin[%d]:\n" % t, Hin[t, :, :], '\n-------------------------------\n' 102 | # compute all gate activations. dots: (most work is this line) 103 | IFOG[t] = Hin[t].dot(self.WLSTM) 104 | # non-linearities 105 | IFOGf[t, :, :3 * d] = 1.0 / (1.0 + np.exp(-IFOG[t, :, :3 * d])) # sigmoids; these are the gates 106 | IFOGf[t, :, 3 * d:] = np.tanh(IFOG[t, :, 3 * d:]) # tanh 107 | # compute the cell activation 108 | prevc = C[t - 1] if t > 0 else c0 109 | # input * gate + forget * previous_cell; (2) 110 | C[t] = IFOGf[t, :, :d] * IFOGf[t, :, 3 * d:] + IFOGf[t, :, d:2 * d] * prevc 111 | Ct[t] = np.tanh(C[t]) # nonlinearity 112 | Hout[t] = IFOGf[t, :, 2 * d:3 * d] * Ct[t] # output * cell (1) 113 | 114 | cache = {} 115 | cache['Hout'] = Hout 116 | cache['IFOGf'] = IFOGf 117 | cache['IFOG'] = IFOG 118 | cache['C'] = C 119 | cache['Ct'] = Ct 120 | cache['Hin'] = Hin 121 | cache['c0'] = c0 122 | cache['h0'] = h0 123 | cache['n'] = n 124 | cache['b'] = b 125 | 126 | # We remember the cached values, so we don't need to plug them back in each time. 127 | self.cache = cache 128 | 129 | # return C[t], as well so we can continue LSTM with prev state init if needed 130 | return Hout, C[t], Hout[t], cache 131 | 132 | 133 | def clean_before_serialization(self): 134 | self.cache.clear() 135 | 136 | 137 | def _backward(self, dHout_in, cache=None, dcn=None, dhn=None): 138 | """Backward propagation through the LSTM. 
dHout_in must have the same shape as Hout.""" 139 | if cache is None: 140 | cache = self.cache 141 | Hout = cache['Hout'] 142 | IFOGf = cache['IFOGf'] 143 | IFOG = cache['IFOG'] 144 | C = cache['C'] 145 | Ct = cache['Ct'] 146 | Hin = cache['Hin'] 147 | c0 = cache['c0'] 148 | h0 = cache['h0'] 149 | n = cache['n'] 150 | b = cache['b'] 151 | d = self.hidden_size 152 | 153 | # backprop the LSTM 154 | dIFOG = np.zeros(IFOG.shape) 155 | dIFOGf = np.zeros(IFOGf.shape) 156 | dWLSTM = np.zeros(self.WLSTM.shape) 157 | dHin = np.zeros(Hin.shape) 158 | dC = np.zeros(C.shape) 159 | dX = np.zeros((n, b, self.input_size)) 160 | dh0 = np.zeros((b, d)) 161 | dc0 = np.zeros((b, d)) 162 | dHout = dHout_in.copy() # make a copy so we don't have any funny side effects 163 | if dcn is not None: dC[n - 1] += dcn.copy() # carry over gradients from later 164 | if dhn is not None: dHout[n - 1] += dhn.copy() 165 | for t in reversed(xrange(n)): 166 | 167 | tanhCt = Ct[t] 168 | # backpropagation through (1) for output 169 | dIFOGf[t, :, 2 * d:3 * d] = tanhCt * dHout[t] 170 | # backprop tanh non-linearity first then continue backprop. 171 | # this is the backprop of output on cells, (1) cont. 172 | dC[t] += (1 - tanhCt ** 2) * (IFOGf[t, :, 2 * d:3 * d] * dHout[t]) 173 | 174 | if t > 0: 175 | dIFOGf[t, :, d:2 * d] = C[t - 1] * dC[t] # delta forget (through (2)) 176 | dC[t - 1] += IFOGf[t, :, d:2 * d] * dC[t] 177 | else: 178 | dIFOGf[t, :, d:2 * d] = c0 * dC[t] 179 | dc0 = IFOGf[t, :, d:2 * d] * dC[t] 180 | # this completes backpropagation to the cell memory. 181 | 182 | # this is the gate * input portion, effects on gate and input. 183 | dIFOGf[t, :, :d] = IFOGf[t, :, 3 * d:] * dC[t] # backprop of input through gate, part of (2) 184 | dIFOGf[t, :, 3 * d:] = IFOGf[t, :, :d] * dC[t] # backprop of gate through input, part of (2) 185 | 186 | # backprop activation functions 187 | dIFOG[t, :, 3 * d:] = (1 - IFOGf[t, :, 3 * d:] ** 2) * dIFOGf[t, :, 3 * d:] 188 | y = IFOGf[t, :, :3 * d] 189 | dIFOG[t, :, :3 * d] = (y * (1.0 - y)) * dIFOGf[t, :, :3 * d] 190 | 191 | # backprop matrix multiply 192 | dWLSTM += np.dot(Hin[t].transpose(), dIFOG[t]) 193 | dHin[t] = dIFOG[t].dot(self.WLSTM.transpose()) 194 | 195 | # backprop the identity transforms into Hin 196 | dX[t] = dHin[t, :, 1:self.input_size + 1] 197 | if t > 0: 198 | dHout[t - 1, :] += dHin[t, :, self.input_size + 1:] 199 | else: 200 | dh0 += dHin[t, :, self.input_size + 1:] 201 | 202 | return dX, dWLSTM, dc0, dh0 203 | 204 | 205 | def forward(self, X): 206 | """Forward function. Can be called to predict outputs, and as preparation to backpropagation. 207 | X should be of shape (n, b, input_size), where n = length of sequence, b = batch size. 208 | If b = 1, one can also give dimensions (n, input_size) to X. 209 | """ 210 | XX = X if X.ndim == 3 else X.reshape((X.shape[0], 1, X.shape[1])) 211 | _, _, o, _ = self._forward(XX) 212 | return o if X.ndim == 3 else o.flatten() 213 | 214 | 215 | def _adapt_input_derivative(self, d): 216 | """In an LSTM, we often have feedback only on the last result, only once 217 | all the sequence has been read. This function takes d as given, and 218 | produces the internal representation that is needed. The rule is as follows: 219 | - If d has dimension 3, then it is assumed that it comes already in the 220 | correct format. 221 | - If d has dimension 2, then it is assumed that there are batches, and 222 | that the data includes only the latest temporal step. The previous 223 | temporal steps are filled in with zeros, as appropriate. 
224 | - If d has dimension 1, then it is assumed that no batches are present, 225 | and that d refers only to the last temporal step. 226 | The other temporal steps are filled with zeros as required, and an 227 | appropriate array is returned. 228 | """ 229 | if d.ndim == 3: 230 | return d 231 | elif d.ndim == 1: 232 | n = self.cache['n'] # N. of temporal steps 233 | assert(self.cache['b'] == 1) 234 | assert(d.size == self.hidden_size) 235 | dd = np.vstack((np.zeros((n - 1, self.hidden_size)), d)) 236 | return dd.reshape(n, 1, self.hidden_size) 237 | elif d.ndim == 2: 238 | n = self.cache['n'] # N. of temporal steps 239 | batch_size, hidden_size = d.shape 240 | assert(batch_size == self.cache['b']) 241 | assert(hidden_size == self.hidden_size) 242 | other_times = np.zeros((n - 1, batch_size, hidden_size)) 243 | return np.vstack((other_times, d.reshape(1, d.shape[0], d.shape[1]))) 244 | 245 | 246 | def backward(self, d): 247 | """Backward function without learning. Input is de loss / de output.""" 248 | dd = self._adapt_input_derivative(d) 249 | _, _, _, dh0 = self._backward(dd) 250 | return dh0 251 | 252 | """ no update, and cache can be passed as parameter""" 253 | def backward_return_vector_no_update(self, d, cache): 254 | """Backward function without learning. Input is de loss / de output.""" 255 | self.cache = cache 256 | dd = self._adapt_input_derivative(d) 257 | dX, g, dc0, dh0 = self._backward(dd, cache = cache) 258 | return dX, g, dc0, dh0 259 | 260 | 261 | def backward_momentum(self, d, speed=0.1, momentum=0.8): 262 | """Implements backpropagation with momentum.""" 263 | dd = self._adapt_input_derivative(d) 264 | _, g, _, dh0 = self._backward(dd) 265 | if self.momentum_dW is None: 266 | self.momentum_dW = np.zeros(self.WLSTM.shape) 267 | dW = - speed * g + momentum * self.momentum_dW 268 | self.momentum_dW = dW 269 | self.WLSTM += dW 270 | return dh0 271 | 272 | 273 | def backward_momentum_vector(self, d, speed=0.0001, momentum=0.0008): 274 | """Implements backpropagation with momentum.""" 275 | dd = self._adapt_input_derivative(d) 276 | dX, g, dc0, dh0 = self._backward(dd) 277 | if self.momentum_dW is None: 278 | self.momentum_dW = np.zeros(self.WLSTM.shape) 279 | dW = - speed * g + momentum * self.momentum_dW 280 | self.momentum_dW = dW 281 | self.WLSTM += dW 282 | return dX, g, dc0, dh0 283 | 284 | 285 | def backward_adadelta(self, d, learning_factor=1.0, epsilon=0.001, decay=0.95): 286 | """Implements backpropagation with the ADADELTA method, see 287 | http://arxiv.org/abs/1212.5701 288 | learning_factor indicates how much we should learn from this particular example.""" 289 | dd = self._adapt_input_derivative(d) 290 | _, g, _, dh0 = self._backward(dd) 291 | # Updates the gradient average. 292 | self.tot_sq_gradient = self.tot_sq_gradient * decay + np.sum(np.square(g)) 293 | self.tot_gradient_weight = self.tot_gradient_weight * decay + 1.0 294 | # Computes the speed. 295 | rms_delta = np.sqrt((self.tot_sq_delta + epsilon) / (self.tot_delta_weight + epsilon)) 296 | rms_gradient = np.sqrt((self.tot_sq_gradient + epsilon) / (self.tot_gradient_weight + epsilon)) 297 | s = rms_delta / rms_gradient 298 | # Computes the delta. 299 | delta = s * g 300 | # Updates the delta average. 301 | self.tot_sq_delta = self.tot_sq_delta * decay + np.sum(np.square(delta)) 302 | self.tot_delta_weight = self.tot_delta_weight * decay + 1.0 303 | # Finally, updates the weights. 
304 | self.WLSTM -= delta * learning_factor 305 | return dh0 306 | 307 | def backward_adadelta_vector(self, d, learning_factor = 1.0, epsilon = 0.0001, decay = 0.95): 308 | """Implements backpropagation with the ADADELTA method, see 309 | http://arxiv.org/abs/1212.5701 310 | learning_factor indicates how much we should learn from this particular example.""" 311 | dd = self._adapt_input_derivative(d) 312 | dX, g, dc0, dh0 = self._backward(dd) 313 | # Updates the gradient average. 314 | self.tot_sq_gradient = self.tot_sq_gradient * decay + np.sum(np.square(g)) 315 | self.tot_gradient_weight = self.tot_gradient_weight * decay + 1.0 316 | # Computes the speed. 317 | rms_delta = np.sqrt((self.tot_sq_delta + epsilon) / (self.tot_delta_weight + epsilon)) 318 | rms_gradient = np.sqrt((self.tot_sq_gradient + epsilon) / (self.tot_gradient_weight + epsilon)) 319 | s = rms_delta / rms_gradient 320 | # Computes the delta. 321 | delta = s * g 322 | # Updates the delta average. 323 | self.tot_sq_delta = self.tot_sq_delta * decay + np.sum(np.square(delta)) 324 | self.tot_delta_weight = self.tot_delta_weight * decay + 1.0 325 | # Finally, updates the weights. 326 | self.WLSTM -= delta * learning_factor 327 | return dX, g, dc0, dh0 328 | 329 | def backward_adadelta_vector_no_update(self, d, learning_factor=1.0, epsilon=0.0001, decay=0.95): 330 | """Implements backpropagation with the ADADELTA method, see 331 | http://arxiv.org/abs/1212.5701 332 | learning_factor indicates how much we should learn from this particular example.""" 333 | dd = self._adapt_input_derivative(d) 334 | dX, g, dc0, dh0 = self._backward(dd) 335 | # Updates the gradient average. 336 | self.tot_sq_gradient = self.tot_sq_gradient * decay + np.sum(np.square(g)) 337 | self.tot_gradient_weight = self.tot_gradient_weight * decay + 1.0 338 | # Computes the speed. 339 | rms_delta = np.sqrt((self.tot_sq_delta + epsilon) / (self.tot_delta_weight + epsilon)) 340 | rms_gradient = np.sqrt((self.tot_sq_gradient + epsilon) / (self.tot_gradient_weight + epsilon)) 341 | s = rms_delta / rms_gradient 342 | # Computes the delta. 343 | delta = s * g 344 | # Updates the delta average. 345 | self.tot_sq_delta = self.tot_sq_delta * decay + np.sum(np.square(delta)) 346 | self.tot_delta_weight = self.tot_delta_weight * decay + 1.0 347 | # Finally, updates the weights. 348 | #self.WLSTM -= delta * learning_factor 349 | return dX, -delta * learning_factor 350 | 351 | 352 | # ------------------- 353 | # TEST CASES 354 | # ------------------- 355 | 356 | class BasicTests(unittest.TestCase): 357 | 358 | @unittest.skip("later") 359 | def test_checkSequentialMatchesBatch(self): 360 | """ check LSTM I/O forward/backward interactions """ 361 | 362 | n, b, d = (5, 3, 4) # sequence length, batch size, hidden size 363 | input_size = 10 364 | WLSTM = LSTM() 365 | WLSTM.initialize(input_size, d) # input size, hidden size 366 | X = np.random.randn(n, b, input_size) 367 | h0 = np.random.randn(b, d) 368 | c0 = np.random.randn(b, d) 369 | 370 | # sequential forward 371 | cprev = c0 372 | hprev = h0 373 | caches = [{} for t in xrange(n)] 374 | Hcat = np.zeros((n, b, d)) 375 | for t in xrange(n): 376 | xt = X[t:t + 1] 377 | _, cprev, hprev, cache = WLSTM._forward(xt, cprev, hprev) 378 | caches[t] = cache 379 | Hcat[t] = hprev 380 | 381 | # sanity check: perform batch forward to check that we get the same thing 382 | H, _, _, batch_cache = WLSTM._forward(X, c0, h0) 383 | assert np.allclose(H, Hcat), 'Sequential and Batch forward don''t match!' 
384 | 385 | # eval loss 386 | wrand = np.random.randn(*Hcat.shape) 387 | loss = np.sum(Hcat * wrand) 388 | dH = wrand 389 | 390 | # get the batched version gradients 391 | BdX, BdWLSTM, Bdc0, Bdh0 = WLSTM._backward(dH, batch_cache) 392 | 393 | # now perform sequential backward 394 | dX = np.zeros_like(X) 395 | dWLSTM = np.zeros_like(WLSTM.WLSTM) 396 | dc0 = np.zeros_like(c0) 397 | dh0 = np.zeros_like(h0) 398 | dcnext = None 399 | dhnext = None 400 | for t in reversed(xrange(n)): 401 | dht = dH[t].reshape(1, b, d) 402 | dx, dWLSTMt, dcprev, dhprev = WLSTM._backward(dht, caches[t], dcnext, dhnext) 403 | dhnext = dhprev 404 | dcnext = dcprev 405 | 406 | dWLSTM += dWLSTMt # accumulate LSTM gradient 407 | dX[t] = dx[0] 408 | if t == 0: 409 | dc0 = dcprev 410 | dh0 = dhprev 411 | 412 | # and make sure the gradients match 413 | # print 'Making sure batched version agrees with sequential version: (should all be True)' 414 | self.assertTrue(np.allclose(BdX, dX)) 415 | self.assertTrue(np.allclose(BdWLSTM, dWLSTM)) 416 | self.assertTrue(np.allclose(Bdc0, dc0)) 417 | self.assertTrue(np.allclose(Bdh0, dh0)) 418 | 419 | 420 | @unittest.skip("later") 421 | def test_checkBatchGradient(self): 422 | """ check that the batch gradient is correct """ 423 | 424 | # lets gradient check this beast 425 | n, b, d = (5, 3, 4) # sequence length, batch size, hidden size 426 | input_size = 10 427 | WLSTM = LSTM() 428 | WLSTM.initialize(input_size, d) # input size, hidden size 429 | X = np.random.randn(n, b, input_size) 430 | h0 = np.random.randn(b, d) 431 | c0 = np.random.randn(b, d) 432 | 433 | # batch forward backward 434 | H, Ct, Ht, cache = WLSTM._forward(X, c0, h0) 435 | wrand = np.random.randn(*H.shape) 436 | loss = np.sum(H * wrand) # weighted sum is a nice hash to use I think 437 | dH = wrand 438 | dX, dWLSTM, dc0, dh0 = WLSTM._backward(dH, cache) 439 | 440 | def fwd(): 441 | h, _, _, _ = WLSTM._forward(X, c0, h0) 442 | return np.sum(h * wrand) 443 | 444 | # now gradient check all 445 | delta = 1e-5 446 | rel_error_thr_warning = 1e-2 447 | rel_error_thr_error = 1 448 | tocheck = [X, c0, h0] 449 | grads_analytic = [dX, dc0, dh0] 450 | names = ['X', 'c0', 'h0'] 451 | for j in xrange(len(tocheck)): 452 | mat = tocheck[j] 453 | dmat = grads_analytic[j] 454 | name = names[j] 455 | # gradcheck 456 | for i in xrange(mat.size): 457 | old_val = mat.flat[i] 458 | mat.flat[i] = old_val + delta 459 | loss0 = fwd() 460 | mat.flat[i] = old_val - delta 461 | loss1 = fwd() 462 | mat.flat[i] = old_val 463 | 464 | grad_analytic = dmat.flat[i] 465 | grad_numerical = (loss0 - loss1) / (2 * delta) 466 | 467 | if grad_numerical == 0 and grad_analytic == 0: 468 | rel_error = 0 # both are zero, OK. 469 | status = 'OK' 470 | elif abs(grad_numerical) < 1e-7 and abs(grad_analytic) < 1e-7: 471 | rel_error = 0 # not enough precision to check this 472 | status = 'VAL SMALL WARNING' 473 | else: 474 | rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic) 475 | status = 'OK' 476 | if rel_error > rel_error_thr_warning: status = 'WARNING' 477 | if rel_error > rel_error_thr_error: status = '!!!!! 
NOTOK' 478 | self.assertEqual(status, 'OK') 479 | 480 | # print stats 481 | # print '%s checking param %s index %s (val = %+8f), analytic = %+8f, numerical = %+8f, relative error = %+8f' \ 482 | # % (status, name, `np.unravel_index(i, mat.shape)`, old_val, grad_analytic, grad_numerical, rel_error) 483 | 484 | 485 | class TestLearning(unittest.TestCase): 486 | 487 | unittest.skip("later") 488 | def make_sequence3d(self, length, p): 489 | """Makes a random sequence of 0 and 1 as a 3d input""" 490 | return (np.random.random((length,1,1))= num_report: 573 | print "After %d iterations, avg tgt = %f, avg err = %f" % (i + 1, tot_tgt / num_err, tot_err / num_err) 574 | num_err = 0 575 | tot_err = 0.0 576 | tot_tgt = 0.0 577 | if num_err > 0: 578 | print "After %d iterations, avg tgt = %f, avg err = %f" % (i + 1, tot_tgt / num_err, tot_err / num_err) 579 | 580 | def test_has_one(self): 581 | print "At least one 1:" 582 | self._run_total_test(6, [1, 2, 3, 4, 5], 4, 1000) 583 | 584 | def test_has_only_one(self): 585 | print "Exactly one 1:" 586 | self._run_total_test(6, [1], 4, 10000) 587 | 588 | def test_has_one_or_three(self): 589 | print "Either one, or three 1's:" 590 | self._run_total_test(6, [1, 3], 6, 10000) 591 | 592 | 593 | 594 | 595 | if __name__ == "__main__": 596 | unittest.main() 597 | --------------------------------------------------------------------------------
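The LSTM class is used in the same forward / backward style as the DNN in dnn.py: forward consumes a whole sequence and returns the output of the last time step, and the backward_* methods take the derivative of the loss with respect to that output. A minimal usage sketch follows; the sizes, the random sequence, and the squared-error loss are illustrative assumptions, not part of lstm.py.

    import numpy as np
    from lstm import LSTM

    net = LSTM()
    net.initialize(input_size=3, hidden_size=4)
    X = np.random.randn(6, 3)             # sequence of length 6, batch size 1
    h = net.forward(X)                    # output of the last time step, shape (4,)
    target = np.zeros(4)                  # illustrative target for the final output
    dloss_dh = 2.0 * (h - target)         # derivative of the squared error w.r.t. h
    net.backward_adadelta(dloss_dh)       # ADADELTA update of WLSTM; returns d loss / d h0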