├── __init__.py ├── .gitignore ├── LICENSE ├── test_bipartite_user_item_reviews.py ├── README.md ├── mlslnn.py ├── json_plus.py ├── test_MLSL.py ├── multi_level_lstm.py ├── dnn.py └── lstm.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea 3 | *~ 4 | *.bak 5 | *.iml 6 | 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | All the files in this repository are released under the BSD 2-clause license below. 2 | Please refer to the individual files for copyright and authorship information. 3 | 4 | Copyright (c) 2015, Camiolog Inc., Luca de Alfaro, Michael Shavlovsky, 5 | Vassilis Polychronopoulos, Rakshit Agrawal, Massimo Di Pierro, Andrej Karpathy. 6 | All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without modification, 9 | are permitted provided that the following conditions are met: 10 | 11 | 1. Redistributions of source code must retain the above copyright notice, 12 | this list of conditions and the following disclaimer. 13 | 14 | 2. Redistributions in binary form must reproduce the above copyright notice, 15 | this list of conditions and the following disclaimer in the documentation 16 | and/or other materials provided with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 20 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 21 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS 22 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 28 | THE POSSIBILITY OF SUCH DAMAGE. 
29 | 
--------------------------------------------------------------------------------
/test_bipartite_user_item_reviews.py:
--------------------------------------------------------------------------------
1 | class Item:
2 |     def __init__(self, id, assignment_id = None, inherent = None, max_inherent = None):
3 |         self.id = id
4 |         self.assignment_id = assignment_id
5 |         self.inherent = inherent
6 |         self.max_inherent = max_inherent
7 |         self.reviews = {}
8 | 
9 | class User:
10 |     def __init__(self, name):
11 |         self.name = name
12 |         self.users = set()
13 |         self.reviews = {}
14 | 
15 | class Review:
16 |     def __init__(self, review_id = None, grade = None, extra_informative_feature = None):
17 |         self.review_id = review_id
18 |         self.grade = grade
19 |         self.extra_informative_feature = extra_informative_feature
20 | 
21 | class Graph:
22 |     def __init__(self):
23 |         self.items = set()
24 |         self.users = set()
25 |         self.reviews = {}
26 |         self.items_with_ground_truth = []
27 |         self.user_dict = {}
28 |         self.item_dict = {}
29 | 
30 |     def add_item(self, item_id, inherent = None, max_inherent = None, assignment_id = None):
31 |         item = Item(id = item_id, inherent= inherent, max_inherent= max_inherent, assignment_id= assignment_id)
32 |         self.item_dict[item_id] = item
33 |         self.items = self.items | {item}
34 | 
35 |     def add_user(self, user_name):
36 |         user = User(user_name)
37 |         self.user_dict[user_name] = user
38 |         self.users = self.users | {user}
39 | 
40 |     def get_user(self, user_name):
41 |         return self.user_dict.get(user_name)
42 | 
43 |     def get_item(self, item_id):
44 |         return self.item_dict.get(item_id)
45 | 
46 |     def has_voted(self, user_name, item_id):
47 |         if not user_name in self.user_dict or not item_id in self.item_dict:
48 |             return False
49 |         if (self.get_user(user_name), self.get_item(item_id)) in self.reviews:
50 |             return True
51 |         else:
52 |             return False
53 | 
54 |     def get_no_of_votes(self, item_id):
55 |         if not item_id in self.item_dict:
56 |             return 0
57 |         return len(self.get_item(item_id).reviews)
58 | 
59 |     def add_review(self, user_name, item_id, review, assignment_id = None):
60 |         """
61 |         Adds a review to the graph.
62 |         It inserts the review into the generic dictionary of reviews,
63 |         but also into the item.reviews and user.reviews dictionaries.
64 |         The information is redundant, but this enhances accessibility.
65 |         """
66 |         # If the user name or item id is not in the graph, create the respective object
67 |         if not user_name in self.user_dict:
68 |             self.add_user(user_name)
69 |         if not item_id in self.item_dict:
70 |             self.add_item(item_id, assignment_id= assignment_id)
71 |         # Get the user and item objects that correspond to the user name and item id
72 |         user = self.get_user(user_name)
73 |         item = self.get_item(item_id)
74 |         # add the review to the dictionaries
75 |         item.reviews[user] = review
76 |         user.reviews[item] = review
77 |         self.reviews[(user, item)] = review
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Graph-LSTM
2 | 
3 | This repository contains several pieces of code that are useful for applying machine learning to graphs.
4 | See the [project page](https://sites.google.com/view/ml-on-structures) for the overall project, papers, and data.
5 | 
6 | Many prediction problems can be phrased as inferences over local neighborhoods of graphs. The graph represents the interaction between entities, and the neighborhood of each entity contains information that allows the inferences or predictions.
7 | This project enables the application of machine learning directly to such graph neighborhoods, allowing predictions to be learned from examples, bypassing the step of creating and tuning an inference model or summarizing the neighborhoods via a fixed set of hand-crafted features.
8 | The approach is based on a multi-level architecture built from Long Short-Term Memory neural nets (LSTMs); the LSTMs learn how to summarize the neighborhood from data.
9 | 
10 | ## How it works
11 | 
12 | The code performs predictions for one target graph node at a time.
13 | First, the graph is unfolded from the target node, yielding a tree with the target node as its root at level 0, its neighbors as level-1 children, its neighbors' neighbors as level-2 children, and so forth, up to a desired depth D.
14 | At each tree node v of level 0 <= d < D, a level-(d+1) LSTM is fed sequentially the information from the children of v at level d+1, and produces as output information for v itself.
15 | Thus, we exploit LSTMs' ability to process sequences of any length to process trees of any branching factor.
16 | The top-level LSTM produces an output vector y that summarizes the tree rooted at the target node.
17 | This output vector can then be combined with the features of the target node itself, for instance via a standard neural net, to yield the desired prediction.
18 | The architecture requires training D LSTMs, one per tree level.
19 | The LSTMs learn how to summarize the neighborhood up to radius D from data, avoiding the manual task of synthesizing a fixed set of features.
20 | By dedicating one LSTM to each level, we can tailor the learning (and the LSTM size) to the distance from the target node.
21 | 
22 | ## Code included
23 | 
24 | This repository contains various ML algorithms, which can be used independently or in combination.
25 | 
26 | ### DNN
27 | **dnn.py** provides an implementation of deep neural networks. The input consists of fixed-length feature vectors.
28 | 
29 | ### LSTM
30 | **lstm.py** provides an implementation of LSTMs. The input consists of sequences of fixed-length feature vectors.
31 | 
32 | ### Multi-Level LSTM
33 | **multi_level_lstm.py** provides an implementation of multi-level LSTMs (see the [project page](https://sites.google.com/view/ml-on-structures) for papers and information). The input consists of trees of nodes; each node has a feature vector. The trees can be obtained, among other ways, by unrolling the local neighborhood of a node in a graph.
34 | 
35 | ### MLSLNN
36 | **mlslnn.py** provides a helper class for applying multi-level LSTMs to a graph or tree. The code defined in multi_level_lstm.py summarizes the feature vectors of a tree rooted at v into an output vector y (from the top-level LSTM). The vector y summarizes the features of the children of v (and of the subtrees rooted there), but not of v itself. Thus, it is useful to combine the vector y and the feature vector f(v) of v via a top-level neural network that produces the overall output. The class MLSLNN does this.
37 | 
38 | ## Contributors
39 | 
40 | * [Luca de Alfaro](https://sites.google.com/a/ucsc.edu/luca/)
41 | * Rakshit Agrawal
42 | * Vassilis Polychronopoulos
43 | 
--------------------------------------------------------------------------------
/mlslnn.py:
--------------------------------------------------------------------------------
1 | """
2 | This file builds a wrapper for using MLSL along with
3 | a Neural Network sitting on top of it.
4 | The neural network, for each of its entries,
5 | takes in a feature vector defining inputs, along with a
6 | set of features output by an MLSL lying underneath.
7 | 
8 | 
9 | Authors:
10 | Rakshit Agrawal
11 | Luca de Alfaro
12 | Vassilis Polychronopoulos
13 | Copyright by the authors, 2016.
14 | """
15 | from json_plus import Serializable
16 | from multi_level_lstm import MLSL
17 | from dnn import DNN
18 | import numpy as np
19 | 
20 | 
21 | def softmax(w, t = 1.0):
22 |     e = np.exp(np.array(w) / t)
23 |     dist = e / np.sum(e)
24 |     return dist
25 | 
26 | def get_objective_derivative(output, target, objective):
27 |     if objective == "softmax_classification":
28 |         return output - target
29 | 
30 | class MLSLNN(Serializable):
31 |     """
32 |     This class initializes a neural network
33 |     based on the size of features per entry, along
34 |     with a provided MLSL which generates a certain number of outputs.
35 |     """
36 | 
37 |     def __init__(self):
38 |         pass
39 | 
40 |     def initialize(self, mlsl, nnl, seed=None, weight_range=1.0, outputs_from_mlsl=None, use_softmax=True):
41 |         """
42 |         Initialize an object of this class that binds a new NN on top
43 |         of an existing MLSL object.
44 |         :param mlsl: the underlying (already initialized) MLSL object
45 |         :type mlsl: MLSL
46 |         :param nnl: layer sizes of the neural net; nnl[0] is the number of additional input features
47 |         :type nnl: list
48 |         :param seed: seed for the random initialization of the NN weights
49 |         :type seed: int
50 |         :param weight_range: range used for the initial NN weights
51 |         :type weight_range: float
52 |         :param outputs_from_mlsl: how many MLSL outputs to feed to the NN (defaults to all of them)
53 |         :param use_softmax: whether to apply a softmax to the NN output
54 |         """
55 |         self.mlsl_output_size = outputs_from_mlsl if outputs_from_mlsl else mlsl.output_sizes[0]
56 | 
57 |         # Change the input size of the neural net to the assigned feature size plus the MLSL outputs
58 |         nnl[0] += self.mlsl_output_size
59 | 
60 |         self.outputs_from_mlsl = outputs_from_mlsl
61 | 
62 |         self.mlsl = mlsl
63 |         self.nnet = DNN()
64 |         self.nnet.initialize(nnl=nnl, seed=seed, weight_range=weight_range)
65 |         self.use_softmax = use_softmax
66 | 
67 |     def forward(self, input_to_mlsl, additional_input_to_nn, target):
68 |         """
69 |         This runs a forward pass through the entire model, comprising an MLSL
70 |         followed by a NN.
71 |         :param input_to_mlsl: root InstanceNode of the tree fed to the MLSL
72 |         :type input_to_mlsl: InstanceNode
73 |         :param additional_input_to_nn: feature vector fed directly to the NN
74 |         :type additional_input_to_nn: numpy array
75 |         :return: the output of the NN (after softmax, if enabled)
76 |         :rtype: numpy array
77 |         """
78 |         mlsl_output = self.mlsl.forward_propagation(input_to_mlsl, 0)
79 |         input_to_nn = np.concatenate((mlsl_output[:self.mlsl_output_size], additional_input_to_nn))
80 |         nnet_output = self.nnet.forward(input_to_nn)
81 |         if self.use_softmax:
82 |             nnet_output = softmax(nnet_output)
83 | 
84 |         return nnet_output
85 | 
86 |     def get_objective_derivative(self, output, target):
87 |         if self.use_softmax:
88 |             return output - target
89 |         else:
90 |             raise ValueError
91 | 
92 | 
93 |     def backward(self, loss_deriv, instance_node):
94 | 
95 |         # Run the derivative through the NN first, then through the MLSL
96 | 
97 |         nn_deriv = self.nnet.backward_adadelta(loss_deriv)
98 | 
99 |         deriv = nn_deriv[:self.mlsl_output_size]
100 | 
101 |         self.mlsl._compute_backward_gradients(instance_node, deriv, 0)
102 |         self.mlsl._compute_LSTM_updates(instance_node, 0)
103 |         # updating the weights of the LSTM modules and
104 |         # updating momentum_dW of the LSTM modules with the sums of dWs
105 |         # and the other variables for adadelta;
106 |         # these momentum/adadelta specific updates happen regardless of whether we use steady rate, momentum, or adadelta;
107 |         # if we use a steady rate those variables play no role in the computation of dW
108 |         for d in range(self.mlsl.max_depth):
109 |             self.mlsl.lstm_stack[d].WLSTM += self.mlsl.sum_of_dWs[d] / self.mlsl.number_of_nodes_per_level[d]
110 |             self.mlsl.lstm_stack[d].momentum_dW = self.mlsl.sum_of_dWs[d] / self.mlsl.number_of_nodes_per_level[d]
111 |             self.mlsl.lstm_stack[d].tot_gradient_weight = self.mlsl.sum_tot_gradient_weight[d] / self.mlsl.number_of_nodes_per_level[d]
112 |             self.mlsl.lstm_stack[d].tot_sq_gradient = self.mlsl.sum_tot_sq_gradient[d] / self.mlsl.number_of_nodes_per_level[d]
113 |             self.mlsl.lstm_stack[d].tot_delta_weight = self.mlsl.sum_tot_delta_weight[d] / self.mlsl.number_of_nodes_per_level[d]
114 |             self.mlsl.lstm_stack[d].tot_sq_delta = self.mlsl.sum_tot_sq_delta[d] / self.mlsl.number_of_nodes_per_level[d]
115 | 
116 | 
117 |     def run_through_the_model(self, instance_node, target, additional_input_to_nn):
118 |         self.mlsl._reset_learning_parameters()
119 |         return self.backward(self.get_objective_derivative(self.forward(instance_node, additional_input_to_nn, target), target), instance_node)
120 | 
--------------------------------------------------------------------------------
/json_plus.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Copyright 2014 Camiolog Inc.
4 | # Authors: Luca de Alfaro and Massimo Di Pierro
5 | 
6 | import base64
7 | import datetime
8 | import importlib
9 | import json
10 | import numbers
11 | import numpy
12 | import unittest
13 | import collections
14 | 
15 | 
16 | fallback = {}
17 | remapper = {}
18 | 
19 | 
20 | class Storage(dict):
21 |     __getattr__ = dict.__getitem__
22 |     __setattr__ = dict.__setitem__
23 | 
24 | 
25 | def smartcmp(a, b,
26 |              types=(int, long, basestring, float, bool, tuple)):
27 |     is_a_primitive = isinstance(a[1], types)
28 |     is_b_primitive = isinstance(b[1], types)
29 |     if is_a_primitive and not is_b_primitive:
30 |         return -1
31 |     elif not is_a_primitive and is_b_primitive:
32 |         return +1
33 |     else:
34 |         return cmp(a[0], b[0])
35 | 
36 | 
37 | class Serializable(object):
38 | 
39 |     # We mimic a dict.
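    # A minimal usage sketch of the dict-like interface (illustration only,
    # not part of the original file):
    #     s = Serializable()
    #     s.x = 1            # equivalent to s['x'] = 1
    #     'x' in s           # True; keys(), items(), len() all act on __dict__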
40 | def __getitem__(self, key): 41 | return getattr(self, key) 42 | def __setitem__(self, key, value): 43 | setattr(self, key, value) 44 | def __delitem__(self, key): 45 | del self.__dict__[key] 46 | def keys(self): 47 | return self.__dict__.keys() 48 | def items(self): 49 | return self.__dict__.items() 50 | def values(self): 51 | return self.__dict__.values() 52 | def update(self, d): 53 | self.__dict__.update(d) 54 | def __len__(self): 55 | return len(self.__dict__) 56 | def __contains__(self, item): 57 | return item in self.__dict__ 58 | def iteritems(self): 59 | return iter(self.__dict__.items()) 60 | def __repr__(self): 61 | return repr(self.__dict__) 62 | 63 | def get(self, k, d=None): 64 | try: 65 | return getattr(self, k) 66 | except AttributeError: 67 | return d 68 | 69 | def __eq__(self, other): 70 | return hasattr(other, '__dict__') and self.__dict__ == other.__dict__ 71 | 72 | def to_json(self, pack_ndarray=True, tolerant=True, indent=2): 73 | return Serializable.dumps(self, pack_ndarray=pack_ndarray, tolerant=tolerant, indent=indent) 74 | 75 | @staticmethod 76 | def dump(obj, fp, pack_ndarray=True, tolerant=True, indent=2): 77 | return fp.write(Serializable.dumps(obj, pack_ndarray=pack_ndarray, tolerant=tolerant, indent=indent)) 78 | 79 | @staticmethod 80 | def dumps(obj, pack_ndarray=True, tolerant=True, indent=2): 81 | def custom(o): 82 | if isinstance(o, Serializable): 83 | module = o.__class__.__module__.split('campil.')[-1] 84 | # make sure keys are sorted 85 | d = collections.OrderedDict() 86 | d['meta_class'] = '%s.%s' % (module, o.__class__.__name__) 87 | d.update(sorted((item for item in o.__dict__.iteritems() 88 | if not item[0].startswith('_')), smartcmp)) 89 | return d 90 | elif isinstance(o, datetime.datetime): 91 | d = {'meta_class': 'datetime.datetime', 92 | 'date': o.isoformat()} 93 | return d 94 | elif isinstance(o, set): 95 | d = {'meta_class': 'set', 96 | 'set': list(o)} 97 | return d 98 | elif isinstance(o, file): 99 | return '' % o.name 100 | 101 | elif pack_ndarray and isinstance(o, numpy.matrix): 102 | # This catches both numpy arrays, and CamArray. 103 | d = {'meta_class': 'numpy.matrix', 104 | 'dtype': str(o.dtype), 105 | 'shape': o.shape, 106 | 'data': base64.b64encode(o.tostring())} 107 | return d 108 | 109 | elif pack_ndarray and isinstance(o, numpy.ndarray): 110 | # This catches both numpy arrays, and CamArray. 111 | d = {'meta_class': 'numpy.ndarray', 112 | 'dtype': str(o.dtype), 113 | 'shape': o.shape, 114 | 'data': base64.b64encode(o.tostring())} 115 | return d 116 | 117 | # We try to preserve numpy numbers. 
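        # A numpy scalar such as numpy.float32(1.5) (example value) is encoded as a
        # dict with meta_class 'numpy.number', its dtype string, and the base64 of
        # its raw bytes, so the exact dtype survives a dumps/loads round trip; the
        # 'numpy.number' branch of the decoding hook below rebuilds it with frombuffer.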
118 | elif type(o).__module__ == numpy.__name__ and isinstance(o, numbers.Real): 119 | d = {'meta_class': 'numpy.number', 120 | 'dtype': str(o.dtype), 121 | 'data': base64.b64encode(o.tostring()) 122 | } 123 | return d 124 | 125 | # Normal Python types are unchanged 126 | elif isinstance(o, (int, long, basestring, float, bool, list, tuple)): 127 | return o 128 | # except dictionaries which are sorted 129 | elif isinstance(o, dict): 130 | d = collections.OrderedDict() 131 | d.update(sorted((item for item in o.iteritems()), smartcmp)) 132 | return d 133 | # These two defaults are catch-all 134 | elif isinstance(o, numbers.Integral): 135 | return int(o) 136 | elif isinstance(o, numbers.Real): 137 | return float(o) 138 | elif isinstance(o, (numpy.bool, numpy.bool_)): 139 | return bool(o) 140 | elif tolerant: 141 | return None 142 | else: 143 | raise ValueError("Cannot encode in json object %r" % o) 144 | return json.dumps(obj, default=custom, indent=indent) 145 | 146 | @staticmethod 147 | def from_json(s, objectify=True, mapper={}): 148 | """Decodes json_plus. 149 | @param s : the string to decode 150 | @param objectify : If True, reconstructs the object hierarchy. 151 | @param mapper : 152 | - If a dictonary, then the key classes are replaced by the value classes in the 153 | decoding. 154 | - If a class, then all objects that are not dates or numpy classes are decoded to 155 | this class. 156 | - If None, then all objects that are not dates or numpy classes are decoded to 157 | json_plus.Serializable.""" 158 | def hook(o): 159 | meta_module, meta_class = None, o.get('meta_class') 160 | if meta_class in ('Datetime', 'datetime.datetime'): 161 | # 'Datetime' included for backward compatibility 162 | try: 163 | tmp = datetime.datetime.strptime( 164 | o['date'], '%Y-%m-%dT%H:%M:%S.%f') 165 | except Exception, e: 166 | tmp = datetime.datetime.strptime( 167 | o['date'], '%Y-%m-%dT%H:%M:%S') 168 | return tmp 169 | elif meta_class == 'set': 170 | return set(o['set']) 171 | # Numpy arrays. 172 | elif meta_class == 'numpy.ndarray': 173 | data = base64.b64decode(o['data']) 174 | dtype = o['dtype'] 175 | shape = o['shape'] 176 | v = numpy.frombuffer(data, dtype=dtype) 177 | v = v.reshape(shape) 178 | obj = v.copy() 179 | obj.flags.writeable = True 180 | return obj 181 | elif meta_class == 'numpy.matrix': 182 | data = base64.b64decode(o['data']) 183 | dtype = o['dtype'] 184 | shape = o['shape'] 185 | v = numpy.frombuffer(data, dtype=dtype) 186 | v = v.reshape(shape) 187 | obj = numpy.matrix(v.copy()) 188 | obj.flags.writeable = True 189 | return obj 190 | # Numpy numbers. 191 | elif meta_class == 'numpy.number': 192 | data = base64.b64decode(o['data']) 193 | dtype = o['dtype'] 194 | v = numpy.frombuffer(data, dtype=dtype)[0] 195 | return v 196 | 197 | elif meta_class and '.' 
in meta_class: 198 | # correct for classes that have migrated from one module to another 199 | meta_class = mapper.get(meta_class, meta_class) 200 | meta_class = remapper.get(meta_class, meta_class) 201 | # separate the module name from the actual class name 202 | meta_module, meta_class = meta_class.rsplit('.',1) 203 | 204 | if meta_class is not None: 205 | del o['meta_class'] 206 | if mapper is None: 207 | obj = Serializable() 208 | obj.__dict__.update(o) 209 | o = obj 210 | elif isinstance(mapper, dict): 211 | # this option is for backward compatibility in case a module is not specified 212 | if meta_class in fallback: 213 | meta_module = fallback.get(meta_class) 214 | 215 | if meta_module is not None and objectify: 216 | try: 217 | module = importlib.import_module(meta_module) 218 | cls = getattr(module, meta_class) 219 | obj = cls() 220 | obj.__dict__.update(o) 221 | o = obj 222 | except Exception, e: 223 | # If an object is unknown, restores it as a member 224 | # of this same class. 225 | obj = Serializable() 226 | obj.__dict__.update(o) 227 | o = obj 228 | else: 229 | # Map all to the specified class. 230 | obj = mapper() 231 | obj.__dict__.update(o) 232 | o = obj 233 | elif type(o).__name__ == 'dict': 234 | # For convenience we deserialize dict into Storage. 235 | o = Storage(o) 236 | return o 237 | 238 | return json.loads(s, object_hook=hook) 239 | 240 | @staticmethod 241 | def loads(s): 242 | return Serializable.from_json(s) 243 | 244 | @staticmethod 245 | def load(fp): 246 | return Serializable.loads(fp.read()) 247 | 248 | 249 | loads = Serializable.loads 250 | dumps = Serializable.dumps 251 | 252 | class TestSerializable(unittest.TestCase): 253 | 254 | def test_simple(self): 255 | a = Serializable() 256 | a.x = 1 257 | a.y = 'test' 258 | a.z = 3.14 259 | b = Serializable.from_json(a.to_json()) 260 | self.assertEqual(a, b) 261 | 262 | def test_datetime(self): 263 | a = Serializable() 264 | a.x = datetime.datetime(2015,1,3) 265 | b = Serializable.from_json(a.to_json()) 266 | self.assertEqual(a, b) 267 | 268 | def test_recursive(self): 269 | a = Serializable() 270 | a.x = Serializable() 271 | a.x.y = 'test' 272 | b = Serializable.from_json(a.to_json()) 273 | self.assertEqual(a, b) 274 | 275 | def test_numpy(self): 276 | a = Serializable() 277 | a.x = numpy.array([[1,2,3],[4,5,6]], dtype=numpy.int32) 278 | b = Serializable.from_json(a.to_json(pack_ndarray=True)) 279 | self.assertEqual(numpy.sum(numpy.abs(a.x - b.x)), 0) 280 | 281 | def test_numpy_twice(self): 282 | a = Serializable() 283 | a.x = numpy.array([[1,2,3],[4,5,6]], dtype=numpy.int32) 284 | b = Serializable.from_json(a.to_json(pack_ndarray=True)) 285 | self.assertEqual(numpy.sum(numpy.abs(a.x - b.x)), 0) 286 | c = Serializable.from_json(b.to_json(pack_ndarray=True)) 287 | self.assertEqual(numpy.sum(numpy.abs(a.x - c.x)), 0) 288 | 289 | def test_numpy_direct(self): 290 | a = numpy.array([[1,2,3],[4,5,6]], dtype=numpy.int32) 291 | s = Serializable.dumps(a, pack_ndarray=True) 292 | c = Serializable.from_json(s) 293 | self.assertEqual(numpy.sum(numpy.abs(a - c)), 0) 294 | 295 | def test_float(self): 296 | x = numpy.float16(3.5) 297 | y = Serializable.from_json(Serializable.dumps(x)) 298 | self.assertAlmostEqual(y, x, 2) 299 | 300 | def test_numpy_uint32(self): 301 | x = numpy.uint32(55) 302 | s = Serializable.dumps(x) 303 | y = Serializable.from_json(s) 304 | self.assertEqual(x, y) 305 | self.assertEqual(str(x.dtype), 'uint32') 306 | self.assertEqual(str(y.dtype), 'uint32') 307 | 308 | def test_numpy_float128(self): 309 | x 
= numpy.float128(55.3)
310 |         s = Serializable.dumps(x)
311 |         y = Serializable.from_json(s)
312 |         self.assertAlmostEqual(x, y, 5)
313 |         self.assertEqual(str(x.dtype), 'float128')
314 |         self.assertEqual(str(y.dtype), 'float128')
315 | 
316 |     def test_set(self):
317 |         s = set(['a', 'b', 'c'])
318 |         x = Serializable.dumps(s)
319 |         t = Serializable.loads(x)
320 |         self.assertEqual(s, t)
321 | 
322 |     def test_multiple_dicts(self):
323 |         d = dict(cane=4, gatto=4, uccello=2)
324 |         d1 = Serializable.loads(Serializable.dumps(d))
325 |         d2 = Serializable.loads(Serializable.dumps(d1))
326 |         for k in d.keys():
327 |             self.assertEqual(d.get(k), d2.get(k))
328 |         for k in d2.keys():
329 |             self.assertEqual(d.get(k), d2.get(k))
330 | 
331 |     def test_modifiable(self):
332 |         a = numpy.zeros((10,10))
333 |         b = loads(dumps(a))
334 |         a[2:4, 5:6] = 1
335 |         b[2:4, 5:6] = 1
336 |         self.assertEqual(numpy.sum(numpy.abs(a - b)), 0)
337 | 
338 |     def test_matrices(self):
339 |         a = numpy.matrix(numpy.ones((4,5)))
340 |         b = numpy.matrix(numpy.ones((5, 6)))
341 |         ab = a * b
342 |         # print "Serialization:", dumps(a)
343 |         aa = loads(dumps(a))
344 |         bb = loads(dumps(b))
345 |         # print "Deserialized types:", type(aa), type(bb)
346 |         aabb = aa * bb
347 |         self.assertEqual(numpy.sum(numpy.abs(ab - aabb)), 0)
348 | 
349 | if __name__ == '__main__':
350 |     unittest.main()
351 | 
--------------------------------------------------------------------------------
/test_MLSL.py:
--------------------------------------------------------------------------------
1 | # This code is useful for testing the MLSL package.
2 | 
3 | import test_bipartite_user_item_reviews as bp
4 | import multi_level_lstm as ml
5 | import random
6 | import numpy as np
7 | import unittest
8 | 
9 | def test_model(mlsl_model, test_set):
10 |     guesses = 0
11 |     hits = 0
12 |     found = {}
13 |     missed = {}
14 |     misclassified = {}
15 |     for item in test_set:
16 |         Y = softmax(mlsl_model.forward_propagation(item))
17 |         if Y is None:
18 |             continue
19 |         print Y
20 |         predicted_label = Y.argmax()
21 |         real_label = item.get_label()
22 |         print "Predicted label ", predicted_label, " real label", real_label
23 |         guesses += 1
24 |         hits += 1 if predicted_label == real_label else 0
25 |         if predicted_label == real_label:
26 |             if real_label not in found:
27 |                 found[real_label] = 1
28 |             else:
29 |                 found[real_label] += 1
30 |         if predicted_label != real_label:
31 |             if real_label not in missed:
32 |                 missed[real_label] = 1
33 |             else:
34 |                 missed[real_label] += 1
35 |             if predicted_label not in misclassified:
36 |                 misclassified[predicted_label] = 1
37 |             else:
38 |                 misclassified[predicted_label] += 1
39 |     print "LSTM results"
40 |     print "============================================================="
41 |     print "Predicted correctly ", hits, "over ", guesses, " instances."
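    # The per-label metrics computed below follow the usual definitions:
    #     recall(label)    = found[label] / (found[label] + missed[label])
    #     precision(label) = found[label] / (found[label] + misclassified[label])
    # where missed counts items of this label predicted as something else, and
    # misclassified counts items of other labels predicted as this label.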
42 | recall_list = [] 43 | recall_dict = {} 44 | precision_dict = {} 45 | found_labels = set(found.keys()) 46 | missed_labels = set(missed.keys()) 47 | all_labels = found_labels.union(missed_labels) 48 | for label in all_labels: 49 | no_of_finds = float((0 if label not in found else found[label])) 50 | no_of_missed = float((0 if label not in missed else missed[label])) 51 | no_of_misclassified = float((0 if label not in misclassified else misclassified[label])) 52 | recall = no_of_finds / (no_of_finds + no_of_missed) 53 | precision = no_of_finds / (no_of_finds + no_of_misclassified) 54 | recall_dict[label] = recall 55 | precision_dict[label] = precision 56 | recall_list.append(recall) 57 | avg_recall = np.mean(recall_list) 58 | print "Average recall ", np.mean(recall_list) 59 | if len(all_labels) == 2: # compute F-1 score for binary classification 60 | for label in all_labels: 61 | print "F-1 score for label ", label, " is : ", 62 | print 2 * (precision_dict[label] * recall_dict[label]) / (precision_dict[label] + recall_dict[label]) 63 | return avg_recall 64 | 65 | def create_synthetic_graph_with_informative_extra_feature(no_of_items, no_of_users, no_of_votes, min_grade, max_grade, min_delay, max_delay, threshold, exclude_true_grade_from_random_answers = False, seed = 500): 66 | """ 67 | Creates random graph. 68 | Adds extra feature in review: if value is above threshold, the user performing the review is always truthful. 69 | Intended for testing the MLSL network across multiple levels by feeding extra information selectively to particular levels. 70 | When exclude_true_grade_from_random_answers is True the true grade is excluded from the possible random grades, i.e. the random review grade is alway not the true grade. 71 | 72 | """ 73 | g = bp.Graph() 74 | random.seed(seed) 75 | # add items and their inherent random grades to the graph 76 | # inherent grades are uniformly distributed between min_grade and max_grade 77 | item_ids = [str(i) for i in range(no_of_items)] 78 | for i in item_ids: 79 | g.add_item(i, random.randint(min_grade,max_grade)) 80 | # item_ids is a list that contains the ids of items that are still alive, i.e. 81 | # the ones that have received less than no_of_votes votes. 82 | # Initially before users cast their votes, the list contains all items. 
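    # Sketch of the voting loop below: each user draws a single extra_informative
    # value uniformly in [min_delay, max_delay]; if it exceeds the threshold, all
    # of that user's reviews report the item's inherent grade, otherwise they
    # report a randomly chosen grade.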
83 | user_ids = [str(u) for u in range(no_of_users)] 84 | for u in user_ids: # iteration over users and random picking of items to vote 85 | user_votes = 0 86 | items_complete_with_votes = [] 87 | # shuffling items so that user picks items to vote randomly 88 | random.shuffle(item_ids) 89 | extra_informative = random.randint(min_delay, max_delay) 90 | for i in item_ids: 91 | # if item has over the max number of votes or user has already voted on it 92 | # then continue to next item 93 | if g.get_no_of_votes(i) >= no_of_votes: 94 | items_complete_with_votes.append(i) 95 | continue 96 | if g.has_voted(u,i): 97 | continue 98 | # all clear to create review for item 99 | possible_random_grades = range(min_grade, max_grade + 1) 100 | if exclude_true_grade_from_random_answers: 101 | possible_random_grades.remove(g.get_item(i).inherent) 102 | review = bp.Review( grade = g.get_item(i).inherent if extra_informative>threshold else random.choice(possible_random_grades), extra_informative_feature = extra_informative) 103 | g.add_review(u,i,review) 104 | user_votes += 1 105 | # if item has exceeded max number of votes add it to the list for removal 106 | if g.get_no_of_votes(i) >= no_of_votes: 107 | items_complete_with_votes.append(i) 108 | # if user has cast more than max number of votes, break loop and continue to next user 109 | if user_votes == no_of_votes: 110 | break 111 | # remove items with more than max number of votes from item_ids list to avoid iterating over them in the future (makes execution faster) 112 | for i in items_complete_with_votes: 113 | item_ids.remove(i) 114 | return g 115 | 116 | 117 | def test_for_multiple_layers(print_graph = False, max_depth = 0, informative_features = None): 118 | votes = 3 119 | g = create_synthetic_graph_with_informative_extra_feature(no_of_items = 3000, no_of_users = 3000, 120 | no_of_votes = votes, min_grade = 0, max_grade = 10, 121 | min_delay= 0.0, max_delay = 1000.0, 122 | threshold = 700.0) 123 | random.seed(940) 124 | itemList = list(g.items) 125 | if print_graph: 126 | for u in g.users: 127 | print "User ", u.name, "voted items:" 128 | for i in u.reviews: 129 | print "Item ", i.id, "Inherent grade:", i.inherent, "User grade:", u.reviews[i].grade, "Extra feature: ", u.reviews[i].extra_informative_feature 130 | instance_list = [] 131 | counter = 0 132 | for i in itemList: 133 | new_root = ml.InstanceNode(label = i.inherent) 134 | build_unfolding(0, max_depth, i, new_root, informative_features) 135 | new_root.set_label(i.inherent) 136 | instance_list.append(new_root) 137 | counter +=1 138 | if counter % 200 ==0: 139 | print "Created unfolding for ", counter, "items." 
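    # Configuration sketch for the run below: the 11 base input features per level
    # are a one-hot encoding of the review grade (grades 0..10); for each level
    # marked "include", one extra informative feature is appended to that level's input.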
140 | OUTPUT_SIZES = [11, 2, 2] 141 | INPUT_SIZES = [11 + (1 if informative_features[0] == "include" else 0),11 + (1 if informative_features[1] == "include" else 0), 142 | 11 + (1 if informative_features[2] == "include" else 0)] 143 | LEARNING_RATE_VECTOR = [0.05,0.1, 4.5] 144 | LEARNING_METHOD_VECTOR = ["steady_rate", "steady_rate","steady_rate"] 145 | #LEARNING_METHOD_VECTOR = ["momentum", "momentum", "momentum"] 146 | #LEARNING_METHOD_VECTOR = ["adadelta", "adadelta", "adadelta"] 147 | MOMENTUM_VECTOR = [0.01, 0.01, 0.01] 148 | ADADELTA_VECTOR = [{"learning_factor" : 1.0, "epsilon" : 0.001, "decay" : 0.95}, {"learning_factor" : 1.0, "epsilon" : 0.001, "decay" : 0.95}, {"learning_factor" : 1.0, "epsilon" : 0.001, "decay" : 0.95}] 149 | OBJECTIVE_FUNCTION = "softmax_classification" 150 | mlsl_model = ml.MLSL(max_depth + 1, output_sizes= OUTPUT_SIZES[:max_depth + 1], node_feature_sizes= INPUT_SIZES[:max_depth + 1], learning_rate_vector= LEARNING_RATE_VECTOR[:max_depth + 1], learning_method_vector= LEARNING_METHOD_VECTOR[:max_depth + 1]) 151 | random.shuffle(instance_list) 152 | training_set = instance_list[0:2000] 153 | test_set = instance_list[2000:3000] 154 | print "Training starts for ", max_depth + 1, " levels" 155 | train_model_force_balance(mlsl_model, training_set, num_instances = 50000, 156 | max_depth= max_depth, objective_function= OBJECTIVE_FUNCTION, 157 | learning_rate_vector= LEARNING_RATE_VECTOR, learning_method_vector = LEARNING_METHOD_VECTOR, 158 | momentum_vector= MOMENTUM_VECTOR, adadelta_parameters = ADADELTA_VECTOR) 159 | return test_model(mlsl_model, test_set) 160 | 161 | def build_unfolding(current_depth, max_depth, bipartite_node, tree_node, informative_features = None, parent_user_informative_feature = None): 162 | for c in bipartite_node.reviews: 163 | number_of_features = 12 if informative_features[current_depth] == "include" else 11 164 | feature_vector = np.zeros(number_of_features) 165 | feature_vector[bipartite_node.reviews[c].grade] = 1.0 166 | if current_depth == 0: # nake explicit honesty feature to feed to the 3rd level for 3 level training test 167 | if bipartite_node.inherent == bipartite_node.reviews[c].grade: 168 | honesty = 1.0 169 | else: 170 | honesty = 0.0 171 | extra_feature = bipartite_node.reviews[c].extra_informative_feature / 1000.0 172 | if number_of_features == 12: 173 | feature_vector[11] = extra_feature if current_depth < 2 else parent_user_informative_feature 174 | child_node = ml.InstanceNode(feature_vector = feature_vector.copy()) 175 | if current_depth < max_depth: 176 | build_unfolding(current_depth + 1, max_depth, bipartite_node = c, tree_node= child_node, informative_features = informative_features, 177 | parent_user_informative_feature = honesty if current_depth == 0 else parent_user_informative_feature) 178 | tree_node.children.append(child_node) 179 | 180 | 181 | """ 182 | trains MLSL with stochastic gradient descent 183 | by imposing class balance, i.e. 
shows equal number of examples of all classes to the network during training 184 | """ 185 | def train_model_force_balance(mlsl_model, training_set, num_instances, max_depth, objective_function, learning_rate_vector, learning_method_vector, momentum_vector = None, adadelta_parameters = None): 186 | counter = 0 187 | if num_instances == 0: 188 | return 189 | for item in get_balanced_training_set(training_set, mlsl_model.output_sizes[0]): 190 | if item.get_number_of_children() == 0: 191 | continue 192 | target = np.zeros((1,mlsl_model.output_sizes[0])) 193 | target[0,item.get_label()] = 1.0 194 | mlsl_model._reset_learning_parameters() 195 | Y = softmax(mlsl_model.forward_propagation(item)) 196 | mlsl_model.backward_propagation(item, Y - target) 197 | counter += 1 198 | if counter % 1000 == 0: 199 | print "Training has gone over", counter, " instances.." 200 | if counter == num_instances: 201 | break 202 | 203 | def softmax(w, t = 1.0): 204 | e = np.exp(np.array(w) / t) 205 | dist = e / np.sum(e) 206 | return dist 207 | 208 | """ Generator that returns items from training set 209 | equally balanced among classes""" 210 | def get_balanced_training_set(training_set, no_of_classes): 211 | # make bucket of classes to sample from 212 | buckets = {} 213 | buckets_current_indexes ={} 214 | for i in range(0, no_of_classes): 215 | buckets[i] = [] 216 | buckets_current_indexes[i] = 0 217 | for item in training_set: 218 | category = item.get_label() 219 | buckets[category].append(item) 220 | while True: 221 | for i in range(0,no_of_classes): 222 | if len(buckets[i]) == 0: # if a class has no representatives, continue 223 | continue 224 | if buckets_current_indexes[i] == len(buckets[i]): 225 | buckets_current_indexes[i] = 0 226 | yield buckets[i][buckets_current_indexes[i]] 227 | buckets_current_indexes[i] += 1 228 | 229 | 230 | class SimpleLearningTest(unittest.TestCase): 231 | 232 | def test_graph(self): 233 | # test for 1 level 234 | # by changing 'exclude' to 'includde' we include the informative feature 235 | # and expect performance to improve 236 | first_level_performance = test_for_multiple_layers(print_graph= False, max_depth = 0, informative_features = ["exclude", "NA", "NA"]) 237 | first_level_additional_feature_performance = test_for_multiple_layers(print_graph= False, max_depth = 0, informative_features = ["include","NA","NA"]) 238 | 239 | # test for 2 levels 240 | # the 2 level (max_depth = 1) beats the 1 level as it can learn the informative feature at the second level 241 | second_level_no_additional_performance = test_for_multiple_layers(print_graph= False, max_depth = 1, informative_features = ["exclude", "exclude", "NA"]) 242 | second_level_additional_performance = test_for_multiple_layers(print_graph= False, max_depth = 1, informative_features = ["exclude", "include", "NA"]) 243 | 244 | # test for 3 levels 245 | third_level_additional_performance = test_for_multiple_layers(print_graph= False, max_depth = 2, informative_features = ["exclude", "exclude", "include"]) 246 | 247 | print "\n\n\nAggregate performance comparison, test results" 248 | print "----------------------------------------------" 249 | print "1-MLSL performance, no additional informative feature : ", first_level_performance 250 | print "1-MLSL performance, with additional informative feature :", first_level_additional_feature_performance 251 | print "-----" 252 | print "Additional feature enhances performance -- training OK!" 
if first_level_additional_feature_performance> first_level_performance else "Not OK" 253 | print "-----" 254 | print "2-MLSL performance, no additional informative feature :", second_level_no_additional_performance 255 | print "2-MLSL performance, additional informative feature fed to second level *only* :", second_level_additional_performance 256 | print "-----" 257 | print "Additional feature at second level enhances performance -- second level training OK!" if second_level_additional_performance> second_level_no_additional_performance else "Not OK" 258 | print "-----" 259 | print "3-MLSL performance, additional informative feature on parent user honesty fed to third level *only* :", third_level_additional_performance 260 | print "-----" 261 | print "Additional feature at third level enhances performance -- third level training OK!" if third_level_additional_performance> second_level_no_additional_performance else "Not OK" 262 | print "-----" 263 | self.assertGreater(first_level_additional_feature_performance, first_level_performance,"1 Level training not OK, retrain!") 264 | self.assertGreater(second_level_additional_performance, second_level_no_additional_performance,"2 level training not OK, retrain!") 265 | self.assertGreater(third_level_additional_performance,second_level_no_additional_performance, "3 level training not OK, retrain!") 266 | # If one occasionally gets Not OK results, retrain as the random initiliazation of the weights can sometimes trap the network 267 | 268 | if __name__ == '__main__': 269 | unittest.main() -------------------------------------------------------------------------------- /multi_level_lstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Multi-Level Sequence Learners using LSTMs. 3 | See https://sites.google.com/view/ml-on-structures for overall project page, and papers 4 | that describe these learners. 5 | 6 | Authors: 7 | Rakshit Agrawal 8 | Luca de Alfaro 9 | Vassilis Polychronopoulos 10 | Copyright by the authors, 2016. 11 | """ 12 | 13 | import lstm 14 | import numpy as np 15 | import random 16 | from json_plus import Serializable, Storage 17 | 18 | class UnknownLearningMethod(Exception): 19 | def __init__(self, s): 20 | self.message = s 21 | 22 | 23 | class MLSL(Serializable): 24 | 25 | def __init__(self, max_depth, output_sizes, node_feature_sizes, 26 | learning_rate_vector, learning_method_vector, 27 | shuffle_levels=[], 28 | adadelta_parameters=None, 29 | momentum_vector=None): 30 | """Initializes a multi-level LSTM. 31 | The ML-LSTM has max_depth layers. Layer 0 is the root node. 32 | Layers max_depth - 1 to 0 have LSTMs in them. 33 | Layer max_depth is simply composed of graph nodes, which forward their 34 | features to the LSTMs of level max_depth - 1. 35 | The output of level i consists in the LSTM features computed from the children of i; 36 | it does not contain any features computed from the node at level i itself. 37 | The features of the node at level i will be passed to node at level i-1 along 38 | with the LSTM output. 39 | 40 | @param max_depth: As noted above. 41 | @param node_feature_sizes: How many features are produced by a node, according to its depth. 42 | This can go from 0 to max_depth (included). Be careful: unless e.g. the graph is 43 | bipartite, you need to use the same number throughout. 44 | @param output_sizes: How many features are produced by LSTMs at different depth. This 45 | does not need to be constant. 
46 | @param learning_rate_vector: Vector of learning rates. 47 | @param learning_method_vector: Vector of learning methods. It can be None, in which case 48 | adadelta is used, or it can be a vector consisting of 'adadelta' or 'momentum' 49 | or 'steady_rate' (the latter is not recommended) for each layer. 50 | @param momentum_vector: vector containing momentums for learning. It can be None if 51 | adadelta is used. 52 | @param adadelta_parameters: vector of adadelta parameters. It can be None if momentum 53 | learning is used. 54 | @param shuffle_children: a list (or set) of depths at which shuffling is to occur. 55 | """ 56 | # First, some sanity checks. 57 | assert max_depth > 0 58 | assert len(output_sizes) == max_depth 59 | assert len(node_feature_sizes) == max_depth 60 | assert len(learning_method_vector) == max_depth 61 | assert adadelta_parameters is None or len(adadelta_parameters) == max_depth 62 | assert adadelta_parameters is not None or all(m != 'adadelta' for m in learning_method_vector) 63 | assert momentum_vector is None or len(momentum_vector) == max_depth 64 | assert momentum_vector is not None or all(m == 'steady_rate' for m in learning_method_vector) 65 | #assert [i < max_depth for i in shuffle_levels] 66 | 67 | self.output_sizes = output_sizes 68 | self.node_feature_sizes = node_feature_sizes 69 | self.max_depth = max_depth 70 | self.learning_rate_vector = learning_rate_vector 71 | self.learning_method_vector = learning_method_vector 72 | self.adadelta_parameters = adadelta_parameters 73 | self.momentum_vector = momentum_vector 74 | self.shuffle_levels = shuffle_levels 75 | 76 | # Creates the list of LSTMs, one per level. 77 | self.lstm_stack = [lstm.LSTM() for _ in range(max_depth)] 78 | for l in range(max_depth): 79 | self.lstm_stack[l].initialize( 80 | node_feature_sizes[l] + (0 if l == max_depth - 1 else output_sizes[l + 1]), 81 | output_sizes[l]) 82 | 83 | # we need the following structures, when training with momentum and/or adadelta, 84 | # to keep track of the sum of dW at each level in order to update the momentum_dW 85 | # or the adadelta parameters of the respective LSTM modules. 86 | self.number_of_nodes_per_level = None 87 | self.sum_of_dWs = None 88 | self.sum_tot_sq_gradient = None 89 | self.sum_tot_gradient_weight = None 90 | self.sum_tot_sq_delta = None 91 | self.sum_tot_delta_weight = None 92 | 93 | 94 | def forward_propagation(self, instance_node, instance_depth=0): 95 | """Performs forward propagation through the multi-level LSTM structure. 96 | The node instance_node at depth instance_depth is propagated. 97 | The node should be an object of class InstanceNode.""" 98 | # Prepares for back-propagation. 99 | self._reset_learning_parameters() 100 | input_sequence = np.array([]) 101 | children_sequence = list(instance_node.get_children()) 102 | if len(children_sequence) == 0: 103 | # FIXME We should really have a feature that describes the number of children. 104 | # This loses any data that might be associated with the node itself. 105 | return -100 * np.ones(self.output_sizes[instance_depth]) # no children signifier vector 106 | if instance_depth in self.shuffle_levels: 107 | # Shuffles children order if required. 108 | random.shuffle(children_sequence) 109 | for child_node in children_sequence: 110 | child_node_feature_vector = child_node.get_feature_vector() 111 | assert len(child_node_feature_vector) == self.node_feature_sizes[instance_depth] 112 | # If we are not at the very bottom we need to get input from LSTM at the next level. 
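            # When output from the level below is present, the full input row for a
            # child at depth d has size output_sizes[d + 1] + node_feature_sizes[d];
            # at the bottom level it is just node_feature_sizes[d]. This matches the
            # LSTM input sizes set up in __init__.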
113 | LSTM_output_from_below = np.array([]) 114 | if instance_depth < self.max_depth - 1: 115 | LSTM_output_from_below = self.forward_propagation(child_node, instance_depth=instance_depth + 1).reshape( 116 | self.output_sizes[instance_depth + 1]) # recursive call 117 | # concatenate feature vector and input from LSTM output below 118 | full_feature_vector = np.concatenate((LSTM_output_from_below, child_node_feature_vector)) 119 | # concatenate current feature vector to input sequence for the LSTM 120 | # TODO: This is very confusing; can you change this to use row and column stacking? 121 | input_sequence = np.concatenate((input_sequence, full_feature_vector)) 122 | # forward the input sequence to this depth's LSTM 123 | input_sequence = input_sequence.reshape(len(children_sequence), 1, len(full_feature_vector)) 124 | _, _, Y, cache = self.lstm_stack[instance_depth]._forward(input_sequence) 125 | # We store the state of the LSTM, so we can use it for back-propagation. 126 | instance_node.cache.lstm_cache = cache 127 | # we also need to save the sequence in the same order we used it. 128 | instance_node.children_sequence = children_sequence 129 | return Y 130 | 131 | 132 | def backward_propagation(self, instance_node, derivative, instance_depth=0): 133 | """Performs backward propagation, given a loss derivative for the outputs.""" 134 | # First, we backpropagate through the layers the backward gradient. 135 | self._compute_backward_gradients(instance_node, derivative, instance_depth) 136 | # Second, we compute (but we do not apply) the update at all layers 137 | # of the MLSL. We don't apply it because at every layer, there are in 138 | # general multiple instances of an LSTM, and we will have to add all the 139 | # updates for an LSTM at the same level before applying them. 140 | self._compute_LSTM_updates(instance_node, instance_depth) 141 | # Finally, once the updates have been computed, it applies them 142 | # to all the levels of the LSTM. 143 | self._apply_LSTM_updates() 144 | 145 | 146 | def _reset_learning_parameters(self): 147 | """This function should be called before any learning step.""" 148 | self.number_of_nodes_per_level = [0 for _ in range(self.max_depth + 1)] 149 | self.sum_of_dWs = [0.0 for _ in range(self.max_depth)] 150 | self.sum_tot_sq_gradient = [0.0 for _ in range(self.max_depth)] 151 | self.sum_tot_gradient_weight = [0.0 for _ in range(self.max_depth)] 152 | self.sum_tot_sq_delta = [0.0 for _ in range(self.max_depth)] 153 | self.sum_tot_delta_weight = [0.0 for _ in range(self.max_depth)] 154 | 155 | 156 | def _compute_backward_gradients(self, instance_node, derivative, instance_depth): 157 | """Recursive function to compute the backward gradients at all levels 158 | of the MLSL. 
The gradients are left in instance_node.cache.weight_gradient.""" 159 | dX, g, _, _ = self.lstm_stack[instance_depth].backward_return_vector_no_update( 160 | d = derivative, cache = instance_node.cache.lstm_cache) 161 | instance_node.cache.weight_gradient = g 162 | if instance_depth == self.max_depth: 163 | return 164 | for idx, item in enumerate(instance_node.children_sequence): 165 | if item.cache == {}: 166 | continue 167 | input_derivatives = dX[idx, :, 0:self.output_sizes[instance_depth + 1]] 168 | if instance_depth < self.max_depth: 169 | feature_derivatives = dX[idx, :, self.output_sizes[instance_depth + 1]:] 170 | else: 171 | feature_derivatives = dX[idx, :, :] 172 | instance_node.children_sequence[idx].gradient = feature_derivatives 173 | self._compute_backward_gradients(item, input_derivatives, instance_depth + 1) 174 | 175 | 176 | def _compute_LSTM_updates(self, instance_node, current_depth): 177 | """Computes the update to the LSTM coefficients, recurrently down 178 | the tree of nodes.""" 179 | # First, computes the update for the current node. 180 | method = self.learning_method_vector[current_depth] 181 | if method == "steady_rate": 182 | self._compute_update_LSTM_weights_steady_rate(instance_node, current_depth) 183 | elif method == "momentum": 184 | self._compute_update_LSTM_weights_with_momentum(instance_node, current_depth) 185 | elif method == "adadelta": 186 | self._compute_update_LSTM_weights_adadelta(instance_node, current_depth) 187 | else: 188 | raise UnknownLearningMethod(method) 189 | # Then, recurs down the tree. 190 | if current_depth == self.max_depth - 1: 191 | return 192 | for item in instance_node.children_sequence: 193 | self._compute_LSTM_updates(item, current_depth + 1) 194 | 195 | 196 | def _compute_update_LSTM_weights_steady_rate(self, instance_node, current_depth): 197 | """Computes the LSTM weight update at steady rate.""" 198 | if instance_node.cache is not None: 199 | dW = - self.learning_rate_vector[current_depth] * instance_node.cache.weight_gradient 200 | self.sum_of_dWs[current_depth] += dW 201 | self.number_of_nodes_per_level[current_depth] += 1 202 | 203 | 204 | def _compute_update_LSTM_weights_with_momentum(self, instance_node, current_depth): 205 | """Computes the LSTM weight update using momentum.""" 206 | if instance_node.cache is not None: 207 | if self.lstm_stack[current_depth].momentum_dW is None: # initialize momentum of LSTM to zero 208 | self.lstm_stack[current_depth].momentum_dW = np.zeros(self.lstm_stack[current_depth].WLSTM.shape) 209 | dW = (- self.learning_rate_vector[current_depth] * instance_node.cache.weight_gradient 210 | + self.momentum_vector[current_depth] * self.lstm_stack[current_depth].momentum_dW) 211 | self.lstm_stack[current_depth].WLSTM += dW 212 | self.sum_of_dWs[current_depth] += dW 213 | self.number_of_nodes_per_level[current_depth] += 1 214 | 215 | 216 | def _compute_update_LSTM_weights_adadelta(self, instance_node, current_depth): 217 | """Computes the LSTM weight update using adadelta.""" 218 | # obtain adadelta parameters 219 | decay = self.adadelta_parameters[current_depth]["decay"] 220 | epsilon = self.adadelta_parameters[current_depth]["epsilon"] 221 | learning_factor = self.adadelta_parameters[current_depth]["learning_factor"] 222 | # do the adadelta updates 223 | if instance_node.cache is not None: 224 | instance_node.tot_sq_gradient = (self.lstm_stack[current_depth].tot_sq_gradient * decay 225 | + np.sum(np.square(instance_node.cache.weight_gradient))) 226 | instance_node.tot_gradient_weight = 
self.lstm_stack[current_depth].tot_gradient_weight * decay + 1.0 227 | # Computes the speed. 228 | rms_delta = np.sqrt((self.lstm_stack[current_depth].tot_sq_delta + epsilon) 229 | / (self.lstm_stack[current_depth].tot_delta_weight + epsilon)) 230 | rms_gradient = np.sqrt((instance_node.tot_sq_gradient + epsilon) 231 | / (instance_node.tot_gradient_weight + epsilon)) 232 | s = rms_delta / rms_gradient 233 | # Computes the delta. 234 | delta = s * instance_node.cache.weight_gradient 235 | instance_node.tot_sq_delta = self.lstm_stack[current_depth].tot_sq_delta * decay + np.sum(np.square(delta)) 236 | instance_node.tot_delta_weight = self.lstm_stack[current_depth].tot_delta_weight * decay + 1.0 237 | # Finally, updates the weights. 238 | dW = - delta * learning_factor 239 | self.sum_of_dWs[current_depth] += dW 240 | self.number_of_nodes_per_level[current_depth] += 1 241 | self.sum_tot_sq_gradient[current_depth] += instance_node.tot_sq_gradient 242 | self.sum_tot_gradient_weight[current_depth] += instance_node.tot_gradient_weight 243 | self.sum_tot_sq_delta[current_depth] += instance_node.tot_sq_delta 244 | self.sum_tot_delta_weight[current_depth] += instance_node.tot_delta_weight 245 | 246 | 247 | def _apply_LSTM_updates(self): 248 | """Applies the updates that have been computed to the LSTM.""" 249 | for d in range(self.max_depth): 250 | self.lstm_stack[d].WLSTM += self.sum_of_dWs[d] / self.number_of_nodes_per_level[d] 251 | self.lstm_stack[d].momentum_dW = self.sum_of_dWs[d] / self.number_of_nodes_per_level[d] 252 | self.lstm_stack[d].tot_gradient_weight = self.sum_tot_delta_weight[d] / self.number_of_nodes_per_level[d] 253 | self.lstm_stack[d].tot_sq_gradient = self.sum_tot_sq_gradient[d] / self.number_of_nodes_per_level[d] 254 | self.lstm_stack[d].tot_delta_weight = self.sum_tot_delta_weight[d] / self.number_of_nodes_per_level[d] 255 | self.lstm_stack[d].tot_sq_delta = self.sum_tot_sq_delta[d] / self.number_of_nodes_per_level[d] 256 | 257 | 258 | 259 | # the following class represents nodes of the unfoldings 260 | # the MLSL module understands and can train and test on tree instances that are encoded as objects of this class 261 | 262 | class InstanceNode(Serializable): 263 | """In order to use an MLSL, we need to pass to it a tree (tree, NOT dag) 264 | of these InstanceNode. 265 | At the end of the processing, the gradient attribute of each node 266 | will contain the backpropagation of the loss derivative to the feature 267 | vector of the node itself.""" 268 | def __init__(self, feature_vector = None, label = None, id = None): 269 | self.id = id 270 | self.feature_vector = feature_vector 271 | self.label = label 272 | self.children = [] 273 | self.children_sequence = [] # Stores the specific order by which the items were fed into the LSTM to update weights correctly 274 | # The gradient backpropagated at this node will be left here. 275 | # It can be used for further back-propagation as needed. 276 | self.gradient = None 277 | # Here we store intermediate values useful for the processing. 
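        # In particular, forward_propagation stores the LSTM forward state in
        # cache.lstm_cache, and _compute_backward_gradients later leaves the
        # per-node weight gradient in cache.weight_gradient.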
278 | self.cache = Storage() 279 | 280 | def set_label(self, label): 281 | self.label = label 282 | 283 | def get_number_of_children(self): 284 | return len(self.children) 285 | 286 | def get_label(self): 287 | return self.label 288 | 289 | def get_children(self): 290 | return self.children 291 | 292 | def get_feature_vector(self): 293 | return self.feature_vector -------------------------------------------------------------------------------- /dnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This class implements a neural net with forward and backpropagation. 4 | # No specific loss function is used. Rather, the backpropagation can 5 | # backpropagate any derivative with respect to a loss function, 6 | # and learn accordingly. 7 | 8 | """ 9 | Copyright (c) 2015, Camiolog Inc. 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without modification, 13 | are permitted provided that the following conditions are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright notice, 16 | this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright notice, 19 | this list of conditions and the following disclaimer in the documentation 20 | and/or other materials provided with the distribution. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS 26 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 32 | THE POSSIBILITY OF SUCH DAMAGE. 33 | """ 34 | 35 | # This code has been developed by Luca de Alfaro for Camiolog, Inc., 36 | # and is here released under BSD license. 37 | # The code is derived from http://arctrix.com/nas/python/bpnn.py, 38 | # developed by Neil Schemenauer and placed in the 39 | # public domain. 40 | 41 | 42 | from json_plus import Serializable 43 | import numpy as np 44 | import unittest 45 | 46 | # Type to be used for floats. 47 | FLOAT_TYPE = 'double' 48 | 49 | # Used for weight initialization. 50 | NEURON_OVERLAP = 2.0 51 | 52 | # These nets are between 0..1, hence the choice of sigmoid function. 53 | def sigmoid(x): 54 | return 1.0 / (1.0 + np.exp(-x)) 55 | 56 | # This is the derivative of a sigmoid as a function of the inputs. 57 | def dsigmoid_in(x): 58 | e = np.exp(-x) 59 | return e / ((1.0 + e) ** 2) 60 | 61 | # This is the derivative of a sigmoid as a function of the output. 62 | def dsigmoid_out(y): 63 | return np.multiply(y, (1.0 - y)) 64 | 65 | 66 | class DNN(Serializable): 67 | """This class implements a neural net with inputs between 0..1, 68 | and outputs between 0..1. The net can have a specified number of 69 | neurons in the hidden layer.""" 70 | 71 | def __init__(self, debug=False): 72 | """Do not call; no initialization is done. 
Use the initialize method below.""" 73 | self.debug = debug 74 | 75 | 76 | def initialize(self, nnl, seed=None, weight_range=1.0): 77 | """Produces a new net. 78 | nnl is a list, consisting of the number of values in each layer. 79 | The first element of nnl is the number of inputs, and the last element is 80 | the number of outputs. 81 | weight_range controls the scale of the initial weights; the default of 1.0 82 | is recommended. 83 | The weights are drawn from a uniform distribution and then normalized 84 | following Nguyen-Widrow (see below). 85 | seed is a seed for the random number generator. 86 | """ 87 | # Sanity check. 88 | for n in nnl: 89 | assert n > 0 90 | self.nnl = nnl 91 | self.num_layers = len(nnl) - 1 # Number of layers. 92 | # Private random number generator. 93 | self.random_generator = np.random.RandomState(seed=seed) 94 | # w is a list of numpy arrays. 95 | # Each numpy array contains the matrix of weights from that layer to the next. 96 | # The element w[l][i,j] indicates the weight from element i of layer l 97 | # to element j of layer l + 1. 98 | self.w = [] 99 | # c, old_dw are exactly like w, but store the weight momentums. 100 | # We set them to None initially, as we use them only if needed 101 | # according to the update method. 102 | self.c, self.old_dw = None, None 103 | # These are for AdaDelta. 104 | self.tot_sq_delta, self.tot_sq_gradient = None, None 105 | self.tot_delta_weight, self.tot_gradient_weight = None, None 106 | for n in range(self.num_layers): 107 | # Initializes the weights. 108 | ww = np.matrix(self.random_generator.uniform( 109 | -1.0, +1.0, size=(nnl[n] + 1, nnl[n + 1]))) 110 | # Normalizes according to Nguyen-Widrow; see 111 | # http://web.stanford.edu/class/ee373b/nninitialization.pdf 112 | # Computes the modulus of the weights for each neuron. 113 | wwmod = np.sqrt(np.sum(np.multiply(ww, ww), axis=0)) 114 | # Computes the ideal interval width, actually the reciprocal of the width. 115 | int_width = NEURON_OVERLAP * weight_range * np.power(nnl[n + 1], 1.0 / nnl[n]) 116 | ww_norm = int_width * (ww / wwmod) 117 | self.w.append(ww_norm) 118 | # Creates the matrix b of activations. Again, we store a list, in which b[n] is a 119 | # vector consisting of nnl[n] elements, containing the output activations of layer n. 120 | # Layer 0 consists of the inputs. 121 | # Note that the last element of self.b will always be set to 1, constituting the bias 122 | # for the activation potential of the net. 123 | self.b = [] 124 | for n in range(self.num_layers + 1): 125 | self.b.append(np.matrix(np.ones(nnl[n] + 1, dtype=FLOAT_TYPE))) 126 | x, y = self.b[0].shape 127 | self.input_shape = (x, y - 1) 128 | 129 | 130 | def forward(self, bi): 131 | """Given a vector bi of nnl[0] values, computes the forward-propagation of the network, 132 | returning a vector bo consisting of nnl[-1] values, each between 0 and 1. 133 | This function also sets internally all the activations a and outputs b.""" 134 | bii = np.matrix(bi) 135 | assert bii.shape == self.input_shape, "expected shape: %r actual shape: %r" % (self.input_shape, bii.shape) 136 | self.b[0][0, 0:self.nnl[0]] = bii 137 | # Propagates from layer n to layer n + 1. 138 | for n in range(self.num_layers): 139 | a = self.b[n] * self.w[n] 140 | self.b[n + 1][0, 0:self.nnl[n + 1]] = sigmoid(a) 141 | # The copy statement is necessary, since otherwise the caller could modify the b's 142 | # from the output, and potentially sabotage backpropagation.
143 | return self.b[self.num_layers][0, 0:self.nnl[self.num_layers]].copy() 144 | 145 | 146 | def backward(self, delta): 147 | """Implements backpropagation without updates. 148 | The input is a vector delta, of the same size of the 149 | outputs, giving \partial loss / \partial output. The output is a vector, containing 150 | \partial loss / \partial input for every input, allowing the model to be chained. 151 | NOTE: this function must be called only after the forward step!""" 152 | return self._backward_update(delta, None) 153 | 154 | 155 | def backward_momentum_NM(self, delta, speed=0.5, N=0.5, M=0.3): 156 | """Implements backpropagation. The input is a vector delta, of the same size of the 157 | outputs, giving \partial loss / \partial output. The output is a vector, containing 158 | \partial loss / \partial input for every input, allowing the model to be chained. 159 | NOTE: this function must be called only after the forward step!""" 160 | # Defines the update function. 161 | def update_function(self, layer_idx, d, speed=speed, N=N, M=M): 162 | if self.c is None: 163 | self.c = [np.zeros((self.nnl[n] + 1, self.nnl[n + 1]), dtype=FLOAT_TYPE) 164 | for n in range(self.num_layers)] 165 | # Update. 166 | wd = np.transpose(self.b[layer_idx]) * d 167 | self.w[layer_idx] -= speed * N * wd + M * self.c[layer_idx] 168 | self.c[layer_idx] = wd * speed 169 | return self._backward_update(delta, update_function) 170 | 171 | 172 | def backward_momentum(self, delta, speed=0.1, momentum=0.8): 173 | """Implements backpropagation. The input is a vector delta, of the same size of the 174 | outputs, giving \partial loss / \partial output. The output is a vector, containing 175 | \partial loss / \partial input for every input, allowing the model to be chained. 176 | NOTE: this function must be called only after the forward step!""" 177 | def update_function(self, layer_idx, d, speed=speed, momentum=momentum): 178 | if self.old_dw is None: 179 | self.old_dw = [np.zeros((self.nnl[n] + 1, self.nnl[n + 1]), dtype=FLOAT_TYPE) 180 | for n in range(self.num_layers)] 181 | g = np.transpose(self.b[layer_idx]) * d 182 | dw = speed * g + momentum * self.old_dw[layer_idx] 183 | self.w[layer_idx] -= dw 184 | self.old_dw[layer_idx] = dw 185 | return self._backward_update(delta, update_function) 186 | 187 | 188 | def backward_adadelta(self, delta, learning_factor=1.0, epsilon = 0.1, decay=0.999): 189 | """This performs an adadelta update, see http://arxiv.org/abs/1212.5701 , 190 | where learning_factor indicates how much we should learn from this particular example.""" 191 | def update_function(self, layer_idx, d, epsilon=epsilon): 192 | if self.tot_sq_gradient is None: 193 | self.tot_sq_gradient = [0.0 for n in range(self.num_layers)] 194 | self.tot_sq_delta = [0.0 for n in range(self.num_layers)] 195 | self.tot_delta_weight = [0.0 for n in range(self.num_layers)] 196 | self.tot_gradient_weight = [0.0 for n in range(self.num_layers)] 197 | # Computes the gradient. 198 | g = np.transpose(self.b[layer_idx]) * d 199 | # Updates the gradient average. 200 | self.tot_sq_gradient[layer_idx] = self.tot_sq_gradient[layer_idx] * decay + np.sum(np.square(g)) 201 | self.tot_gradient_weight[layer_idx] = self.tot_gradient_weight[layer_idx] * decay + g.size 202 | # Computes the speed. 
203 | rms_delta = np.sqrt((self.tot_sq_delta[layer_idx] + epsilon) / 204 | (self.tot_delta_weight[layer_idx] + epsilon)) 205 | rms_gradient = np.sqrt((self.tot_sq_gradient[layer_idx] + epsilon) / 206 | (self.tot_gradient_weight[layer_idx] + epsilon)) 207 | s = rms_delta / rms_gradient 208 | # Performs the update. 209 | dx = s * g 210 | self.w[layer_idx] -= dx * learning_factor 211 | # Updates the delta average. 212 | self.tot_sq_delta[layer_idx] = self.tot_sq_delta[layer_idx] * decay + np.sum(np.square(dx)) 213 | self.tot_delta_weight[layer_idx] = self.tot_delta_weight[layer_idx] * decay + dx.size 214 | return self._backward_update(delta, update_function) 215 | 216 | 217 | def _backward_update(self, delta, update_function): 218 | """Implements the backpropagation core. The input is a vector delta, of the same size as the 219 | outputs, giving \partial loss / \partial output. The output is a vector, containing 220 | \partial loss / \partial input for every input, allowing the model to be chained. 221 | Weights are updated only if update_function is not None; 222 | the function update_function is used to carry out the specific update. 223 | NOTE: this function must be called only after the forward step!""" 224 | # First, computes the derivatives wrt a[n], the activation layer. 225 | m = self.nnl[self.num_layers] # True number of outputs 226 | d = np.matrix(np.multiply(delta, 227 | np.multiply(self.b[self.num_layers][0, 0:m], 1.0 - self.b[self.num_layers][0, 0:m]))) 228 | # Then, iteratively for n going from the last layer to the first one: 229 | # - We update the weights leading from n to n + 1 230 | # - We compute d for the layer n. 231 | for n in range(self.num_layers - 1, -1, -1): 232 | m = self.nnl[n] # Number of true outputs at this level. 233 | # We do the weight update first, as it is very slightly faster. 234 | if update_function is not None: 235 | # Weight update. change is \partial loss / \partial weight. 236 | # We use the full b, as change refers also to the activation potentials. 237 | update_function(self, n, d) 238 | # Computing d for the previous layer. dd is \partial loss / \partial b 239 | # This should not include the activation potentials. 240 | dd = d * np.transpose(self.w[n][0:m, :]) 241 | if n > 0: 242 | # Not the first layer. We compute d as \partial loss / \partial a 243 | d = np.multiply(dd, dsigmoid_out(self.b[n][0, 0:m])) 244 | else: 245 | # At the input layer, we just output dd, since the inputs are equivalent 246 | # to b, and what we want is \partial loss / \partial input 247 | d = dd 248 | return d 249 | 250 | 251 | class TestNet(unittest.TestCase): 252 | 253 | def test_backward(self): 254 | myrandom = np.random.RandomState(seed=0) 255 | net = DNN(debug=True) 256 | net.initialize([4, 2, 5, 3], 0) 257 | vi = myrandom.uniform(0.0, 1.0, 4) 258 | vo = net.forward(vi) 259 | # print "Output:", vo 260 | delta = myrandom.uniform(-1.0, 1.0, 3) 261 | d = net.backward_momentum_NM(delta) 262 | # print d 263 | 264 | def test_derivative(self): 265 | myrandom = np.random.RandomState(seed=0) 266 | net = DNN(debug=True) 267 | nnl = [3, 4, 2, 1] 268 | net.initialize(nnl, 0) 269 | # Backpropagates a dloss / dy of 1. 270 | bi = np.matrix(myrandom.uniform(0.0, 1.0, size=nnl[0])) 271 | y0 = net.forward(bi) 272 | # print "y0:", y0 273 | dd = net.backward(1.0) 274 | # Initializes the true derivatives to 0, as a placeholder. 275 | # Computes the true derivatives.
276 | epsilon = 0.001 277 | idx = 0 278 | bi[0, idx] += epsilon 279 | y1 = net.forward(bi) 280 | # print "y1:", y1 281 | # print "diff:", y1 - y0 282 | true_deriv = (y1 - y0) / epsilon 283 | # print "true deriv: ", true_deriv # [0, 0] 284 | # print "computed: ", dd[0, idx] 285 | self.assertAlmostEqual(true_deriv[0, 0], dd[0, idx], 4) 286 | 287 | def test_update_NM(self): 288 | myrandom = np.random.RandomState(seed=0) 289 | net = DNN(debug=False) 290 | nnl = [4, 2, 3, 1] 291 | net.initialize(nnl, 0) 292 | # Backpropagates a dloss / dy of 1. 293 | bi = np.matrix(myrandom.uniform(0.0, 1.0, size=nnl[0])) 294 | y = [] 295 | for i in range(10): 296 | y.append(net.forward(bi)) 297 | net.backward_momentum_NM(1.0) 298 | if i > 0: 299 | self.assertLess(y[i], y[i - 1]) 300 | # print "These must decrease (NM):" 301 | # print y 302 | 303 | def test_update_momentum(self): 304 | myrandom = np.random.RandomState(seed=0) 305 | net = DNN(debug=False) 306 | nnl = [4, 2, 3, 1] 307 | net.initialize(nnl, 0) 308 | # Backpropagates a dloss / dy of 1. 309 | bi = np.matrix(myrandom.uniform(0.0, 1.0, size=nnl[0])) 310 | y = [] 311 | for i in range(10): 312 | y.append(net.forward(bi)) 313 | net.backward_momentum(1.0) 314 | if i > 0: 315 | self.assertLess(y[i], y[i - 1]) 316 | # print "These must decrease (momentum):" 317 | # print y 318 | 319 | def test_update_adadelta(self): 320 | myrandom = np.random.RandomState(seed=0) 321 | net = DNN(debug=False) 322 | nnl = [4, 2, 3, 1] 323 | net.initialize(nnl, 0) 324 | # Backpropagates a dloss / dy of 1. 325 | bi = np.matrix(myrandom.uniform(0.0, 1.0, size=nnl[0])) 326 | y = [] 327 | for i in range(10): 328 | y.append(net.forward(bi)) 329 | net.backward_adadelta(1.0) 330 | if i > 0: 331 | self.assertLess(y[i], y[i - 1]) 332 | # print "These must decrease (adadelta):" 333 | # print y 334 | 335 | class TestInit(unittest.TestCase): 336 | def test_start(self): 337 | net = DNN(debug=True) 338 | nnl = [4, 40, 1] 339 | net.initialize(nnl, 1) 340 | for i in range(5): 341 | fv = np.random.uniform(size=4) 342 | y = net.forward(fv) 343 | # print fv, y 344 | 345 | class TestLearn(unittest.TestCase): 346 | 347 | def test_xor_momentum(self): 348 | import random 349 | N = 4000 350 | pats = [ 351 | (np.array([0, 0]), 0), 352 | (np.array([0, 1]), 1), 353 | (np.array([1, 0]), 1), 354 | (np.array([1, 1]), 0), 355 | ] 356 | for k in range(20): 357 | net = DNN() 358 | net.initialize([2, 16, 1]) 359 | e = np.zeros((N)) 360 | for i in range(N): 361 | x, tgt = random.choice(pats) 362 | y = net.forward(x) 363 | dy = 2.0 * (y - tgt) 364 | e[i] = np.sum((y - tgt) ** 2) 365 | net.backward_momentum(dy) 366 | # print i, ":", e[i] 367 | avg_e = np.average(e[N/2:]) 368 | print "MMT Avg error:", avg_e 369 | self.assertLess(avg_e, 0.01) 370 | 371 | def test_xor_adadelta(self): 372 | import random 373 | N = 4000 374 | pats = [ 375 | (np.array([0, 0]), 0), 376 | (np.array([0, 1]), 1), 377 | (np.array([1, 0]), 1), 378 | (np.array([1, 1]), 0), 379 | ] 380 | for k in range(20): 381 | net = DNN() 382 | net.initialize([2, 16, 1]) 383 | e = np.zeros((N)) 384 | for i in range(N): 385 | x, tgt = random.choice(pats) 386 | y = net.forward(x) 387 | dy = 2.0 * (y - tgt) 388 | e[i] = np.sum((y - tgt) ** 2) 389 | net.backward_adadelta(dy) 390 | # print i, ":", e[i] 391 | avg_e = np.average(e[N/2:]) 392 | print "ADA Avg error:", avg_e 393 | self.assertLess(avg_e, 0.01) 394 | 395 | if __name__ == '__main__': 396 | unittest.main() 397 | 398 | -------------------------------------------------------------------------------- 
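The DNN above is driven by calling forward to obtain the outputs, computing the derivative of a loss with respect to those outputs, and passing that derivative to one of the backward_* methods, exactly as the unit tests do. A minimal usage sketch follows; the layer sizes, the target value, and the squared-error loss are illustrative assumptions, not part of dnn.py.

    # Minimal usage sketch (illustrative values): one input/target pair,
    # squared-error loss, ADADELTA updates.
    import numpy as np
    from dnn import DNN

    net = DNN()
    net.initialize([4, 8, 1], seed=0)     # 4 inputs, 8 hidden neurons, 1 output
    x = np.random.uniform(0.0, 1.0, 4)    # inputs are expected to lie in [0, 1]
    target = 0.25                         # illustrative target, also in [0, 1]
    for _ in range(200):
        y = net.forward(x)                # output matrix of shape (1, 1), values in (0, 1)
        dloss_dy = 2.0 * (y - target)     # derivative of (y - target)**2 w.r.t. y
        net.backward_adadelta(dloss_dy)   # backpropagates and updates the weights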
/lstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is based on 3 | https://gist.github.com/karpathy/587454dc0146a6ae21fc 4 | by Andrej Karpathy. 5 | Luca de Alfaro modified the code to make the 6 | LSTM into objects that can be serialized, and added the learning 7 | methods based on gradient descent with momentum, and adadelta. 8 | Vassilis Polychronopoulos and Rakshit Agrawal then added methods 9 | that facilitate the use of the code for multi-level LSTMs. 10 | """ 11 | 12 | import numpy as np 13 | from json_plus import Serializable 14 | import unittest 15 | 16 | class LSTM(Serializable): 17 | """Class implementing an LSTM.""" 18 | 19 | def __init__(self): 20 | """We need an empty initializer, to be compatible with the Serializable 21 | interface.""" 22 | pass 23 | 24 | def initialize(self, input_size, hidden_size, fancy_forget_bias_init=3): 25 | """ 26 | Initialize parameters of the LSTM (both weights and biases in one matrix). 27 | One might want to have a positive fancy_forget_bias_init number (e.g. maybe even up to 5, in some papers). 28 | In the matrix there are inputs for: 29 | - 1 (bias) 30 | - Input 31 | - Hidden 32 | In the other dimension, there are four outputs, for: 33 | - Input to cell 34 | - Forget 35 | - Output 36 | - Gate 37 | """ 38 | # +1 for the biases, which will be the first row of self.WLSTM 39 | self.input_size = input_size 40 | self.hidden_size = hidden_size 41 | self.WLSTM = np.random.randn(input_size + hidden_size + 1, 4 * hidden_size) / np.sqrt(input_size + hidden_size) 42 | # self.WLSTM[0, :] = 0 # initialize biases to zero 43 | if fancy_forget_bias_init != 0: 44 | # forget gates get a slightly positive bias initially, to encourage them to stay on (i.e., to remember) 45 | # remember that due to the Xavier initialization above, the raw output activations from gates before 46 | # the nonlinearity are zero mean and on the order of standard deviation ~1 47 | self.WLSTM[0, hidden_size:2 * hidden_size] = fancy_forget_bias_init 48 | 49 | # Init parameters for the momentum update method. 50 | self.momentum_dW = None # Delta weights 51 | 52 | # Init parameters for the ADADELTA update method. 53 | self.tot_gradient_weight, self.tot_delta_weight = 0, 0 54 | self.tot_sq_gradient, self.tot_sq_delta = 0, 0 55 | 56 | 57 | def clone(self): 58 | replica = LSTM() 59 | replica.input_size = self.input_size 60 | replica.hidden_size = self.hidden_size 61 | replica.WLSTM = np.copy(self.WLSTM) 62 | replica.momentum_dW = np.copy(self.momentum_dW) 63 | replica.tot_gradient_weight = self.tot_gradient_weight 64 | replica.tot_delta_weight = self.tot_delta_weight 65 | replica.tot_sq_gradient = self.tot_sq_gradient 66 | replica.tot_sq_delta = self.tot_sq_delta 67 | return replica 68 | 69 | 70 | def _forward(self, X, c0=None, h0=None): 71 | """ 72 | Forward-propagates the input X. 73 | The output consists of the vector of all outputs for all time steps, 74 | the final value of the memories, the output of the LSTM at the final step, 75 | and a cache that contains the whole useful state of the LSTM, so that it can be 76 | used later for back-propagation. 77 | The cache value is also stored in the LSTM.
78 | X should be of shape (n,b,input_size), where n = length of sequence, b = batch size 79 | """ 80 | n, b, isz = X.shape 81 | d = self.hidden_size 82 | if c0 is None: c0 = np.zeros((b, d)) 83 | if h0 is None: h0 = np.zeros((b, d)) 84 | assert(isz == self.input_size) 85 | 86 | # Perform the LSTM forward pass with X as the input 87 | m = self.WLSTM.shape[0] # size of x plus h plus bias 88 | Hin = np.zeros((n, b, m)) # input [1, xt, ht-1] to each tick of the LSTM 89 | Hout = np.zeros((n, b, d)) # hidden representation of the LSTM (gated cell content) 90 | IFOG = np.zeros((n, b, d * 4)) # input, forget, output, gate (IFOG) 91 | IFOGf = np.zeros((n, b, d * 4)) # after nonlinearity 92 | C = np.zeros((n, b, d)) # cell content 93 | Ct = np.zeros((n, b, d)) # tanh of cell content 94 | for t in xrange(n): 95 | # concat [x,h] as input to the LSTM 96 | prevh = Hout[t - 1] if t > 0 else h0 # previous cell output. 97 | # assembles cell input. 98 | Hin[t, :, 0] = 1 # bias 99 | Hin[t, :, 1:self.input_size + 1] = X[t] 100 | Hin[t, :, self.input_size + 1:] = prevh 101 | # print "Hin[%d]:\n" % t, Hin[t, :, :], '\n-------------------------------\n' 102 | # compute all gate activations. dots: (most work is this line) 103 | IFOG[t] = Hin[t].dot(self.WLSTM) 104 | # non-linearities 105 | IFOGf[t, :, :3 * d] = 1.0 / (1.0 + np.exp(-IFOG[t, :, :3 * d])) # sigmoids; these are the gates 106 | IFOGf[t, :, 3 * d:] = np.tanh(IFOG[t, :, 3 * d:]) # tanh 107 | # compute the cell activation 108 | prevc = C[t - 1] if t > 0 else c0 109 | # input * gate + forget * previous_cell; (2) 110 | C[t] = IFOGf[t, :, :d] * IFOGf[t, :, 3 * d:] + IFOGf[t, :, d:2 * d] * prevc 111 | Ct[t] = np.tanh(C[t]) # nonlinearity 112 | Hout[t] = IFOGf[t, :, 2 * d:3 * d] * Ct[t] # output * cell (1) 113 | 114 | cache = {} 115 | cache['Hout'] = Hout 116 | cache['IFOGf'] = IFOGf 117 | cache['IFOG'] = IFOG 118 | cache['C'] = C 119 | cache['Ct'] = Ct 120 | cache['Hin'] = Hin 121 | cache['c0'] = c0 122 | cache['h0'] = h0 123 | cache['n'] = n 124 | cache['b'] = b 125 | 126 | # We remember the cached values, so we don't need to plug them back in each time. 127 | self.cache = cache 128 | 129 | # return C[t], as well so we can continue LSTM with prev state init if needed 130 | return Hout, C[t], Hout[t], cache 131 | 132 | 133 | def clean_before_serialization(self): 134 | self.cache.clear() 135 | 136 | 137 | def _backward(self, dHout_in, cache=None, dcn=None, dhn=None): 138 | """Backward propagation through the LSTM. 
dHout_in must have the same shape as Hout.""" 139 | if cache is None: 140 | cache = self.cache 141 | Hout = cache['Hout'] 142 | IFOGf = cache['IFOGf'] 143 | IFOG = cache['IFOG'] 144 | C = cache['C'] 145 | Ct = cache['Ct'] 146 | Hin = cache['Hin'] 147 | c0 = cache['c0'] 148 | h0 = cache['h0'] 149 | n = cache['n'] 150 | b = cache['b'] 151 | d = self.hidden_size 152 | 153 | # backprop the LSTM 154 | dIFOG = np.zeros(IFOG.shape) 155 | dIFOGf = np.zeros(IFOGf.shape) 156 | dWLSTM = np.zeros(self.WLSTM.shape) 157 | dHin = np.zeros(Hin.shape) 158 | dC = np.zeros(C.shape) 159 | dX = np.zeros((n, b, self.input_size)) 160 | dh0 = np.zeros((b, d)) 161 | dc0 = np.zeros((b, d)) 162 | dHout = dHout_in.copy() # make a copy so we don't have any funny side effects 163 | if dcn is not None: dC[n - 1] += dcn.copy() # carry over gradients from later 164 | if dhn is not None: dHout[n - 1] += dhn.copy() 165 | for t in reversed(xrange(n)): 166 | 167 | tanhCt = Ct[t] 168 | # backpropagation through (1) for output 169 | dIFOGf[t, :, 2 * d:3 * d] = tanhCt * dHout[t] 170 | # backprop tanh non-linearity first then continue backprop. 171 | # this is the backprop of output on cells, (1) cont. 172 | dC[t] += (1 - tanhCt ** 2) * (IFOGf[t, :, 2 * d:3 * d] * dHout[t]) 173 | 174 | if t > 0: 175 | dIFOGf[t, :, d:2 * d] = C[t - 1] * dC[t] # delta forget (through (2)) 176 | dC[t - 1] += IFOGf[t, :, d:2 * d] * dC[t] 177 | else: 178 | dIFOGf[t, :, d:2 * d] = c0 * dC[t] 179 | dc0 = IFOGf[t, :, d:2 * d] * dC[t] 180 | # this completes backpropagation to the cell memory. 181 | 182 | # this is the gate * input portion, effects on gate and input. 183 | dIFOGf[t, :, :d] = IFOGf[t, :, 3 * d:] * dC[t] # backprop of input through gate, part of (2) 184 | dIFOGf[t, :, 3 * d:] = IFOGf[t, :, :d] * dC[t] # backprop of gate through input, part of (2) 185 | 186 | # backprop activation functions 187 | dIFOG[t, :, 3 * d:] = (1 - IFOGf[t, :, 3 * d:] ** 2) * dIFOGf[t, :, 3 * d:] 188 | y = IFOGf[t, :, :3 * d] 189 | dIFOG[t, :, :3 * d] = (y * (1.0 - y)) * dIFOGf[t, :, :3 * d] 190 | 191 | # backprop matrix multiply 192 | dWLSTM += np.dot(Hin[t].transpose(), dIFOG[t]) 193 | dHin[t] = dIFOG[t].dot(self.WLSTM.transpose()) 194 | 195 | # backprop the identity transforms into Hin 196 | dX[t] = dHin[t, :, 1:self.input_size + 1] 197 | if t > 0: 198 | dHout[t - 1, :] += dHin[t, :, self.input_size + 1:] 199 | else: 200 | dh0 += dHin[t, :, self.input_size + 1:] 201 | 202 | return dX, dWLSTM, dc0, dh0 203 | 204 | 205 | def forward(self, X): 206 | """Forward function. Can be called to predict outputs, and as preparation to backpropagation. 207 | X should be of shape (n, b, input_size), where n = length of sequence, b = batch size. 208 | If b = 1, one can also give dimensions (n, input_size) to X. 209 | """ 210 | XX = X if X.ndim == 3 else X.reshape((X.shape[0], 1, X.shape[1])) 211 | _, _, o, _ = self._forward(XX) 212 | return o if X.ndim == 3 else o.flatten() 213 | 214 | 215 | def _adapt_input_derivative(self, d): 216 | """In an LSTM, we often have feedback only on the last result, only once 217 | all the sequence has been read. This function takes d as given, and 218 | produces the internal representation that is needed. The rule is as follows: 219 | - If d has dimension 3, then it is assumed that it comes already in the 220 | correct format. 221 | - If d has dimension 2, then it is assumed that there are batches, and 222 | that the data includes only the latest temporal step. The previous 223 | temporal steps are filled in with zeros, as appropriate. 
224 | - If d has dimension 1, then it is assumed that no batches are present, 225 | and that d refers only to the last temporal step. 226 | The other temporal steps are filled with zeros as required, and an 227 | appropriate array is returned. 228 | """ 229 | if d.ndim == 3: 230 | return d 231 | elif d.ndim == 1: 232 | n = self.cache['n'] # N. of temporal steps 233 | assert(self.cache['b'] == 1) 234 | assert(d.size == self.hidden_size) 235 | dd = np.vstack((np.zeros((n - 1, self.hidden_size)), d)) 236 | return dd.reshape(n, 1, self.hidden_size) 237 | elif d.ndim == 2: 238 | n = self.cache['n'] # N. of temporal steps 239 | batch_size, hidden_size = d.shape 240 | assert(batch_size == self.cache['b']) 241 | assert(hidden_size == self.hidden_size) 242 | other_times = np.zeros((n - 1, batch_size, hidden_size)) 243 | return np.vstack((other_times, d.reshape(1, d.shape[0], d.shape[1]))) 244 | 245 | 246 | def backward(self, d): 247 | """Backward function without learning. Input is de loss / de output.""" 248 | dd = self._adapt_input_derivative(d) 249 | _, _, _, dh0 = self._backward(dd) 250 | return dh0 251 | 252 | """ no update, and cache can be passed as parameter""" 253 | def backward_return_vector_no_update(self, d, cache): 254 | """Backward function without learning. Input is de loss / de output.""" 255 | self.cache = cache 256 | dd = self._adapt_input_derivative(d) 257 | dX, g, dc0, dh0 = self._backward(dd, cache = cache) 258 | return dX, g, dc0, dh0 259 | 260 | 261 | def backward_momentum(self, d, speed=0.1, momentum=0.8): 262 | """Implements backpropagation with momentum.""" 263 | dd = self._adapt_input_derivative(d) 264 | _, g, _, dh0 = self._backward(dd) 265 | if self.momentum_dW is None: 266 | self.momentum_dW = np.zeros(self.WLSTM.shape) 267 | dW = - speed * g + momentum * self.momentum_dW 268 | self.momentum_dW = dW 269 | self.WLSTM += dW 270 | return dh0 271 | 272 | 273 | def backward_momentum_vector(self, d, speed=0.0001, momentum=0.0008): 274 | """Implements backpropagation with momentum.""" 275 | dd = self._adapt_input_derivative(d) 276 | dX, g, dc0, dh0 = self._backward(dd) 277 | if self.momentum_dW is None: 278 | self.momentum_dW = np.zeros(self.WLSTM.shape) 279 | dW = - speed * g + momentum * self.momentum_dW 280 | self.momentum_dW = dW 281 | self.WLSTM += dW 282 | return dX, g, dc0, dh0 283 | 284 | 285 | def backward_adadelta(self, d, learning_factor=1.0, epsilon=0.001, decay=0.95): 286 | """Implements backpropagation with the ADADELTA method, see 287 | http://arxiv.org/abs/1212.5701 288 | learning_factor indicates how much we should learn from this particular example.""" 289 | dd = self._adapt_input_derivative(d) 290 | _, g, _, dh0 = self._backward(dd) 291 | # Updates the gradient average. 292 | self.tot_sq_gradient = self.tot_sq_gradient * decay + np.sum(np.square(g)) 293 | self.tot_gradient_weight = self.tot_gradient_weight * decay + 1.0 294 | # Computes the speed. 295 | rms_delta = np.sqrt((self.tot_sq_delta + epsilon) / (self.tot_delta_weight + epsilon)) 296 | rms_gradient = np.sqrt((self.tot_sq_gradient + epsilon) / (self.tot_gradient_weight + epsilon)) 297 | s = rms_delta / rms_gradient 298 | # Computes the delta. 299 | delta = s * g 300 | # Updates the delta average. 301 | self.tot_sq_delta = self.tot_sq_delta * decay + np.sum(np.square(delta)) 302 | self.tot_delta_weight = self.tot_delta_weight * decay + 1.0 303 | # Finally, updates the weights. 
304 | self.WLSTM -= delta * learning_factor 305 | return dh0 306 | 307 | def backward_adadelta_vector(self, d, learning_factor = 1.0, epsilon = 0.0001, decay = 0.95): 308 | """Implements backpropagation with the ADADELTA method, see 309 | http://arxiv.org/abs/1212.5701 310 | learning_factor indicates how much we should learn from this particular example.""" 311 | dd = self._adapt_input_derivative(d) 312 | dX, g, dc0, dh0 = self._backward(dd) 313 | # Updates the gradient average. 314 | self.tot_sq_gradient = self.tot_sq_gradient * decay + np.sum(np.square(g)) 315 | self.tot_gradient_weight = self.tot_gradient_weight * decay + 1.0 316 | # Computes the speed. 317 | rms_delta = np.sqrt((self.tot_sq_delta + epsilon) / (self.tot_delta_weight + epsilon)) 318 | rms_gradient = np.sqrt((self.tot_sq_gradient + epsilon) / (self.tot_gradient_weight + epsilon)) 319 | s = rms_delta / rms_gradient 320 | # Computes the delta. 321 | delta = s * g 322 | # Updates the delta average. 323 | self.tot_sq_delta = self.tot_sq_delta * decay + np.sum(np.square(delta)) 324 | self.tot_delta_weight = self.tot_delta_weight * decay + 1.0 325 | # Finally, updates the weights. 326 | self.WLSTM -= delta * learning_factor 327 | return dX, g, dc0, dh0 328 | 329 | def backward_adadelta_vector_no_update(self, d, learning_factor=1.0, epsilon=0.0001, decay=0.95): 330 | """Implements backpropagation with the ADADELTA method, see 331 | http://arxiv.org/abs/1212.5701 332 | learning_factor indicates how much we should learn from this particular example.""" 333 | dd = self._adapt_input_derivative(d) 334 | dX, g, dc0, dh0 = self._backward(dd) 335 | # Updates the gradient average. 336 | self.tot_sq_gradient = self.tot_sq_gradient * decay + np.sum(np.square(g)) 337 | self.tot_gradient_weight = self.tot_gradient_weight * decay + 1.0 338 | # Computes the speed. 339 | rms_delta = np.sqrt((self.tot_sq_delta + epsilon) / (self.tot_delta_weight + epsilon)) 340 | rms_gradient = np.sqrt((self.tot_sq_gradient + epsilon) / (self.tot_gradient_weight + epsilon)) 341 | s = rms_delta / rms_gradient 342 | # Computes the delta. 343 | delta = s * g 344 | # Updates the delta average. 345 | self.tot_sq_delta = self.tot_sq_delta * decay + np.sum(np.square(delta)) 346 | self.tot_delta_weight = self.tot_delta_weight * decay + 1.0 347 | # Finally, updates the weights. 348 | #self.WLSTM -= delta * learning_factor 349 | return dX, -delta * learning_factor 350 | 351 | 352 | # ------------------- 353 | # TEST CASES 354 | # ------------------- 355 | 356 | class BasicTests(unittest.TestCase): 357 | 358 | @unittest.skip("later") 359 | def test_checkSequentialMatchesBatch(self): 360 | """ check LSTM I/O forward/backward interactions """ 361 | 362 | n, b, d = (5, 3, 4) # sequence length, batch size, hidden size 363 | input_size = 10 364 | WLSTM = LSTM() 365 | WLSTM.initialize(input_size, d) # input size, hidden size 366 | X = np.random.randn(n, b, input_size) 367 | h0 = np.random.randn(b, d) 368 | c0 = np.random.randn(b, d) 369 | 370 | # sequential forward 371 | cprev = c0 372 | hprev = h0 373 | caches = [{} for t in xrange(n)] 374 | Hcat = np.zeros((n, b, d)) 375 | for t in xrange(n): 376 | xt = X[t:t + 1] 377 | _, cprev, hprev, cache = WLSTM._forward(xt, cprev, hprev) 378 | caches[t] = cache 379 | Hcat[t] = hprev 380 | 381 | # sanity check: perform batch forward to check that we get the same thing 382 | H, _, _, batch_cache = WLSTM._forward(X, c0, h0) 383 | assert np.allclose(H, Hcat), 'Sequential and Batch forward don''t match!' 
384 | 385 | # eval loss 386 | wrand = np.random.randn(*Hcat.shape) 387 | loss = np.sum(Hcat * wrand) 388 | dH = wrand 389 | 390 | # get the batched version gradients 391 | BdX, BdWLSTM, Bdc0, Bdh0 = WLSTM._backward(dH, batch_cache) 392 | 393 | # now perform sequential backward 394 | dX = np.zeros_like(X) 395 | dWLSTM = np.zeros_like(WLSTM.WLSTM) 396 | dc0 = np.zeros_like(c0) 397 | dh0 = np.zeros_like(h0) 398 | dcnext = None 399 | dhnext = None 400 | for t in reversed(xrange(n)): 401 | dht = dH[t].reshape(1, b, d) 402 | dx, dWLSTMt, dcprev, dhprev = WLSTM._backward(dht, caches[t], dcnext, dhnext) 403 | dhnext = dhprev 404 | dcnext = dcprev 405 | 406 | dWLSTM += dWLSTMt # accumulate LSTM gradient 407 | dX[t] = dx[0] 408 | if t == 0: 409 | dc0 = dcprev 410 | dh0 = dhprev 411 | 412 | # and make sure the gradients match 413 | # print 'Making sure batched version agrees with sequential version: (should all be True)' 414 | self.assertTrue(np.allclose(BdX, dX)) 415 | self.assertTrue(np.allclose(BdWLSTM, dWLSTM)) 416 | self.assertTrue(np.allclose(Bdc0, dc0)) 417 | self.assertTrue(np.allclose(Bdh0, dh0)) 418 | 419 | 420 | @unittest.skip("later") 421 | def test_checkBatchGradient(self): 422 | """ check that the batch gradient is correct """ 423 | 424 | # lets gradient check this beast 425 | n, b, d = (5, 3, 4) # sequence length, batch size, hidden size 426 | input_size = 10 427 | WLSTM = LSTM() 428 | WLSTM.initialize(input_size, d) # input size, hidden size 429 | X = np.random.randn(n, b, input_size) 430 | h0 = np.random.randn(b, d) 431 | c0 = np.random.randn(b, d) 432 | 433 | # batch forward backward 434 | H, Ct, Ht, cache = WLSTM._forward(X, c0, h0) 435 | wrand = np.random.randn(*H.shape) 436 | loss = np.sum(H * wrand) # weighted sum is a nice hash to use I think 437 | dH = wrand 438 | dX, dWLSTM, dc0, dh0 = WLSTM._backward(dH, cache) 439 | 440 | def fwd(): 441 | h, _, _, _ = WLSTM._forward(X, c0, h0) 442 | return np.sum(h * wrand) 443 | 444 | # now gradient check all 445 | delta = 1e-5 446 | rel_error_thr_warning = 1e-2 447 | rel_error_thr_error = 1 448 | tocheck = [X, c0, h0] 449 | grads_analytic = [dX, dc0, dh0] 450 | names = ['X', 'c0', 'h0'] 451 | for j in xrange(len(tocheck)): 452 | mat = tocheck[j] 453 | dmat = grads_analytic[j] 454 | name = names[j] 455 | # gradcheck 456 | for i in xrange(mat.size): 457 | old_val = mat.flat[i] 458 | mat.flat[i] = old_val + delta 459 | loss0 = fwd() 460 | mat.flat[i] = old_val - delta 461 | loss1 = fwd() 462 | mat.flat[i] = old_val 463 | 464 | grad_analytic = dmat.flat[i] 465 | grad_numerical = (loss0 - loss1) / (2 * delta) 466 | 467 | if grad_numerical == 0 and grad_analytic == 0: 468 | rel_error = 0 # both are zero, OK. 469 | status = 'OK' 470 | elif abs(grad_numerical) < 1e-7 and abs(grad_analytic) < 1e-7: 471 | rel_error = 0 # not enough precision to check this 472 | status = 'VAL SMALL WARNING' 473 | else: 474 | rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic) 475 | status = 'OK' 476 | if rel_error > rel_error_thr_warning: status = 'WARNING' 477 | if rel_error > rel_error_thr_error: status = '!!!!! 
NOTOK' 478 | self.assertEqual(status, 'OK') 479 | 480 | # print stats 481 | # print '%s checking param %s index %s (val = %+8f), analytic = %+8f, numerical = %+8f, relative error = %+8f' \ 482 | # % (status, name, `np.unravel_index(i, mat.shape)`, old_val, grad_analytic, grad_numerical, rel_error) 483 | 484 | 485 | class TestLearning(unittest.TestCase): 486 | 487 | unittest.skip("later") 488 | def make_sequence3d(self, length, p): 489 | """Makes a random sequence of 0 and 1 as a 3d input""" 490 | return (np.random.random((length,1,1))= num_report: 573 | print "After %d iterations, avg tgt = %f, avg err = %f" % (i + 1, tot_tgt / num_err, tot_err / num_err) 574 | num_err = 0 575 | tot_err = 0.0 576 | tot_tgt = 0.0 577 | if num_err > 0: 578 | print "After %d iterations, avg tgt = %f, avg err = %f" % (i + 1, tot_tgt / num_err, tot_err / num_err) 579 | 580 | def test_has_one(self): 581 | print "At least one 1:" 582 | self._run_total_test(6, [1, 2, 3, 4, 5], 4, 1000) 583 | 584 | def test_has_only_one(self): 585 | print "Exactly one 1:" 586 | self._run_total_test(6, [1], 4, 10000) 587 | 588 | def test_has_one_or_three(self): 589 | print "Either one, or three 1's:" 590 | self._run_total_test(6, [1, 3], 6, 10000) 591 | 592 | 593 | 594 | 595 | if __name__ == "__main__": 596 | unittest.main() 597 | --------------------------------------------------------------------------------
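The LSTM class is used in the same forward / backward style as the DNN in dnn.py: forward consumes a whole sequence and returns the output of the last time step, and the backward_* methods take the derivative of the loss with respect to that output. A minimal usage sketch follows; the sizes, the random sequence, and the squared-error loss are illustrative assumptions, not part of lstm.py.

    import numpy as np
    from lstm import LSTM

    net = LSTM()
    net.initialize(input_size=3, hidden_size=4)
    X = np.random.randn(6, 3)             # sequence of length 6, batch size 1
    h = net.forward(X)                    # output of the last time step, shape (4,)
    target = np.zeros(4)                  # illustrative target for the final output
    dloss_dh = 2.0 * (h - target)         # derivative of the squared error w.r.t. h
    net.backward_adadelta(dloss_dh)       # ADADELTA update of WLSTM; returns d loss / d h0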