├── KNN MapReduce
    ├── input_points.txt
    ├── example_dataset.txt
    ├── example_output_classification.txt
    ├── example_output_regression.txt
    └── knn.py
├── Linear Regression MapReduce
    ├── example_output
    ├── .DS_Store
    ├── example_dataset.txt
    └── LinearRegressionTS.py
├── README.md
├── RidgeRegression
    ├── .DS_Store
    └── RidgeRegressionMapReduce.py
├── Gaussian Mixture Model MapReduce
    ├── gmm_test_local_test_iteration_0
    │   └── part-00000
    ├── gmm_test_local_test_iteration_1
    │   └── part-00000
    ├── gmm_test_local_test_iteration_2
    │   └── part-00000
    ├── IterationGaussianMixtureMR.py
    ├── gmm_test_data.txt
    ├── gmm.py
    └── InitialiseGaussianMixtures.py
├── Random Sample MapReduce
    └── SimpleRandomSampleNoReplacementMR.py
├── Gaussian Discriminant Analysis MapReduce
    ├── gda_wrapper.py
    └── gda.py
└── Multivariate Descriptive Statistics
    └── MultivariateDescriptiveStatistics.py


/KNN MapReduce/input_points.txt:
--------------------------------------------------------------------------------
1 | 1.56,2.46
2 | 0.98,1.34
3 | 6.08,7.54
4 | 8.42,9.23
5 | 


--------------------------------------------------------------------------------
/Linear Regression MapReduce/example_output:
--------------------------------------------------------------------------------
1 | [3.139604536372969, 0.69858828816164764]
2 | 


--------------------------------------------------------------------------------
/KNN MapReduce/example_dataset.txt:
--------------------------------------------------------------------------------
1 | 0,1,2,1
2 | 1,0,1,1
3 | 2,1,1,1
4 | 3,10,9,0
5 | 4,8,7,0
6 | 5,6,9,0
7 | 


--------------------------------------------------------------------------------
/KNN MapReduce/example_output_classification.txt:
--------------------------------------------------------------------------------
1 | 0.98,1.34,1
2 | 6.08,7.54,0
3 | 8.42,9.23,0
4 | 1.56,2.46,1
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MapReduce-Machine-Learning
2 | 
3 | Map-Reduce implementation of some machine learning algorithms
4 | 


--------------------------------------------------------------------------------
/KNN MapReduce/example_output_regression.txt:
--------------------------------------------------------------------------------
1 | 0.98,1.34,1.0
2 | 6.08,7.54,0.0
3 | 8.42,9.23,0.0
4 | 1.56,2.46,1.0
5 | 


--------------------------------------------------------------------------------
/RidgeRegression/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AmazaspShumik/MapReduce-Machine-Learning/HEAD/RidgeRegression/.DS_Store


--------------------------------------------------------------------------------
/Linear Regression MapReduce/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AmazaspShumik/MapReduce-Machine-Learning/HEAD/Linear Regression MapReduce/.DS_Store


--------------------------------------------------------------------------------
/Gaussian Mixture Model MapReduce/gmm_test_local_test_iteration_0/part-00000:
--------------------------------------------------------------------------------
1 | {"mixing": [0.48, 0.52], "mu": [[10.096098155296746, 9.7736326940978326], [0.86293812494265676, 0.76820108485655736]], "covariance": [[[1.4235652864798376, -0.17050578277420905], [-0.17050578277420905, 0.9520414860040205]], [[1.1309668152757282, -0.26122286099829789], [-0.26122286099829789, 0.64619841763880226]]]}
2 | 


--------------------------------------------------------------------------------
/Gaussian Mixture Model MapReduce/gmm_test_local_test_iteration_1/part-00000:
--------------------------------------------------------------------------------
1 | {"mixing": [0.5, 0.5], "mu": [[10.008817713066399, 9.7957780166309867], [1.0466993295879303, 0.8918511713799977]], "covariance": [[[1.1365934421562542, -0.075964802401628617], [-0.075964802401628617, 0.78716761230624854]], [[1.1201170855228824, -0.031611048687119281], [-0.031611048687119281, 0.86979878980839098]]]}
2 | 


--------------------------------------------------------------------------------
/Gaussian Mixture Model MapReduce/gmm_test_local_test_iteration_2/part-00000:
--------------------------------------------------------------------------------
1 | {"mixing": [0.5, 0.5], "mu": [[10.008817713066399, 9.7957780166309867], [1.0466993295879303, 0.8918511713799977]], "covariance": [[[1.1365934421562542, -0.075964802401628617], [-0.075964802401628617, 0.78716761230624854]], [[1.1201170855228824, -0.031611048687119281], [-0.031611048687119281, 0.86979878980839098]]]}
2 | 


--------------------------------------------------------------------------------
/Random Sample MapReduce/SimpleRandomSampleNoReplacementMR.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Thu May  7 16:01:20 2015
 4 | 
 5 | @author: amazaspshaumyan
 6 | """
 7 | 
 8 | from mrjob.job import MRJob
 9 | from mrjob.step import MRStep
10 | from mrjob.protocol import RawValueProtocol, JSONProtocol
11 | import random
12 | import heapq
13 | 
14 | 
class SimpleRandomSampleNoReplacementMR(MRJob):
    ''' Simple Random Sampling without replacement for relatively small sample
    sizes.
    Do not use for large sample sizes that can not fit in memory (current code
    uses only one reducer)

    Each line in input data is assigned random priority then n lines with largest
    corresponding priorities are selected (where n is size of random sample)

    Command Line Options:
    ---------------------

    --sample-size  - (int) number of lines in the sample, must be at least 1

    '''

    INPUT_PROTOCOL = RawValueProtocol

    INTERNAL_PROTOCOL = JSONProtocol

    OUTPUT_PROTOCOL = RawValueProtocol

    def __init__(self, *args, **kwargs):
        super(SimpleRandomSampleNoReplacementMR, self).__init__(*args, **kwargs)
        # min-heap of (priority, line) pairs kept by this mapper
        self.pq = []

    def configure_options(self):
        ''' Registers the --sample-size option '''
        super(SimpleRandomSampleNoReplacementMR, self).configure_options()
        self.add_passthrough_option("--sample-size",
                                    type=int,
                                    help="number of elements in sample")

    def load_options(self, args):
        ''' Loads options and checks that a usable sample size was given '''
        super(SimpleRandomSampleNoReplacementMR, self).load_options(args)
        if self.options.sample_size is None:
            self.option_parser.error("You need to specify sample size")
        elif self.options.sample_size < 1:
            # a non-positive size would make the heap lookups below fail
            self.option_parser.error("Sample size must be a positive integer")
        else:
            self.n = self.options.sample_size

    def mapper_rs(self, _, line):
        '''
        Assigns a random priority to the line and keeps it only if it is
        among the n largest priorities seen so far by this mapper.
        '''
        # continuous priorities: ties between lines are practically impossible,
        # unlike a small integer range where collisions favour earlier lines
        r = random.random()
        entry = (r, line)
        if len(self.pq) < self.n:
            heapq.heappush(self.pq, entry)
        elif self.pq[0][0] < r:
            # new line beats the weakest retained line, so replace it
            heapq.heappushpop(self.pq, entry)

    def mapper_rs_final(self):
        ''' Sends all lines retained by this mapper to the single reducer '''
        yield 1, self.pq

    def reducer_rs(self, key, samples):
        '''
        Merges candidates of all mappers and outputs the n lines with the
        largest priorities (fewer if input has less than n lines).
        '''
        candidates = (element for sample in samples for element in sample)
        largest = heapq.nlargest(self.n, candidates, key=lambda e: e[0])
        for _, line in largest:
            yield None, line

    def steps(self):
        ''' Defines the single map-reduce step '''
        return [MRStep(mapper=self.mapper_rs,
                       mapper_final=self.mapper_rs_final,
                       reducer=self.reducer_rs)]
78 |                        
# run the job only when the file is executed as a script
if __name__ == "__main__":
    SimpleRandomSampleNoReplacementMR.run()
81 |             
82 |         
83 |             
84 |         
85 |         
86 |         
87 |         
88 |         
89 |                                     


--------------------------------------------------------------------------------
/Linear Regression MapReduce/example_dataset.txt:
--------------------------------------------------------------------------------
  1 | 2.21808980306,1.0,0.0958385695435
  2 | 3.36402189406,1.0,0.260144497068
  3 | 4.15992743817,1.0,0.350816858335
  4 | 3.78196031881,1.0,0.784498848077
  5 | 3.42989512572,1.0,0.756516398407
  6 | 3.15950491457,1.0,0.951706599864
  7 | 3.25515042186,1.0,0.189581130574
  8 | 2.85369444908,1.0,0.370011550607
  9 | 3.50858378154,1.0,0.0590663027425
 10 | 3.90430478422,1.0,0.215343806169
 11 | 3.45693232136,1.0,0.449606621905
 12 | 2.75392984381,1.0,0.467775521684
 13 | 2.96059817472,1.0,0.39167030594
 14 | 3.32239943091,1.0,0.391057313689
 15 | 4.33496722789,1.0,0.252178510274
 16 | 2.91235980568,1.0,0.0674111956377
 17 | 3.71830204197,1.0,0.0616356010853
 18 | 3.2526960081,1.0,0.846743579917
 19 | 3.69205534024,1.0,0.122784969768
 20 | 3.07017383938,1.0,0.0266326664286
 21 | 3.69061507324,1.0,0.385036208768
 22 | 3.54567590508,1.0,0.870806590937
 23 | 4.41083559002,1.0,0.338407273459
 24 | 3.60068346044,1.0,0.276550896573
 25 | 3.3326031757,1.0,0.570194488677
 26 | 3.91016851652,1.0,0.327916765825
 27 | 3.84847082807,1.0,0.499414512464
 28 | 3.18634918861,1.0,0.468041957164
 29 | 3.29221765542,1.0,0.513708729344
 30 | 3.02148685747,1.0,0.807197008949
 31 | 3.12681717646,1.0,0.0932085022848
 32 | 3.57983546648,1.0,0.467758273876
 33 | 3.23900869201,1.0,0.427403343319
 34 | 4.57834602865,1.0,0.824081796391
 35 | 2.37644200344,1.0,0.0730487124565
 36 | 3.31498725761,1.0,0.400585779084
 37 | 3.21553086907,1.0,0.402743818081
 38 | 3.85959702777,1.0,0.0585452915492
 39 | 3.44615734683,1.0,0.20314102397
 40 | 3.48628658623,1.0,0.237082680794
 41 | 3.05829561767,1.0,0.0517994300085
 42 | 2.92315187738,1.0,0.164315225282
 43 | 2.32904030685,1.0,0.268142893579
 44 | 2.9552309982,1.0,0.351917716187
 45 | 4.40407128841,1.0,0.924092466762
 46 | 2.8225258913,1.0,0.0736433270037
 47 | 3.96948927524,1.0,0.659915874842
 48 | 3.56243467005,1.0,0.40964570709
 49 | 3.47474682329,1.0,0.770583077191
 50 | 3.56644300528,1.0,0.169839956493
 51 | 2.91731170254,1.0,0.0146508450881
 52 | 3.01295568085,1.0,0.545071147873
 53 | 3.46360556376,1.0,0.185368560599
 54 | 2.88143489161,1.0,0.0415632021348
 55 | 4.0412414966,1.0,0.78353303003
 56 | 3.07130972376,1.0,0.0600912187556
 57 | 3.78054503771,1.0,0.373856196533
 58 | 2.67311016754,1.0,0.717171494383
 59 | 3.75106376672,1.0,0.00508139023391
 60 | 3.94727073914,1.0,0.325172372195
 61 | 3.29085843373,1.0,0.656444434018
 62 | 3.32255132263,1.0,0.114928090252
 63 | 3.12394866603,1.0,0.84060643069
 64 | 2.53739670801,1.0,0.692190065838
 65 | 3.75322060603,1.0,0.191343723907
 66 | 4.1477197632,1.0,0.933759376925
 67 | 3.13989205948,1.0,0.429303430318
 68 | 3.2851647969,1.0,0.841234079435
 69 | 3.20774504619,1.0,0.245968569897
 70 | 3.51446893396,1.0,0.207929643678
 71 | 2.67983489149,1.0,0.0380580305849
 72 | 3.62731156643,1.0,0.389920308388
 73 | 4.44198388793,1.0,0.462793931372
 74 | 3.46638243416,1.0,0.458118777839
 75 | 3.48977121311,1.0,0.716891061751
 76 | 3.66471522624,1.0,0.27638286331
 77 | 2.91767319777,1.0,0.135421973123
 78 | 2.84514397006,1.0,0.503745974055
 79 | 4.57023086158,1.0,0.606443402484
 80 | 3.07915184373,1.0,0.237213452278
 81 | 4.6912100323,1.0,0.894024333638
 82 | 3.4989481316,1.0,0.00549703197849
 83 | 3.76166465388,1.0,0.265318734242
 84 | 3.94945854274,1.0,0.323770161366
 85 | 5.21789943675,1.0,0.68209456231
 86 | 2.99116512508,1.0,0.0202862280747
 87 | 2.91418758566,1.0,0.518492194298
 88 | 3.96622033204,1.0,0.799844072411
 89 | 3.68633462338,1.0,0.683774764756
 90 | 2.63705120301,1.0,0.0769351367498
 91 | 3.18626145777,1.0,0.0882592101268
 92 | 3.43380657408,1.0,0.803298114512
 93 | 4.11905748802,1.0,0.488393678274
 94 | 3.25768073747,1.0,0.146494823121
 95 | 3.59930371649,1.0,0.7262645386
 96 | 3.32525332426,1.0,0.0119205330777
 97 | 2.36401703415,1.0,0.0482396643746
 98 | 2.72247212355,1.0,0.0651489918596
 99 | 2.64138521372,1.0,0.502341826693
100 | 2.9975020738,1.0,0.100776706488
101 | 


--------------------------------------------------------------------------------
/Gaussian Discriminant Analysis MapReduce/gda_wrapper.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import GDA as gda
  4 | import numpy as np
  5 | from boto.s3.connection import S3Connection
  6 | import json
  7 | import os
  8 | 
  9 | # use if you did not set up this parameters in configuration file
 10 | EMR_DEFAULT_PARAMS = ["--ec2-core-instance-bid-price", "0.4", 
 11 |                       "--ec2-core-instance-type" ,"m1.small",
 12 |                       "--num-ec2-core-instances", "1", 
 13 |                       "--ec2-task-instance-bid-price", "0.4", 
 14 |                       "--ec2-task-instance-type", "m1.small", 
 15 |                       "--num-ec2-task-instances","1"]
 16 | 
 17 | # access and secret key
 18 | ACCESS_KEY = "YOUR_ACCESS_KEY"
 19 | SECRET_KEY = "YOUR_SECRET_KEY"
 20 | 
 21 | 
 22 | 
 23 | class GaussianDiscriminantAnalysis(object):
 24 |     '''
 25 |     Performs Gaussian Discriminant Analysis for classification. Two approaches 
 26 |     are available QDA (each class has its own covariance matrix) or LDA (
 27 |     covariance matrix is shared).
 28 |     
 29 |     '''
 30 |     
 31 |     def __init__(self,targets,dimensions, input_path, output_path, 
 32 |                  emr_local = "local", emr_defaults = True):
 33 |         self.targets = targets
 34 |         self.dimensions = dimensions
 35 |         self.input_path = input_path
 36 |         self.output_path = output_path
 37 |         self.emr_local = emr_local
 38 |         self.emr_defaults = emr_defaults
 39 |         self.params = {}
 40 |         
 41 |     def configure(self):
 42 |         '''
 43 |         Sets configuration parameters to run map reduce job for finding
 44 |         parameters of Discriminant Analysis
 45 |         '''
 46 |         configs = ["--feature-dimensions",str(self.dim),
 47 |                    "--targets", json.loads(self.targets),
 48 |                    "-r", self.emr_local,
 49 |                    "--output-dir",self.output_path,
 50 |                    "--no-output",self.input_path]
 51 |         configs_new = []
 52 |         if self.emr_defaults is True:
 53 |             configs_new.extend(EMR_DEFAULT_PARAMS)
 54 |         configs_new.extend(configs)
 55 |         # start job
 56 |         mrJobGDA = gda.GaussianDiscriminantAnalysisMR(configs_new)
 57 |         with mrJobGDA.make_runner() as runner:
 58 |             runner.run()
 59 | 
 60 |     def load_params(self):
 61 |         if self.emr_local == "local":
 62 |             self.params =  self.local_load_params(self.output_path)
 63 |         else:
 64 |             self.params =  self.s3_load_params(self.output_path)
 65 | 
 66 |                                      
 67 |     def s3_load_params(self,s3_path):
 68 |         ''' load parameters if they are on amazon s3'''
 69 |         path = s3_path.strip("s3://").split("/")
 70 |         mybucket = self.conn.get_bucket(path[0]) # connect to s3 bucket
 71 |         s3_file_keys = [f for f in mybucket.list(prefix = "/".join(path[1:]))]
 72 |         for s3key in s3_file_keys:
 73 |             if mybucket.lookup(s3key).size > 0:
 74 |                 data = s3key.get_contents_as_string()
 75 |                 params = json.loads(data)
 76 |                 return params
 77 |                 
 78 |     def local_load_params(self,local_path):
 79 |         ''' load paramters if they are on local machine'''
 80 |         current_dir = os.getcwd()
 81 |         os.chdir(local_path)
 82 |         for filename in os.listdir(os.getcwd()):
 83 |             if "part-" in filename:
 84 |                 if os.path.getsize(filename) > 0:
 85 |                     with open(filename,"r") as in_file:
 86 |                         data = json.load(in_file)
 87 |                         os.chdir(current_dir)
 88 |                         return data
 89 |                         
 90 |     def posterior_probs(self, method = ):
 91 |         ''' get class probability
 92 |         
 93 | 
 94 |            method - (str) can have two values either 'QDA' or 'LDA'         
 95 |         '''
 96 |         
 97 |         
 98 |         
 99 |                  
100 |         


--------------------------------------------------------------------------------
/Gaussian Discriminant Analysis MapReduce/gda.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | 
  4 | from mrjob.job import MRJob
  5 | from mrjob.step import MRStep
  6 | from mrjob.protocol import RawValueProtocol,JSONProtocol, JSONValueProtocol
  7 | import numpy as np
  8 | import json
  9 | 
 10 | ################### Helper function & classes #################################
 11 | 
 12 | 
 13 | def extract_features(line):
 14 |     ''' Extracts data from line of input '''
 15 |     data = line.strip.split(",")
 16 |     return data[1], data[2:]
 17 |     
 18 | def matrix_to_list(input_data):
 19 |     return [list(e) for e in input_data]
 20 |     
 21 | class DimensionalityMismatchError(Exception):
 22 |     ''' Error when dimensionalities do not match '''
 23 |     def __init__(self,expected,real):
 24 |         self.exp = expected
 25 |         self.real = real
 26 |         
 27 |     def __str__(self):
 28 |         error = "Expected number of dimensions: "+str(self.exp)+" observed: "+ str(self.real)
 29 |         return error
 30 |         
 31 |         
 32 | class TargetValueError(Exception):
 33 |     ''' Error for target values '''
 34 |     def __init__(self,observed):
 35 |         self.observed = observed
 36 |     
 37 |     def __str__(self):
 38 |         error = "Observed value "+str(self.e) + " is not target value"
 39 |         return error
 40 |         
 41 |         
 42 | ####################### MapReduce Job  ########################################
 43 | 
 44 | 
 45 | class GaussianDiscriminantAnalysisMR(MRJob):
 46 |     '''
 47 |     Calculates parameters required for Linear Discriminant Analysis and 
 48 |     Quadratic Discrminant Analysis. 
 49 |     
 50 |     
 51 |     Command Line Options:
 52 |     ---------------------
 53 |     
 54 |     --feature-dimensions  - dimensionality of features (dependent variables)
 55 |     --targets             - list of all valid target values (json-encoded list)
 56 |     '''
 57 |     
 58 |     INPUT_PROTOCOL = RawValueProtocol
 59 |     
 60 |     INTERNAL_PROTOCOL = JSONProtocol
 61 |     
 62 |     OUTPUT_PROTCOL = JSONValueProtocol
 63 |     
 64 |     
 65 |     def __init__(self,*args,**kwargs):
 66 |         super(GaussianDiscriminantAnalysisMR,self).__init__(*args,**kwargs)
 67 |         self.k = len(self.targets)
 68 |         self.priors = [0]*self.k
 69 |         self.means = [np.zeros(self.dim) for i in range(self.k)]
 70 |         self.covariate = [np.zeros([self.dim,self.dim]) for i in range(self.k)]
 71 |         self.total = 0
 72 |         self.targets = json.loads(self.targest)
 73 |         self.target_set = set(self.targets)
 74 |         self.target_to_index = {}
 75 |         for i,target in enumerate(self.targets):
 76 |             self.target_to_index[target] = i
 77 |             
 78 |         
 79 |     def configure_options(self):
 80 |         super(GaussianDiscriminantAnalysisMR,self).configure_options()
 81 |         self.add_passthrough_option("--feature-dimensions", 
 82 |                                       type = int,
 83 |                                       help = "dimensionality of features")
 84 |         self.add_passthrough_option("--targets",
 85 |                                     type = str,
 86 |                                     help = "targets")
 87 | 
 88 |                                       
 89 |     def load_options(self,args):
 90 |         super(GaussianDiscriminantAnalysisMR,self).load_options(args)
 91 |         if self.options.feature_dimension is None:
 92 |             self.option_parser.error("You must specify dimensionality of data")
 93 |         else:
 94 |             self.dim = self.options.feature_dimension
 95 |         if self.options.targets is None:
 96 |             self.option_parser.error("You must specify targets")
 97 |         else:
 98 |             self.targets = self.options.targets
 99 |             
100 |     
101 |     def mapper_gda(self,_,line):
102 |         '''
103 |         Calculates and summarise intermediate values for each mapper.
104 |         (Intermediate values include number of observations in each class,
105 |         total number of observations etc. )
106 |         '''
107 |         y,features = extract_features(line)
108 |         n = len(features)
109 |         x = np.array(features)
110 |         index = self.target_to_index[y]
111 |         # error if dimensionalities do not match
112 |         if len(features) != self.dim:           
113 |             raise DimensionalityMismatchError(self.dim,n)
114 |         # targets are not in set of targets defined
115 |         if y not in self.target_set:
116 |             raise TargetValueError(y)
117 |         self.total+=1
118 |         self.means[index] += x
119 |         self.covariate[index] += np.outer(x,x)
120 |         self.priors[index] += 1
121 |         
122 |         
123 |     def mapper_final_gda(self):
124 |         '''Outputs data summarised for each mapper to reducer'''
125 |         yield 1,{ "total": self.total,
126 |                   "class counts": self.priors,
127 |                   "means": matrix_to_list(self.means),
128 |                   "covariates": [matrix_to_list(e) for e in self.covariate]}
129 |         
130 |         
131 |     def reducer_gda_parameters(self,key, parameters):
132 |         ''' Summarises intermediate values produced by each mapper to get final parameters '''
133 |         all_parameters = {}
134 |         # sum two lists (each list has length = number of classes)
135 |         vec_sum = lambda x,y: [x[i]+y[i] for i in range(self.k)]
136 |         # sum two list of lists
137 |         list_of_vec_sum = lambda x,y: [vec_sum(x[i],y[i]) for i in range(self.k)]
138 |         list_of_matrix_sum = lambda x,y: [list_of_vec_sum(x[i],y[i]) for i in range(self.k)]
139 |         # summarise parameters produced by each mapper
140 |         for parameter in parameters:
141 |             if len(all_parameters)==0:
142 |                 all_parameters = parameters
143 |             else:
144 |                 all_parameters["total"]+=parameters["total"]
145 |                 all_parameters["class counts"] = vec_sum(parameter["class counts"],all_parameters["class counts"])
146 |                 all_parameters["means"] = list_of_vec_sum(parameter["means"],all_parameters["means"])
147 |                 all_parameters["covariates"] = list_of_matrix_sum(parameter["covariates"],all_parameters["covariates"])
148 |         # calculate final parameters
149 |         for i in range(self.k):
150 |             all_parameters["means"][i] = float(all_parameters["means"][i])/all_parameters["class counts"][i]
151 |             mu = np.array(all_parameters["means"][i])
152 |             all_parameters["covariates"][i] = np.array(all_parameters["covariates"][i]) - all_parameters["class counts"][i]*np.outer(mu,mu)
153 |             all_parameters["covariates"][i] = matrix_to_list(all_parameters["covariates"][i])
154 |         yield None, all_parameters
155 |             
156 |             
157 |     def steps(self):
158 |         return [MRStep(mapper = self.mapper_gda,
159 |                        mapper_final = self.mapper_final_gda,
160 |                        reducer = self.reducer_lda_parameters)]
161 |                        
162 |                        
# run the job only when the file is executed as a script
if __name__ == "__main__":
    GaussianDiscriminantAnalysisMR.run()
165 |                        
166 |         
167 |         


--------------------------------------------------------------------------------
/Linear Regression MapReduce/LinearRegressionTS.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | 
  4 | from mrjob.job import MRJob
  5 | from mrjob.protocol import JSONProtocol, RawValueProtocol
  6 | from mrjob.step import MRStep
  7 | import numpy as np
  8 | 
  9 | 
 10 | ######################## Helper Methods and Classes  ##########################
 11 | 
 12 | 
 13 | def cholesky_solution_linear_regression(x_t_x,x_t_y):
 14 |     '''
 15 |     Finds parameters of regression through Cholesky decomposition,
 16 |     given sample covariance of explanatory variables and covariance 
 17 |     between explanatory variable and dependent variable.
 18 |     
 19 |     Paramaters:
 20 |     -----------
 21 |     x_t_x    - numpy array of size 'm x m', represents sample covariance of explanatory variables
 22 |     x_t_y    - numpy array of size 'm x 1', represent covariance between expalanatory and dependent variable
 23 |     
 24 |     Output:
 25 |     -------
 26 |     Theta   - list of size m, represents values of coefficients 
 27 |     
 28 |     '''
 29 |     # L*L.T*Theta = x_t_y
 30 |     L = np.linalg.cholesky(x_t_x)
 31 |     #  solve L*z = x_t_y
 32 |     z = np.linalg.solve(L,x_t_y)
 33 |     #  solve L.T*Theta = z
 34 |     theta = np.linalg.solve(np.transpose(L),z)
 35 |     return theta
 36 |     
 37 |    
 38 | 
 39 | class DimensionMismatchError(Exception):
 40 | 
 41 |     def __init__(self,expected,observed):
 42 |         self.exp = expected
 43 |         self.obs = observed
 44 |         
 45 |     def __str__(self):
 46 |         err = "Expected number of dimensions: "+str(self.exp)+", observed: "+str(self.obs)
 47 |         return err
 48 |     
 49 | 
 50 | ############################## Map Reduce Job #################################
 51 | 
 52 | 
 53 | class LinearRegressionTS(MRJob):
 54 |     '''
 55 |     Calculates sample covariance matix of explanatory variables (x_t_x) and 
 56 |     vector of covariances between dependent variable expanatory variables (x_t_y)
 57 |     in single map reduce pass and then uses cholesky decomposition to
 58 |     obtain values of regression parameters.
 59 |     
 60 |     
 61 |     Important!!! Since final computations are performed on single reducer, 
 62 |     assumption is that dimensionality of data is relatively small i.e. input 
 63 |     matrix is tall and skinny.
 64 |     
 65 |     
 66 |     Input File:
 67 |     -----------
 68 |           
 69 |           Extract relevant features from input line by changing extract_variables
 70 |           method. You can add features for non-linear models (like x^2 or exp(x)).
 71 |           Current code assumes following input line format:
 72 |           
 73 |           input line = <dependent variable>, <feature_1>,...,<feature_n>
 74 |           
 75 |     Options:
 76 |     -----------
 77 |     
 78 |           -- dimension  - (int)  number of explanatory variables
 79 |           -- bias       - (bool) if True regression wil include bias term
 80 |     
 81 |     Output:
 82 |     -----------
 83 |           json-encoded list of parameters
 84 |     '''
 85 |     
 86 |     INPUT_PROTOCOL = RawValueProtocol
 87 |     
 88 |     INTERNAL_PROTOCOL = JSONProtocol
 89 |     
 90 |     OUTPUT_PROTOCOL = RawValueProtocol
 91 |        
 92 |     
 93 |     def __init__(self,*args, **kwargs):
 94 |         super(LinearRegressionTS, self).__init__(*args, **kwargs)
 95 |         n = self.options.dimension
 96 |         self.x_t_x  = np.zeros([n,n])
 97 |         self.x_t_y  = np.zeros(n)
 98 |         self.counts = 0
 99 |         
100 |     #--------------------------- feature extraction --------------------------#
101 |         
102 |     def extract_variables(self,line):
103 |         ''' (str)--(float,[float,float,float...])
104 |         Extracts set of relevant features. (Needs to be rewriten depending
105 |         on file input structure) 
106 |         '''
107 |         data = [float(e) for e in line.strip().split(",")]
108 |         y,features = data[0],data[1:]
109 |         return (y,features)
110 |         
111 |         
112 |     #---------------------------- Options ------------------------------------#
113 |         
114 |     def configure_options(self):
115 |         ''' Additional options'''
116 |         super(LinearRegressionTS,self).configure_options()
117 |         self.add_passthrough_option("--dimension", 
118 |                                     type = int,
119 |                                     help = "Number of explanatory variables (do not count bias term)")
120 |         self.add_passthrough_option("--bias", 
121 |                                     type = str, # (got error when tried to define bool) ???
122 |                                     help = "Bias term, bias not included if anything other than 'True' ",
123 |                                     default = "True")
124 |                                     
125 |     def load_options(self,args):
126 |         ''' Loads and checks whether options are provided'''
127 |         super(LinearRegressionTS,self).load_options(args)
128 |         if self.options.dimension is None:
129 |             self.option_parser.error("You should define number of explanatory variables")
130 |         else:
131 |             self.dim = self.options.dimension
132 |             
133 |             
134 |     #------------------------ map-reduce steps -------------------------------#
135 |             
136 |             
137 |     def mapper_lr(self,_,line):
138 |         '''
139 |         Calculates x_t_x and x_t_y for data processed by each mapper
140 |         '''
141 |         y,features = self.extract_variables(line)
142 |         if len(features) != self.dim:
143 |             raise DimensionMismatchError(self.dim,len(features))
144 |         if self.options.bias is "True":
145 |             features.append(1.0)
146 |         x = np.array(features)
147 |         self.x_t_x  += np.outer(x, x)
148 |         self.x_t_y  += y*x
149 |         self.counts += 1
150 |         
151 |     def mapper_lr_final(self):
152 |         '''
153 |         Transforms numpy arrays x_t_x and x_t_y into json-encodable list format
154 |         and sends to reducer
155 |         '''
156 |         yield 1,("x_t_x",  [list(row) for row in self.x_t_x])
157 |         yield 1,("x_t_y",  [xy for xy in self.x_t_y])
158 |         yield 1,("counts", self.counts)
159 |         
160 |     def reducer_lr(self,key,values):
161 |         '''
162 |         Aggregates results produced by each mapper and obtains x_t_x and x_t_y
163 |         for all data, then using cholesky decomposition obtains parameters of 
164 |         linear regression.
165 |         '''
166 |         n = self.dim
167 |         observations = 0
168 |         x_t_x = np.zeros([n,n]); x_t_y = np.zeros(n) 
169 |         for val in values:
170 |             if val[0]=="x_t_x":
171 |                 x_t_x           +=   np.array(val[1])
172 |             elif val[0]=="x_t_y":
173 |                 x_t_y           +=   np.array(val[1])
174 |             elif val[0]=="counts":
175 |                 observations    +=   val[1]
176 |         betas = cholesky_solution_linear_regression(x_t_x,x_t_y)
177 |         yield None,[e for e in betas]
178 |             
179 |     def steps(self):
180 |         '''Defines map-reduce steps '''
181 |         return [MRStep(mapper       = self.mapper_lr,
182 |                        mapper_final = self.mapper_lr_final,
183 |                        reducer      = self.reducer_lr)]
184 |                        
# start the job only when the file is executed as a script (not on import)
if __name__=="__main__":
    LinearRegressionTS.run()
187 |         
188 | 
189 |         


--------------------------------------------------------------------------------
/Multivariate Descriptive Statistics/MultivariateDescriptiveStatistics.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | from mrjob.job import MRJob
  4 | from mrjob.step import MRStep
  5 | from mrjob.protocol import RawValueProtocol, JSONProtocol, JSONValueProtocol
  6 | import numpy as np
  7 | 
  8 | 
  9 | ########################  Helper functions & classes ##########################
 10 | 
 11 | class DimensionalityMismatch(Exception):
 12 |     
 13 |     def __init__(self,expected,real):
 14 |         self.exp = expected
 15 |         self.real = real
 16 |         
 17 |     def __str__(self):
 18 |         error = "Dimensionality mismatch. "+"Expected: "+str(self.exp)+" real: "+ str(self.real)
 19 |         return error
 20 |         
 21 |         
 22 | def extract_relevant_features(l):
 23 |     '''
 24 |     Extracts quantitative features for which summary statistics should be calculated
 25 |     '''
 26 |     data = l.strip().split(",")
 27 |     return [float(e) for e in data[1:5]]
 28 |         
 29 | def kurtosis(p4,covariance,n):
 30 |     '''
 31 |     Calcultes unbiased kurtosis (see Joanes and Gill (1998)).
 32 |     
 33 |     
 34 |     Input:
 35 |     ------
 36 |     
 37 |     p4         - list of size m, where each entry is sum of fourth order feature.
 38 |     covariance - two-dimensional list of size m x m, which is outer
 39 |                  product of input matrix with itself
 40 |     n          - number of observations
 41 |     
 42 |     Output:
 43 |     -------
 44 |                - (float) kurtosis
 45 |                
 46 |     [where m is dimensionality of data]
 47 |     '''
 48 |     kurtosis_standard = [ (kurt/n)/((n-1)*covariance[i,i]/n)**2 -3 for i,kurt in enumerate(p4)]
 49 |     kurtosis_unbiased = [ (kurt*(n+1)+6)*(n-1)/(n-2)/(n-3) for kurt in kurtosis_standard]
 50 |     return kurtosis_unbiased
 51 | 
 52 | def skewed(p3,covariance,n):
 53 |     '''    
 54 |     Calcultes skeweness
 55 | 
 56 |     Input:
 57 |     ------
 58 |     
 59 |     p3         - list of size m, where each entry is sum of cubes of each feature.
 60 |     covariance - two-dimensional list of size m x m, which is outer
 61 |                  product of input matrix with itself
 62 |     n          - number of observations
 63 |     
 64 |     Output:
 65 |     -------
 66 |                - (float) kurtosis
 67 |     
 68 |     [where m is dimensionality of data]
 69 |     '''
 70 |     return [np.sqrt(n*(n-1))/(n-2)*((skew/n)/(((n-1)*covariance[i,i]/n)**1.5)) for i,skew in enumerate(p3)]
 71 |     
 72 | ########################## MapReduce Job ######################################
 73 | 
 74 | class MultivariateDescriptiveStatisticsMR(MRJob):
 75 |     ''' 
 76 |     Calculates descriptive statistics for multivariate dataset.
 77 |     
 78 |     Following statistics are calculated:
 79 |     
 80 |        - Covariance Matrix 
 81 |        - Skewness of each variable (measure of assymetry)
 82 |        - Kurtosis of each variable (measure of peakedness)
 83 |        - Minimum for each variable
 84 |        - Maximum for each variable
 85 |        - Mean for each variable
 86 |           
 87 |     Note: accuracy of results were compared on test results with corresponding
 88 |     functions in R (min,max,mean,cov,skewness[library(e1071)], kurtosis[library(e1071)])
 89 |     '''
 90 |     
 91 |     INPUT_PROTOCOL = RawValueProtocol
 92 |     
 93 |     INTERNAL_PROTOCOL = JSONProtocol
 94 |     
 95 |     OUTPUT_PROTOCOL = JSONValueProtocol
 96 |     
 97 |     
 98 |     def __init__(self, *args, **kwargs):
 99 |         super(MultivariateDescriptiveStatisticsMR,self).__init__(*args, **kwargs)
100 |         d = self.dim
101 |         self.n = 0
102 |         self.max,self.min,self.mean = [0]*d,[0]*d,[0]*d
103 |         self.third_order, self.fourth_order = [0]*d, [0]*d
104 |         self.covariates = np.zeros([d,d], dtype = np.float64)
105 |         
106 |         
107 |     def configure_options(self):
108 |         super(MultivariateDescriptiveStatisticsMR,self).configure_options()
109 |         self.add_passthrough_option("--dimensions", type = int, 
110 |                                     help = "Number of columns of data matrix")
111 |                                       
112 |     def load_options(self,args):
113 |         super(MultivariateDescriptiveStatisticsMR,self).load_options(args)
114 |         if self.options.dimensions is None:
115 |             self.option_parser.error("You need specify expected dimensionlity")
116 |         else:
117 |             self.dim =  self.options.dimensions
118 | 
119 | 
120 |     def mapper_covar(self,_,line):
121 |         # extract features that you want to analyse
122 |         variables = MultivariateDescriptiveStatisticsMR.extract_relevant_features(line)
123 |         assert(len(variables)==self.dim), "input dimensionality mismatch"
124 |         self.n+=1
125 |         self.max = [max(m, var) for var in variables for m in self.max]
126 |         self.min = [min(m, var) for var in variables for m in self.min]
127 |         self.mean = [s+var for var in variables for s in self.mean]
128 |         self.third_order = [p+var**3 for var in variables for p in self.third_order]
129 |         self.fourth_order = [p+var**4 for var in variables for p in self.fourth_order]
130 |         self.covariates += np.outer(np.array(variables),np.array(variables))
131 |         
132 |         
133 |     def mapper_covar_final(self):
134 |         yield 1,("max", self.max)
135 |         yield 1,("min", self.min)
136 |         yield 1,("mean", self.mean)
137 |         yield 1,("observations", self.n)
138 |         yield 1,("third order", self.third_order)
139 |         yield 1,("fourth order", self.fourth_order)
140 |         yield 1,("covariates", [list(row) for row in self.covariates])
141 |         
142 |         
143 |     def reducer_summarise(self,key,values):
144 |         m = self.dim
145 |         p1,max_list,min_list = [0]*m,[0]*m,[0]*m
146 |         p3, p4 = [0]*m,[0]*m
147 |         covar_matr = np.zeros([m,m], dtype = np.float64)
148 |         n = 0
149 |         for val in values:
150 |             if val[0]=="max":
151 |                 max_list = [max(max_list[i],var) for i,var in enumerate(val[1])]
152 |             elif val[0]=="min":
153 |                 min_list = [min(min_list[i],var) for i,var in enumerate(val[1])]
154 |             elif val[0]=="mean":
155 |                 p1 = [p1[i]+var for i,var in enumerate(val[1])]
156 |             elif val[0]=="observations":
157 |                 n+=val[1]
158 |             elif val[0]=="third order":
159 |                 p3 = [p3[i]+cube for i,cube in enumerate(val[1])]
160 |             elif val[0]=="fourth order":
161 |                 p4 = [p4[i]+quad for i,quad in enumerate(val[1])]
162 |             else:
163 |                 covar_matr+=np.array(val[1])
164 |         # vector of means
165 |         means = [float(mu)/n for mu in p1]
166 |         # covariance matrix (biased but with lowest MSE)
167 |         covariance = (covar_matr - np.outer(np.array(means),np.array(means))*n)/(n-1)
168 |         # fourth moment: calculate sum((x_i-mean(x))^4) by decomposing it
169 |         p4 = [p4[i]-4*means[i]*p3[i]+6*(means[i]**2)*(covar_matr[i,i])-4*p1[i]*(means[i]**3)+n*means[i]**4 for i in range(m)]
170 |         # third moment: calculate sum((x_i-mean(x))^3) by decompsing it
171 |         p3 = [p3[i]-3*means[i]*covar_matr[i,i]+3*(means[i]**2)*p1[i] - n*means[i]**3 for i in range(m)]     
172 |         kurtosis_unbiased = kurtosis(p4,covariance,n)  # calculate kurtosis for each variable
173 |         skewness = skewed(p3,covariance,n)             # calculate skewness for each variable
174 |         matrix_to_list = lambda x: [list(e) for e in x]
175 |         covariance = matrix_to_list(covariance)
176 |         summary_statistics = {"mean":          means,
177 |                               "max":           max_list,
178 |                               "min":           min_list,
179 |                               "covariance":    covariance,
180 |                               "skewness":      skewness,
181 |                               "kurtosis":      kurtosis_unbiased,
182 |                               "observations":  n }
183 |         yield None, summary_statistics
184 |         
185 |         
186 |     def steps(self):
187 |         return [MRStep(mapper = self.mapper_covar,
188 |                        mapper_final = self.mapper_covar_final,
189 |                        reducer = self.reducer_summarise)]
190 |         
# start the job only when the file is executed as a script (not on import)
if __name__=="__main__":
    MultivariateDescriptiveStatisticsMR.run()


--------------------------------------------------------------------------------
/Gaussian Mixture Model MapReduce/IterationGaussianMixtureMR.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from mrjob.job import MRJob
  3 | from mrjob.protocol import JSONProtocol, RawValueProtocol, JSONValueProtocol
  4 | from mrjob.step import MRStep
  5 | import json
  6 | import numpy as np
  7 | 
  8 | 
  9 | def multivar_gauss_pdf(x, mu, cov):
 10 |     '''
 11 |     Caculates the multivariate normal density (pdf)
 12 |     
 13 |     Parameters:
 14 |     -----------
 15 |     
 16 |     x    - numpy array of a "d x 1" sample vector
 17 |     mu   - numpy array of a "d x 1" mean vector
 18 |     cov  - numpy array of a d x d" covariance matrix
 19 |     
 20 |     (where d - dimensionality of data)
 21 | 
 22 |     Output:
 23 |     -------
 24 |             - (float) probability of x given parameters of 
 25 |                      Gaussian Distribution
 26 |     '''
 27 |     part1 = 1 / ( ((2* np.pi)**(len(mu)/2)) * (np.linalg.det(cov)**(1/2)) )
 28 |     part2 = (-1/2) * np.dot(np.dot((x-mu).T,(np.linalg.inv(cov))),(x-mu))
 29 |     return float(part1 * np.exp(part2))
 30 |         
 31 | 
 32 | def responsibility(x,mu,cov,p,K):
 33 |     ''' 
 34 |     Calculates conditional probability of latent variable given
 35 |     observed data and parameters
 36 |     
 37 |     Parameters:
 38 |     -----------
 39 |     
 40 |     x     - numpy array of a "d x 1" sample vector
 41 |     mu    - list of length "K" of lists "d x 1" mean vector 
 42 |     cov   - list of length "K" numpy arrays each "d x d" covariance matrix
 43 |     p     - list of floats, each float prior probability of cluster
 44 |     K     - number of clusters (values of latent variables)
 45 |     
 46 |     (where d - dimensionality of data)
 47 |     
 48 |     Output:
 49 |           - list of floats, each element of list is responsibility corresponding 
 50 |             to x and relevant latent variable valiue
 51 |     '''
 52 |     resps = [p[k]*multivar_gauss_pdf(x,np.array(mu[k]),np.array(cov[k])) for k in range(K)]
 53 |     p_x = sum(resps)
 54 |     return [float(r_k)/p_x for r_k in resps]
 55 |     
 56 | 
 57 | def extract_features(line):
 58 |     ''' extracts features from line of input'''
 59 |     data = line.strip().split(",")
 60 |     return [ float(e) for e in data[1:] ]
 61 |     
 62 |     
 63 | def make_json_encodable(mixing, means, covar):
 64 |     '''
 65 |     Transforms 
 66 |     
 67 |     Parameters:
 68 |     -----------
 69 |     
 70 |     mixing   - list of size k
 71 |     means    - list of size k of numpy arrays (each numpy array has size d)
 72 |     covar    - list of size k of two dimensional numpy array (matrix of size dxd)
 73 |     
 74 |     (where d is dimensionality and k is number of clusters)
 75 | 
 76 |     Output:
 77 |     --------
 78 |              - dictionary with parameter names as keys 
 79 |              {"mu": list of mean vectors, "mixing": list of mixing coefficients,
 80 |               "covariance": list of covariance matrices}
 81 |     
 82 |     '''
 83 |     matrix_to_list = lambda x: [list(e) for e in x]
 84 |     mixing = mixing
 85 |     means = matrix_to_list(means)
 86 |     covariance = [matrix_to_list(e) for e in covar]
 87 |     return {"mixing":mixing,"mu":means,"covariance":covariance}
 88 | 
 89 | 
 90 |    
 91 | class IterationGaussianMixtureMR(MRJob):
 92 |     '''
 93 |     Runs single iteration of Expectation Maximization Algorithm for Gaussian
 94 |     Mixture Model.
 95 |     
 96 |     Mappers use parameters from previous iteration to calculate responsibilities
 97 |     and intermediate values that are then used by single reducer to calculate
 98 |     new parameters.
 99 |     
100 |     Command Line Options:
101 |     ---------------------
102 |     
103 |     --clusters             - number of clusters
104 |     --dimensions           - dimensionality of data
105 |     --parameters           - (str)json encoded dictionary of parameters
106 |     
107 |     '''
108 |     INPUT_PROTOCOL = RawValueProtocol
109 |     
110 |     INTERNAL_PROTOCOL = JSONProtocol
111 |     
112 |     OUTPUT_PROTOCOL = JSONValueProtocol
113 |         
114 | 
115 |     def __init__(self,*args,**kwargs):
116 |         super(IterationGaussianMixtureMR,self).__init__(*args,**kwargs)
117 |         # sum of responsibilities for each cluster & number of observations
118 |         self.resp_sum = [0]*self.clusters
119 |         self.N = 0
120 |         # sum of observations weighted by reponsibility 
121 |         self.resp_w_sum = [np.zeros(self.dim, dtype = np.float64) for i in range(self.clusters)]
122 |         # sum of x_n*x_n_t (outer products) weighted by reponsibility
123 |         self.resp_w_cov = [np.zeros([self.dim,self.dim], dtype = np.float64) for i in range(self.clusters)]   
124 |         
125 |         
126 |     def configure_options(self):
127 |         super(IterationGaussianMixtureMR,self).configure_options()
128 |         self.add_passthrough_option("--dimensions",
129 |                                     type = int,
130 |                                     help = "dimensionality of input data")
131 |         self.add_passthrough_option("--clusters",
132 |                                     type = int,
133 |                                     help = "number of clusters")
134 |         self.add_passthrough_option("--parameters",
135 |                              type = str,
136 |                              help = "file with parameters from previous iteration")
137 |     
138 |     
139 |     def load_options(self,args):
140 |         super(IterationGaussianMixtureMR,self).load_options(args)
141 |         # number of clusters
142 |         if self.options.clusters is None:
143 |             self.option_parser.error("You need to specify number of clusters")
144 |         else:
145 |             self.clusters = self.options.clusters
146 |         # data dimensionality
147 |         if self.options.dimensions is None:
148 |             self.option_parser.error("You need to specify dimensionality of data")
149 |         else:
150 |             self.dim = self.options.dimensions
151 |         # filename where parameters from previous iteration are saved
152 |         if self.options.parameters is None:
153 |             self.option_parser.error("You need to load file with distribution parameters")
154 |             
155 |     def mapper_gmm_init(self):
156 |         params = json.loads(self.options.parameters)
157 |         self.mu = params["mu"]
158 |         self.covar = params["covariance"]
159 |         self.mixing = params["mixing"]
160 |     
161 |     def mapper_gmm(self,_,line):
162 |         features = extract_features(line)
163 |         assert(len(features)==self.dim), "dimension mismatch"
164 |         x = np.array(features)
165 |         r_n = responsibility(x,self.mu,self.covar,self.mixing,self.clusters) # responsibilities
166 |         self.resp_sum = [self.resp_sum[i]+r_n_k for i,r_n_k in enumerate(r_n)]
167 |         self.resp_w_sum = [w_sum + r_n[i]*x for i,w_sum in enumerate(self.resp_w_sum)]
168 |         self.resp_w_cov = [w_covar+r_n[i]*np.outer(x,x) for i,w_covar in enumerate(self.resp_w_cov)]
169 |         self.N+=1
170 |         
171 |     def mapper_final_gmm(self):
172 |         matrix_to_list = lambda x: [list(e) for e in x]
173 |         # sum of responsibilities
174 |         yield 1,("r_sum", self.resp_sum)      
175 |         # sum of observations weighted by responsibility
176 |         yield 1,("r_w_sum", [list(e) for e in self.resp_w_sum])
177 |         # covariates weighted by responsibility
178 |         yield 1,("r_w_cov", [ matrix_to_list(cov) for cov in self.resp_w_cov])
179 |         # number of observations
180 |         yield 1,("total", self.N)                                              
181 |         
182 |     
183 |     def reducer_gmm(self,key, values):
184 |         N = 0;
185 |         r_sum = [0]*self.clusters
186 |         r_w_sum = [np.zeros(self.dim, dtype = np.float64) for i in range(self.clusters)]
187 |         r_w_cov = [np.zeros([self.dim,self.dim], dtype = np.float64) for i in range(self.clusters)]
188 |         for value in values:
189 |             if value[0]=="r_sum":
190 |                 r_sum = [r_sum[i]+gamma for i,gamma in enumerate(value[1])]
191 |             elif value[0]=="r_w_sum":
192 |                 r_w_sum = [r_w_sum[i]+np.array(r_w_new, dtype = np.float64) for i,r_w_new in enumerate(value[1])]
193 |             elif value[0]=="r_w_cov":
194 |                 r_w_cov = [ r_w_cov[i] + np.array(cov) for i,cov in enumerate(value[1])]
195 |             elif value[0]=="total":
196 |                 N+=value[1]
197 |         mixing = [float(gamma)/N for gamma in r_sum]
198 |         means =  [1.0/r_sum[i]*r_w_sum[i] for i, gamma in enumerate(mixing)]
199 |         covar =  [ 1.0/r_sum[k]*r_w_cov_k - np.outer(means[k],means[k]) for k,r_w_cov_k in enumerate(r_w_cov)]     
200 |         yield None, make_json_encodable(mixing,means,covar)
201 | 
202 |     def steps(self):
203 |         return [MRStep(mapper_init = self.mapper_gmm_init,
204 |                        mapper = self.mapper_gmm, 
205 |                        mapper_final = self.mapper_final_gmm,
206 |                        reducer = self.reducer_gmm)]
207 |                        
# start the job only when the file is executed as a script (not on import)
if __name__=="__main__":
    IterationGaussianMixtureMR.run()
210 |     
211 |     


--------------------------------------------------------------------------------
/Gaussian Mixture Model MapReduce/gmm_test_data.txt:
--------------------------------------------------------------------------------
  1 | 0,11.889696598182816,9.399577308092463
  2 | 1,8.141782466259908,9.527948700116914
  3 | 2,9.83561441698987,9.829664122277876
  4 | 3,9.259053979739067,9.323772861474911
  5 | 4,9.263677930814664,8.375190560774007
  6 | 5,9.757350580368621,9.314887035130026
  7 | 6,11.088658876568656,10.428006012366419
  8 | 7,11.143262957828346,10.201883251031845
  9 | 8,8.69383959269066,8.783559680325284
 10 | 9,9.576704300171153,8.558494673372817
 11 | 10,10.356197508699072,9.422707522937543
 12 | 11,8.673602532116556,10.901021157513314
 13 | 12,10.655171338795729,9.981231057404136
 14 | 13,10.466961483205777,8.659112144413218
 15 | 14,10.363731834422175,8.217446779324968
 16 | 15,10.217535490920621,9.53485308461512
 17 | 16,11.739695822206578,10.222179154117397
 18 | 17,9.638692444961226,10.036829570697162
 19 | 18,10.18969963125449,9.8247368588829
 20 | 19,8.837879828123471,10.243322513501534
 21 | 20,10.279095230986293,11.327708746941834
 22 | 21,9.682704196948393,8.177313158514684
 23 | 22,9.335839034275475,10.490247787819225
 24 | 23,9.997904502165957,9.041449511717712
 25 | 24,9.29504789012471,10.42905420836509
 26 | 25,8.767051423431063,10.379687522618916
 27 | 26,10.561822807236435,10.83323075405356
 28 | 27,9.2071259783157,10.528013574304838
 29 | 28,10.160689965561604,10.223508150118922
 30 | 29,9.944167607205753,8.698019313721305
 31 | 30,10.418525897310861,9.508779154752554
 32 | 31,11.120817443963995,10.526129645503078
 33 | 32,10.715534426540763,10.397645360599698
 34 | 33,11.884621474828569,9.591125424962172
 35 | 34,9.753256739952068,7.793189011004162
 36 | 35,8.610407180918672,9.086115427816948
 37 | 36,10.182228819882969,11.05460828030577
 38 | 37,9.22004171784772,13.034324162182392
 39 | 38,10.105060916793697,9.030958820657322
 40 | 39,9.984382144505195,9.090686331618633
 41 | 40,9.14584076992209,11.10574347078124
 42 | 41,11.667246858733101,10.651320290924891
 43 | 42,8.387128096090802,9.911586066454367
 44 | 43,9.569749965923826,9.3198696923227
 45 | 44,11.007806523828798,10.24401710795085
 46 | 45,10.303312714665271,10.072895936758643
 47 | 46,10.448145461879797,9.247585566175824
 48 | 47,10.922918115688484,9.590316003046215
 49 | 48,10.030315417591755,8.72522500978121
 50 | 49,10.989067335990477,9.728763574444695
 51 | 50,9.329076883745394,9.742913584392378
 52 | 51,10.855959680936218,9.607031218854152
 53 | 52,12.564674361928027,10.286343751607795
 54 | 53,10.680096601143054,10.309299442484601
 55 | 54,8.207501134810489,8.649429268806688
 56 | 55,12.635714286197347,9.754864710759529
 57 | 56,9.673623190778429,9.843122234554118
 58 | 57,9.702296811534842,9.506997420081817
 59 | 58,8.911439850481022,9.267704226862286
 60 | 59,8.763327663498817,10.759310148880003
 61 | 60,8.62727348693574,10.545296619643565
 62 | 61,9.415086888513237,9.137469073371014
 63 | 62,8.019619866898736,8.886766928729152
 64 | 63,9.36039841330762,9.409304764940158
 65 | 64,9.25652551394197,11.53250456130154
 66 | 65,10.356262280657237,10.174496998482963
 67 | 66,10.124887481939577,10.678484372823636
 68 | 67,10.965042164271594,10.081409487431168
 69 | 68,9.55914178689305,9.234841327714005
 70 | 69,8.026250000117132,11.997631982536102
 71 | 70,10.158110248393465,10.197973159435845
 72 | 71,9.96879270260045,8.860303686738954
 73 | 72,8.518062564381752,10.346421467880075
 74 | 73,11.06197615294631,7.884096686113358
 75 | 74,9.232618420886938,10.44432023409442
 76 | 75,11.392216252854075,8.96050039167426
 77 | 76,10.006433651410077,10.24995144665943
 78 | 77,12.667143703316873,9.134282367814839
 79 | 78,11.603954043522453,8.539740058173635
 80 | 79,10.355262924827707,8.669928973777699
 81 | 80,10.956149496046644,10.727380151777268
 82 | 81,10.000082207177442,9.396618090777519
 83 | 82,7.8114012714492285,9.696754128537894
 84 | 83,10.749626406618715,10.70574875199398
 85 | 84,9.701361543451927,9.791164489571257
 86 | 85,10.07347125434307,9.949578936363372
 87 | 86,10.724510443122407,8.709959507973224
 88 | 87,8.514139438635151,9.66622548748849
 89 | 88,9.655807587832276,10.591180319460728
 90 | 89,11.960772555805558,8.768099520357781
 91 | 90,10.455532746084764,11.505113906159323
 92 | 91,10.030375632597176,9.901535425762127
 93 | 92,9.577196914896852,8.643308902918893
 94 | 93,10.157136017565245,10.221475059279474
 95 | 94,11.364906239045027,9.556142335894155
 96 | 95,10.896812778945927,9.91561644826393
 97 | 96,10.245244982261436,10.764809139625296
 98 | 97,8.39193258315866,9.640255128428839
 99 | 98,7.869045001915454,9.991951326864006
100 | 99,10.26083091858728,10.116602894392708
101 | 100,-0.5297583718585257,-0.9766838245549572
102 | 101,1.685487810801928,0.4690593655004617
103 | 102,1.7809185202827154,-0.7439786775379666
104 | 103,1.1226156612796272,2.9134825309600845
105 | 104,1.3181314573279175,1.1017444443125102
106 | 105,1.0002305215917755,0.3676127784931953
107 | 106,-0.09283765687821144,1.6429521521510844
108 | 107,-0.3734397328941841,2.4919262869094894
109 | 108,0.74909034650456,0.9174194143928245
110 | 109,-0.06954171822790656,0.9004799986160914
111 | 110,2.4284430344008783,0.8644971646698215
112 | 111,2.7003034087791864,2.942531275288033
113 | 112,2.781511946041195,1.5801982247405415
114 | 113,0.25978472083564286,2.674189650036509
115 | 114,1.8496452444610467,-1.174074782719698
116 | 115,-0.49648249321947513,0.7047357435928405
117 | 116,0.9673605389654447,1.009104432432295
118 | 117,-0.32314315814211336,-0.3110807669353952
119 | 118,2.1469699080364455,1.824651046487534
120 | 119,2.435356635133539,0.12702031941034775
121 | 120,2.524093430061247,-0.3106723284205417
122 | 121,0.8023654873510608,0.2944377328052924
123 | 122,1.017321284932823,0.6016594179164722
124 | 123,0.9840254494892873,-1.4199134190354643
125 | 124,-0.10354819386142289,1.737815756012118
126 | 125,0.18135663167987104,0.8187655845081133
127 | 126,-0.1666601916661734,1.0860950270163892
128 | 127,0.5908566035060019,0.9032744653322091
129 | 128,-1.4714867055006766,0.8902193986657793
130 | 129,1.1322292621860297,1.4504295222159824
131 | 130,0.5293450533564301,-0.14111647426900653
132 | 131,1.8263994018772163,0.14759898379212988
133 | 132,0.4713996577723506,0.18480969293738347
134 | 133,0.787292022406012,0.041548064946067775
135 | 134,0.3454794065046819,0.8657779482465748
136 | 135,1.1889615755065692,1.5370544952810627
137 | 136,1.6705653540132208,0.007044771456957144
138 | 137,2.9768816992592875,-0.21634024261131213
139 | 138,2.8132642377181543,0.7493334397146709
140 | 139,-0.4044789721845281,1.6321443132997862
141 | 140,0.311719894635486,1.210677986394049
142 | 141,0.9102247891155003,2.382701362740132
143 | 142,1.5695809204463207,0.38040305525727125
144 | 143,0.05457798939602243,0.6217019644380728
145 | 144,0.8766024274446654,1.4869091380074162
146 | 145,1.8007261217921646,-0.48638861740203776
147 | 146,1.0951338224716824,-0.685573602091472
148 | 147,2.140139970401804,1.643810679135042
149 | 148,3.4325182211566556,1.126840239411029
150 | 149,0.5826258302859172,1.0556686155845936
151 | 150,0.7157401180441894,0.6547902187597557
152 | 151,-0.18784784715973646,0.7371184875274482
153 | 152,0.7243345172993352,1.734961417631381
154 | 153,2.3771733069305947,0.9470742401015901
155 | 154,0.4373231601729354,-0.8154774884412332
156 | 155,1.6419317687384325,0.1469989176876788
157 | 156,3.18251973435166,1.4519198792187695
158 | 157,0.7339939245078062,1.1372312431533893
159 | 158,0.946826479458742,1.1673853088420154
160 | 159,-0.4729836563782419,0.6716417415293987
161 | 160,0.8058099574464997,1.1696399224064815
162 | 161,2.6992005810329625,0.961860491181949
163 | 162,2.024997298713787,0.47877091221083845
164 | 163,2.5522498791519723,0.4299324405759669
165 | 164,0.5739343916146544,1.0045080090663834
166 | 165,1.35666915326178,-0.31466601955910756
167 | 166,1.086892858146093,2.0282043623389705
168 | 167,0.18409795377386962,2.3238952591251962
169 | 168,1.8579770238270976,1.2466501538302264
170 | 169,0.8880465028789818,0.9429394426250503
171 | 170,-0.3042446613223604,1.6218734017784513
172 | 171,1.2776771984612543,1.5669997816789594
173 | 172,2.164027053169204,0.05835633015698949
174 | 173,1.8476180634546273,0.8844207041775076
175 | 174,0.3415336965964516,0.7136305542705158
176 | 175,0.14421723290445176,1.0419038358764148
177 | 176,1.0771991003978565,1.1787375554204311
178 | 177,1.0718366116671554,2.0878134493858393
179 | 178,1.912900508907145,0.15521291242922264
180 | 179,0.5419179803924743,1.5625313860604253
181 | 180,-0.23674560655990096,-1.4604933334122738
182 | 181,1.6301287880124806,1.7379035790834743
183 | 182,2.0181245229038662,1.7186889492383628
184 | 183,2.2211403440156756,1.27568034492345
185 | 184,0.2261954447122092,-0.026328670026463552
186 | 185,-1.1269921165847325,2.2453017550210497
187 | 186,1.3373540600165845,0.8121116770848708
188 | 187,2.7180850973527124,2.063279588018826
189 | 188,1.1303141762058857,0.9987257674598756
190 | 189,0.4264214885661777,1.1206481326126658
191 | 190,0.38194609797862955,1.3667009059911215
192 | 191,2.2129695096994952,3.4717953196325877
193 | 192,-0.8260429551038402,1.0451795456652038
194 | 193,0.8952315047770464,-0.2912604504024898
195 | 194,-0.5487026343993706,1.3653688919762004
196 | 195,0.9056783237364758,0.7215989763368184
197 | 196,2.136898106008048,0.5533179746272908
198 | 197,0.5393352265376665,1.828975340374717
199 | 198,3.1802884059362895,0.11239955130689305
200 | 199,-0.5654238202351993,0.6261606939102331
201 | 


--------------------------------------------------------------------------------
/Gaussian Mixture Model MapReduce/gmm.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Gaussian Mixture Model on EMR
  4 | 
  5 | 
  6 | """
  7 | 
  8 | import InitialiseGaussianMixtures as gmm_init
  9 | import IterationGaussianMixtureMR as gmm_iterator
 10 | import numpy as np
 11 | from boto.s3.connection import S3Connection
 12 | import json
 13 | import os
 14 | 
# Command line options describing EMR cluster (instance types, number of
# instances and bid prices). Runner puts them in front of job arguments when
# it is created with emr_defaults = True; use them if you did not set these
# parameters in configuration file.
EMR_DEFAULT_PARAMS = ["--ec2-core-instance-bid-price", "0.4", 
                      "--ec2-core-instance-type" ,"m1.small",
                      "--num-ec2-core-instances", "1", 
                      "--ec2-task-instance-bid-price", "0.4", 
                      "--ec2-task-instance-type", "m1.small", 
                      "--num-ec2-task-instances","1"]

# AWS access key id and secret access key, handed to S3Connection when Runner
# is created with emr_local = 'emr'. Values below are placeholders.
# NOTE(review): keep real credentials out of version control.
ACCESS_KEY = "YOUR_ACCESS_KEY"
SECRET_KEY = "YOUR_SECRET_KEY"
 26 | 
 27 | 
 28 |                       
 29 | def dist_tot(mu_before, mu_after):
 30 |     ''' calculates sum of distances between list of vectors '''
 31 |     diffs = [np.array(mu_before[i])-np.array(mu) for i,mu in enumerate(mu_after)]
 32 |     return sum([np.sqrt(np.dot(mu_diff.T,mu_diff)) for mu_diff in diffs])
 33 |     
 34 |     
 35 |     
 36 | class Runner(object):
 37 |     
 38 |     """
 39 |     (i.e. sample and run K-means on sample to determine initial parameters
 40 |         )
 41 |     """
 42 |     
 43 |     def __init__(self,d,k,init_eps,sample_size,init_iteration_limit,
 44 |                  iteration_eps,em_iteration_limit, input_path, 
 45 |                  output_path,emr_local = "local", emr_defaults = False):
 46 |         self.dim = d                                        # dimensionality of data
 47 |         self.clusters = k                                   # number of expected clusters
 48 |         self.init_eps = init_eps                            # convergence threshold for K-means on initialisation step
 49 |         self.init_iteration_limit = init_iteration_limit    # limit for iterations for K-means on initial step
 50 |         self.iteration_eps = iteration_eps                  # convergence threshold for EM parameter
 51 |         self.em_iteration_limit = em_iteration_limit        # maximum number of iterations of EM algorithm
 52 |         self.input_path = input_path
 53 |         self.output_path = output_path
 54 |         self.sample_size = sample_size
 55 |         self.emr_defaults = emr_defaults
 56 |         assert emr_local=='emr' or emr_local=='local', " 'emr_local' should be either 'emr' or 'local' "
 57 |         self.emr_local = emr_local
 58 |         if self.emr_local == "emr":
 59 |             self.conn = S3Connection(aws_access_key_id = ACCESS_KEY,
 60 |                                      aws_secret_access_key = SECRET_KEY)
 61 |             
 62 |         
 63 |         
 64 |     ############### Initialisation of GMM parameters ##########################
 65 |         
 66 |         
 67 |     def config_and_run_init_step(self):
 68 |         ''' 
 69 |         Sets configuration paramters to run initial step of GMM algorithm.
 70 |         By default job will run in 'local' mode
 71 |         '''
 72 |         # set configuration
 73 |         init_configs = ["--dimensions",str(self.dim),
 74 |                         "--sample-size",str(self.sample_size),
 75 |                         "--clusters",str(self.clusters),
 76 |                         "--iteration-limit",str(self.init_iteration_limit),
 77 |                         "--kmeans-convergence",str(self.init_eps),
 78 |                         "-r", self.emr_local,
 79 |                         "--output-dir","_".join([self.output_path,"0"]),
 80 |                         "--no-output",self.input_path]
 81 |         init_configs_new = []
 82 |         if self.emr_defaults is True:
 83 |             init_configs_new.extend(EMR_DEFAULT_PARAMS[:])
 84 |         init_configs_new.extend(init_configs)
 85 |         # start job 
 86 |         mrJobInitStep = gmm_init.InitialiseGaussianMixtureMR(init_configs_new)
 87 |         with mrJobInitStep.make_runner() as runner:
 88 |             runner.run()
 89 |             
 90 |     
 91 |     #######################  Iterations of EM-algorithm  ######################
 92 |     
 93 |     @staticmethod
 94 |     def delta_stop_iterate(old_params,new_params):
 95 |         '''
 96 |         
 97 |         '''
 98 |         mu_old = old_params["mu"]
 99 |         mu_new = new_params["mu"]
100 |         delta = dist_tot(mu_new,mu_old)
101 |         return delta
102 |         
103 |         
104 |     
105 |     def iterate_em(self):
106 |         '''
107 |         Performs em iterations until convergence
108 |         '''
109 |         delta = 10
110 |         get_params = lambda p,i: self.load_params("_".join([p,str(i)])) # get parameters from previous iter.
111 |         old_params = get_params(self.output_path,0)
112 |         iteration = 1
113 |         while delta > self.iteration_eps and iteration < self.em_iteration_limit:
114 |             self.config_and_run_iter_step(iteration, json.dumps(old_params))
115 |             new_params = get_params(self.output_path,iteration)
116 |             delta = self.delta_stop_iterate(old_params,new_params)
117 |             iteration+=1
118 |             old_params = new_params
119 |           
120 |         
121 | 
122 |     def config_and_run_iter_step(self,iteration, parameters):
123 |         '''
124 |         Configure parameters to run single iteration of EM algorithm 
125 |         (each iteration consists of E-step and M-step)
126 |         '''
127 |         iter_configs = [ "--dimensions",str(self.dim),
128 |                          "--clusters",str(self.clusters),
129 |                          "--parameters", parameters,
130 |                          "-r", self.emr_local,
131 |                          "--output-dir","_".join([self.output_path,str(iteration)]),
132 |                          "--no-output",self.input_path ]
133 |         iter_configs_new = []
134 |         if self.emr_defaults is True:
135 |             iter_configs_new.extend(EMR_DEFAULT_PARAMS)
136 |         iter_configs_new.extend(iter_configs)
137 |         # start job
138 |         mrJobIterStep = gmm_iterator.IterationGaussianMixtureMR(iter_configs_new)
139 |         with mrJobIterStep.make_runner() as runner:
140 |             runner.run()
141 |             
142 |                                             
143 |     def load_params(self,path):
144 |         if self.emr_local == "local":
145 |             return self.local_load_params(path)
146 |         return self.s3_load_params(path)
147 | 
148 |                                      
149 |     def s3_load_params(self,s3_path):
150 |         ''' load parameters if they are on amazon s3'''
151 |         path = s3_path.strip("s3://").split("/")
152 |         mybucket = self.conn.get_bucket(path[0]) # connect to s3 bucket
153 |         s3_file_keys = [f for f in mybucket.list(prefix = "/".join(path[1:]))]
154 |         for s3key in s3_file_keys:
155 |             if mybucket.lookup(s3key).size > 0:
156 |                 data = s3key.get_contents_as_string()
157 |                 params = json.loads(data)
158 |                 return params
159 |                 
160 |     def local_load_params(self,local_path):
161 |         ''' load paramters if they are on local machine'''
162 |         current_dir = os.getcwd()
163 |         os.chdir(local_path)
164 |         for filename in os.listdir(os.getcwd()):
165 |             if "part-" in filename:
166 |                 if os.path.getsize(filename) > 0:
167 |                     with open(filename,"r") as in_file:
168 |                         data = json.load(in_file)
169 |                         os.chdir(current_dir)
170 |                         return data
171 |                     
172 |     def folder_cleanup(self):
173 |         pass
174 |     
175 |     
176 |     def main_run():
177 |         pass
178 |         
179 |             
180 |         
if __name__=="__main__":
    # Example run: initialisation step followed by EM iterations on EMR.
    # For a local run replace both paths with local ones, e.g.
    #input_path = "/Users/amazaspshaumyan/Desktop/MapReduceAlgorithms/map_reduce/gmm_test_data.txt"
    #output_path = "/Users/amazaspshaumyan/Desktop/MapReduceAlgorithms/map_reduce/gmm_test_final_iteration"
    gmm_mr = Runner(d = 2,
                    k = 2,
                    init_eps = 0.01,
                    sample_size = 100,
                    init_iteration_limit = 20,
                    iteration_eps = 0.01,
                    em_iteration_limit = 10,
                    input_path = "s3://test-map-reduce-movielabs/expectation_maximization_clients/gmm_test_data.txt",
                    output_path = "s3://test-map-reduce-movielabs/expectation_maximization_clients/gmm_test_output_initial_test",
                    emr_local = "emr",
                    emr_defaults = True)
    gmm_mr.config_and_run_init_step()
    gmm_mr.iterate_em()
201 |  
202 |                 
203 |         
204 |     
205 |         
206 |         
207 |     
208 |             
209 |         
210 |         
211 |     
212 |     
213 |     
214 |     
215 |          
216 |         
217 |     
218 |     
219 |     
220 |     
221 | 
222 |                                                                
223 |     
224 |     
225 |     


--------------------------------------------------------------------------------
/KNN MapReduce/knn.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | 
  4 | from mrjob.job import MRJob
  5 | from mrjob.protocol import RawValueProtocol,JSONProtocol
  6 | from mrjob.step import MRStep
  7 | import heapq
  8 | import csv
  9 | 
 10 | 
 11 | ################# Helper functions & classes ##################################
 12 | 
 13 | def dist(x,y):
 14 |     ''' defines euclidean distance between two vector-lists'''
 15 |     return sum([(x[i] - e)**2 for i,e in enumerate(y)])
 16 | 
 17 | 
 18 | class DimensionalityMismatchError(Exception):
 19 |     ''' Error for case when dimensionalities do not match'''
 20 |     def __init__(self,expected,real):
 21 |         self.expected = expected
 22 |         self.real = real
 23 |         
 24 |     def __str__(self):
 25 |         error = "Expected  dimensions: "+str(self.expected)+ " observed: "+str(self.real)
 26 |         return error
 27 |         
 28 |         
 29 | ###################  MapReduce Job  ########################################### 
 30 | 
 31 | 
 32 | 
 33 | class KnnMapReduce(MRJob):
 34 |     '''
 35 |     K nearest neighbours algorithm for classification and regression.
 36 |     Assumes that number of data points to be estimated is small and can be fitted
 37 |     into single machine.
 38 |     
 39 |     
 40 |     Input File:
 41 |     -----------
 42 |     
 43 |           Extract relevant features from input line by changing extract_features
 44 |           method.  Current code assumes following input line format:
 45 |          
 46 |           <non_informative_index>,<feature 1>,<feature 2>,...,< dependent variable >
 47 |       
 48 |     
 49 |     Options:
 50 |     -------
 51 |          --dimensionality         - number of dimensions in explanatory variables
 52 |          --knn-type               - type of estimation (should be either 'regression' 
 53 |                                     or 'classification')
 54 |          --n-neighbours           - number of nearest neighbours used for estimation
 55 |          --points-to-estimate     - file containing points that need to be estimated
 56 |     
 57 |     
 58 |     Output:
 59 |     -------
 60 |          Output line format:
 61 |           
 62 |          <feature 1>,<feature 2>,<feature 3>,< estimated dependent variable >
 63 | 
 64 |     '''
 65 |     
 66 |     INPUT_PROTOCOL = RawValueProtocol
 67 |     
 68 |     INTERNAL_PROTOCOL = JSONProtocol
 69 |     
 70 |     OUTPUT_PROTOCOL = RawValueProtocol
 71 |     
 72 |     def __init__(self,*args,**kwargs):
 73 |         super(KnnMapReduce,self).__init__(*args,**kwargs)
 74 |         with open(self.options.points_to_estimate,"r") as input_file:
 75 |             data = list(csv.reader(input_file))
 76 |         self.points = {}
 77 |         for dp in data:
 78 |             self.points[tuple([float(e) for e in dp])] = []
 79 |     
 80 |         
 81 |     #################### load & configure options #############################
 82 |     
 83 |     def configure_options(self):
 84 |         super(KnnMapReduce,self).configure_options()
 85 |         self.add_passthrough_option("--dimensionality",
 86 |                                     type = int,
 87 |                                     help = "dimenisonality of features")
 88 |         self.add_passthrough_option("--knn-type",
 89 |                                     type = str,
 90 |                                     help = "either regression or classification")
 91 |         self.add_passthrough_option("--n-neighbours",
 92 |                                     type = int,
 93 |                                     help = "number of neighbours used in classification or regression")
 94 |         self.add_file_option("--points-to-estimate",
 95 |                              type = "str",
 96 |                              help = "File containing all points that should be estimated")
 97 |                                     
 98 |                                      
 99 |     def load_options(self,args):
100 |         super(KnnMapReduce,self).load_options(args)
101 |         # feature dimensionality
102 |         if self.options.dimensionality is None:
103 |             self.option_parser.error("You need to specify feature dimensionality")
104 |         else:
105 |             self.dim = self.options.dimensionality
106 |         # type of knn (either regression or classification)
107 |         if self.options.knn_type != "regression" and self.options.knn_type != "classification":
108 |             self.option_parser.error("Either 'regression' or 'classification' ")
109 |         else:
110 |             self.knn_type = self.options.knn_type
111 |         # dimensionality
112 |         if self.options.n_neighbours is None:
113 |             self.option_parser.error("You need to specify number of nearest neighbours")
114 |         else:
115 |             self.n_neighbours = self.options.n_neighbours
116 |         if self.options.points_to_estimate is None:
117 |             self.option_parser.error("You need to specify file containing points which needs to be estimated")
118 |     
119 |     ################# Helper functions for extracting features ################
120 |             
121 |     def extract_features(self,line):
122 |         ''' Extracts data from line of input '''
123 |         data = line.strip().split(",")
124 |         return (data[-1], [ float(e) for e in data[1:-1] ])
125 |         
126 |         
127 |     ################# Map - Reduce Job ######################################## 
128 |             
129 |             
130 |     def mapper_knn(self,_,line):
131 |         '''
132 |         Finds nearest neighbours for each point in set of points that 
133 |         needs to be estimated.
134 |         '''
135 |         y, features = self.extract_features(line)
136 |         if len(features) != self.dim:
137 |             raise DimensionalityMismatchError(self.dim,len(features))
138 |         # for each point select n neighbours that are closest to it
139 |         for dp in self.points:
140 |            d_inv = -1*dist(features,dp)
141 |            observation = tuple([d_inv,features,y])
142 |            # if number of nearest neighbours is smaller than threshold add them
143 |            if len(self.points[dp]) < self.n_neighbours:
144 |               self.points[dp].append(observation)
145 |               if len(self.points[dp]) == self.n_neighbours:
146 |                  heapq.heapify(self.points[dp])
147 |            # compare with largest distance and push if it is smaller
148 |            else:
149 |               largest_neg_dist = self.points[dp][0][0]
150 |               if d_inv > largest_neg_dist:
151 |                  heapq.heapreplace(self.points[dp],observation)
152 | 
153 |     def mapper_knn_final(self):
154 |         '''
155 |         Each mapper outputs dictionary with key being data point that
156 |         needs to be estimated and value being priority queue of length 
157 |         'self.n_neighbours' of observation from training set
158 |         '''
159 |         yield 1, self.points.items()
160 |         
161 |         
162 |     def reducer_knn(self,key,points):
163 |         '''
164 |         Aggregates mapper output and finds set of training points which are 
165 |         closest to point that needs to be estoimated. Then depending on 
166 |         estimation type ('classification' or 'regression') outputs estimate
167 |         '''
168 |         for mapper_neighbors in points:
169 |             merged = None
170 |             mapper_knn = {}
171 |             for k,v in mapper_neighbors:
172 |                 mapper_knn[tuple(k)] = v
173 |             # process mapper outputs and find closest neighbours
174 |             if merged is None:
175 |                 merged = mapper_knn
176 |             else:
177 |                 for point in merged.keys():
178 |                     pq = mapper_knn[point]
179 |                     while pq:
180 |                           if len(merged[point]) < self.n_neighbours:
181 |                              heapq.heappush(merged[point],heapq.heappop(pq))
182 |                           else:
183 |                              largest_neg_dist = merged[point][0][0]
184 |                              if pq[0][0] > largest_neg_dist:
185 |                                 heapq.heapreplace(merged[point], heapq.heappop(pq))
186 |         for point in merged.keys():
187 |             # regression
188 |             if self.options.knn_type == "regression":
189 |                 estimates = [ float(observation[-1]) for observation in merged[point]]
190 |                 estimate = sum(estimates)/self.options.n_neighbours
191 |             # classification
192 |             else:
193 |                 estimates = {}
194 |                 for neg_dist,features,y in merged[point]:
195 |                     estimates[y] = estimates.get(y,0) + 1
196 |                 estimate,counts = max(estimates.items(),key = lambda x: x[-1])
197 |             # format output
198 |             output = list(point)
199 |             output.append(estimate)
200 |             yield None, ",".join([str(e) for e in output])
201 |             
202 |             
203 |     def steps(self):
204 |         return [MRStep(mapper       = self.mapper_knn,
205 |                        mapper_final = self.mapper_knn_final,
206 |                        reducer      = self.reducer_knn)]
207 |                        
# Script entry point: run() is inherited from MRJob and starts the job
if __name__=="__main__":
    KnnMapReduce.run()
210 |         
211 |             
212 |     
213 |             
214 |             


--------------------------------------------------------------------------------
/Gaussian Mixture Model MapReduce/InitialiseGaussianMixtures.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Initialisation step for MapReduce implementation of GMM.
  3 | 
  4 | Using MapReduce paradigm samples data from large dataset, so that sample fits
  5 | into one machine, then run K-means algorithm on sampled datato find centroids 
  6 | and cluster allocation of points.
  7 | Cluster allocation of data points is used to get initial parameters for GMM 
  8 | (i.e. : mixing coefficients (pdf of latent variable), mean vectors and covariance
  9 | matrix for each cluster)
 10 | '''
 11 | 
 12 | from mrjob.protocol import RawValueProtocol,JSONProtocol, JSONValueProtocol
 13 | from mrjob.job import MRJob
 14 | from mrjob.step import MRStep
 15 | import random
 16 | import heapq
 17 | import numpy as np
 18 | 
 19 | 
 20 | 
 21 | def extract_features(line):
 22 |     ''' extracts features from line of input'''
 23 |     data = line.strip().split(",")
 24 |     return [ float(e) for e in data[1:] ]
 25 | 
 26 | 
 27 | #########################  K-means ###########################################
 28 | 
 29 | 
 30 | class KmeansInitGMM(object):
 31 |     '''
 32 |     K-means algorihm for clustering.
 33 | 
 34 |     Parameters:
 35 |     -----------
 36 |     
 37 |     clusters           - (int)   number of expected clusters
 38 |     dim                - (int)   dimensionality of input
 39 |     epsilon            - (float) convergence threshold for k-means
 40 |     iteration_limit    - (int)   maximum number of iteration, where each 
 41 |                                  iteration consists of e_step and m_step
 42 |     data               - (list)  list of lists, where each inner list is 
 43 |                                  single data point
 44 |     
 45 |     '''
 46 |     
 47 |     def __init__(self, clusters, dim, epsilon, iteration_limit, data):
 48 |         self.k = clusters
 49 |         self.data = [extract_features(line) for line in data]
 50 |         self.m = dim
 51 |         self.r = [0]*len(data) # vector of cluster assignments
 52 |         self.convergence_epsilon = epsilon
 53 |         self.iteration_limit = iteration_limit
 54 |         
 55 |         
 56 |     def loss(self):
 57 |         ''' 
 58 |         Calculates loss function of K-means
 59 |         J =  sum_n[ sum_k [r_n_k*||x_n-mu_k||^2]]]
 60 |         '''
 61 |         r = self.r
 62 |         mu = self.clusters
 63 |         J = sum([np.dot((np.array(x)-mu[r[i]]).T,np.array(x)-mu[r[i]]) for i,x in enumerate(self.data)])
 64 |         return J
 65 |     
 66 |     def initialise(self):
 67 |         ''' randomly choses points from list'''
 68 |         self.clusters = random.sample(self.data,self.k)     
 69 |         
 70 |     def e_step(self):
 71 |         ''' E-step in K means algorithm, finds assignment of points to centroids'''
 72 |         for n,data_point in enumerate(self.data):
 73 |             min_cl = 0
 74 |             min_sq_dist = -1
 75 |             for i,cluster in enumerate(self.clusters):
 76 |                 dist_sq = sum([ (data_point[i]-cluster[i])**2 for i in range(self.m)])
 77 |                 if min_sq_dist==-1:
 78 |                     min_sq_dist = dist_sq
 79 |                 else:
 80 |                     if dist_sq < min_sq_dist:
 81 |                         min_sq_dist = dist_sq
 82 |                         min_cl = i
 83 |             self.r[n] = min_cl
 84 | 
 85 |             
 86 |     def m_step(self):
 87 |         ''' M-step in K-means algorithm, finds centroids that minimise loss function'''
 88 |         self.clusters = [[0]*self.m for i in range(self.k)] # update clusters
 89 |         cluster_counts = [0]*self.k
 90 |         for i,x in enumerate(self.data):
 91 |             cluster_counts[self.r[i]]+=1
 92 |             self.clusters[self.r[i]] = [self.clusters[self.r[i]][j]+x[j] for j in range(self.m)]
 93 |         mean_vector = lambda x,n: [float(el)/n for el in x]
 94 |         self.clusters = [mean_vector(self.clusters[i],cluster_counts[i]) for i in range(self.k)] 
 95 |             
 96 |     
 97 |     def run_k_means(self):
 98 |         ''' 
 99 |         Runs single pass of k-means algorithm
100 |         '''
101 |         self.initialise() # initialise clusters
102 |         next_loss = self.loss() # calculate loss function for initial clusters
103 |         prev_loss = next_loss +2*self.convergence_epsilon
104 |         iteration = 0
105 |         losses = []
106 |         while prev_loss - next_loss > self.convergence_epsilon and iteration < self.iteration_limit:
107 |             self.e_step()
108 |             self.m_step()
109 |             prev_loss = next_loss
110 |             losses.append(prev_loss)
111 |             next_loss = self.loss()
112 |             iteration+=1
113 |         
114 |             
115 |     def run(self, reruns = 10):
116 |         ''' 
117 |         Runs k-means several times and choosed and chooses parameters (mean vectors,
118 |         point cluster allocation) from the k-means run with smallest value of 
119 |         loss function.
120 |         
121 |         (Since loss function is not convex,it is not guaranteed that parameters 
122 |         obtained from single k-means algorithm pass will give global minimum
123 |         of k-means loss function)
124 |         '''
125 |         clusters = [[0]*self.m for i in range(self.k)]
126 |         loss_before = -1
127 |         r = self.r
128 |         for i in range(reruns):
129 |             self.run_k_means()
130 |             loss_new = self.loss()
131 |             if loss_before==-1:
132 |                 loss_before = loss_new
133 |                 clusters = [el[:] for el in self.clusters]
134 |                 r = self.r[:]
135 |             else:
136 |                 if loss_new < loss_before:
137 |                     loss_before = loss_new
138 |                     clusters = [el[:] for el in self.clusters]
139 |                     r = self.r[:]
140 |                     
141 |         self.final_r = r
142 |         self.final_clusters = clusters
143 |         
144 |         
145 |     def gmm_params(self):
146 |         ''' 
147 |         Calculates initial parameters for GMM based on cluster allocation of
148 |         points in best K-means
149 |         '''
150 |         total=0
151 |         mixing = [0]*self.k
152 |         covars = [np.zeros([self.m,self.m], dtype = np.float64) for i in range(self.k)]
153 |         mu = [np.zeros(self.m, dtype = np.float64) for i in range(self.k)]
154 |         for i,dp in enumerate(self.data):
155 |             k = self.final_r[i] # cluster
156 |             x = np.array(dp, dtype = np.float64)
157 |             mixing[k]+=1
158 |             total+=1
159 |             mu[k]+=x
160 |             covars[k]+=np.outer(x,x)
161 |         mu = [mu[j]/p for j,p in enumerate(mixing)]
162 |         covars = [1.0/mixing[j]*(covars[j] - mixing[j]*np.outer(mu[j],mu[j])) for j in range(self.k)]
163 |         mixing = [float(p)/total for p in mixing]
164 |         
165 |         matrix_to_list = lambda x: [list(e) for e in x]
166 |         mixing = mixing
167 |         mu = matrix_to_list(mu)
168 |         covariance = [matrix_to_list(e) for e in covars]
169 |         return {"mixing":mixing,"mu":mu,"covariance":covariance}
170 | 
171 |         
172 | ########  intialise parameters of Gaussian Mixture Model #####################
173 | 
174 | 
class InitialiseGaussianMixtureMR(MRJob):
    '''
    MapReduce class that initialises parameters of GMM.
    Each mapper assigns random priority to each line of input, keeps n (n = sample size)
    lines with highest priority and outputs them.
    Single reducer collects m (where m is number of mappers) lists of size at most n
    and chooses n lines with highest priority, these final n lines of input
    represent random sample of size n from data. Then k-means algorithm is used
    on sampled data to find parameters for initialising.
           
    Command Line Options:
    ---------------------
    
    --sample-size          - sample size
    --clusters             - number of clusters
    --dimensions           - dimensionality of data
    --kmeans-convergence   - convergence threshold for k-means convergence
    --iteration-limit      - limit on number of iterations for k-means
    --kmeans-reruns        - number of times to run k-means
    
    '''
    
    
    INPUT_PROTOCOL = RawValueProtocol
    
    INTERNAL_PROTOCOL = JSONProtocol
    
    OUTPUT_PROTOCOL = JSONValueProtocol
    
    def __init__(self,*args,**kwargs):
        super(InitialiseGaussianMixtureMR,self).__init__(*args, **kwargs)
        # min-heap of (priority, line) pairs, root is the kept line with 
        # the lowest priority
        self.pq = []      
        
    def configure_options(self):
        super(InitialiseGaussianMixtureMR,self).configure_options()
        self.add_passthrough_option("--sample-size",
                                    type= int,
                                    help = "number of elements in sample")
        self.add_passthrough_option("--clusters",
                                    type = int,
                                    help = "number of clusters")
        self.add_passthrough_option("--dimensions",
                                    type = int,
                                    help = "dimensionality of input data")
        self.add_passthrough_option("--kmeans-convergence",
                                    type = float,
                                    default = 0.01,
                                    help = "convergence parameter for K-means loss function")
        self.add_passthrough_option("--iteration-limit",
                                    type = int,
                                    default = 100,
                                    help = "largest number of iterations that k-means algorithm is allowed")
        self.add_passthrough_option("--kmeans-reruns",
                                    type = int,
                                    default = 10,
                                    help = "number of k-means reruns ")
                                    
                                
                                    
    def load_options(self, args):
        super(InitialiseGaussianMixtureMR,self).load_options(args)
        # size of sample for k-means, that will initialise parameters of GMM
        if self.options.sample_size is None:
            self.option_parser.error("You need to specify sample size")
        else:
            self.n = self.options.sample_size
        # number of clusters
        if self.options.clusters is None:
            self.option_parser.error("You need to specify number of clusters")
        else:
            self.k = self.options.clusters
        # dimensionality
        if self.options.dimensions is None:
            self.option_parser.error("You need to specify dimensionality of data")
        else:
            self.dim = self.options.dimensions
            
            
    def mapper_initialise_gmm(self,_,line):
        '''
        Randomly samples n lines of input (where n is sample_size option), by
        assigning random priority level and then keeping n lines of input 
        with highest priority level
        '''
        # continuous priorities: integers from a small range collide on large
        # inputs and ties would be broken by comparing text of the lines
        r = random.random()
        if len(self.pq) < self.n:
            heapq.heappush(self.pq,(r,line))
        elif self.pq[0][0] < r:
            heapq.heapreplace(self.pq,(r,line))
            
    def mapper_initialise_gmm_final(self):
        ''' Outputs sample collected by the mapper under single key '''
        yield 1, self.pq
        
    def reducer_kmeans_initialise_gmm(self,key,samples):
        '''
        Subsamples from mapper output and runs K-means algorithm on subsampled
        data to initialise parameters of GMM. Outputs single dictionary of
        parameters returned by KmeansInitGMM.gmm_params.
        '''
        pq_final = []
        for sample in samples:
            for element in sample:
                if len(pq_final) < self.n:
                    heapq.heappush(pq_final,element)
                elif pq_final[0][0] < element[0]:
                    heapq.heapreplace(pq_final,element)
        lines = [line for r,line in pq_final]
        kmeans = KmeansInitGMM(self.k, self.dim, self.options.kmeans_convergence,self.options.iteration_limit,lines)
        kmeans.run(reruns = self.options.kmeans_reruns)
        params = kmeans.gmm_params()
        yield None, params
        
        
    def steps(self):
        return [MRStep(mapper = self.mapper_initialise_gmm,
                       mapper_final = self.mapper_initialise_gmm_final,
                       reducer = self.reducer_kmeans_initialise_gmm)]
295 |                        
# Script entry point: run() is inherited from MRJob and starts the job
if __name__=="__main__":
    InitialiseGaussianMixtureMR.run()
298 |     
299 |     


--------------------------------------------------------------------------------
/RidgeRegression/RidgeRegressionMapReduce.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | 
  4 | from mrjob.job import MRJob
  5 | from mrjob.protocol import RawValueProtocol,JSONProtocol,JSONValueProtocol
  6 | from mrjob.step import MRStep
  7 | import heapq
  8 | import csv
  9 | import numpy as np
 10 | import random
 11 | 
 12 | 
 13 | # ----------------------------- Helper Classes & Methods --------------------------------
 14 | 
 15 | def cholesky_solution_least_squares(part_one, part_two):
 16 |     '''Cholesky decomposition '''
 17 |     R     = np.linalg.cholesky(part_one)
 18 |     z     = np.linalg.solve(R,part_two)
 19 |     theta = np.linalg.solve(np.transpose(R),z)
 20 |     return theta
 21 |     
 22 |     
 23 | class PrioritySampler(object):
 24 |     
 25 |     def __init__(self,sample_size):
 26 |         self.sample_size = sample_size
 27 |         self.sample      = []
 28 |         
 29 |     def process_observation(self,observation):
 30 |         if len(self.sample) < self.sample_size:
 31 |             self.sample.append(observation)
 32 |             if len(self.sample) == self.sample_size:
 33 |                 heapq.heapify(self.sample_cv)
 34 |         else:
 35 |             if observation[0] > self.sample[0][0]:
 36 |                 heapq.heapreplace(self.sample,observation)
 37 |                 
 38 |     def process_observations(self,observations):
 39 |         for observation in observations:
 40 |             self.process_observation(observation)
 41 |             
 42 |             
 43 | 
class RidgeRegressionHoldOutCV(object):
    '''
    Chooses regularisation parameter of ridge regression from list of candidates.
    
    lambdas  - candidate values of regularisation parameter
    data     - observations; each one is unpacked as (<ignored>, features, y)
    
    NOTE(review): run_ridge_regression only prepares the data, it neither fits
    a model nor returns an error, so class looks unfinished - confirm before use.
    '''
    
    def __init__(self,lambdas, data):
        self.lambdas = lambdas
        self.data    = data
    
    
    def run_ridge_regression(self, lambda_ridge , scaling = None):
        '''
        Builds feature matrix and target vector from self.data, scales and 
        centres them.
        
        lambda_ridge  - regularisation parameter (not used by the code below)
        scaling       - None, "max-min" or "z-score"
        
        NOTE(review): there is no return statement, method always returns None.
        '''
        
        def scaler(x, column_scaler):
            # applies column_scaler to each column, writes result back into x
            m = np.shape(x)[1]
            for i in range(m):
                x[:,i] = column_scaler(x[:,i])
            return x
            
        X,Y  = [],[]
        for observation in self.data:
            # first element of observation is skipped
            features , y = observation[1:]
            X.append(features)
            Y.append(y)
        X = np.array(X)
        Y = np.array(Y)
        if scaling == "max-min":
            # divides column by its range (mean is subtracted further below)
            X = scaler(X,lambda x: x/(np.max(x) - np.min(x)))
        elif scaling == "z-score":
            X = scaler(X,lambda x: (x - np.mean(x))/np.std(x))
        # centre y to account for bias term
        Y = Y - np.mean(Y)
        # in case of max-min and no scaling, we need to subtract mean from features
        if scaling != "z-score":
            X = scaler(X, lambda x: x-np.mean(x))
        
    def cv(self, scaling = None):
        '''
        Runs run_ridge_regression for each candidate and returns candidate
        with the smallest result.
        
        NOTE(review): results are currently always None (see 
        run_ridge_regression), so the comparison is not meaningful yet.
        '''
        err = [ self.run_ridge_regression(lambda_ridge, scaling) for lambda_ridge in self.lambdas]
        lambda_best, err = min([ (self.lambdas[i],err[i]) for i in range(len(self.lambdas)) ], key = lambda t: t[1])
        return lambda_best
 80 |             
 81 |             
 82 |             
 83 | class DimensionMismatch(Exception):
 84 |     
 85 |     def __init__(self,expected,observed):
 86 |         self.exp      = expected
 87 |         self.obs      = observed
 88 |         
 89 |     def __str__(self):
 90 |         err = "Expected number of observations: "+self.exp+" , observed: "+self.obs
 91 |         return err
 92 | 
 93 | 
 94 | 
 95 | class RidgeRegression(MRJob):
 96 |     '''
 97 |     
 98 |     Input File:
 99 |     -----------
100 |           
101 |           Extract relevant features from input line by changing extract_variables
102 |           method. You can add features for non-linear models (like x^2 or exp(x)).
103 |           Current code assumes following input line format:
104 |           
105 |           input line = <>,<feature_1>,...,<feature_n>,<dependent variable>
106 |           
107 |     Options:
108 |     -----------
109 |     
110 |           --dimension              - (int)  number of explanatory variables
111 |           --scaling                - (str)  'z-score' or 'max-min'
112 |           --hold-out-sample-size   - (int)  size of hold out cross validation set 
113 |           --cv-lambdas             - (str)  name of file containing set of regularisation 
114 |                                             parameters for cross validation
115 |                                             
116 |     '''
117 |     
118 |     INPUT_PROTOCOL = RawValueProtocol
119 |     
120 |     INTERNAL_PROTOCOL = JSONProtocol
121 |     
122 |     OUTPUT_PROTCOL = JSONValueProtocol
123 |     
124 |     def __init__(self,*args,**kwargs):
125 |         super(RidgeRegression,self).__init__(*args,**kwargs)
126 |         if self.scaling=="max-min":
127 |             self.max = [0]*self.dim
128 |             self.min = [0]*self.dim
129 |         self.mu = [0]*self.dim
130 |         self.y_av = 0.0
131 |         self.x_t_x = np.zeros([self.dim,self.dim], dtype = np.float)
132 |         self.x_t_y = [0]*self.dim
133 |         self.n = 0
134 |         self.lambdas_cv = self.read_lambdas(self.options.cv_lambdas)
135 |         self.sampler = Sampler(self.cv_size)
136 |         
137 |     #------------------------------------------- load & configure options ---------------------------------------#
138 |         
139 |     def configure_options(self):
140 |         super(RidgeRegression,self).configure_options()
141 |         self.add_passthrough_option("--dimension",
142 |                                     type = int,
143 |                                     help = "Number of explanatory variables")
144 |         self.add_passthrough_option("--hold-out-sample-size",
145 |                                     type = int,
146 |                                     help = "Size of sample for hold out cross validation",
147 |                                     default = 1000)
148 |         self.add_passthrough_option("--scaling",
149 |                                     type = str,
150 |                                     help = "Can be 'z-score' or 'max-min' ")
151 |         self.add_file_option("--cv-lambdas",
152 |                              type = "str",
153 |                              help = "Name of file that contains regularisation parameters for cross validation")
154 |                              
155 |     def load_options(self,args):
156 |         super(RidgeRegression,self).load_options(args)
157 |         # dimensionality
158 |         if self.options.dimension is None:
159 |             self.option_parser.error("You need to specify number of explanatory variables")
160 |         else:
161 |             self.dim = self.options.dimension
162 |         # set of lambdas for cross validation
163 |         if self.options.cv_lambdas is None:
164 |             self.option_parser.error("You need to specify name of file with set of regularisation parameters")
165 |         # sample size for hold out cross validation
166 |         self.cv_size = self.options.hold_out_sample_size
167 |         # scaling options
168 |         if self.options.scaling not in [None,'z-score','max-min']:
169 |             self.options_parser.error("You need to define proper scaling ('z-score' or 'max-min')")
170 |             
171 |         
172 |     #----------------------------------------- helper functions ----- --------------------------------------------#
173 |         
174 |     @staticmethod
175 |     def extract_features(line):
176 |         '''
177 |         Extracts dependent variable and features from line of input
178 |         '''
179 |         data = line.strip().split(",")
180 |         features = [float(e) for e in data[1:-1]]
181 |         y = float(data[-1])
182 |         return (y,features)
183 |       
184 |       
185 |     @staticmethod
186 |     def read_lambdas(filename):
187 |         ''' reads regularisation parameters'''
188 |         with open(filename,"r") as csvfile:
189 |             lambdas = list(csv.reader(csvfile))
190 |         return [float(e) for e in lambdas]
191 |         
192 |         
193 |     def join_mapper_intermediate_stats(self, mapper_one, mapper_two):
194 |         '''
195 |         Aggregates mapper outputs
196 |         '''
197 |         mapper_one["mu"]    = [mapper_one["mu"][i] + mapper_two[i] for i in range(self.dim)]
198 |         sum_lists = lambda x,y,n: [x[i] + y[i] for i in range(n)]
199 |         xtx_1, xtx_2 = mapper_one["x_t_x"], mapper_two["x_t_x"] 
200 |         mapper_one["x_t_x"] = [sum_lists(xtx_1[i],xtx_2[i],self.dim) for i in range(self.dim)]
201 |         mapper_one["y_av"] += mapper_two["y_av"]
202 |         mapper_one["n"]    += mapper_two["n"]
203 |         if self.options.scaling == "max-min":
204 |             mapper_one["max"] = [max(mapper_one["max"][i],mapper_two["max"][i]) for i in range(self.dim)]
205 |             mapper_one["min"] = [min(mapper_one["min"][i],mapper_two["min"][i]) for i in range(self.dim)]
206 |         return mapper_one
207 |         
208 |     
209 |     def estimate_params(self,data,lambda_ridge,scaling = None):
210 |         xtx   = np.array(data["x_t_x"])
211 |         xty   = np.array(data["x_t_y"]) 
212 |         mu    = np.array(data["mu"])
213 |         y_av  = data["y_av"]
214 |         n     = data["n"]
215 |         beta_bias   = y_av # (bias terms)
216 |         if scaling is None:
217 |            part_one  = xtx - n*np.outer(mu,mu)+lambda_ridge*np.eye(self.dim)
218 |            part_two  = xty - n*y_av*mu
219 |         elif scaling == "z_score":
220 |            sigma     = 1.0/np.sqrt(np.diag((1.0/n*(xtx-np.outer(mu,mu))))) # vector of standard deviations
221 |            scaler    = np.outer(sigma,sigma)
222 |            part_one  = np.dot(scaler,xtx-n*np.outer(mu,mu)) + lambda_ridge*np.eye(self.dim)
223 |            part_two  = sigma*xty - sigma*mu*y_av*n
224 |         elif scaling == "max-min":
225 |            scale_vec = 1.0/( np.array(data["max"]) - np.array(data["min"]) )
226 |            scaler    = np.outer(scale_vec,scale_vec)
227 |            part_one  = np.dot(scaler,xtx-n*np.outer(mu,mu)) + lambda_ridge*np.eye(self.dim)
228 |            part_two  = scale_vec*xty - scale_vec*mu*y_av*n
229 |         theta = cholesky_solution_least_squares(part_one, part_two)
230 |         return {"bias_term": beta_bias,"theta":list(theta)}
231 |         
232 |         
233 |         
234 |     #----------------------------------------------- Map - Reduce Job -------------------------------------------#
235 |         
236 |     def mapper_ridge(self,_,line):
237 |         y, features = self.extract_features(line)
238 |         x = np.array(features)
239 |         # update instance variables
240 |         if self.options.scaling=="max-min":
241 |             self.max = [max(current_max,features[i]) for i,current_max in enumerate(features)]
242 |             self.min = [max(current_max,features[i]) for i,current_max in enumerate(features)]
243 |         self.mu    = [ av+features[i] for i,av in enumerate(self.mu) ]
244 |         self.x_t_y = [ xty_i + y*features[i] for xty_i,i in enumerate(features)]
245 |         self.x_t_x = np.outer(x,x)
246 |         self.y_av +=y
247 |         self.n    +=1
248 |         # make sample for hold out cross validation set
249 |         rand_priority = random.randrange(start = 0, stop = 100000000)
250 |         observation = (rand_priority,features,y)
251 |         self.sampler.process_observation(observation)
252 |         
253 |                 
254 |                 
255 |     def mapper_ridge_final(self):
256 |         x_t_x = [list(row) for row in self.x_t_x] # transform numpy array to json-encodable data structure
257 |         intermediate_stats = {"mu":    self.mu,
258 |                               "x_"
259 |                               "x_t_x": x_t_x,
260 |                               "y_av":  self.y_av,
261 |                               "n":     self.n
262 |                              }
263 |         if self.options.scaling == "max-min":
264 |             intermediate_stats["max"] = self.max
265 |             intermediate_stats["min"] = self.min
266 |         yield None, ("stats",intermediate_stats)
267 |         yield None, ("hold_out_cv",self.sampler.sample)
268 |             
269 |                   
270 |                   
271 |     def reducer_ridge(self, key, vals):
272 |         '''
273 |         
274 |         '''
275 |         sampler = Sampler(self.cv_size)
276 |         final_summary_stats = {"mu":      [0]*self.dim,
277 |                                "x_t_x":   [[0]*self.dim for i in range(self.dim)],
278 |                                "x_t_y":   [0]*self.dim,
279 |                                "y_av":    0,
280 |                                "n":       0  }
281 |         for val in vals:
282 |             if val[0]=="stats":
283 |                 mapper_summary = val[1]
284 |                 final_summary_stats = self.join_mapper_intermediate_stats(final_summary_stats,mapper_summary)
285 |             else:
286 |                 sampler.process_observations(val[1])
287 |         # for each scaling type use cross validation to verify best lambda
288 |         # then use it on all data (including cv set) to find parameters
289 |         ridge   = RidgeRegressionHoldOutCV(self.lambdas, sampler.sample)
290 |         best_lambda = ridge.cv(self.options.scaling)
291 |         yield None, self.estimate_params(final_summary_stats,best_lambda,self.options.scaling)
292 | 
293 |             
294 |             
295 |     def steps(self):
296 |         return [MRStep(mapper       = self.mapper_ridge,
297 |                        mapper_final = self.mapper_ridge_final,
298 |                        reducer      = self.reducer_ridge)]
299 |                        
# Start the RidgeRegression job when this file is executed as a script.
if __name__=="__main__":
    RidgeRegression.run()
302 |     
303 |         
304 |         
305 |         
306 |         
307 |         
308 | 


--------------------------------------------------------------------------------