├── KNN MapReduce ├── input_points.txt ├── example_dataset.txt ├── example_output_classification.txt ├── example_output_regression.txt └── knn.py ├── Linear Regression MapReduce ├── example_output ├── .DS_Store ├── example_dataset.txt └── LinearRegressionTS.py ├── README.md ├── RidgeRegression ├── .DS_Store └── RidgeRegressionMapReduce.py ├── Gaussian Mixture Model MapReduce ├── gmm_test_local_test_iteration_0 │ └── part-00000 ├── gmm_test_local_test_iteration_1 │ └── part-00000 ├── gmm_test_local_test_iteration_2 │ └── part-00000 ├── IterationGaussianMixtureMR.py ├── gmm_test_data.txt ├── gmm.py └── InitialiseGaussianMixtures.py ├── Random Sample MapReduce └── SimpleRandomSampleNoReplacementMR.py ├── Gaussian Discriminant Analysis MapReduce ├── gda_wrapper.py └── gda.py └── Multivariate Descriptive Statistics └── MultivariateDescriptiveStatistics.py /KNN MapReduce/input_points.txt: -------------------------------------------------------------------------------- 1 | 1.56,2.46 2 | 0.98,1.34 3 | 6.08,7.54 4 | 8.42,9.23 5 | -------------------------------------------------------------------------------- /Linear Regression MapReduce/example_output: -------------------------------------------------------------------------------- 1 | [3.139604536372969, 0.69858828816164764] 2 | -------------------------------------------------------------------------------- /KNN MapReduce/example_dataset.txt: -------------------------------------------------------------------------------- 1 | 0,1,2,1 2 | 1,0,1,1 3 | 2,1,1,1 4 | 3,10,9,0 5 | 4,8,7,0 6 | 5,6,9,0 7 | -------------------------------------------------------------------------------- /KNN MapReduce/example_output_classification.txt: -------------------------------------------------------------------------------- 1 | 0.98,1.34,1 2 | 6.08,7.54,0 3 | 8.42,9.23,0 4 | 1.56,2.46,1 5 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # MapReduce-Machine-Learning 2 | 3 | Map-Reduce implementation of some machine learning algorithms 4 | -------------------------------------------------------------------------------- /KNN MapReduce/example_output_regression.txt: -------------------------------------------------------------------------------- 1 | 0.98,1.34,1.0 2 | 6.08,7.54,0.0 3 | 8.42,9.23,0.0 4 | 1.56,2.46,1.0 5 | -------------------------------------------------------------------------------- /RidgeRegression/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AmazaspShumik/MapReduce-Machine-Learning/HEAD/RidgeRegression/.DS_Store -------------------------------------------------------------------------------- /Linear Regression MapReduce/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AmazaspShumik/MapReduce-Machine-Learning/HEAD/Linear Regression MapReduce/.DS_Store -------------------------------------------------------------------------------- /Gaussian Mixture Model MapReduce/gmm_test_local_test_iteration_0/part-00000: -------------------------------------------------------------------------------- 1 | {"mixing": [0.48, 0.52], "mu": [[10.096098155296746, 9.7736326940978326], [0.86293812494265676, 0.76820108485655736]], "covariance": [[[1.4235652864798376, -0.17050578277420905], [-0.17050578277420905, 0.9520414860040205]], [[1.1309668152757282, -0.26122286099829789], [-0.26122286099829789, 0.64619841763880226]]]} 2 | -------------------------------------------------------------------------------- /Gaussian Mixture Model MapReduce/gmm_test_local_test_iteration_1/part-00000: -------------------------------------------------------------------------------- 1 | {"mixing": [0.5, 0.5], "mu": [[10.008817713066399, 9.7957780166309867], [1.0466993295879303, 
class SimpleRandomSampleNoReplacementMR(MRJob):
    '''
    Simple random sampling without replacement for relatively small sample
    sizes.

    Do not use for sample sizes that can not fit in memory: every mapper keeps
    up to n candidate lines and all candidates are sent to one reducer key.

    Each input line is assigned a random priority, then the n lines with the
    largest priorities are selected (where n is the size of the sample).

    Options:
    --------
    --sample-size  - (int) number of lines in the sample, must be positive
    '''

    INPUT_PROTOCOL = RawValueProtocol

    INTERNAL_PROTOCOL = JSONProtocol

    OUTPUT_PROTOCOL = RawValueProtocol

    def __init__(self, *args, **kwargs):
        super(SimpleRandomSampleNoReplacementMR, self).__init__(*args, **kwargs)
        # min-heap of (priority, line) pairs kept by this mapper
        self.pq = []

    def configure_options(self):
        ''' Registers the --sample-size option '''
        super(SimpleRandomSampleNoReplacementMR, self).configure_options()
        self.add_passthrough_option("--sample-size",
                                    type=int,
                                    help="number of elements in sample")

    def load_options(self, args):
        ''' Loads the sample size and rejects missing or non-positive values '''
        super(SimpleRandomSampleNoReplacementMR, self).load_options(args)
        size = self.options.sample_size
        if size is None:
            self.option_parser.error("You need to specify sample size")
        elif size < 1:
            # with n < 1 the heap stays empty and self.pq[0] would fail
            self.option_parser.error("Sample size must be a positive integer")
        else:
            self.n = size

    def mapper_rs(self, _, line):
        '''
        Keeps the n lines with the largest random priorities seen so far.

        Priorities are floats from random.random(): the previous integer
        range of 1e6 values produced ties on larger inputs, and tied lines
        were not treated symmetrically.
        '''
        r = random.random()
        if len(self.pq) < self.n:
            heapq.heappush(self.pq, (r, line))
        elif self.pq[0][0] < r:
            # new line beats the weakest candidate, which is dropped
            heapq.heappushpop(self.pq, (r, line))

    def mapper_rs_final(self):
        ''' Sends the candidates of this mapper to the single reducer key '''
        yield 1, self.pq

    def reducer_rs(self, key, samples):
        '''
        Merges the candidate lists of all mappers and outputs the n lines
        with the largest priorities.
        '''
        candidates = (element for sample in samples for element in sample)
        best = heapq.nlargest(self.n, candidates, key=lambda element: element[0])
        for _, line in best:
            yield None, line

    def steps(self):
        ''' Single map reduce step '''
        return [MRStep(mapper=self.mapper_rs,
                       mapper_final=self.mapper_rs_final,
                       reducer=self.reducer_rs)]


if __name__ == "__main__":
    SimpleRandomSampleNoReplacementMR.run()
-------------------------------------------------------------------------------- /Linear Regression MapReduce/example_dataset.txt: -------------------------------------------------------------------------------- 1 | 2.21808980306,1.0,0.0958385695435 2 | 3.36402189406,1.0,0.260144497068 3 | 4.15992743817,1.0,0.350816858335 4 | 3.78196031881,1.0,0.784498848077 5 | 3.42989512572,1.0,0.756516398407 6 | 3.15950491457,1.0,0.951706599864 7 | 3.25515042186,1.0,0.189581130574 8 | 2.85369444908,1.0,0.370011550607 9 | 3.50858378154,1.0,0.0590663027425 10 | 3.90430478422,1.0,0.215343806169 11 | 3.45693232136,1.0,0.449606621905 12 | 2.75392984381,1.0,0.467775521684 13 | 2.96059817472,1.0,0.39167030594 14 | 3.32239943091,1.0,0.391057313689 15 | 4.33496722789,1.0,0.252178510274 16 | 2.91235980568,1.0,0.0674111956377 17 | 3.71830204197,1.0,0.0616356010853 18 | 3.2526960081,1.0,0.846743579917 19 | 3.69205534024,1.0,0.122784969768 20 | 3.07017383938,1.0,0.0266326664286 21 | 3.69061507324,1.0,0.385036208768 22 | 3.54567590508,1.0,0.870806590937 23 | 4.41083559002,1.0,0.338407273459 24 | 3.60068346044,1.0,0.276550896573 25 | 3.3326031757,1.0,0.570194488677 26 | 3.91016851652,1.0,0.327916765825 27 | 3.84847082807,1.0,0.499414512464 28 | 3.18634918861,1.0,0.468041957164 29 | 3.29221765542,1.0,0.513708729344 30 | 3.02148685747,1.0,0.807197008949 31 | 3.12681717646,1.0,0.0932085022848 32 | 3.57983546648,1.0,0.467758273876 33 | 3.23900869201,1.0,0.427403343319 34 | 4.57834602865,1.0,0.824081796391 35 | 2.37644200344,1.0,0.0730487124565 36 | 3.31498725761,1.0,0.400585779084 37 | 3.21553086907,1.0,0.402743818081 38 | 3.85959702777,1.0,0.0585452915492 39 | 3.44615734683,1.0,0.20314102397 40 | 3.48628658623,1.0,0.237082680794 41 | 3.05829561767,1.0,0.0517994300085 42 | 2.92315187738,1.0,0.164315225282 43 | 2.32904030685,1.0,0.268142893579 44 | 2.9552309982,1.0,0.351917716187 45 | 4.40407128841,1.0,0.924092466762 46 | 2.8225258913,1.0,0.0736433270037 47 | 3.96948927524,1.0,0.659915874842 48 | 
3.56243467005,1.0,0.40964570709 49 | 3.47474682329,1.0,0.770583077191 50 | 3.56644300528,1.0,0.169839956493 51 | 2.91731170254,1.0,0.0146508450881 52 | 3.01295568085,1.0,0.545071147873 53 | 3.46360556376,1.0,0.185368560599 54 | 2.88143489161,1.0,0.0415632021348 55 | 4.0412414966,1.0,0.78353303003 56 | 3.07130972376,1.0,0.0600912187556 57 | 3.78054503771,1.0,0.373856196533 58 | 2.67311016754,1.0,0.717171494383 59 | 3.75106376672,1.0,0.00508139023391 60 | 3.94727073914,1.0,0.325172372195 61 | 3.29085843373,1.0,0.656444434018 62 | 3.32255132263,1.0,0.114928090252 63 | 3.12394866603,1.0,0.84060643069 64 | 2.53739670801,1.0,0.692190065838 65 | 3.75322060603,1.0,0.191343723907 66 | 4.1477197632,1.0,0.933759376925 67 | 3.13989205948,1.0,0.429303430318 68 | 3.2851647969,1.0,0.841234079435 69 | 3.20774504619,1.0,0.245968569897 70 | 3.51446893396,1.0,0.207929643678 71 | 2.67983489149,1.0,0.0380580305849 72 | 3.62731156643,1.0,0.389920308388 73 | 4.44198388793,1.0,0.462793931372 74 | 3.46638243416,1.0,0.458118777839 75 | 3.48977121311,1.0,0.716891061751 76 | 3.66471522624,1.0,0.27638286331 77 | 2.91767319777,1.0,0.135421973123 78 | 2.84514397006,1.0,0.503745974055 79 | 4.57023086158,1.0,0.606443402484 80 | 3.07915184373,1.0,0.237213452278 81 | 4.6912100323,1.0,0.894024333638 82 | 3.4989481316,1.0,0.00549703197849 83 | 3.76166465388,1.0,0.265318734242 84 | 3.94945854274,1.0,0.323770161366 85 | 5.21789943675,1.0,0.68209456231 86 | 2.99116512508,1.0,0.0202862280747 87 | 2.91418758566,1.0,0.518492194298 88 | 3.96622033204,1.0,0.799844072411 89 | 3.68633462338,1.0,0.683774764756 90 | 2.63705120301,1.0,0.0769351367498 91 | 3.18626145777,1.0,0.0882592101268 92 | 3.43380657408,1.0,0.803298114512 93 | 4.11905748802,1.0,0.488393678274 94 | 3.25768073747,1.0,0.146494823121 95 | 3.59930371649,1.0,0.7262645386 96 | 3.32525332426,1.0,0.0119205330777 97 | 2.36401703415,1.0,0.0482396643746 98 | 2.72247212355,1.0,0.0651489918596 99 | 2.64138521372,1.0,0.502341826693 100 | 
class GaussianDiscriminantAnalysis(object):
    '''
    Performs Gaussian Discriminant Analysis for classification. Two approaches
    are available: QDA (each class has its own covariance matrix) or LDA
    (covariance matrix is shared).

    Parameters:
    -----------
    targets      - list of valid target values (or an already json-encoded string)
    dimensions   - (int) dimensionality of features
    input_path   - (str) path to input data
    output_path  - (str) directory (local path or "s3://bucket/prefix") for job output
    emr_local    - (str) runner name passed to the job with "-r", default "local"
    emr_defaults - (bool) if True EMR_DEFAULT_PARAMS are prepended to job arguments
    '''

    def __init__(self, targets, dimensions, input_path, output_path,
                 emr_local="local", emr_defaults=True):
        self.targets = targets
        self.dimensions = dimensions
        self.input_path = input_path
        self.output_path = output_path
        self.emr_local = emr_local
        self.emr_defaults = emr_defaults
        self.params = {}

    def _job_args(self):
        ''' Builds the argument list handed to the map reduce job '''
        targets = self.targets
        if not isinstance(targets, str):
            # the job decodes this option with json.loads, so encode it here
            targets = json.dumps(targets)
        args = []
        if self.emr_defaults is True:
            args.extend(EMR_DEFAULT_PARAMS)
        args.extend(["--feature-dimensions", str(self.dimensions),
                     "--targets", targets,
                     "-r", self.emr_local,
                     "--output-dir", self.output_path,
                     "--no-output", self.input_path])
        return args

    def configure(self):
        '''
        Configures and runs the map reduce job that finds the parameters of
        Discriminant Analysis.
        '''
        # NOTE: relies on the module-level alias "gda" for the job module
        mrJobGDA = gda.GaussianDiscriminantAnalysisMR(self._job_args())
        with mrJobGDA.make_runner() as runner:
            runner.run()

    def load_params(self):
        ''' Loads job output into self.params from local disk or amazon s3 '''
        if self.emr_local == "local":
            self.params = self.local_load_params(self.output_path)
        else:
            self.params = self.s3_load_params(self.output_path)

    def s3_load_params(self, s3_path):
        '''
        Loads parameters if they are on amazon s3.

        Raises IOError if no non-empty object is found under the prefix.
        '''
        scheme = "s3://"
        # str.strip("s3://") removes characters, not the prefix, and could
        # eat leading "s" or "3" characters of the bucket name
        if s3_path.startswith(scheme):
            s3_path = s3_path[len(scheme):]
        bucket_name, _, key_prefix = s3_path.partition("/")
        if getattr(self, "conn", None) is None:
            # connection was never created before; open it on first use
            self.conn = S3Connection(ACCESS_KEY, SECRET_KEY)
        mybucket = self.conn.get_bucket(bucket_name)
        params = None
        for s3key in mybucket.list(prefix=key_prefix):
            if s3key.size > 0:
                params = json.loads(s3key.get_contents_as_string())
        if params is None:
            raise IOError("No non-empty output found under " + s3_path)
        return params

    def local_load_params(self, local_path):
        '''
        Loads parameters if they are on local machine.

        Reads non-empty "part-" files of the output directory (the last one
        in sorted order wins) without changing the working directory.
        Raises IOError if there is no such file.
        '''
        data = None
        for filename in sorted(os.listdir(local_path)):
            full_path = os.path.join(local_path, filename)
            if "part-" in filename and os.path.getsize(full_path) > 0:
                with open(full_path, "r") as in_file:
                    data = json.load(in_file)
        if data is None:
            raise IOError("No non-empty part- file found in " + str(local_path))
        return data

    def posterior_probs(self, method="QDA"):
        '''
        Get class probability (not implemented yet).

        method - (str) can have two values either 'QDA' or 'LDA'
        '''
        if method not in ("QDA", "LDA"):
            raise ValueError("method should be either 'QDA' or 'LDA'")
        raise NotImplementedError("posterior_probs is not implemented")
def extract_features(line):
    '''
    Extracts data from line of input.

    Expected line format: <id>,<target>,<feature_1>,...,<feature_d>

    Output:
    -------
    (str, [float]) - target value as text and list of numeric features
    '''
    data = line.strip().split(",")
    # features are accumulated numerically by the mapper, so parse them here
    return data[1], [float(e) for e in data[2:]]


def matrix_to_list(input_data):
    ''' Transforms iterable of rows (e.g. numpy matrix) into list of lists '''
    return [list(e) for e in input_data]


class DimensionalityMismatchError(Exception):
    ''' Error when dimensionalities do not match '''

    def __init__(self, expected, real):
        super(DimensionalityMismatchError, self).__init__(expected, real)
        self.exp = expected
        self.real = real

    def __str__(self):
        error = "Expected number of dimensions: " + str(self.exp) + " observed: " + str(self.real)
        return error


class TargetValueError(Exception):
    ''' Error for target values '''

    def __init__(self, observed):
        super(TargetValueError, self).__init__(observed)
        self.observed = observed

    def __str__(self):
        # attribute is "observed"; the former "self.e" did not exist
        error = "Observed value " + str(self.observed) + " is not target value"
        return error
class GaussianDiscriminantAnalysisMR(MRJob):
    '''
    Calculates parameters required for Linear Discriminant Analysis and
    Quadratic Discriminant Analysis.


    Command Line Options:
    ---------------------

    --feature-dimensions  - dimensionality of features (dependent variables)
    --targets             - list of all valid target values (json-encoded list)

    Output:
    -------
    dictionary with keys "total" (number of observations), "class counts",
    "means" (one mean vector per class) and "covariates" (one scatter matrix
    per class: sum of outer products centered by the class mean, not divided
    by the class count)
    '''

    INPUT_PROTOCOL = RawValueProtocol

    INTERNAL_PROTOCOL = JSONProtocol

    OUTPUT_PROTOCOL = JSONValueProtocol

    def __init__(self, *args, **kwargs):
        super(GaussianDiscriminantAnalysisMR, self).__init__(*args, **kwargs)
        # like the original code this relies on load_options() having set
        # self.dim and self.targets (raw json string) by now
        self.targets = json.loads(self.targets)
        self.k = len(self.targets)
        self.priors = [0] * self.k
        self.means = [np.zeros(self.dim) for _ in range(self.k)]
        self.covariate = [np.zeros([self.dim, self.dim]) for _ in range(self.k)]
        self.total = 0
        # targets are read from input as text, so index them by text form
        self.target_to_index = {}
        for i, target in enumerate(self.targets):
            self.target_to_index[str(target)] = i
        self.target_set = set(self.target_to_index)

    def configure_options(self):
        ''' Registers --feature-dimensions and --targets options '''
        super(GaussianDiscriminantAnalysisMR, self).configure_options()
        self.add_passthrough_option("--feature-dimensions",
                                    type=int,
                                    help="dimensionality of features")
        self.add_passthrough_option("--targets",
                                    type=str,
                                    help="targets")

    def load_options(self, args):
        ''' Loads options and checks that both of them are provided '''
        super(GaussianDiscriminantAnalysisMR, self).load_options(args)
        # option "--feature-dimensions" is stored as "feature_dimensions"
        if self.options.feature_dimensions is None:
            self.option_parser.error("You must specify dimensionality of data")
        else:
            self.dim = self.options.feature_dimensions
        if self.options.targets is None:
            self.option_parser.error("You must specify targets")
        else:
            self.targets = self.options.targets

    def mapper_gda(self, _, line):
        '''
        Calculates and summarises intermediate values for each mapper.
        (Intermediate values include number of observations in each class,
        total number of observations etc.)

        Raises DimensionalityMismatchError or TargetValueError on bad input.
        '''
        y, features = extract_features(line)
        n = len(features)
        # error if dimensionalities do not match
        if n != self.dim:
            raise DimensionalityMismatchError(self.dim, n)
        # validate target before it is used for lookup
        y = str(y).strip()
        if y not in self.target_set:
            raise TargetValueError(y)
        index = self.target_to_index[y]
        x = np.array(features, dtype=float)
        self.total += 1
        self.means[index] += x
        self.covariate[index] += np.outer(x, x)
        self.priors[index] += 1

    def mapper_final_gda(self):
        '''Outputs data summarised for each mapper to reducer'''
        yield 1, {"total": self.total,
                  "class counts": self.priors,
                  "means": [mean.tolist() for mean in self.means],
                  "covariates": [cov.tolist() for cov in self.covariate]}

    def reducer_gda_parameters(self, key, parameters):
        ''' Summarises intermediate values produced by each mapper to get final parameters '''
        total = 0
        counts = [0] * self.k
        sums = np.zeros([self.k, self.dim])
        outer_sums = np.zeros([self.k, self.dim, self.dim])
        # summarise parameters produced by each mapper
        for parameter in parameters:
            total += parameter["total"]
            counts = [a + b for a, b in zip(counts, parameter["class counts"])]
            sums += np.array(parameter["means"], dtype=float)
            outer_sums += np.array(parameter["covariates"], dtype=float)
        # calculate final parameters
        means, covariates = [], []
        for i in range(self.k):
            # a class without observations keeps zero mean and zero scatter
            mu = sums[i] / counts[i] if counts[i] else sums[i]
            scatter = outer_sums[i] - counts[i] * np.outer(mu, mu)
            means.append(mu.tolist())
            covariates.append(scatter.tolist())
        yield None, {"total": total,
                     "class counts": counts,
                     "means": means,
                     "covariates": covariates}

    def steps(self):
        ''' Single map reduce step '''
        return [MRStep(mapper=self.mapper_gda,
                       mapper_final=self.mapper_final_gda,
                       reducer=self.reducer_gda_parameters)]


if __name__ == "__main__":
    GaussianDiscriminantAnalysisMR.run()
def cholesky_solution_linear_regression(x_t_x, x_t_y):
    '''
    Solves the normal equations (x_t_x * Theta = x_t_y) of linear regression
    through Cholesky decomposition x_t_x = L * L.T

    Parameters:
    -----------
    x_t_x - numpy array of size 'm x m', sum of outer products of explanatory variables
    x_t_y - numpy array of size m, sum of products of explanatory and dependent variable

    Output:
    -------
    Theta - numpy array of size m, values of regression coefficients
    '''
    lower = np.linalg.cholesky(x_t_x)
    # forward step: lower * z = x_t_y
    intermediate = np.linalg.solve(lower, x_t_y)
    # backward step: lower.T * Theta = z
    return np.linalg.solve(lower.T, intermediate)


class DimensionMismatchError(Exception):
    ''' Raised when an input line has unexpected number of explanatory variables '''

    def __init__(self, expected, observed):
        self.exp = expected
        self.obs = observed

    def __str__(self):
        parts = ["Expected number of dimensions: ", str(self.exp),
                 ", observed: ", str(self.obs)]
        return "".join(parts)
class LinearRegressionTS(MRJob):
    '''
    Calculates x_t_x (sum of outer products of explanatory variables) and
    x_t_y (sum of products of explanatory variables and dependent variable)
    in single map reduce pass and then uses cholesky decomposition to
    obtain values of regression parameters.


    Important!!! Since final computations are performed on single reducer,
    assumption is that dimensionality of data is relatively small i.e. input
    matrix is tall and skinny.


    Input File:
    -----------

    Extract relevant features from input line by changing extract_variables
    method. You can add features for non-linear models (like x^2 or exp(x)).
    Current code assumes following input line format:

    input line = y, x_1, ..., x_n

    Options:
    -----------

    --dimension  - (int) number of explanatory variables
    --bias       - (str) bias term is included only if value is 'True'

    Output:
    -----------
    list of parameters (coefficient of bias term, if included, is the last one)
    '''

    INPUT_PROTOCOL = RawValueProtocol

    INTERNAL_PROTOCOL = JSONProtocol

    OUTPUT_PROTOCOL = RawValueProtocol

    def __init__(self, *args, **kwargs):
        super(LinearRegressionTS, self).__init__(*args, **kwargs)
        n = self._n_coefficients()
        self.x_t_x = np.zeros([n, n])
        self.x_t_y = np.zeros(n)
        self.counts = 0

    #--------------------------- helpers -------------------------------------#

    def _include_bias(self):
        ''' True if bias term was requested (compared by value, not identity) '''
        return self.options.bias == "True"

    def _n_coefficients(self):
        ''' Number of regression coefficients: explanatory variables plus bias '''
        return self.options.dimension + (1 if self._include_bias() else 0)

    #--------------------------- feature extraction --------------------------#

    def extract_variables(self, line):
        ''' (str)--(float,[float,float,float...])
        Extracts set of relevant features. (Needs to be rewritten depending
        on file input structure)
        '''
        data = [float(e) for e in line.strip().split(",")]
        y, features = data[0], data[1:]
        return (y, features)

    #---------------------------- Options ------------------------------------#

    def configure_options(self):
        ''' Additional options'''
        super(LinearRegressionTS, self).configure_options()
        self.add_passthrough_option("--dimension",
                                    type=int,
                                    help="Number of explanatory variables (do not count bias term)")
        self.add_passthrough_option("--bias",
                                    type=str,  # (got error when tried to define bool) ???
                                    help="Bias term, bias not included if anything other than 'True' ",
                                    default="True")

    def load_options(self, args):
        ''' Loads and checks whether options are provided'''
        super(LinearRegressionTS, self).load_options(args)
        if self.options.dimension is None:
            self.option_parser.error("You should define number of explanatory variables")
        else:
            self.dim = self.options.dimension

    #------------------------ map-reduce steps -------------------------------#

    def mapper_lr(self, _, line):
        '''
        Calculates x_t_x and x_t_y for data processed by each mapper
        '''
        y, features = self.extract_variables(line)
        if len(features) != self.dim:
            raise DimensionMismatchError(self.dim, len(features))
        if self._include_bias():
            # constant column, accumulators are sized for it in __init__
            features.append(1.0)
        x = np.array(features)
        self.x_t_x += np.outer(x, x)
        self.x_t_y += y * x
        self.counts += 1

    def mapper_lr_final(self):
        '''
        Transforms numpy arrays x_t_x and x_t_y into json-encodable list format
        and sends to reducer
        '''
        yield 1, ("x_t_x", self.x_t_x.tolist())
        yield 1, ("x_t_y", self.x_t_y.tolist())
        yield 1, ("counts", self.counts)

    def reducer_lr(self, key, values):
        '''
        Aggregates results produced by each mapper and obtains x_t_x and x_t_y
        for all data, then using cholesky decomposition obtains parameters of
        linear regression.
        '''
        n = self._n_coefficients()
        x_t_x = np.zeros([n, n])
        x_t_y = np.zeros(n)
        for name, data in values:
            if name == "x_t_x":
                x_t_x += np.array(data)
            elif name == "x_t_y":
                x_t_y += np.array(data)
            # "counts" records are not needed to solve the normal equations
        betas = cholesky_solution_linear_regression(x_t_x, x_t_y)
        yield None, [e for e in betas]

    def steps(self):
        '''Defines map-reduce steps '''
        return [MRStep(mapper=self.mapper_lr,
                       mapper_final=self.mapper_lr_final,
                       reducer=self.reducer_lr)]
class DimensionalityMismatch(Exception):
    ''' Error when number of extracted features differs from expected one '''

    def __init__(self, expected, real):
        super(DimensionalityMismatch, self).__init__(expected, real)
        self.exp = expected
        self.real = real

    def __str__(self):
        error = "Dimensionality mismatch. " + "Expected: " + str(self.exp) + " real: " + str(self.real)
        return error


def extract_relevant_features(l, first=1, last=5):
    '''
    Extracts quantitative features for which summary statistics should be calculated

    Input:
    ------

    l     - (str) comma separated line of input
    first - (int) index of first relevant column (default 1)
    last  - (int or None) index after last relevant column (default 5),
            None takes all remaining columns

    Output:
    -------
          - list of floats
    '''
    data = l.strip().split(",")
    return [float(e) for e in data[first:last]]


def kurtosis(p4, covariance, n):
    '''
    Calculates unbiased kurtosis (see Joanes and Gill (1998)).


    Input:
    ------

    p4         - list of size m, where each entry is sum of fourth powers of
                 centered feature
    covariance - two-dimensional list or numpy array of size m x m, sample
                 covariance matrix (only diagonal is used)
    n          - number of observations (formula divides by (n-2)*(n-3))

    Output:
    -------
               - list of floats, kurtosis of each feature

    [where m is dimensionality of data]
    '''
    # accept plain nested lists as promised above, not only numpy arrays
    variances = np.asarray(covariance, dtype=float).diagonal()
    kurtosis_standard = [(kurt / n) / ((n - 1) * variances[i] / n) ** 2 - 3 for i, kurt in enumerate(p4)]
    kurtosis_unbiased = [(kurt * (n + 1) + 6) * (n - 1) / (n - 2) / (n - 3) for kurt in kurtosis_standard]
    return kurtosis_unbiased


def skewed(p3, covariance, n):
    '''
    Calculates skewness

    Input:
    ------

    p3         - list of size m, where each entry is sum of cubes of centered feature
    covariance - two-dimensional list or numpy array of size m x m, sample
                 covariance matrix (only diagonal is used)
    n          - number of observations (formula divides by (n-2))

    Output:
    -------
               - list of floats, skewness of each feature

    [where m is dimensionality of data]
    '''
    variances = np.asarray(covariance, dtype=float).diagonal()
    correction = np.sqrt(n * (n - 1)) / (n - 2)
    return [correction * ((skew / n) / (((n - 1) * variances[i] / n) ** 1.5)) for i, skew in enumerate(p3)]
class MultivariateDescriptiveStatisticsMR(MRJob):
    '''
    Calculates descriptive statistics for multivariate dataset.

    Following statistics are calculated:

       - Covariance Matrix
       - Skewness of each variable (measure of asymmetry)
       - Kurtosis of each variable (measure of peakedness)
       - Minimum for each variable
       - Maximum for each variable
       - Mean for each variable

    Options:
    --------
    --dimensions  - (int) number of columns of data matrix

    Note: accuracy of results were compared on test results with corresponding
    functions in R (min,max,mean,cov,skewness[library(e1071)], kurtosis[library(e1071)])
    '''

    INPUT_PROTOCOL = RawValueProtocol

    INTERNAL_PROTOCOL = JSONProtocol

    OUTPUT_PROTOCOL = JSONValueProtocol

    def __init__(self, *args, **kwargs):
        super(MultivariateDescriptiveStatisticsMR, self).__init__(*args, **kwargs)
        d = self.dim
        self.n = 0
        # max and min are seeded from the first observation (see mapper_covar)
        self.max, self.min = None, None
        self.mean = [0] * d
        self.third_order, self.fourth_order = [0] * d, [0] * d
        self.covariates = np.zeros([d, d], dtype=np.float64)

    def configure_options(self):
        ''' Registers --dimensions option '''
        super(MultivariateDescriptiveStatisticsMR, self).configure_options()
        self.add_passthrough_option("--dimensions", type=int,
                                    help="Number of columns of data matrix")

    def load_options(self, args):
        ''' Loads and checks expected dimensionality '''
        super(MultivariateDescriptiveStatisticsMR, self).load_options(args)
        if self.options.dimensions is None:
            self.option_parser.error("You need specify expected dimensionlity")
        else:
            self.dim = self.options.dimensions

    def mapper_covar(self, _, line):
        '''
        Accumulates per-variable sums of powers, min, max and sum of outer
        products for lines processed by this mapper.
        '''
        # extract features that you want to analyse (module level helper)
        variables = extract_relevant_features(line)
        if len(variables) != self.dim:
            raise DimensionalityMismatch(self.dim, len(variables))
        if self.n == 0:
            # seeding with 0 would be wrong for all-positive / all-negative data
            self.max, self.min = list(variables), list(variables)
        else:
            # element-wise merge, one entry per variable
            self.max = [max(m, var) for m, var in zip(self.max, variables)]
            self.min = [min(m, var) for m, var in zip(self.min, variables)]
        self.n += 1
        self.mean = [s + var for s, var in zip(self.mean, variables)]
        self.third_order = [p + var ** 3 for p, var in zip(self.third_order, variables)]
        self.fourth_order = [p + var ** 4 for p, var in zip(self.fourth_order, variables)]
        x = np.array(variables)
        self.covariates += np.outer(x, x)

    def mapper_covar_final(self):
        ''' Sends summaries of this mapper to reducer (nothing if no input was seen) '''
        if self.n == 0:
            return
        yield 1, ("max", self.max)
        yield 1, ("min", self.min)
        yield 1, ("mean", self.mean)
        yield 1, ("observations", self.n)
        yield 1, ("third order", self.third_order)
        yield 1, ("fourth order", self.fourth_order)
        yield 1, ("covariates", self.covariates.tolist())

    def reducer_summarise(self, key, values):
        '''
        Combines summaries of all mappers into final statistics.
        Skewness needs more than 2 and kurtosis more than 3 observations
        (their formulas divide by n-2 and n-3).
        '''
        m = self.dim
        p1, p3, p4 = np.zeros(m), np.zeros(m), np.zeros(m)
        max_list, min_list = None, None
        covar_matr = np.zeros([m, m], dtype=np.float64)
        n = 0
        for name, data in values:
            if name == "max":
                max_list = data if max_list is None else [max(a, b) for a, b in zip(max_list, data)]
            elif name == "min":
                min_list = data if min_list is None else [min(a, b) for a, b in zip(min_list, data)]
            elif name == "mean":
                p1 += np.array(data, dtype=np.float64)
            elif name == "observations":
                n += data
            elif name == "third order":
                p3 += np.array(data, dtype=np.float64)
            elif name == "fourth order":
                p4 += np.array(data, dtype=np.float64)
            else:
                covar_matr += np.array(data)
        # vector of means
        means = p1 / n
        # sample covariance matrix (unbiased, denominator n-1)
        covariance = (covar_matr - np.outer(means, means) * n) / (n - 1)
        squares = covar_matr.diagonal()
        # fourth moment: calculate sum((x_i-mean(x))^4) by decomposing it
        central4 = p4 - 4 * means * p3 + 6 * (means ** 2) * squares - 4 * p1 * (means ** 3) + n * means ** 4
        # third moment: calculate sum((x_i-mean(x))^3) by decomposing it
        central3 = p3 - 3 * means * squares + 3 * (means ** 2) * p1 - n * means ** 3
        kurtosis_unbiased = kurtosis(central4, covariance, n)  # kurtosis for each variable
        skewness = skewed(central3, covariance, n)              # skewness for each variable
        summary_statistics = {"mean": means.tolist(),
                              "max": max_list,
                              "min": min_list,
                              "covariance": covariance.tolist(),
                              "skewness": [float(s) for s in skewness],
                              "kurtosis": [float(k) for k in kurtosis_unbiased],
                              "observations": n}
        yield None, summary_statistics

    def steps(self):
        ''' Single map reduce step '''
        return [MRStep(mapper=self.mapper_covar,
                       mapper_final=self.mapper_covar_final,
                       reducer=self.reducer_summarise)]
def multivar_gauss_pdf(x, mu, cov):
    '''
    Calculates the multivariate normal density (pdf)

    Parameters:
    -----------

    x   - numpy array, sample vector of length d
    mu  - numpy array, mean vector of length d
    cov - numpy array, "d x d" covariance matrix

    (where d - dimensionality of data)

    Output:
    -------
    - (float) density of x given parameters of
      Gaussian Distribution
    '''
    d = len(mu)
    diff = x - mu
    # Float literals are deliberate: with integer operands "d/2", "1/2" and
    # "-1/2" truncate to 0 / 0 / -1 under Python 2 division, which silently
    # drops the determinant term and doubles the exponent.
    norm_const = 1.0 / (((2 * np.pi) ** (d / 2.0)) * (np.linalg.det(cov) ** 0.5))
    exponent = -0.5 * np.dot(np.dot(diff.T, np.linalg.inv(cov)), diff)
    return float(norm_const * np.exp(exponent))
def make_json_encodable(mixing, means, covar):
    '''
    Packs GMM parameters into a dictionary of plain nested lists.

    Parameters:
    -----------

    mixing - list of size k (returned as is)
    means  - sequence of size k of mean vectors (each of size d)
    covar  - sequence of size k of two dimensional "d x d" matrices

    (where d is dimensionality and k is number of clusters)

    Output:
    --------
    - dictionary with parameter names as keys
      {"mu": list of mean vectors, "mixing": list of mixing coefficients,
       "covariance": list of covariance matrices}
    '''
    def rows_as_lists(matrix):
        # turn every row of a 2-d structure into a plain list
        return [list(row) for row in matrix]

    return {
        "mixing": mixing,
        "mu": rows_as_lists(means),
        "covariance": [rows_as_lists(matrix) for matrix in covar],
    }
99 | 100 | Command Line Options: 101 | --------------------- 102 | 103 | --clusters - number of clusters 104 | --dimensions - dimensionality of data 105 | --parameters - (str)json encoded dictionary of parameters 106 | 107 | ''' 108 | INPUT_PROTOCOL = RawValueProtocol 109 | 110 | INTERNAL_PROTOCOL = JSONProtocol 111 | 112 | OUTPUT_PROTOCOL = JSONValueProtocol 113 | 114 | 115 | def __init__(self,*args,**kwargs): 116 | super(IterationGaussianMixtureMR,self).__init__(*args,**kwargs) 117 | # sum of responsibilities for each cluster & number of observations 118 | self.resp_sum = [0]*self.clusters 119 | self.N = 0 120 | # sum of observations weighted by reponsibility 121 | self.resp_w_sum = [np.zeros(self.dim, dtype = np.float64) for i in range(self.clusters)] 122 | # sum of x_n*x_n_t (outer products) weighted by reponsibility 123 | self.resp_w_cov = [np.zeros([self.dim,self.dim], dtype = np.float64) for i in range(self.clusters)] 124 | 125 | 126 | def configure_options(self): 127 | super(IterationGaussianMixtureMR,self).configure_options() 128 | self.add_passthrough_option("--dimensions", 129 | type = int, 130 | help = "dimensionality of input data") 131 | self.add_passthrough_option("--clusters", 132 | type = int, 133 | help = "number of clusters") 134 | self.add_passthrough_option("--parameters", 135 | type = str, 136 | help = "file with parameters from previous iteration") 137 | 138 | 139 | def load_options(self,args): 140 | super(IterationGaussianMixtureMR,self).load_options(args) 141 | # number of clusters 142 | if self.options.clusters is None: 143 | self.option_parser.error("You need to specify number of clusters") 144 | else: 145 | self.clusters = self.options.clusters 146 | # data dimensionality 147 | if self.options.dimensions is None: 148 | self.option_parser.error("You need to specify dimensionality of data") 149 | else: 150 | self.dim = self.options.dimensions 151 | # filename where parameters from previous iteration are saved 152 | if 
def reducer_gmm(self, key, values):
    '''
    M-step: combines partial sums emitted by mappers into new parameters.

    Parameters:
    -----------

    key    - ignored (all mapper outputs share one key)
    values - iterable of (tag, payload) pairs where tag is one of
             "r_sum", "r_w_sum", "r_w_cov", "total"

    Output:
    -------
    - yields single pair (None, dictionary built by make_json_encodable
      from new mixing coefficients, mean vectors and covariance matrices)
    '''
    k, d = self.clusters, self.dim
    total = 0
    resp_totals = [0]*k
    weighted_sums = [np.zeros(d, dtype = np.float64) for _ in range(k)]
    weighted_outer = [np.zeros([d,d], dtype = np.float64) for _ in range(k)]
    for value in values:
        tag = value[0]
        if tag == "total":
            # number of observations seen by one mapper
            total += value[1]
        elif tag == "r_sum":
            # per-cluster sum of responsibilities
            resp_totals = [resp_totals[i] + gamma
                           for i, gamma in enumerate(value[1])]
        elif tag == "r_w_sum":
            # per-cluster responsibility-weighted sum of observations
            weighted_sums = [weighted_sums[i] + np.array(part, dtype = np.float64)
                             for i, part in enumerate(value[1])]
        elif tag == "r_w_cov":
            # per-cluster responsibility-weighted sum of outer products
            weighted_outer = [weighted_outer[i] + np.array(part)
                              for i, part in enumerate(value[1])]
    mixing = [float(r)/total for r in resp_totals]
    means = [1.0/resp_totals[i]*weighted_sums[i] for i in range(len(mixing))]
    covar = []
    for i, outer_sum in enumerate(weighted_outer):
        covar.append(1.0/resp_totals[i]*outer_sum - np.outer(means[i], means[i]))
    yield None, make_json_encodable(mixing, means, covar)
26,10.561822807236435,10.83323075405356 28 | 27,9.2071259783157,10.528013574304838 29 | 28,10.160689965561604,10.223508150118922 30 | 29,9.944167607205753,8.698019313721305 31 | 30,10.418525897310861,9.508779154752554 32 | 31,11.120817443963995,10.526129645503078 33 | 32,10.715534426540763,10.397645360599698 34 | 33,11.884621474828569,9.591125424962172 35 | 34,9.753256739952068,7.793189011004162 36 | 35,8.610407180918672,9.086115427816948 37 | 36,10.182228819882969,11.05460828030577 38 | 37,9.22004171784772,13.034324162182392 39 | 38,10.105060916793697,9.030958820657322 40 | 39,9.984382144505195,9.090686331618633 41 | 40,9.14584076992209,11.10574347078124 42 | 41,11.667246858733101,10.651320290924891 43 | 42,8.387128096090802,9.911586066454367 44 | 43,9.569749965923826,9.3198696923227 45 | 44,11.007806523828798,10.24401710795085 46 | 45,10.303312714665271,10.072895936758643 47 | 46,10.448145461879797,9.247585566175824 48 | 47,10.922918115688484,9.590316003046215 49 | 48,10.030315417591755,8.72522500978121 50 | 49,10.989067335990477,9.728763574444695 51 | 50,9.329076883745394,9.742913584392378 52 | 51,10.855959680936218,9.607031218854152 53 | 52,12.564674361928027,10.286343751607795 54 | 53,10.680096601143054,10.309299442484601 55 | 54,8.207501134810489,8.649429268806688 56 | 55,12.635714286197347,9.754864710759529 57 | 56,9.673623190778429,9.843122234554118 58 | 57,9.702296811534842,9.506997420081817 59 | 58,8.911439850481022,9.267704226862286 60 | 59,8.763327663498817,10.759310148880003 61 | 60,8.62727348693574,10.545296619643565 62 | 61,9.415086888513237,9.137469073371014 63 | 62,8.019619866898736,8.886766928729152 64 | 63,9.36039841330762,9.409304764940158 65 | 64,9.25652551394197,11.53250456130154 66 | 65,10.356262280657237,10.174496998482963 67 | 66,10.124887481939577,10.678484372823636 68 | 67,10.965042164271594,10.081409487431168 69 | 68,9.55914178689305,9.234841327714005 70 | 69,8.026250000117132,11.997631982536102 71 | 
70,10.158110248393465,10.197973159435845 72 | 71,9.96879270260045,8.860303686738954 73 | 72,8.518062564381752,10.346421467880075 74 | 73,11.06197615294631,7.884096686113358 75 | 74,9.232618420886938,10.44432023409442 76 | 75,11.392216252854075,8.96050039167426 77 | 76,10.006433651410077,10.24995144665943 78 | 77,12.667143703316873,9.134282367814839 79 | 78,11.603954043522453,8.539740058173635 80 | 79,10.355262924827707,8.669928973777699 81 | 80,10.956149496046644,10.727380151777268 82 | 81,10.000082207177442,9.396618090777519 83 | 82,7.8114012714492285,9.696754128537894 84 | 83,10.749626406618715,10.70574875199398 85 | 84,9.701361543451927,9.791164489571257 86 | 85,10.07347125434307,9.949578936363372 87 | 86,10.724510443122407,8.709959507973224 88 | 87,8.514139438635151,9.66622548748849 89 | 88,9.655807587832276,10.591180319460728 90 | 89,11.960772555805558,8.768099520357781 91 | 90,10.455532746084764,11.505113906159323 92 | 91,10.030375632597176,9.901535425762127 93 | 92,9.577196914896852,8.643308902918893 94 | 93,10.157136017565245,10.221475059279474 95 | 94,11.364906239045027,9.556142335894155 96 | 95,10.896812778945927,9.91561644826393 97 | 96,10.245244982261436,10.764809139625296 98 | 97,8.39193258315866,9.640255128428839 99 | 98,7.869045001915454,9.991951326864006 100 | 99,10.26083091858728,10.116602894392708 101 | 100,-0.5297583718585257,-0.9766838245549572 102 | 101,1.685487810801928,0.4690593655004617 103 | 102,1.7809185202827154,-0.7439786775379666 104 | 103,1.1226156612796272,2.9134825309600845 105 | 104,1.3181314573279175,1.1017444443125102 106 | 105,1.0002305215917755,0.3676127784931953 107 | 106,-0.09283765687821144,1.6429521521510844 108 | 107,-0.3734397328941841,2.4919262869094894 109 | 108,0.74909034650456,0.9174194143928245 110 | 109,-0.06954171822790656,0.9004799986160914 111 | 110,2.4284430344008783,0.8644971646698215 112 | 111,2.7003034087791864,2.942531275288033 113 | 112,2.781511946041195,1.5801982247405415 114 | 
113,0.25978472083564286,2.674189650036509 115 | 114,1.8496452444610467,-1.174074782719698 116 | 115,-0.49648249321947513,0.7047357435928405 117 | 116,0.9673605389654447,1.009104432432295 118 | 117,-0.32314315814211336,-0.3110807669353952 119 | 118,2.1469699080364455,1.824651046487534 120 | 119,2.435356635133539,0.12702031941034775 121 | 120,2.524093430061247,-0.3106723284205417 122 | 121,0.8023654873510608,0.2944377328052924 123 | 122,1.017321284932823,0.6016594179164722 124 | 123,0.9840254494892873,-1.4199134190354643 125 | 124,-0.10354819386142289,1.737815756012118 126 | 125,0.18135663167987104,0.8187655845081133 127 | 126,-0.1666601916661734,1.0860950270163892 128 | 127,0.5908566035060019,0.9032744653322091 129 | 128,-1.4714867055006766,0.8902193986657793 130 | 129,1.1322292621860297,1.4504295222159824 131 | 130,0.5293450533564301,-0.14111647426900653 132 | 131,1.8263994018772163,0.14759898379212988 133 | 132,0.4713996577723506,0.18480969293738347 134 | 133,0.787292022406012,0.041548064946067775 135 | 134,0.3454794065046819,0.8657779482465748 136 | 135,1.1889615755065692,1.5370544952810627 137 | 136,1.6705653540132208,0.007044771456957144 138 | 137,2.9768816992592875,-0.21634024261131213 139 | 138,2.8132642377181543,0.7493334397146709 140 | 139,-0.4044789721845281,1.6321443132997862 141 | 140,0.311719894635486,1.210677986394049 142 | 141,0.9102247891155003,2.382701362740132 143 | 142,1.5695809204463207,0.38040305525727125 144 | 143,0.05457798939602243,0.6217019644380728 145 | 144,0.8766024274446654,1.4869091380074162 146 | 145,1.8007261217921646,-0.48638861740203776 147 | 146,1.0951338224716824,-0.685573602091472 148 | 147,2.140139970401804,1.643810679135042 149 | 148,3.4325182211566556,1.126840239411029 150 | 149,0.5826258302859172,1.0556686155845936 151 | 150,0.7157401180441894,0.6547902187597557 152 | 151,-0.18784784715973646,0.7371184875274482 153 | 152,0.7243345172993352,1.734961417631381 154 | 153,2.3771733069305947,0.9470742401015901 155 | 
154,0.4373231601729354,-0.8154774884412332 156 | 155,1.6419317687384325,0.1469989176876788 157 | 156,3.18251973435166,1.4519198792187695 158 | 157,0.7339939245078062,1.1372312431533893 159 | 158,0.946826479458742,1.1673853088420154 160 | 159,-0.4729836563782419,0.6716417415293987 161 | 160,0.8058099574464997,1.1696399224064815 162 | 161,2.6992005810329625,0.961860491181949 163 | 162,2.024997298713787,0.47877091221083845 164 | 163,2.5522498791519723,0.4299324405759669 165 | 164,0.5739343916146544,1.0045080090663834 166 | 165,1.35666915326178,-0.31466601955910756 167 | 166,1.086892858146093,2.0282043623389705 168 | 167,0.18409795377386962,2.3238952591251962 169 | 168,1.8579770238270976,1.2466501538302264 170 | 169,0.8880465028789818,0.9429394426250503 171 | 170,-0.3042446613223604,1.6218734017784513 172 | 171,1.2776771984612543,1.5669997816789594 173 | 172,2.164027053169204,0.05835633015698949 174 | 173,1.8476180634546273,0.8844207041775076 175 | 174,0.3415336965964516,0.7136305542705158 176 | 175,0.14421723290445176,1.0419038358764148 177 | 176,1.0771991003978565,1.1787375554204311 178 | 177,1.0718366116671554,2.0878134493858393 179 | 178,1.912900508907145,0.15521291242922264 180 | 179,0.5419179803924743,1.5625313860604253 181 | 180,-0.23674560655990096,-1.4604933334122738 182 | 181,1.6301287880124806,1.7379035790834743 183 | 182,2.0181245229038662,1.7186889492383628 184 | 183,2.2211403440156756,1.27568034492345 185 | 184,0.2261954447122092,-0.026328670026463552 186 | 185,-1.1269921165847325,2.2453017550210497 187 | 186,1.3373540600165845,0.8121116770848708 188 | 187,2.7180850973527124,2.063279588018826 189 | 188,1.1303141762058857,0.9987257674598756 190 | 189,0.4264214885661777,1.1206481326126658 191 | 190,0.38194609797862955,1.3667009059911215 192 | 191,2.2129695096994952,3.4717953196325877 193 | 192,-0.8260429551038402,1.0451795456652038 194 | 193,0.8952315047770464,-0.2912604504024898 195 | 194,-0.5487026343993706,1.3653688919762004 196 | 
class Runner(object):
    """
    Drives a Gaussian Mixture Model fit as a sequence of MapReduce jobs.

    The initialisation job writes its parameters to "<output_path>_0";
    every EM iteration i reads the parameters of the previous step and
    writes new ones to "<output_path>_<i>".

    Parameters:
    -----------

    d                    - dimensionality of data
    k                    - number of expected clusters
    init_eps             - convergence threshold for K-means on initialisation step
    sample_size          - sample size passed to the initialisation job
    init_iteration_limit - limit for iterations for K-means on initial step
    iteration_eps        - convergence threshold for EM parameters
    em_iteration_limit   - maximum number of iterations of EM algorithm
    input_path           - path of input data
    output_path          - prefix of output directories
    emr_local            - either 'emr' or 'local'
    emr_defaults         - if True EMR_DEFAULT_PARAMS are prepended to job arguments
    """

    def __init__(self,d,k,init_eps,sample_size,init_iteration_limit,
                      iteration_eps,em_iteration_limit, input_path,
                      output_path,emr_local = "local", emr_defaults = False):
        self.dim = d                                      # dimensionality of data
        self.clusters = k                                 # number of expected clusters
        self.init_eps = init_eps                          # convergence threshold for K-means on initialisation step
        self.init_iteration_limit = init_iteration_limit  # limit for iterations for K-means on initial step
        self.iteration_eps = iteration_eps                # convergence threshold for EM parameter
        self.em_iteration_limit = em_iteration_limit      # maximum number of iterations of EM algorithm
        self.input_path = input_path
        self.output_path = output_path
        self.sample_size = sample_size
        self.emr_defaults = emr_defaults
        assert emr_local=='emr' or emr_local=='local', " 'emr_local' should be either 'emr' or 'local' "
        self.emr_local = emr_local
        # S3 connection is only needed (and only created) in 'emr' mode
        if self.emr_local == "emr":
            self.conn = S3Connection(aws_access_key_id = ACCESS_KEY,
                                     aws_secret_access_key = SECRET_KEY)


    ############### Initialisation of GMM parameters ##########################


    def config_and_run_init_step(self):
        '''
        Sets configuration parameters and runs initial step of GMM algorithm.
        By default job will run in 'local' mode
        '''
        # set configuration
        init_configs = ["--dimensions",str(self.dim),
                        "--sample-size",str(self.sample_size),
                        "--clusters",str(self.clusters),
                        "--iteration-limit",str(self.init_iteration_limit),
                        "--kmeans-convergence",str(self.init_eps),
                        "-r", self.emr_local,
                        "--output-dir","_".join([self.output_path,"0"]),
                        "--no-output",self.input_path]
        init_configs_new = []
        if self.emr_defaults is True:
            init_configs_new.extend(EMR_DEFAULT_PARAMS[:])
        init_configs_new.extend(init_configs)
        # start job
        mrJobInitStep = gmm_init.InitialiseGaussianMixtureMR(init_configs_new)
        with mrJobInitStep.make_runner() as runner:
            runner.run()


    ####################### Iterations of EM-algorithm ######################

    @staticmethod
    def delta_stop_iterate(old_params,new_params):
        '''
        Measures how far mean vectors moved between two iterations.

        Parameters:
        -----------
        old_params - dictionary of parameters with key "mu" (previous iteration)
        new_params - dictionary of parameters with key "mu" (current iteration)

        Output:
        -------
        - value returned by dist_tot for two lists of mean vectors
        '''
        mu_old = old_params["mu"]
        mu_new = new_params["mu"]
        return dist_tot(mu_new,mu_old)


    def iterate_em(self):
        '''
        Performs em iterations until movement of means drops to
        'iteration_eps' or iteration counter reaches 'em_iteration_limit'
        '''
        delta = 10
        # parameters produced by initialisation step
        old_params = self.load_params("_".join([self.output_path,str(0)]))
        iteration = 1
        while delta > self.iteration_eps and iteration < self.em_iteration_limit:
            self.config_and_run_iter_step(iteration, json.dumps(old_params))
            new_params = self.load_params("_".join([self.output_path,str(iteration)]))
            delta = self.delta_stop_iterate(old_params,new_params)
            iteration+=1
            old_params = new_params


    def config_and_run_iter_step(self,iteration, parameters):
        '''
        Configure parameters to run single iteration of EM algorithm
        (each iteration consists of E-step and M-step)

        Parameters:
        -----------
        iteration  - (int) iteration number, used as output directory suffix
        parameters - (str) json encoded parameters from previous iteration
        '''
        iter_configs = [ "--dimensions",str(self.dim),
                         "--clusters",str(self.clusters),
                         "--parameters", parameters,
                         "-r", self.emr_local,
                         "--output-dir","_".join([self.output_path,str(iteration)]),
                         "--no-output",self.input_path ]
        iter_configs_new = []
        if self.emr_defaults is True:
            iter_configs_new.extend(EMR_DEFAULT_PARAMS)
        iter_configs_new.extend(iter_configs)
        # start job
        mrJobIterStep = gmm_iterator.IterationGaussianMixtureMR(iter_configs_new)
        with mrJobIterStep.make_runner() as runner:
            runner.run()


    def load_params(self,path):
        ''' loads parameters either from local directory or from s3 '''
        if self.emr_local == "local":
            return self.local_load_params(path)
        return self.s3_load_params(path)


    @staticmethod
    def _split_s3_path(s3_path):
        ''' splits "s3://bucket/some/prefix" into ("bucket", "some/prefix") '''
        scheme = "s3://"
        # str.strip("s3://") removes any of the characters 's', '3', ':', '/'
        # from BOTH ends, so it would also eat a trailing "3" of an iteration
        # suffix or leading letters of a bucket name; cut the prefix instead.
        if s3_path.startswith(scheme):
            s3_path = s3_path[len(scheme):]
        parts = s3_path.split("/")
        return parts[0], "/".join(parts[1:])


    def s3_load_params(self,s3_path):
        '''
        load parameters if they are on amazon s3

        Returns parameters decoded from first non-empty key found under
        the prefix, or None if there is no such key.
        '''
        bucket_name, prefix = self._split_s3_path(s3_path)
        mybucket = self.conn.get_bucket(bucket_name) # connect to s3 bucket
        for s3key in mybucket.list(prefix = prefix):
            if mybucket.lookup(s3key).size > 0:
                return json.loads(s3key.get_contents_as_string())
        # NOTE(review): iterate_em does not handle a None result - confirm
        # whether a missing output should raise here instead.
        return None


    def local_load_params(self,local_path):
        '''
        load parameters if they are on local machine

        Reads non-empty files whose name contains "part-" inside
        'local_path' (if there are several, the last one listed wins).
        Raises IOError if no such file exists.
        '''
        data = None
        found = False
        # Paths are joined explicitly instead of os.chdir-ing into the
        # directory, so the working directory of the process is never
        # changed (and cannot be left changed if reading fails).
        for filename in os.listdir(local_path):
            if "part-" not in filename:
                continue
            full_path = os.path.join(local_path, filename)
            if os.path.getsize(full_path) > 0:
                with open(full_path,"r") as in_file:
                    data = json.load(in_file)
                found = True
        if not found:
            raise IOError("no non-empty 'part-' file found in " + str(local_path))
        return data


    def folder_cleanup(self):
        ''' placeholder, does nothing '''
        pass
class DimensionalityMismatchError(Exception):
    '''
    Raised when number of features in observation differs from expected one.

    Attributes: 'expected' (dimensionality that was configured) and
    'real' (dimensionality that was observed).
    '''

    def __init__(self, expected, real):
        self.real = real
        self.expected = expected

    def __str__(self):
        # %s applies str() to both values, same as explicit concatenation
        return "Expected dimensions: %s observed: %s" % (self.expected, self.real)
def extract_features(self, line):
    '''
    Splits comma separated line of input into dependent variable and features.

    First field is dropped, last field is returned unparsed (as string),
    fields in between are converted to floats.

    Output:
    -------
    - tuple (dependent variable, list of features)
    '''
    fields = line.strip().split(",")
    target = fields[-1]
    explanatory = list(map(float, fields[1:-1]))
    return (target, explanatory)
def reducer_knn(self, key, points):
    '''
    Aggregates mapper output and finds set of training points which are
    closest to each point that needs to be estimated. Then depending on
    estimation type ('classification' or 'regression') outputs estimate.

    Parameters:
    -----------

    key    - ignored (all mappers emit the same key)
    points - iterable of mapper outputs; each mapper output is a sequence of
             (point, neighbours) pairs, where neighbours is a sequence of
             (negated distance, features, dependent variable) observations

    Output:
    -------
    - yields (None, "<coordinates of point>,<estimate>") for every point
      that has at least one neighbour
    '''
    def closeness(observation):
        # first element is the negated distance: larger means closer
        return observation[0]

    # The accumulator lives OUTSIDE the loop over mapper outputs; resetting
    # it per mapper meant only the last mapper's neighbours were ever used.
    merged = {}
    for mapper_neighbors in points:
        for point, neighbours in mapper_neighbors:
            point = tuple(point)  # list keys arrive from JSON; need hashable
            candidates = merged.get(point, []) + list(neighbours)
            # keep only the n closest seen so far; nlargest always
            # terminates, unlike a pop-only-if-better loop over a heap
            merged[point] = heapq.nlargest(self.n_neighbours, candidates,
                                           key = closeness)
    for point, neighbours in merged.items():
        if not neighbours:
            # nothing to estimate from (no training observations at all)
            continue
        # regression
        if self.options.knn_type == "regression":
            # average over neighbours actually found, which may be fewer
            # than requested when training set is small
            estimate = sum(float(obs[-1]) for obs in neighbours)/len(neighbours)
        # classification
        else:
            votes = {}
            for _neg_dist, _features, y in neighbours:
                votes[y] = votes.get(y,0) + 1
            estimate, _count = max(votes.items(), key = lambda item: item[-1])
        # format output
        output = list(point)
        output.append(estimate)
        yield None, ",".join([str(e) for e in output])
def extract_features(line):
    ''' extracts features from line of input (first field is skipped) '''
    data = line.strip().split(",")
    return [float(e) for e in data[1:]]


######################### K-means ###########################################


class KmeansInitGMM(object):
    '''
    K-means algorithm for clustering.

    Parameters:
    -----------

    clusters         - (int) number of expected clusters
    dim              - (int) dimensionality of input
    epsilon          - (float) convergence threshold for k-means
    iteration_limit  - (int) maximum number of iteration, where each
                             iteration consists of e_step and m_step
    data             - (list) list of raw input lines, features are
                              extracted from each line with extract_features

    '''

    def __init__(self, clusters, dim, epsilon, iteration_limit, data):
        self.k = clusters
        self.data = [extract_features(line) for line in data]
        self.m = dim
        self.r = [0]*len(data)  # vector of cluster assignments
        self.convergence_epsilon = epsilon
        self.iteration_limit = iteration_limit

    def loss(self):
        '''
        Calculates loss function of K-means
        J = sum_n[ sum_k [r_n_k*||x_n-mu_k||^2]]]
        '''
        J = 0.0
        for assignment, x in zip(self.r, self.data):
            diff = np.array(x) - np.array(self.clusters[assignment])
            J += np.dot(diff, diff)
        return J

    def initialise(self):
        ''' randomly chooses initial centroids from data points '''
        self.clusters = random.sample(self.data, self.k)

    def e_step(self):
        ''' E-step in K-means algorithm, finds assignment of points to centroids'''
        for n, data_point in enumerate(self.data):
            best_cluster = 0
            best_sq_dist = None
            for c, centroid in enumerate(self.clusters):
                # coordinate index must not reuse the name of the cluster
                # index: in Python 2 list comprehension variables leak into
                # the enclosing scope and would overwrite it
                sq_dist = sum([(data_point[j] - centroid[j])**2 for j in range(self.m)])
                if best_sq_dist is None or sq_dist < best_sq_dist:
                    best_sq_dist = sq_dist
                    best_cluster = c
            self.r[n] = best_cluster

    def m_step(self):
        ''' M-step in K-means algorithm, finds centroids that minimise loss function'''
        sums = [[0]*self.m for _ in range(self.k)]
        cluster_counts = [0]*self.k
        for assignment, x in zip(self.r, self.data):
            cluster_counts[assignment] += 1
            sums[assignment] = [sums[assignment][j] + x[j] for j in range(self.m)]
        updated = []
        for c in range(self.k):
            if cluster_counts[c]:
                updated.append([float(el)/cluster_counts[c] for el in sums[c]])
            else:
                # cluster without points: keep its previous centroid
                # instead of dividing by zero
                updated.append(list(self.clusters[c]))
        self.clusters = updated

    def run_k_means(self):
        '''
        Runs single pass of k-means algorithm
        '''
        self.initialise()                 # initialise clusters
        next_loss = self.loss()           # loss for initial clusters
        # guarantees that at least one iteration is performed
        prev_loss = next_loss + 2*self.convergence_epsilon
        iteration = 0
        while prev_loss - next_loss > self.convergence_epsilon and iteration < self.iteration_limit:
            self.e_step()
            self.m_step()
            prev_loss = next_loss
            next_loss = self.loss()
            iteration += 1

    def run(self, reruns = 10):
        '''
        Runs k-means several times and chooses parameters (mean vectors,
        point cluster allocation) from the k-means run with smallest value of
        loss function.

        (Since loss function is not convex, it is not guaranteed that parameters
        obtained from single k-means algorithm pass will give global minimum
        of k-means loss function)
        '''
        clusters = [[0]*self.m for i in range(self.k)]
        best_loss = None
        r = self.r
        for i in range(reruns):
            self.run_k_means()
            loss_new = self.loss()
            if best_loss is None or loss_new < best_loss:
                best_loss = loss_new
                # copy, since next rerun overwrites centroids and assignments
                clusters = [el[:] for el in self.clusters]
                r = self.r[:]
        self.final_r = r
        self.final_clusters = clusters

    def gmm_params(self):
        '''
        Calculates initial parameters for GMM based on cluster allocation of
        points in best K-means run (method run has to be called first).

        Returns dictionary with keys "mixing" (list of mixing coefficients),
        "mu" (list of mean vectors) and "covariance" (list of covariance
        matrices), all values are plain lists.
        '''
        total = 0
        counts = [0]*self.k
        covars = [np.zeros([self.m, self.m], dtype = np.float64) for i in range(self.k)]
        mu = [np.zeros(self.m, dtype = np.float64) for i in range(self.k)]
        for i, dp in enumerate(self.data):
            k = self.final_r[i]  # cluster
            x = np.array(dp, dtype = np.float64)
            counts[k] += 1
            total += 1
            mu[k] += x
            covars[k] += np.outer(x, x)
        for j, count in enumerate(counts):
            if count:
                mu[j] = mu[j]/count
                covars[j] = 1.0/count*(covars[j] - count*np.outer(mu[j], mu[j]))
            else:
                # empty cluster: mean and covariance are undefined, use the
                # k-means centroid and identity covariance so that the
                # component (with zero mixing weight) stays well defined
                mu[j] = np.array(self.final_clusters[j], dtype = np.float64)
                covars[j] = np.eye(self.m, dtype = np.float64)
        mixing = [float(p)/total for p in counts]
        return {"mixing": mixing,
                "mu": [vec.tolist() for vec in mu],
                "covariance": [mat.tolist() for mat in covars]}
def __init__(self, *args, **kwargs):
    '''
    Initialises underlying job and creates empty priority queue, that is
    later filled by mapper with (priority, line) pairs.
    '''
    super(InitialiseGaussianMixtureMR, self).__init__(*args, **kwargs)
    # heap with sampled lines of input
    self.pq = list()
def mapper_initialise_gmm(self, _, line):
    '''
    Randomly samples n lines of input (where n is sample_size option), by
    assigning random priority level to each line and keeping n lines of
    input with LARGEST priority level.

    Lines are kept in min-heap 'self.pq' as (priority, line) pairs, so that
    root of the heap is the retained line with the smallest priority, i.e.
    the one that is replaced first.
    '''
    # continuous priorities make ties (which would favour lines that were
    # seen earlier) practically impossible
    priority = random.random()
    if len(self.pq) < self.n:
        heapq.heappush(self.pq, (priority, line))
    # second check protects against empty heap when sample size is zero
    elif self.pq and self.pq[0][0] < priority:
        heapq.heapreplace(self.pq, (priority, line))
class PrioritySampler(object):
    '''
    Keeps 'sample_size' observations with largest priority.

    Each observation is a sequence whose first element is its priority.
    Retained observations are stored in min-heap 'self.sample', so root of
    the heap is the retained observation with the smallest priority.
    '''

    def __init__(self, sample_size):
        self.sample_size = sample_size
        self.sample = []

    def process_observation(self, observation):
        ''' adds observation to sample if its priority is large enough '''
        if len(self.sample) < self.sample_size:
            # pushing keeps heap invariant at every moment, so no separate
            # heapify step is needed once the sample is full
            heapq.heappush(self.sample, observation)
        # second check protects against empty heap when sample size is zero
        elif self.sample and observation[0] > self.sample[0][0]:
            heapq.heapreplace(self.sample, observation)

    def process_observations(self, observations):
        ''' processes each observation from iterable of observations '''
        for observation in observations:
            self.process_observation(observation)
def __init__(self, *args, **kwargs):
    '''
    Initialises job and per-mapper accumulators of summary statistics.

    Uses 'self.dim' and 'self.cv_size', which are set in load_options
    (presumably invoked by the parent constructor - confirm against mrjob
    version in use).
    '''
    super(RidgeRegression, self).__init__(*args, **kwargs)
    # scaling type is only available through parsed options
    if self.options.scaling == "max-min":
        # NOTE(review): zero starting values are only correct if features
        # take values on both sides of zero - confirm
        self.max = [0]*self.dim
        self.min = [0]*self.dim
    self.mu = [0]*self.dim
    self.y_av = 0.0
    # builtin float is used, since np.float alias was removed from numpy
    self.x_t_x = np.zeros([self.dim, self.dim], dtype = float)
    self.x_t_y = [0]*self.dim
    self.n = 0
    self.lambdas_cv = self.read_lambdas(self.options.cv_lambdas)
    # sample of observations for hold out cross validation
    self.sampler = PrioritySampler(self.cv_size)
@staticmethod
def read_lambdas(filename):
    '''
    Reads regularisation parameters from csv file.

    Values can be placed on a single line, on several lines or both;
    empty lines and empty fields are skipped.

    Returns list of floats in the order they appear in the file.
    '''
    with open(filename, "r") as csvfile:
        # csv.reader produces one list of fields per line, so fields have
        # to be flattened before conversion to float
        return [float(field) for row in csv.reader(csvfile)
                for field in row if field.strip()]
def estimate_params(self, data, lambda_ridge, scaling = None):
    '''
    Estimates parameters of ridge regression from aggregated statistics.

    Parameters:
    -----------

    data         - (dict) aggregated statistics with keys:
                      "x_t_x" - sum of outer products of feature vectors
                      "x_t_y" - sum of feature vectors multiplied by target
                      "mu"    - SUM of feature vectors (as accumulated by mappers)
                      "y_av"  - SUM of targets (as accumulated by mappers)
                      "n"     - number of observations
                      "max", "min" - feature-wise bounds (only for 'max-min')
    lambda_ridge - (float) regularisation parameter
    scaling      - (str) None, 'z-score' or 'max-min'

    Returns:
    --------

    dictionary with "bias_term" (mean of targets) and "theta" (list of
    coefficients for centred and, if scaling is used, scaled features)
    '''
    n = data["n"]
    if not n:
        raise ValueError("Can not estimate parameters without observations")
    n = float(n)
    xtx = np.array(data["x_t_x"], dtype = float)
    xty = np.array(data["x_t_y"], dtype = float)
    # mappers accumulate sums, so they need to be transformed into means
    mu = np.array(data["mu"], dtype = float)/n
    y_av = data["y_av"]/n
    beta_bias = y_av  # (bias term)
    # statistics of centred data: Xc'Xc and Xc'yc
    centred_xtx = xtx - n*np.outer(mu, mu)
    centred_xty = xty - n*y_av*mu
    if scaling is None:
        scale_vec = np.ones(self.dim)
    # 'z_score' is accepted for backward compatibility
    elif scaling in ("z-score", "z_score"):
        # inverse of vector of standard deviations
        scale_vec = 1.0/np.sqrt(np.diag(centred_xtx)/n)
    elif scaling == "max-min":
        scale_vec = 1.0/(np.array(data["max"], dtype = float) - np.array(data["min"], dtype = float))
    else:
        raise ValueError("Unknown scaling: %s" % scaling)
    # scaling column i by s_i multiplies element (i,j) of Xc'Xc by s_i*s_j,
    # which is element-wise (not matrix) product with outer(s,s)
    part_one = np.outer(scale_vec, scale_vec)*centred_xtx + lambda_ridge*np.eye(self.dim)
    part_two = scale_vec*centred_xty
    theta = cholesky_solution_least_squares(part_one, part_two)
    return {"bias_term": beta_bias, "theta": list(theta)}
def mapper_ridge_final(self):
    '''
    Outputs statistics accumulated by mapper and its sample for hold out
    cross validation.

    Yields:
    -------

    (None, ("stats", dict))        - dictionary with keys "mu", "x_t_x",
                                     "x_t_y", "y_av", "n" (and "max", "min"
                                     in case of 'max-min' scaling)
    (None, ("hold_out_cv", list))  - observations kept by sampler
    '''
    # transform matrix to json-encodable list of lists
    x_t_x = [[float(el) for el in row] for row in self.x_t_x]
    intermediate_stats = {"mu": self.mu,
                          "x_t_x": x_t_x,
                          "x_t_y": list(self.x_t_y),
                          "y_av": self.y_av,
                          "n": self.n
                          }
    if self.options.scaling == "max-min":
        intermediate_stats["max"] = self.max
        intermediate_stats["min"] = self.min
    yield None, ("stats", intermediate_stats)
    yield None, ("hold_out_cv", self.sampler.sample)