├── .gitignore ├── LICENSE ├── README.md ├── docs └── img │ └── ernest-workflow.png ├── examples ├── collect_data.sh ├── mllib_lr.py ├── mllib_rcv1.md └── rcv1-parsed.csv ├── expt_design.py ├── predictor.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Ernest: Efficient Performance Prediction for Advanced Analytics 2 | 3 | Ernest is a performance prediction framework for analytics jobs developed using frameworks like Apache Spark and run on cloud computing infrastructure. 4 | 5 | One of the main challenges in deploying large scale analytics applications in 6 | the cloud is choosing the right hardware configuration. 
Specifically, in Amazon 7 | EC2 or Google Compute Engine clusters, choosing the right instance type and the 8 | right number of instances can significantly improve performance or lower cost. 9 | 10 | Ernest is a performance prediction framework that helps address this problem. 11 | Ernest builds performance models based on the behavior of the job on small 12 | samples of data and then predicts its performance on larger datasets and cluster 13 | sizes. To minimize the time and resources spent in building a model, Ernest 14 | uses [optimal experiment design](https://en.wikipedia.org/wiki/Optimal_design), 15 | a statistical technique that allows us to collect as few training points as 16 | required. For more details, please see our [paper](http://shivaram.org/publications/ernest-nsdi.pdf) and 17 | [talk slides](http://shivaram.org/talks/ernest-nsdi-2016.pdf) from NSDI 2016. 18 | 19 | ### Installing Ernest 20 | 21 | The easiest way to install Ernest is by cloning this repository. 22 | 23 | Running Ernest requires installing [SciPy](http://scipy.org), [NumPy](http://numpy.org) and 24 | [CVXPY](http://www.cvxpy.org). An easy way to do this is using the `requirements.txt` file. 25 | 26 | ``` 27 | pip install -r requirements.txt 28 | ``` 29 | 30 | ### Using Ernest 31 | 32 | At a high level there are three main steps to use Ernest, as summarized in the following figure. 33 | 34 | 

35 | <img src="docs/img/ernest-workflow.png" alt="Ernest Workflow" width="600px" /> 36 | 

37 | 38 | These include: 39 | 40 | 1. Determining what sample data points to collect. To do this we will be using the experiment design 41 | module implemented in [expt_design.py](expt_design.py). This will return the set of training data points 42 | required to build a performance model. 43 | 2. Collecting running times for the set of training data points. These runs can be executed using the [Spark EC2 44 | scripts](http://github.com/amplab/spark-ec2), Amazon EMR, etc. 45 | 3. Building a performance model and using it for prediction. To do this we create a CSV file with the 46 | measurements from the previous step and use [predictor.py](predictor.py). 47 | 48 | For a more detailed walkthrough, see our [example](examples/mllib_rcv1.md) on building a 49 | performance model for Spark MLlib algorithms. 50 | 51 | ### Limitations, Work In Progress 52 | 53 | One of the key insights used by Ernest is that many machine learning workloads are 54 | iterative in nature and have predictable structure in terms of computation and communication. 55 | Thus we are able to run a few iterations of the job on small samples of data to build a performance 56 | model. However, this assumption may not be valid for all workloads. 57 | 58 | Further, to compare across instance types, we currently need to build a separate model for each instance 59 | type. We are working on developing new techniques to share performance models across instance types. 60 | -------------------------------------------------------------------------------- /docs/img/ernest-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/ernest/4a7359c2570684116504c9a47eecb1271cd08125/docs/img/ernest-workflow.png -------------------------------------------------------------------------------- /examples/collect_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function run_lr { 4 | mcs=$1 5 | scale=$2 6 | echo -n "Cores $mcs " 7 | /root/spark/bin/spark-submit --total-executor-cores $mcs ./mllib_lr.py $scale $mcs 2>&1 | grep "LR.*took" 8 | } 9 | 10 | run_lr 32 0.125000 11 | run_lr 4 0.015625 12 | run_lr 4 0.021382 13 | run_lr 12 0.050164 14 | run_lr 12 0.055921 15 | run_lr 12 0.061678 16 | run_lr 16 0.061678 17 | -------------------------------------------------------------------------------- /examples/mllib_lr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import time 5 | 6 | from pyspark import SparkContext 7 | from pyspark.sql import SQLContext 8 | 9 | from pyspark.ml.classification import LogisticRegression 10 | 11 | if __name__ == "__main__": 12 | if len(sys.argv) > 1: 13 | sample_frac = float(sys.argv[1]) 14 | else: 15 | sample_frac = 1.0 16 | 17 | if len(sys.argv) > 2: 18 | num_parts = int(sys.argv[2]) 19 | else: 20 | num_parts = 256 21 | 22 | sc = SparkContext(appName="LogisticRegressionWithElasticNet") 23 | sc.setLogLevel("WARN") 24 | sqlContext = SQLContext(sc) 25 | 26 | # Load training data 27 | training = sqlContext.read.format("libsvm").load("s3n://ernest-data/rcv1_test_256.binary") 28 | training = training.sample(False, sample_frac).coalesce(num_parts) 29 | training.cache().count() 30 | 31 | lr = LogisticRegression(maxIter=10, elasticNetParam=0.8) 32 | 33 | start = time.time() 34 | # Fit the model 35 | lrModel = lr.fit(training) 36 | end = time.time() 37 | 38 | print "LR sample: ", sample_frac, " took ", (end-start) 39 | 
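40 | # Note: collect_data.sh filters this script's output with grep "LR.*took", 41 | # so the format of the timing line above should be kept in sync with it.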
-------------------------------------------------------------------------------- /examples/mllib_rcv1.md: -------------------------------------------------------------------------------- 1 | ## Example of running Ernest using Apache Spark ML 2 | 3 | This document presents an example of using Ernest to build a performance 4 | model for binary classification using Logistic Regression implemented in [Spark 5 | ML](http://spark.apache.org/mllib). 6 | 7 | ### Step 1: Dataset and Experiment Design 8 | 9 | For this example we will use the [RCV1 10 | dataset](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#rcv1.binary) from the 11 | LibSVM repository. A pre-processed version of the dataset (after converting negative labels to 0 as 12 | required by MLlib) is available at `s3://ernest-data/rcv1_test_256.binary`. 13 | 14 | The first step in using Ernest is to use the Experiment Design module to figure out what training 15 | data points need to be collected. To do this we can run the following command: 16 | ``` 17 | python expt_design.py --min-parts 4 --max-parts 32 --total-parts 256 --min-mcs 1 --max-mcs 8 --cores-per-mc 4 18 | ``` 19 | 20 | In the above case we choose the minimum and maximum number of data partitions that will be used for 21 | collecting training data and also set the maximum number of machines we wish to use. Finally, since 22 | this tutorial uses `r3.xlarge` instances, we set `cores-per-mc` to 4. 23 | 24 | The output from running this command looks something like: 25 | ``` 26 | Machines, Cores, InputFraction, Partitions, Weight 27 | 8, 32, 0.125000, 32, 1.000000 28 | 1, 4, 0.015625, 4, 1.000000 29 | 1, 4, 0.021382, 6, 1.000000 30 | ... 31 | ``` 32 | 33 | This table shows the training data points we will collect next. 34 | 35 | ### Step 2: Data collection 36 | 37 | To collect training data we launch an 8-node cluster of r3.xlarge machines. We can use existing tools 38 | like [spark-ec2](https://github.com/amplab/spark-ec2) to do this. 39 | 40 | ``` 41 | ./spark-ec2 -s 8 -t r3.xlarge -i <key_file> -k <key_name> --copy-aws-credentials --spark-version 1.6.2 launch ernest-demo 42 | ``` 43 | 44 | Once the cluster is up, we next run our target application with the sampling fractions and machine 45 | sizes listed above. An example for Logistic Regression with RCV1 is in the file 46 | [mllib_lr.py](mllib_lr.py) and a corresponding script to run this for various configurations is in 47 | [collect_data.sh](collect_data.sh). One important thing to note here is that we only run 10 48 | iterations of the algorithm, as that is sufficient for building a model. While training on the 49 | complete data, the number of iterations and other parameters can be tweaked. 50 | 51 | After we collect the necessary data, we put it together in a CSV file to feed into the model builder. 52 | For the above example the [CSV file](rcv1-parsed.csv) looks as follows: 53 | ``` 54 | #Cores,Input Fraction, Time (s) 55 | 32,0.125,7.94516801834 56 | 4,0.015625,4.72029209137 57 | 4,0.021382,4.87661099434 58 | ... 59 | ``` 60 | 61 | ### Step 3: Model Building 62 | 63 | Our last step is to build the performance model using the collected data and then use it to predict 64 | behavior on larger clusters and data sizes. 
To do this we can run the predictor with a command that 65 | looks like: 66 | ``` 67 | python predictor.py rcv1-parsed.csv 68 | ``` 69 | This prints the predicted time taken to process the entire dataset when using up to 60 machines, and 70 | the output for this case looks like: 71 | ``` 72 | Machines, Predicted Time 73 | 4 44.6515640166 74 | 8 25.4777295249 75 | 12 19.36348049 76 | 16 16.4412832993 77 | 20 14.7682298198 78 | 24 13.7061636865 79 | 28 12.9855393036 80 | ... 81 | ``` 82 | 83 | What we see is that the model predicts that as we go from 16 to 28 machines, the performance wins 84 | are limited, as the time for 10 iterations only drops from 16.4s to 12.99s. This is because RCV1 is a 85 | very small dataset, and at larger cluster sizes we spend more time on communication than on 86 | parallel computation. [Our paper](http://shivaram.org/publications/ernest-nsdi.pdf) contains more 87 | examples. 88 | -------------------------------------------------------------------------------- /examples/rcv1-parsed.csv: -------------------------------------------------------------------------------- 1 | #Cores,Input Fraction, Time (s) 2 | 32,0.125,7.94516801834 3 | 4,0.015625,4.72029209137 4 | 4,0.021382,4.87661099434 5 | 12,0.050164,6.71376490593 6 | 12,0.055921,6.71519398689 7 | 12,0.061678,6.6739718914 8 | 16,0.061678,6.83566999435 9 | -------------------------------------------------------------------------------- /expt_design.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cvxpy as cvx 3 | import argparse 4 | 5 | class ExperimentDesign(object): 6 | ''' 7 | Represents an experiment design object that can be used to set up 8 | and run experiment design. 9 | ''' 10 | 11 | MIN_WEIGHT_FOR_SELECTION = 0.3 12 | 13 | def __init__(self, parts_min, parts_max, total_parts, 14 | mcs_min=1, mcs_max=16, cores_per_mc=2, budget=10.0, 15 | num_parts_interpolate=20): 16 | ''' 17 | Create an experiment design instance. 
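Example (illustrative values, matching the walkthrough in examples/mllib_rcv1.md): ExperimentDesign(4, 32, 256, mcs_min=1, mcs_max=8, cores_per_mc=4).run()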
18 | 19 | :param self: The object being created 20 | :type self: ExperimentDesign 21 | :param parts_min: Minimum number of partitions to use in experiments 22 | :type parts_min: int 23 | :param parts_max: Maximum number of partitions to use in experiments 24 | :type parts_max: int 25 | :param total_parts: Total number of partitions in the dataset 26 | :type total_parts: int 27 | :param mcs_min: Minimum number of machines to use in experiments 28 | :type mcs_min: int 29 | :param mcs_max: Maximum number of machines to use in experiments 30 | :type mcs_max: int 31 | :param cores_per_mc: Cores or slots available per machine 32 | :type cores_per_mc: int 33 | :param budget: Budget for the experiment design problem 34 | :type budget: float 35 | :param num_parts_interpolate: Number of points to interpolate between parts_min and parts_max 36 | :type num_parts_interpolate: int 37 | ''' 38 | self.parts_min = parts_min 39 | self.parts_max = parts_max 40 | self.total_parts = total_parts 41 | self.mcs_min = mcs_min 42 | self.mcs_max = mcs_max 43 | self.cores_per_mc = cores_per_mc 44 | self.budget = budget 45 | self.num_parts_interpolate = num_parts_interpolate 46 | 47 | def _construct_constraints(self, lambdas, points): 48 | '''Constrain each lambda to [0, 1] and the total cost to the budget''' 49 | constraints = [] 50 | constraints.append(0 <= lambdas) 51 | constraints.append(lambdas <= 1) 52 | constraints.append(self._get_cost(lambdas, points) <= self.budget) 53 | return constraints 54 | 55 | def _get_cost(self, lambdas, points): 56 | '''Estimate the cost of an experiment as a lambda-weighted sum; each point costs (input_frac / min_input_frac) / machines''' 57 | cost = 0 58 | num_points = len(points) 59 | scale_min = float(self.parts_min) / float(self.total_parts) 60 | for i in xrange(0, num_points): 61 | scale = points[i][0] 62 | mcs = points[i][1] 63 | cost = cost + (float(scale) / scale_min * 1.0 / float(mcs) * lambdas[i]) 64 | return cost 65 | 66 | def _get_training_points(self): 67 | '''Enumerate all the candidate training points; keep only points where each core gets at least one partition''' 68 | mcs_range = xrange(self.mcs_min, self.mcs_max + 1) 69 | 70 | scale_min = float(self.parts_min) / float(self.total_parts) 71 | scale_max = float(self.parts_max) / float(self.total_parts) 72 | scale_range = np.linspace(scale_min, scale_max, self.num_parts_interpolate) 73 | 74 | for scale in scale_range: 75 | for mcs in mcs_range: 76 | if np.round(scale * self.total_parts) >= self.cores_per_mc * mcs: 77 | yield [scale, mcs] 78 | 79 | def _frac2parts(self, fraction): 80 | '''Convert input fraction into number of partitions''' 81 | return int(np.ceil(fraction * self.total_parts)) 82 | 83 | def run(self): 84 | ''' Run experiment design. 
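This solves an A-optimal experiment design problem: minimize the trace of the inverse of the lambda-weighted information matrix (computed as the sum of cvx.matrix_frac terms in _construct_objective), subject to 0 <= lambda <= 1 and the budget constraint, and keep the points whose weight exceeds MIN_WEIGHT_FOR_SELECTION.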
Returns a list of configurations and their scores''' 85 | training_points = list(self._get_training_points()) 86 | num_points = len(training_points) 87 | 88 | all_training_features = np.array([_get_features(point) for point in training_points]) 89 | covariance_matrices = list(_get_covariance_matrices(all_training_features)) 90 | 91 | lambdas = cvx.Variable(num_points) 92 | 93 | objective = cvx.Minimize(_construct_objective(covariance_matrices, lambdas)) 94 | constraints = self._construct_constraints(lambdas, training_points) 95 | 96 | problem = cvx.Problem(objective, constraints) 97 | 98 | opt_val = problem.solve() 99 | # TODO: Add debug logging 100 | # print "solution status ", problem.status 101 | # print "opt value is ", opt_val 102 | 103 | filtered_lambda_idxs = [] 104 | for i in range(0, num_points): 105 | if lambdas[i].value > self.MIN_WEIGHT_FOR_SELECTION: 106 | filtered_lambda_idxs.append((lambdas[i].value, i)) 107 | 108 | sorted_by_lambda = sorted(filtered_lambda_idxs, key=lambda t: t[0], reverse=True) 109 | return [(self._frac2parts(training_points[idx][0]), training_points[idx][0], 110 | training_points[idx][1], l) for (l, idx) in sorted_by_lambda] 111 | 112 | def _construct_objective(covariance_matrices, lambdas): 113 | ''' Constructs the CVX objective function. ''' 114 | num_points = len(covariance_matrices) 115 | num_dim = int(covariance_matrices[0].shape[0]) 116 | objective = 0 117 | matrix_part = np.zeros([num_dim, num_dim]) 118 | for j in xrange(0, num_points): 119 | matrix_part = matrix_part + covariance_matrices[j] * lambdas[j] 120 | 121 | for i in xrange(0, num_dim): 122 | k_vec = np.zeros(num_dim) 123 | k_vec[i] = 1.0 124 | objective = objective + cvx.matrix_frac(k_vec, matrix_part) 125 | 126 | return objective 127 | 128 | def _get_covariance_matrices(features_arr): 129 | ''' Returns a list of covariance matrices given expt design features''' 130 | col_means = np.mean(features_arr, axis=0) 131 | means_inv = (1.0 / col_means) 132 | nrows = features_arr.shape[0] 133 | for i in xrange(0, nrows): 134 | feature_row = features_arr[i,] 135 | ftf = np.outer(feature_row.transpose(), feature_row) 136 | yield np.diag(means_inv).transpose().dot(ftf.dot(np.diag(means_inv))) 137 | 138 | def _get_features(training_point): 139 | ''' Compute the features for a given point. 
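The four features mirror the Ernest model terms: a fixed serial cost, computation that scales with input_frac/machines, a cost linear in the number of machines, and a log(machines) term for tree-structured aggregation.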
Point is expected to be [input_frac, machines]''' 140 | scale = training_point[0] 141 | mcs = training_point[1] 142 | return [1.0, float(scale) / float(mcs), float(mcs), np.log(mcs)] 143 | 144 | 145 | if __name__ == "__main__": 146 | parser = argparse.ArgumentParser(description='Experiment Design') 147 | 148 | parser.add_argument('--min-parts', type=int, required=True, 149 | help='Minimum number of partitions to use in experiments') 150 | parser.add_argument('--max-parts', type=int, required=True, 151 | help='Maximum number of partitions to use in experiments') 152 | parser.add_argument('--total-parts', type=int, required=True, 153 | help='Total number of partitions in the dataset') 154 | 155 | parser.add_argument('--min-mcs', type=int, required=True, 156 | help='Minimum number of machines to use in experiments') 157 | parser.add_argument('--max-mcs', type=int, required=True, 158 | help='Maximum number of machines to use in experiments') 159 | 160 | parser.add_argument('--cores-per-mc', type=int, default=2, 161 | help='Number of cores or slots available per machine (default 2)') 162 | parser.add_argument('--budget', type=float, default=10.0, 163 | help='Budget of the experiment design problem (default 10.0)') 164 | parser.add_argument('--num-parts-interpolate', type=int, default=20, 165 | help='Number of points to interpolate between min_parts and max_parts (default 20)') 166 | 167 | args = parser.parse_args() 168 | 169 | ex = ExperimentDesign(args.min_parts, args.max_parts, args.total_parts, 170 | args.min_mcs, args.max_mcs, args.cores_per_mc, args.budget, 171 | args.num_parts_interpolate) 172 | 173 | expts = ex.run() 174 | print "Machines, Cores, InputFraction, Partitions, Weight" 175 | for expt in expts: 176 | print "%d, %d, %f, %d, %f" % (expt[2], expt[2] * args.cores_per_mc, expt[1], expt[0], expt[3]) 177 | -------------------------------------------------------------------------------- /predictor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | from scipy.optimize import nnls 4 | import csv 5 | import sys 6 | 7 | class Predictor(object): 8 | 9 | def __init__(self, training_data_in=[], data_file=None): 10 | ''' 11 | Initialize the Predictor with some training data 12 | The training data should be a list of [mcs, input_fraction, time] 13 | ''' 14 | self.training_data = [] 15 | self.training_data.extend(training_data_in) 16 | if data_file: 17 | with open(data_file, 'rb') as csvfile: 18 | reader = csv.reader(csvfile, delimiter=' ') 19 | for row in reader: 20 | if row[0][0] != '#': 21 | parts = row[0].split(',') 22 | mc = int(parts[0]) 23 | scale = float(parts[1]) 24 | time = float(parts[2]) 25 | self.training_data.append([mc, scale, time]) 26 | 27 | def add(self, mcs, input_fraction, time): 28 | self.training_data.append([mcs, input_fraction, time]) 29 | 30 | def predict(self, mcs, input_fraction): 31 | ''' 32 | Predict running time for a given number of machines and input fraction. 33 | ''' 34 | test_features = np.array(self._get_features([mcs, input_fraction])) 35 | return test_features.dot(self.model[0]) 36 | 37 | def predict_all(self, test_data): 38 | ''' 39 | Predict running time for a batch of (machines, input_fraction) configurations. 
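For example (illustrative values): predict_all([[8, 1.0], [16, 1.0]]) predicts the times for 8 and 16 machines on the full dataset.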
40 | Input test_data should be a list where every element is [machines, input_fraction] 41 | ''' 42 | test_features = np.array([self._get_features([row[0], row[1]]) for row in test_data]) 43 | return test_features.dot(self.model[0]) 44 | 45 | def fit(self): 46 | print "Fitting a model with ", len(self.training_data), " points" 47 | labels = np.array([row[2] for row in self.training_data]) 48 | data_points = np.array([self._get_features(row) for row in self.training_data]) 49 | self.model = nnls(data_points, labels) 50 | # TODO: Add a debug logging mode ? 51 | # print "Residual norm ", self.model[1] 52 | # print "Model ", self.model[0] 53 | # Calculate training error as the ratio of predicted to actual time 54 | training_errors = [] 55 | for p in self.training_data: 56 | predicted = self.predict(p[0], p[1]) 57 | training_errors.append(predicted / p[2]) 58 | 59 | training_errors = [str(np.around(i*100, 2)) + "%" for i in training_errors] 60 | print "Prediction ratios (predicted/actual) are", ", ".join(training_errors) 61 | return self.model[0] 62 | 63 | def num_examples(self): 64 | return len(self.training_data) 65 | 66 | def _get_features(self, training_point): 67 | mc = training_point[0] 68 | scale = training_point[1] 69 | return [1.0, float(scale) / float(mc), float(mc), np.log(mc)] 70 | 71 | if __name__ == "__main__": 72 | if len(sys.argv) != 2: 73 | print "Usage: python predictor.py <training_data.csv>" 74 | sys.exit(1) 75 | 76 | pred = Predictor(data_file=sys.argv[1]) 77 | 78 | model = pred.fit() 79 | 80 | test_data = [[i, 1.0] for i in xrange(4, 64, 4)] 81 | 82 | predicted_times = pred.predict_all(test_data) 83 | print 84 | print "Machines, Predicted Time" 85 | for i in xrange(0, len(test_data)): 86 | print test_data[i][0], predicted_times[i] 87 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cvxpy==0.2.22 2 | numpy==1.9.2 3 | scipy==0.15.1 4 | --------------------------------------------------------------------------------