├── .gitignore ├── LICENSE ├── README.md ├── data ├── a9a │ ├── a9a_123d_test.dummy │ ├── a9a_123d_train.dense │ ├── a9a_123d_train.dummy │ ├── a9a_123d_train.libsvm │ └── a9a_123d_train_trans.libsvm └── abalone │ ├── abalone_8d_train.dense │ └── abalone_8d_train.libsvm ├── docs └── img │ ├── bo.png │ ├── feature_synthesis.png │ └── grid_vs_random.png ├── pom.xml └── src ├── main └── scala │ ├── com │ └── tencent │ │ └── angel │ │ └── spark │ │ └── automl │ │ ├── AutoConf.scala │ │ ├── HelloWorld.java │ │ ├── feature │ │ ├── DataLoader.scala │ │ ├── FeatureUtils.scala │ │ ├── InToOutRelation.scala │ │ ├── PipelineBuilder.scala │ │ ├── PipelineDriver.scala │ │ ├── PipelineWrapper.scala │ │ ├── TransformerWrapper.scala │ │ ├── UserProfileLoader.scala │ │ ├── cross │ │ │ ├── FeatureCross.scala │ │ │ ├── FeatureCrossMeta.scala │ │ │ └── FeatureCrossOp.scala │ │ ├── examples │ │ │ ├── FeatureCrossSelectorExample.scala │ │ │ ├── FeatureEngineeringExample.scala │ │ │ └── VectorReIndexZeroExample.scala │ │ ├── preprocess │ │ │ ├── BuckerizerWrapper.scala │ │ │ ├── Components.scala │ │ │ ├── FPreprocess.scala │ │ │ ├── HashingTFWrapper.scala │ │ │ ├── IDFWrapper.scala │ │ │ ├── MinMaxScalerWrapper.scala │ │ │ ├── PCAWrapper.scala │ │ │ ├── Sampler.scala │ │ │ ├── SamplerWrapper.scala │ │ │ ├── StandardScalerWrapper.scala │ │ │ ├── StopWordsRemoverWrapper.scala │ │ │ ├── StringIndexerWrapper.scala │ │ │ ├── TPreprocess.scala │ │ │ ├── TokenizerWrapper.scala │ │ │ └── Word2VecWrapper.scala │ │ ├── select │ │ │ ├── ChiSqSelectorWrapper.scala │ │ │ └── FeatureSelector.scala │ │ └── transform │ │ │ └── FTransform.scala │ │ ├── tuner │ │ ├── TunerParam.scala │ │ ├── acquisition │ │ │ ├── Acquisition.scala │ │ │ ├── EI.scala │ │ │ ├── UCB.scala │ │ │ └── optimizer │ │ │ │ ├── AcqOptimizer.scala │ │ │ │ ├── LocalSearch.scala │ │ │ │ └── RandomSearch.scala │ │ ├── config │ │ │ ├── Configuration.scala │ │ │ ├── ConfigurationSpace.scala │ │ │ └── EarlyStopping.scala │ │ ├── kernel │ │ │ ├── Covariance.scala │ │ │ ├── CovarianceType.scala │ │ │ ├── Matern3.scala │ │ │ ├── Matern5.scala │ │ │ ├── Matern5Iso.scala │ │ │ └── SquareExpIso.scala │ │ ├── math │ │ │ ├── BreezeOp.scala │ │ │ └── SquareDist.scala │ │ ├── model │ │ │ ├── GPExample.scala │ │ │ ├── GPKernelDiffFunc.scala │ │ │ └── GPModel.scala │ │ ├── parameter │ │ │ ├── ContinuousSpace.scala │ │ │ ├── DiscreteSpace.scala │ │ │ ├── ParamParser.scala │ │ │ └── ParamSpace.scala │ │ ├── solver │ │ │ ├── Solver.scala │ │ │ └── SolverWithTrail.scala │ │ ├── surrogate │ │ │ ├── GPSurrogate.scala │ │ │ ├── NormalSurrogate.scala │ │ │ ├── RFSurrogate.scala │ │ │ ├── Surrogate.scala │ │ │ └── SurrogateMode.scala │ │ └── trail │ │ │ ├── TestRunner.scala │ │ │ ├── TestTrail.scala │ │ │ ├── Trail.scala │ │ │ └── TrailRunner.scala │ │ └── utils │ │ ├── ArgsUtil.scala │ │ ├── AutoMLException.scala │ │ ├── DataUtils.scala │ │ └── Distribution.scala │ └── org │ └── apache │ └── spark │ └── ml │ └── feature │ └── operator │ ├── FtestSelector.scala │ ├── LassoSelector.scala │ ├── MetadataTransformUtils.scala │ ├── RandomForestSelector.scala │ ├── SelfCartesian.scala │ ├── VarianceSelector.scala │ ├── VectorCartesian.scala │ └── VectorReIndexNonZero.scala └── test └── scala └── com └── tencent └── angel └── spark └── automl ├── BreezeOpTest.scala ├── FeatureCrossTest.scala ├── FeatureEngineeringTest.scala ├── FeatureSelectorTest.scala ├── GPModelTest.scala ├── MetadataTest.scala ├── PipelineTest.scala ├── SquareDistTest.scala └── TunerTest.scala /.gitignore: 
--------------------------------------------------------------------------------
1 | .idea/*
2 | .DS_Store
3 | */.DS_Store
4 | *.class
5 | *.log
6 | target/*
7 | tmp/*
8 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoML
2 | 
3 | Angel's automatic machine learning toolkit.
4 | 
5 | Angel-AutoML provides automatic hyper-parameter tuning and feature engineering operators.
6 | It is developed in Scala.
7 | As a stand-alone library, Angel-AutoML can be easily integrated into Java and Scala projects.
8 | 
9 | We welcome everyone interested in machine learning to contribute code, create issues, or open pull requests. Please refer to the [Angel Contribution Guide](https://github.com/Tencent/angel/blob/master/CONTRIBUTING.md) for more details.
10 | 
11 | ## Hyper-parameter tuning
12 | 
13 | ### Strategies
14 | Angel-AutoML provides three tuning strategies: grid search, random search, and Bayesian optimization.
15 | 
16 | ![Grid search and random search](docs/img/grid_vs_random.png)
17 | 
18 | ![Bayesian optimization](docs/img/bo.png)
19 | 
20 | - **Grid search** divides the search space into equally sized grids, under the assumption that the hyper-parameters are uniformly distributed.
21 | Though intuitive, grid search has two significant drawbacks: 1) the computing cost grows exponentially with the number of hyper-parameters;
22 | and 2) in real cases the distributions of hyper-parameters are usually not uniform.
23 | Thus, grid search often spends great effort optimizing hyper-parameters that matter little.
24 | - **Random search** randomly samples a sequence of hyper-parameter combinations from the configuration space
25 | and evaluates the sampled combinations.
26 | Though this approach is more likely to spend its budget on the important hyper-parameters,
27 | there is still no guarantee of finding the optimal combination.
28 | - **Bayesian optimization (BO)** differs from these model-free methods.
29 | It treats the tuning problem as a black-box function, where the input is a hyper-parameter combination
30 | and the output is a model metric such as accuracy or AUC.
31 | BO uses a cheap surrogate function to approximate the unknown target function.
32 | The surrogate function produces a probabilistic mean and variance for a given hyper-parameter combination.
33 | Then, an acquisition function evaluates the expected improvement of candidate combinations,
34 | and the combination with the highest expected improvement is chosen for the next evaluation.
35 | This suggest-evaluate-feedback process iterates until convergence.
36 | Such a probabilistic approach enables Bayesian optimization to find the optimum with far fewer evaluations of the target function.
37 | 
38 | For BO, Angel-AutoML implements a series of surrogate functions and acquisition functions.
39 | - **Surrogate functions**: Gaussian process and random forest.
40 | We also implement EM+LBFGS to optimize the hyper-parameters of the Gaussian process kernel functions.
41 | - **Acquisition functions**: Probability of Improvement (PI), Expected Improvement (EI), and Upper Confidence Bound (UCB).
42 | 
43 | ### Usage
44 | 
45 | The tuning component of Angel-AutoML provides easy-to-use interfaces.
46 | Users can integrate it into their programs with fewer than 10 lines of code.
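The individual steps are described below. Put together, the whole tuning loop looks like the following minimal sketch. Note the assumptions: the import paths are inferred from the source tree above, the dummy random objective stands in for real model training, and the iteration count of 20 is arbitrary.
```scala
import com.tencent.angel.spark.automl.tuner.config.Configuration
import com.tencent.angel.spark.automl.tuner.parameter.ParamSpace
import com.tencent.angel.spark.automl.tuner.solver.Solver

// Stand-in objective for illustration only: replace with real model
// training that returns one metric value per suggested configuration.
def evaluate(configs: Array[Configuration]): Array[Double] =
  configs.map(_ => scala.util.Random.nextDouble())

// 1. Define the hyper-parameter space.
val param1 = ParamSpace.fromConfigString("param1", "[1,10]")    // continuous
val param2 = ParamSpace.fromConfigString("param2", "{1:10:1}")  // discrete

// 2. Create the solver: minimize the metric, use the random surrogate.
val solver: Solver = Solver(Array(param1, param2), true, surrogate = "Random")

// 3-5. Suggest, evaluate, and feed back until the budget is exhausted.
(0 until 20).foreach { _ =>
  val configs: Array[Configuration] = solver.suggest()
  val results: Array[Double] = evaluate(configs)
  solver.feed(configs, results)
}
```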
47 | 
48 | - **Define the hyper-parameter space.**
49 | Supported formats for a discrete hyper-parameter: {v1,v2,v3,v4} or {start:end:step}.
50 | ```scala
51 | val param1 = ParamSpace.fromConfigString("param1", "{1.0,2.0,3.0,4.0,5.0}")
52 | val param2 = ParamSpace.fromConfigString("param2", "{1:10:1}")
53 | ```
54 | Supported formats for a continuous hyper-parameter: [start,end] or [start:end:num_of_elements].
55 | ```scala
56 | val param1 = ParamSpace.fromConfigString("param1", "[1,10]")
57 | val param2 = ParamSpace.fromConfigString("param2", "[1:10:10]")
58 | ```
59 | - **Create a solver for hyper-parameter tuning.**
60 | The first param is the array of hyper-parameters defined above.
61 | The second param indicates whether the goal is to minimize the metric.
62 | The third param defines the surrogate (Random, Grid, or GaussianProcess).
63 | ```scala
64 | val solver: Solver = Solver(Array(param1, param2), true, surrogate = "Random")
65 | ```
66 | - **Let the solver suggest a batch of hyper-parameter combinations.**
67 | The default batch size is 100. You can change it via TunerParam.setBatchSize().
68 | ```scala
69 | val configs: Array[Configuration] = solver.suggest()
70 | ```
71 | - **Evaluate the objective function with the suggested hyper-parameter combinations.**
72 | ```scala
73 | val results: Array[Double] = objective.evaluate(configs)
74 | ```
75 | - **Feed the results back to the solver.**
76 | ```scala
77 | solver.feed(configs, results)
78 | ```
79 | - Jump back to Step 3 and iterate until convergence.
80 | 
81 | ## Feature engineering
82 | 
83 | Feature engineering, such as feature selection and feature synthesis, is of great importance in industrial applications of machine learning.
84 | Angel-AutoML implements useful feature engineering operators on top of Spark MLlib.
85 | They can be easily assembled into a Spark pipeline.
86 | 
87 | ### Feature selection
88 | 
89 | Since the feature selection operators in Spark MLlib are not sufficient,
90 | we enhance Spark by adding two categories of operators.
91 | - Statistic-based operators, including VarianceSelector and FtestSelector.
92 | - Model-based operators, including LassoSelector and RandomForestSelector.
93 | 
94 | ### Feature synthesis
95 | 
96 | A majority of online recommendation systems choose linear models, such as logistic regression,
97 | as their machine learning model because of their high throughput and low latency.
98 | But logistic regression requires manual feature synthesis to achieve high accuracy,
99 | which makes automatic feature synthesis essential.
100 | However, existing automatic feature synthesis methods simply generate high-order cross features by Cartesian product,
101 | incurring the curse of dimensionality.
102 | Therefore, we propose Automatic Feature Synthesis (AFS), an iterative approach to generating high-order features.
103 | 
104 | ![Automatic feature synthesis](docs/img/feature_synthesis.png)
105 | 
106 | In AFS, each iteration is composed of two stages:
107 | - Amplification stage: Cartesian product of arbitrary features.
108 | - Reduction stage: feature selection and feature re-indexing.
109 | 
110 | The above figure shows one AFS iteration (see FeatureCrossSelectorExample.scala below for a runnable pipeline):
111 | - The features are first amplified through a **Cartesian product operator**.
112 | The number of features increases quadratically after this step.
113 | - Next, the most important features are selected from the previous step by a **feature selector operator** (e.g. VarianceSelector or RandomForestSelector).
114 | - Then, the selected features are re-indexed to reduce the feature space by a **feature re-index operator**. 115 | - Finally, the generated features and the original features are concatenated by a **vector assembler operator**. 116 | -------------------------------------------------------------------------------- /docs/img/bo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Angel-ML/automl/1097b718ded332640da17d790d42fdd87ade41b8/docs/img/bo.png -------------------------------------------------------------------------------- /docs/img/feature_synthesis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Angel-ML/automl/1097b718ded332640da17d790d42fdd87ade41b8/docs/img/feature_synthesis.png -------------------------------------------------------------------------------- /docs/img/grid_vs_random.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Angel-ML/automl/1097b718ded332640da17d790d42fdd87ade41b8/docs/img/grid_vs_random.png -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/AutoConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl 20 | 21 | object AutoConf { 22 | 23 | object Preprocess { 24 | 25 | val ML_DATA_INPUT_FORMAT = "ml.data.format" 26 | val DEFAULT_ML_DATA_INPUT_FORMAT = "libsvm" 27 | 28 | val ML_DATA_SPLITOR = "ml.data.splitor" 29 | val DEFAULT_ML_DATA_SPLITOR = "\\s+" 30 | 31 | val INPUT_TYPE = "ml.input.type" 32 | val DEFAULT_INPUT_TYPE = "normal" 33 | 34 | val SAMPLE_RATE = "ml.sample.rate" 35 | val DEFAULT_SAMPLE_RATE = "1.0" 36 | 37 | val IMBALANCE_SAMPLE = "ml.imbalance.sample" 38 | val DEFAULT_IMBALANCE_SAMPLE = "false" 39 | 40 | val HAS_DISCRETER = "ml.has.discreter" 41 | val DEFAULT_HAS_DISCRETER = "false" 42 | 43 | val HAS_ONEHOTER = "ml.has.onehoter" 44 | val DEFAULT_HAS_ONEHOTER = "false" 45 | 46 | val HAS_MINMAXSCALAR = "ml.has.minmaxscalar" 47 | val DEFAULT_HAS_MINMAXSCALAR = "true" 48 | 49 | val HAS_STANDARDSCALAR = "ml.has.standardscalar" 50 | val DEFAULT_HAS_STANDARDSCALAR = "false" 51 | 52 | } 53 | 54 | } 55 | 56 | class AutoConf {} 57 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/HelloWorld.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 
3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | package com.tencent.angel.spark.automl; 18 | 19 | /** 20 | * @author Jeremy Jiang 21 | * @version 0.1.0-SNAPSHOT 22 | */ 23 | public class HelloWorld { 24 | 25 | public static String AUTHOR = "Jeremy Jiang"; 26 | public static String VERSION = "0.1.0-SNAPSHOT"; 27 | 28 | public static void main(String[] argv) { 29 | System.out.println("Welcome to Angel automl subproject!"); 30 | System.out.println("AUTHOR: " + AUTHOR); 31 | System.out.println("Version: " + VERSION); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/DataLoader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature 20 | 21 | import org.apache.spark.sql.{DataFrame, SparkSession} 22 | 23 | abstract class DataLoader(ss: SparkSession) { 24 | def load(input: String, separator: String): DataFrame 25 | 26 | def load(input: String): DataFrame = load(input, " ") 27 | } 28 | 29 | case class LibSVMDataLoader(ss: SparkSession) extends DataLoader(ss) { 30 | override def load(input: String, separator: String): DataFrame = { 31 | ss.read.format("libsvm").load(input) 32 | } 33 | } 34 | 35 | case class CSVDataLoader(ss: SparkSession) extends DataLoader(ss) { 36 | override def load(input: String, separator: String): DataFrame = { 37 | ss.read.csv(input) 38 | } 39 | } 40 | 41 | case class JSONDataLoader(ss: SparkSession) extends DataLoader(ss) { 42 | override def load(input: String, separator: String): DataFrame = { 43 | ss.read.json(input) 44 | } 45 | } 46 | 47 | case class DocumentDataLoader(ss: SparkSession) extends DataLoader(ss) { 48 | override def load(input: String, separator: String): DataFrame = { 49 | ss.createDataFrame( 50 | ss.sparkContext.textFile(input).map(Tuple1.apply) 51 | ).toDF("sentence") 52 | } 53 | } 54 | 55 | case class LabeledDocumentDataLoader(ss: SparkSession) extends DataLoader(ss) { 56 | override def load(input: String, separator: String): DataFrame = { 57 | require(separator.equals(","), 58 | "the label and sentence should be separated by comma") 59 | ss.createDataFrame( 60 | ss.sparkContext.textFile(input) 61 | .map { line => 62 | val splits = line.split(separator) 63 | (splits(0), splits(1)) 64 | }) 65 | .toDF("label", "sentence") 66 | } 67 | 68 | override def load(input: String): DataFrame = load(input, ",") 69 | } 70 | 71 | case class SimpleDataLoader(ss: SparkSession) extends DataLoader(ss) { 72 | override def load(input: String, separator: String): DataFrame = { 73 | ss.createDataFrame( 74 | ss.sparkContext.textFile(input) 75 | .map(_.split(separator)).map(Tuple1.apply) 76 | ).toDF("features") 77 | } 78 | } 79 | 80 | case class LabeledSimpleDataLoader(ss: SparkSession) extends DataLoader(ss) { 81 | override def load(input: String, separator: String): DataFrame = { 82 | ss.createDataFrame( 83 | ss.sparkContext.textFile(input) 84 | .map { line => 85 | val splits = line.split(separator) 86 | (splits.head, splits.tail) 87 | } 88 | ).toDF("label", "features") 89 | } 90 | } 91 | 92 | 93 | object DataLoader { 94 | 95 | def load(ss: SparkSession, 96 | format: String, 97 | input: String, 98 | separator: String = " "): DataFrame = { 99 | format match { 100 | case "libsvm" => LibSVMDataLoader(ss).load(input) 101 | case "csv" => CSVDataLoader(ss).load(input) 102 | case "json" => JSONDataLoader(ss).load(input) 103 | case "document" => DocumentDataLoader(ss).load(input, separator) 104 | case "label-document" => LabeledDocumentDataLoader(ss).load(input, separator) 105 | case "simple" => SimpleDataLoader(ss).load(input, separator) 106 | case "label-simple" => LabeledSimpleDataLoader(ss).load(input, separator) 107 | case _ => SimpleDataLoader(ss).load(input, separator) 108 | } 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/FeatureUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 | * compliance with the License. You may obtain a copy of the License at
8 | *
9 | * https://opensource.org/licenses/Apache-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software distributed under the License
12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 | * or implied. See the License for the specific language governing permissions and limitations under
14 | * the License.
15 | *
16 | */
17 | 
18 | package com.tencent.angel.spark.automl.feature
19 | 
20 | import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
21 | import org.apache.spark.sql.{Dataset, Row}
22 | 
23 | import scala.language.postfixOps
24 | 
25 | object FeatureUtils {
26 | 
27 |   def maxDim(dataset: Dataset[Row], col: String = "features"): Int = {
28 |     dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
29 |       // find the largest feature index in this partition; the final +1
30 |       // converts the maximum index into a dimension
31 |       val dim = rows.map { case Row(v: Vector) =>
32 |         v match {
33 |           case sv: SparseVector => sv.indices.last
34 |           case dv: DenseVector => dv.size - 1
35 |         }
36 |       }.max
37 |       Iterator(dim)
38 |     }.max + 1
39 |   }
40 | 
41 |   def countNonZero(dataset: Dataset[Row], col: String = "features"): Array[Int] = {
42 |     dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
43 |       val mergeIndices = rows.map { case Row(v: Vector) =>
44 |         v match {
45 |           case sv: SparseVector =>
46 |             sv.indices.toList
47 |           case dv: DenseVector =>
48 |             // collect the indices of non-zero entries in a dense vector
49 |             dv.values.indices.filter(dv.values(_) != 0.0).toList
50 |         }
51 |       }.reduce(_ union _ distinct)
52 |       Iterator(mergeIndices)
53 |     }.reduce((a, b) => (a union b).distinct).toArray
54 |   }
55 | 
56 | }
--------------------------------------------------------------------------------
/src/main/scala/com/tencent/angel/spark/automl/feature/InToOutRelation.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Tencent is pleased to support the open source community by making Angel available.
3 | *
4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 | * compliance with the License. You may obtain a copy of the License at
8 | *
9 | * https://opensource.org/licenses/Apache-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software distributed under the License
12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 | * or implied. See the License for the specific language governing permissions and limitations under
14 | * the License.
15 | *
16 | */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.feature
20 | 
21 | object InToOutRelation extends Enumeration {
22 | 
23 |   type InToOutRelation = Value
24 | 
25 |   val Fixed = Value("Fixed")
26 |   val InPlace = Value("InPlace")
27 |   val OneToOne = Value("OneToOne")
28 |   val MultiToMulti = Value("MultiToMulti")
29 |   val MultiToOne = Value("MultiToOne")
30 | 
31 | }
--------------------------------------------------------------------------------
/src/main/scala/com/tencent/angel/spark/automl/feature/PipelineBuilder.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Tencent is pleased to support the open source community by making Angel available.
3 | *
4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 | * compliance with the License. You may obtain a copy of the License at
8 | *
9 | * https://opensource.org/licenses/Apache-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software distributed under the License
12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 | * or implied. See the License for the specific language governing permissions and limitations under
14 | * the License.
15 | *
16 | */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.feature
20 | 
21 | import org.apache.spark.SparkException
22 | import org.apache.spark.ml.PipelineStage
23 | 
24 | import scala.collection.mutable
25 | import scala.collection.mutable.ArrayBuffer
26 | 
27 | class IncompatibleFieldException(msg: String) extends SparkException(msg) {}
28 | 
29 | object PipelineBuilder {
30 | 
31 |   def build(transformers: Array[TransformerWrapper]): Array[PipelineStage] = {
32 |     val stages: ArrayBuffer[PipelineStage] = new ArrayBuffer[PipelineStage]()
33 |     val allInputCols: mutable.HashSet[String] = new mutable.HashSet[String]()
34 | 
35 |     // the first transformer is fed its required input cols directly
36 |     transformers(0).setInputCols(transformers(0).requiredInputCols)
37 |     transformers(0).setOutputCols(transformers(0).requiredOutputCols)
38 |     allInputCols ++= transformers(0).getInputCols
39 |     transformers(0).setAncestorCols(allInputCols.toArray)
40 |     stages += transformers(0).declareInAndOut().getTransformer
41 | 
42 |     (1 until transformers.length).foreach { i =>
43 |       println(s"add $i-th transformer = ${transformers(i).getTransformer.getClass.getSimpleName}")
44 |       // set parent
45 |       transformers(i).setParent(transformers(i - 1))
46 |       // add new cols
47 |       allInputCols ++= transformers(i - 1).getOutputCols
48 |       // set parent cols
49 |       transformers(i).setAncestorCols(allInputCols.toArray)
50 |       // generate input cols
51 |       transformers(i).generateInputCols()
52 |       // generate output cols
53 |       transformers(i).generateOutputCols()
54 |       // add fully configured transformer
55 |       stages += transformers(i).declareInAndOut().getTransformer
56 |     }
57 | 
58 |     stages.toArray
59 |   }
60 | 
61 | }
--------------------------------------------------------------------------------
/src/main/scala/com/tencent/angel/spark/automl/feature/PipelineDriver.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Tencent is pleased to support the open source community by making Angel available.
3 | *
4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 | * compliance with the License. You may obtain a copy of the License at
8 | *
9 | * https://opensource.org/licenses/Apache-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software distributed under the License
12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 | * or implied. See the License for the specific language governing permissions and limitations under
14 | * the License.
15 | *
16 | */
17 | package com.tencent.angel.spark.automl.feature
18 | 
19 | import com.tencent.angel.spark.automl.feature.preprocess.{MinMaxScalerWrapper, StandardScalerWrapper}
20 | import org.apache.spark.ml.linalg.Vectors
21 | import org.apache.spark.sql.SparkSession
22 | 
23 | object PipelineDriver {
24 | 
25 |   def main(args: Array[String]): Unit = {
26 | 
27 |     val spark = SparkSession.builder().master("local").getOrCreate()
28 | 
29 |     //    val inputDF = spark.createDataFrame(Seq(
30 |     //      (0L, "a b c d e spark", 1.0),
31 |     //      (1L, "b d", 0.0),
32 |     //      (2L, "spark f g h", 1.0),
33 |     //      (3L, "hadoop mapreduce", 0.0)
34 |     //    )).toDF("id", "text", "label")
35 | 
36 |     val inputDF = spark.createDataFrame(Seq(
37 |       (0, Vectors.dense(1.0, 0.1, -1.0)),
38 |       (1, Vectors.dense(2.0, 1.1, 1.0)),
39 |       (2, Vectors.dense(3.0, 10.1, 3.0))
40 |     )).toDF("id", "numerical")
41 | 
42 |     val pipelineWrapper = new PipelineWrapper()
43 | 
44 |     val transformers = Array[TransformerWrapper](
45 |       new MinMaxScalerWrapper(),
46 |       new StandardScalerWrapper()
47 |     )
48 | 
49 |     val stages = PipelineBuilder.build(transformers)
50 | 
51 |     println(transformers(0).getInputCols.mkString(","))
52 | 
53 |     pipelineWrapper.setStages(stages)
54 | 
55 |     val model: PipelineModelWrapper = pipelineWrapper.fit(inputDF)
56 | 
57 |     val outDF = model.transform(inputDF)
58 | 
59 |     outDF.show()
60 | 
61 |   }
62 | 
63 | }
--------------------------------------------------------------------------------
/src/main/scala/com/tencent/angel/spark/automl/feature/PipelineWrapper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Tencent is pleased to support the open source community by making Angel available.
3 | *
4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 | * compliance with the License. You may obtain a copy of the License at
8 | *
9 | * https://opensource.org/licenses/Apache-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software distributed under the License
12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 | * or implied. See the License for the specific language governing permissions and limitations under
14 | * the License.
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature 20 | 21 | import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} 22 | import org.apache.spark.sql.{DataFrame, Dataset} 23 | 24 | class PipelineWrapper() { 25 | 26 | var pipeline = new Pipeline() 27 | 28 | var transformers: Array[TransformerWrapper] = Array() 29 | 30 | def setTransformers(value: Array[TransformerWrapper]): this.type = { 31 | transformers = value 32 | setStages(PipelineBuilder.build(transformers)) 33 | this 34 | } 35 | 36 | def setStages(value: Array[_ <: PipelineStage]): Unit = { 37 | pipeline = pipeline.setStages(value) 38 | } 39 | 40 | def fit(dataset: Dataset[_]): PipelineModelWrapper = { 41 | new PipelineModelWrapper(pipeline.fit(dataset), transformers) 42 | } 43 | 44 | } 45 | 46 | class PipelineModelWrapper(val model: PipelineModel, 47 | val transformers: Array[TransformerWrapper]) { 48 | 49 | def transform(dataset: Dataset[_]): DataFrame = { 50 | var df = model.transform(dataset) 51 | if (transformers.length >= 2) { 52 | (0 until transformers.length - 1).foreach { i => 53 | val outCols = transformers(i).getOutputCols 54 | for (col <- outCols) { 55 | df = df.drop(col) 56 | } 57 | } 58 | } 59 | df 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/TransformerWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | *
16 | */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.feature
20 | 
21 | import com.tencent.angel.spark.automl.feature.InToOutRelation.InToOutRelation
22 | import org.apache.spark.ml.PipelineStage
23 | 
24 | abstract class TransformerWrapper {
25 | 
26 |   val transformer: PipelineStage
27 |   var parent: TransformerWrapper
28 | 
29 |   val relation: InToOutRelation
30 | 
31 |   val hasMultiInputs: Boolean
32 |   val hasMultiOutputs: Boolean
33 |   val needAncestorInputs: Boolean
34 |   private val prefix = "out"
35 | 
36 |   val requiredInputCols: Array[String]
37 |   val requiredOutputCols: Array[String]
38 | 
39 |   private var inputCols: Array[String] = _
40 |   private var outputCols: Array[String] = _
41 | 
42 |   private var ancestorCols: Array[String] = _
43 | 
44 |   def getTransformer = transformer
45 | 
46 |   def setParent(parent: TransformerWrapper): Unit = this.parent = parent
47 | 
48 |   def setInputCols(cols: Array[String]): Unit = inputCols = cols
49 | 
50 |   def setOutputCols(cols: Array[String]): Unit = outputCols = cols
51 | 
52 |   def getInputCols: Array[String] = inputCols
53 | 
54 |   def getOutputCols: Array[String] = outputCols
55 | 
56 |   def setAncestorCols(cols: Array[String]): Unit = ancestorCols = cols
57 | 
58 |   def generateInputCols(): Unit = {
59 |     // if the transformer has its required input cols, feed the required input cols
60 |     // if the transformer needs all input cols, feed all ancestor cols
61 |     // otherwise, feed the output cols of the parent transformer
62 |     if (requiredInputCols.forall(ancestorCols.contains)) {
63 |       setInputCols(requiredInputCols)
64 |     } else if (needAncestorInputs) {
65 |       setInputCols(ancestorCols)
66 |     } else {
67 |       setInputCols(parent.outputCols)
68 |     }
69 |   }
70 | 
71 |   def generateOutputCols(): Unit = {
72 |     relation match {
73 |       case InToOutRelation.Fixed =>
74 |         setOutputCols(requiredOutputCols)
75 |       case InToOutRelation.InPlace =>
76 |         setOutputCols(inputCols)
77 |       case InToOutRelation.OneToOne =>
78 |         setOutputCols(Array(prefix + transformer.getClass.getSimpleName))
79 |       case InToOutRelation.MultiToMulti =>
80 |         setOutputCols(inputCols.map(prefix + _))
81 |       case InToOutRelation.MultiToOne =>
82 |         setOutputCols(Array(prefix + transformer.getClass.getName.toLowerCase))
83 |       case _ =>
84 |         throw new IncompatibleFieldException(
85 |           "wrong relation between input and output of transformer")
86 |     }
87 |   }
88 | 
89 |   def declareInAndOut(): this.type
90 | }
--------------------------------------------------------------------------------
/src/main/scala/com/tencent/angel/spark/automl/feature/UserProfileLoader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Tencent is pleased to support the open source community by making Angel available.
3 | *
4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 | * compliance with the License. You may obtain a copy of the License at
8 | *
9 | * https://opensource.org/licenses/Apache-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software distributed under the License
12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 | * or implied.
See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | package com.tencent.angel.spark.automl.feature 18 | 19 | import com.tencent.angel.spark.automl.feature.preprocess._ 20 | 21 | import scala.collection.mutable.ArrayBuffer 22 | 23 | class UserProfileLoader { 24 | 25 | private var selectedComponents: ArrayBuffer[String] = ??? 26 | 27 | private def componentToTransformers(component: String): TransformerWrapper = { 28 | component match { 29 | case "SamplerWrapper" => new SamplerWrapper(0.5) 30 | case "StopWordsRemoverWrapper" => new StopWordsRemoverWrapper() 31 | case "Tokenizer" => new TokenizerWrapper() 32 | case "MinMaxScalerWrapper" => new MinMaxScalerWrapper() 33 | case "StandardScalerWrapper" => new StandardScalerWrapper() 34 | } 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/cross/FeatureCross.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature.cross 20 | 21 | import org.apache.spark.ml.Transformer 22 | import org.apache.spark.ml.param.ParamMap 23 | import org.apache.spark.ml.util.DefaultParamsWritable 24 | import org.apache.spark.sql.types.StructType 25 | import org.apache.spark.sql.{DataFrame, Dataset} 26 | 27 | class FeatureCross(override val uid: String) 28 | extends Transformer with DefaultParamsWritable { 29 | 30 | override def transform(dataset: Dataset[_]): DataFrame = ??? 31 | 32 | override def copy(extra: ParamMap): Transformer = ??? 33 | 34 | override def transformSchema(schema: StructType): StructType = ??? 35 | 36 | } -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/cross/FeatureCrossMeta.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.feature.cross 19 | 20 | class FeatureCrossMeta(var curIdx: Int, val crossInfo: String) { 21 | 22 | } 23 | 24 | object FeatureCrossMeta { 25 | 26 | def apply(curIdx: Int, crossInfo: String): FeatureCrossMeta = { 27 | new FeatureCrossMeta(curIdx, crossInfo) 28 | } 29 | 30 | def cross(idx: Int, from: FeatureCrossMeta, to: FeatureCrossMeta): FeatureCrossMeta = { 31 | FeatureCrossMeta(idx, from.crossInfo + "*" + to.crossInfo) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/cross/FeatureCrossOp.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.feature.cross 19 | 20 | import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} 21 | 22 | import scala.collection.mutable.ArrayBuffer 23 | 24 | object FeatureCrossOp { 25 | 26 | def flatCartesian(vector: Vector): Vector = { 27 | val curDim = vector.size 28 | vector match { 29 | case sv: SparseVector => 30 | val indices = new ArrayBuffer[Int]() 31 | val values = new ArrayBuffer[Double]() 32 | sv.indices.foreach { idx1 => 33 | sv.indices.foreach { idx2 => 34 | indices += curDim * idx1 + idx2 35 | values += sv(idx1) * sv(idx2) 36 | } 37 | } 38 | val sorted = indices.zip(values).sortBy(_._1) 39 | val sortedIndices = sorted.map(_._1) 40 | val sortedValues = sorted.map(_._2) 41 | new SparseVector(sv.size * sv.size, sortedIndices.toArray, sortedValues.toArray) 42 | case dv: DenseVector => 43 | val values: Array[Double] = new Array(dv.size * dv.size) 44 | (0 until dv.size).foreach { idx1 => 45 | (0 until dv.size).foreach { idx2 => 46 | values(dv.size * idx1 + idx2) = dv(idx1) * dv(idx2) 47 | } 48 | } 49 | new DenseVector(values) 50 | } 51 | } 52 | 53 | def main(args: Array[String]): Unit = { 54 | val v = new DenseVector(Array(1, 2, 3)) 55 | val cv = flatCartesian(v) 56 | println(cv.toDense.values.mkString(",")) 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/examples/FeatureCrossSelectorExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. 
You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.feature.examples 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.ml.Pipeline 22 | import org.apache.spark.ml.classification.LogisticRegression 23 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 24 | import org.apache.spark.ml.feature.VectorAssembler 25 | import org.apache.spark.ml.feature.operator.{VarianceSelector, VectorCartesian} 26 | import org.apache.spark.sql.SparkSession 27 | 28 | object FeatureCrossSelectorExample { 29 | 30 | def main(args: Array[String]): Unit = { 31 | 32 | val conf = new SparkConf() 33 | 34 | val input = conf.get("spark.input.path", "data/a9a/a9a_123d_train_trans.libsvm") 35 | val numFeatures = conf.get("spark.num.feature", "123") 36 | val twoOrderNumFeatures = conf.getInt("spark.two.order.num.feature", 123) 37 | val threeOrderNumFeatures = conf.getInt("spark.three.order.num.feature", 123) 38 | 39 | val spark = SparkSession.builder().master("local").config(conf).getOrCreate() 40 | 41 | val data = spark.read.format("libsvm") 42 | .option("numFeatures", numFeatures) 43 | .load(input) 44 | .persist() 45 | 46 | val cartesian = new VectorCartesian() 47 | .setInputCols(Array("features", "features")) 48 | .setOutputCol("f_f") 49 | 50 | val selector = new VarianceSelector() 51 | .setFeaturesCol("f_f") 52 | .setOutputCol("selected_f_f") 53 | .setNumTopFeatures(twoOrderNumFeatures) 54 | 55 | val cartesian2 = new VectorCartesian() 56 | .setInputCols(Array("features", "selected_f_f")) 57 | .setOutputCol("f_f_f") 58 | 59 | val selector2 = new VarianceSelector() 60 | .setFeaturesCol("f_f_f") 61 | .setOutputCol("selected_f_f_f") 62 | .setNumTopFeatures(threeOrderNumFeatures) 63 | 64 | val assembler = new VectorAssembler() 65 | .setInputCols(Array("features", "selected_f_f", "selected_f_f_f")) 66 | .setOutputCol("assembled_features") 67 | 68 | val pipeline = new Pipeline() 69 | .setStages(Array(cartesian, selector, cartesian2, selector2, assembler)) 70 | 71 | val crossDF = pipeline.fit(data).transform(data).persist() 72 | data.unpersist() 73 | crossDF.drop("f_f", "f_f_f", "selected_f_f", "selected_f_f_f") 74 | crossDF.show(1) 75 | 76 | val splitDF = crossDF.randomSplit(Array(0.9, 0.1)) 77 | 78 | val trainDF = splitDF(0).persist() 79 | val testDF = splitDF(1).persist() 80 | 81 | val originalLR = new LogisticRegression() 82 | .setFeaturesCol("features") 83 | .setLabelCol("label") 84 | .setMaxIter(20) 85 | .setRegParam(0.01) 86 | 87 | val originalPredictions = originalLR.fit(trainDF).transform(testDF) 88 | originalPredictions.show(1) 89 | val originalEvaluator = new BinaryClassificationEvaluator() 90 | .setLabelCol("label") 91 | .setRawPredictionCol("rawPrediction") 92 | .setMetricName("areaUnderROC") 93 | val originalAUC = originalEvaluator.evaluate(originalPredictions) 94 | println(s"original features auc: $originalAUC") 95 | 96 | val crossLR = new LogisticRegression() 97 | .setFeaturesCol("assembled_features") 98 | .setLabelCol("label") 99 | .setMaxIter(20) 100 | .setRegParam(0.01) 101 | 102 | val crossPredictions = 
crossLR.fit(trainDF).transform(testDF) 103 | crossPredictions.show(1) 104 | val crossEvaluator = new BinaryClassificationEvaluator() 105 | .setLabelCol("label") 106 | .setRawPredictionCol("rawPrediction") 107 | .setMetricName("areaUnderROC") 108 | val crossAUC = crossEvaluator.evaluate(crossPredictions) 109 | println(s"cross features auc: $crossAUC") 110 | 111 | spark.close() 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/examples/FeatureEngineeringExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.feature.examples 19 | 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.ml.classification.LogisticRegression 22 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 23 | import org.apache.spark.ml.feature.VectorAssembler 24 | import org.apache.spark.ml.feature.operator.{VarianceSelector, VectorCartesian, VectorReIndexNonZero} 25 | import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} 26 | import org.apache.spark.sql.SparkSession 27 | 28 | import scala.collection.mutable.ArrayBuffer 29 | 30 | object FeatureEngineeringExample { 31 | 32 | def main(args: Array[String]): Unit = { 33 | 34 | val conf = new SparkConf().setMaster("local") 35 | 36 | val input = conf.get("spark.input.path", "data/a9a/a9a_123d_train_trans.libsvm") 37 | val numFeatures = conf.getInt("spark.num.feature", 123) 38 | val incNumFeatures = conf.getInt("spark.inc.num.feature", 10) 39 | val iter = conf.getInt("spark.ml.iteration", 1) 40 | val modelPath = conf.get("spark.model.path", "tmp/feature_engineer") 41 | 42 | val spark = SparkSession.builder().config(conf).getOrCreate() 43 | 44 | val data = spark.read.format("libsvm") 45 | .option("numFeatures", numFeatures) 46 | .load(input) 47 | .persist() 48 | 49 | val featureMap: Map[Int, Int] = Map[Int, Int]() 50 | 51 | val pipelineStages: ArrayBuffer[PipelineStage] = new ArrayBuffer 52 | val fieldsToAssembler: ArrayBuffer[String] = new ArrayBuffer[String]() 53 | val allFields: ArrayBuffer[String] = new ArrayBuffer[String]() 54 | 55 | val cartesianPrefix = "_f" 56 | val selectorPrefix = "_select" 57 | val filterPrefix = "_filter" 58 | var curField = "features" 59 | fieldsToAssembler += curField 60 | allFields += curField 61 | 62 | (0 until iter).foreach { iter => 63 | val cartesian = new VectorCartesian() 64 | .setInputCols(Array(curField, "features")) 65 | .setOutputCol(curField + cartesianPrefix) 66 | println(s"Cartesian -> input: $curField and features, output: ${curField + cartesianPrefix}") 67 | pipelineStages += cartesian 68 | curField += cartesianPrefix 69 | 
allFields += curField 70 | val selector = new VarianceSelector() 71 | .setFeaturesCol(curField) 72 | .setOutputCol(curField + selectorPrefix) 73 | .setNumTopFeatures(incNumFeatures) 74 | println(s"Selector -> input: $curField, output: ${curField + selectorPrefix}") 75 | pipelineStages += selector 76 | curField += selectorPrefix 77 | allFields += curField 78 | val filter = new VectorReIndexNonZero(featureMap) 79 | .setInputCol(curField) 80 | .setOutputCol(curField + filterPrefix) 81 | println(s"Filter -> input: $curField, output: ${curField + filterPrefix}") 82 | pipelineStages += filter 83 | curField += filterPrefix 84 | fieldsToAssembler += curField 85 | allFields += curField 86 | } 87 | 88 | println(s"assembler fields: ${fieldsToAssembler.mkString(",")}") 89 | val assembler = new VectorAssembler() 90 | .setInputCols(fieldsToAssembler.toArray) 91 | .setOutputCol("assembled_features") 92 | pipelineStages += assembler 93 | fieldsToAssembler += "assembled_features" 94 | allFields += "assembled_features" 95 | 96 | val usedFields = Array("features", "assembled_features") 97 | println(s"all fields: ${allFields.toArray.mkString(",")}") 98 | val dropFields = allFields.filter(!usedFields.contains(_)) 99 | println(s"drop fields: ${dropFields.toArray.mkString(",")}") 100 | 101 | val pipeline = new Pipeline() 102 | .setStages(pipelineStages.toArray) 103 | 104 | val model = pipeline.fit(data) 105 | model.save(modelPath) 106 | val load_model = PipelineModel.load(modelPath) 107 | val crossDF = load_model.transform(data).persist() 108 | crossDF.show(false) 109 | dropFields.foreach(crossDF.drop) 110 | 111 | val splitDF = crossDF.randomSplit(Array(0.7, 0.3)) 112 | val trainDF = splitDF(0).persist() 113 | val testDF = splitDF(1).persist() 114 | crossDF.unpersist() 115 | 116 | val originalLR = new LogisticRegression() 117 | .setFeaturesCol("features") 118 | .setLabelCol("label") 119 | .setMaxIter(20) 120 | .setRegParam(0.01) 121 | val originalPredictions = originalLR.fit(trainDF).transform(testDF) 122 | originalPredictions.show(1) 123 | val originalEvaluator = new BinaryClassificationEvaluator() 124 | .setLabelCol("label") 125 | .setRawPredictionCol("rawPrediction") 126 | .setMetricName("areaUnderROC") 127 | val originalAUC = originalEvaluator.evaluate(originalPredictions) 128 | println(s"original features auc: $originalAUC") 129 | 130 | val crossLR = new LogisticRegression() 131 | .setFeaturesCol("assembled_features") 132 | .setLabelCol("label") 133 | .setMaxIter(20) 134 | .setRegParam(0.01) 135 | val crossPredictions = crossLR.fit(trainDF).transform(testDF) 136 | crossPredictions.show(1) 137 | val crossEvaluator = new BinaryClassificationEvaluator() 138 | .setLabelCol("label") 139 | .setRawPredictionCol("rawPrediction") 140 | .setMetricName("areaUnderROC") 141 | val crossAUC = crossEvaluator.evaluate(crossPredictions) 142 | println(s"cross features auc: $crossAUC") 143 | } 144 | 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/examples/VectorReIndexZeroExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. 
You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.feature.examples 19 | 20 | import com.tencent.angel.spark.automl.feature.FeatureUtils 21 | import com.tencent.angel.spark.automl.feature.cross.FeatureCrossMeta 22 | import org.apache.spark.ml.Pipeline 23 | import org.apache.spark.ml.feature.operator.{SelfCartesian, VectorReIndexNonZero} 24 | import org.apache.spark.sql.SparkSession 25 | 26 | object VectorReIndexZeroExample { 27 | 28 | def main(args: Array[String]): Unit = { 29 | 30 | val spark = SparkSession.builder().master("local").getOrCreate() 31 | 32 | val trainDF = spark.read.format("libsvm") 33 | .option("numFeatures", "123") 34 | .load("data/a9a/a9a_123d_train_trans.libsvm") 35 | .persist() 36 | 37 | val maxDim = FeatureUtils.maxDim(trainDF) 38 | println(s"max dimension: $maxDim") 39 | 40 | // feature cross meta 41 | var crossInfo: Map[Int, FeatureCrossMeta] = Map[Int, FeatureCrossMeta]() 42 | (0 until maxDim).foreach(idx => crossInfo += idx -> FeatureCrossMeta(idx, idx.toString)) 43 | 44 | val featureMap: Map[Int, Int] = Map[Int, Int]() 45 | 46 | val cartesian = new SelfCartesian() 47 | .setInputCol("features") 48 | .setOutputCol("cartesian_features") 49 | 50 | val filter = new VectorReIndexNonZero(featureMap) 51 | .setInputCol("cartesian_features") 52 | .setOutputCol("filter_features") 53 | 54 | val pipeline = new Pipeline() 55 | .setStages(Array(cartesian, filter)) 56 | 57 | val pipelineModel = pipeline.fit(trainDF) 58 | 59 | val filterDF = pipelineModel.transform(trainDF) 60 | 61 | println("nonzero features:") 62 | println(filter.featureMap.mkString(",")) 63 | 64 | filterDF.show(1) 65 | 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/BuckerizerWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature.preprocess 20 | 21 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 22 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 23 | import org.apache.spark.ml.feature.Bucketizer 24 | 25 | class BuckerizerWrapper extends TransformerWrapper { 26 | 27 | override val transformer = new Bucketizer() 28 | override var parent: TransformerWrapper = _ 29 | 30 | override val requiredInputCols: Array[String] = Array("features") 31 | override val requiredOutputCols: Array[String] = Array("outBucketizer") 32 | 33 | override val hasMultiInputs: Boolean = false 34 | override val hasMultiOutputs: Boolean = false 35 | override val needAncestorInputs: Boolean = false 36 | 37 | override val relation: InToOutRelation = OneToOne 38 | 39 | override def declareInAndOut(): this.type = { 40 | transformer.setInputCol(getInputCols(0)) 41 | transformer.setOutputCol(getOutputCols(0)) 42 | this 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/Components.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.feature.preprocess
20 | 
21 | import org.apache.spark.ml.PipelineStage
22 | import org.apache.spark.ml.feature.{StopWordsRemover, Tokenizer}
23 | import org.apache.spark.sql.DataFrame
24 | 
25 | import scala.collection.mutable.ArrayBuffer
26 | 
27 | object Components {
28 | 
29 |   def sample(data: DataFrame,
30 |              fraction: Double): DataFrame = {
31 |     data.sample(false, fraction)
32 |   }
33 | 
34 |   def addSampler(components: ArrayBuffer[PipelineStage],
35 |                  inputCol: String,
36 |                  fraction: Double): Unit = {
37 |     val sampler = new Sampler(fraction)
38 |       .setInputCol(inputCol)
39 |     components += sampler
40 |   }
41 | 
42 |   def addTokenizer(components: ArrayBuffer[PipelineStage],
43 |                    inputCol: String,
44 |                    outputCol: String): Unit = {
45 |     val tokenizer = new Tokenizer()
46 |       .setInputCol(inputCol)
47 |       .setOutputCol(outputCol)
48 |     components += tokenizer
49 |   }
50 | 
51 |   def addStopWordsRemover(components: ArrayBuffer[PipelineStage],
52 |                           inputCol: String,
53 |                           outputCol: String): Unit = {
54 |     val remover = new StopWordsRemover()
55 |       .setInputCol(inputCol)
56 |       .setOutputCol(outputCol)
57 |     components += remover
58 |   }
59 | 
60 | }
61 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/FPreprocess.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
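The helpers above only append stages to a shared buffer; a minimal sketch of composing them into a Spark Pipeline (the DataFrame docs and its "sentence" column are assumed for illustration):

import org.apache.spark.ml.{Pipeline, PipelineStage}
import scala.collection.mutable.ArrayBuffer

val stages = new ArrayBuffer[PipelineStage]
Components.addTokenizer(stages, "sentence", "words")           // sentence -> words
Components.addStopWordsRemover(stages, "words", "filterWords") // drop stop words

val pipeline = new Pipeline().setStages(stages.toArray)
val model = pipeline.fit(docs)       // docs: an assumed DataFrame with a "sentence" column
val cleaned = model.transform(docs)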
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.feature.preprocess
20 | 
21 | import com.tencent.angel.spark.automl.AutoConf
22 | import com.tencent.angel.spark.automl.feature.DataLoader
23 | import com.tencent.angel.spark.automl.utils.ArgsUtil
24 | import org.apache.spark.ml.{Pipeline, PipelineStage}
25 | import org.apache.spark.sql.SparkSession
26 | 
27 | import scala.collection.mutable.ArrayBuffer
28 | 
29 | 
30 | object FPreprocess {
31 | 
32 |   def main(args: Array[String]): Unit = {
33 | 
34 |     val params = ArgsUtil.parse(args)
35 |     val master = params.getOrElse("master", "yarn")
36 |     val deploy = params.getOrElse("deploy-mode", "cluster")
37 |     val input = params.getOrElse("input", "")
38 |     val inputSeparator = params.getOrElse(AutoConf.Preprocess.ML_DATA_SPLITOR,
39 |       AutoConf.Preprocess.DEFAULT_ML_DATA_SPLITOR)
40 |     val inputFormat = params.getOrElse(AutoConf.Preprocess.ML_DATA_INPUT_FORMAT,
41 |       AutoConf.Preprocess.DEFAULT_ML_DATA_INPUT_FORMAT)
42 |     val inputType = params.getOrElse(AutoConf.Preprocess.INPUT_TYPE,
43 |       AutoConf.Preprocess.DEFAULT_INPUT_TYPE)
44 |     val sampleRate = params.getOrElse(AutoConf.Preprocess.SAMPLE_RATE,
45 |       AutoConf.Preprocess.DEFAULT_SAMPLE_RATE).toDouble
46 |     val imbalanceSampleRate = params.getOrElse(AutoConf.Preprocess.IMBALANCE_SAMPLE,
47 |       AutoConf.Preprocess.DEFAULT_IMBALANCE_SAMPLE)
48 |     val hasTokenizer = inputFormat.equals("document")
49 |     val hasStopWordsRemover = inputFormat.equals("document")
50 | 
51 |     val ss = SparkSession
52 |       .builder
53 |       .master(master + "-" + deploy)
54 |       .appName("preprocess")
55 |       .getOrCreate()
56 | 
57 |     val training = DataLoader.load(ss, inputFormat, input, inputSeparator)
58 | 
59 |     val components = new ArrayBuffer[PipelineStage]
60 | 
61 |     if (sampleRate > 0 && sampleRate < 1.0)
62 |       Components.addSampler(components,
63 |         "features", sampleRate)
64 | 
65 |     if (hasTokenizer)
66 |       Components.addTokenizer(components,
67 |         "sentence", "words")
68 | 
69 |     if (hasStopWordsRemover)
70 |       Components.addStopWordsRemover(components,
71 |         "words", "filterWords")
72 | 
73 |     val pipeline = new Pipeline()
74 |       .setStages(components.toArray)
75 | 
76 |     val model = pipeline.fit(training)
77 | 
78 |     ss.stop()
79 |   }
80 | 
81 | }
82 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/HashingTFWrapper.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.feature.preprocess 19 | 20 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 21 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 22 | import org.apache.spark.ml.Transformer 23 | import org.apache.spark.ml.feature.HashingTF 24 | 25 | class HashingTFWrapper(numFeatures: Int) extends TransformerWrapper { 26 | 27 | override val transformer: Transformer = new HashingTF().setNumFeatures(numFeatures) 28 | override var parent: TransformerWrapper = _ 29 | 30 | override val hasMultiInputs: Boolean = false 31 | override val hasMultiOutputs: Boolean = false 32 | override val needAncestorInputs: Boolean = false 33 | 34 | override val relation: InToOutRelation = OneToOne 35 | 36 | override val requiredInputCols: Array[String] = Array("words") 37 | override val requiredOutputCols: Array[String] = Array("outHashingTF") 38 | 39 | override def declareInAndOut(): this.type = { 40 | transformer.asInstanceOf[HashingTF].setInputCol(getInputCols(0)) 41 | transformer.asInstanceOf[HashingTF].setOutputCol(getOutputCols(0)) 42 | this 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/IDFWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | package com.tencent.angel.spark.automl.feature.preprocess 18 | 19 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 20 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 21 | import org.apache.spark.ml.feature.IDF 22 | 23 | class IDFWrapper extends TransformerWrapper { 24 | 25 | override val transformer = new IDF() 26 | override var parent: TransformerWrapper = _ 27 | 28 | override val hasMultiInputs: Boolean = false 29 | override val hasMultiOutputs: Boolean = false 30 | override val needAncestorInputs: Boolean = false 31 | 32 | override val relation: InToOutRelation = OneToOne 33 | 34 | override val requiredInputCols: Array[String] = Array("rawFeatures") 35 | override val requiredOutputCols: Array[String] = Array("outIDF") 36 | 37 | override def declareInAndOut(): this.type = { 38 | transformer.asInstanceOf[IDF].setInputCol(getInputCols(0)) 39 | transformer.asInstanceOf[IDF].setOutputCol(getOutputCols(0)) 40 | this 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/MinMaxScalerWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.feature.preprocess 19 | 20 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 21 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 22 | import org.apache.spark.ml.feature.MinMaxScaler 23 | 24 | private[feature] class MinMaxScalerWrapper extends TransformerWrapper { 25 | 26 | override val transformer = new MinMaxScaler() 27 | override var parent: TransformerWrapper = _ 28 | 29 | override val hasMultiInputs: Boolean = false 30 | override val hasMultiOutputs: Boolean = false 31 | override val needAncestorInputs: Boolean = false 32 | 33 | override val relation: InToOutRelation = OneToOne 34 | 35 | override val requiredInputCols: Array[String] = Array("numerical") 36 | override val requiredOutputCols: Array[String] = Array("outMinMaxScaler") 37 | 38 | override def declareInAndOut(): this.type = { 39 | transformer.asInstanceOf[MinMaxScaler].setInputCol(getInputCols(0)) 40 | transformer.asInstanceOf[MinMaxScaler].setOutputCol(getOutputCols(0)) 41 | this 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/PCAWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 
3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature.preprocess 20 | 21 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 22 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 23 | import org.apache.spark.ml.feature.PCA 24 | 25 | class PCAWrapper extends TransformerWrapper { 26 | 27 | override val transformer = new PCA() 28 | override var parent: TransformerWrapper = _ 29 | 30 | override val requiredInputCols: Array[String] = Array("features") 31 | override val requiredOutputCols: Array[String] = Array("outPCA") 32 | 33 | override val hasMultiInputs: Boolean = false 34 | override val hasMultiOutputs: Boolean = false 35 | override val needAncestorInputs: Boolean = false 36 | 37 | override val relation: InToOutRelation = OneToOne 38 | 39 | override def declareInAndOut(): this.type = { 40 | transformer.setInputCol(getInputCols(0)) 41 | transformer.setOutputCol(getOutputCols(0)) 42 | this 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/Sampler.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature.preprocess 20 | 21 | import org.apache.spark.ml.linalg.Vector 22 | import org.apache.spark.ml.param.{Param, ParamMap} 23 | import org.apache.spark.ml.util.Identifiable 24 | import org.apache.spark.ml.{Pipeline, Transformer} 25 | import org.apache.spark.sql.types.StructType 26 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 27 | 28 | import scala.util.Random 29 | 30 | 31 | class Sampler(fraction: Double, 32 | override val uid: String, 33 | seed: Int = Random.nextInt) 34 | extends Transformer { 35 | 36 | def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler")) 37 | 38 | /** 39 | * Param for input column name. 
40 |  *
41 |  * @group param
42 |  */
43 |   final val inputCol: Param[String] = new Param[String](this, "inputCol", "input column name")
44 | 
45 |   /** @group setParam */
46 |   final def setInputCol(value: String): this.type = set(inputCol, value)
47 | 
48 |   /** @group getParam */
49 |   final def getInputCol: String = $(inputCol)
50 | 
51 |   /** @group getParam */
52 |   final def getOutputCol: String = $(inputCol)
53 | 
54 |   override def transform(dataset: Dataset[_]): DataFrame = {
55 |     dataset.sample(false, fraction, seed).toDF
56 |   }
57 | 
58 |   override def transformSchema(schema: StructType): StructType = {
59 |     schema
60 |   }
61 | 
62 |   override def copy(extra: ParamMap): Sampler = defaultCopy(extra)
63 | }
64 | 
65 | object Sampler {
66 | 
67 |   def main(args: Array[String]): Unit = {
68 |     val ss = SparkSession
69 |       .builder
70 |       .master("local")
71 |       .appName("preprocess")
72 |       .getOrCreate()
73 | 
74 |     val training = ss.read.format("libsvm")
75 |       .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")
76 | 
77 |     println(training.count)
78 | 
79 |     val sampler = new Sampler(0.5)
80 |       .setInputCol("features")
81 | 
82 |     val pipeline = new Pipeline()
83 |       .setStages(Array(sampler))
84 | 
85 |     val model = pipeline.fit(training)
86 | 
87 |     val test = ss.read.format("libsvm")
88 |       .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")
89 | 
90 |     model.transform(test).select("*")
91 |       .collect()
92 |       .foreach { case Row(label: Double, vector: Vector) =>
93 |         println(s"($label, " +
94 |           s"${vector.toSparse.indices.mkString("[", ",", "]")}, " +
95 |           s"${vector.toSparse.values.mkString("[", ",", "]")})")
96 |       }
97 | 
98 |     ss.stop()
99 |   }
100 | }
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/SamplerWrapper.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature.preprocess 20 | 21 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InPlace, InToOutRelation} 22 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 23 | import org.apache.spark.ml.Transformer 24 | 25 | class SamplerWrapper(fraction: Double) extends TransformerWrapper { 26 | 27 | override val transformer: Transformer = new Sampler(fraction) 28 | override var parent: TransformerWrapper = _ 29 | 30 | override val hasMultiInputs: Boolean = false 31 | override val hasMultiOutputs: Boolean = false 32 | override val needAncestorInputs: Boolean = false 33 | 34 | override val relation: InToOutRelation = InPlace 35 | 36 | override val requiredInputCols: Array[String] = null 37 | override val requiredOutputCols: Array[String] = null 38 | 39 | override def declareInAndOut(): this.type = { 40 | transformer.asInstanceOf[Sampler].setInputCol(getInputCols(0)) 41 | this 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/StandardScalerWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | package com.tencent.angel.spark.automl.feature.preprocess 18 | 19 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 20 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 21 | import org.apache.spark.ml.feature.StandardScaler 22 | 23 | class StandardScalerWrapper extends TransformerWrapper { 24 | 25 | override val transformer = new StandardScaler() 26 | override var parent: TransformerWrapper = _ 27 | 28 | override val hasMultiInputs: Boolean = false 29 | override val hasMultiOutputs: Boolean = false 30 | override val needAncestorInputs: Boolean = false 31 | 32 | override val relation: InToOutRelation = OneToOne 33 | 34 | override val requiredInputCols: Array[String] = Array("numerical") 35 | override val requiredOutputCols: Array[String] = Array("standardNumerical") 36 | 37 | override def declareInAndOut(): this.type = { 38 | transformer.asInstanceOf[StandardScaler].setInputCol(getInputCols(0)) 39 | transformer.asInstanceOf[StandardScaler].setOutputCol(getOutputCols(0)) 40 | this 41 | } 42 | 43 | // def fit(df: DataFrame): Transformer = { 44 | // estimator.fit(df) 45 | // } 46 | // 47 | // def transform(dataset: Dataset[_]): DataFrame = { 48 | // val df = dataset.toDF() 49 | // 50 | // val scaler = new StandardScaler() 51 | // .setInputCol("features") 52 | // .setOutputCol("scaledFeatures") 53 | // .setWithStd(true) 54 | // .setWithMean(true) 55 | // val scalerModel = scaler.fit(df) 56 | // 57 | // val scaledDf = scalerModel.transform(df) 58 | // 59 | // scaledDf.drop("features").withColumnRenamed("scaledFeatures", "features") 60 | // } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/StopWordsRemoverWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.feature.preprocess 19 | 20 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 21 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 22 | import org.apache.spark.ml.Transformer 23 | import org.apache.spark.ml.feature.StopWordsRemover 24 | 25 | class StopWordsRemoverWrapper extends TransformerWrapper { 26 | 27 | override val transformer: Transformer = new StopWordsRemover() 28 | override var parent: TransformerWrapper = _ 29 | 30 | override val hasMultiInputs: Boolean = false 31 | override val hasMultiOutputs: Boolean = false 32 | override val needAncestorInputs: Boolean = false 33 | 34 | override val relation: InToOutRelation = OneToOne 35 | 36 | override val requiredInputCols: Array[String] = Array("words") 37 | override val requiredOutputCols: Array[String] = Array("stopwords") 38 | 39 | override def declareInAndOut(): this.type = { 40 | transformer.asInstanceOf[StopWordsRemover].setInputCol(getInputCols(0)) 41 | transformer.asInstanceOf[StopWordsRemover].setOutputCol(getOutputCols(0)) 42 | this 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/StringIndexerWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.feature.preprocess
20 | 
21 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne}
22 | import com.tencent.angel.spark.automl.feature.TransformerWrapper
23 | import org.apache.spark.ml.feature.StringIndexer
24 | 
25 | class StringIndexerWrapper extends TransformerWrapper {
26 | 
27 |   override val transformer = new StringIndexer()
28 |   override var parent: TransformerWrapper = _
29 | 
30 |   override val requiredInputCols: Array[String] = Array("words")
31 |   override val requiredOutputCols: Array[String] = Array("outStringIndexer")
32 | 
33 |   override val hasMultiInputs: Boolean = false
34 |   override val hasMultiOutputs: Boolean = false
35 |   override val needAncestorInputs: Boolean = false
36 | 
37 |   override val relation: InToOutRelation = OneToOne
38 | 
39 |   override def declareInAndOut(): this.type = {
40 |     transformer.setInputCol(getInputCols(0))
41 |     transformer.setOutputCol(getOutputCols(0))
42 |     this
43 |   }
44 | }
45 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/TPreprocess.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.feature.preprocess
20 | 
21 | import org.apache.spark.ml.Transformer
22 | import org.apache.spark.ml.param.ParamMap
23 | import org.apache.spark.ml.util.DefaultParamsWritable
24 | import org.apache.spark.sql.types.StructType
25 | import org.apache.spark.sql.{DataFrame, Dataset}
26 | 
27 | class TPreprocess(override val uid: String)
28 |   extends Transformer with DefaultParamsWritable {
29 | 
30 |   override def transform(dataset: Dataset[_]): DataFrame = ???
31 | 
32 |   override def copy(extra: ParamMap): Transformer = ???
33 | 
34 |   override def transformSchema(schema: StructType): StructType = ???
35 | }
36 | 
37 | 
38 | 
39 | 
40 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/TokenizerWrapper.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License.
You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature.preprocess 20 | 21 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 22 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 23 | import org.apache.spark.ml.Transformer 24 | import org.apache.spark.ml.feature.Tokenizer 25 | 26 | 27 | class TokenizerWrapper extends TransformerWrapper { 28 | 29 | override val transformer: Transformer = new Tokenizer() 30 | override var parent: TransformerWrapper = _ 31 | 32 | override val requiredInputCols: Array[String] = Array("sentence") 33 | override val requiredOutputCols: Array[String] = Array("outTokenizer") 34 | 35 | override val hasMultiInputs: Boolean = false 36 | override val hasMultiOutputs: Boolean = false 37 | override val needAncestorInputs: Boolean = false 38 | 39 | override val relation: InToOutRelation = OneToOne 40 | 41 | override def declareInAndOut(): this.type = { 42 | transformer.asInstanceOf[Tokenizer].setInputCol(getInputCols(0)) 43 | transformer.asInstanceOf[Tokenizer].setOutputCol(getOutputCols(0)) 44 | this 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/preprocess/Word2VecWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature.preprocess 20 | 21 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 22 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 23 | import org.apache.spark.ml.feature.Word2Vec 24 | 25 | class Word2VecWrapper extends TransformerWrapper { 26 | 27 | override val transformer = new Word2Vec() 28 | override var parent: TransformerWrapper = _ 29 | 30 | override val requiredInputCols: Array[String] = Array("sentences") 31 | override val requiredOutputCols: Array[String] = Array("outWord2Vec") 32 | 33 | override val hasMultiInputs: Boolean = false 34 | override val hasMultiOutputs: Boolean = false 35 | override val needAncestorInputs: Boolean = false 36 | 37 | override val relation: InToOutRelation = OneToOne 38 | 39 | override def declareInAndOut(): this.type = { 40 | transformer.setInputCol(getInputCols(0)) 41 | transformer.setOutputCol(getOutputCols(0)) 42 | this 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/select/ChiSqSelectorWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature.select 20 | 21 | import com.tencent.angel.spark.automl.feature.InToOutRelation.{InToOutRelation, OneToOne} 22 | import com.tencent.angel.spark.automl.feature.TransformerWrapper 23 | import org.apache.spark.ml.feature.ChiSqSelector 24 | 25 | class ChiSqSelectorWrapper extends TransformerWrapper { 26 | override val transformer = new ChiSqSelector() 27 | override var parent: TransformerWrapper = _ 28 | 29 | override val hasMultiInputs: Boolean = false 30 | override val hasMultiOutputs: Boolean = false 31 | override val needAncestorInputs: Boolean = false 32 | 33 | override val relation: InToOutRelation = OneToOne 34 | 35 | override val requiredInputCols: Array[String] = Array("numerical") 36 | override val requiredOutputCols: Array[String] = Array("outChiSeSelector") 37 | 38 | override def declareInAndOut(): this.type = { 39 | transformer.setFeaturesCol(getInputCols(0)) 40 | transformer.setOutputCol(getOutputCols(0)) 41 | this 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/select/FeatureSelector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 
5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.feature.select 20 | 21 | import org.apache.spark.ml.Transformer 22 | import org.apache.spark.ml.param.ParamMap 23 | import org.apache.spark.ml.util.DefaultParamsWritable 24 | import org.apache.spark.sql.types.StructType 25 | import org.apache.spark.sql.{DataFrame, Dataset} 26 | 27 | class FeatureSelector(override val uid: String) 28 | extends Transformer with DefaultParamsWritable { 29 | 30 | override def transform(dataset: Dataset[_]): DataFrame = ??? 31 | 32 | override def copy(extra: ParamMap): Transformer = ??? 33 | 34 | override def transformSchema(schema: StructType): StructType = ??? 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/feature/transform/FTransform.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.feature.transform
20 | 
21 | import com.tencent.angel.spark.automl.AutoConf
22 | import com.tencent.angel.spark.automl.utils.ArgsUtil
23 | 
24 | object FTransform {
25 | 
26 |   def main(args: Array[String]): Unit = {
27 |     val params = ArgsUtil.parse(args)
28 |     val master = params.getOrElse("master", "yarn")
29 |     val deploy = params.getOrElse("deploy-mode", "cluster")
30 |     val input = params.getOrElse("input", "")
31 |     val inputSeparator = params.getOrElse(AutoConf.Preprocess.ML_DATA_SPLITOR,
32 |       AutoConf.Preprocess.DEFAULT_ML_DATA_SPLITOR)
33 |     val inputFormat = params.getOrElse(AutoConf.Preprocess.ML_DATA_INPUT_FORMAT,
34 |       AutoConf.Preprocess.DEFAULT_ML_DATA_INPUT_FORMAT)
35 |     val inputType = params.getOrElse(AutoConf.Preprocess.INPUT_TYPE,
36 |       AutoConf.Preprocess.DEFAULT_INPUT_TYPE)
37 |     val hasDiscreter = params.getOrElse(AutoConf.Preprocess.HAS_DISCRETER,
38 |       AutoConf.Preprocess.DEFAULT_HAS_DISCRETER)
39 |     val hasOnehoter = params.getOrElse(AutoConf.Preprocess.HAS_ONEHOTER,
40 |       AutoConf.Preprocess.DEFAULT_HAS_ONEHOTER)
41 |     val hasMinMaxScalar = params.getOrElse(AutoConf.Preprocess.HAS_MINMAXSCALAR,
42 |       AutoConf.Preprocess.DEFAULT_HAS_MINMAXSCALAR)
43 |     val hasStdScalar = params.getOrElse(AutoConf.Preprocess.HAS_STANDARDSCALAR,
44 |       AutoConf.Preprocess.DEFAULT_HAS_STANDARDSCALAR)
45 |   }
46 | 
47 | 
48 | }
49 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/TunerParam.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.tuner
20 | 
21 | class TunerParam {
22 | }
23 | 
24 | object TunerParam {
25 | 
26 |   var batchSize: Int = 1
27 |   var sampleSize: Int = 10 * batchSize
28 |   var defaultGridSize: Int = 100
29 | 
30 |   var taskName: String = "com.tencent.angel.spark.automl.tuner.trail.TestRunner"
31 | 
32 |   def setBatchSize(num: Int): Unit = {
33 |     batchSize = num
34 |   }
35 | 
36 |   def setSampleSize(num: Int): Unit = {
37 |     sampleSize = num
38 |   }
39 | 
40 |   def setDefaultGridSize(num: Int): Unit = {
41 |     defaultGridSize = num
42 |   }
43 | 
44 |   def setTaskName(name: String): Unit = {
45 |     taskName = name
46 |   }
47 | }
48 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/acquisition/Acquisition.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
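A short sketch of adjusting these global tuner settings before a search run (the values are illustrative):

TunerParam.setBatchSize(4)         // configurations evaluated per iteration
TunerParam.setSampleSize(40)       // candidates sampled per acquisition maximization
TunerParam.setDefaultGridSize(50)  // grid resolution used by grid search

Note that sampleSize is initialized as 10 * batchSize but setBatchSize does not keep it in sync, so set it explicitly whenever batchSize changes.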
5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.acquisition 20 | 21 | import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate 22 | import org.apache.spark.ml.linalg.Vector 23 | 24 | 25 | /** 26 | * Abstract base class for acquisition function 27 | */ 28 | abstract class Acquisition(val surrogate: Surrogate) { 29 | 30 | /** 31 | * Computes the acquisition value for a given point X 32 | * 33 | * @param X : (1, D), the input points where the acquisition function should be evaluated. 34 | * @return (1, 1) Expected Improvement of X, (1, D) Derivative of Expected Improvement at X 35 | */ 36 | def compute(X: Vector, derivative: Boolean = false): (Double, Vector) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/acquisition/EI.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.acquisition 20 | 21 | import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate 22 | import org.apache.commons.logging.{Log, LogFactory} 23 | import org.apache.commons.math3.distribution.NormalDistribution 24 | import org.apache.spark.ml.linalg.{Vector, Vectors} 25 | 26 | /** 27 | * Expected improvement. 
28 |  *
29 |  * @param surrogate
30 |  * @param par : Controls the balance between exploration and exploitation of the acquisition function, default=0.0
31 |  *
32 |  */
33 | class EI(
34 |           override val surrogate: Surrogate,
35 |           val par: Double)
36 |   extends Acquisition(surrogate) {
37 | 
38 |   val LOG: Log = LogFactory.getLog(classOf[EI])
39 | 
40 |   override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
41 |     val pred = surrogate.predict(X) // (mean, variance)
42 | 
43 |     // Use the best seen observation as incumbent
44 |     val eta: Double = surrogate.curBest._2
45 |     //println(s"best seen result: $eta")
46 | 
47 |     val m: Double = pred._1
48 |     val s: Double = Math.sqrt(pred._2)
49 |     //println(s"${X.toArray.mkString("(", ",", ")")}: mean[$m], variance[$s]")
50 | 
51 |     if (s == 0) {
52 |       // if std is zero, we have observed x on all instances
53 |       // when using a RF, std should never be exactly 0.0
54 |       (0.0, Vectors.dense(new Array[Double](X.size)))
55 |     } else {
56 |       val z = (m - eta - par) / s
57 |       val norm: NormalDistribution = new NormalDistribution
58 |       val cdf: Double = norm.cumulativeProbability(z)
59 |       val pdf: Double = norm.density(z)
60 |       val ei = s * (z * cdf + pdf)
61 |       //println(s"EI of ${X.toArray.mkString("(", ",", ")")}: $ei, cur best: $eta, z: $z, cdf: $cdf, pdf: $pdf")
62 |       (ei, Vectors.dense(new Array[Double](X.size)))
63 |     }
64 |   }
65 | }
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/acquisition/UCB.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.tuner.acquisition
20 | 
21 | import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate
22 | import org.apache.commons.logging.{Log, LogFactory}
23 | import org.apache.spark.ml.linalg.{Vector, Vectors}
24 | 
25 | /**
26 |  * Upper Confidence Bound (UCB).
27 |  *
28 |  * @param surrogate
29 |  * @param beta : Controls the upper confidence bound
30 |  *             Assume:
31 |  *             - t: number of iterations
32 |  *             - d: dimension of optimization space
33 |  *             - v: hyperparameter v = 1
34 |  *             - delta: small constant 0.1 (prob of regret)
35 |  *             Suggested value: beta = sqrt( v * (2 * log( (t**(d/2. + 2)) * (pi**2) / (3. * delta) )))
36 |  */
37 | class UCB(
38 |            override val surrogate: Surrogate,
39 |            val beta: Double = 100)
40 |   extends Acquisition(surrogate) {
41 | 
42 |   val LOG: Log = LogFactory.getLog(classOf[UCB])
43 | 
44 |   override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
45 |     val pred = surrogate.predict(X) // (mean, variance)
46 | 
47 |     val m: Double = pred._1
48 |     val s: Double = Math.sqrt(pred._2)
49 | 
50 |     if (s == 0) {
51 |       // if std is zero, we have observed x on all instances
52 |       // when using a RF, std should never be exactly 0.0
53 |       (0.0, Vectors.dense(new Array[Double](X.size)))
54 |     } else {
55 |       val ucb = m + beta * s
56 | 
57 |       (ucb, Vectors.dense(new Array[Double](X.size)))
58 |     }
59 |   }
60 | }
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/acquisition/optimizer/AcqOptimizer.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.tuner.acquisition.optimizer
20 | 
21 | import com.tencent.angel.spark.automl.tuner.acquisition.Acquisition
22 | import com.tencent.angel.spark.automl.tuner.config.{Configuration, ConfigurationSpace}
23 | 
24 | /**
25 |  * Abstract base class for acquisition maximization.
26 |  *
27 |  * @param acqFunc     : The acquisition function which will be maximized
28 |  * @param configSpace : Configuration space of parameters
29 |  */
30 | abstract class AcqOptimizer(
31 |                              val acqFunc: Acquisition,
32 |                              val configSpace: ConfigurationSpace) {
33 | 
34 |   /**
35 |    * Maximizes the given acquisition function.
36 |    *
37 |    * @param numPoints : Number of queried points.
38 |    * @return A set of tuple(acquisition value, Configuration).
39 |    */
40 |   def maximize(numPoints: Int, sorted: Boolean = true): Array[(Double, Configuration)]
41 | 
42 |   def maximize: (Double, Configuration)
43 | }
44 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/acquisition/optimizer/LocalSearch.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License.
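A minimal sketch of scoring a candidate point with UCB, assuming surrogate is an already-trained Surrogate instance; because compute returns m + beta * s, a larger beta weights the predictive uncertainty more heavily and favors exploration:

import org.apache.spark.ml.linalg.Vectors

val ucb = new UCB(surrogate, beta = 2.0)              // surrogate: assumed trained Surrogate
val (score, _) = ucb.compute(Vectors.dense(0.1, 0.5)) // acquisition value of one candidate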
You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.tuner.acquisition.optimizer
20 | 
21 | import com.tencent.angel.spark.automl.tuner.acquisition.Acquisition
22 | import com.tencent.angel.spark.automl.tuner.config.{Configuration, ConfigurationSpace}
23 | 
24 | /**
25 |  * Implementation of local search.
26 |  *
27 |  * @param acqFunc     : The acquisition function which will be maximized
28 |  * @param configSpace : Configuration space of parameters
29 |  * @param epsilon     : In order to perform a local move, one of the incumbent's neighbors needs at least an improvement higher than epsilon
30 |  * @param numIters    : Maximum number of iterations that the local search will perform
31 |  */
32 | class LocalSearch(
33 |                    override val acqFunc: Acquisition,
34 |                    override val configSpace: ConfigurationSpace,
35 |                    epsilon: Double, numIters: Int)
36 |   extends AcqOptimizer(acqFunc, configSpace) {
37 | 
38 |   /**
39 |    * Starts a local search from the given start point and quits if either the max number of steps is reached or
40 |    * no neighbor with a higher improvement was found
41 |    *
42 |    * @param numPoints : Number of queried points.
43 |    * @return A set of tuple(acquisition_value, Configuration).
44 |    */
45 |   override def maximize(numPoints: Int,
46 |                         sorted: Boolean = true): Array[(Double, Configuration)] = ???
47 | 
48 |   override def maximize: (Double, Configuration) = ???
49 | }
50 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/acquisition/optimizer/RandomSearch.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.tuner.acquisition.optimizer
20 | 
21 | import com.tencent.angel.spark.automl.tuner.TunerParam
22 | import com.tencent.angel.spark.automl.tuner.acquisition.Acquisition
23 | import com.tencent.angel.spark.automl.tuner.config.{Configuration, ConfigurationSpace}
24 | import org.apache.commons.logging.{Log, LogFactory}
25 | 
26 | import scala.util.Random
27 | 
28 | /**
29 |  * Get candidate solutions via random sampling of configurations.
30 |  *
31 |  * @param acqFunc     : The acquisition function which will be maximized
32 |  * @param configSpace : Configuration space of parameters
33 |  * @param seed
34 |  */
35 | class RandomSearch(
36 |                     override val acqFunc: Acquisition,
37 |                     override val configSpace: ConfigurationSpace,
38 |                     seed: Int = 100) extends AcqOptimizer(acqFunc, configSpace) {
39 | 
40 |   val LOG: Log = LogFactory.getLog(classOf[RandomSearch])
41 | 
42 |   val rd = new Random(seed)
43 | 
44 |   override def maximize(numPoints: Int, sorted: Boolean = true): Array[(Double, Configuration)] = {
45 |     //println(s"maximize RandomSearch")
46 |     val configs: Array[Configuration] = configSpace.sample(TunerParam.sampleSize)
47 |     if (configs.isEmpty) {
48 |       Array[(Double, Configuration)]()
49 |     } else {
50 |       //configs.foreach { config =>
51 |       //  println(s"sample a configuration: ${config.getVector.toArray.mkString(",")}")
52 |       //}
53 |       val retConfigs = if (sorted) {
54 |         configs.map { config =>
55 |           (acqFunc.compute(config.getVector)._1, config)
56 |         }.sortWith(_._1 > _._1).take(numPoints)
57 |       }
58 |       else {
59 |         rd.shuffle(configs.map { config =>
60 |           (acqFunc.compute(config.getVector)._1, config)
61 |         }.toTraversable).take(numPoints).toArray
62 |       }
63 |       retConfigs
64 |     }
65 |   }
66 | 
67 |   override def maximize: (Double, Configuration) = {
68 |     maximize(1, true).head
69 |   }
70 | }
71 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/config/Configuration.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.tuner.config
20 | 
21 | import org.apache.spark.ml.linalg.Vector
22 | import org.apache.spark.ml.param._
23 | 
24 | /**
25 |  * A single configuration, i.e., one point in the search space.
26 |  *
27 |  * @param param2Idx : Mapping from each parameter name to its index in the vector (param2Doc maps names to their documentation)
28 |  * @param vector    : A vector for efficient representation of the configuration.
29 |  */
30 | class Configuration(
31 |                      param2Idx: Map[String, Int],
32 |                      param2Doc: Map[String, String],
33 |                      vector: Vector) {
34 | 
35 |   def getVector: Vector = vector
36 | 
37 |   def getParamMap: ParamMap = {
38 |     val paramMap = ParamMap.empty
39 |     for (name: String <- param2Idx.keys) {
40 |       val param: Param[Double] = new Param(this.toString, name, param2Doc.getOrElse(name, ""))
41 |       paramMap.put(param, vector(param2Idx(name)))
42 |     }
43 |     paramMap
44 |   }
45 | 
46 |   def getValues: Array[Double] = vector.toArray
47 | 
48 |   def keys: List[String] = param2Idx.keys.toList
49 | 
50 |   def get(name: String): Double = get(param2Idx.getOrElse(name, -1))
51 | 
52 |   def get(idx: Int): Double = vector(idx)
53 | 
54 |   def contains(name: String): Boolean = param2Idx.contains(name)
55 | }
56 | 
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/config/EarlyStopping.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
7 |  * compliance with the License. You may obtain a copy of the License at
8 |  *
9 |  * https://opensource.org/licenses/Apache-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software distributed under the License
12 |  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13 |  * or implied. See the License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  *
16 |  */
17 | 
18 | 
19 | package com.tencent.angel.spark.automl.tuner.config
20 | 
21 | /**
22 |  * Early stopping: ends the search when the validation score has stopped improving.
23 |  * @param minDelta : Minimum change in the score to qualify as an improvement. Default: 0.0
24 |  * @param patience : How long to wait after last time validation loss improved.
25 |  *                   Default: 5
26 |  * @param minimize : Whether to minimize or maximize the val_score.
27 |  *                   Default: false
28 |  */
29 | class EarlyStopping(patience: Int = 5,
30 |                     minDelta: Double = 0.0,
31 |                     minimize: Boolean = false) {
32 | 
33 |   var counter: Int = 0
34 |   var bestScore: Double = if (minimize) Double.PositiveInfinity else Double.NegativeInfinity
35 |   var earlyStop: Boolean = false
36 |   val pat = patience
37 | 
38 |   def greater(a: Double, b: Double): Boolean = a > b
39 | 
40 |   def less(a: Double, b: Double): Boolean = a < b
41 | 
42 |   val monitorOp: (Double, Double) => Boolean = if (minimize) less else greater
43 | 
44 |   def bound(score: Double): Double = if (minimize) score + minDelta else score - minDelta
45 | 
46 |   def update(val_score: Double): Unit = {
47 |     val score = val_score
48 |     if (monitorOp(bound(score), bestScore)) {
49 |       bestScore = score
50 |       counter = 0
51 |     } else {
52 |       counter += 1
53 |       println(s"EarlyStopping counter: $counter out of $patience")
54 |       if (counter >= patience) {
55 |         earlyStop = true
56 |       }
57 |     }
58 |   }
59 | }
-------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/kernel/Covariance.scala: --------------------------------------------------------------------------------
1 | /*
2 |  * Tencent is pleased to support the open source community by making Angel available.
3 |  *
4 |  * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
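A minimal sketch of the intended EarlyStopping update loop; evaluate here is a hypothetical stand-in for scoring one configuration per tuning iteration:

val stopper = new EarlyStopping(patience = 5, minimize = false)
val maxIter = 100
def evaluate(i: Int): Double = scala.util.Random.nextDouble() // stand-in for a real validation score
var iter = 0
while (iter < maxIter && !stopper.earlyStop) {
  stopper.update(evaluate(iter)) // resets the counter on improvement, raises earlyStop after `patience` misses
  iter += 1
}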
5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.kernel 20 | 21 | import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} 22 | 23 | /** 24 | * Covariance function given two points. 25 | */ 26 | trait Covariance { 27 | 28 | /** 29 | * the covariance function 30 | * 31 | * @param x1 32 | * @param x2 33 | * @param params 34 | * @return 35 | */ 36 | def cov(x1: BDM[Double], 37 | x2: BDM[Double], 38 | params: BDV[Double]): BDM[Double] 39 | 40 | /** 41 | * the derivative of covariance function against kernel hyper-parameters 42 | * 43 | * @param x1 44 | * @param x2 45 | * @param params 46 | * @return 47 | */ 48 | def grad(x1: BDM[Double], 49 | x2: BDM[Double], 50 | params: BDV[Double]): Array[BDM[Double]] 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/kernel/CovarianceType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.kernel 20 | 21 | object CovarianceType extends Enumeration { 22 | 23 | type CovarianceType = Value 24 | 25 | val MATERN3 = Value("MATERN3") 26 | val MATERN5 = Value("MATERN5") 27 | val MATERN5_ISO = Value("MATERN5_ISO") 28 | val SQUAREEXP_ISO = Value("SQUAREEXP_ISO") 29 | 30 | def fromString(name: String): Covariance = { 31 | val covType = CovarianceType.withName(name.toUpperCase()) 32 | fromString(covType) 33 | } 34 | 35 | def fromString(covType: CovarianceType.Value): Covariance = covType match { 36 | case MATERN3 => new Matern3 37 | case MATERN5 => new Matern5 38 | case MATERN5_ISO => new Matern5Iso 39 | case SQUAREEXP_ISO => new SquareExpIso 40 | case _ => new Matern5 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/kernel/Matern3.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. 
All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.kernel 20 | 21 | import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, _} 22 | import breeze.numerics.{exp, pow, sqrt} 23 | import com.tencent.angel.spark.automl.tuner.math.SquareDist 24 | 25 | /** 26 | * Matern covariance function with v = 3/2 27 | * (1 + sqrt(3)*r/l) * exp(-sqrt(3)*r/l) 28 | * Here r is the distance |x1-x2| of two points 29 | * Hyper-parameter: l is the length scale 30 | */ 31 | case class Matern3() extends Covariance { 32 | 33 | /** 34 | * the covariance function 35 | * 36 | * @param x1 37 | * @param x2 38 | * @param params 39 | * @return 40 | */ 41 | override def cov(x1: BDM[Double], 42 | x2: BDM[Double], 43 | params: BDV[Double]): BDM[Double] = { 44 | 45 | require(params.size == 1, 46 | s"Number of hyper parameters is ${params.length} while expected 1") 47 | 48 | val l = params(0) 49 | 50 | val distMat = SquareDist(x1, x2) 51 | val r = sqrt(distMat) 52 | 53 | val vPart = sqrt(3) * r / l + 1.0 54 | val expPart = exp(-sqrt(3) * r / l) 55 | val covMatrix = vPart *:* expPart 56 | 57 | covMatrix 58 | } 59 | 60 | /** 61 | * the derivative of covariance function against kernel hyper-parameters 62 | * 63 | * @param x1 64 | * @param x2 65 | * @param params 66 | * @return 67 | */ 68 | override def grad(x1: BDM[Double], 69 | x2: BDM[Double], 70 | params: BDV[Double]): Array[BDM[Double]] = { 71 | 72 | require(params.size == 1, 73 | s"Number of hyper parameters is ${params.length} while expected 1") 74 | 75 | val l = params(0) 76 | 77 | val distMat = SquareDist(x1, x2) 78 | val r = sqrt(distMat) 79 | 80 | val vPart = sqrt(3) * r / l + 1.0 81 | val expPart = exp(-sqrt(3) * r / l) 82 | 83 | val vPartGrad = -(sqrt(3) * r / pow(l, 2)) *:* expPart 84 | val expPartGrad = vPart *:* expPart *:* (sqrt(3) * r / pow(l, 2)) 85 | 86 | val gradL = vPartGrad + expPartGrad 87 | 88 | Array(gradL) 89 | } 90 | } 91 | 92 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/kernel/Matern5.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
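 * (A quick sanity-check sketch for the Matern5 kernel defined below, with toy inputs assumed and this file's Breeze aliases: {{{ val x = BDM((1.0, 2.0, 3.0)).t; val K = Matern5().cov(x, x, BDV(1.5)) // 3x3 matrix with 1.0 on the diagonal, since r = 0 there }}})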
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.kernel 20 | 21 | import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} 22 | import breeze.numerics._ 23 | import com.tencent.angel.spark.automl.tuner.math.SquareDist 24 | 25 | /** 26 | * Matern covariance function with v = 5/2 27 | * (1 + sqrt(5)*r/l + 5r^2/(3l^2)) * exp(-sqrt(5)*r/l) 28 | * Here r is the distance |x1-x2| of two points 29 | * Hyper-parameter: l is the length scale 30 | */ 31 | case class Matern5() extends Covariance { 32 | 33 | /** 34 | * the covariance function 35 | * 36 | * @param x1 37 | * @param x2 38 | * @param params 39 | * @return 40 | */ 41 | override def cov(x1: BDM[Double], 42 | x2: BDM[Double], 43 | params: BDV[Double]): BDM[Double] = { 44 | 45 | require(params.size == 1, 46 | s"Number of hyper parameters is ${params.length} while expected 1") 47 | 48 | val l = params(0) 49 | 50 | val distMat = SquareDist(x1, x2) 51 | val r = sqrt(distMat) 52 | 53 | val vPart = sqrt(5) * r / l + 5.0 / 3.0 * distMat / pow(l, 2) + 1.0 54 | val expPart = exp(-sqrt(5) * r / l) 55 | val covMatrix = vPart *:* expPart 56 | 57 | covMatrix 58 | } 59 | 60 | /** 61 | * the derivative of covariance function against kernel hyper-parameters 62 | * 63 | * @param x1 64 | * @param x2 65 | * @param params 66 | * @return 67 | */ 68 | override def grad(x1: BDM[Double], 69 | x2: BDM[Double], 70 | params: BDV[Double]): Array[BDM[Double]] = { 71 | 72 | require(params.size == 1, 73 | s"Number of hyper parameters is ${params.length} while expected 1") 74 | 75 | val l = params(0) 76 | 77 | val distMat = SquareDist(x1, x2) 78 | val r = sqrt(distMat) 79 | 80 | val vPart = sqrt(5) * r / l + 5.0 / 3.0 * distMat / pow(l, 2) + 1.0 81 | val expPart = exp(-sqrt(5) * r / l) 82 | 83 | val vPartGrad = -(sqrt(5) * r / pow(l, 2) + 10.0 * distMat / (3.0 * pow(l, 3))) *:* expPart 84 | val expPartGrad = vPart *:* expPart *:* (sqrt(5) * r / pow(l, 2)) 85 | 86 | val gradL = vPartGrad + expPartGrad 87 | 88 | Array(gradL) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/kernel/Matern5Iso.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.kernel 20 | 21 | import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} 22 | import breeze.numerics._ 23 | import com.tencent.angel.spark.automl.tuner.math.SquareDist 24 | 25 | /** 26 | * Matern covariance function with v = 5/2 and isotropic distance measure 27 | * theta^2 * (1 + sqrt(5)*r/l + 5r^2/(3l^2)) * exp(-sqrt(5)*r/l) 28 | * Here r is the distance |x1-x2| of two points 29 | * Hyper-parameter: theta is the signal variance, l is the length scale 30 | **/ 31 | case class Matern5Iso() extends Covariance { 32 | 33 | /** 34 | * the covariance function 35 | * 36 | * @param x1 37 | * @param x2 38 | * @param params 39 | * @return 40 | */ 41 | override def cov(x1: BDM[Double], 42 | x2: BDM[Double], 43 | params: BDV[Double]): BDM[Double] = { 44 | 45 | require(params.size == 2, 46 | s"Number of hyper parameters is ${params.length} while expected 2") 47 | 48 | val theta = params(0) 49 | val l = params(1) 50 | 51 | val distMat = SquareDist(x1, x2) 52 | val r = sqrt(distMat) 53 | 54 | val vPart = (sqrt(5) * r) / l + distMat / pow(l, 2) * 5.0 / 3.0 + 1.0 55 | val expPart = exp(-sqrt(5) * r / l) 56 | val covMatrix = pow(theta, 2) * vPart *:* expPart 57 | // println(covMatrix) 58 | covMatrix 59 | } 60 | 61 | /** 62 | * the derivative of covariance function against kernel hyper-parameters 63 | * 64 | * @param x1 65 | * @param x2 66 | * @param params 67 | * @return 68 | */ 69 | override def grad(x1: BDM[Double], 70 | x2: BDM[Double], 71 | params: BDV[Double]): Array[BDM[Double]] = { 72 | 73 | require(params.size == 2, 74 | s"Number of hyper parameters is ${params.length} while expected 2") 75 | 76 | val theta = params(0) 77 | val l = params(1) 78 | 79 | val distMat = SquareDist(x1, x2) 80 | val r = sqrt(distMat) 81 | 82 | val vPart = sqrt(5) * r / l + 5.0 / 3.0 * distMat / pow(l, 2) + 1.0 83 | val expPart = exp(-sqrt(5) * r / l) 84 | 85 | val vPartGrad = -(sqrt(5) * r / pow(l, 2) + 10.0 * distMat / (3.0 * pow(l, 3))) *:* expPart * pow(theta, 2) 86 | val expPartGrad = vPart *:* expPart *:* (sqrt(5) * r / pow(l, 2)) * pow(theta, 2) 87 | 88 | val gradL = vPartGrad + expPartGrad 89 | val gradTheta = vPart *:* expPart * 2.0 * theta 90 | // println(cov_l_grad) 91 | Array(gradTheta, gradL) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/kernel/SquareExpIso.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | package com.tencent.angel.spark.automl.tuner.kernel 18 | 19 | import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} 20 | import breeze.numerics._ 21 | import com.tencent.angel.spark.automl.tuner.math.SquareDist 22 | 23 | /** 24 | * Square exponential covariance function with isotropic distance measure 25 | * k(x1, x2) = theta^2 * exp( -(x1-x2)^2 / (2 * l^2) ) 26 | * Hyper-parameter: theta is the signal variance, l is the length scale 27 | **/ 28 | case class SquareExpIso() extends Covariance { 29 | 30 | /** 31 | * the covariance function 32 | * 33 | * @param x1 34 | * @param x2 35 | * @param params 36 | * @return 37 | */ 38 | override def cov(x1: BDM[Double], 39 | x2: BDM[Double], 40 | params: BDV[Double]): BDM[Double] = { 41 | 42 | require(params.size == 2, 43 | s"Number of hyper parameters is ${params.length} while expected 2") 44 | 45 | val theta = params(0) 46 | val l = params(1) 47 | 48 | val distMat = SquareDist(x1, x2) 49 | 50 | val covMatrix = pow(theta, 2) * exp(-0.5 * distMat / pow(l, 2)) 51 | 52 | covMatrix 53 | } 54 | 55 | /** 56 | * the derivative of covariance function against kernel hyper-parameters 57 | * 58 | * @param x1 59 | * @param x2 60 | * @param params 61 | * @return 62 | */ 63 | override def grad(x1: BDM[Double], 64 | x2: BDM[Double], 65 | params: BDV[Double]): Array[BDM[Double]] = { 66 | 67 | require(params.size == 2, 68 | s"Number of hyper parameters is ${params.length} while expected 2") 69 | 70 | val theta = params(0) 71 | val l = params(1) 72 | 73 | val distMat = SquareDist(x1, x2) 74 | val r = sqrt(distMat) 75 | 76 | val expDistMat = exp(-0.5 * distMat / pow(l, 2)) 77 | 78 | val gradTheta = 2 * theta * expDistMat 79 | 80 | val gradL = pow(theta, 2) * expDistMat *:* distMat / pow(l, 3) 81 | 82 | Array(gradTheta, gradL) 83 | } 84 | } 85 | 86 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/math/BreezeOp.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License.
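 * (A sketch of the Cholesky-based inverse defined below, with a toy SPD matrix assumed: {{{ val A = BDM((4.0, 1.0), (1.0, 3.0)); val invA = BreezeOp.choleskyInv(cholesky(A).t) // invA * A is approximately the identity; GPModel calls it with the upper factor in the same way }}})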
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.math 20 | 21 | import breeze.linalg.{cholesky, diag, inv, sum, trace, DenseMatrix => BDM, DenseVector => BDV} 22 | import breeze.numerics.log 23 | 24 | import scala.math.Pi 25 | 26 | object BreezeOp { 27 | 28 | /** 29 | * calculate the inverse of a matrix with cholesky decomposition 30 | * 31 | * @param L : the Cholesky decomposition of matrix A where A = L'*L 32 | * @return inv(A)=inv(L)*inv(L') 33 | */ 34 | def choleskyInv(L: BDM[Double]): BDM[Double] = { 35 | val invL = inv(L) 36 | invL * invL.t 37 | } 38 | 39 | /** 40 | * sum of log diag of positive definite matrices 41 | * 42 | * @param L 43 | * @return 44 | */ 45 | def sumLogDiag(L: BDM[Double]): Double = { 46 | 2 * sum(log(diag(L))) 47 | } 48 | 49 | def logLike(meanX: BDV[Double], 50 | KXX: BDM[Double], 51 | invKXX: BDM[Double], 52 | y: BDV[Double]): Double = { 53 | 54 | val m = meanX 55 | 56 | val logDiag = sumLogDiag(cholesky(KXX)) 57 | 58 | val value = -0.5 * (y - m).t * invKXX * (y - m) - 0.5 * logDiag - 0.5 * meanX.size * scala.math.log(2 * Pi) 59 | 60 | value(0) 61 | } 62 | 63 | def logLikeD(meanX: BDV[Double], 64 | invKXX: BDM[Double], 65 | y: BDV[Double], 66 | covGrads: Array[BDM[Double]]): BDV[Double] = { 67 | 68 | val m = meanX 69 | val alpha = invKXX * (y - m) 70 | 71 | val grads = covGrads.map { covGrad => 72 | val tmp = alpha * alpha.t - invKXX 73 | 0.5 * trace(tmp * covGrad) 74 | } 75 | 76 | BDV(grads) 77 | } 78 | 79 | def cartesian(A: Array[Double], B: Array[Double]) = for (a <- A; b <- B) yield { 80 | Array(a, b) 81 | } 82 | 83 | def cartesian(A: Array[Array[Double]], B: Array[Double]) = for (a <- A; b <- B) yield { 84 | (a.toBuffer += b).toArray 85 | } 86 | } -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/math/SquareDist.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.math 20 | 21 | import breeze.generic.UFunc 22 | import breeze.linalg.{DenseMatrix => BDM, _} 23 | 24 | /** 25 | * Computes pair-wise square distances between matrices x1 and x2. 
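 * For example (a sketch; each row is a 1-D point): SquareDist(BDM((0.0, 3.0)).t, BDM((0.0, 4.0)).t) yields ((0.0, 16.0), (9.0, 1.0)).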
26 | * 27 | * @param x1 [N x D] 28 | * @param x2 [M x D] 29 | * @return matrix of square distances [N x M] 30 | */ 31 | object SquareDist extends UFunc { 32 | 33 | implicit object implBinary 34 | extends Impl2[BDM[Double], BDM[Double], BDM[Double]] { 35 | 36 | def apply(x1: BDM[Double], 37 | x2: BDM[Double]): BDM[Double] = { 38 | 39 | val t1 = -2.0 * (x1 * x2.t) 40 | 41 | val t2 = t1(*, ::) + sum(x2.t *:* x2.t, Axis._0).t 42 | 43 | t2(::, *) + sum(x1.t *:* x1.t, Axis._0).t 44 | } 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/model/GPExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.model 20 | 21 | import breeze.linalg.{DenseMatrix, DenseVector} 22 | import com.tencent.angel.spark.automl.tuner.kernel.Matern5Iso 23 | 24 | object GPExample { 25 | 26 | def main(args: Array[String]): Unit = { 27 | 28 | val X = DenseMatrix((1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)).t 29 | val y = 2.0 * DenseVector(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0) 30 | val z = DenseMatrix((2.5, 4.5, 6.5, 8.5, 10.0, 12.0)).t 31 | val truePredZ = 2.0 * DenseVector(2.5, 4.5, 6.5, 8.5, 10.0, 12.0) 32 | 33 | // //2.Test no_linear(y=cos(x)+1) 34 | // val X = DenseMatrix((1.0,2.0, 3.0,4.0,5.0,6.0,7.0,8.0,9.0)).t 35 | // val y = cos(DenseVector(1.0,2.0, 3.0,4.0,5.0,6.0,7.0,8.0,9.0))+1.0 36 | // val z = DenseMatrix((2.5, 4.5,6.5,8.5,10.0,12.0)).t 37 | // val truePredZ = cos(DenseVector(2.5, 4.5,6.5,8.5,10.0,12.0))+1.0 38 | 39 | // //3.Test no_linear(y=x^2) 40 | // val X = DenseMatrix((1.0,2.0, 3.0,4.0,5.0,6.0,7.0,8.0,9.0)).t 41 | // val y = DenseVector(1.0,4.0, 9.0,16.0,25.0,36.0,49.0,64.0,81.0) 42 | // val z = DenseMatrix((2.5, 4.5,6.5,8.5,10.0,12.0)).t 43 | // val truePredZ = pow(z,2) 44 | 45 | //val covFunc = SquareExpIso() 46 | val covFunc = Matern5Iso() 47 | val initCovParams = DenseVector(1.0, 1.0) 48 | val initNoiseStdDev = 0.01 49 | 50 | val gpModel = GPModel(covFunc, initCovParams, initNoiseStdDev) 51 | 52 | gpModel.fit(X, y) 53 | 54 | println("Fitted covariance function params:") 55 | println(gpModel.covParams) 56 | println("Fitted noiseStdDev:") 57 | println(gpModel.noiseStdDev) 58 | println("\n") 59 | 60 | val prediction = gpModel.predict(z) 61 | println("Mean and Var:") 62 | println(prediction) 63 | println("True value:") 64 | println(truePredZ) 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/model/GPKernelDiffFunc.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent 
is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.model 20 | 21 | import breeze.linalg.{MatrixNotSymmetricException, NotConvergedException, DenseMatrix => BDM, DenseVector => BDV} 22 | import breeze.optimize.DiffFunction 23 | import com.tencent.angel.spark.automl.tuner.math.BreezeOp 24 | 25 | class GPKernelDiffFunc(model: GPModel) extends DiffFunction[BDV[Double]] { 26 | 27 | var iter: Int = _ 28 | 29 | override def calculate(params: BDV[Double]): (Double, BDV[Double]) = { 30 | 31 | try { 32 | //println(s"------iteration $iter------") 33 | val covParams = BDV(params.toArray.dropRight(1)) 34 | model.covParams = covParams 35 | val noiseStdDev = params.toArray.last 36 | model.noiseStdDev = noiseStdDev 37 | //println(s"covariance params: $covParams") 38 | //println(s"standard derivative: $noiseStdDev") 39 | 40 | val meanX = model.meanFunc(model.X) 41 | val KXX = model.calKXX() 42 | 43 | //println(s"meanX: $meanX") 44 | //println(s"KXX: $KXX") 45 | 46 | val invKXX = model.calInvKXX(KXX) 47 | //println("inverse of KXX:") 48 | //println(invKXX) 49 | 50 | //println("true inverse of KXX:") 51 | //println(inv(KXX)) 52 | 53 | val loglikeLoss = -BreezeOp.logLike(meanX, KXX, invKXX, model.y) 54 | //println(s"log likelihood loss: $loglikeLoss") 55 | 56 | // calculate partial derivatives 57 | val covarFuncGrads = model.covFunc.grad(model.X, model.X, covParams) 58 | //println("covariance grads:") 59 | //covarFuncGrads.foreach(println) 60 | 61 | val covarNoiseGrad = 2 * noiseStdDev * BDM.eye[Double](model.X.rows) 62 | //println("covariance noise grads:") 63 | //println(covarNoiseGrad) 64 | 65 | val allGrads = covarFuncGrads :+ covarNoiseGrad 66 | 67 | val loglikeGrads = BreezeOp.logLikeD(meanX, invKXX, model.y, allGrads).map(d => -d) 68 | //println(s"grad of covariance params: $loglikeGrads") 69 | 70 | iter = iter + 1 71 | 72 | (loglikeLoss, loglikeGrads) 73 | } catch { 74 | case e: NotConvergedException => 75 | //println(s"not converge exception $e") 76 | //(Double.NaN, BDV.zeros[Double](params.size) * Double.NaN) 77 | throw e 78 | case e: MatrixNotSymmetricException => 79 | println(s"matrix not symmetric exception $e") 80 | (Double.NaN, BDV.zeros[Double](params.size) * Double.NaN) 81 | throw e 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/model/GPModel.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 
5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.model 20 | 21 | import breeze.linalg.{Axis, MatrixNotSymmetricException, cholesky, diag, DenseMatrix => BDM, DenseVector => BDV} 22 | import breeze.optimize.LBFGS 23 | import com.tencent.angel.spark.automl.tuner.kernel.{Covariance, CovarianceType} 24 | import com.tencent.angel.spark.automl.tuner.math.BreezeOp 25 | 26 | import scala.math._ 27 | 28 | class GPModel(val covFunc: Covariance, 29 | var covParams: BDV[Double], 30 | var noiseStdDev: Double, 31 | val meanFunc: (BDM[Double]) => BDV[Double]) { 32 | 33 | var X: BDM[Double] = _ 34 | var y: BDV[Double] = _ 35 | var KXX: BDM[Double] = _ 36 | var L: BDM[Double] = _ 37 | 38 | def remove(idx: Int): Unit = { 39 | } 40 | 41 | def fit(newX: BDM[Double], 42 | newy: BDV[Double]): Boolean = { 43 | require(newX.rows == newy.length, "incompatible size of the input X and y") 44 | 45 | var trainSuccess = true 46 | 47 | if ((X == null && y == null) || 48 | (newX.rows > X.rows && newy.length > y.length)) { 49 | X = newX 50 | y = newy 51 | } 52 | 53 | val kernelDiffFunc = new GPKernelDiffFunc(this) 54 | val initParams = BDV(covParams.toArray :+ noiseStdDev) 55 | //println(s"init params: ${initParams}") 56 | 57 | var newParams = initParams 58 | val optimizer = new LBFGS[BDV[Double]](maxIter = 10, m = 7, tolerance = 1e-10) 59 | //val optimizer = new SimpleSGD[BDV[Double]](1, 10) 60 | try { 61 | newParams = optimizer.minimize(kernelDiffFunc, initParams) 62 | } catch { 63 | case _: breeze.linalg.NotConvergedException | _: MatrixNotSymmetricException => 64 | //println(s"Breeze Not Converged Exception") 65 | newParams = initParams 66 | trainSuccess = false 67 | X = X.delete(X.rows - 1, Axis._0) 68 | y = y.slice(0, y.length - 1) 69 | } 70 | 71 | // println(optimizer) 72 | // println(s"new params: ${newParams}") 73 | // if(!checkParam(newParams)) { 74 | // newParams = initParams 75 | // println(s"reset to init params: ${newParams}") 76 | // trainSuccess = false 77 | // println(s"history size: ${X.rows} ${y.length}") 78 | // X = X.delete(X.rows - 1, Axis._0) 79 | // y = y.slice(0, y.length - 1) 80 | // println(s"history size: ${X.rows} ${y.length}") 81 | // } 82 | 83 | val newCovParams = BDV(newParams.toArray.dropRight(1)) 84 | val newNoiseStdDev = newParams.toArray.last 85 | 86 | this.covParams = newCovParams 87 | this.noiseStdDev = newNoiseStdDev 88 | 89 | trainSuccess 90 | } 91 | 92 | def checkParam(params: BDV[Double]): Boolean = { 93 | var isValid = true 94 | params.values.foreach { param: Double => 95 | if (param.isNaN || param.isInfinity) 96 | isValid = false 97 | } 98 | isValid 99 | } 100 | 101 | def update(newX: BDM[Double], 102 | newy: BDV[Double]): this.type = { 103 | this 104 | } 105 | 106 | def predict(newX: BDM[Double]): BDM[Double] = { 107 | if (X == null || y == null) { 108 | BDM.zeros(newX.rows, cols = 2) 109 | } else { 110 | val meanX = meanFunc(X) 111 | 112 | val KXX = calKXX() 113 | 114 | val 
invKXX = calInvKXX(KXX) 115 | 116 | val KXZ = covFunc.cov(X, newX, covParams) 117 | 118 | val KZZ = covFunc.cov(newX, newX, covParams) 119 | 120 | val meanNewX = meanFunc(newX) 121 | 122 | val predMean = meanNewX + KXZ.t * (invKXX * (y - meanX)) 123 | val predVar = diag(KZZ - KXZ.t * invKXX * KXZ).map { v => 124 | if (v < -1e-12 | v.isNaN | v.isInfinite) 0 else v 125 | } 126 | 127 | BDV.horzcat(predMean, predVar) 128 | } 129 | } 130 | 131 | def calKXX(): BDM[Double] = { 132 | val KXX = covFunc.cov(X, X, covParams) + 133 | pow(noiseStdDev, 2) * BDM.eye[Double](X.rows) 134 | //+ BDM.eye[Double](X.rows) * 1e-7 135 | 136 | KXX 137 | } 138 | 139 | def calInvKXX(KXX: BDM[Double]): BDM[Double] = { 140 | val l = cholesky(KXX) 141 | val invKXX = BreezeOp.choleskyInv(l.t) 142 | 143 | invKXX 144 | } 145 | } 146 | 147 | object GPModel { 148 | 149 | def apply(covFunc: Covariance, 150 | covParams: BDV[Double], 151 | noiseStdDev: Double, 152 | meanFunc: (BDM[Double]) => BDV[Double]): GPModel = { 153 | new GPModel(covFunc, covParams, noiseStdDev, meanFunc) 154 | } 155 | 156 | def apply(covFunc: Covariance, 157 | covParams: BDV[Double], 158 | noiseStdDev: Double, 159 | mean: Double = 0.0): GPModel = { 160 | val meanFunc = (x: BDM[Double]) => BDV.zeros[Double](x.rows) + mean 161 | new GPModel(covFunc, covParams, noiseStdDev, meanFunc) 162 | } 163 | 164 | def apply(covName: String, 165 | covParams: BDV[Double], 166 | noiseStdDev: Double, 167 | meanFunc: (BDM[Double]) => BDV[Double]): GPModel = { 168 | new GPModel(CovarianceType.fromString(covName), covParams, noiseStdDev, meanFunc) 169 | } 170 | 171 | def apply(covType: CovarianceType.Value, 172 | covParams: BDV[Double], 173 | noiseStdDev: Double, 174 | meanFunc: (BDM[Double]) => BDV[Double]): GPModel = { 175 | new GPModel(CovarianceType.fromString(covType), covParams, noiseStdDev, meanFunc) 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/parameter/ContinuousSpace.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.parameter 20 | 21 | import com.tencent.angel.spark.automl.utils.{AutoMLException, Distribution} 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | import scala.util.Random 25 | 26 | /** 27 | * 28 | * @param name : Name of the parameter 29 | * @param lower : Start of the continuous space included. 30 | * @param upper : End of the continuous space included. 31 | * @param num : Sampling count if possible. 
32 | * @param distribution : Distribution used to generate grid values. Default: LINEAR 33 | */ 34 | class ContinuousSpace( 35 | override val name: String, 36 | var lower: Double, 37 | var upper: Double, 38 | var num: Int, 39 | distribution: Distribution.Value = Distribution.LINEAR, 40 | override val doc: String = "continuous param space") extends ParamSpace[Double](name, doc) { 41 | 42 | private val helper: String = "supported format of continuous parameter: [0,1] or [0:1:100]" 43 | 44 | override val pType: String = "continuous" 45 | override val vType: String = "double" 46 | 47 | def this(name: String, lower: Double, upper: Double) = { 48 | this(name, lower, upper, -1) 49 | } 50 | 51 | def this(name: String, config: String) = { 52 | this(name, 0, 1, -1) 53 | val items = parseConfig(config) 54 | lower = items._1 55 | upper = items._2 56 | num = items._3 57 | resetGrid(num) 58 | } 59 | 60 | require(lower < upper, s"lower bound should be less than upper bound") 61 | 62 | val rd = new Random() 63 | 64 | var isGrid: Boolean = false 65 | var gridValues: Array[Double] = _ 66 | 67 | def parseConfig(input: String): (Double, Double, Int) = { 68 | assert(input.startsWith("[") && input.endsWith("]")) 69 | val config = input.substring(1, input.length - 1) 70 | val ret: (Double, Double, Int) = config.trim match { 71 | case _ if config.contains(",") => 72 | val splits = config.split(',') 73 | splits.length match { 74 | case 2 => (splits(0).toDouble, splits(1).toDouble, -1) 75 | case _ => throw new AutoMLException(s"invalid continuous, $helper") 76 | } 77 | case _ if config.contains(":") => 78 | val splits = config.split(':') 79 | splits.length match { 80 | case 3 => (splits(0).toDouble, splits(1).toDouble, splits(2).toInt) 81 | case _ => throw new AutoMLException(s"invalid continuous, $helper") 82 | } 83 | case _ => throw new AutoMLException(s"invalid continuous, $helper") 84 | } 85 | ret 86 | } 87 | 88 | def getGridValues(num: Int): Array[Double] = { 89 | var ret: ArrayBuffer[Double] = ArrayBuffer[Double]() 90 | distribution match { 91 | case Distribution.LINEAR => 92 | val interval: Double = (upper - lower) / (num - 1) 93 | (0 until num).foreach { i => 94 | ret += lower + i * interval 95 | } 96 | case _ => println(s"Distribution $distribution not supported") 97 | } 98 | ret.toArray 99 | } 100 | 101 | def resetGrid(numGrid: Int): Unit = { 102 | num = numGrid 103 | isGrid = if (numGrid < 0) false else true 104 | gridValues = if (isGrid) getGridValues(numGrid) else Array.empty 105 | } 106 | 107 | def getLower: Double = lower 108 | 109 | def getUpper: Double = upper 110 | 111 | def getValues: Array[Double] = gridValues 112 | 113 | def numValues: Int = if (isGrid) gridValues.length else Int.MaxValue 114 | 115 | def toGridSearch: ParamSpace[Double] = this 116 | 117 | def toRandomSpace: ParamSpace[Double] = this 118 | 119 | override def sample(size: Int): List[Double] = List.fill[Double](size)(sampleOne) 120 | 121 | def sampleOne(): Double = { 122 | if (isGrid) 123 | gridValues(rd.nextInt(numValues)) 124 | else 125 | lower + (upper - lower) * rd.nextDouble() 126 | } 127 | 128 | override def toString: String = 129 | if (isGrid) 130 | s"ContinuousSpace[$name]: (${gridValues mkString (",")})" 131 | else s"ContinuousSpace[$name]: ($lower -> $upper)" 132 | } 133 | 134 | object ContinuousSpace { 135 | 136 | def apply(name: String, config: String) = { 137 | new ContinuousSpace(name, config) 138 | } 139 | } -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/parameter/DiscreteSpace.scala:
-------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.parameter 20 | 21 | import com.tencent.angel.spark.automl.utils.AutoMLException 22 | 23 | import scala.reflect.{ClassTag, _} 24 | import scala.util.Random 25 | 26 | /** 27 | * Search space with discrete values 28 | * 29 | * @param name : Name of the parameter 30 | * @param values : List of all possible values 31 | */ 32 | class DiscreteSpace[T <: AnyVal : ClassTag]( 33 | override val name: String, 34 | var values: Array[T], 35 | override val doc: String = "discrete param") extends ParamSpace[T](name, doc) { 36 | 37 | private val helper: String = "supported format of discrete parameter: {0.1,0.2,0.3,0.4} or {0.1:1:0.1}" 38 | 39 | override val pType: String = "discrete" 40 | override val vType = classTag[T].runtimeClass.getSimpleName.toLowerCase 41 | 42 | def this(name: String, config: String, doc: String) = { 43 | this(name, Array.empty[T], doc) 44 | this.values = parseConfig(config) 45 | } 46 | 47 | def this(name: String, config: String) = { 48 | this(name, config, "discrete param") 49 | } 50 | 51 | def parseConfig(input: String): Array[T] = { 52 | assert(input.startsWith("{") && input.endsWith("}")) 53 | val config = input.substring(1, input.length - 1) 54 | val values: Array[T] = config.trim match { 55 | case _ if config.contains(",") => 56 | config.split(',').map(asType) 57 | case _ if config.contains(":") => 58 | val splits = config.split(':') 59 | splits.length match { 60 | case 2 => (splits(0).toDouble to splits(1).toDouble by 1.0f).toArray.map(asType) 61 | case 3 => (splits(0).toDouble to splits(1).toDouble by splits(2).toDouble).toArray.map(asType) 62 | case _ => throw new AutoMLException(s"invalid discrete, $helper") 63 | } 64 | case _ => throw new AutoMLException(s"invalid discrete, $helper") 65 | } 66 | values 67 | } 68 | 69 | def asType(s: String): T = { 70 | val c = implicitly[ClassTag[T]].runtimeClass 71 | c match { 72 | case _ if c == classOf[Int] => s.toInt.asInstanceOf[T] 73 | case _ if c == classOf[Long] => s.toLong.asInstanceOf[T] 74 | case _ if c == classOf[Float] => s.toFloat.asInstanceOf[T] 75 | case _ if c == classOf[Double] => s.toDouble.asInstanceOf[T] 76 | case _ => throw new AutoMLException(s"auto param with type ${c} is not supported") 77 | } 78 | } 79 | 80 | def asType(s: Double): T = { 81 | val c = implicitly[ClassTag[T]].runtimeClass 82 | c match { 83 | case _ if c == classOf[Int] => s.toInt.asInstanceOf[T] 84 | case _ if c == classOf[Long] => s.toLong.asInstanceOf[T] 85 | case _ if c == classOf[Float] => s.toFloat.asInstanceOf[T] 86 | case _ if c == classOf[Double] => s.toDouble.asInstanceOf[T] 87 | case _ => throw new AutoMLException(s"auto param 
with type ${c} is not supported") 88 | } 89 | } 90 | 91 | def asDouble(num: AnyVal): Double = { 92 | num match { 93 | case i: Int => i.toDouble 94 | case i: Long => i.toDouble 95 | case i: Float => i.toDouble 96 | case i: Double => i 97 | case _ => throw new AutoMLException(s"type ${num.getClass} is not supported") 98 | } 99 | } 100 | 101 | val rd = new Random() 102 | 103 | def getValues: Array[Double] = values.map(asDouble) 104 | 105 | def numValues: Int = values.length 106 | 107 | def toGridSearch: ParamSpace[T] = this 108 | 109 | def toRandomSpace: ParamSpace[T] = this 110 | 111 | def sample(size: Int): List[T] = { 112 | List.fill[T](size)(sampleOne) 113 | } 114 | 115 | def sampleOne(): T = values(rd.nextInt(numValues)) 116 | 117 | override def toString: String = s"DiscreteSpace[$name]: (${values mkString (",")})" 118 | 119 | } 120 | 121 | object DiscreteSpace { 122 | 123 | def apply[T <: AnyVal : ClassTag](name: String, config: String): DiscreteSpace[T] = { 124 | new DiscreteSpace[T](name, config) 125 | } 126 | 127 | def main(args: Array[String]): Unit = { 128 | val obj = new DiscreteSpace[Int]("test", "{1:10:1}") 129 | println(obj.toString) 130 | println(obj.getValues(1)) 131 | println(obj.sample(2).toString()) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/parameter/ParamParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License.
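 * (A usage sketch for the parser defined below, reusing the example command from its documentation: {{{ val configs = ParamParser.parse("ml.learn.rate|C|double|0.01,1|linear#ml.learn.decay|D|double|0,0.01,0.1"); configs.map(_.getParamName) // Array(ml.learn.rate, ml.learn.decay) }}})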
15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.tuner.parameter 19 | 20 | import com.tencent.angel.spark.automl.utils.AutoMLException 21 | 22 | import scala.beans.BeanProperty 23 | 24 | /** 25 | * parse the auto-tuning configuration passed from the command line 26 | * valid format: PARAM_NAME|PARAM_TYPE|VALUE_TYPE|PARAM_RANGE|OPTIONS, multiple params are separated by # 27 | * example: ml.learn.rate|C|double|0.01,1|linear#ml.learn.decay|D|double|0,0.01,0.1 28 | */ 29 | object ParamParser { 30 | 31 | val helper = "supported format: PARAM_NAME|PARAM_TYPE|VALUE_TYPE|PARAM_RANGE|OPTIONS, OPTIONS is optional" 32 | val helper_param_type = "param type should be D, C or CA (D means discrete, C means continuous, and CA means categorical)" 33 | val helper_value_type = "value type should be float, double, int or long" 34 | 35 | val INTER_PARAM_SEP = "#" 36 | val INNER_PARAM_SEP = "\\|" 37 | 38 | def parse(input: String): Array[ParamConfig] = { 39 | separateParams(input).map(parseOneParam) 40 | } 41 | 42 | /** 43 | * separate the config command to a set of parameter config 44 | */ 45 | def separateParams(input: String): Array[String] = { 46 | val params = input.split(INTER_PARAM_SEP) 47 | assert(params.nonEmpty, helper) 48 | params 49 | } 50 | 51 | /** 52 | * parse config for each parameter 53 | */ 54 | def parseOneParam(input: String): ParamConfig = { 55 | val configs = input.split(INNER_PARAM_SEP) 56 | println(s"configs: ${configs.mkString(",")}") 57 | assert(configs.size == 4 || configs.size == 5, helper) 58 | val paramName = getParamName(configs) 59 | val paramType = getParamType(configs) 60 | val valueType = getValueType(configs, paramType) 61 | val paramRange = getParamRange(configs, paramType) 62 | val options = getOptions(configs) 63 | new ParamConfig(paramName, paramType, valueType, paramRange, options) 64 | } 65 | 66 | def getParamName(configs: Array[String]): String = configs(0) 67 | 68 | def getParamType(configs: Array[String]): String = { 69 | val paramType = configs(1).toUpperCase 70 | paramType match { 71 | case "D" => "discrete" 72 | case "C" => "continuous" 73 | case "CA" => "categorical" 74 | case _ => throw new AutoMLException(helper_param_type) 75 | } 76 | } 77 | 78 | def getValueType(configs: Array[String], paramType: String): String = { 79 | val valueType = configs(2).toLowerCase 80 | paramType match { 81 | case "discrete" => 82 | assert(Array("float", "double", "int", "long").contains(valueType), helper_value_type) 83 | valueType 84 | case "continuous" => 85 | "double" 86 | case "categorical" => 87 | valueType 88 | } 89 | } 90 | 91 | def getParamRange(configs: Array[String], paramType: String): String = { 92 | paramType match { 93 | case "discrete" => configs(3).mkString("{", "", "}") 94 | case "continuous" => configs(3).mkString("[", "", "]") 95 | // TODO: use categorical specific format 96 | case "categorical" => configs(3) 97 | } 98 | } 99 | 100 | 101 | def getOptions(configs: Array[String]): Option[String] = { 102 | if (configs.size == 4) 103 | None 104 | else 105 | Some(configs(4)) 106 | } 107 | 108 | } 109 | 110 | class ParamConfig(@BeanProperty var paramName: String, 111 | @BeanProperty var paramType: String, 112 | @BeanProperty var valueType: String, 113 | @BeanProperty var paramRange: String, 114 | @BeanProperty var option: Option[String]) 115 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/parameter/ParamSpace.scala:
-------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.parameter 20 | 21 | import com.tencent.angel.spark.automl.utils.AutoMLException 22 | 23 | import scala.reflect.ClassTag 24 | 25 | 26 | /** 27 | * Base class of a single parameter's search space. 28 | * 29 | * @param name : Name of the parameter 30 | */ 31 | abstract class ParamSpace[+T: ClassTag](val name: String, 32 | val doc: String = "param with search space") { 33 | 34 | val pType: String 35 | 36 | val vType: String 37 | 38 | def sample(size: Int): List[T] 39 | 40 | def sampleOne(): T 41 | 42 | def getValues: Array[Double] 43 | 44 | def numValues: Int 45 | } 46 | 47 | object ParamSpace { 48 | 49 | def fromConfigString(name: String, config: String): ParamSpace[Double] = { 50 | val vType = 51 | if (config.trim.startsWith("[") && config.trim.endsWith("]")) 52 | "continuous" 53 | else if (config.trim.startsWith("{") && config.trim.endsWith("}")) 54 | "discrete" 55 | else "none" 56 | vType match { 57 | case "continuous" => ContinuousSpace(name, config) 58 | case "discrete" => DiscreteSpace[Double](name, config) 59 | case _ => throw new AutoMLException(s"auto param config is not supported") 60 | } 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/solver/SolverWithTrail.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
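 * (A sketch of driving the optimization loop defined below; the solver value is assumed to be built from a ConfigurationSpace elsewhere, and TestTrail is the toy trail shipped in the trail package: {{{ val (best, metric) = new SolverWithTrail(solver, new TestTrail()).run(numIter = 10) }}})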
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.solver 20 | 21 | import com.tencent.angel.spark.automl.tuner.config.Configuration 22 | import com.tencent.angel.spark.automl.tuner.trail.Trail 23 | import org.apache.spark.ml.linalg.Vector 24 | 25 | class SolverWithTrail(val solver: Solver, val trail: Trail) { 26 | 27 | /** 28 | * The main Bayesian optimization loop 29 | * 30 | * @param numIter : Number of iterations 31 | * @param X : Initial data points that are already evaluated 32 | * @param Y : Initial function values of the already evaluated points 33 | * @return Incumbent and function value of the incumbent 34 | */ 35 | def run(numIter: Int, X: Array[Configuration] = null, Y: Array[Double] = null): (Vector, Double) = { 36 | if (X != null && Y != null && X.size == Y.size) 37 | solver.feed(X, Y) 38 | (0 until numIter).foreach { iter => 39 | println(s"------iteration $iter starts------") 40 | val configs: Array[Configuration] = solver.suggest() 41 | val results: Array[Double] = trail.evaluate(configs) 42 | solver.feed(configs, results) 43 | } 44 | solver.surrogate.curBest 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/surrogate/GPSurrogate.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.surrogate 20 | 21 | import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} 22 | import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace 23 | import com.tencent.angel.spark.automl.tuner.kernel.Matern5Iso 24 | import com.tencent.angel.spark.automl.tuner.model.GPModel 25 | import com.tencent.angel.spark.automl.utils.DataUtils 26 | import org.apache.commons.logging.{Log, LogFactory} 27 | import org.apache.spark.ml.linalg.Vector 28 | 29 | class GPSurrogate( 30 | override val cs: ConfigurationSpace, 31 | override val minimize: Boolean = true) 32 | extends Surrogate(cs, minimize) { 33 | 34 | override val LOG: Log = LogFactory.getLog(classOf[GPSurrogate]) 35 | 36 | val covFunc = Matern5Iso() 37 | val initCovParams = BDV(1.0, 1.0) 38 | val initNoiseStdDev = 0.1 39 | val gpModel: GPModel = GPModel(covFunc, initCovParams, initNoiseStdDev) 40 | 41 | /** 42 | * Train the surrogate on curX and curY.
43 | */ 44 | override def train(): Unit = { 45 | val breezeX: BDM[Double] = DataUtils.toBreeze(preX.toArray) 46 | val breezeY: BDV[Double] = DataUtils.toBreeze(preY.toArray) 47 | val success = gpModel.fit(breezeX, breezeY) 48 | if (!success) { 49 | preX.remove(preX.length - 1) 50 | preY.remove(preY.length - 1) 51 | println(s"drop the new configuration owing to convergence failure.") 52 | } 53 | 54 | /*println("Fitted covariance function params:") 55 | println(gpModel.covParams) 56 | println("Fitted noiseStdDev:") 57 | println(gpModel.noiseStdDev) 58 | println("\n")*/ 59 | 60 | } 61 | 62 | /** 63 | * Predict means and variances for a single given X. 64 | * 65 | * @param X 66 | * @return a tuple of (mean, variance) 67 | */ 68 | override def predict(X: Vector): (Double, Double) = { 69 | val breezeX = DataUtils.toBreeze(X).toDenseMatrix 70 | 71 | val pred = gpModel.predict(breezeX) 72 | 73 | //println(s"predict of ${X.toArray.mkString(",")}: mean[${pred(0, 0)}] variance[${pred(0, 1)}]") 74 | 75 | (pred(0, 0), pred(0, 1)) 76 | } 77 | 78 | override def stop(): Unit = { 79 | 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/surrogate/NormalSurrogate.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.surrogate 20 | 21 | import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace 22 | import org.apache.spark.ml.linalg.Vector 23 | 24 | 25 | class NormalSurrogate(override val cs: ConfigurationSpace, 26 | override val minimize: Boolean = true) extends Surrogate(cs, minimize) { 27 | 28 | override def update(X: Array[Vector], Y: Array[Double]): Unit = { 29 | preX ++= X 30 | preY ++= Y 31 | } 32 | 33 | /** 34 | * NormalSurrogate is designed for random-search and grid-search 35 | * Thus it doesn't need train and predict function 36 | */ 37 | override def train(): Unit = {} 38 | 39 | 40 | def predict(X: Vector): (Double, Double) = { 41 | (0.0, 0.0) 42 | } 43 | 44 | override def stop(): Unit = {} 45 | 46 | } -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/surrogate/RFSurrogate.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. 
You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.surrogate 20 | 21 | import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace 22 | import com.tencent.angel.spark.automl.utils.DataUtils 23 | import org.apache.commons.logging.{Log, LogFactory} 24 | import org.apache.spark.ml.linalg.Vector 25 | import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, RandomForestRegressionModel, RandomForestRegressor} 26 | import org.apache.spark.sql.{DataFrame, SparkSession} 27 | 28 | class RFSurrogate( 29 | override val cs: ConfigurationSpace, 30 | override val minimize: Boolean = true) 31 | extends Surrogate(cs, minimize) { 32 | 33 | override val LOG: Log = LogFactory.getLog(classOf[RFSurrogate]) 34 | 35 | var model: RandomForestRegressionModel = _ 36 | val numTrees: Int = 5 37 | val maxDepth: Int = 2 38 | 39 | val ss = SparkSession.builder() 40 | .master("local") 41 | .appName("test") 42 | .getOrCreate() 43 | 44 | ss.sparkContext.setLogLevel("ERROR") 45 | 46 | override def train(): Unit = { 47 | 48 | if (preX.size < Math.pow(2, maxDepth - 1)) 49 | return 50 | 51 | val data: DataFrame = DataUtils.parse(ss, schema, preX.toArray, preY.toArray) 52 | 53 | 54 | val rf = new RandomForestRegressor() 55 | .setLabelCol("label") 56 | .setFeaturesCol("features") 57 | .setNumTrees(numTrees) 58 | .setMaxDepth(maxDepth) 59 | 60 | model = rf.fit(data) 61 | } 62 | 63 | /** 64 | * Predict means and variances for a single given X. 65 | * 66 | * @param X 67 | * @return a tuple of (mean, variance) 68 | */ 69 | override def predict(X: Vector): (Double, Double) = { 70 | 71 | if (preX.size < Math.pow(2, maxDepth - 1)) { 72 | return (0.0, 0.0) 73 | } 74 | 75 | val preds = model.trees.map { tree: DecisionTreeRegressionModel => 76 | val pred = tree.transform(DataUtils.parse(ss, schema, X)) 77 | pred.select("prediction").first().getDouble(0) 78 | } 79 | 80 | //println(s"tree predictions of ${X.toArray.mkString(",")}: ${preds.mkString(",")}") 81 | 82 | val mean: Double = preds.sum / preds.length 83 | val variance = preds.map(x => Math.pow(x - mean, 2)).sum / preds.length 84 | 85 | //println(s"predict of ${X.toArray.mkString(",")}: mean[$mean] variance[$variance]") 86 | 87 | (mean, variance) 88 | } 89 | 90 | override def stop(): Unit = { 91 | ss.stop 92 | } 93 | 94 | } -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/surrogate/Surrogate.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. 
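 * (A sketch of the feed/predict cycle on a concrete surrogate; cs and the candidate vector v are assumed to be built elsewhere: {{{ val sur: Surrogate = new GPSurrogate(cs); sur.update(v, 0.8); val (mean, variance) = sur.predict(v) }}})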
You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.surrogate 20 | 21 | import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace 22 | import org.apache.commons.logging.{Log, LogFactory} 23 | import org.apache.spark.ml.linalg.Vector 24 | import org.apache.spark.sql.types.{DataTypes, StructField, StructType} 25 | 26 | import scala.collection.mutable.ArrayBuffer 27 | 28 | /** 29 | * Abstract base class for surrogate model. 30 | * 31 | * @param cs : the configuration space; @param minimize : true if the target is to be minimized 32 | */ 33 | abstract class Surrogate( 34 | val cs: ConfigurationSpace, 35 | val minimize: Boolean = true) { 36 | 37 | var fields: ArrayBuffer[StructField] = new ArrayBuffer[StructField]() 38 | fields += DataTypes.createStructField("label", DataTypes.DoubleType, false) 39 | fields += DataTypes.createStructField("features", DataTypes.createArrayType(DataTypes.DoubleType), false) 40 | 41 | val schema: StructType = StructType( 42 | StructField("label", DataTypes.DoubleType, nullable = false) :: 43 | StructField("features", DataTypes.createArrayType(DataTypes.DoubleType), false) :: 44 | Nil) 45 | 46 | val LOG: Log = LogFactory.getLog(classOf[Surrogate]) 47 | 48 | // Previous input data points, (N, D) 49 | var preX: ArrayBuffer[Vector] = new ArrayBuffer[Vector]() 50 | // Previous target values, (N, ); stored negated when minimize = true 51 | var preY: ArrayBuffer[Double] = new ArrayBuffer[Double]() 52 | 53 | /** 54 | * Train the surrogate on preX and preY. 55 | */ 56 | def train(): Unit 57 | 58 | /** 59 | * Train the surrogate on X and Y. 60 | * 61 | * @param X : (N, D), input data points. 62 | * @param Y : (N, 1), the corresponding target values. 63 | */ 64 | def train(X: Array[Vector], Y: Array[Double]): Unit = { 65 | preX.clear 66 | preY.clear 67 | preX ++= X 68 | preY ++= Y 69 | train 70 | } 71 | 72 | /** 73 | * Update the surrogate with more X and Y. 74 | * 75 | * @param X : (N, D), new input data points. 76 | * @param Y : (N, 1), the corresponding target values. 77 | */ 78 | def update(X: Array[Vector], Y: Array[Double]): Unit = { 79 | if (!X.isEmpty && !Y.isEmpty) { 80 | X.zip(Y).foreach(tuple => print(tuple._1, tuple._2)) 81 | preX ++= X 82 | preY ++= Y 83 | train 84 | } 85 | } 86 | 87 | def print(X: Vector, y: Double): Unit = { 88 | println(s"update surrogate with X[${X.toArray.mkString("(", ",", ")")}] " + 89 | s"and Y[${if (minimize) -y else y}]") 90 | } 91 | 92 | def update(X: Vector, y: Double): Unit = { 93 | print(X, y) 94 | preX += X 95 | preY += y 96 | train 97 | } 98 | 99 | /** 100 | * Predict means and variances for given X. 101 | * 102 | * @param X : (N, D), input data points. 103 | * @return tuples of (mean, variance) 104 | */ 105 | def predict(X: Array[Vector]): Array[(Double, Double)] = { 106 | X.map(predict) 107 | } 108 | 109 | /** 110 | * Predict means and variances for a single given X.
111 | * 112 | * @param X : the input configuration vector 113 | * @return a tuple of (mean, variance) 114 | */ 115 | def predict(X: Vector): (Double, Double) 116 | 117 | def stop(): Unit 118 | 119 | def curBest: (Vector, Double) = { 120 | if (minimize) curMin else curMax 121 | } 122 | 123 | def curMin: (Vector, Double) = { 124 | if (preY.isEmpty) 125 | (null, Double.MaxValue) 126 | else { 127 | val maxIdx: Int = preY.zipWithIndex.max._2 // preY holds negated targets when minimize = true, so its maximum marks the true minimum 128 | (preX(maxIdx), -preY(maxIdx)) 129 | } 130 | } 131 | 132 | def curMax: (Vector, Double) = { 133 | if (preY.isEmpty) 134 | (null, Double.MinValue) 135 | else { 136 | val maxIdx: Int = preY.zipWithIndex.max._2 137 | (preX(maxIdx), preY(maxIdx)) 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/surrogate/SurrogateMode.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl.tuner.surrogate 19 | 20 | 21 | object SurrogateMode extends Enumeration { 22 | 23 | type SurrogateMode = Value 24 | 25 | val GP = Value("GaussianProcess") 26 | val RF = Value("RandomForest") 27 | val RANDOM = Value("Random") 28 | val GRID = Value("Grid") 29 | 30 | def fromString(mode: String): SurrogateMode = { 31 | SurrogateMode.withName(mode) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/trail/TestRunner.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License.
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.trail 20 | 21 | import com.github.fommil.netlib.F2jBLAS 22 | import com.tencent.angel.spark.automl.tuner.config.Configuration 23 | 24 | class TestRunner(config: Configuration) extends TrailRunner(config) { 25 | 26 | override def call(): Double = { 27 | new F2jBLAS().ddot(config.getVector.size, 28 | config.getVector.toDense.values, 29 | 1, 30 | config.getVector.toDense.values, 31 | 1) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/trail/TestTrail.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.trail 20 | 21 | import com.github.fommil.netlib.F2jBLAS 22 | import com.tencent.angel.spark.automl.tuner.config.Configuration 23 | 24 | class TestTrail extends Trail { 25 | 26 | override def evaluate(config: Configuration): Double = { 27 | val ret = new F2jBLAS().ddot(config.getVector.size, 28 | config.getVector.toDense.values, 29 | 1, 30 | config.getVector.toDense.values, 31 | 1) 32 | println(s"evaluate ${config.getVector.toArray.mkString("(", ",", ")")}, result $ret") 33 | ret 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/trail/Trail.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.trail 20 | 21 | import com.tencent.angel.spark.automl.tuner.config.Configuration 22 | 23 | abstract class Trail { 24 | 25 | def evaluate(configs: Array[Configuration]): Array[Double] = configs.map(evaluate) 26 | 27 | def evaluate(config: Configuration): Double 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/tuner/trail/TrailRunner.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.tuner.trail 20 | 21 | import java.util.concurrent.Callable 22 | 23 | import com.tencent.angel.spark.automl.tuner.config.Configuration 24 | 25 | abstract class TrailRunner(var config: Configuration) extends Callable[Double] { 26 | 27 | override def call(): Double 28 | 29 | def setConf(newConf: Configuration): Unit = { 30 | config = newConf 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/utils/ArgsUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.utils 20 | 21 | import scala.collection.mutable 22 | 23 | object ArgsUtil { 24 | 25 | def parse(args: Array[String]): Map[String, String] = { 26 | val cmdArgs = new mutable.HashMap[String, String]() 27 | println("parsing parameter") 28 | for (arg <- args) { 29 | val sepIdx = arg.indexOf(":") 30 | if (sepIdx != -1) { 31 | val k = arg.substring(0, sepIdx).trim 32 | val v = arg.substring(sepIdx + 1).trim 33 | if (v != "" && v != "Nan" && v != null) { 34 | cmdArgs.put(k, v) 35 | println(s"param $k = $v") 36 | } 37 | } 38 | } 39 | cmdArgs.toMap 40 | } 41 | } -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/utils/AutoMLException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | package com.tencent.angel.spark.automl.utils 18 | 19 | class AutoMLException(msg: String) extends Exception(msg) 20 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/utils/DataUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.utils 20 | 21 | import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} 22 | import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} 23 | import org.apache.spark.sql.types.StructType 24 | import org.apache.spark.sql.{DataFrame, SparkSession} 25 | 26 | object DataUtils { 27 | 28 | def parse(ss: SparkSession, 29 | schema: StructType, 30 | X: Array[Vector], 31 | Y: Array[Double]): DataFrame = { 32 | require(X.size == Y.size, 33 | "The size of configurations should be equal to the size of rewards.") 34 | ss.createDataFrame( 35 | Y.zip(X)).toDF("label", "features") 36 | } 37 | 38 | def parse(ss: SparkSession, 39 | schema: StructType, 40 | X: Vector): DataFrame = { 41 | parse(ss, schema, Array(X), Array(0)) 42 | } 43 | 44 | def toBreeze(values: Array[Double]): BDV[Double] = { 45 | new BDV[Double](values) 46 | } 47 | 48 | def toBreeze(vector: Vector): BDV[Double] = vector match { 49 | case sv: SparseVector => new BDV[Double](vector.toDense.values) 50 | case dv: DenseVector => new BDV[Double](dv.values) 51 | } 52 | 53 | def toBreeze(X: Array[Vector]): BDM[Double] = { 54 | val mat = BDM.zeros[Double](X.size, X(0).size) 55 | for (i <- 0 until X.size) { 56 | for (j <- 0 until X(0).size) { 57 | mat(i, j) = X(i)(j) 58 | } 59 | } 60 | mat 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/tencent/angel/spark/automl/utils/Distribution.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl.utils 20 | 21 | object Distribution extends Enumeration { 22 | 23 | type Distribution = Value 24 | 25 | val LINEAR = Value("1") 26 | 27 | def checkExists(distribution: String): Boolean = this.values.exists(_.toString == distribution) 28 | 29 | def printAll(): Unit = this.values.foreach(println) 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/feature/operator/MetadataTransformUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. 
You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package org.apache.spark.ml.feature.operator 19 | 20 | import org.apache.spark.sql.types.{MetadataBuilder, StructField} 21 | 22 | import scala.collection.mutable.ArrayBuffer 23 | 24 | /** 25 | * Utilities to record the derivation (generation) information of each feature in the pipeline. 26 | */ 27 | object MetadataTransformUtils { 28 | 29 | final val DERIVATION: String = "derivation" 30 | 31 | /** 32 | * Create default derivation names for all features. 33 | * 34 | * @param numFeatures number of features 35 | * @return the default names f_0 ... f_(numFeatures - 1) 36 | */ 37 | private def createDerivation(numFeatures: Int): Array[String] = { 38 | val arrayBuffer = ArrayBuffer[String]() 39 | (0 until numFeatures).foreach { i => 40 | arrayBuffer.append("f_" + i.toString) 41 | } 42 | arrayBuffer.toArray 43 | } 44 | 45 | 46 | private def createSelectedDerivation(selectedFeatures: Array[Int]): Array[String] = { 47 | val arrayBuffer = ArrayBuffer[String]() 48 | selectedFeatures.foreach { i => 49 | arrayBuffer.append("f_" + i.toString) 50 | } 51 | arrayBuffer.toArray 52 | } 53 | 54 | 55 | /** 56 | * Cartesian product of two arrays of derivation names. 57 | * @param feature1 derivation names of the first input vector 58 | * @param feature2 derivation names of the second input vector 59 | * @return Array[String] of crossed names, e.g. (f_0 x f_1) 60 | */ 61 | private def cartesianWithArray(feature1: Array[String], feature2: Array[String]): Array[String] = { 62 | val res = ArrayBuffer[String]() 63 | feature1.foreach { f1 => 64 | feature2.foreach { f2 => 65 | res.append("(" + f1 + " x " + f2 + ")") 66 | } 67 | } 68 | res.toArray 69 | } 70 | 71 | /** 72 | * Update the derivation metadata after feature selection. 73 | * @param field the input vector column 74 | * @param selectedIndices indices of the selected features 75 | * @param numFeatures number of features 76 | * @return a MetadataBuilder carrying the selected derivation names 77 | */ 78 | def featureSelectionTransform(field: StructField, // Metadata is private[types] 79 | selectedIndices: Array[Int], 80 | numFeatures: Int): MetadataBuilder = { 81 | val metadata = field.metadata 82 | 83 | var derivation = Array[String]() 84 | if (metadata.contains(DERIVATION)) { 85 | derivation = selectedIndices map metadata.getStringArray(DERIVATION) 86 | } else { 87 | derivation = createDerivation(numFeatures) 88 | } 89 | 90 | new MetadataBuilder().withMetadata(metadata).putStringArray(DERIVATION, derivation) 91 | } 92 | 93 | /** 94 | * Update the derivation metadata after a vector cartesian product. 95 | * @param fields the input vector columns (at least two) 96 | * @param numFeatures number of features 97 | * @return a MetadataBuilder carrying the crossed derivation names 98 | */ 99 | def vectorCartesianTransform(fields: Array[StructField], numFeatures: Int): MetadataBuilder = { 100 | if (fields.length < 2) { 101 | throw new IllegalArgumentException("the number of cols in the input DataFrame should be no less than 2") 102 | } 103 | 104 | var res = Array[String]() 105 | if (fields.head.metadata.contains(DERIVATION)) { 106 | res = fields.head.metadata.getStringArray(DERIVATION) 107 | } else { 108 | res = createDerivation(numFeatures) 109 | } 110 | 111 | for (i <- 1 until fields.length) { 112 | if (fields(i).metadata.contains(DERIVATION)) { 113 | res = cartesianWithArray(res, fields(i).metadata.getStringArray(DERIVATION)) 114 | } else { 115 | res = cartesianWithArray(res, createDerivation(numFeatures)) 116 | } 117 | } 118 | 119 | val metadata = fields.last.metadata 120 | new MetadataBuilder().withMetadata(metadata).putStringArray(DERIVATION, res) 121 | } 122 | 123 | } 124 |
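To make the DERIVATION bookkeeping above concrete, here is a minimal sketch (not part of the repository) that crosses a two-feature column with itself and prints the resulting derivation names. The driver object name DerivationExample and the field name "features" are illustrative assumptions; vectorCartesianTransform and DERIVATION are the real members defined above.

import org.apache.spark.ml.feature.operator.MetadataTransformUtils
import org.apache.spark.sql.types.{DataTypes, StructField}

// Hypothetical driver: shows how derivation names compose under a self-cartesian.
object DerivationExample {
  def main(args: Array[String]): Unit = {
    // The field carries no DERIVATION metadata yet, so default names f_0, f_1 are created.
    val field = StructField("features", DataTypes.DoubleType)
    val builder = MetadataTransformUtils.vectorCartesianTransform(Array(field, field), numFeatures = 2)
    val names = builder.build().getStringArray(MetadataTransformUtils.DERIVATION)
    println(names.mkString(","))   // (f_0 x f_0),(f_0 x f_1),(f_1 x f_0),(f_1 x f_1)
  }
}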
-------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/feature/operator/SelfCartesian.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package org.apache.spark.ml.feature.operator 19 | 20 | import com.tencent.angel.spark.automl.feature.cross.FeatureCrossOp 21 | import org.apache.spark.annotation.Since 22 | import org.apache.spark.ml.UnaryTransformer 23 | import org.apache.spark.ml.linalg.{Vector, VectorUDT} 24 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 25 | import org.apache.spark.sql.types.DataType 26 | 27 | class SelfCartesian(override val uid: String) 28 | extends UnaryTransformer[Vector, Vector, SelfCartesian] with DefaultParamsWritable { 29 | 30 | def this() = this(Identifiable.randomUID("SelfCartesian")) 31 | 32 | override protected def createTransformFunc: Vector => Vector = FeatureCrossOp.flatCartesian 33 | 34 | override protected def outputDataType: DataType = new VectorUDT() 35 | } 36 | 37 | object SelfCartesian extends DefaultParamsReadable[SelfCartesian] { 38 | 39 | @Since("1.6.0") 40 | override def load(path: String): SelfCartesian = super.load(path) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/feature/operator/VectorReIndexNonZero.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | package org.apache.spark.ml.feature.operator 19 | 20 | import org.apache.spark.SparkException 21 | import org.apache.spark.ml.Transformer 22 | import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} 23 | import org.apache.spark.ml.feature.VectorAssembler 24 | import org.apache.spark.ml.linalg.{SparseVector, Vector, VectorUDT, Vectors} 25 | import org.apache.spark.ml.param.ParamMap 26 | import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} 27 | import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} 28 | import org.apache.spark.sql.functions.{col, struct, udf} 29 | import org.apache.spark.sql.types._ 30 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 31 | 32 | import scala.collection.mutable.ArrayBuilder 33 | import scala.language.postfixOps 34 | 35 | class VectorReIndexNonZero(var featureMap: Map[Int, Int], override val uid: String) 36 | extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { 37 | 38 | def this(featureMap: Map[Int, Int]) = this(featureMap, Identifiable.randomUID("VectorReIndexNonZero")) 39 | 40 | def this() = this(Map[Int, Int](), Identifiable.randomUID("VectorReIndexNonZero")) 41 | 42 | def this(uid: String) = this(Map[Int, Int](), uid) 43 | 44 | /** @group setParam */ 45 | def setInputCol(value: String): this.type = set(inputCol, value) 46 | 47 | /** @group setParam */ 48 | def setOutputCol(value: String): this.type = set(outputCol, value) 49 | 50 | override def transform(dataset: Dataset[_]): DataFrame = { 51 | transformSchema(dataset.schema, logging = true) 52 | val schema = dataset.schema 53 | val metadata = dataset.select($(inputCol)).schema.fields.last.metadata 54 | 55 | val nnzIndices = VectorReIndexNonZero.getNonZero(dataset, $(inputCol)) 56 | 57 | featureMap ++= nnzIndices.zipWithIndex.toMap 58 | println(s"feature map:") 59 | println(featureMap.mkString(",")) 60 | 61 | // Data transformation. 
62 | val filterFunc = udf { r: Row => 63 | val vec = r.get(0).asInstanceOf[Vector] 64 | VectorReIndexNonZero.filter(featureMap, vec) 65 | } 66 | 67 | val args = Array($(inputCol)).map { c => 68 | schema(c).dataType match { 69 | case _: VectorUDT => dataset(c) 70 | } 71 | } 72 | 73 | dataset.select(col("*"), filterFunc(struct(args: _*)).as($(outputCol), metadata)) 74 | .drop($(inputCol)) 75 | } 76 | 77 | override def transformSchema(schema: StructType): StructType = { 78 | val inputColName = $(inputCol) 79 | val outputColName = $(outputCol) 80 | val inputDataType = schema(inputColName).dataType 81 | if (!inputDataType.isInstanceOf[VectorUDT]) { 82 | throw new IllegalArgumentException(s"Data type $inputDataType is not supported.") 83 | } 84 | if (schema.fieldNames.contains(outputColName)) { 85 | throw new IllegalArgumentException(s"Output column $outputColName already exists.") 86 | } 87 | StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true)) 88 | } 89 | 90 | override def copy(extra: ParamMap): VectorReIndexNonZero = defaultCopy(extra) 91 | } 92 | 93 | object VectorReIndexNonZero extends DefaultParamsReadable[VectorReIndexNonZero] { 94 | 95 | override def load(path: String): VectorReIndexNonZero = super.load(path) 96 | 97 | private def getAttrs(dataset: Dataset[_], inputCol: String): Array[Attribute] = { 98 | val schema = dataset.schema 99 | lazy val first = dataset.toDF.first() 100 | val field = schema(inputCol) 101 | val index = schema.fieldIndex(inputCol) 102 | field.dataType match { 103 | case _: VectorUDT => 104 | val group = AttributeGroup.fromStructField(field) 105 | if (group.attributes.isDefined) { 106 | // If attributes are defined, copy them with updated names. 107 | group.attributes.get.zipWithIndex.map { case (attr, i) => 108 | if (attr.name.isDefined) { 109 | // TODO: Define a rigorous naming scheme. 110 | attr.withName(inputCol + "_" + attr.name.get) 111 | } else { 112 | attr.withName(inputCol + "_" + i) 113 | } 114 | } 115 | } else { 116 | // Otherwise, treat all attributes as numeric. If we cannot get the number of attributes 117 | // from metadata, check the first row. 118 | val numAttrs = group.numAttributes.getOrElse(first.getAs[Vector](index).size) 119 | Array.tabulate(numAttrs)(i => NumericAttribute.defaultAttr.withName(inputCol + "_" + i)) 120 | } 121 | case otherType => 122 | throw new SparkException(s"VectorReIndexNonZero does not support the $otherType type") 123 | } 124 | } 125 | 126 | private def getNonZero(dataset: Dataset[_], 127 | column: String): Array[Int] = { 128 | dataset.select(column).rdd.mapPartitions { rows: Iterator[Row] => 129 | val mergeIndices = rows.map { case Row(v: Vector) => 130 | v match { 131 | case sv: SparseVector => 132 | sv.indices 133 | case _ => throw new IllegalArgumentException(s"Input column $column should be SparseVector.") 134 | } 135 | }.reduce(_ union _ distinct) 136 | Iterator(mergeIndices) 137 | }.collect().reduce((a, b) => (a union b).distinct).sortBy(x => x) 138 | } 139 | 140 | private def filter(featureMap: Map[Int, Int], vec: Vector): Vector = { 141 | val indices = ArrayBuilder.make[Int] 142 | val values = ArrayBuilder.make[Double] 143 | vec match { 144 | case vec: Vector => 145 | vec.foreachActive { case (i, v) => 146 | if (v != 0.0) { 147 | indices += featureMap(i) 148 | values += v 149 | } 150 | } 151 | case null => 152 | // TODO: output Double.NaN?
153 | throw new SparkException("Vector to filter cannot be null.") 154 | case o => 155 | throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") 156 | } 157 | Vectors.sparse(featureMap.size, indices.result(), values.result()).compressed 158 | } 159 | 160 | } 161 | 162 | -------------------------------------------------------------------------------- /src/test/scala/com/tencent/angel/spark/automl/BreezeOpTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl 20 | 21 | import com.tencent.angel.spark.automl.tuner.math.BreezeOp._ 22 | import org.junit.Assert._ 23 | import org.scalatest.FunSuite 24 | 25 | class BreezeOpTest extends FunSuite { 26 | 27 | test("test cartesian") { 28 | 29 | val a: Array[Double] = Array(1.0, 2.0) 30 | val b: Array[Double] = Array(3.0, 4.0) 31 | val c: Array[Array[Double]] = cartesian(a, b) 32 | val expected: Array[Array[Double]] = Array(Array(1.0, 3.0), Array(1.0, 4.0), Array(2.0, 3.0), Array(2.0, 4.0)) 33 | 34 | println(c.deep.mkString("\n")) 35 | assertEquals(expected.deep.mkString("\n"), c.deep.mkString("\n")) 36 | } 37 | 38 | test("test_higher_cartesian") { 39 | 40 | val a: Array[Double] = Array(1.0, 2.0) 41 | val b: Array[Double] = Array(3.0, 4.0) 42 | val c: Array[Double] = Array(5.0, 6.0) 43 | val d: Array[Array[Double]] = cartesian(a, b) 44 | val e: Array[Array[Double]] = cartesian(d, c) 45 | val expected = Array(Array(1.0, 3.0, 5.0), 46 | Array(1.0, 3.0, 6.0), 47 | Array(1.0, 4.0, 5.0), 48 | Array(1.0, 4.0, 6.0), 49 | Array(2.0, 3.0, 5.0), 50 | Array(2.0, 3.0, 6.0), 51 | Array(2.0, 4.0, 5.0), 52 | Array(2.0, 4.0, 6.0)) 53 | 54 | println(e.deep.mkString("\n")) 55 | assertEquals(expected.deep.mkString("\n"), e.deep.mkString("\n")) 56 | } 57 | 58 | test("test_cartesian_array") { 59 | 60 | val a: Array[Double] = Array(1.0, 2.0) 61 | val b: Array[Double] = Array(3.0, 4.0) 62 | val c: Array[Double] = Array(5.0, 6.0) 63 | val d: Array[Double] = Array(7.0, 8.0) 64 | val allArray = Array(a, b, c, d) 65 | var tmp: Array[Array[Double]] = cartesian(allArray(0), allArray(1)) 66 | allArray.foreach { case a => 67 | if (a != allArray(0) && a != allArray(1)) { 68 | tmp = cartesian(tmp, a) 69 | } 70 | } 71 | val expected = Array(Array(1.0, 3.0, 5.0, 7.0), 72 | Array(1.0, 3.0, 5.0, 8.0), 73 | Array(1.0, 3.0, 6.0, 7.0), 74 | Array(1.0, 3.0, 6.0, 8.0), 75 | Array(1.0, 4.0, 5.0, 7.0), 76 | Array(1.0, 4.0, 5.0, 8.0), 77 | Array(1.0, 4.0, 6.0, 7.0), 78 | Array(1.0, 4.0, 6.0, 8.0), 79 | Array(2.0, 3.0, 5.0, 7.0), 80 | Array(2.0, 3.0, 5.0, 8.0), 81 | Array(2.0, 3.0, 6.0, 7.0), 82 | Array(2.0, 3.0, 6.0, 8.0), 83 | Array(2.0, 4.0, 5.0, 7.0), 84 | Array(2.0, 4.0, 5.0, 8.0), 85 | Array(2.0, 4.0, 6.0, 
7.0), 86 | Array(2.0, 4.0, 6.0, 8.0)) 87 | 88 | println(tmp.deep.mkString("\n")) 89 | assertEquals(expected.deep.mkString("\n"), tmp.deep.mkString("\n")) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/test/scala/com/tencent/angel/spark/automl/FeatureEngineeringTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl 19 | 20 | import java.io.File 21 | 22 | import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression} 23 | import org.apache.spark.ml.feature.VectorAssembler 24 | import org.apache.spark.ml.feature.operator._ 25 | import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage} 26 | import org.apache.spark.sql.SparkSession 27 | import org.scalatest.{BeforeAndAfter, FunSuite} 28 | 29 | import scala.collection.mutable.ArrayBuffer 30 | 31 | class FeatureEngineeringTest extends FunSuite with BeforeAndAfter { 32 | 33 | var spark: SparkSession = _ 34 | 35 | before { 36 | spark = SparkSession.builder().master("local").getOrCreate() 37 | } 38 | 39 | after { 40 | spark.close() 41 | } 42 | 43 | test("test_iterative_cross") { 44 | 45 | val dim = 123 46 | val incDim = 10 47 | val iter = 2 48 | val modelPath = "tmp/model/feature_engineer" 49 | 50 | val data = spark.read.format("libsvm") 51 | .option("numFeatures", dim) 52 | .load("data/a9a/a9a_123d_train_trans.libsvm") 53 | .persist() 54 | 55 | val featureMap: Map[Int, Int] = Map[Int, Int]() 56 | 57 | val pipelineStages: ArrayBuffer[PipelineStage] = new ArrayBuffer 58 | val usedFields: ArrayBuffer[String] = new ArrayBuffer[String]() 59 | 60 | val cartesianPrefix = "_f" 61 | val selectorPrefix = "_select" 62 | val filterPrefix = "_filter" 63 | var curField = "features" 64 | usedFields += curField 65 | 66 | (0 until iter).foreach { iter => 67 | // add cartesian operator 68 | val cartesian = new VectorCartesian() 69 | .setInputCols(Array("features", curField)) 70 | .setOutputCol(curField + cartesianPrefix) 71 | println(s"Cartesian -> input features and $curField, output ${curField + cartesianPrefix}") 72 | pipelineStages += cartesian 73 | curField += cartesianPrefix 74 | 75 | // add selector operator 76 | val selector = new RandomForestSelector() 77 | .setFeaturesCol(curField) 78 | .setLabelCol("label") 79 | .setOutputCol(curField + selectorPrefix) 80 | .setNumTopFeatures(incDim) 81 | println(s"Selector -> input $curField, output ${curField + selectorPrefix}") 82 | pipelineStages += selector 83 | curField += selectorPrefix 84 | 85 | // add filter operator 86 | val filter = new VectorReIndexNonZero(featureMap) 87 | .setInputCol(curField) 88 | .setOutputCol(curField + filterPrefix) 89 | 
println(s"Filter -> input $curField, output ${curField + filterPrefix}") 90 | pipelineStages += filter 91 | curField += filterPrefix 92 | usedFields += curField 93 | } 94 | 95 | println(s"used fields: ${usedFields.toArray.mkString(",")}") 96 | 97 | val assembler = new VectorAssembler() 98 | .setInputCols(usedFields.toArray) 99 | .setOutputCol("assembled_features") 100 | pipelineStages += assembler 101 | 102 | val pipeline = new Pipeline() 103 | .setStages(pipelineStages.toArray) 104 | 105 | val model = pipeline.fit(data) 106 | deleteRecursively(new File(modelPath)) 107 | model.save(modelPath) 108 | val load_model = PipelineModel.load(modelPath) 109 | 110 | val crossDF = load_model.transform(data).persist() 111 | data.unpersist() 112 | crossDF.show(1) 113 | 114 | usedFields.takeRight(usedFields.length - 1).foreach { field => 115 | println(crossDF.select(field).schema.fields.last.metadata 116 | .getStringArray(MetadataTransformUtils.DERIVATION).length + " cross features in " + usedFields.last) 117 | println(crossDF.select(field).schema.fields.last.metadata 118 | .getStringArray(MetadataTransformUtils.DERIVATION).mkString(",")) 119 | } 120 | 121 | val splitDF = crossDF.randomSplit(Array(0.7, 0.3)) 122 | 123 | val trainDF = splitDF(0).persist() 124 | val testDF = splitDF(1).persist() 125 | 126 | val originalLR = new LogisticRegression() 127 | .setFeaturesCol("features") 128 | .setLabelCol("label") 129 | .setMaxIter(100) 130 | .setRegParam(0.01) 131 | val originalAUC = originalLR.fit(trainDF).evaluate(testDF) 132 | .asInstanceOf[BinaryLogisticRegressionSummary].areaUnderROC 133 | println(s"original features auc = $originalAUC") 134 | 135 | val crossLR = new LogisticRegression() 136 | .setFeaturesCol("assembled_features") 137 | .setLabelCol("label") 138 | .setMaxIter(100) 139 | .setRegParam(0.01) 140 | val crossAUC = crossLR.fit(trainDF).evaluate(testDF) 141 | .asInstanceOf[BinaryLogisticRegressionSummary].areaUnderROC 142 | println(s"cross features auc = $crossAUC") 143 | } 144 | 145 | def deleteRecursively(file: File): Unit = { 146 | if (file.isDirectory) { 147 | file.listFiles.foreach(deleteRecursively) 148 | } 149 | if (file.exists && !file.delete) { 150 | throw new Exception(s"Unable to delete ${file.getAbsolutePath}") 151 | } 152 | } 153 | 154 | } 155 | -------------------------------------------------------------------------------- /src/test/scala/com/tencent/angel/spark/automl/GPModelTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl 20 | 21 | import breeze.linalg.{DenseMatrix, DenseVector} 22 | import breeze.numerics.{cos, pow} 23 | import com.tencent.angel.spark.automl.tuner.kernel.Matern5Iso 24 | import com.tencent.angel.spark.automl.tuner.model.GPModel 25 | import org.scalatest.FunSuite 26 | 27 | class GPModelTest extends FunSuite { 28 | 29 | test("test_linear") { 30 | // Test linear: y=2*x 31 | val X = DenseMatrix((1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)).t 32 | val y = 2.0 * DenseVector(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0) 33 | val z = DenseMatrix((2.5, 4.5, 6.5, 8.5, 10.0, 12.0)).t 34 | val truePredZ = 2.0 * DenseVector(2.5, 4.5, 6.5, 8.5, 10.0, 12.0) 35 | 36 | val covFunc = Matern5Iso() 37 | val initCovParams = DenseVector(1.0, 1.0) 38 | val initNoiseStdDev = 0.01 39 | 40 | val gpModel = GPModel(covFunc, initCovParams, initNoiseStdDev) 41 | gpModel.fit(X, y) 42 | 43 | println("Fitted covariance function params:") 44 | println(gpModel.covParams) 45 | println("Fitted noiseStdDev:") 46 | println(gpModel.noiseStdDev) 47 | println("\n") 48 | 49 | val prediction = gpModel.predict(z) 50 | println("Mean and Var:") 51 | println(prediction) 52 | println("True value:") 53 | println(truePredZ) 54 | } 55 | 56 | test("test_cosine") { 57 | // Test non-linear: y=cos(x)+1 58 | val X = DenseMatrix((1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)).t 59 | val y = cos(DenseVector(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)) + 1.0 60 | val z = DenseMatrix((2.5, 4.5, 6.5, 8.5, 10.0, 12.0)).t 61 | val truePredZ = cos(DenseVector(2.5, 4.5, 6.5, 8.5, 10.0, 12.0)) + 1.0 62 | 63 | val covFunc = Matern5Iso() 64 | val initCovParams = DenseVector(1.0, 1.0) 65 | val initNoiseStdDev = 0.01 66 | 67 | val gpModel = GPModel(covFunc, initCovParams, initNoiseStdDev) 68 | gpModel.fit(X, y) 69 | 70 | println("Fitted covariance function params:") 71 | println(gpModel.covParams) 72 | println("Fitted noiseStdDev:") 73 | println(gpModel.noiseStdDev) 74 | println("\n") 75 | 76 | val prediction = gpModel.predict(z) 77 | println("Mean and Var:") 78 | println(prediction) 79 | println("True value:") 80 | println(truePredZ) 81 | } 82 | 83 | test("testSquare") { 84 | // Test non-linear: y=x^2 85 | val X = DenseMatrix((1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)).t 86 | val y = DenseVector(1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0) 87 | val z = DenseMatrix((2.5, 4.5, 6.5, 8.5, 10.0, 12.0)).t 88 | val truePredZ = pow(z, 2) 89 | 90 | val covFunc = Matern5Iso() 91 | val initCovParams = DenseVector(1.0, 1.0) 92 | val initNoiseStdDev = 0.01 93 | 94 | val gpModel = GPModel(covFunc, initCovParams, initNoiseStdDev) 95 | gpModel.fit(X, y) 96 | 97 | println("Fitted covariance function params:") 98 | println(gpModel.covParams) 99 | println("Fitted noiseStdDev:") 100 | println(gpModel.noiseStdDev) 101 | println("\n") 102 | 103 | val prediction = gpModel.predict(z) 104 | println("Mean and Var:") 105 | println(prediction) 106 | println("True value:") 107 | println(truePredZ) 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/test/scala/com/tencent/angel/spark/automl/MetadataTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl 19 | 20 | import org.apache.spark.ml.Pipeline 21 | import org.apache.spark.ml.feature.VectorAssembler 22 | import org.apache.spark.ml.feature.operator.{MetadataTransformUtils, VectorCartesian} 23 | import org.apache.spark.sql.SparkSession 24 | import org.scalatest.{BeforeAndAfter, FunSuite} 25 | 26 | class MetadataTest extends FunSuite with BeforeAndAfter { 27 | 28 | var spark: SparkSession = _ 29 | 30 | before { 31 | spark = SparkSession.builder().master("local").getOrCreate() 32 | } 33 | 34 | after { 35 | spark.close() 36 | } 37 | 38 | test("test_vector_cartesian") { 39 | val data = spark.read.format("libsvm") 40 | .option("numFeatures", "123") 41 | .load("data/a9a/a9a_123d_train_trans.libsvm") 42 | .persist() 43 | 44 | val cartesian = new VectorCartesian() 45 | .setInputCols(Array("features", "features")) 46 | .setOutputCol("cartesian_features") 47 | 48 | val assembler = new VectorAssembler() 49 | .setInputCols(Array("features", "cartesian_features")) 50 | .setOutputCol("assemble_features") 51 | 52 | val pipeline = new Pipeline() 53 | .setStages(Array(cartesian, assembler)) 54 | 55 | val featureModel = pipeline.fit(data) 56 | val crossDF = featureModel.transform(data) 57 | 58 | crossDF.schema.fields.foreach { field => 59 | println("name: " + field.name) 60 | println("metadata: " + field.metadata.toString()) 61 | } 62 | } 63 | 64 | test("test_three_order_cartesian") { 65 | val data = spark.read.format("libsvm") 66 | .option("numFeatures", 8) 67 | .load("data/abalone/abalone_8d_train.libsvm") 68 | .persist() 69 | 70 | val cartesian = new VectorCartesian() 71 | .setInputCols(Array("features", "features")) 72 | .setOutputCol("f_f") 73 | 74 | val cartesian2 = new VectorCartesian() 75 | .setInputCols(Array("features", "f_f")) 76 | .setOutputCol("f_f_f") 77 | 78 | val pipeline = new Pipeline() 79 | .setStages(Array(cartesian, cartesian2)) 80 | 81 | val crossDF = pipeline.fit(data).transform(data).persist() 82 | 83 | // first cartesian, the number of dimensions is 64 84 | println("first cartesian dimension = " + crossDF.select("f_f").schema.fields.last.metadata.getStringArray(MetadataTransformUtils.DERIVATION).length) 85 | println(crossDF.select("f_f").schema.fields.last.metadata.getStringArray(MetadataTransformUtils.DERIVATION).mkString(",")) 86 | 87 | println() 88 | 89 | // second cartesian, the number of dimensions is 512 90 | println("second cartesian dimension = " + crossDF.select("f_f_f").schema.fields.last.metadata.getStringArray(MetadataTransformUtils.DERIVATION).length) 91 | println(crossDF.select("f_f_f").schema.fields.last.metadata.getStringArray(MetadataTransformUtils.DERIVATION).mkString(",")) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/scala/com/tencent/angel/spark/automl/PipelineTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | 
* Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl 19 | 20 | import com.tencent.angel.spark.automl.feature.preprocess.{HashingTFWrapper, IDFWrapper, TokenizerWrapper} 21 | import com.tencent.angel.spark.automl.feature.{PipelineBuilder, PipelineWrapper, TransformerWrapper} 22 | import org.apache.spark.sql.SparkSession 23 | import org.scalatest.{BeforeAndAfter, FunSuite} 24 | 25 | class PipelineTest extends FunSuite with BeforeAndAfter { 26 | 27 | var spark: SparkSession = _ 28 | 29 | before { 30 | spark = SparkSession.builder().master("local").getOrCreate() 31 | } 32 | 33 | after { 34 | spark.close() 35 | } 36 | 37 | test("test_tfidf") { 38 | val sentenceData = spark.createDataFrame(Seq( 39 | (0.0, "Hi I heard about Spark"), 40 | (0.0, "I wish Java could use case classes"), 41 | (1.0, "Logistic regression models are neat") 42 | )).toDF("label", "sentence") 43 | 44 | val pipelineWrapper = new PipelineWrapper() 45 | 46 | val transformers = Array[TransformerWrapper]( 47 | new TokenizerWrapper(), 48 | new HashingTFWrapper(20), 49 | new IDFWrapper() 50 | ) 51 | 52 | val stages = PipelineBuilder.build(transformers) 53 | 54 | transformers.foreach { transformer => 55 | val inputCols = transformer.getInputCols 56 | val outputCols = transformer.getOutputCols 57 | inputCols.foreach(print) 58 | print(" ") 59 | outputCols.foreach(print) 60 | println() 61 | } 62 | 63 | pipelineWrapper.setStages(stages) 64 | 65 | val model = pipelineWrapper.fit(sentenceData) 66 | 67 | val outputDF = model.transform(sentenceData) 68 | outputDF.select("outIDF").show() 69 | outputDF.select("outIDF").foreach { row => 70 | println(row.get(0).getClass.getSimpleName) 71 | val arr = row.get(0) 72 | println(arr.toString) 73 | } 74 | outputDF.rdd.map(row => row.toString()).repartition(1) 75 | .saveAsTextFile("tmp/output/tfidf") 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/test/scala/com/tencent/angel/spark/automl/SquareDistTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. 
See the License for the specific language governing permissions and limitations under 14 | * the License. 15 | * 16 | */ 17 | 18 | 19 | package com.tencent.angel.spark.automl 20 | 21 | import breeze.linalg.{DenseMatrix, DenseVector} 22 | import com.tencent.angel.spark.automl.tuner.math.SquareDist 23 | import org.junit.Assert._ 24 | import org.scalatest.FunSuite 25 | 26 | class SquareDistTest extends FunSuite { 27 | 28 | test("test_XX_1D") { 29 | 30 | val x = DenseVector(1.0, 2.0, 3.0).toDenseMatrix.t 31 | val expected = DenseMatrix((0.0, 1.0, 4.0), (1.0, 0.0, 1.0), (4.0, 1.0, 0.0)) 32 | assertEquals(expected, SquareDist(x, x)) 33 | } 34 | 35 | test("test_XX_2D") { 36 | 37 | val x = DenseMatrix((1.0, 2.0, 3.0), (4.0, 5.0, 6.0)).t 38 | val expected = DenseMatrix((0.0, 2.0, 8.0), (2.0, 0.0, 2.0), (8.0, 2.0, 0.0)) 39 | assertEquals(expected, SquareDist(x, x)) 40 | } 41 | 42 | test("test_XY_1D") { 43 | 44 | val x1 = DenseVector(1.0, 2.0, 3.0).toDenseMatrix.t 45 | val x2 = DenseVector(4.0, 5.0).toDenseMatrix.t 46 | 47 | val expected = DenseMatrix((9.0, 16.0), (4.0, 9.0), (1.0, 4.0)) 48 | assertEquals(expected, SquareDist(x1, x2)) 49 | } 50 | 51 | test("test_XY_2D") { 52 | 53 | val x1 = DenseMatrix((1.0, 2.0, 3.0), (4.0, 5.0, 6.0)).t 54 | val x2 = DenseMatrix((7.0, 8.0), (9.0, 10.0)).t 55 | 56 | val expected = DenseMatrix((61.0, 85.0), (41.0, 61.0), (25.0, 41.0)) 57 | assertEquals(expected, SquareDist(x1, x2)) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/test/scala/com/tencent/angel/spark/automl/TunerTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Tencent is pleased to support the open source community by making Angel available. 3 | * 4 | * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 7 | * compliance with the License. You may obtain a copy of the License at 8 | * 9 | * https://opensource.org/licenses/Apache-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software distributed under the License 12 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 13 | * or implied. See the License for the specific language governing permissions and limitations under 14 | * the License. 
15 | * 16 | */ 17 | 18 | package com.tencent.angel.spark.automl 19 | 20 | import com.tencent.angel.spark.automl.tuner.config.Configuration 21 | import com.tencent.angel.spark.automl.tuner.parameter.ParamSpace 22 | import com.tencent.angel.spark.automl.tuner.solver.Solver 23 | import com.tencent.angel.spark.automl.tuner.trail.{TestTrail, Trail} 24 | import org.apache.spark.ml.linalg.Vector 25 | import org.scalatest.FunSuite 26 | 27 | class TunerTest extends FunSuite { 28 | 29 | test("test_random") { 30 | val param1 = ParamSpace.fromConfigString("param1", "{2.0,3.0,4.0,5.0,6.0}") 31 | val param2 = ParamSpace.fromConfigString("param2", "{3:10:1}") 32 | val solver: Solver = Solver(Array(param1, param2), true, surrogate = "Random") 33 | val trail: Trail = new TestTrail() 34 | (0 until 10).foreach { iter => 35 | println(s"------iteration $iter starts------") 36 | val configs: Array[Configuration] = solver.suggest() 37 | val results: Array[Double] = trail.evaluate(configs) 38 | solver.feed(configs, results) 39 | } 40 | val result: (Vector, Double) = solver.optimal 41 | solver.stop 42 | println(s"Best configuration ${result._1.toArray.mkString(",")}, best performance: ${result._2}") 43 | } 44 | 45 | test("test_grid") { 46 | val param1 = ParamSpace.fromConfigString("param1", "[1,10]") 47 | val param2 = ParamSpace.fromConfigString("param2", "[-5:5:10]") 48 | val solver: Solver = Solver(Array(param1, param2), true, surrogate = "Grid") 49 | val trail: Trail = new TestTrail() 50 | (0 until 10).foreach { iter => 51 | println(s"------iteration $iter starts------") 52 | val configs: Array[Configuration] = solver.suggest() 53 | val results: Array[Double] = trail.evaluate(configs) 54 | solver.feed(configs, results) 55 | } 56 | val result: (Vector, Double) = solver.optimal 57 | solver.stop 58 | println(s"Best configuration ${result._1.toArray.mkString(",")}, best performance: ${result._2}") 59 | } 60 | 61 | test("test_gp") { 62 | val param1 = ParamSpace.fromConfigString("param1", "[1,10]") 63 | val param2 = ParamSpace.fromConfigString("param2", "[-5:5:10]") 64 | val param3 = ParamSpace.fromConfigString("param3", "{0.0,1.0,3.0,5.0}") 65 | val param4 = ParamSpace.fromConfigString("param4", "{-5:5:1}") 66 | val solver: Solver = Solver(Array(param1, param2, param3, param4), true, surrogate = "GaussianProcess") 67 | val trail: Trail = new TestTrail() 68 | (0 until 10).foreach { iter => 69 | println(s"------iteration $iter starts------") 70 | val configs: Array[Configuration] = solver.suggest 71 | val results: Array[Double] = trail.evaluate(configs) 72 | solver.feed(configs, results) 73 | } 74 | val result: (Vector, Double) = solver.optimal 75 | solver.stop 76 | println(s"Best configuration ${result._1.toArray.mkString(",")}, best performance: ${result._2}") 77 | } 78 | 79 | test("test_rf") { 80 | val param1 = ParamSpace.fromConfigString("param1", "[1,10]") 81 | val param2 = ParamSpace.fromConfigString("param2", "[-5:5:10]") 82 | val param3 = ParamSpace.fromConfigString("param3", "{0.0,1.0,3.0,5.0}") 83 | val param4 = ParamSpace.fromConfigString("param4", "{-5:5:1}") 84 | val solver: Solver = Solver(Array(param1, param2, param3, param4), true, "RandomForest") 85 | val trail: Trail = new TestTrail() 86 | (0 until 10).foreach { iter => 87 | println(s"------iteration $iter starts------") 88 | val configs: Array[Configuration] = solver.suggest 89 | val results: Array[Double] = trail.evaluate(configs) 90 | solver.feed(configs, results) 91 | } 92 | val result: (Vector, Double) = solver.optimal 93 | solver.stop 94 | 
println(s"Best configuration ${result._1.toArray.mkString(",")}, best performance: ${result._2}") 95 | } 96 | } 97 | --------------------------------------------------------------------------------