├── .gitignore ├── CONTRIBUTORS.md ├── EC2.md ├── LICENSE ├── Makefile ├── README.md ├── RELEASE.md ├── bin ├── keystone-ec2.sh ├── run-main.sh └── run-pipeline.sh ├── build.sbt ├── examples ├── images │ ├── cifar_random_patch.sh │ ├── mnist_random_fft.sh │ └── voc_sift_fisher.sh └── text │ └── newsgroups_ngrams_tfidf.sh ├── lib └── libImageFeatures.dylib ├── project ├── build.properties └── plugins.sbt ├── sbt └── sbt ├── scripts ├── constantEstimator.R └── solver-comparisons-final.csv └── src ├── main ├── cpp │ ├── EncEval.cxx │ ├── EncEval.h │ ├── VLFeat.cxx │ └── VLFeat.h ├── resources │ └── log4j.properties └── scala │ └── keystoneml │ ├── evaluation │ ├── AugmentedExamplesEvaluator.scala │ ├── BinaryClassifierEvaluator.scala │ ├── Evaluator.scala │ ├── MeanAveragePrecisionEvaluator.scala │ └── MulticlassClassifierEvaluator.scala │ ├── loaders │ ├── AmazonReviewsDataLoader.scala │ ├── CifarLoader.scala │ ├── CsvDataLoader.scala │ ├── ImageLoaderUtils.scala │ ├── ImageNetLoader.scala │ ├── LabeledData.scala │ ├── NewsgroupsDataLoader.scala │ ├── TimitFeaturesDataLoader.scala │ └── VOCLoader.scala │ ├── nodes │ ├── images │ │ ├── CenterCornerPatcher.scala │ │ ├── Convolver.scala │ │ ├── Cropper.scala │ │ ├── DaisyExtractor.scala │ │ ├── FisherVector.scala │ │ ├── GrayScaler.scala │ │ ├── HogExtractor.scala │ │ ├── ImageVectorizer.scala │ │ ├── LCSExtractor.scala │ │ ├── LabeledImageExtractors.scala │ │ ├── PixelScaler.scala │ │ ├── Pooler.scala │ │ ├── RandomImageTransformer.scala │ │ ├── RandomPatcher.scala │ │ ├── SIFTExtractor.scala │ │ ├── SymmetricRectifier.scala │ │ ├── Windower.scala │ │ └── external │ │ │ ├── FisherVector.scala │ │ │ └── SIFTExtractor.scala │ ├── learning │ │ ├── ApproximatePCA.scala │ │ ├── BlockLinearMapper.scala │ │ ├── BlockWeightedLeastSquares.scala │ │ ├── CostModel.scala │ │ ├── DistributedPCA.scala │ │ ├── GaussianMixtureModel.scala │ │ ├── GaussianMixtureModelEstimator.scala │ │ ├── Gradient.scala │ │ ├── KMeansPlusPlus.scala │ │ ├── KernelBlockLinearMapper.scala │ │ ├── KernelGenerator.scala │ │ ├── KernelMatrix.scala │ │ ├── KernelRidgeRegression.scala │ │ ├── LBFGS.scala │ │ ├── LeastSquaresEstimator.scala │ │ ├── LinearDiscriminantAnalysis.scala │ │ ├── LinearMapper.scala │ │ ├── LocalLeastSquaresEstimator.scala │ │ ├── LogisticRegressionModel.scala │ │ ├── NaiveBayesModel.scala │ │ ├── PCA.scala │ │ ├── PerClassWeightedLeastSquares.scala │ │ ├── SparseLinearMapper.scala │ │ ├── ZCAWhitener.scala │ │ ├── external │ │ │ └── GaussianMixtureModelEstimator.scala │ │ └── internal │ │ │ └── ReWeightedLeastSquares.scala │ ├── nlp │ │ ├── CoreNLPFeatureExtractor.scala │ │ ├── HashingTF.scala │ │ ├── NGramsHashingTF.scala │ │ ├── StringUtils.scala │ │ ├── StupidBackoff.scala │ │ ├── WordFrequencyEncoder.scala │ │ ├── indexers.scala │ │ └── ngrams.scala │ ├── stats │ │ ├── CosineRandomFeatures.scala │ │ ├── LinearRectifier.scala │ │ ├── NormalizeRows.scala │ │ ├── PaddedFFT.scala │ │ ├── RandomSignNode.scala │ │ ├── Sampling.scala │ │ ├── SignedHellingerMapper.scala │ │ ├── StandardScaler.scala │ │ └── TermFrequency.scala │ └── util │ │ ├── AllSparseFeatures.scala │ │ ├── Cacher.scala │ │ ├── ClassLabelIndicators.scala │ │ ├── CommonSparseFeatures.scala │ │ ├── Densify.scala │ │ ├── FloatToDouble.scala │ │ ├── Identity.scala │ │ ├── MatrixVectorizer.scala │ │ ├── MaxClassifier.scala │ │ ├── Shuffler.scala │ │ ├── SparseFeatureVectorizer.scala │ │ ├── Sparsify.scala │ │ ├── TopKClassifier.scala │ │ ├── VectorCombiner.scala │ │ └── VectorSplitter.scala │ ├── 
pipelines │ ├── FunctionNode.scala │ ├── Logging.scala │ ├── images │ │ ├── cifar │ │ │ ├── LinearPixels.scala │ │ │ ├── RandomCifar.scala │ │ │ ├── RandomPatchCifar.scala │ │ │ ├── RandomPatchCifarAugmented.scala │ │ │ ├── RandomPatchCifarAugmentedKernel.scala │ │ │ └── RandomPatchCifarKernel.scala │ │ ├── imagenet │ │ │ └── ImageNetSiftLcsFV.scala │ │ ├── mnist │ │ │ └── MnistRandomFFT.scala │ │ └── voc │ │ │ └── VOCSIFTFisher.scala │ ├── nlp │ │ └── StupidBackoffPipeline.scala │ ├── speech │ │ └── TimitPipeline.scala │ └── text │ │ ├── AmazonReviewsPipeline.scala │ │ └── NewsgroupsPipeline.scala │ ├── utils │ ├── MLlibUtils.scala │ ├── MatrixUtils.scala │ ├── Stats.scala │ ├── external │ │ ├── EncEval.scala │ │ └── VLFeat.scala │ └── images │ │ ├── Image.scala │ │ ├── ImageConversions.scala │ │ └── ImageUtils.scala │ └── workflow │ ├── AnalysisUtils.scala │ ├── AutoCacheRule.scala │ ├── ChainUtils.scala │ ├── Chainable.scala │ ├── DefaultOptimizer.scala │ ├── EquivalentNodeMergeRule.scala │ ├── Estimator.scala │ ├── Expression.scala │ ├── ExtractSaveablePrefixes.scala │ ├── FittedPipeline.scala │ ├── GatherTransformerOperator.scala │ ├── Graph.scala │ ├── GraphExecutor.scala │ ├── GraphId.scala │ ├── Identity.scala │ ├── LabelEstimator.scala │ ├── NodeOptimizationRule.scala │ ├── Operator.scala │ ├── OptimizableNodes.scala │ ├── Pipeline.scala │ ├── PipelineDataset.scala │ ├── PipelineDatum.scala │ ├── PipelineEnv.scala │ ├── PipelineResult.scala │ ├── Prefix.scala │ ├── Rule.scala │ ├── RuleExecutor.scala │ ├── SavedStateLoadRule.scala │ ├── SparkUtilWrapper.scala │ ├── Transformer.scala │ ├── TransformerGraph.scala │ ├── UnusedBranchRemovalRule.scala │ ├── WeightedNode.scala │ ├── WeightedOperator.scala │ └── WorkflowUtils.scala └── test ├── python └── images │ └── pyconv.py ├── resources ├── aMat-1class.csv ├── aMat.csv ├── aMatShuffled.csv ├── bMat-1class.csv ├── bMat.csv ├── bMatShuffled.csv ├── gmm_data.txt ├── images │ ├── 000012.jpg │ ├── convolved.gantrycrane.csv │ ├── convolved.gantrycrane.png │ ├── feats.csv │ ├── feats128.csv │ ├── gantrycrane.png │ ├── imagenet-test-labels │ ├── imagenet │ │ └── n15075141.tar │ ├── voc │ │ └── voctest.tar │ ├── voc_codebook │ │ ├── means.csv │ │ ├── priors │ │ └── variances.csv │ └── voclabels.csv └── iris.data └── scala └── keystoneml ├── evaluation ├── BinaryClassifierEvaluatorSuite.scala ├── MeanAveragePrecisionSuite.scala └── MulticlassClassifierEvaluatorSuite.scala ├── loaders ├── ImageNetLoaderSuite.scala └── VOCLoaderSuite.scala ├── nodes ├── images │ ├── CenterCornerPatcherSuite.scala │ ├── ConvolverSuite.scala │ ├── DaisyExtractorSuite.scala │ ├── HogExtractorSuite.scala │ ├── ImageBenchMarkSuite.scala │ ├── LCSExtractorSuite.scala │ ├── PoolingSuite.scala │ ├── RandomPatcherSuite.scala │ ├── SIFTExtractorSuite.scala │ └── WindowingSuite.scala ├── learning │ ├── BlockLinearMapperSuite.scala │ ├── BlockWeightedLeastSquaresSuite.scala │ ├── GaussianMixtureModelSuite.scala │ ├── KMeansPlusPlusSuite.scala │ ├── KernelModelSuite.scala │ ├── LBFGSSuite.scala │ ├── LeastSquaresEstimatorSuite.scala │ ├── LinearDiscriminantAnalysisSuite.scala │ ├── LinearMapperSuite.scala │ ├── LogisticRegressionModelSuite.scala │ ├── NaiveBayesModelSuite.scala │ ├── PCASuite.scala │ └── ZCAWhiteningSuite.scala ├── misc │ ├── SparseFeatureVectorizerSuite.scala │ └── TermFrequencySuite.scala ├── nlp │ ├── CoreNLPFeatureExtractorSuite.scala │ ├── HashingTFSuite.scala │ ├── NGramIndexerSuite.scala │ ├── NGramSuite.scala │ ├── NGramsHashingTFSuite.scala │ ├── 
StringUtilsSuite.scala │ └── WordFrequencyEncoderSuite.scala ├── stats │ ├── CosineRandomFeaturesSuite.scala │ ├── LinearRectifierSuite.scala │ ├── PaddedFFTSuite.scala │ ├── RandomSignNodeSuite.scala │ ├── SignedHellingerMapperSuite.scala │ └── StandardScalerSuite.scala └── util │ ├── ClassLabelIndicatorsSuite.scala │ ├── MaxClassifierSuite.scala │ ├── TopKClassifierSuite.scala │ └── VectorSplitterSuite.scala ├── pipelines └── nlp │ └── StupidBackoffSuite.scala ├── utils ├── ImageUtilsSuite.scala ├── MLlibUtilsSuite.scala ├── MatrixUtilsSuite.scala ├── TestUtils.scala ├── external │ ├── EncEvalSuite.scala │ └── VLFeatSuite.scala └── images │ └── ImageSuite.scala └── workflow ├── AnalysisUtilsSuite.scala ├── AutocCacheRuleSuite.scala ├── EstimatorSuite.scala ├── GraphSuite.scala ├── LabelEstimatorSuite.scala ├── NodeOptimizationRuleSuite.scala ├── OperatorSuite.scala ├── PipelineContext.scala └── PipelineSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | dist/* 6 | target/ 7 | lib_managed/ 8 | src_managed/ 9 | project/boot/ 10 | project/plugins/project/ 11 | 12 | # Scala-IDE specific 13 | .idea* 14 | .scala_dependencies 15 | 16 | # Jars 17 | *.jar 18 | 19 | # vim tmps 20 | .*sw* 21 | 22 | # Jekyll stuff 23 | _site/ 24 | 25 | # Data for running examples. 26 | example_data/ 27 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | Contributors 2 | ============ 3 | 4 | KeystoneML has been developed by the following people (alphabetically): 5 | 6 | * Daniel Bruckner 7 | * Michael J. Franklin 8 | * Nicolas Garneau 9 | * Gylfi Gudmundsson 10 | * Eric Jonas 11 | * Tomer Kaftan 12 | * Daniel Langkilde 13 | * Henry Milner 14 | * Benjamin Recht 15 | * Vaishaal Shankar 16 | * Evan R. Sparks 17 | * Stephen Tu 18 | * Shivaram Venkataraman 19 | * Zongheng Yang 20 | -------------------------------------------------------------------------------- /EC2.md: -------------------------------------------------------------------------------- 1 | # Running KeystoneML on EC2 2 | 3 | To run KeystoneML on EC2 you can use the 4 | [spark-ec2](http://spark.apache.org/docs/latest/ec2-scripts.html) scripts. 5 | 6 | ## Getting spark-ec2 7 | 8 | As the KeystoneML scripts require a recent version of spark-ec2, it is 9 | recommended that you clone the spark-ec2 master branch for this. You can do this with 10 | ``` 11 | git clone https://github.com/amplab/spark-ec2.git 12 | ``` 13 | 14 | ## Launching a Cluster 15 | 16 | You can now use the `bin/keystone-ec2.sh` to launch a cluster with KeystoneML pre-installed. 17 | To do that you can run a command which looks like 18 | 19 | ``` 20 | SPARK_EC2_DIR= ./bin/keystone-ec2.sh \ 21 | -s 4 \ 22 | -t r3.4xlarge \ 23 | -i \ 24 | -k \ 25 | launch keystone-test-cluster 26 | ``` 27 | 28 | The above command launches 4 slaves and 1 master machine of type r3.4xlarge. 29 | Note that you can pass in any spark-ec2 options (like spot-prices etc.) to this script. 30 | 31 | ## Running KeystoneML on the cluster 32 | 33 | Once the cluster launch finishes you can login to the master node and the KeystoneML 34 | repository should be present in `/root/keystone`. 
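As a sketch of what a session might look like (the `login` action comes from spark-ec2 and is forwarded by `bin/keystone-ec2.sh`; the key file, key pair, and data locations below are placeholders, and the pipeline invocation simply mirrors the MNIST example from the README, assuming the MNIST data has been fetched onto the master):

```
# Log in to the master node (assumes spark-ec2's standard `login` action).
SPARK_EC2_DIR=<path to spark-ec2> ./bin/keystone-ec2.sh \
  -i <key-file> \
  -k <key-pair> \
  login keystone-test-cluster

# On the master, run a pipeline from the pre-installed checkout.
cd /root/keystone
KEYSTONE_MEM=4g ./bin/run-pipeline.sh \
  keystoneml.pipelines.images.mnist.MnistRandomFFT \
  --trainLocation ./train-mnist-dense-with-labels.data \
  --testLocation ./test-mnist-dense-with-labels.data \
  --numFFTs 4 \
  --blockSize 2048
```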
35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KeystoneML 2 | The biggest, baddest pipelines around. 3 | 4 | 5 | # Example pipeline 6 | 7 | ### Build the KeystoneML project 8 | 9 | ``` 10 | ./sbt/sbt assembly 11 | make # This builds the native libraries used in KeystoneML 12 | ``` 13 | 14 | ### Example: MNIST pipeline 15 | 16 | ``` 17 | # Get the data from S3 18 | wget http://mnist-data.s3.amazonaws.com/train-mnist-dense-with-labels.data 19 | wget http://mnist-data.s3.amazonaws.com/test-mnist-dense-with-labels.data 20 | 21 | KEYSTONE_MEM=4g ./bin/run-pipeline.sh \ 22 | keystoneml.pipelines.images.mnist.MnistRandomFFT \ 23 | --trainLocation ./train-mnist-dense-with-labels.data \ 24 | --testLocation ./test-mnist-dense-with-labels.data \ 25 | --numFFTs 4 \ 26 | --blockSize 2048 27 | ``` 28 | 29 | ## Running with spark-submit 30 | 31 | To run KeystoneML pipelines on large datasets you will need a [Spark](http://spark.apache.org) cluster. 32 | KeystoneML pipelines run on the cluster using 33 | [spark-submit](http://spark.apache.org/docs/latest/submitting-applications.html). 34 | 35 | You need to export `SPARK_HOME` to run KeystoneML using spark-submit. Having done 36 | that you can similarly use run-pipeline.sh to launch your pipeline. 37 | 38 | ``` 39 | export SPARK_HOME=~/spark-1.3.1-bin-cdh4 # should match the version keystone is built with 40 | KEYSTONE_MEM=4g ./bin/run-pipeline.sh \ 41 | keystoneml.pipelines.images.mnist.MnistRandomFFT \ 42 | --trainLocation ./train-mnist-dense-with-labels.data \ 43 | --testLocation ./test-mnist-dense-with-labels.data \ 44 | --numFFTs 4 \ 45 | --blockSize 2048 46 | ``` 47 | -------------------------------------------------------------------------------- /bin/keystone-ec2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$SPARK_EC2_DIR" ] || [ ! -f "$SPARK_EC2_DIR"/spark-ec2 ]; then 4 | echo "SPARK_EC2_DIR is not set correctly, please set SPARK_EC2_DIR to be /ec2" 5 | exit 1 6 | fi 7 | 8 | $SPARK_EC2_DIR/spark-ec2 \ 9 | --hadoop-major-version=2 \ 10 | --spark-version=1.3.1 \ 11 | --spark-ec2-git-repo=https://github.com/shivaram/spark-ec2 \ 12 | --spark-ec2-git-branch=keystone \ 13 | --copy-aws-credentials \ 14 | $@ 15 | -------------------------------------------------------------------------------- /bin/run-main.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | SCALA_VERSION=2.10 21 | 22 | # Figure out where the Scala framework is installed 23 | FWDIR="$(cd `dirname $0`/..; pwd)" 24 | 25 | if [ -z "$1" ]; then 26 | echo "Usage: run-main.sh []" >&2 27 | exit 1 28 | fi 29 | 30 | ASSEMBLY_JAR="" 31 | if [ -e "$FWDIR"/target/scala-$SCALA_VERSION/keystoneml-assembly-*.jar ]; then 32 | export ASSEMBLY_JAR=`ls "$FWDIR"/target/scala-$SCALA_VERSION/keystoneml-assembly*.jar` 33 | fi 34 | 35 | if [[ -z $ASSEMBLY_JAR ]]; then 36 | echo "Failed to find assembly JAR in $FWDIR/target" >&2 37 | echo "You need to run sbt/sbt assembly before running this program" >&2 38 | exit 1 39 | fi 40 | CLASSPATH="$ASSEMBLY_JAR" 41 | 42 | # Find java binary 43 | if [ -n "${JAVA_HOME}" ]; then 44 | RUNNER="${JAVA_HOME}/bin/java" 45 | else 46 | if [ `command -v java` ]; then 47 | RUNNER="java" 48 | else 49 | echo "JAVA_HOME is not set" >&2 50 | exit 1 51 | fi 52 | fi 53 | 54 | # Set KEYSTONE_MEM if it isn't already set since we also use it for this process 55 | KEYSTONE_MEM=${KEYSTONE_MEM:-1g} 56 | export KEYSTONE_MEM 57 | 58 | JAVA_OPTS="$JAVA_OPTS -Xms$KEYSTONE_MEM -Xmx$KEYSTONE_MEM ""$SPARK_JAVA_OPTS" 59 | 60 | exec "$RUNNER" -Djava.library.path=$FWDIR/lib -cp "$CLASSPATH" $JAVA_OPTS "$@" 61 | -------------------------------------------------------------------------------- /bin/run-pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Figure out where we are. 4 | FWDIR="$(cd `dirname $0`; pwd)" 5 | 6 | CLASS=$1 7 | shift 8 | 9 | # Set OMP_NUM_THREADS on workers and driver to something appropriate. 10 | # This is due to OpenBLAS not handling large numbers of cores very well. 11 | # See: https://github.com/amplab/keystone/issues/198 for more information. 12 | 13 | if [[ -z "$OMP_NUM_THREADS" ]]; then 14 | # Determine number of cores. We assume that hyperthreading is enabled and thus divide cores by two. 15 | unamestr=`uname` 16 | if [[ $unamestr == "Darwin" ]]; then 17 | CORES=$((`sysctl -n hw.ncpu`/2)) 18 | elif [[ $unamestr == "Linux" ]]; then 19 | CORES=$((`cat /proc/cpuinfo | grep processor | wc -l`/2)) 20 | else # Windows,BSD? Do the safest thing. 21 | CORES=1 22 | fi 23 | 24 | # Set OMP_NUM_THREADS to MIN(32,CORES) to avoid stack smashing issues. 25 | export OMP_NUM_THREADS=$(($CORES>32?32:$CORES)) 26 | else 27 | if [[ $OMP_NUM_THREADS -gt 32 ]]; then 28 | echo 'Warning: setting OMP_NUM_THREADS > 32 may cause instability.' 
29 | fi 30 | fi 31 | 32 | EXECUTOR_OMP_NUM_THREADS=${EXECUTOR_OMP_NUM_THREADS:-1} 33 | 34 | if [[ -z "$SPARK_HOME" ]]; then 35 | echo "SPARK_HOME is not set, running pipeline locally" 36 | $FWDIR/run-main.sh $CLASS "$@" 37 | else 38 | # TODO: Figure out a way to pass in either a conf file / flags to spark-submit 39 | KEYSTONE_MEM=${KEYSTONE_MEM:-1g} 40 | export KEYSTONE_MEM 41 | 42 | # Set some commonly used config flags on the cluster 43 | $SPARK_HOME/bin/spark-submit \ 44 | --deploy-mode client \ 45 | --class $CLASS \ 46 | --driver-class-path $FWDIR/../target/scala-2.10/keystoneml-assembly-0.3.0-SNAPSHOT.jar \ 47 | --driver-library-path $FWDIR/../lib \ 48 | --conf spark.executor.extraLibraryPath=$FWDIR/../lib \ 49 | --conf spark.executor.extraClassPath=$FWDIR/../target/scala-2.10/keystoneml-assembly-0.3.0-SNAPSHOT.jar \ 50 | --conf spark.executorEnv.OMP_NUM_THREADS=$EXECUTOR_OMP_NUM_THREADS \ 51 | --driver-memory $KEYSTONE_MEM \ 52 | target/scala-2.10/keystoneml-assembly-0.3.0-SNAPSHOT.jar \ 53 | "$@" 54 | fi 55 | -------------------------------------------------------------------------------- /examples/images/cifar_random_patch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #Set environment variables 5 | : ${KEYSTONE_MEM:=4g} 6 | export KEYSTONE_MEM 7 | 8 | KEYSTONE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/../.. 9 | : ${EXAMPLE_DATA_DIR:=$KEYSTONE_DIR/example_data} 10 | 11 | if [ ! -d $EXAMPLE_DATA_DIR ]; then 12 | mkdir $EXAMPLE_DATA_DIR 13 | fi 14 | 15 | #Download data if necessary. 16 | if [[ ! ( -f $EXAMPLE_DATA_DIR/cifar_train.bin && -f $EXAMPLE_DATA_DIR/cifar_test.bin ) ]]; then 17 | #Get the data 18 | wget -O $TMPDIR/cifar-10-binary.tar.gz http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz 19 | 20 | #Decompress it 21 | tar zxvf $TMPDIR/cifar-10-binary.tar.gz -C $TMPDIR 22 | cat $TMPDIR/cifar-10-batches-bin/data_batch*.bin > $EXAMPLE_DATA_DIR/cifar_train.bin 23 | mv $TMPDIR/cifar-10-batches-bin/test_batch.bin $EXAMPLE_DATA_DIR/cifar_test.bin 24 | 25 | #Clean up. 26 | rm -rf $TMPDIR/cifar-10-batches-bin 27 | rm -rf $TMPDIR/cifar-10-binary.tar.gz 28 | fi 29 | 30 | #Run the pipeline 31 | $KEYSTONE_DIR/bin/run-pipeline.sh \ 32 | keystoneml.pipelines.images.cifar.RandomPatchCifar \ 33 | --trainLocation $EXAMPLE_DATA_DIR/cifar_train.bin \ 34 | --testLocation $EXAMPLE_DATA_DIR/cifar_test.bin \ 35 | --numFilters 10000 \ 36 | --lambda 3000 \ 37 | --whiteningEpsilon 1e-5 38 | -------------------------------------------------------------------------------- /examples/images/mnist_random_fft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #Set environment variables 5 | : ${KEYSTONE_MEM:=4g} 6 | export KEYSTONE_MEM 7 | 8 | : ${NUM_FFTS:=4} 9 | : ${BLOCK_SIZE:=2048} 10 | 11 | KEYSTONE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/../.. 12 | : ${EXAMPLE_DATA_DIR:=$KEYSTONE_DIR/example_data} 13 | 14 | if [ ! -d $EXAMPLE_DATA_DIR ]; then 15 | mkdir $EXAMPLE_DATA_DIR 16 | fi 17 | 18 | # Get the data from S3 19 | if [ ! -f $EXAMPLE_DATA_DIR/train-mnist-dense-with-labels.data ]; then 20 | wget -O $EXAMPLE_DATA_DIR/train-mnist-dense-with-labels.data \ 21 | http://mnist-data.s3.amazonaws.com/train-mnist-dense-with-labels.data 22 | fi 23 | 24 | if [ ! 
-f $EXAMPLE_DATA_DIR/test-mnist-dense-with-labels.data ]; then 25 | wget -O $EXAMPLE_DATA_DIR/test-mnist-dense-with-labels.data \ 26 | http://mnist-data.s3.amazonaws.com/test-mnist-dense-with-labels.data 27 | fi 28 | 29 | $KEYSTONE_DIR/bin/run-pipeline.sh \ 30 | keystoneml.pipelines.images.mnist.MnistRandomFFT \ 31 | --trainLocation $EXAMPLE_DATA_DIR/train-mnist-dense-with-labels.data \ 32 | --testLocation $EXAMPLE_DATA_DIR/test-mnist-dense-with-labels.data \ 33 | --numFFTs $NUM_FFTS \ 34 | --blockSize $BLOCK_SIZE 35 | -------------------------------------------------------------------------------- /examples/images/voc_sift_fisher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #Set environment variables 5 | : ${KEYSTONE_MEM:=12g} 6 | export KEYSTONE_MEM 7 | 8 | KEYSTONE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/../.. 9 | : ${EXAMPLE_DATA_DIR:=$KEYSTONE_DIR/example_data} 10 | 11 | 12 | #Get the data and copy to HDFS if necessary. 13 | if [ ! -f $EXAMPLE_DATA_DIR/VOCtrainval_06-Nov-2007.tar ]; then 14 | wget -O $EXAMPLE_DATA_DIR/VOCtrainval_06-Nov-2007.tar http://s3-us-west-2.amazonaws.com/voc-data/VOCtrainval_06-Nov-2007.tar 15 | fi 16 | 17 | if [ ! -f $EXAMPLE_DATA_DIR/VOCtest_06-Nov-2007.tar ]; then 18 | wget -O $EXAMPLE_DATA_DIR/VOCtest_06-Nov-2007.tar http://s3-us-west-2.amazonaws.com/voc-data/VOCtest_06-Nov-2007.tar 19 | fi 20 | 21 | #Run the pipeline 22 | $KEYSTONE_DIR/bin/run-pipeline.sh \ 23 | keystoneml.pipelines.images.voc.VOCSIFTFisher \ 24 | --trainLocation $EXAMPLE_DATA_DIR/VOCtrainval_06-Nov-2007.tar \ 25 | --testLocation $EXAMPLE_DATA_DIR/VOCtest_06-Nov-2007.tar \ 26 | --labelPath $KEYSTONE_DIR/src/test/resources/images/voclabels.csv \ 27 | --numParts 200 28 | -------------------------------------------------------------------------------- /examples/text/newsgroups_ngrams_tfidf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #Set environment variables 5 | : ${KEYSTONE_MEM:=4g} 6 | export KEYSTONE_MEM 7 | 8 | : ${NUM_PARTS:=256} 9 | : ${NGRAMS:=2} 10 | : ${COMMON_FEATURES:=1000} 11 | 12 | KEYSTONE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/../.. 13 | : ${EXAMPLE_DATA_DIR:=$KEYSTONE_DIR/example_data} 14 | 15 | if [ ! -d $EXAMPLE_DATA_DIR ]; then 16 | mkdir $EXAMPLE_DATA_DIR 17 | fi 18 | 19 | 20 | #Download 20 Newsgroups data if necessary. 21 | if [ ! -f $EXAMPLE_DATA_DIR/20news-bydate.tar.gz ]; then 22 | wget -O $EXAMPLE_DATA_DIR/20news-bydate.tar.gz http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz 23 | tar zxvf $EXAMPLE_DATA_DIR/20news-bydate.tar.gz -C $EXAMPLE_DATA_DIR 24 | fi 25 | 26 | #Run pipeline. 
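# A note on configuration (not part of the original invocation below): the
# `: ${VAR:=default}` lines at the top only set NGRAMS and COMMON_FEATURES when
# they are not already defined, so they can be overridden from the environment
# without editing the script, e.g. (values here are purely illustrative):
#   NGRAMS=3 COMMON_FEATURES=4000 ./examples/text/newsgroups_ngrams_tfidf.sh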
27 | $KEYSTONE_DIR/bin/run-pipeline.sh \ 28 | keystoneml.pipelines.text.NewsgroupsPipeline \ 29 | --trainLocation $EXAMPLE_DATA_DIR/20news-bydate-train \ 30 | --testLocation $EXAMPLE_DATA_DIR/20news-bydate-test \ 31 | --nGrams $NGRAMS \ 32 | --commonFeatures $COMMON_FEATURES 33 | -------------------------------------------------------------------------------- /lib/libImageFeatures.dylib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/lib/libImageFeatures.dylib -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | 2 | sbt.version=0.13.13 3 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Sonatype snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/" 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 4 | 5 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 6 | -------------------------------------------------------------------------------- /sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script launches sbt for this project. If present it uses the system 21 | # version of sbt. If there is no system version of sbt it attempts to download 22 | # sbt locally. 23 | SBT_VERSION=`awk -F "=" '/sbt\\.version/ {print $2}' ./project/build.properties` 24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 27 | 28 | # Download sbt launch jar if it hasn't been downloaded yet 29 | if [ ! -f ${JAR} ]; then 30 | # Download 31 | printf "Attempting to fetch sbt\n" 32 | if hash curl 2>/dev/null; then 33 | curl --fail --location --silent ${URL1} > ${JAR} || curl --fail --location --silent ${URL2} > ${JAR} 34 | elif hash wget 2>/dev/null; then 35 | wget --quiet ${URL1} -O ${JAR} || wget --quiet ${URL2} -O ${JAR} 36 | else 37 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 38 | exit -1 39 | fi 40 | fi 41 | if [ ! -f ${JAR} ]; then 42 | # We failed to download 43 | printf "Our attempt to download sbt locally to ${JAR} failed. 
Please install sbt manually from http://www.scala-sbt.org/\n" 44 | exit -1 45 | fi 46 | printf "Launching sbt from ${JAR}\n" 47 | 48 | FWDIR="$(cd `dirname $0`/..; pwd)" 49 | 50 | java \ 51 | -Djava.library.path="$FWDIR/lib" \ 52 | -Xmx4000m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ 53 | -jar ${JAR} \ 54 | "$@" 55 | -------------------------------------------------------------------------------- /scripts/constantEstimator.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(reshape) 3 | 4 | nmachines <- 16 5 | bs <- 1024 6 | 7 | flops <- function(solver, n, d, k, sparsity) { 8 | ifelse(solver=="LS - LBFGS", 9 | 20*n*sparsity*d*k/nmachines, 10 | ifelse(solver=="Exact", 11 | n*d*(d+k)/nmachines, 12 | 3*(n*d*(bs+k)/nmachines))) 13 | } 14 | 15 | mem <- function(solver, n, d, k, sparsity) { 16 | ifelse(solver=="LS - LBFGS", 17 | 20*n*d*sparsity/nmachines, 18 | ifelse(solver=="Exact", 19 | n*d/nmachines + d**2, 20 | 3*(n*d/nmachines + d*k))) 21 | } 22 | 23 | network <- function(solver, n, d, k, sparsity) { 24 | ifelse(solver=="LS - LBFGS", 25 | 20*2*d*k*log(nmachines), 26 | ifelse(solver=="Exact", 27 | d*(d+k), 28 | 3*2*(d*(bs+k))*log(nmachines))) 29 | } 30 | 31 | 32 | main <- function () { 33 | x <- read.csv("solver-comparisons-final.csv") 34 | n=list(Amazon=65e6, TIMIT=2.2e6) 35 | k=list(Amazon=2, TIMIT=138) 36 | nnz=list(Amazon=0.005, TIMIT=1.0) 37 | colnames(x) <- c("Experiment", "solver", "d","t","train.error","loss","loss.2") 38 | 39 | 40 | x$n <- as.numeric(n[x$Experiment]) 41 | x$k <- as.numeric(k[x$Experiment]) 42 | x$s <- as.numeric(nnz[x$Experiment]) 43 | 44 | x$cpu <- with(x, flops(solver, n, d, k, s)) 45 | x$mem <- with(x, mem(solver, n, d, k, s)) 46 | x$network <- with(x, network(solver, n, d, k, s)) 47 | 48 | list(data=x, model=lm(t ~ cpu + mem + network, data=x)) 49 | } 50 | 51 | 52 | plotter <- function(res) { 53 | res$data$pred <- predict(res$model, res$data) 54 | dn <- res$data[,c("Experiment","solver","d","t","pred")] 55 | dnm <- melt(dn, id.vars=c("Experiment","solver","d")) 56 | 57 | qplot(d, value, geom='line', color=solver, shape=variable, data=dnm) + 58 | facet_grid(Experiment ~ ., scale="free_y") + 59 | theme_bw() + 60 | geom_point() 61 | } -------------------------------------------------------------------------------- /scripts/solver-comparisons-final.csv: -------------------------------------------------------------------------------- 1 | Experiment,Solver,Num Features,Time (ms),Train Error (%),Loss,Loss/2 2 | Amazon,Exact,1024,186149,15.9,0.4675666294,0.2337833147 3 | Amazon,Block,1024,894313,15.9,0.4675666294,0.2337833147 4 | Amazon,LS - LBFGS,1024,33704,15.5,0.4833497961,0.2416748981 5 | Amazon,Exact,2048,690558,14.5,0.4343235648,0.2171617824 6 | Amazon,Block,2048,1756617,14.4,0.4349389635,0.2174694817 7 | Amazon,LS - LBFGS,2048,33643,13.9,0.4512289158,0.2256144579 8 | Amazon,Block,4096,3476561,13,0.4045493394,0.2022746697 9 | Amazon,LS - LBFGS,4096,40606,12.7,0.4208100192,0.2104050096 10 | Amazon,Block,8192,6889505,12.1,0.3886991737,0.1943495868 11 | Amazon,LS - LBFGS,8192,45407,11.9,0.4045554759,0.2022777379 12 | Amazon,Block,16384,13631976,11.4,0.3761958693,0.1880979346 13 | Amazon,LS - LBFGS,16384,52290,11.4,0.3958041617,0.1979020809 14 | TIMIT,Exact,1024,7323,50.42190579,1.584064323,0.7920321614 15 | TIMIT,Block,1024,33521,50.42190579,1.584064323,0.7920321614 16 | TIMIT,LS - LBFGS,1024,70396,50.40627225,1.583816101,0.7919080506 17 | 
TIMIT,Exact,2048,17949,46.04677894,1.490036052,0.7450180261 18 | TIMIT,Block,2048,61395,46.22247864,1.499799737,0.7498998685 19 | TIMIT,LS - LBFGS,2048,98834,46.32929304,1.497731196,0.7488655978 20 | TIMIT,Exact,4096,76562,42.15900112,1.400043524,0.7000217618 21 | TIMIT,Block,4096,120998,42.52203686,1.414037154,0.7070185769 22 | TIMIT,LS - LBFGS,4096,259498,43.25974465,1.41154238,0.7057711902 23 | TIMIT,Exact,8192,315183,38.63821184,1.314324568,0.6571622842 24 | TIMIT,Block,8192,255570,39.12005362,1.336486736,0.6682433679 25 | TIMIT,LS - LBFGS,8192,810286,40.81220695,1.341040478,0.6705202388 26 | TIMIT,Block,16384,580555,35.73174973,1.265805335,0.6329026676 27 | TIMIT,LS - LBFGS,16384,1589308,39.58150961,1.293037819,0.6465189093 -------------------------------------------------------------------------------- /src/main/cpp/EncEval.h: -------------------------------------------------------------------------------- 1 | /* DO NOT EDIT THIS FILE - it is machine generated */ 2 | #include 3 | /* Header for class keystoneml_utils_external_EncEval */ 4 | 5 | #ifndef _Included_keystoneml_utils_external_EncEval 6 | #define _Included_keystoneml_utils_external_EncEval 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | /* 11 | * Class: keystoneml_utils_external_EncEval 12 | * Method: computeGMM 13 | * Signature: (II[F)[F 14 | */ 15 | JNIEXPORT jfloatArray JNICALL Java_keystoneml_utils_external_EncEval_computeGMM 16 | (JNIEnv *, jobject, jint, jint, jfloatArray); 17 | 18 | /* 19 | * Class: keystoneml_utils_external_EncEval 20 | * Method: calcAndGetFVs 21 | * Signature: ([FII[F[F[F)[F 22 | */ 23 | JNIEXPORT jfloatArray JNICALL Java_keystoneml_utils_external_EncEval_calcAndGetFVs 24 | (JNIEnv *, jobject, jfloatArray, jint, jint, jfloatArray, jfloatArray, jfloatArray); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | #endif 30 | -------------------------------------------------------------------------------- /src/main/cpp/VLFeat.h: -------------------------------------------------------------------------------- 1 | /* DO NOT EDIT THIS FILE - it is machine generated */ 2 | #include 3 | /* Header for class keystoneml_utils_external_VLFeat */ 4 | 5 | #ifndef _Included_keystoneml_utils_external_VLFeat 6 | #define _Included_keystoneml_utils_external_VLFeat 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | /* 11 | * Class: keystoneml_utils_external_VLFeat 12 | * Method: getSIFTs 13 | * Signature: (IIIIII[F)[S 14 | */ 15 | JNIEXPORT jshortArray JNICALL Java_keystoneml_utils_external_VLFeat_getSIFTs 16 | (JNIEnv *, jobject, jint, jint, jint, jint, jint, jint, jfloatArray); 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | #endif 22 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Only pay attention to INFO messages from Keystone. 
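# Other packages can be dialed up or down in the same way; for example
# (hypothetical logger name, shown commented out):
# log4j.logger.com.example.mypipeline=DEBUG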
8 | log4j.logger.keystoneml.pipelines=INFO 9 | log4j.logger.keystoneml.workflow=INFO 10 | log4j.logger.keystoneml.nodes=INFO 11 | log4j.logger.keystoneml.utils=INFO 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/evaluation/AugmentedExamplesEvaluator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.evaluation 2 | 3 | import breeze.linalg._ 4 | import keystoneml.nodes.util.MaxClassifier 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.reflect.ClassTag 8 | 9 | object AggregationPolicyType extends Enumeration { 10 | type AggregationPolicyType = Value 11 | val average, borda = Value 12 | } 13 | 14 | class AugmentedExamplesEvaluator[T : ClassTag]( 15 | names: RDD[T], 16 | numClasses: Int, 17 | policy: AggregationPolicyType.Value = AggregationPolicyType.average) 18 | extends Evaluator[DenseVector[Double], Int, MulticlassMetrics] with Serializable { 19 | 20 | def averagePolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = { 21 | preds.reduce(_ + _) :/ preds.size.toDouble 22 | } 23 | 24 | /** 25 | * Borda averaging works as follows: 26 | * Let s(k) be the ordering of patch k. 27 | * For i in images, 28 | * For k in patches, 29 | * score[i] += s(k)[i] 30 | */ 31 | def bordaPolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = { 32 | val ranks = preds.map { vec => 33 | val sortedPreds = vec.toArray.zipWithIndex.sortBy(_._1).map(_._2) 34 | val rank = DenseVector(sortedPreds.zipWithIndex.sortBy(_._1).map(x => x._2.toDouble)) 35 | rank 36 | } 37 | ranks.reduceLeft(_ + _) 38 | } 39 | 40 | def evaluate( 41 | predicted: RDD[DenseVector[Double]], 42 | actualLabels: RDD[Int]): MulticlassMetrics = { 43 | 44 | val aggFunc = policy match { 45 | case AggregationPolicyType.borda => bordaPolicy _ 46 | case _ => averagePolicy _ 47 | } 48 | 49 | // associate a name with each predicted, actual 50 | val namedPreds = names.zip(predicted.zip(actualLabels)) 51 | 52 | // group by name to get all the predicted values for a name 53 | val groupedPreds = namedPreds.groupByKey(names.partitions.length).map { case (group, iter) => 54 | val predActuals = iter.toArray // this is a array of tuples 55 | val predsForName = predActuals.map(_._1) 56 | assert(predActuals.map(_._2).distinct.size == 1) 57 | val actualForName: Int = predActuals.map(_._2).head 58 | 59 | (predsForName, actualForName) 60 | }.cache() 61 | 62 | // Averaging policy 63 | val finalPred = groupedPreds.map(x => (aggFunc(x._1), x._2) ) 64 | val finalPredictedLabels = MaxClassifier(finalPred.map(_._1)) 65 | val finalActualLabels = finalPred.map(_._2) 66 | 67 | val ret = new MulticlassClassifierEvaluator(numClasses).evaluate(finalPredictedLabels, finalActualLabels) 68 | groupedPreds.unpersist() 69 | ret 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/evaluation/Evaluator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.evaluation 2 | 3 | import org.apache.spark.rdd.RDD 4 | import keystoneml.workflow.PipelineDataset 5 | 6 | /** 7 | * An Evaluator is an object whose "evaluate" method takes a vector of Predictions and a set of Labels (of the same 8 | * length and order) and returns an "Evaluation" which is specific to the domain (binary classification, multi-label 9 | * classification, etc.). 
The Evaluation is typically a set of summary statistics designed to capture the performance 10 | * of a machine learning pipeline. 11 | * 12 | * Because evaluation typically happens at the end of a pipeline, we support the cartesian product of 13 | * {RDD, PipelineDataset} for both sets of arguments. 14 | * 15 | * @tparam P Type of Predictions. 16 | * @tparam L Type of the Labels. 17 | * @tparam E Type of the Evaluation. 18 | */ 19 | trait Evaluator[P,L,E] { 20 | 21 | /** 22 | * Generate an evaluation. 23 | * 24 | * @param predictions Predicted values. 25 | * @param labels True labels. (Same order and length and the predictions). 26 | * 27 | * @return An evaluation. 28 | */ 29 | def evaluate(predictions: RDD[P], labels: RDD[L]): E 30 | 31 | def evaluate(predictions: PipelineDataset[P], labels: RDD[L]): E = evaluate(predictions.get, labels) 32 | 33 | def evaluate(predictions: RDD[P], labels: PipelineDataset[L]): E = evaluate(predictions, labels.get) 34 | 35 | def evaluate(predictions: PipelineDataset[P], labels: PipelineDataset[L]): E = evaluate(predictions.get, labels.get) 36 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/AmazonReviewsDataLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.{SQLContext, SparkSession} 5 | 6 | 7 | object AmazonReviewsDataLoader { 8 | /** 9 | * Loads the Amazon Product Reviews dataset for binary classification. 10 | * Each review is a JSON string with (at least) two fields: "reviewText" and "overAll". 11 | * 12 | * This data loader produces an RDD of labeled reviews. 13 | * 14 | * @param spark SparkSession to use (needed for SQL) 15 | * @param dataDir Directory of the training data 16 | * @param threshold Lowest value at which to consider a review positive. 17 | * @return A Labeled Dataset that contains the data strings and labels. 18 | */ 19 | def apply(spark: SparkSession, dataDir: String, threshold: Double): LabeledData[Int, String] = { 20 | import spark.implicits._ 21 | 22 | val df = spark.read.json(dataDir) 23 | val data = df.select(df("overall"), df("reviewText")) 24 | .map(r => (if(r.getAs[Double](0) >= threshold) 1 else 0, r.getAs[String](1))).rdd 25 | 26 | LabeledData(data) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/CifarLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import java.io.FileInputStream 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.utils.{ImageMetadata, LabeledImage, RowColumnMajorByteArrayVectorizedImage} 8 | 9 | 10 | /** 11 | * Loads images from the CIFAR-10 Dataset. 12 | */ 13 | object CifarLoader { 14 | // We hardcode this because these are properties of the CIFAR-10 dataset. 15 | val nrow = 32 16 | val ncol = 32 17 | val nchan = 3 18 | 19 | val labelSize = 1 20 | 21 | def cifar10ToBufferedImage(cifar: Array[Byte]): RowColumnMajorByteArrayVectorizedImage = { 22 | val byteLen = nrow*ncol*nchan 23 | 24 | // Allocate some space for the rows. 
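    // Record layout (per the CIFAR-10 binary format description): each record is
    // one label byte followed by 3072 pixel bytes -- 1024 red, 1024 green, then
    // 1024 blue values, with each channel stored row-major. The `cifar` array
    // passed in here is only the 3072 pixel bytes; the label byte has already
    // been stripped off by the caller (loadLabeledImages below).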
25 | require(cifar.length == byteLen, "CIFAR-10 Images MUST be 32x32x3.") 26 | 27 | RowColumnMajorByteArrayVectorizedImage(cifar, ImageMetadata(nrow, ncol, nchan)) 28 | } 29 | 30 | def loadLabeledImages(path: String): Seq[LabeledImage] = { 31 | val imgCount = labelSize + nrow*ncol*nchan 32 | 33 | val imageBytes = Array.fill[Byte](imgCount)(0x00) 34 | var out = Array[LabeledImage]() 35 | 36 | val inFile = new FileInputStream(path) 37 | 38 | while(inFile.read(imageBytes, 0, imgCount) > 0) { 39 | val img = cifar10ToBufferedImage(imageBytes.tail) 40 | val label = imageBytes.head.toShort 41 | val li = LabeledImage(img, label) 42 | out = out :+ li 43 | } 44 | out 45 | } 46 | 47 | def apply(sc: SparkContext, path: String): RDD[LabeledImage] = { 48 | val images = CifarLoader.loadLabeledImages(path) 49 | 50 | sc.parallelize(images) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/CsvDataLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Data Loader that loads csv files of comma separated numbers into an RDD of DenseVectors 9 | */ 10 | object CsvDataLoader { 11 | /** 12 | * Load CSV files from the given path into an RDD of DenseVectors 13 | * @param sc The spark context to use 14 | * @param path The path to the CSV files 15 | * @return RDD of DenseVectors, one per CSV row 16 | */ 17 | def apply(sc: SparkContext, path: String): RDD[DenseVector[Double]] = { 18 | sc.textFile(path).map(row => DenseVector(row.split(",").map(_.toDouble))) 19 | } 20 | 21 | /** 22 | * Load CSV files from the given path into an RDD of DenseVectors 23 | * @param sc The spark context to use 24 | * @param path The path to the CSV files 25 | * @param minPartitions The minimum # of partitions to use 26 | * @return RDD of DenseVectors, one per CSV row 27 | */ 28 | def apply(sc: SparkContext, path: String, minPartitions: Int): RDD[DenseVector[Double]] = { 29 | sc.textFile(path, minPartitions).map(row => DenseVector(row.split(",").map(_.toDouble))) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/ImageNetLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.utils.LabeledImage 6 | 7 | /** 8 | * Helper object to loads images from ImageNet Datasets. 9 | */ 10 | 11 | object ImageNetLoader { 12 | 13 | val NUM_CLASSES = 1000 14 | 15 | /** 16 | * Loads images from @dataPath and associates images with the labels provided in @labelPath 17 | * 18 | * @param sc SparkContext to use 19 | * @param dataPath Directory containing tar files (can be a HDFS path). This classes assumes 20 | * that each tar file contains images within a directory. The name of the 21 | * directory is treated as the className. 
22 | * @param labelsPath Local file that maps classNames to a numeric value 23 | */ 24 | def apply(sc: SparkContext, dataPath: String, labelsPath: String): RDD[LabeledImage] = { 25 | val filePathsRDD = ImageLoaderUtils.getFilePathsRDD(sc, dataPath) 26 | 27 | val labelsMapFile = scala.io.Source.fromFile(labelsPath) 28 | val labelsMap = labelsMapFile.getLines().map(x => x.toString).toArray.map { line => 29 | val parts = line.split(" ") 30 | (parts(0), parts(1).toInt) 31 | }.toMap 32 | 33 | def labelsMapF(fname: String): Int = { 34 | labelsMap(fname.split('/')(0)) 35 | } 36 | 37 | ImageLoaderUtils.loadFiles(filePathsRDD, labelsMapF, LabeledImage.apply) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/LabeledData.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | /** 8 | * A case class containing an RDD of labeled data 9 | * @tparam Label The type of the labels 10 | * @tparam Datum The type of the data 11 | */ 12 | case class LabeledData[Label : ClassTag, Datum : ClassTag](labeledData: RDD[(Label, Datum)]) { 13 | val data: RDD[Datum] = labeledData.map(_._2) 14 | val labels: RDD[Label] = labeledData.map(_._1) 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/NewsgroupsDataLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.{RDD, UnionRDD} 5 | 6 | import scala.reflect.ClassTag 7 | 8 | 9 | object NewsgroupsDataLoader { 10 | /** The 20 Newsgroups class labels (and directory names) **/ 11 | val classes = Array( 12 | "comp.graphics", 13 | "comp.os.ms-windows.misc", 14 | "comp.sys.ibm.pc.hardware", 15 | "comp.sys.mac.hardware", 16 | "comp.windows.x", 17 | "rec.autos", 18 | "rec.motorcycles", 19 | "rec.sport.baseball", 20 | "rec.sport.hockey", 21 | "sci.crypt", 22 | "sci.electronics", 23 | "sci.med", 24 | "sci.space", 25 | "misc.forsale", 26 | "talk.politics.misc", 27 | "talk.politics.guns", 28 | "talk.politics.mideast", 29 | "talk.religion.misc", 30 | "alt.atheism", 31 | "soc.religion.christian" 32 | ) 33 | 34 | /** 35 | * Loads the 20 newsgroups dataset. 
36 | * Designed to load data from 20news-bydate.tar.gz from http://qwone.com/~jason/20Newsgroups/ 37 | * 38 | * The expected directory structure for the train and test dirs is: 39 | * train_or_test_dir/class_label/docs_as_separate_plaintext_files 40 | * 41 | * @param sc SparkContext to use 42 | * @param dataDir Directory of the training data 43 | * @return A NewsgroupsData object containing the loaded train & test data as RDDs 44 | */ 45 | def apply(sc: SparkContext, dataDir: String): LabeledData[Int, String] = { 46 | val data: RDD[(Int, String)] = new UnionRDD(sc, classes.zipWithIndex.map{ case (className, index) => { 47 | sc.wholeTextFiles(s"$dataDir/$className").map(index -> _._2) 48 | }}) 49 | 50 | LabeledData(data) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/TimitFeaturesDataLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.collection.mutable 8 | 9 | /** A case class containing loaded pre-featurized TIMIT train & test data */ 10 | case class TimitFeaturesData( 11 | train: LabeledData[Int, DenseVector[Double]], 12 | test: LabeledData[Int, DenseVector[Double]] 13 | ) 14 | 15 | object TimitFeaturesDataLoader { 16 | val timitDimension = 440 17 | val numClasses = 147 18 | 19 | // Assumes lines are formatted as 20 | // row col value 21 | private def parseSparseLabels(fileName: String) = { 22 | // Mapping from row number to label 23 | val ret = new mutable.HashMap[Long, Int] 24 | 25 | val lines = scala.io.Source.fromFile(fileName).getLines() 26 | lines.foreach { line => 27 | val parts = line.split(" ") 28 | ret(parts(0).toLong - 1) = parts(1).toInt 29 | } 30 | ret 31 | } 32 | 33 | private def createLabelsRDD( 34 | labelsMap: mutable.HashMap[Long, Int], 35 | featuresRDD: RDD[_]) = { 36 | val labelsMapBC = featuresRDD.context.broadcast(labelsMap) 37 | val labelsRDD = featuresRDD.zipWithIndex().map { case (item, row) => 38 | labelsMapBC.value(row) - 1 39 | } 40 | labelsRDD 41 | } 42 | 43 | /** 44 | * Loads the pre-featurized Timit data. 
45 | * Expects features data to be stored as a csv of numbers, 46 | * and labels as "row# label" where row# is the number of the row in the data csv it is 47 | * referring to (starting at row #1) 48 | * 49 | * @param sc SparkContext to use 50 | * @param trainDataLocation CSV of the training data 51 | * @param trainLabelsLocation labels of the training data 52 | * @param testDataLocation CSV of the test data 53 | * @param testLabelsLocation labels of the test data 54 | * @param numParts number of partitions per RDD 55 | * @return A TimitFeaturesData object containing the loaded train & test data as RDDs 56 | */ 57 | def apply(sc: SparkContext, 58 | trainDataLocation: String, 59 | trainLabelsLocation: String, 60 | testDataLocation: String, 61 | testLabelsLocation: String, 62 | numParts: Int = 512): TimitFeaturesData = { 63 | val trainData = CsvDataLoader(sc, trainDataLocation, numParts) 64 | val trainLabels = createLabelsRDD(parseSparseLabels(trainLabelsLocation), trainData) 65 | 66 | val testData = CsvDataLoader(sc, testDataLocation, numParts) 67 | val testLabels = createLabelsRDD(parseSparseLabels(testLabelsLocation), testData) 68 | TimitFeaturesData(LabeledData(trainLabels.zip(trainData)), LabeledData(testLabels.zip(testData))) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/VOCLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines.Logging 6 | import keystoneml.utils.MultiLabeledImage 7 | 8 | 9 | case class VOCDataPath(imagesDirName: String, namePrefix: String, numParts: Option[Int]) 10 | case class VOCLabelPath(labelsFileName: String) 11 | 12 | /** 13 | * A data loader for the VOC 2007 Dataset. Expects input in a tar file. 14 | */ 15 | object VOCLoader extends Logging with Serializable { 16 | val NUM_CLASSES = 20 // This is a constant defined by the VOC 2007 dataset. 17 | 18 | /** 19 | * Loads a data path given a spark context and labels and returns an RDD[MultiLabeledImage]. 20 | * 21 | * A property of the VOC dataset is that images can have multiple labels which we 22 | * have to deal with later in the pipeline. 23 | * 24 | * @param sc A Spark Context 25 | * @param dataPath Path to image tar. 26 | * @param labelsPath Path to label csv. 
27 | * @return 28 | */ 29 | def apply(sc: SparkContext, dataPath: VOCDataPath, labelsPath: VOCLabelPath): RDD[MultiLabeledImage] = { 30 | val filePathsRDD = ImageLoaderUtils.getFilePathsRDD(sc, dataPath.imagesDirName, dataPath.numParts) 31 | 32 | val labelsMapFile = scala.io.Source.fromFile(labelsPath.labelsFileName) 33 | 34 | val labelsMap: Map[String, Array[Int]] = labelsMapFile 35 | .getLines() 36 | .drop(1) 37 | .map(x => x.toString) 38 | .map { line => 39 | val parts = line.split(",") 40 | (parts(4).replace("\"", ""), parts(1).toInt - 1) 41 | } 42 | .toArray 43 | .groupBy(_._1) 44 | .mapValues(_.map(_._2)) 45 | .map(identity) 46 | 47 | labelsMapFile.close() 48 | 49 | ImageLoaderUtils.loadFiles(filePathsRDD, labelsMap, MultiLabeledImage.apply, Some(dataPath.namePrefix)) 50 | } 51 | } 52 | 53 | 54 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/CenterCornerPatcher.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import keystoneml.utils.{ImageUtils, Image} 6 | import keystoneml.pipelines.FunctionNode 7 | 8 | /** 9 | * Extract four corner patches and the center patch of the specified size. 10 | * If flips is set to true, then horizontal flips of all 5 patches is also 11 | * returned 12 | * 13 | * @param patchSizeX size of patch along xDim 14 | * @param patchSizeY size of patch along yDim 15 | * @param horizontalFlips if horizontal flips of patches should also be returned 16 | * @return patches of size patchSizeX x patchSizeY 17 | */ 18 | case class CenterCornerPatcher( 19 | patchSizeX: Int, 20 | patchSizeY: Int, 21 | horizontalFlips: Boolean) extends FunctionNode[RDD[Image], RDD[Image]] { 22 | 23 | def apply(in: RDD[Image]): RDD[Image] = { 24 | in.flatMap { x => 25 | centerCornerPatchImage(x) 26 | } 27 | } 28 | 29 | def centerCornerPatchImage(in: Image): Iterator[Image] = { 30 | val xDim = in.metadata.xDim 31 | val yDim = in.metadata.yDim 32 | 33 | val startXs = Array(0, xDim-patchSizeX, 0, xDim-patchSizeX, (xDim-patchSizeX)/2) 34 | val startYs = Array(0, 0, yDim-patchSizeY, yDim-patchSizeY, (yDim-patchSizeY)/2) 35 | 36 | (0 until startXs.length).iterator.flatMap { idx => 37 | val endX = startXs(idx) + patchSizeX 38 | val endY = startYs(idx) + patchSizeY 39 | val im = ImageUtils.crop(in, startXs(idx), startYs(idx), endX, endY) 40 | if (horizontalFlips) { 41 | val flippedIm = ImageUtils.flipHorizontal(im) 42 | Iterator(im, flippedIm) 43 | } else { 44 | Iterator.single(im) 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/Cropper.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.utils.{ImageUtils, Image} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Crop an input image to the given bounding box described by 8 | * (startX, startY, endX, endY). 
9 | * 10 | * Wrapper for `ImageUtils.crop()` 11 | * 12 | * @param startX x-position (inclusive) to describe upper left corner of BB 13 | * @param startY y-position (inclusive) to describe upper left corner of BB 14 | * @param endX x-position (exclusive) to describe lower right corner of BB 15 | * @param endY y-position (exclusive) to describe lower right corner of BB 16 | * @return new image of size (endX - startX, endY - startY) 17 | */ 18 | case class Cropper(startX: Int, startY: Int, endX: Int, endY: Int) extends Transformer[Image,Image] { 19 | def apply(in: Image): Image = { 20 | ImageUtils.crop(in, startX, startY, endX, endY) 21 | } 22 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/GrayScaler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.workflow.Transformer 4 | import keystoneml.utils.{ImageUtils, Image} 5 | 6 | /** 7 | * Converts an input images to NTSC-standard grayscale. 8 | */ 9 | object GrayScaler extends Transformer[Image,Image] { 10 | def apply(in: Image): Image = ImageUtils.toGrayScale(in) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/ImageVectorizer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines._ 6 | import keystoneml.utils.Image 7 | import keystoneml.workflow.Transformer 8 | 9 | /** 10 | * Takes an image and converts it to a dense vector. 11 | */ 12 | object ImageVectorizer extends Transformer[Image, DenseVector[Double]] { 13 | def apply(in: Image): DenseVector[Double] = { 14 | DenseVector(in.toArray) 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/LabeledImageExtractors.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.utils.{MultiLabeledImage, Image, LabeledImage} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Extracts a label from a labeled image. 8 | */ 9 | object LabelExtractor extends Transformer[LabeledImage, Int] { 10 | def apply(in: LabeledImage): Int = in.label 11 | } 12 | 13 | /** 14 | * Extracts an image from a labeled image. 15 | */ 16 | object ImageExtractor extends Transformer[LabeledImage, Image] { 17 | def apply(in: LabeledImage): Image = in.image 18 | } 19 | 20 | /** 21 | * Extracts a label from a multi-labeled image. 22 | */ 23 | object MultiLabelExtractor extends Transformer[MultiLabeledImage, Array[Int]] { 24 | override def apply(in: MultiLabeledImage): Array[Int] = in.label 25 | } 26 | 27 | /** 28 | * Extracts an image from a multi-labeled image. 29 | */ 30 | object MultiLabeledImageExtractor extends Transformer[MultiLabeledImage, Image] { 31 | def apply(in: MultiLabeledImage): Image = in.image 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/PixelScaler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.workflow.Transformer 4 | import keystoneml.utils.{ImageUtils, Image} 5 | 6 | 7 | /** 8 | * Rescales an input image from [0 .. 
255] to [0 .. 1]. Works by dividing each pixel by 255.0. 9 | */ 10 | object PixelScaler extends Transformer[Image,Image] { 11 | def apply(im: Image): Image = { 12 | ImageUtils.mapPixels(im, _/255.0) 13 | } 14 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/Pooler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.DenseVector 4 | import keystoneml.pipelines._ 5 | import keystoneml.utils.{ImageMetadata, ChannelMajorArrayVectorizedImage, Image} 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * This node takes an image and performs pooling on regions of the image. 10 | * 11 | * Divides images into fixed size pools, but when fed with images of various 12 | * sizes may produce a varying number of pools. 13 | * 14 | * NOTE: By default strides start from poolSize/2. 15 | * 16 | * @param stride x and y stride to get regions of the image 17 | * @param poolSize size of the patch to perform pooling on 18 | * @param pixelFunction function to apply on every pixel before pooling 19 | * @param poolFunction pooling function to use on every region. 20 | */ 21 | class Pooler( 22 | stride: Int, 23 | poolSize: Int, 24 | pixelFunction: Double => Double, 25 | poolFunction: DenseVector[Double] => Double) 26 | extends Transformer[Image, Image] { 27 | 28 | val strideStart = poolSize / 2 29 | 30 | def apply(image: Image) = { 31 | val xDim = image.metadata.xDim 32 | val yDim = image.metadata.yDim 33 | val numChannels = image.metadata.numChannels 34 | 35 | val numPoolsX = math.ceil((xDim - strideStart).toDouble / stride).toInt 36 | val numPoolsY = math.ceil((yDim - strideStart).toDouble / stride).toInt 37 | val patch = new Array[Double]( numPoolsX * numPoolsY * numChannels) 38 | 39 | // Start at strideStart in (x, y) and 40 | for (x <- strideStart until xDim by stride; 41 | y <- strideStart until yDim by stride) { 42 | // Extract the pool. 
Then apply the pixel and pool functions 43 | 44 | val pool = DenseVector.zeros[Double](poolSize * poolSize) 45 | val startX = x - poolSize/2 46 | val endX = math.min(x + poolSize/2, xDim) 47 | val startY = y - poolSize/2 48 | val endY = math.min(y + poolSize/2, yDim) 49 | 50 | var c = 0 51 | while (c < numChannels) { 52 | var s = startX 53 | while (s < endX) { 54 | var b = startY 55 | while (b < endY) { 56 | pool((s-startX) + (b-startY)*(endX-startX)) = 57 | pixelFunction(image.get(s, b, c)) 58 | b = b + 1 59 | } 60 | s = s + 1 61 | } 62 | patch(c + (x - strideStart)/stride * numChannels + 63 | (y - strideStart)/stride * numPoolsX * numChannels) = poolFunction(pool) 64 | c = c + 1 65 | } 66 | } 67 | ChannelMajorArrayVectorizedImage(patch, ImageMetadata(numPoolsX, numPoolsY, numChannels)) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/RandomImageTransformer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import keystoneml.utils.{ImageUtils, Image} 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Transform an image with the given probability 10 | * 11 | * @param chance probability that an image should be transformed 12 | * @param transform function to apply to image 13 | * @return transformed image or original image 14 | */ 15 | 16 | case class RandomImageTransformer( 17 | chance: Double, 18 | transform: Image => Image, 19 | seed: Long = 12334L) extends Transformer[Image, Image] { 20 | 21 | val rnd = new java.util.Random(seed) 22 | 23 | def apply(im: Image): Image = { 24 | val flip = rnd.nextDouble() 25 | if (flip < chance) { 26 | transform(im) 27 | } else { 28 | im 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/RandomPatcher.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import keystoneml.utils.{ImageUtils, Image} 6 | import keystoneml.pipelines.FunctionNode 7 | 8 | /** 9 | * Extract uniformly random patches from an image 10 | * 11 | * @param numPatches number of random patches to extract 12 | * @param patchSizeX size of each patch along xDim 13 | * @param patchSizeY size of each patch along yDim 14 | * @return numPatches images of size patchSizeX x patchSizeY 15 | */ 16 | case class RandomPatcher( 17 | numPatches: Int, 18 | patchSizeX: Int, 19 | patchSizeY: Int, 20 | seed: Long = 12334L) extends FunctionNode[RDD[Image], RDD[Image]] { 21 | 22 | val rnd = new java.util.Random(seed) 23 | 24 | def apply(in: RDD[Image]): RDD[Image] = { 25 | in.flatMap { x => 26 | randomPatchImage(x) 27 | } 28 | } 29 | 30 | def randomPatchImage(in: Image): Iterator[Image] = { 31 | val xDim = in.metadata.xDim 32 | val yDim = in.metadata.yDim 33 | 34 | (0 until numPatches).iterator.map { x => 35 | val borderSizeX = xDim - patchSizeX 36 | val borderSizeY = yDim - patchSizeY 37 | // Pick a random int between 0 and borderSize (inclusive) 38 | val startX = rnd.nextInt(borderSizeX + 1) 39 | val endX = startX + patchSizeX 40 | val startY = rnd.nextInt(borderSizeY + 1) 41 | val endY = startY + patchSizeY 42 | 43 | ImageUtils.crop(in, startX, startY, endX, endY) 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- 
/src/main/scala/keystoneml/nodes/images/SIFTExtractor.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.DenseMatrix 4 | import keystoneml.workflow.Transformer 5 | import keystoneml.utils.Image 6 | 7 | /** 8 | * Abstract interface for SIFT extractor. 9 | */ 10 | trait SIFTExtractorInterface extends Transformer[Image, DenseMatrix[Float]] -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/SymmetricRectifier.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.pipelines._ 4 | import keystoneml.utils.{ChannelMajorArrayVectorizedImage, Image} 5 | import keystoneml.workflow.Transformer 6 | 7 | case class SymmetricRectifier(maxVal: Double = 0.0, alpha: Double = 0.0) 8 | extends Transformer[Image, Image] { 9 | 10 | def apply(img: Image): Image = { 11 | val res = ChannelMajorArrayVectorizedImage( 12 | new Array[Double](img.metadata.xDim * img.metadata.yDim * img.metadata.numChannels * 2), 13 | img.metadata.copy(numChannels = img.metadata.numChannels * 2)) 14 | 15 | var x, y, c = 0 16 | while (x < img.metadata.xDim) { 17 | y = 0 18 | while (y < img.metadata.yDim) { 19 | c = 0 20 | while (c < img.metadata.numChannels) { 21 | res.put(x, y, c, math.max(maxVal, img.get(x, y, c) - alpha)) 22 | res.put(x, y, c + img.metadata.numChannels, math.max(maxVal, -img.get(x, y, c) - alpha)) 23 | c += 1 24 | } 25 | y += 1 26 | } 27 | x += 1 28 | } 29 | 30 | res 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/Windower.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines.FunctionNode 6 | import keystoneml.utils.{ImageMetadata, ChannelMajorArrayVectorizedImage, Image} 7 | 8 | 9 | /** 10 | * @param stride How big a step to take between patches. 11 | * @param windowSize Size of a patch. 12 | */ 13 | class Windower( 14 | stride: Int, 15 | windowSize: Int) extends FunctionNode[RDD[Image], RDD[Image]] { 16 | 17 | def apply(in: RDD[Image]) = { 18 | in.flatMap(getImageWindow) 19 | } 20 | 21 | def getImageWindow(image: Image) = { 22 | val xDim = image.metadata.xDim 23 | val yDim = image.metadata.yDim 24 | val numChannels = image.metadata.numChannels 25 | 26 | // Start at (0,0) in (x, y) and 27 | (0 until xDim - windowSize + 1 by stride).flatMap { x => 28 | (0 until yDim - windowSize + 1 by stride).map { y => 29 | // Extract the window. 
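        // The copy below is channel-major: the flat index is
        // c + (s - startX) * numChannels + (b - startY) * windowSize * numChannels,
        // so all channels of a given (x, y) position sit contiguously, matching the
        // ChannelMajorArrayVectorizedImage constructed at the end of this method.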
30 | val pool = new DenseVector[Double](windowSize * windowSize * numChannels) 31 | val startX = x 32 | val endX = x + windowSize 33 | val startY = y 34 | val endY = y + windowSize 35 | 36 | var c = 0 37 | while (c < numChannels) { 38 | var s = startX 39 | while (s < endX) { 40 | var b = startY 41 | while (b < endY) { 42 | pool(c + (s-startX)*numChannels + 43 | (b-startY)*(endX-startX)*numChannels) = image.get(s, b, c) 44 | b = b + 1 45 | } 46 | s = s + 1 47 | } 48 | c = c + 1 49 | } 50 | ChannelMajorArrayVectorizedImage(pool.toArray, 51 | ImageMetadata(windowSize, windowSize, numChannels)) 52 | } 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/external/FisherVector.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images.external 2 | 3 | import breeze.linalg._ 4 | import keystoneml.nodes.images.FisherVectorInterface 5 | import keystoneml.nodes.learning.GaussianMixtureModel 6 | import keystoneml.nodes.learning.external.GaussianMixtureModelEstimator 7 | import org.apache.spark.rdd.RDD 8 | import keystoneml.utils.MatrixUtils 9 | import keystoneml.utils.external.EncEval 10 | import keystoneml.workflow.{Transformer, Estimator} 11 | 12 | /** 13 | * Implements a wrapper for the `enceval` Fisher Vector implementation. 14 | * 15 | * @param gmm A trained Gaussian Mixture Model 16 | */ 17 | case class FisherVector( 18 | gmm: GaussianMixtureModel) 19 | extends FisherVectorInterface { 20 | 21 | @transient lazy val extLib = new EncEval() 22 | 23 | val numDims = gmm.means.rows 24 | val numCentroids = gmm.means.cols 25 | val numFeatures = numDims * numCentroids * 2 26 | 27 | override def apply(in: DenseMatrix[Float]): DenseMatrix[Float] = { 28 | val means = convert(gmm.means, Float).toArray 29 | val vars = convert(gmm.variances, Float).toArray 30 | val wts = convert(gmm.weights, Float).toArray 31 | 32 | val fisherVector = extLib.calcAndGetFVs(means, numDims, numCentroids, 33 | vars, wts, in.toArray) 34 | 35 | new DenseMatrix(numDims, numCentroids*2, fisherVector) 36 | } 37 | } 38 | 39 | /** 40 | * Trains an `enceval` Fisher Vector implementation, via 41 | * estimating a GMM by treating each column of the inputs as a separate 42 | * DenseVector input to [[GaussianMixtureModelEstimator]] 43 | * 44 | * TODO: Pending philosophical discussions on how to best make it so you can 45 | * swap in GMM, KMeans++, etc. for Fisher Vectors. For now just hard-codes GMM here 46 | * 47 | * @param k Number of centers to estimate. 
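 *
 * A minimal usage sketch (the RDD name `siftDescriptors` and k = 64 are illustrative):
 * {{{
 *   val fisherVectorizer = EncEvalGMMFisherVectorEstimator(k = 64).fit(siftDescriptors)
 * }}}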
48 | */ 49 | case class EncEvalGMMFisherVectorEstimator(k: Int) extends Estimator[DenseMatrix[Float], DenseMatrix[Float]] { 50 | def fit(data: RDD[DenseMatrix[Float]]): FisherVector = { 51 | val gmmTrainingData = data.flatMap(x => MatrixUtils.matrixToColArray(x).map(i => convert(i, Double))) 52 | val gmmEst = new GaussianMixtureModelEstimator(k) 53 | val gmm = gmmEst.fit(gmmTrainingData) 54 | FisherVector(gmm) 55 | } 56 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/external/SIFTExtractor.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images.external 2 | 3 | import breeze.linalg._ 4 | import keystoneml.nodes.images.SIFTExtractorInterface 5 | import org.apache.spark.rdd.RDD 6 | import keystoneml.utils.Image 7 | import keystoneml.utils.external.VLFeat 8 | 9 | /** 10 | * Extracts SIFT Descriptors at dense intervals at multiple scales using the vlfeat C library. 11 | * 12 | * @param stepSize Spacing between each sampled descriptor. 13 | * @param binSize Size of histogram bins for SIFT. 14 | * @param scales Number of scales at which to extract. 15 | */ 16 | class SIFTExtractor(val stepSize: Int = 3, val binSize: Int = 4, val scales: Int = 4, val scaleStep: Int = 1) 17 | extends SIFTExtractorInterface { 18 | @transient lazy val extLib = new VLFeat() 19 | 20 | val descriptorSize = 128 21 | 22 | /** 23 | * Extract SIFTs from an image. 24 | * @param in The input to pass into this pipeline node 25 | * @return The output for the given input 26 | */ 27 | def apply(in: Image): DenseMatrix[Float] = { 28 | val rawDescDataShort = extLib.getSIFTs(in.metadata.xDim, in.metadata.yDim, 29 | stepSize, binSize, scales, scaleStep, in.getSingleChannelAsFloatArray()) 30 | val numCols = rawDescDataShort.length/descriptorSize 31 | val rawDescData = rawDescDataShort.map(s => s.toFloat) 32 | new DenseMatrix(descriptorSize, numCols, rawDescData) 33 | } 34 | } 35 | 36 | object SIFTExtractor { 37 | def apply(stepSize: Int = 3, binSize: Int = 4, scales: Int = 4, scaleStep: Int = 1) = { 38 | new SIFTExtractor(stepSize, binSize, scales, scaleStep) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/CostModel.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | /** 4 | * A trait that represents a known system performance cost model for a solver. 
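 * Implementations estimate the cost of fitting an n x d problem with k outputs on
 * numMachines workers, combining CPU, memory, and network terms via the supplied weights.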
5 | */ 6 | trait CostModel { 7 | def cost( 8 | n: Long, 9 | d: Int, 10 | k: Int, 11 | sparsity: Double, 12 | numMachines: Int, 13 | cpuWeight: Double, 14 | memWeight: Double, 15 | networkWeight: Double) 16 | : Double 17 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/DistributedPCA.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import breeze.stats._ 6 | import com.github.fommil.netlib.LAPACK.{getInstance => lapack} 7 | import org.apache.spark.rdd.RDD 8 | import org.netlib.util.intW 9 | import keystoneml.pipelines._ 10 | import keystoneml.utils.MatrixUtils 11 | import keystoneml.workflow.{Transformer, Estimator} 12 | 13 | import edu.berkeley.cs.amplab.mlmatrix.{RowPartition, NormalEquations, RowPartitionedMatrix, TSQR} 14 | 15 | /** 16 | * Estimates a PCA model for dimensionality reduction using a distributedQR. 17 | * 18 | * @param dims Dimensions to reduce input dataset to. 19 | */ 20 | class DistributedPCAEstimator(dims: Int) extends Estimator[DenseVector[Float], DenseVector[Float]] 21 | with CostModel with Logging { 22 | 23 | /** 24 | * Adapted from the "PCA2" matlab code given in appendix B of this paper: 25 | * https://www.cs.princeton.edu/picasso/mats/PCA-Tutorial-Intuition_jp.pdf 26 | * 27 | * @param samples Features to be reduced. Logically row-major. 28 | * @return A PCA model which will perform dimensionality reduction when applied to data. 29 | */ 30 | def fit(samples: RDD[DenseVector[Float]]): PCATransformer = { 31 | new PCATransformer(computePCA(samples, dims)) 32 | } 33 | 34 | def computePCA(dataMat: RDD[DenseVector[Float]], dims: Int): DenseMatrix[Float] = { 35 | 36 | val mat = new RowPartitionedMatrix(dataMat.mapPartitions { part => 37 | val dblIter = part.map(x => convert(x, Double)) 38 | MatrixUtils.rowsToMatrixIter(dblIter).map(RowPartition(_)) 39 | }) 40 | val means = DenseVector(mat.colSums():_*) :/ mat.numRows().toDouble 41 | 42 | val meansBC = dataMat.context.broadcast(means) 43 | val zeroMeanMat = new RowPartitionedMatrix(mat.rdd.map { part => 44 | RowPartition(part.mat(*, ::) - meansBC.value) 45 | }) 46 | 47 | val rPart = new TSQR().qrR(zeroMeanMat) 48 | 49 | val svd.SVD(u, s, pcaT) = svd(rPart) 50 | 51 | val pca = convert(pcaT.t, Float) 52 | 53 | val matlabConventionPCA = PCAEstimator.enforceMatlabPCASignConvention(pca) 54 | 55 | // Return a subset of the columns. 
56 | matlabConventionPCA(::, 0 until dims) 57 | } 58 | 59 | override def cost( 60 | n: Long, 61 | d: Int, 62 | k: Int, 63 | sparsity: Double, 64 | numMachines: Int, 65 | cpuWeight: Double, 66 | memWeight: Double, 67 | networkWeight: Double): Double = { 68 | val log2NumMachines = math.log(numMachines.toDouble) / math.log(2.0) 69 | val flops = n.toDouble * d * d / numMachines + d.toDouble * d * d * log2NumMachines 70 | val bytesScanned = n.toDouble * d 71 | val network = d.toDouble * d * log2NumMachines 72 | math.max(cpuWeight * flops, memWeight * bytesScanned) + networkWeight * network 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/KernelMatrix.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import scala.collection.mutable.HashMap 4 | import scala.reflect.ClassTag 5 | 6 | import breeze.linalg._ 7 | 8 | import org.apache.spark.rdd.RDD 9 | 10 | import keystoneml.utils.{MatrixUtils, Stats} 11 | import keystoneml.workflow.{Transformer, LabelEstimator} 12 | 13 | /** 14 | * Defines a wrapper to access elements of a symmetric distributed 15 | * matrix that is generated using a kernel function. 16 | */ 17 | trait KernelMatrix { 18 | 19 | /** 20 | * Extract specified columns from the kernel matrix. 21 | * NOTE: This returns a *cached* RDD and unpersist should 22 | * be called at the end of a block. 23 | * 24 | * @param colIdxs the column indexes to extract 25 | * @return A sub-matrix of size n x idxs.size as an RDD. 26 | */ 27 | def apply(colIdxs: Seq[Int]): RDD[DenseMatrix[Double]] 28 | 29 | /** 30 | * Extract a diagonal block from the kernel matrix. 31 | * 32 | * @param idxs the column, row indexes to extract 33 | * @return A local matrix of size idxs.size x idxs.size 34 | */ 35 | def diagBlock(idxs: Seq[Int]): DenseMatrix[Double] 36 | 37 | /** 38 | * Clean up resources associated with a kernel block. 39 | * 40 | * @param colIdxs column indexes corresponding to the block. 41 | */ 42 | def unpersist(colIdxs: Seq[Int]): Unit 43 | } 44 | 45 | /** 46 | * Column-wise block implementation of a kernel matrix. 
47 | * This class uses a kernel transformer to lazily populate the column blocks 48 | * and caches them optionally 49 | */ 50 | class BlockKernelMatrix[T: ClassTag]( 51 | val kernelGen: KernelTransformer[T], 52 | val data: RDD[T], 53 | val cacheKernel: Boolean) 54 | extends KernelMatrix { 55 | 56 | val colBlockCache = HashMap.empty[Seq[Int], RDD[DenseMatrix[Double]]] 57 | val diagBlockCache = HashMap.empty[Seq[Int], DenseMatrix[Double]] 58 | 59 | def apply(colIdxs: Seq[Int]): RDD[DenseMatrix[Double]] = { 60 | if (colBlockCache.contains(colIdxs)) { 61 | colBlockCache(colIdxs) 62 | } else { 63 | val (kBlock, diagBlock) = kernelGen.computeKernel(data, colIdxs) 64 | if (cacheKernel) { 65 | colBlockCache += (colIdxs -> kBlock) 66 | diagBlockCache += (colIdxs -> diagBlock) 67 | } 68 | kBlock 69 | } 70 | } 71 | 72 | def unpersist(colIdxs: Seq[Int]): Unit = { 73 | if (colBlockCache.contains(colIdxs) && !cacheKernel) { 74 | colBlockCache(colIdxs).unpersist(true) 75 | } 76 | } 77 | 78 | def diagBlock(idxs: Seq[Int]): DenseMatrix[Double] = { 79 | if (!diagBlockCache.contains(idxs)) { 80 | val (kBlock, diagBlock) = kernelGen.computeKernel(data, idxs) 81 | if (cacheKernel) { 82 | colBlockCache += (idxs -> kBlock) 83 | diagBlockCache += (idxs -> diagBlock) 84 | } 85 | diagBlock 86 | } else { 87 | diagBlockCache(idxs) 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/LocalLeastSquaresEstimator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import breeze.stats._ 5 | import keystoneml.nodes.stats.StandardScalerModel 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.utils.MatrixUtils 8 | import keystoneml.workflow.LabelEstimator 9 | 10 | /** 11 | * Learns a linear model (OLS) based on training features and training labels. 12 | * Works well when the number of features >> number of examples, and the data fits locally. 13 | * 14 | * @param lambda regularization parameter 15 | */ 16 | class LocalLeastSquaresEstimator(lambda: Double) 17 | extends LabelEstimator[DenseVector[Double], DenseVector[Double], DenseVector[Double]] { 18 | 19 | override def fit( 20 | trainingFeatures: RDD[DenseVector[Double]], 21 | trainingLabels: RDD[DenseVector[Double]]): LinearMapper[DenseVector[Double]] = { 22 | LocalLeastSquaresEstimator.trainWithL2(trainingFeatures, trainingLabels, lambda) 23 | } 24 | } 25 | 26 | object LocalLeastSquaresEstimator { 27 | /** 28 | * Learns a linear model (OLS) based on training features and training labels. 29 | * Works well when the number of features >> number of examples. 30 | * 31 | * @param trainingFeatures Training features. 32 | * @param trainingLabels Training labels. 
33 | * @return A LinearMapper encapsulating the learned linear model. 34 | */ 35 | def trainWithL2( 36 | trainingFeatures: RDD[DenseVector[Double]], 37 | trainingLabels: RDD[DenseVector[Double]], 38 | lambda: Double): LinearMapper[DenseVector[Double]] = { 39 | 40 | val A_parts = trainingFeatures.mapPartitions { x => 41 | MatrixUtils.rowsToMatrixIter(x) 42 | }.collect() 43 | val b_parts = trainingLabels.mapPartitions { x => 44 | MatrixUtils.rowsToMatrixIter(x) 45 | }.collect() 46 | 47 | val A_local = DenseMatrix.vertcat(A_parts:_*) 48 | val b_local = DenseMatrix.vertcat(b_parts:_*) 49 | 50 | val featuresMean = mean(A_local(::, *)).t 51 | val labelsMean = mean(b_local(::, *)).t 52 | 53 | val A_zm = A_local(*, ::) - featuresMean 54 | val b_zm = b_local(*, ::) - labelsMean 55 | 56 | val AAt = A_zm * A_zm.t 57 | val model = A_zm.t * ( (AAt + (DenseMatrix.eye[Double](AAt.rows) :* lambda)) \ b_zm ) 58 | LinearMapper(model, Some(labelsMean), Some(new StandardScalerModel(featuresMean, None))) 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/NaiveBayesModel.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg.{DenseMatrix, DenseVector, Vector} 4 | import org.apache.spark.mllib.classification.NaiveBayes 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.utils.MLlibUtils.breezeVectorToMLlib 8 | import keystoneml.workflow.{Transformer, LabelEstimator} 9 | 10 | import scala.reflect.ClassTag 11 | 12 | /** 13 | * A Multinomial Naive Bayes model that transforms feature vectors to vectors containing 14 | * the log posterior probabilities of the different classes 15 | * 16 | * @param labels list of class labels, ranging from 0 to (C - 1) inclusive 17 | * @param pi log of class priors, whose dimension is C, number of labels 18 | * @param theta log of class conditional probabilities, whose dimension is C-by-D, 19 | * where D is number of features 20 | */ 21 | class NaiveBayesModel[T <: Vector[Double]]( 22 | val labels: Array[Int], 23 | val pi: Array[Double], 24 | val theta: Array[Array[Double]]) extends Transformer[T, DenseVector[Double]] { 25 | 26 | private val brzPi = new DenseVector[Double](pi.length) 27 | private val brzTheta = new DenseMatrix[Double](theta.length, theta(0).length) 28 | 29 | { 30 | // Need to put an extra pair of braces to prevent Scala treating `i` as a member. 31 | var i = 0 32 | while (i < theta.length) { 33 | brzPi(labels(i)) = pi(i) 34 | var j = 0 35 | while (j < theta(i).length) { 36 | brzTheta(labels(i), j) = theta(i)(j) 37 | j += 1 38 | } 39 | i += 1 40 | } 41 | } 42 | 43 | /** 44 | * Transforms a feature vector to a vector containing the log(posterior probabilities) of the different classes 45 | * according to this naive bayes model. 46 | 47 | * @param in The input feature vector 48 | * @return Log-posterior probabilities of the classes for the input features 49 | */ 50 | override def apply(in: T): DenseVector[Double] = { 51 | brzPi + brzTheta * in 52 | } 53 | } 54 | 55 | /** 56 | * A LabelEstimator which learns a multinomial naive bayes model from training data. 57 | * Outputs a Transformer that maps features to vectors containing the log-posterior-probabilities 58 | * of the various classes according to the learned model.
59 | * 60 | * @param lambda The lambda parameter to use for the naive bayes model 61 | */ 62 | case class NaiveBayesEstimator[T <: Vector[Double] : ClassTag](numClasses: Int, lambda: Double = 1.0) 63 | extends LabelEstimator[T, DenseVector[Double], Int] { 64 | override def fit(in: RDD[T], labels: RDD[Int]): NaiveBayesModel[T] = { 65 | val labeledPoints = labels.zip(in).map(x => LabeledPoint(x._1, breezeVectorToMLlib(x._2))) 66 | val model = NaiveBayes.train(labeledPoints, lambda) 67 | 68 | new NaiveBayesModel(model.labels.map(_.toInt), model.pi, model.theta) 69 | } 70 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/SparseLinearMapper.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * Computes A * x + b i.e. a linear map of data using a trained model. 9 | * 10 | * @param x trained model 11 | * @param bOpt optional intercept to add 12 | */ 13 | case class SparseLinearMapper( 14 | x: DenseMatrix[Double], 15 | bOpt: Option[DenseVector[Double]] = None) 16 | extends Transformer[SparseVector[Double], DenseVector[Double]] { 17 | 18 | /** 19 | * Apply a linear model to an input. 20 | * @param in Input. 21 | * @return Output. 22 | */ 23 | def apply(in: SparseVector[Double]): DenseVector[Double] = { 24 | val out = x.t * in 25 | bOpt.foreach { b => 26 | out :+= b 27 | } 28 | 29 | out 30 | } 31 | 32 | /** 33 | * Apply a linear model to a collection of inputs. 34 | * 35 | * @param in Collection of A's. 36 | * @return Collection of B's. 37 | */ 38 | override def apply(in: RDD[SparseVector[Double]]): RDD[DenseVector[Double]] = { 39 | val modelBroadcast = in.context.broadcast(x) 40 | val bBroadcast = in.context.broadcast(bOpt) 41 | in.map(row => { 42 | val out = modelBroadcast.value.t * row 43 | bBroadcast.value.foreach { b => 44 | out :+= b 45 | } 46 | 47 | out 48 | }) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/ZCAWhitener.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import breeze.stats._ 6 | import com.github.fommil.netlib.LAPACK.{getInstance => lapack} 7 | import org.apache.spark.rdd.RDD 8 | import org.netlib.util.intW 9 | import keystoneml.pipelines._ 10 | import keystoneml.workflow.{Transformer, Estimator} 11 | 12 | class ZCAWhitener(val whitener: DenseMatrix[Double], val means: DenseVector[Double]) 13 | extends Transformer[DenseMatrix[Double],DenseMatrix[Double]] { 14 | 15 | def apply(in: DenseMatrix[Double]): DenseMatrix[Double] = { 16 | (in(*, ::) - means) * whitener 17 | } 18 | } 19 | 20 | /** 21 | * Computes a ZCA Whitener, which is intended to rotate an input dataset to identity covariance. 22 | * The "Z" in ZCA Whitening means that the solution will be as close to the original dataset as possible while having 23 | * this identity covariance property. 
24 | * 25 | * See here for more details: 26 | * http://ufldl.stanford.edu/wiki/index.php/Whitening 27 | * 28 | * @param eps Regularization Parameter 29 | */ 30 | class ZCAWhitenerEstimator(val eps: Double = 0.1) 31 | extends Estimator[DenseMatrix[Double],DenseMatrix[Double]] { 32 | 33 | def fit(in: RDD[DenseMatrix[Double]]): ZCAWhitener = { 34 | fitSingle(in.first) 35 | } 36 | 37 | def fitSingle(in: DenseMatrix[Double]): ZCAWhitener = { 38 | val means = (mean(in(::, *))).t 39 | 40 | val whitener: DenseMatrix[Double] = { 41 | val inc = convert(in(*, ::) - means, Float) 42 | val rows = inc.rows 43 | val cols = inc.cols 44 | 45 | val s1 = DenseVector.zeros[Float](math.min(rows, cols)) 46 | val v1 = DenseMatrix.zeros[Float](inc.cols, inc.cols) 47 | 48 | // Get optimal workspace size 49 | // we do this by sending -1 as lwork to the lapack function 50 | val scratch, work = new Array[Float](1) 51 | val info = new intW(0) 52 | 53 | lapack.sgesvd("N", "A", rows, cols, scratch, rows, scratch, null, 1, scratch, cols, work, -1, info) 54 | 55 | val lwork1 = work(0).toInt 56 | val workspace = new Array[Float](lwork1) 57 | 58 | // Perform the SVD with sgesvd 59 | lapack.sgesvd("N", "A", rows, cols, inc.copy.data, rows, s1.data, null, 1, v1.data, cols, workspace, workspace.length, info) 60 | 61 | val s2 = pow(s1, 2.0f) / (rows - 1.0f) 62 | 63 | val sn1 = diag((s2 + eps.toFloat) :^ -0.5f) 64 | 65 | // NOTE: sgesvd returns singular values in the opposite order (when compared to eigenvalues) 66 | // Thus we need v.t * s * v here ? 67 | val svdMat = v1.t * sn1 * v1 68 | 69 | convert(svdMat, Double) 70 | } 71 | 72 | new ZCAWhitener(whitener, means) 73 | 74 | } 75 | } 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/external/GaussianMixtureModelEstimator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning.external 2 | 3 | import breeze.linalg.{convert, DenseMatrix, DenseVector} 4 | import keystoneml.nodes.learning.GaussianMixtureModel 5 | import org.apache.spark.rdd.RDD 6 | import keystoneml.utils.external.EncEval 7 | import keystoneml.workflow.Estimator 8 | 9 | /** 10 | * Fit a Gaussian Mixture model to Data. 11 | * 12 | * @param k Number of centers to estimate. 13 | */ 14 | class GaussianMixtureModelEstimator(k: Int) extends Estimator[DenseVector[Double], DenseVector[Double]] { 15 | 16 | /** 17 | * Currently this model works on items that fit in local memory. 18 | * @param samples 19 | * @return A PipelineNode (Transformer) which can be called on new data. 20 | */ 21 | def fit(samples: RDD[DenseVector[Double]]): GaussianMixtureModel = { 22 | fit(samples.collect) 23 | } 24 | 25 | /** 26 | * Fit a Gaussian mixture model with `k` centers to a sample array. 27 | * 28 | * @param samples Sample Array - all elements must be the same size. 29 | * @return A Gaussian Mixture Model. 30 | */ 31 | def fit(samples: Array[DenseVector[Double]]): GaussianMixtureModel = { 32 | val extLib = new EncEval 33 | val nDim = samples(0).length 34 | 35 | // Flatten this thing out. 36 | val sampleFloats = samples.map(_.toArray.map(_.toFloat)) 37 | val res = extLib.computeGMM(k, nDim, sampleFloats.flatten) 38 | 39 | val meanSize = k*nDim 40 | val varSize = k*nDim 41 | val coefSize = k*nDim 42 | 43 | // Each array region is expected to be centroid-major. 
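    // i.e. res is laid out as [means | variances | weights]; each slice below is reshaped
    // into an nDim x k matrix (the mixing weights into a vector) and converted to Double.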
44 | val means = convert(new DenseMatrix(nDim, k, res.slice(0, meanSize)), Double) 45 | val vars = convert(new DenseMatrix(nDim, k, res.slice(meanSize, meanSize+varSize)), Double) 46 | val coefs = convert(new DenseVector(res.slice(meanSize+varSize, meanSize+varSize+coefSize)), Double) 47 | 48 | new GaussianMixtureModel(means, vars, coefs) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/nlp/CoreNLPFeatureExtractor.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import edu.arizona.sista.processors.Processor 4 | import edu.arizona.sista.processors.fastnlp.FastNLPProcessor 5 | import org.apache.spark.rdd.RDD 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Transformer that uses CoreNLP to (in order): 10 | * - Tokenize document 11 | * - Lemmatize tokens 12 | * - Replace entities w/ their type (e.g. "Jon" => "NAME", "Paris" => "PLACE") 13 | * - Return n-grams for the above (respecting sentence boundaries) 14 | * Note: Much slower than just using [[Tokenizer]] followed by [[NGramsFeaturizer]] 15 | * 16 | * @param orders The size of the n-grams to output 17 | */ 18 | case class CoreNLPFeatureExtractor(orders: Seq[Int]) extends Transformer[String, Seq[String]] { 19 | @transient lazy val proc = new FastNLPProcessor() 20 | 21 | override def apply(in: String): Seq[String] = { 22 | val doc = proc.mkDocument(in) 23 | proc.tagPartsOfSpeech(doc) 24 | proc.lemmatize(doc) 25 | proc.recognizeNamedEntities(doc) 26 | doc.clear() 27 | val out = doc.sentences.map(s => { 28 | val out = new Array[String](s.words.length) 29 | for (i <- 0 to s.words.length - 1) { 30 | out(i) = if (s.entities.get(i) != "O") s.entities.get(i) else normalize(s.lemmas.get(i)) 31 | } 32 | out 33 | }) 34 | orders.map(n => { 35 | out.map(s => { 36 | s.sliding(n).map(gram => gram.mkString(" ")).toList 37 | }).flatMap(identity).toList 38 | }).flatMap(identity).toList 39 | } 40 | 41 | def normalize(s : String): String = { 42 | val pattern = "[^a-zA-Z0-9\\s+]" 43 | pattern.r.replaceAllIn(s,pattern=>"").toLowerCase 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/nlp/HashingTF.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import breeze.linalg.SparseVector 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Converts a sequence of terms to a sparse vector representing their frequencies, 8 | * using the hashing trick: https://en.wikipedia.org/wiki/Feature_hashing 9 | * 10 | * Terms are hashed using Scala's `.##` method. We may want to convert to MurmurHash3 for strings, 11 | * as discussed for Spark's ML Pipelines in https://issues.apache.org/jira/browse/SPARK-10574 12 | * 13 | * @param numFeatures The desired feature space to convert to using the hashing trick. 
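 *
 * A minimal usage sketch (the feature-space size here is illustrative):
 * {{{
 *   val tf = HashingTF[Seq[String]](numFeatures = 1 << 18)
 *   val termCounts = tf(Seq("the", "quick", "brown", "fox"))
 * }}}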
14 | */ 15 | case class HashingTF[T <: Seq[Any]](numFeatures: Int) extends Transformer[T, SparseVector[Double]] { 16 | def nonNegativeMod(x: Int, mod: Int): Int = { 17 | val rawMod = x % mod 18 | rawMod + (if (rawMod < 0) mod else 0) 19 | } 20 | 21 | def apply(document: T): SparseVector[Double] = { 22 | val termFrequencies = scala.collection.mutable.HashMap.empty[Int, Double] 23 | document.foreach { term => 24 | val i = nonNegativeMod(term.##, numFeatures) 25 | termFrequencies.put(i, termFrequencies.getOrElse(i, 0.0) + 1.0) 26 | } 27 | 28 | SparseVector(numFeatures)(termFrequencies.toSeq:_*) 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/nlp/StringUtils.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import java.util.Locale 4 | 5 | import org.apache.spark.rdd.RDD 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Transformer that tokenizes a String into a Seq[String] by splitting on a regular expression. 10 | * @param sep the delimiting regular expression to split on. 11 | * Defaults to matching all punctuation and whitespace 12 | */ 13 | case class Tokenizer(sep: String = "[\\p{Punct}\\s]+") extends Transformer[String, Seq[String]] { 14 | override def apply(in: String): Seq[String] = in.split(sep) 15 | } 16 | 17 | /** 18 | * Transformer that trims a String of leading and trailing whitespace 19 | */ 20 | object Trim extends Transformer[String, String] { 21 | override def apply(in: String): String = in.trim 22 | } 23 | 24 | /** 25 | * Transformer that converts a String to lower case 26 | * @param locale The locale to use. Defaults to `Locale.getDefault` 27 | */ 28 | case class LowerCase(locale: Locale = Locale.getDefault) extends Transformer[String, String] { 29 | override def apply(in: String): String = in.toLowerCase(locale) 30 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/nlp/WordFrequencyEncoder.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.workflow.{Estimator, Transformer} 6 | 7 | object WordFrequencyEncoder extends Estimator[Seq[String], Seq[Int]] { 8 | private[this] def makeUnigrams(data: RDD[Seq[String]]) = 9 | NGramsCounts[String]().apply(NGramsFeaturizer[String](1 to 1).apply(data)) 10 | 11 | // TODO: alternative approach: collectAsMap once, let driver do the work. 12 | def fit(data: RDD[Seq[String]]): WordFrequencyTransformer = { 13 | val unigramCounts = makeUnigrams(data) 14 | 15 | val wordIndex = unigramCounts 16 | .zipWithIndex() // indexes respect the sorted order 17 | .map { case ((unigram, count), index) => 18 | // valid if # of word types in training data is less than Int.MaxValue 19 | (unigram.words(0), index.asInstanceOf[Int]) 20 | }.collectAsMap() 21 | 22 | val wordIndexBroadcast = unigramCounts.sparkContext.broadcast(wordIndex) 23 | 24 | val unigrams = unigramCounts.map { case (unigram, count) => 25 | (wordIndexBroadcast.value(unigram.words(0)), count) 26 | }.collectAsMap() 27 | 28 | new WordFrequencyTransformer(wordIndexBroadcast, unigrams) 29 | } 30 | 31 | } 32 | 33 | /** 34 | * Encodes string tokens as non-negative integers, which are indices of the 35 | * tokens' positions in the sorted-by-frequency order. 
Out-of-vocabulary words 36 | * are mapped to the special index -1. 37 | * 38 | * The parameters passed to this class are usually calculated by [[WordFrequencyEncoder]]. 39 | * 40 | * @param wordIndexBroadcast A mapping from token string to its frequency-ordered index 41 | * @param unigramCounts the counts of unigrams in the training corpus 42 | */ 43 | class WordFrequencyTransformer( 44 | wordIndexBroadcast: Broadcast[scala.collection.Map[String, Int]], 45 | val unigramCounts: scala.collection.Map[Int, Int]) 46 | extends Transformer[Seq[String], Seq[Int]] { 47 | 48 | final val OOV_INDEX = -1 49 | 50 | override def apply(in: RDD[Seq[String]]): RDD[Seq[Int]] = { 51 | in.mapPartitions { case part => 52 | val index = wordIndexBroadcast.value 53 | part.map(ngram => ngram.map(index.getOrElse(_, OOV_INDEX))) 54 | } 55 | } 56 | 57 | def apply(words: Seq[String]): Seq[Int] = { 58 | val index = wordIndexBroadcast.value 59 | words.map(index.getOrElse(_, OOV_INDEX)) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/CosineRandomFeatures.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import breeze.stats.distributions.Rand 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.pipelines._ 8 | import keystoneml.utils.MatrixUtils 9 | import keystoneml.workflow.Transformer 10 | 11 | /** 12 | * Transformer that extracts random cosine features from a feature vector 13 | * @param W A matrix of dimension (# output features) by (# input features) 14 | * @param b a dense vector of dimension (# output features) 15 | * 16 | * Transformer maps vector x to cos(x * transpose(W) + b). 
17 | * Kernel trick to allow Linear Solver to learn cosine interaction terms of the input 18 | */ 19 | class CosineRandomFeatures( 20 | @transient val W: DenseMatrix[Double], // should be numOutputFeatures by numInputFeatures 21 | @transient val b: DenseVector[Double]) // should be numOutputFeatures by 1 22 | extends Transformer[DenseVector[Double], DenseVector[Double]] { 23 | 24 | require(b.length == W.rows, "# of rows in W and size of b should match") 25 | override def apply(in: RDD[DenseVector[Double]]): RDD[DenseVector[Double]] = { 26 | val wBroadcast = in.sparkContext.broadcast(W) 27 | val bBroadcast = in.sparkContext.broadcast(b) 28 | in.mapPartitions { part => 29 | MatrixUtils.rowsToMatrixIter(part).flatMap { data => 30 | val features: DenseMatrix[Double] = data * wBroadcast.value.t 31 | features(*,::) :+= bBroadcast.value 32 | cos.inPlace(features) 33 | MatrixUtils.matrixToRowArray(features).iterator 34 | } 35 | } 36 | } 37 | 38 | override def apply(in: DenseVector[Double]): DenseVector[Double] = { 39 | val features = (in.t * W.t).t 40 | features :+= b 41 | cos.inPlace(features) 42 | features 43 | } 44 | } 45 | 46 | /** 47 | * Companion Object to generate random cosine features from various distributions 48 | */ 49 | object CosineRandomFeatures { 50 | /** Generate Random Cosine Features from the given distributions **/ 51 | def apply( 52 | numInputFeatures: Int, 53 | numOutputFeatures: Int, 54 | gamma: Double, 55 | wDist: Rand[Double] = Rand.gaussian, 56 | bDist: Rand[Double] = Rand.uniform) = { 57 | val W = DenseMatrix.rand(numOutputFeatures, numInputFeatures, wDist) :* gamma 58 | val b = DenseVector.rand(numOutputFeatures, bDist) :* (2*math.Pi) 59 | new CosineRandomFeatures(W, b) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/LinearRectifier.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseVector 4 | import keystoneml.pipelines._ 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * This transformer applies a Linear Rectifier, 9 | * an activation function defined as: 10 | * f(x) = max({@param maxVal}, x - {@param alpha}) 11 | */ 12 | case class LinearRectifier(maxVal: Double = 0.0, alpha: Double = 0.0) 13 | extends Transformer[DenseVector[Double], DenseVector[Double]] { 14 | def apply(in: DenseVector[Double]): DenseVector[Double] = { 15 | in.map(e => math.max(maxVal, e - alpha)) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/NormalizeRows.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.{max, sum, DenseVector} 4 | import breeze.numerics._ 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * Divides each row by the max of its two-norm and 2.2e-16. 
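 * The 2.2e-16 floor (roughly double-precision machine epsilon) avoids division by zero
 * for all-zero rows.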
9 | */ 10 | object NormalizeRows extends Transformer[DenseVector[Double], DenseVector[Double]] { 11 | def apply(in: DenseVector[Double]): DenseVector[Double] = { 12 | val norm = max(sqrt(sum(pow(in, 2.0))), 2.2e-16) 13 | in / norm 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/PaddedFFT.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseVector 4 | import breeze.math.Complex 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * This transformer pads input vectors to the nearest power of two, 9 | * then returns the real values of the first half of the fourier transform on the padded vectors. 10 | * 11 | * Goes from vectors of size n to vectors of size nextPositivePowerOfTwo(n)/2 12 | */ 13 | case class PaddedFFT() extends Transformer[DenseVector[Double], DenseVector[Double]] { 14 | override def apply(in: DenseVector[Double]): DenseVector[Double] = { 15 | val paddedSize = nextPositivePowerOfTwo(in.length) 16 | val fft: DenseVector[Complex] = breeze.signal.fourierTr(in.padTo(paddedSize, 0.0).toDenseVector) 17 | fft(0 until (paddedSize / 2)).map(_.real) 18 | } 19 | 20 | def nextPositivePowerOfTwo(i : Int) = 1 << (32 - Integer.numberOfLeadingZeros(i - 1)) 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/RandomSignNode.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import breeze.stats.distributions._ 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * A node that takes in DenseVector[Double] and randomly flips 9 | * the sign of some of the elements 10 | */ 11 | case class RandomSignNode(signs: DenseVector[Double]) 12 | extends Transformer[DenseVector[Double], DenseVector[Double]] { 13 | 14 | def apply(in: DenseVector[Double]): DenseVector[Double] = in :* signs 15 | 16 | } 17 | 18 | object RandomSignNode { 19 | /* Create a random sign node */ 20 | def apply(size: Int, rand: RandBasis = Rand): RandomSignNode = { 21 | val signs = 2.0*convert(DenseVector.rand(size, Binomial(1, 0.5)(rand)), Double) - 1.0 22 | new RandomSignNode(signs) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/Sampling.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.{DenseVector, DenseMatrix} 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines.FunctionNode 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Given a collection of Dense Matrices, this will generate a sample of 10 | * @param numSamplesPerMatrix columns from each matrix. 11 | */ 12 | case class ColumnSampler(numSamplesPerMatrix: Int) 13 | extends Transformer[DenseMatrix[Float], DenseMatrix[Float]] { 14 | 15 | def apply(in: DenseMatrix[Float]): DenseMatrix[Float] = { 16 | val cols = Seq.fill(numSamplesPerMatrix) { 17 | scala.util.Random.nextInt(in.cols) 18 | } 19 | in(::, cols).toDenseMatrix 20 | } 21 | } 22 | 23 | 24 | /** 25 | * Takes a sample of an input RDD of size size. 26 | * @param size Number of elements to return. 
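 * @param seed Seed for the random sample; defaults to 42.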
27 | */ 28 | class Sampler[T](val size: Int, val seed: Int = 42) extends FunctionNode[RDD[T], Array[T]] { 29 | def apply(in: RDD[T]): Array[T] = { 30 | in.takeSample(false, size, seed) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/SignedHellingerMapper.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.{DenseVector, DenseMatrix} 4 | import breeze.numerics._ 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * Apply power normalization: z <- sign(z)|z|^{\rho} 9 | * with \rho = \frac{1}{2} 10 | * This a "signed square root" 11 | */ 12 | object SignedHellingerMapper extends Transformer[DenseVector[Double], DenseVector[Double]] { 13 | def apply(in: DenseVector[Double]): DenseVector[Double] = { 14 | signum(in) :* sqrt(abs(in)) 15 | } 16 | } 17 | 18 | object BatchSignedHellingerMapper extends Transformer[DenseMatrix[Float], DenseMatrix[Float]] { 19 | def apply(in: DenseMatrix[Float]): DenseMatrix[Float] = { 20 | in.map(x => (math.signum(x) * math.sqrt(math.abs(x))).toFloat) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/StandardScaler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseVector 4 | import breeze.numerics.sqrt 5 | import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.utils.MLlibUtils 8 | import keystoneml.workflow.{Transformer, Estimator} 9 | 10 | /** 11 | * Represents a StandardScaler model that can transform dense vectors. 12 | * 13 | * @param mean column mean values 14 | * @param std column standard deviation values 15 | */ 16 | class StandardScalerModel(val mean: DenseVector[Double], val std: Option[DenseVector[Double]] = None) 17 | extends Transformer[DenseVector[Double], DenseVector[Double]] { 18 | /** 19 | * Applies standardization transformation on a vector. 20 | * 21 | * @param in Vector to be standardized. 22 | * @return Standardized vector. If the std of a column is zero, it will return default `0.0` 23 | * for the column with zero std. 24 | */ 25 | override def apply(in: DenseVector[Double]): DenseVector[Double] = { 26 | val out = in - mean 27 | std.foreach(x => { 28 | out :/= x 29 | }) 30 | out 31 | } 32 | } 33 | 34 | /** 35 | * Standardizes features by removing the mean and scaling to unit std using column summary 36 | * statistics on the samples in the training set. 37 | */ 38 | class StandardScaler(normalizeStdDev: Boolean = true, eps: Double = 1E-12) extends Estimator[DenseVector[Double], DenseVector[Double]]{ 39 | /** 40 | * Computes the mean and variance and stores as a model to be used for later scaling. 41 | * 42 | * @param data The data used to compute the mean and variance to build the transformation model. 
43 | * @return a StandardScalarModel 44 | */ 45 | override def fit(data: RDD[DenseVector[Double]]): StandardScalerModel = { 46 | val summary = data.treeAggregate(new MultivariateOnlineSummarizer)( 47 | (aggregator, data) => aggregator.add(MLlibUtils.breezeVectorToMLlib(data)), 48 | (aggregator1, aggregator2) => aggregator1.merge(aggregator2)) 49 | if (normalizeStdDev) { 50 | new StandardScalerModel( 51 | MLlibUtils.mllibVectorToDenseBreeze(summary.mean), 52 | Some(sqrt(MLlibUtils.mllibVectorToDenseBreeze(summary.variance)) 53 | .map(r => if (r.isNaN | r.isInfinite | math.abs(r) < eps) 1.0 else r))) 54 | } else { 55 | new StandardScalerModel( 56 | MLlibUtils.mllibVectorToDenseBreeze(summary.mean), 57 | None) 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/TermFrequency.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import keystoneml.workflow.Transformer 4 | 5 | /** 6 | * Transformer that maps a Seq[Any] of objects to a Seq[(Any, Double)] of (unique object, weighting_scheme(tf)), 7 | * where tf is the number of times the unique object appeared in the original Seq[Any], 8 | * and the weighting_scheme is a lambda of Double => Double that defaults to the identity function. 9 | * 10 | * As an example, the following would return a transformer that maps a Seq[Any] 11 | * to all objects seen with the log of their count plus 1: 12 | * {{{ 13 | * TermFrequency(x => math.log(x) + 1) 14 | * }}} 15 | * 16 | * @param fun the weighting scheme to apply to the frequencies (defaults to identity) 17 | */ 18 | case class TermFrequency[T](fun: Double => Double = identity) extends Transformer[Seq[T], Seq[(T, Double)]] { 19 | override def apply(in: Seq[T]): Seq[(T, Double)] = in.groupBy(identity).mapValues(x => fun(x.size)).toSeq 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/AllSparseFeatures.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.SparseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.workflow.Estimator 6 | 7 | import scala.reflect.ClassTag 8 | 9 | /** 10 | * An Estimator that chooses all sparse features observed when training, 11 | * and produces a transformer which builds a sparse vector out of them. 
12 | * 13 | * Deterministically orders the feature mappings by earliest appearance in the RDD 14 | */ 15 | case class AllSparseFeatures[T: ClassTag]() extends Estimator[Seq[(T, Double)], SparseVector[Double]] { 16 | override def fit(data: RDD[Seq[(T, Double)]]): SparseFeatureVectorizer[T] = { 17 | val featureOccurrences = data.flatMap(_.map(_._1)) 18 | // zip with unique ids and take the smallest unique id for a given feature to get 19 | // a deterministic ordering 20 | val featuresWithUniqueId = featureOccurrences.zipWithUniqueId().reduceByKey { 21 | (x, y) => Math.min(x, y) 22 | } 23 | val featureSpace = featuresWithUniqueId.sortBy(_._2).map(_._1) 24 | .collect().zipWithIndex.toMap 25 | new SparseFeatureVectorizer(featureSpace) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Cacher.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import org.apache.spark.rdd.RDD 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.workflow.Transformer 6 | 7 | import scala.reflect.ClassTag 8 | 9 | /** 10 | * Caches an RDD at a given point within a Pipeline. Follows Spark's lazy evaluation conventions. 11 | * 12 | * @param name An optional name to set on the cached output. Useful for debugging. 13 | * @tparam T Type of the input to cache. 14 | */ 15 | case class Cacher[T: ClassTag](name: Option[String] = None) extends Transformer[T,T] with Logging { 16 | override def apply(in: RDD[T]): RDD[T] = { 17 | logInfo(s"CACHING ${name.getOrElse(in.id)}") 18 | name match { 19 | case Some(x) => in.cache().setName(x) 20 | case None => in.cache() 21 | } 22 | } 23 | 24 | override def apply(in: T): T = in 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/ClassLabelIndicators.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines._ 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Given a class label, returns a binary vector that indicates when that class is present. 10 | * 11 | * Expects labels in the range [0, numClasses) and numClasses > 1. 12 | * 13 | * @param numClasses 14 | */ 15 | case class ClassLabelIndicatorsFromIntLabels(numClasses: Int) 16 | extends Transformer[Int, DenseVector[Double]] { 17 | 18 | assert(numClasses > 1, "numClasses must be > 1.") 19 | 20 | def apply(in: Int): DenseVector[Double] = { 21 | if(in < 0 || in >= numClasses) { 22 | throw new RuntimeException("Class labels are expected to be in the range [0, numClasses)") 23 | } 24 | 25 | val indicatorVector = DenseVector.fill(numClasses, -1.0) 26 | indicatorVector(in) = 1.0 27 | indicatorVector 28 | } 29 | } 30 | 31 | /** 32 | * Given a set of class labels, returns a binary vector that indicates when each class is present. 33 | * 34 | * Expects labels in the range [0, numClasses) and numClasses > 1. 
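 * For example, with numClasses = 4 the label array Array(0, 2) maps to
 * DenseVector(1.0, -1.0, 1.0, -1.0).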
35 | * 36 | * @param numClasses 37 | */ 38 | case class ClassLabelIndicatorsFromIntArrayLabels(numClasses: Int, validate: Boolean = false) 39 | extends Transformer[Array[Int], DenseVector[Double]] { 40 | 41 | assert(numClasses > 1, "numClasses must be > 1.") 42 | 43 | def apply(in: Array[Int]): DenseVector[Double] = { 44 | if(validate && (in.max >= numClasses || in.min < 0)) { 45 | throw new RuntimeException("Class labels are expected to be in the range [0, numClasses)") 46 | } 47 | 48 | val indicatorVector = DenseVector.fill(numClasses, -1.0) 49 | var i = 0 50 | while (i < in.length) { 51 | indicatorVector(in(i)) = 1.0 52 | i += 1 53 | } 54 | indicatorVector 55 | } 56 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/CommonSparseFeatures.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.SparseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.workflow.Estimator 6 | 7 | import scala.collection.JavaConversions._ 8 | import scala.reflect.ClassTag 9 | 10 | /** 11 | * An Estimator that chooses the most frequently observed sparse features when training, 12 | * and produces a transformer which builds a sparse vector out of them 13 | * 14 | * Deterministically orders the feature mappings first by decreasing number of appearances, 15 | * then by earliest appearance in the RDD 16 | * 17 | * @param numFeatures The number of features to keep 18 | */ 19 | case class CommonSparseFeatures[T : ClassTag](numFeatures: Int) extends Estimator[Seq[(T, Double)], SparseVector[Double]] { 20 | // Ordering that compares (feature, frequency) pairs according to their frequencies 21 | val ordering = new Ordering[(T, (Int, Long))] { 22 | def compare(x: (T, (Int, Long)), y: (T, (Int, Long))): Int = { 23 | if (x._2._1 == y._2._1) { 24 | x._2._2.compare(y._2._2) 25 | } else { 26 | x._2._1.compare(y._2._1) 27 | } 28 | } 29 | } 30 | 31 | /** This method merges two seqs and keeps the top numFeatures */ 32 | def merge(a: Seq[(T, (Int, Long))], b: Seq[(T, (Int, Long))]): Seq[(T, (Int, Long))] = { 33 | (a ++ b).sorted(ordering.reverse).take(numFeatures) 34 | } 35 | 36 | override def fit(data: RDD[Seq[(T, Double)]]): SparseFeatureVectorizer[T] = { 37 | val featureOccurrences = data.flatMap(identity).zipWithUniqueId().map(x => (x._1._1, (1, x._2))) 38 | // zip with unique ids and take the smallest unique id for a given feature to get 39 | // a deterministic ordering 40 | val featureFrequenciesWithUniqueId = featureOccurrences.reduceByKey { 41 | (x, y) => (x._1 + y._1, Math.min(x._2, y._2)) 42 | } 43 | val mapRDDs = featureFrequenciesWithUniqueId mapPartitions { items => 44 | // Priority keeps the largest elements, so let's reverse the ordering. 45 | Iterator.single(takeOrdered(items, numFeatures)(ordering.reverse)) 46 | } 47 | val mostCommonFeatures = mapRDDs.treeReduce(merge).map(_._1) 48 | 49 | val featureSpace = mostCommonFeatures.zipWithIndex.toMap 50 | new SparseFeatureVectorizer(featureSpace) 51 | } 52 | 53 | /** 54 | * Returns the first K elements from the input as defined by the specified implicit Ordering[T] 55 | * and maintains the ordering. 
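 * Backed by Guava's Ordering.leastOf over the input iterator.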
56 | */ 57 | def takeOrdered[T](input: Iterator[T], num: Int)(implicit ord: Ordering[T]): Seq[T] = { 58 | val ordering = new com.google.common.collect.Ordering[T] { 59 | override def compare(l: T, r: T) = ord.compare(l, r) 60 | } 61 | ordering.leastOf(asJavaIterator(input), num) 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Densify.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import breeze.linalg.Vector 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * Transformer to densify vectors into DenseVectors. 9 | */ 10 | case class Densify[T <: Vector[Double]]() extends Transformer[T, DenseVector[Double]] { 11 | /** 12 | * Apply this Transformer to a single input item 13 | * 14 | * @param in The input item to pass into this transformer 15 | * @return The output value 16 | */ 17 | override def apply(in: T): DenseVector[Double] = in match { 18 | case dense: DenseVector[Double] => dense 19 | case _ => in.toDenseVector 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/FloatToDouble.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg._ 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Converts float matrix to a double matrix. 8 | */ 9 | object FloatToDouble extends Transformer[DenseMatrix[Float], DenseMatrix[Double]] { 10 | def apply(in: DenseMatrix[Float]): DenseMatrix[Double] = convert(in, Double) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Identity.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import org.apache.spark.rdd.RDD 4 | import keystoneml.workflow.Transformer 5 | 6 | import scala.reflect.ClassTag 7 | 8 | /** 9 | * This class performs a no-op on its input. 10 | * 11 | * @tparam T Type of the input and, by definition, output. 12 | */ 13 | class Identity[T: ClassTag] extends Transformer[T,T] { 14 | def apply(in: T): T = in 15 | override def apply(in: RDD[T]): RDD[T] = in 16 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/MatrixVectorizer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{DenseMatrix, DenseVector} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Flattens a matrix into a vector. 
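 * For a standard (column-major) Breeze matrix, the result is the matrix's columns stacked end to end.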
8 | */ 9 | object MatrixVectorizer extends Transformer[DenseMatrix[Double], DenseVector[Double]] { 10 | def apply(in: DenseMatrix[Double]): DenseVector[Double] = in.toDenseVector 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/MaxClassifier.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{DenseVector, argmax} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Transformer that returns the index of the largest value in the vector 8 | */ 9 | object MaxClassifier extends Transformer[DenseVector[Double], Int] { 10 | override def apply(in: DenseVector[Double]): Int = argmax(in) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Shuffler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import org.apache.spark.rdd.RDD 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.workflow.Transformer 6 | 7 | import scala.reflect.ClassTag 8 | 9 | /** 10 | * Randomly shuffle the rows of an RDD within a pipeline. Uses a shuffle operation in Spark. 11 | * 12 | * @param numParts An optional parameter indicating the number of output partitions. 13 | * @tparam T Type of the input to shuffle. 14 | */ 15 | class Shuffler[T: ClassTag](numParts: Option[Int] = None) extends Transformer[T,T] with Logging { 16 | override def apply(in: RDD[T]): RDD[T] = { 17 | val numToRepartition = numParts.getOrElse(in.partitions.size) 18 | in.repartition(numToRepartition) 19 | } 20 | 21 | override def apply(in: T): T = in 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/SparseFeatureVectorizer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.SparseVector 4 | import keystoneml.workflow.Transformer 5 | 6 | /** A transformer which given a feature space, maps features of the form (feature id, value) into a sparse vector */ 7 | class SparseFeatureVectorizer[T](featureSpace: Map[T, Int]) extends Transformer[Seq[(T, Double)], SparseVector[Double]] { 8 | private def transformVector(in: Seq[(T, Double)], featureSpaceMap: Map[T, Int]): SparseVector[Double] = { 9 | val features = in.map(f => (featureSpaceMap.get(f._1), f._2)) 10 | .filter(_._1.isDefined) 11 | .map(f => (f._1.get, f._2.toDouble)) 12 | SparseVector(featureSpaceMap.size)(features:_*) 13 | } 14 | 15 | override def apply(in: Seq[(T, Double)]): SparseVector[Double] = { 16 | transformVector(in, featureSpace) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Sparsify.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{SparseVector, DenseVector, Vector} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Transformer to convert vectors into SparseVectors. 
8 | */ 9 | case class Sparsify[T <: Vector[Double]]() extends Transformer[T, SparseVector[Double]] { 10 | /** 11 | * Apply this Transformer to a single input item 12 | * 13 | * @param in The input item to pass into this transformer 14 | * @return The output value 15 | */ 16 | override def apply(in: T): SparseVector[Double] = in match { 17 | case sparse: SparseVector[Double] => sparse 18 | case _ => SparseVector(in.toArray) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/TopKClassifier.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{DenseVector, argtopk} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Transformer that returns the indices of the largest k values of the vector, in order 8 | */ 9 | class TopKClassifier(k: Int) extends Transformer[DenseVector[Double], Array[Int]] { 10 | override def apply(in: DenseVector[Double]): Array[Int] = { 11 | in.toArray.zipWithIndex.sortBy(-_._1).take(k).map(_._2) 12 | } 13 | } 14 | 15 | /** 16 | * Object to allow creating top k classifier w/o new 17 | */ 18 | object TopKClassifier { 19 | def apply(k: Int) = new TopKClassifier(k) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/VectorCombiner.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{DenseMatrix, DenseVector} 4 | import keystoneml.workflow.Transformer 5 | 6 | import scala.reflect.ClassTag 7 | 8 | /** 9 | * Concats a Seq of DenseVectors into a single DenseVector. 10 | */ 11 | case class VectorCombiner[T : ClassTag]()(implicit zero: breeze.storage.Zero[T]) 12 | extends Transformer[Seq[DenseVector[T]], DenseVector[T]] { 13 | def apply(in: Seq[DenseVector[T]]): DenseVector[T] = DenseVector.vertcat(in:_*) 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/VectorSplitter.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines.FunctionNode 6 | 7 | /** 8 | * This transformer splits the input vector into a number of blocks. 
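 *
 * Each block has length `blockSize`, except possibly the last, which may be shorter. For example
 * (illustrative), with `blockSize = 4` a 10-dimensional vector is split into blocks of length 4, 4, and 2:
 * {{{
 *   val splitter = new VectorSplitter(blockSize = 4)
 *   splitter.splitVector(DenseVector.rand(10)).map(_.length)  // => Seq(4, 4, 2)
 * }}}
 *
 * @param blockSize Maximum length of each block.
 * @param numFeaturesOpt Optional total number of features; if absent, the length of the input vector
 *                       (or of the first vector in the RDD) is used.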
9 | */ 10 | class VectorSplitter( 11 | blockSize: Int, 12 | numFeaturesOpt: Option[Int] = None) 13 | extends FunctionNode[RDD[DenseVector[Double]], Seq[RDD[DenseVector[Double]]]] { 14 | 15 | override def apply(in: RDD[DenseVector[Double]]): Seq[RDD[DenseVector[Double]]] = { 16 | val numFeatures = numFeaturesOpt.getOrElse(in.first.length) 17 | val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt 18 | (0 until numBlocks).map { blockNum => 19 | in.map { vec => 20 | // Explicitly call toArray as breeze's slice is lazy 21 | val end = math.min(numFeatures, (blockNum + 1) * blockSize) 22 | DenseVector(vec.slice(blockNum * blockSize, end).toArray) 23 | } 24 | } 25 | } 26 | 27 | def splitVector(in: DenseVector[Double]): Seq[DenseVector[Double]] = { 28 | val numFeatures = numFeaturesOpt.getOrElse(in.length) 29 | val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt 30 | (0 until numBlocks).map { blockNum => 31 | // Explicitly call toArray as breeze's slice is lazy 32 | val end = math.min(numFeatures, (blockNum + 1) * blockSize) 33 | DenseVector(in.slice(blockNum * blockSize, end).toArray) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/pipelines/FunctionNode.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.pipelines 2 | 3 | abstract class FunctionNode[A,B] extends (A => B) with Serializable -------------------------------------------------------------------------------- /src/main/scala/keystoneml/pipelines/Logging.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.pipelines 2 | 3 | import org.slf4j.{Logger, LoggerFactory} 4 | 5 | /** 6 | * Utility trait for Logging 7 | */ 8 | trait Logging { 9 | // Make the log field transient so that objects with Logging can 10 | // be serialized and used on another machine 11 | @transient private var log_ : Logger = null 12 | 13 | // Method to get or create the logger for this object 14 | protected def log: Logger = { 15 | if (log_ == null) { 16 | var className = this.getClass.getName 17 | // Ignore trailing $'s in the class names for Scala objects 18 | if (className.endsWith("$")) { 19 | className = className.substring(0, className.length - 1) 20 | } 21 | log_ = LoggerFactory.getLogger(className) 22 | } 23 | log_ 24 | } 25 | 26 | // Log methods that take only a String 27 | protected def logInfo(msg: => String) { 28 | if (log.isInfoEnabled) log.info(msg) 29 | } 30 | 31 | protected def logDebug(msg: => String) { 32 | if (log.isDebugEnabled) log.debug(msg) 33 | } 34 | 35 | protected def logTrace(msg: => String) { 36 | if (log.isTraceEnabled) log.trace(msg) 37 | } 38 | 39 | protected def logWarning(msg: => String) { 40 | if (log.isWarnEnabled) log.warn(msg) 41 | } 42 | 43 | protected def logError(msg: => String) { 44 | if (log.isErrorEnabled) log.error(msg) 45 | } 46 | 47 | // Log methods that take Throwables (Exceptions/Errors) too 48 | protected def logInfo(msg: => String, throwable: Throwable) { 49 | if (log.isInfoEnabled) log.info(msg, throwable) 50 | } 51 | 52 | protected def logDebug(msg: => String, throwable: Throwable) { 53 | if (log.isDebugEnabled) log.debug(msg, throwable) 54 | } 55 | 56 | protected def logTrace(msg: => String, throwable: Throwable) { 57 | if (log.isTraceEnabled) log.trace(msg, throwable) 58 | } 59 | 60 | protected def logWarning(msg: => String, throwable: Throwable) { 61 | if (log.isWarnEnabled) log.warn(msg,
throwable) 62 | } 63 | 64 | protected def logError(msg: => String, throwable: Throwable) { 65 | if (log.isErrorEnabled) log.error(msg, throwable) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/pipelines/nlp/StupidBackoffPipeline.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.pipelines.nlp 2 | 3 | import keystoneml.nodes.nlp._ 4 | 5 | import org.apache.spark.{SparkContext, SparkConf} 6 | 7 | import scopt.OptionParser 8 | 9 | object StupidBackoffPipeline { 10 | 11 | val appName = "StupidBackoffPipeline" 12 | 13 | case class StupidBackoffConfig(trainData: String = "", numParts: Int = 16, n: Int = 3) 14 | 15 | def parse(args: Array[String]): StupidBackoffConfig = 16 | new OptionParser[StupidBackoffConfig](appName) { 17 | head(appName, "0.1") 18 | opt[String]("trainData") required() action { (x, c) => c.copy(trainData = x) } 19 | opt[String]("numParts") required() action { (x, c) => c.copy(numParts = x.toInt) } 20 | opt[String]("n") optional() action { (x, c) => c.copy(n = x.toInt) } 21 | }.parse(args, StupidBackoffConfig()).get 22 | 23 | def main(args: Array[String]) { 24 | val appConfig = parse(args) 25 | val conf = new SparkConf().setAppName(appName) 26 | conf.setIfMissing("spark.master", "local[4]") 27 | val sc = new SparkContext(conf) 28 | 29 | val text = Tokenizer()(sc.textFile(appConfig.trainData, appConfig.numParts)) 30 | 31 | /** Vocab generation step */ 32 | val frequencyEncode = WordFrequencyEncoder.fit(text) 33 | val unigramCounts = frequencyEncode.unigramCounts 34 | 35 | /** NGram (n >= 2) generation step */ 36 | val makeNGrams = frequencyEncode andThen NGramsFeaturizer(2 to appConfig.n) 37 | 38 | val ngramCounts = NGramsCounts[Int](NGramsCountsMode.NoAdd).apply( 39 | makeNGrams(text).get) 40 | 41 | /** Stupid backoff scoring step */ 42 | val stupidBackoff = StupidBackoffEstimator[Int](unigramCounts) 43 | val languageModel = stupidBackoff.fit(ngramCounts) 44 | 45 | /** Done: save or serve */ 46 | languageModel.scoresRDD.cache() 47 | println( 48 | s"""|number of tokens: ${languageModel.numTokens} 49 | |size of vocabulary: ${languageModel.unigramCounts.size} 50 | |number of ngrams: ${languageModel.scoresRDD.count()} 51 | |""".stripMargin) 52 | println("trained scores of 100 ngrams in the corpus:") 53 | languageModel.scoresRDD.take(100).foreach(println) 54 | 55 | sc.stop() 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/utils/MLlibUtils.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils 2 | 3 | import breeze.linalg.{SparseVector, DenseMatrix, DenseVector} 4 | 5 | /** 6 | * Provides conversions between MLlib vectors & matrices, and Breeze vectors & matrices 7 | */ 8 | object MLlibUtils { 9 | 10 | /** Convert an MLlib vector to a Breeze dense vector */ 11 | def mllibVectorToDenseBreeze(vector: org.apache.spark.mllib.linalg.Vector): DenseVector[Double] = { 12 | vector match { 13 | case dense: org.apache.spark.mllib.linalg.DenseVector => new DenseVector[Double](dense.values) 14 | case _ => new DenseVector[Double](vector.toArray) 15 | } 16 | } 17 | 18 | /** Convert an MLlib matrix to a Breeze dense matrix */ 19 | def mllibMatrixToDenseBreeze(matrix: org.apache.spark.mllib.linalg.Matrix): DenseMatrix[Double] = { 20 | matrix match { 21 | case dense: org.apache.spark.mllib.linalg.DenseMatrix => { 22 | if 
(!dense.isTransposed) { 23 | new DenseMatrix[Double](dense.numRows, dense.numCols, dense.values) 24 | } else { // Row-major data: build the transpose, then flip it back 25 | val breezeMatrix = new DenseMatrix[Double](dense.numCols, dense.numRows, dense.values) 26 | breezeMatrix.t 27 | } 28 | } 29 | 30 | case _ => new DenseMatrix[Double](matrix.numRows, matrix.numCols, matrix.toArray) 31 | } 32 | } 33 | 34 | /** Convert a Breeze vector to an MLlib vector, maintaining underlying data structure (sparse vs dense) */ 35 | def breezeVectorToMLlib(breezeVector: breeze.linalg.Vector[Double]): org.apache.spark.mllib.linalg.Vector = { 36 | breezeVector match { 37 | case v: DenseVector[Double] => 38 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { 39 | new org.apache.spark.mllib.linalg.DenseVector(v.data) 40 | } else { 41 | new org.apache.spark.mllib.linalg.DenseVector(v.toArray) // Can't use underlying array directly, so make a new one 42 | } 43 | case v: SparseVector[Double] => 44 | if (v.index.length == v.used) { 45 | new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index, v.data) 46 | } else { 47 | new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) 48 | } 49 | case v: breeze.linalg.Vector[_] => 50 | sys.error("Unsupported Breeze vector type: " + v.getClass.getName) 51 | } 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/utils/external/EncEval.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils.external 2 | 3 | class EncEval extends Serializable { 4 | System.loadLibrary("ImageFeatures") // This will load libImageFeatures.{so,dylib} from the library path. 5 | 6 | /** 7 | * Compute a mixture of Gaussians given a set of sample points. 8 | * @param nGauss Number of Gaussians to estimate. 9 | * @param nDim Number of dimensions of each sample. 10 | * @param samples The samples (in sample-major order). 11 | * @return The Gaussians, their variances, and their weights in a single flat array. (Center-major order). 12 | */ 13 | @native 14 | def computeGMM(nGauss: Int, nDim: Int, samples: Array[Float]): Array[Float] 15 | 16 | /** 17 | * Calculates Fisher Vectors for a set of descriptors given a GMM. 18 | * 19 | * @param means Means - flat array in center-major order. 20 | * @param dims Number of dimensions of each center. 21 | * @param numClusters Number of GMM cluster centers. 22 | * @param covariances The variances of the GMM centers in center-major order. 23 | * @param priors The weights of the GMM in center order. 24 | * @param dSiftDescriptors Bag of descriptors for which to compute Fisher Vectors. 25 | * @return The Fisher Vector for the input descriptors. 26 | */ 27 | @native 28 | def calcAndGetFVs(means: Array[Float], dims: Int, numClusters: Int, covariances: Array[Float], 29 | priors: Array[Float], dSiftDescriptors: Array[Float]) : Array[Float] 30 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/utils/external/VLFeat.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils.external 2 | 3 | class VLFeat extends Serializable { 4 | System.loadLibrary("ImageFeatures") // This will load libImageFeatures.{so,dylib} from the library path. 5 | 6 | /** 7 | * Gets SIFT Descriptors at Multiple Scales emulating the `vl_phow` MATLAB routine. 8 | * Under the hood it uses vl_dsift from the vlfeat library.
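 * Descriptors are returned as one flat array; with the standard SIFT geometry each descriptor
 * occupies 128 shorts, so the array length is a multiple of 128.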
9 | * 10 | * @param width Image Width. 11 | * @param height Image Height. 12 | * @param step Step size at which to sample SIFT descriptors. 13 | * @param bin SIFT Descriptor bin size. 14 | * @param numScales Number of scales to extract at. 15 | * @param image Input image as float array. 16 | * @return SIFTs as Shorts. 17 | */ 18 | @native 19 | def getSIFTs( 20 | width: Int, 21 | height: Int, 22 | step: Int, 23 | bin: Int, 24 | numScales: Int, 25 | scaleStep: Int, 26 | image: Array[Float]): Array[Short] 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/ChainUtils.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | /** 8 | * A chain of two Transformers in a row (as a Transformer) 9 | * @param first 10 | * @param second 11 | */ 12 | case class TransformerChain[A, B, C : ClassTag](first: Transformer[A, B], second: Transformer[B, C]) extends Transformer[A, C] { 13 | override def apply(in: A): C = second(first(in)) 14 | override def apply(in: RDD[A]): RDD[C] = second(first(in)) 15 | } 16 | 17 | /** 18 | * A chain of a Transformer followed by an Estimator (as an Estimator) 19 | * @param first 20 | * @param second 21 | */ 22 | case class TransformerEstimatorChain[A, B, C : ClassTag](first: Transformer[A, B], second: Estimator[B, C]) 23 | extends Estimator[A, C] { 24 | 25 | override def fit(data: RDD[A]): Transformer[A, C] = { 26 | TransformerChain(first, second.fit(first(data))) 27 | } 28 | } 29 | 30 | /** 31 | * A chain of a Transformer followed by a LabelEstimator (as a LabelEstimator) 32 | * @param first 33 | * @param second 34 | */ 35 | case class TransformerLabelEstimatorChain[A, B, C : ClassTag, L](first: Transformer[A, B], second: LabelEstimator[B, C, L]) 36 | extends LabelEstimator[A, C, L] { 37 | 38 | override def fit(data: RDD[A], labels: RDD[L]): Transformer[A, C] = { 39 | TransformerChain(first, second.fit(first(data), labels)) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/DefaultOptimizer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import keystoneml.workflow.AutoCacheRule.GreedyCache 4 | 5 | /** 6 | * The default Pipeline optimizer used when executing pipelines. 
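 * A different optimizer, such as the [[AutoCachingOptimizer]] below, can be installed globally
 * (illustrative):
 * {{{
 *   PipelineEnv.getOrCreate.setOptimizer(new AutoCachingOptimizer())
 * }}}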
7 | */ 8 | object DefaultOptimizer extends Optimizer { 9 | protected val batches: Seq[Batch] = 10 | Batch("Load Saved State", Once, ExtractSaveablePrefixes, SavedStateLoadRule, UnusedBranchRemovalRule) :: 11 | Batch("Common Sub-expression Elimination", FixedPoint(Int.MaxValue), EquivalentNodeMergeRule) :: 12 | Batch("Node Level Optimization", Once, new NodeOptimizationRule) :: 13 | Nil 14 | } 15 | 16 | /** 17 | * Optimizes a Pipeline DAG, with auto-caching 18 | */ 19 | class AutoCachingOptimizer(strategy: AutoCacheRule.CachingStrategy = GreedyCache()) extends Optimizer { 20 | protected val batches: Seq[Batch] = 21 | Batch("Load Saved State", Once, ExtractSaveablePrefixes, SavedStateLoadRule, UnusedBranchRemovalRule) :: 22 | Batch("Common Sub-expression Elimination", FixedPoint(Int.MaxValue), EquivalentNodeMergeRule) :: 23 | Batch("Node Level Optimization", Once, new NodeOptimizationRule) :: 24 | Batch("Auto Cache", Once, new AutoCacheRule(strategy)) :: 25 | Nil 26 | } 27 | 28 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/EquivalentNodeMergeRule.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A rule to merge equivalent nodes in the DAG. 5 | * Nodes are considered equivalent if: 6 | * - The operators stored within the nodes are equal, i.e. `.hashCode()` of both ops is equal AND `.equals()` is true 7 | * (when an operator is a case class, both methods will automatically be generated) 8 | * - They share the same dependencies 9 | * 10 | * This rule also merges prefixes if any of 11 | * the nodes being merged have their prefix attached. 12 | */ 13 | object EquivalentNodeMergeRule extends Rule { 14 | override def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) = { 15 | val nodeSetsToMerge = plan.nodes.groupBy(id => (plan.getOperator(id), plan.getDependencies(id))).values 16 | 17 | if (nodeSetsToMerge.size == plan.nodes.size) { 18 | // no nodes are mergeable 19 | (plan, prefixes) 20 | } else { 21 | nodeSetsToMerge.filter(_.size > 1).foldLeft((plan, prefixes)) { 22 | case ((curPlan, curPrefixes), setToMerge) => { 23 | // Construct a graph that merges all of the nodes 24 | val nodeToKeep = setToMerge.minBy(_.id) 25 | val nextGraph = (setToMerge - nodeToKeep).foldLeft(curPlan) { 26 | case (partialMergedPlan, nodeToMerge) => { 27 | partialMergedPlan 28 | .replaceDependency(nodeToMerge, nodeToKeep) 29 | .removeNode(nodeToMerge) 30 | } 31 | } 32 | 33 | // If any of the nodes being merged have been executed, update the prefixes 34 | val prefix = setToMerge.collectFirst { 35 | case node if curPrefixes.contains(node) => curPrefixes(node) 36 | } 37 | val nextPrefixes = if (prefix.nonEmpty) { 38 | (curPrefixes -- setToMerge) + (nodeToKeep -> prefix.get) 39 | } else { 40 | curPrefixes 41 | } 42 | 43 | (nextGraph, nextPrefixes) 44 | } 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Estimator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * An estimator has a `fit` method which takes an input and emits a [[Transformer]]. 7 | * @tparam A The type of input this estimator (and the resulting Transformer) takes 8 | * @tparam B The output type of the Transformer this estimator produces when being
fit 9 | */ 10 | abstract class Estimator[A, B] extends EstimatorOperator { 11 | /** 12 | * Constructs a pipeline that fits this estimator to training data, 13 | * then applies the resultant transformer to the Pipeline input. 14 | * 15 | * @param data The training data 16 | * @return A pipeline that fits this estimator and applies the result to inputs. 17 | */ 18 | final def withData(data: RDD[A]): Pipeline[A, B] = { 19 | withData(PipelineDataset(data)) 20 | } 21 | 22 | /** 23 | * Constructs a pipeline that fits this estimator to training data, 24 | * then applies the resultant transformer to the Pipeline input. 25 | * 26 | * @param data The training data 27 | * @return A pipeline that fits this estimator and applies the result to inputs. 28 | */ 29 | final def withData(data: PipelineDataset[A]): Pipeline[A, B] = { 30 | // Remove the data sink, 31 | // Then insert this estimator into the graph with the data as the input 32 | val curSink = data.executor.graph.getSinkDependency(data.sink) 33 | val (estGraph, estId) = data.executor.graph.removeSink(data.sink).addNode(this, Seq(curSink)) 34 | 35 | // Now that the estimator is attached to the data, we need to build a pipeline DAG 36 | // that applies the fit output of the estimator. We do this by creating a new Source in the DAG, 37 | val (estGraphWithNewSource, sourceId) = estGraph.addSource() 38 | 39 | // Adding a delegating transformer that depends on the source and the label estimator, 40 | val (almostFinalGraph, delegatingId) = estGraphWithNewSource.addNode(new DelegatingOperator, Seq(estId, sourceId)) 41 | 42 | // And finally adding a sink that connects to the delegating transformer. 43 | val (newGraph, sinkId) = almostFinalGraph.addSink(delegatingId) 44 | 45 | new Pipeline(new GraphExecutor(newGraph), sourceId, sinkId) 46 | } 47 | 48 | /** 49 | * The non-type-safe `fitRDDs` method of [[EstimatorOperator]] that is being overridden by the Estimator API. 50 | */ 51 | final override private[workflow] def fitRDDs(inputs: Seq[DatasetExpression]): TransformerOperator = { 52 | fit(inputs.head.get.asInstanceOf[RDD[A]]) 53 | } 54 | 55 | /** 56 | * The type-safe method that ML developers need to implement when writing new Estimators. 57 | * 58 | * @param data The estimator's training data. 59 | * @return A new transformer 60 | */ 61 | def fit(data: RDD[A]): Transformer[A, B] 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Expression.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * Output is a trait extended by everything that may be output by an [[Operator]]. 7 | * It is intended to add some extra type checking to the internal operator execution. 8 | */ 9 | private[workflow] sealed trait Expression { 10 | def get: Any 11 | } 12 | 13 | /** 14 | * This is an output that wraps around an [[RDD]]. It wraps the RDD as call-by-name, so the RDD 15 | * need not have been computed yet by the time this output is created. 16 | * 17 | * The first time the contained value is accessed using `get`, it will be computed. Every time after 18 | * that it will already be stored, and will not be computed. 19 | */ 20 | private[workflow] class DatasetExpression(compute: => RDD[_]) extends Expression { 21 | lazy override val get: RDD[_] = compute 22 | } 23 | 24 | /** 25 | * This is an output that wraps around a single untyped [[Any]] datum. 
It wraps the datum as call-by-name, 26 | * so it need not have been computed by the time this output is created. 27 | * 28 | * The first time the contained value is accessed using `get`, it will be computed. Every time after 29 | * that it will already be stored, and will not be computed. 30 | */ 31 | private[workflow] class DatumExpression(compute: => Any) extends Expression { 32 | lazy override val get: Any = compute 33 | } 34 | 35 | /** 36 | * This is an output that wraps around a [[TransformerOperator]]. It wraps the transformer as call-by-name, 37 | * so it need not have been computed by the time this output is created. 38 | * 39 | * The first time the contained value is accessed using `get`, it will be computed. Every time after 40 | * that it will already be stored, and will not be computed. 41 | */ 42 | private[workflow] class TransformerExpression(compute: => TransformerOperator) extends Expression { 43 | lazy override val get: TransformerOperator = compute 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/ExtractSaveablePrefixes.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import keystoneml.nodes.util.Cacher 4 | 5 | /** 6 | * Extract the prefixes of all Nodes whose state we want to save for reuse by other Pipeline apply and fit calls. 7 | * This is all nodes that either have a Cacher or an EstimatorOperator as the internal operator. 8 | */ 9 | object ExtractSaveablePrefixes extends Rule { 10 | override def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) = { 11 | val nodesToExtract = plan.operators.collect { 12 | case (node, _: Cacher[_]) => node 13 | case (node, _: EstimatorOperator) => node 14 | } 15 | 16 | val newPrefixes = nodesToExtract.map { 17 | node => (node, Prefix.findPrefix(plan, node)) 18 | }.toMap 19 | 20 | (plan, newPrefixes) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/FittedPipeline.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * This is the result of fitting a [[Pipeline]]. It is logically equivalent to the Pipeline it is produced by, 7 | * but with all Estimators pre-fit, and only containing Transformers in the underlying graph. 8 | * Applying a FittedPipeline to new data does not trigger any new optimization or estimator fitting. 9 | * 10 | * Unlike normal Pipelines, FittedPipelines are serializable and may be written to and from disk. 11 | * 12 | * @param transformerGraph The DAG representing the execution (only contains Transformers) 13 | * @param source The SourceId of the Pipeline 14 | * @param sink The SinkId of the Pipeline 15 | * @tparam A type of the data this FittedPipeline expects as input 16 | * @tparam B type of the data this FittedPipeline outputs 17 | */ 18 | class FittedPipeline[A, B] private[workflow] ( 19 | private[workflow] val transformerGraph: TransformerGraph, 20 | private[workflow] val source: SourceId, 21 | private[workflow] val sink: SinkId 22 | ) extends Chainable[A, B] with Serializable { 23 | 24 | /** 25 | * Converts this FittedPipeline back into a Pipeline. 
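 * Because the graph contains only already-fitted Transformers, the returned Pipeline is built with
 * optimization disabled (`optimize = false`); applying it triggers no re-fitting.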
26 | */ 27 | override def toPipeline: Pipeline[A, B] = new Pipeline( 28 | new GraphExecutor(transformerGraph.toGraph, optimize = false), 29 | source, 30 | sink) 31 | 32 | /** 33 | * The application of this FittedPipeline to a single input item. 34 | * 35 | * @param in The input item to pass into this transformer 36 | * @return The output value 37 | */ 38 | def apply(in: A): B = toPipeline.apply(in).get() 39 | 40 | /** 41 | * The application of this FittedPipeline to an RDD of input items. 42 | * 43 | * @param in The RDD input to pass into this transformer 44 | * @return The RDD output for the given input 45 | */ 46 | def apply(in: RDD[A]): RDD[B] = toPipeline.apply(in).get() 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/GatherTransformerOperator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * A [[TransformerOperator]] that gathers multiple datasets of {@tparam T} into a dataset of Seq[T] 7 | * (Or individual datums of T into a single Seq[T]) 8 | */ 9 | private[workflow] case class GatherTransformerOperator[T]() extends TransformerOperator { 10 | override private[workflow] def singleTransform(inputs: Seq[DatumExpression]): Any = { 11 | inputs.map(_.get.asInstanceOf[T]) 12 | } 13 | 14 | override private[workflow] def batchTransform(inputs: Seq[DatasetExpression]): RDD[_] = { 15 | inputs.map(_.get.asInstanceOf[RDD[T]].map(t => Seq(t))).reduceLeft((x, y) => { 16 | x.zip(y).map(z => z._1 ++ z._2) 17 | }) 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/GraphId.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * This is a unifying type for Node, Source, and Sink ids in the 5 | * internal graph data structure representing workloads. 6 | */ 7 | private[workflow] sealed trait GraphId 8 | 9 | /** 10 | * This represents the id of a Sink in the internal graph data structure. 11 | * @param id The internal value, unique to each id 12 | */ 13 | private[workflow] case class SinkId(id: Long) extends GraphId 14 | 15 | /** 16 | * This is a unifying type for Node and Source ids in the 17 | * internal graph data structure representing workloads. 18 | */ 19 | private[workflow] sealed trait NodeOrSourceId extends GraphId 20 | 21 | /** 22 | * This represents the id of a Node in the internal graph data structure. 23 | * @param id The internal value, unique to each id 24 | */ 25 | private[workflow] case class NodeId(id: Long) extends NodeOrSourceId 26 | 27 | /** 28 | * This represents the id of a Source in the internal graph data structure. 29 | * @param id The internal value, unique to each id 30 | */ 31 | private[workflow] case class SourceId(id: Long) extends NodeOrSourceId 32 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Identity.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | /** 8 | * This transformer performs a no-op on its input. 9 | * 10 | * @tparam T Type of the input and, by definition, output. 
11 | */ 12 | case class Identity[T : ClassTag]() extends Transformer[T,T] { 13 | override def apply(in: T): T = in 14 | override def apply(in: RDD[T]): RDD[T] = in 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/OptimizableNodes.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | sealed trait Optimizable 8 | 9 | /** 10 | * Represents a node-level optimizable transformer and its optimization rules 11 | */ 12 | abstract class OptimizableTransformer[A, B : ClassTag] extends Transformer[A, B] with Optimizable { 13 | val default: Transformer[A, B] 14 | override def apply(a: A): B = { 15 | default.apply(a) 16 | } 17 | override def apply(data: RDD[A]): RDD[B] = { 18 | default.apply(data) 19 | } 20 | 21 | def optimize(sample: RDD[A], numPerPartition: Map[Int, Int]): Transformer[A, B] 22 | } 23 | 24 | /** 25 | * Represents a node-level optimizable Estimator and its optimization rules 26 | */ 27 | abstract class OptimizableEstimator[A, B] extends Estimator[A, B] with Optimizable { 28 | val default: Estimator[A, B] 29 | 30 | // Fit using whatever the default is. 31 | override def fit(data: RDD[A]): Transformer[A, B] = { 32 | default.fit(data) 33 | } 34 | 35 | def optimize(sample: RDD[A], numPerPartition: Map[Int, Int]): Estimator[A, B] 36 | } 37 | 38 | /** 39 | * Represents a node-level optimizable LabelEstimator and its optimization rules 40 | */ 41 | abstract class OptimizableLabelEstimator[A, B, L] extends LabelEstimator[A, B, L] with Optimizable { 42 | val default: LabelEstimator[A, B, L] 43 | 44 | // Fit using whatever the default is. 45 | override def fit(data: RDD[A], labels: RDD[L]): Transformer[A, B] = { 46 | default.fit(data, labels) 47 | } 48 | 49 | def optimize(sample: RDD[A], sampleLabels: RDD[L], numPerPartition: Map[Int, Int]): LabelEstimator[A, B, L] 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/PipelineDataset.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * This class is a lazy wrapper around the output of a pipeline that was passed an RDD as input. 7 | * 8 | * Under the hood, it extends [[PipelineResult]] and keeps track of the necessary execution plan. 9 | */ 10 | class PipelineDataset[T] private[workflow](executor: GraphExecutor, sink: SinkId) 11 | extends PipelineResult[RDD[T]]( 12 | executor, 13 | sink) 14 | 15 | object PipelineDataset { 16 | private[workflow] def apply[T](rdd: RDD[T]): PipelineDataset[T] = { 17 | val emptyGraph = Graph(Set(), Map(), Map(), Map()) 18 | val (graphWithDataset, nodeId) = emptyGraph.addNode(new DatasetOperator(rdd), Seq()) 19 | val (graph, sinkId) = graphWithDataset.addSink(nodeId) 20 | 21 | new PipelineDataset[T](new GraphExecutor(graph), sinkId) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/PipelineDatum.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * This class is a lazy wrapper around the output of a pipeline that was passed a single datum as input. 
5 | * 6 | * Under the hood, it extends [[PipelineResult]] and keeps track of the necessary execution plan. 7 | */ 8 | class PipelineDatum[T] private[workflow](executor: GraphExecutor, sink: SinkId) 9 | extends PipelineResult[T]( 10 | executor, 11 | sink) 12 | 13 | object PipelineDatum { 14 | private[workflow] def apply[T](datum: T): PipelineDatum[T] = { 15 | val emptyGraph = Graph(Set(), Map(), Map(), Map()) 16 | val (graphWithDataset, nodeId) = emptyGraph.addNode(new DatumOperator(datum), Seq()) 17 | val (graph, sinkId) = graphWithDataset.addSink(nodeId) 18 | 19 | new PipelineDatum[T](new GraphExecutor(graph), sinkId) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/PipelineEnv.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * PipelineEnv is an environment shared by multiple [[Pipeline]]s, containing variables 5 | * such as the Prefix state table and the current Pipeline [[Optimizer]]. 6 | */ 7 | class PipelineEnv { 8 | /** 9 | * This is the global execution state of Pipelines with this environment. 10 | * It is a mutable hashmap of logical prefix to the executed result at that prefix. 11 | * It is not currently thread-safe. 12 | */ 13 | private[workflow] val state: scala.collection.mutable.Map[Prefix, Expression] = scala.collection.mutable.Map() 14 | 15 | /** 16 | * The internally stored optimizer used for all Pipeline execution. Accessible using getter and setter. 17 | */ 18 | private var _optimizer: Optimizer = DefaultOptimizer 19 | 20 | /** 21 | * @return The current optimizer used during Pipeline execution. 22 | */ 23 | def getOptimizer: Optimizer = _optimizer 24 | 25 | /** 26 | * Globally set a new optimizer to use during Pipeline execution. 27 | * 28 | * @param optimizer The new optimizer to use 29 | */ 30 | def setOptimizer(optimizer: Optimizer): Unit = { 31 | _optimizer = optimizer 32 | } 33 | 34 | /** 35 | * Reset this PipelineEnv (clear state and set the Optimizer to the DefaultOptimizer) 36 | */ 37 | private [workflow] def reset(): Unit = { 38 | state.clear() 39 | setOptimizer(DefaultOptimizer) 40 | } 41 | } 42 | 43 | object PipelineEnv { 44 | lazy val getOrCreate: PipelineEnv = new PipelineEnv 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/PipelineResult.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A PipelineResult is a lazy wrapper around the result of applying a [[Pipeline]] to data. 5 | * Internally it contains the Pipeline's execution plan with data sources inserted, 6 | * and the sink that the Pipeline's output is expected to be produced by. 7 | * 8 | * @param executor The Pipeline's underlying execution plan, 9 | * with the Pipeline's sources inserted into the [[Graph]] 10 | * @param sink The Pipeline's sink 11 | * @tparam T The type of the result. 
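 * The result is computed lazily: nothing is executed until `get()` is first called, and the computed
 * value is then memoized so later calls return it without re-execution.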
12 | */ 13 | abstract class PipelineResult[T] private[workflow] ( 14 | private[workflow] val executor: GraphExecutor, 15 | private[workflow] val sink: SinkId 16 | ) { 17 | 18 | private lazy val result: T = executor.execute(sink).get.asInstanceOf[T] 19 | final def get(): T = result 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Prefix.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | 4 | private[workflow] object Prefix { 5 | /** 6 | * Given a graph and a node, output the prefix of the id. 7 | * Will error if provided a node with a source in the dependencies. 8 | * 9 | * @param graph The graph to use 10 | * @param node A node in the graph 11 | * @return The prefix of that id 12 | */ 13 | def findPrefix(graph: Graph, node: NodeId): Prefix = { 14 | val rootOp = graph.getOperator(node) 15 | val deps = graph.getDependencies(node).map { 16 | case dep: NodeId => findPrefix(graph, dep) 17 | case dep: SourceId => 18 | throw new IllegalArgumentException("May not get the prefix of a node with Sources in the dependencies.") 19 | } 20 | 21 | Prefix(rootOp, deps) 22 | } 23 | } 24 | 25 | /** 26 | * This case class represents the logical prefix of a node in a Pipeline. 27 | * @param operator The operator stored at the node 28 | * @param deps The prefixes of the operator's dependencies 29 | */ 30 | private[workflow] case class Prefix(operator: Operator, deps: Seq[Prefix]) 31 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Rule.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * Represents a DAG transformation rule: A transformation from one DAG 5 | * to a differently-executed but logically equivalent DAG. 6 | * 7 | * A rule must also produce execution state for 8 | * the new DAG, logically equivalent to the execution state 9 | * attached to the old DAG. 10 | */ 11 | abstract class Rule { 12 | /** Name for this rule, automatically inferred based on class name. */ 13 | val ruleName: String = { 14 | val className = getClass.getName 15 | if (className endsWith "$") className.dropRight(1) else className 16 | } 17 | 18 | def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/SavedStateLoadRule.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A rule to load any saved state for the [[PipelineEnv.state]] prefix state table 5 | * for nodes we want to consider either loading or saving the results of. 
6 | */ 7 | object SavedStateLoadRule extends Rule { 8 | override def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) = { 9 | val newGraph = prefixes.foldLeft(plan) { 10 | case (curGraph, (node, prefix)) => 11 | PipelineEnv.getOrCreate.state.get(prefix).map { 12 | case expression => 13 | curGraph.setOperator(node, new ExpressionOperator(expression)) 14 | .setDependencies(node, Seq()) 15 | }.getOrElse(curGraph) 16 | } 17 | 18 | (newGraph, prefixes) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/SparkUtilWrapper.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.util 2 | 3 | object SparkUtilWrapper { 4 | def estimateSize(obj: AnyRef): Long = SizeEstimator.estimate(obj) 5 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Transformer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | /** 8 | * Transformers are operators that may be applied both to single input items and to RDDs of input items. 9 | * They may be chained together, along with [[Estimator]]s and [[LabelEstimator]]s, to produce complex 10 | * pipelines. 11 | * 12 | * Transformer extends [[Pipeline]], meaning that its publicly exposed methods for transforming data 13 | * and chaining are implemented there. 14 | * 15 | * @tparam A input item type the transformer takes 16 | * @tparam B output item type the transformer produces 17 | */ 18 | abstract class Transformer[A, B : ClassTag] extends TransformerOperator with Chainable[A, B] { 19 | override def toPipeline: Pipeline[A, B] = new Pipeline( 20 | executor = new GraphExecutor(Graph( 21 | sources = Set(SourceId(0)), 22 | sinkDependencies = Map(SinkId(0) -> NodeId(0)), 23 | operators = Map(NodeId(0) -> this), 24 | dependencies = Map(NodeId(0) -> Seq(SourceId(0))) 25 | )), 26 | source = SourceId(0), 27 | sink = SinkId(0) 28 | ) 29 | 30 | /** 31 | * The application of this Transformer to a single input item. 32 | * This method MUST be overridden by ML developers. 33 | * 34 | * @param in The input item to pass into this transformer 35 | * @return The output value 36 | */ 37 | def apply(in: A): B 38 | 39 | /** 40 | * The application of this Transformer to an RDD of input items. 41 | * This method may optionally be overridden by ML developers. 
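 * The default simply maps `apply` over the RDD; overriding is useful when bulk application can be
 * made cheaper, e.g. (illustrative) by performing expensive setup once per partition via `mapPartitions`.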
42 | * 43 | * @param in The bulk RDD input to pass into this transformer 44 | * @return The bulk RDD output for the given input 45 | */ 46 | def apply(in: RDD[A]): RDD[B] = in.map(apply) 47 | 48 | final override private[workflow] def singleTransform(inputs: Seq[DatumExpression]): Any = { 49 | apply(inputs.head.get.asInstanceOf[A]) 50 | } 51 | 52 | final override private[workflow] def batchTransform(inputs: Seq[DatasetExpression]): RDD[_] = { 53 | apply(inputs.head.get.asInstanceOf[RDD[A]]) 54 | } 55 | } 56 | 57 | object Transformer { 58 | /** 59 | * This constructor takes a function and returns a Transformer that maps it over the input RDD 60 | * 61 | * @param f The function to apply to every item in the RDD being transformed 62 | * @tparam I input type of the transformer 63 | * @tparam O output type of the transformer 64 | * @return Transformer that applies the given function to all items in the RDD 65 | */ 66 | def apply[I, O : ClassTag](f: I => O): Transformer[I, O] = new Transformer[I, O] { 67 | override def apply(in: RDD[I]): RDD[O] = in.map(f) 68 | override def apply(in: I): O = f(in) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/TransformerGraph.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * TransformerGraphs are similar to [[Graph]]s, but unlike normal Graphs they may only contain 5 | * [[TransformerOperator]]s as operators, and as a result are guaranteed to be serializable. 6 | * 7 | * @param sources The set of all [[SourceId]]s of sources in the graph 8 | * @param sinkDependencies A map of [[SinkId]] to the id of the node or source the sink depends on 9 | * @param operators A map of [[NodeId]] to the operator contained within that node 10 | * @param dependencies A map of [[NodeId]] to the node's ordered dependencies 11 | */ 12 | private[workflow] case class TransformerGraph( 13 | sources: Set[SourceId], 14 | sinkDependencies: Map[SinkId, NodeOrSourceId], 15 | operators: Map[NodeId, TransformerOperator], 16 | dependencies: Map[NodeId, Seq[NodeOrSourceId]] 17 | ) { 18 | 19 | /** 20 | * Convert this TransformerGraph into a standard [[Graph]] 21 | */ 22 | private[workflow] def toGraph: Graph = { 23 | Graph( 24 | sources = sources, 25 | sinkDependencies = sinkDependencies, 26 | operators = operators, 27 | dependencies = dependencies) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/UnusedBranchRemovalRule.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A rule to remove all nodes & sources in a graph that don't lead to any sink, 5 | * and are effectively unused. 
6 | */ 7 | object UnusedBranchRemovalRule extends Rule { 8 | override def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) = { 9 | val ancestorsOfSinks = plan.sinks.foldLeft(Set[GraphId]()) { 10 | case (ancestors, sink) => ancestors ++ AnalysisUtils.getAncestors(plan, sink) 11 | } 12 | 13 | val nodesToRemove = plan.nodes -- ancestorsOfSinks.collect { case node: NodeId => node } 14 | val sourcesToRemove = plan.sources -- ancestorsOfSinks.collect { case source: SourceId => source } 15 | 16 | val afterSourceRemoval = sourcesToRemove.foldLeft(plan) { 17 | case (curPlan, sourceToRemove) => curPlan.removeSource(sourceToRemove) 18 | } 19 | 20 | nodesToRemove.foldLeft((afterSourceRemoval, prefixes)) { 21 | case ((curPlan, curPrefixes), nodeToRemove) => (curPlan.removeNode(nodeToRemove), curPrefixes - nodeToRemove) 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/WeightedNode.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A mix-in that attaches a weight to a node that represents how often it must iterate 5 | * over its input. 6 | */ 7 | trait WeightedNode { 8 | val weight: Int 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/WeightedOperator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A mix-in that attaches a weight to an operator that represents how often it must iterate 5 | * over its input. 6 | */ 7 | trait WeightedOperator { 8 | val weight: Int 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/WorkflowUtils.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | object WorkflowUtils { 6 | /** 7 | * Return the number of items in each partition in an RDD. 8 | * @param rdd Input RDD. 9 | * @tparam T RDD Type. 10 | * @return A [[Map]] keyed by partition ID containing the number of elements in each partition of the RDD. 11 | */ 12 | def numPerPartition[T](rdd: RDD[T]): Map[Int, Int] = { 13 | rdd.mapPartitionsWithIndex { 14 | case (id, partition) => Iterator.single((id, partition.length)) 15 | }.collect().toMap 16 | } 17 | 18 | } -------------------------------------------------------------------------------- /src/test/python/images/pyconv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import csv 5 | import numpy as np 6 | from scipy.misc import imread, imsave 7 | from scipy.signal import convolve 8 | 9 | #This script convolves an image in python and its output is used in 10 | #the Convolver unit tests to ensure that convolver output matches 11 | #an equivalent python call. 
12 | 13 | #This script was run from src/test/resources/images/ as: 14 | #python pyconv.py gantrycrane.png convolved.gantrycrane.png convolved.gantrycrane.csv 15 | 16 | 17 | def main(): 18 | x = imread(sys.argv[1]) 19 | k1 = np.array([i for i in range(27)]).reshape((3,3,3)) 20 | out = np.sum(convolve(x, k1, mode='valid'), 2) 21 | imsave(sys.argv[2], out) 22 | cwriter = csv.writer(open(sys.argv[3], 'w')) 23 | for x in range(out.shape[0]): 24 | for y in range(out.shape[1]): 25 | cwriter.writerow([x,y,out[x,y]]) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /src/test/resources/aMat-1class.csv: -------------------------------------------------------------------------------- 1 | 0.10266850507085126,0.4499763204326901,-0.15374850502641021,-0.015879756324382748,0.4437926700329148,0.7808071690334957,-0.08218768514428863,0.48140007039716,-0.019712057549647364,0.2009836160928337,-0.8000566853661935,0.12167303371323324 2 | 0.44732263301010217,0.9951414121993158,0.8130665381040776,-1.183012821913078,-0.7081795326278753,-0.2365018666630304,1.1966589648301693,-1.2916743784290192,-0.09425629499384529,-0.9651145207437652,-0.8953331802899065,-0.9220777634896545 3 | -0.7623817369690132,0.9257676421568312,-1.4667522264035207,0.05272020922346383,1.2149725887284197,-0.8779025816833662,-0.762795288627363,0.39898952926221504,0.40825734564162786,3.103511435086207,1.5310257139379873,-0.6868105045330928 4 | -0.5008969913101462,0.4532396861574774,-0.29393358849474976,0.5592102787356051,0.6916956616970765,-1.3004633365428844,2.019373540599413,0.3652134453707413,1.910512585516455,2.751731295807471,1.059249138315071,0.10725052982484896 5 | -0.3530558373493292,1.0070284676996972,0.31828544648906393,-0.41233492717046566,0.45555494507753697,-1.7027192789791656,-2.405329542540906,-0.4703247395781227,-0.6821969614843767,-1.065966277390593,-0.8263294641770074,0.1788389733691391 6 | -------------------------------------------------------------------------------- /src/test/resources/bMat-1class.csv: -------------------------------------------------------------------------------- 1 | 1.0,-1.0,-1.0 2 | 1.0,-1.0,-1.0 3 | 1.0,-1.0,-1.0 4 | 1.0,-1.0,-1.0 5 | 1.0,-1.0,-1.0 6 | -------------------------------------------------------------------------------- /src/test/resources/bMat.csv: -------------------------------------------------------------------------------- 1 | 1.0,-1.0,-1.0 2 | 1.0,-1.0,-1.0 3 | 1.0,-1.0,-1.0 4 | 1.0,-1.0,-1.0 5 | 1.0,-1.0,-1.0 6 | -1.0,1.0,-1.0 7 | -1.0,1.0,-1.0 8 | -1.0,1.0,-1.0 9 | -1.0,1.0,-1.0 10 | -1.0,1.0,-1.0 11 | -1.0,-1.0,1.0 12 | -1.0,-1.0,1.0 13 | -1.0,-1.0,1.0 14 | -1.0,-1.0,1.0 15 | -1.0,-1.0,1.0 16 | -------------------------------------------------------------------------------- /src/test/resources/bMatShuffled.csv: -------------------------------------------------------------------------------- 1 | 1.0,-1.0,-1.0 2 | 1.0,-1.0,-1.0 3 | 1.0,-1.0,-1.0 4 | -1.0,1.0,-1.0 5 | -1.0,1.0,-1.0 6 | -1.0,1.0,-1.0 7 | -1.0,1.0,-1.0 8 | -1.0,1.0,-1.0 9 | -1.0,-1.0,1.0 10 | -1.0,-1.0,1.0 11 | -1.0,-1.0,1.0 12 | -1.0,-1.0,1.0 13 | -1.0,-1.0,1.0 14 | 1.0,-1.0,-1.0 15 | 1.0,-1.0,-1.0 16 | -------------------------------------------------------------------------------- /src/test/resources/images/000012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/000012.jpg 
-------------------------------------------------------------------------------- /src/test/resources/images/convolved.gantrycrane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/convolved.gantrycrane.png -------------------------------------------------------------------------------- /src/test/resources/images/gantrycrane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/gantrycrane.png -------------------------------------------------------------------------------- /src/test/resources/images/imagenet-test-labels: -------------------------------------------------------------------------------- 1 | n15075141 12 2 | -------------------------------------------------------------------------------- /src/test/resources/images/imagenet/n15075141.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/imagenet/n15075141.tar -------------------------------------------------------------------------------- /src/test/resources/images/voc/voctest.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/voc/voctest.tar -------------------------------------------------------------------------------- /src/test/scala/keystoneml/evaluation/BinaryClassifierEvaluatorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.evaluation 2 | 3 | import org.apache.spark.SparkContext 4 | import org.scalatest.FunSuite 5 | import keystoneml.utils.Stats 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class BinaryClassifierEvaluatorSuite extends FunSuite with PipelineContext { 9 | test("Multiclass keystoneml.evaluation metrics") { 10 | /* 11 | * Contingency table for binary classification with total 12 instances: 12 | * |6|2| true label: positive 13 | * |1|3| true label: negative 14 | */ 15 | sc = new SparkContext("local", "test") 16 | 17 | val predictionAndLabels = sc.parallelize( Seq.fill(6)((true, true)) ++ Seq.fill(2)((false, true)) 18 | ++ Seq.fill(1)((true, false)) ++ Seq.fill(3)((false, false)), 2) 19 | val metrics = BinaryClassifierEvaluator.evaluate(predictionAndLabels.map(_._1), predictionAndLabels.map(_._2)) 20 | 21 | assert(metrics.tp === 6) 22 | assert(metrics.fp === 1) 23 | assert(metrics.tn === 3) 24 | assert(metrics.fn === 2) 25 | 26 | assert(Stats.aboutEq(metrics.precision, 6.0/7.0)) 27 | assert(Stats.aboutEq(metrics.recall, 6.0/8.0)) 28 | assert(Stats.aboutEq(metrics.accuracy, 9.0/12.0)) 29 | assert(Stats.aboutEq(metrics.specificity, 3.0/4.0)) 30 | assert(Stats.aboutEq(metrics.fScore(), 2.0 * 6.0 / (2.0 * 6.0 + 2.0 + 1.0))) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/evaluation/MeanAveragePrecisionSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.evaluation 2 | 3 | import breeze.linalg.DenseVector 4 | import org.scalatest.FunSuite 5 | import org.apache.spark.SparkContext 6 | import 
keystoneml.utils.Stats 7 | import keystoneml.workflow.PipelineContext 8 | 9 | class MeanAveragePrecisionSuite extends FunSuite with PipelineContext { 10 | 11 | test("random map test") { 12 | sc = new SparkContext("local", "test") 13 | 14 | // Build some random test data with 4 classes 0,1,2,3 15 | val actual = List(Array(0, 3), Array(2), Array(1, 2), Array(0)) 16 | val actualRdd = sc.parallelize(actual) 17 | 18 | val predicted = List( 19 | DenseVector(0.1, -0.05, 0.12, 0.5), 20 | DenseVector(-0.23, -0.45, 0.23, 0.1), 21 | DenseVector(-0.34, -0.32, -0.66, 1.52), 22 | DenseVector(-0.1, -0.2, 0.5, 0.8)) 23 | 24 | val predictedRdd = sc.parallelize(predicted) 25 | 26 | val map = new MeanAveragePrecisionEvaluator(4).evaluate(predictedRdd, actualRdd) 27 | 28 | // Expected values from running this in MATLAB 29 | val expected = DenseVector(1.0, 0.3333, 0.5, 0.3333) 30 | 31 | assert(Stats.aboutEq(map, expected, 1e-4)) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/loaders/ImageNetLoaderSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.scalatest.FunSuite 4 | import org.apache.spark.SparkContext 5 | import keystoneml.utils.TestUtils 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class ImageNetLoaderSuite extends FunSuite with PipelineContext { 9 | test("load a sample of imagenet data") { 10 | sc = new SparkContext("local", "test") 11 | val dataPath = TestUtils.getTestResourceFileName("images/imagenet") 12 | val labelsPath = TestUtils.getTestResourceFileName("images/imagenet-test-labels") 13 | 14 | val imgs = ImageNetLoader.apply(sc, dataPath, labelsPath).collect() 15 | // We should have 5 images 16 | assert(imgs.length === 5) 17 | 18 | // The images should all have label 12 19 | assert(imgs.map(_.label).distinct.length === 1) 20 | assert(imgs.map(_.label).distinct.head === 12) 21 | 22 | // The image filenames should begin with n15075141 23 | assert(imgs.forall(_.filename.get.startsWith("n15075141")), "Image filenames should be correct") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/loaders/VOCLoaderSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.scalatest.FunSuite 4 | import org.apache.spark.SparkContext 5 | import keystoneml.utils.TestUtils 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class VOCLoaderSuite extends FunSuite with PipelineContext { 9 | test("load a sample of VOC data") { 10 | sc = new SparkContext("local", "test") 11 | val dataPath = TestUtils.getTestResourceFileName("images/voc") 12 | val labelsPath = TestUtils.getTestResourceFileName("images/voclabels.csv") 13 | 14 | val imgs = VOCLoader(sc, 15 | VOCDataPath(dataPath, "VOCdevkit/VOC2007/JPEGImages/", Some(1)), 16 | VOCLabelPath(labelsPath)).collect() 17 | 18 | // We should have 10 images 19 | assert(imgs.length === 10) 20 | 21 | // There should be one file whose name ends with "000104.jpg" 22 | val personMonitor = imgs.filter(_.filename.get.endsWith("000104.jpg")) 23 | assert(personMonitor.length === 1) 24 | 25 | // It should have two labels, 14 and 19. 26 | assert(personMonitor(0).label.contains(14) && personMonitor(0).label.contains(19)) 27 | 28 | // There should be 13 labels in total, with 9 distinct values.
29 | assert(imgs.map(_.label).flatten.length === 13) 30 | assert(imgs.map(_.label).flatten.distinct.length === 9) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/CenterCornerPatcherSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.utils.{ChannelMajorArrayVectorizedImage, ImageMetadata, TestUtils} 6 | 7 | class CenterCornerPatcherSuite extends FunSuite with Logging { 8 | 9 | test("check number and dimension of patches") { 10 | val image = TestUtils.loadTestImage("images/000012.jpg") 11 | val xDim = image.metadata.xDim 12 | val yDim = image.metadata.yDim 13 | val patchSizeX = xDim / 2 14 | val patchSizeY = yDim / 2 15 | 16 | val withFlipPatcher = CenterCornerPatcher(patchSizeX, patchSizeY, true) 17 | val withFlipPatches = withFlipPatcher.centerCornerPatchImage(image).toSeq 18 | 19 | assert(withFlipPatches.map(_.metadata.xDim).forall(_ == patchSizeX) && 20 | withFlipPatches.map(_.metadata.yDim).forall(_ == patchSizeY) && 21 | withFlipPatches.map(_.metadata.numChannels).forall(_ == image.metadata.numChannels), 22 | "All patches must have right dimensions") 23 | 24 | assert(withFlipPatches.size === 10, "Number of patches must match") 25 | 26 | val noFlipPatcher = CenterCornerPatcher(patchSizeX, patchSizeY, false) 27 | val noFlipPatches = noFlipPatcher.centerCornerPatchImage(image).toSeq 28 | 29 | assert(noFlipPatches.map(_.metadata.xDim).forall(_ == patchSizeX) && 30 | noFlipPatches.map(_.metadata.yDim).forall(_ == patchSizeY) && 31 | noFlipPatches.map(_.metadata.numChannels).forall(_ == image.metadata.numChannels), 32 | "All patches must have right dimensions") 33 | 34 | assert(noFlipPatches.size === 5, "Number of patches must match") 35 | } 36 | 37 | test("1x1 image patches") { 38 | val imgArr = 39 | (0 until 5).flatMap { x => 40 | (0 until 5).flatMap { y => 41 | (0 until 1).map { c => 42 | (c + x * 1 + y * 5 * 1).toDouble 43 | } 44 | } 45 | }.toArray 46 | 47 | val image = new ChannelMajorArrayVectorizedImage(imgArr, ImageMetadata(5, 5, 1)) 48 | val patchSizeX = 1 49 | val patchSizeY = 1 50 | 51 | val noFlipPatcher = CenterCornerPatcher(patchSizeX, patchSizeY, false) 52 | val noFlipPatches = noFlipPatcher.centerCornerPatchImage(image).toSeq 53 | 54 | assert(noFlipPatches.length === 5) 55 | // NOTE(shivaram): This assumes order of patches returned stays the same. 
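// For 1x1 patches on the 5x5 single-channel image above (pixel value = x + 5*y), the five patches should be the four corner pixels and the center pixel, i.e. the values 0, 20, 4, 24 (corners) and 12 (center) checked below.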
56 | assert(noFlipPatches(0).get(0, 0, 0) === 0.0) 57 | assert(noFlipPatches(1).get(0, 0, 0) === 20.0) 58 | assert(noFlipPatches(2).get(0, 0, 0) === 4.0) 59 | assert(noFlipPatches(3).get(0, 0, 0) === 24.0) 60 | assert(noFlipPatches(4).get(0, 0, 0) === 12.0) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/DaisyExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg._ 4 | import keystoneml.nodes.images.external.SIFTExtractor 5 | import org.scalatest.FunSuite 6 | 7 | import keystoneml.pipelines.Logging 8 | import keystoneml.utils.{ImageUtils, Stats, TestUtils} 9 | 10 | class DaisyExtractorSuite extends FunSuite with Logging { 11 | test("Load an Image and compute Daisy Features") { 12 | val testImage = TestUtils.loadTestImage("images/gantrycrane.png") 13 | val grayImage = ImageUtils.toGrayScale(testImage) 14 | 15 | val df = new DaisyExtractor() 16 | val daisyDescriptors = convert(df.apply(grayImage), Double) 17 | 18 | val firstKeyPointSum = sum(daisyDescriptors(::, 0)) 19 | val fullFeatureSum = sum(daisyDescriptors) 20 | 21 | // Values found from running matlab code on same input file. 22 | val matlabFirstKeyPointSum = 55.127217737738533 23 | val matlabFullFeatureSum = 3.240635661296463E5 24 | 25 | // TODO: This should be at most 1e-8 as we are using Floats. But its 1e-5, 1e-7 right now ? 26 | assert(Stats.aboutEq( 27 | (firstKeyPointSum - matlabFirstKeyPointSum)/matlabFirstKeyPointSum, 0, 1e-5), 28 | "First keypoint sum must match for Daisy") 29 | assert(Stats.aboutEq((fullFeatureSum - matlabFullFeatureSum)/matlabFullFeatureSum, 0, 1e-7), 30 | "Sum of Daisys must match expected sum") 31 | } 32 | 33 | test("Daisy and SIFT extractors should have same row/column ordering.") { 34 | val testImage = TestUtils.loadTestImage("images/gantrycrane.png") 35 | val grayImage = ImageUtils.toGrayScale(testImage) 36 | 37 | val df = new DaisyExtractor() 38 | val daisyDescriptors = convert(df.apply(grayImage), Double) 39 | 40 | val se = SIFTExtractor(scaleStep = 2) 41 | val siftDescriptors = se.apply(grayImage) 42 | 43 | assert(daisyDescriptors.rows == df.daisyFeatureSize && siftDescriptors.rows == se.descriptorSize) 44 | 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/HogExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg._ 4 | import org.scalatest.FunSuite 5 | 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.utils.{ImageUtils, Stats, TestUtils} 8 | 9 | class HogExtractorSuite extends FunSuite with Logging { 10 | test("Load an Image and compute Hog Features") { 11 | val testImage = TestUtils.loadTestImage("images/gantrycrane.png") 12 | 13 | // NOTE: The MATLAB implementation from voc-release5 uses 14 | // images in double range -- So convert our image by rescaling 15 | val testImageScaled = ImageUtils.mapPixels(testImage, x => x/255.0) 16 | 17 | val binSize = 50 18 | val hog = new HogExtractor(binSize) 19 | val descriptors = hog.apply(testImageScaled) 20 | 21 | val ourSum = sum(descriptors) 22 | val matlabSum = 59.2162514 23 | 24 | assert(Stats.aboutEq((ourSum - matlabSum) / ourSum, 0, 1e-8), 25 | "Hog features sum should match") 26 | 27 | // With a smaller bin size 28 | val hog1 = new 
HogExtractor(binSize=8) 29 | val descriptors1 = hog1.apply(testImageScaled) 30 | 31 | val matlabSum1 = 4.5775269e+03 32 | val ourSum1 = sum(descriptors1) 33 | 34 | // TODO: Figure out why error is a bit higher here ? 35 | assert(Stats.aboutEq((ourSum1 - matlabSum1) / ourSum1, 0, 1e-4), 36 | "Hog features sum should match") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/LCSExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg._ 4 | import org.scalatest.FunSuite 5 | 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.utils.{ImageUtils, Stats, TestUtils} 8 | 9 | class LCSExtractorSuite extends FunSuite with Logging { 10 | test("Load an Image and compute LCS Features") { 11 | val testImage = TestUtils.loadTestImage("images/gantrycrane.png") 12 | 13 | val lf = new LCSExtractor(stride=4, subPatchSize=6, strideStart=16) 14 | val lcsDescriptors = convert(lf.apply(testImage), Double) 15 | 16 | val firstKeyPointSum = sum(lcsDescriptors(::, 0)) 17 | val fullFeatureSum = sum(lcsDescriptors) 18 | 19 | // Values found from running matlab code on same input file. 20 | val matlabFirstKeyPointSum = 3.786557667540610e+03 21 | val matlabFullFeatureSum = 3.171963632855949e+07 22 | 23 | assert( 24 | Stats.aboutEq((firstKeyPointSum - matlabFirstKeyPointSum)/matlabFirstKeyPointSum, 0, 1e-8), 25 | "First keypoint sum must match for LCS") 26 | assert(Stats.aboutEq((fullFeatureSum - matlabFullFeatureSum)/matlabFullFeatureSum, 0, 1e-8), 27 | "Sum of LCS must match expected sum") 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/PoolingSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.{DenseVector, sum} 4 | import keystoneml.nodes._ 5 | import org.scalatest.FunSuite 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.utils.{ChannelMajorArrayVectorizedImage, ImageMetadata} 8 | 9 | class PoolingSuite extends FunSuite with Logging { 10 | 11 | test("pooling") { 12 | val imgArr = 13 | (0 until 4).flatMap { x => 14 | (0 until 4).flatMap { y => 15 | (0 until 1).map { c => 16 | (c + x * 1 + y * 4 * 1).toDouble 17 | } 18 | } 19 | }.toArray 20 | 21 | val image = new ChannelMajorArrayVectorizedImage(imgArr, ImageMetadata(4, 4, 1)) 22 | val pooling = new Pooler(2, 2, x => x, x => x.max) 23 | 24 | val poolImage = pooling(image) 25 | 26 | assert(poolImage.get(0, 0, 0) === 5.0) 27 | assert(poolImage.get(0, 1, 0) === 7.0) 28 | assert(poolImage.get(1, 0, 0) === 13.0) 29 | assert(poolImage.get(1, 1, 0) === 15.0) 30 | } 31 | 32 | test("pooling odd") { 33 | val hogImgSize = 14 34 | val convSizes = List(1, 2, 3, 4, 6, 8) 35 | convSizes.foreach { convSize => 36 | val convResSize = hogImgSize - convSize + 1 37 | 38 | val imgArr = 39 | (0 until convResSize).flatMap { x => 40 | (0 until convResSize).flatMap { y => 41 | (0 until 1000).map { c => 42 | (c + x * 1 + y * 4 * 1).toDouble 43 | } 44 | } 45 | }.toArray 46 | 47 | val image = new ChannelMajorArrayVectorizedImage( 48 | imgArr, ImageMetadata(convResSize, convResSize, 1000)) 49 | 50 | val poolSizeReqd = math.ceil(convResSize / 2.0).toInt 51 | 52 | // We want poolSize to be even !! 
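// e.g. poolSizeReqd = 7 gives poolSize = ceil(7 / 2.0) * 2 = 8, while poolSizeReqd = 6 stays at 6.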
53 | val poolSize = (math.ceil(poolSizeReqd / 2.0) * 2).toInt 54 | // overlap as little as possible 55 | val poolStride = convResSize - poolSize 56 | 57 | 58 | println(s"VALUES: $convSize $convResSize $poolSizeReqd $poolSize $poolStride") 59 | 60 | def summ(x: DenseVector[Double]): Double = sum(x) 61 | 62 | val pooling = new Pooler(poolStride, poolSize, identity, summ) 63 | val poolImage = pooling(image) 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/RandomPatcherSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.utils.{ChannelMajorArrayVectorizedImage, ImageMetadata, TestUtils} 6 | 7 | class RandomPatcherSuite extends FunSuite with Logging { 8 | 9 | test("patch dimensions, number") { 10 | val image = TestUtils.loadTestImage("images/000012.jpg") 11 | val xDim = image.metadata.xDim 12 | val yDim = image.metadata.yDim 13 | val patchSizeX = xDim / 2 14 | val patchSizeY = yDim / 2 15 | val numPatches = 5 16 | 17 | val patcher = RandomPatcher(numPatches, patchSizeX, patchSizeY) 18 | 19 | val patches = patcher.randomPatchImage(image).toSeq 20 | 21 | assert(patches.map(_.metadata.xDim).forall(_ == patchSizeX) && 22 | patches.map(_.metadata.yDim).forall(_ == patchSizeY) && 23 | patches.map(_.metadata.numChannels).forall(_ == image.metadata.numChannels), 24 | "All patches must have right dimensions") 25 | 26 | assert(patches.size === numPatches, 27 | "Number of patches must match argument passed in") 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/SIFTExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images.external 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.utils.{ImageUtils, TestUtils} 6 | 7 | class SIFTExtractorSuite extends FunSuite with Logging { 8 | test("Test Sift on a single image RDD, scaleStep=1 and scaleStep=0, 0 should have more descriptors") { 9 | val testImage = TestUtils.loadTestImage("images/000012.jpg") 10 | val singleImage = ImageUtils.mapPixels(testImage, _/255.0) 11 | val grayImage = ImageUtils.toGrayScale(singleImage) 12 | 13 | val se1 = SIFTExtractor(scaleStep = 1) 14 | val res1 = se1(grayImage) 15 | 16 | val se0 = SIFTExtractor(scaleStep = 0) 17 | val res0 = se0(grayImage) 18 | 19 | logInfo(s"Scale 1 shape is: ${res1.rows}x${res1.cols}") 20 | logInfo(s"Scale 0 shape is: ${res0.rows}x${res0.cols}") 21 | 22 | assert(res1.cols < res0.cols) 23 | 24 | } 25 | 26 | } -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/learning/BlockLinearMapperSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg.{DenseVector, DenseMatrix} 4 | import breeze.stats.distributions.Rand 5 | import keystoneml.workflow.PipelineContext 6 | import scala.collection.mutable.ArrayBuffer 7 | 8 | import org.scalatest.FunSuite 9 | 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.rdd.RDD 12 | 13 | import keystoneml.pipelines._ 14 | import keystoneml.utils.Stats 15 | 16 | class BlockLinearMapperSuite extends FunSuite with 
PipelineContext with Logging { 17 | 18 | test("BlockLinearMapper transformation") { 19 | sc = new SparkContext("local", "test") 20 | 21 | val inDims = 1000 22 | val outDims = 100 23 | val numChunks = 5 24 | val numPerChunk = inDims/numChunks 25 | 26 | val mat = DenseMatrix.rand(inDims, outDims, Rand.gaussian) 27 | val vec = DenseVector.rand(inDims, Rand.gaussian) 28 | val intercept = DenseVector.rand(outDims, Rand.gaussian) 29 | 30 | val splitVec = (0 until numChunks).map(i => vec((numPerChunk*i) until (numPerChunk*i + numPerChunk))) 31 | val splitMat = (0 until numChunks).map(i => mat((numPerChunk*i) until (numPerChunk*i + numPerChunk), ::)) 32 | 33 | val linearMapper = new LinearMapper[DenseVector[Double]](mat, Some(intercept)) 34 | val blockLinearMapper = new BlockLinearMapper(splitMat, numPerChunk, Some(intercept)) 35 | 36 | val linearOut = linearMapper(vec) 37 | 38 | // Test with intercept 39 | assert(Stats.aboutEq(blockLinearMapper(vec), linearOut, 1e-4)) 40 | 41 | // Test the apply and evaluate call 42 | val blmOuts = new ArrayBuffer[RDD[DenseVector[Double]]] 43 | val splitVecRDDs = splitVec.map { vec => 44 | sc.parallelize(Seq(vec), 1) 45 | } 46 | blockLinearMapper.applyAndEvaluate(splitVecRDDs, 47 | (predictedValues: RDD[DenseVector[Double]]) => { 48 | blmOuts += predictedValues 49 | () 50 | } 51 | ) 52 | 53 | // The last blmOut should match the linear mapper's output 54 | assert(Stats.aboutEq(blmOuts.last.collect()(0), linearOut, 1e-4)) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/learning/KernelModelSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | 5 | import org.apache.spark.SparkContext 6 | import org.scalatest.FunSuite 7 | 8 | import keystoneml.workflow.PipelineContext 9 | import keystoneml.utils.{MatrixUtils, Stats} 10 | 11 | class KernelModelSuite extends FunSuite with PipelineContext { 12 | 13 | test("KernelModel XOR test") { 14 | sc = new SparkContext("local", "test") 15 | 16 | val x = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0),DenseVector(1.0, -1.0)) 17 | val xTest = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0)) 18 | val y = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0), DenseVector(1.0, 0.0)) 19 | val yTest = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0)) 20 | 21 | val xRDD = sc.parallelize(x, 2) 22 | val yRDD = sc.parallelize(y, 2) 23 | val xTestRDD = sc.parallelize(xTest, 2) 24 | 25 | val gaussian = new GaussianKernelGenerator(10) 26 | // Set block size to number of data points so no blocking happens 27 | val clf = new KernelRidgeRegression(gaussian, 0, 4, 2) 28 | 29 | val kernelModel = clf.fit(xRDD, yRDD) 30 | val yHat = kernelModel(xTestRDD).collect() 31 | // Fit should be good 32 | val delta = MatrixUtils.rowsToMatrix(yHat) - MatrixUtils.rowsToMatrix(yTest) 33 | 34 | delta :*= delta 35 | println("SUM OF DELTA1 " + sum(delta)) 36 | assert(Stats.aboutEq(sum(delta), 0, 1e-4)) 37 | } 38 | 39 | test("KernelModel XOR blocked test") { 40 | sc = new SparkContext("local", "test") 41 | 42 | val x = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0),DenseVector(1.0, -1.0)) 43 | val xTest = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0)) 44 | val y = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), 
DenseVector(1.0, 0.0), DenseVector(1.0, 0.0)) 45 | val yTest = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0)) 46 | 47 | val xRDD = sc.parallelize(x, 2) 48 | val yRDD = sc.parallelize(y, 2) 49 | val xTestRDD = sc.parallelize(xTest, 2) 50 | 51 | val gaussian = new GaussianKernelGenerator(10) 52 | 53 | // Set block size to half number of data points so blocking happens 54 | val clf = new KernelRidgeRegression(gaussian, 0, 2, 2) 55 | 56 | val kernelModel = clf.fit(xRDD, yRDD) 57 | val yHat = kernelModel(xTestRDD).collect() 58 | // Fit should be good 59 | val delta = MatrixUtils.rowsToMatrix(yHat) - MatrixUtils.rowsToMatrix(yTest) 60 | 61 | delta :*= delta 62 | assert(Stats.aboutEq(sum(delta), 0, 1e-4)) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/learning/ZCAWhiteningSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import breeze.stats.distributions._ 6 | import org.scalatest.FunSuite 7 | import keystoneml.pipelines._ 8 | import keystoneml.workflow.PipelineContext 9 | 10 | class ZCAWhiteningSuite extends FunSuite with PipelineContext with Logging { 11 | 12 | val nrows = 10000 13 | val ndim = 10 14 | 15 | val x = DenseMatrix.rand[Double](nrows, ndim, Gaussian(0.0, 1.0)) 16 | 17 | def fitAndCompare(x: DenseMatrix[Double], eps: Double, thresh: Double): Boolean = { 18 | val whitener = new ZCAWhitenerEstimator(eps).fitSingle(x) 19 | 20 | val wx = whitener(x) 21 | 22 | //Checks max(max(abs(cov(whiten(x))) - eye(10)) < sqrt(eps) 23 | max(abs(cov(convert(wx, Double)) - DenseMatrix.eye[Double](ndim))) < thresh 24 | } 25 | 26 | test("whitening with small epsilon") { 27 | assert(fitAndCompare(x, 1e-12, 1e-4), 28 | "Whitening the base matrix should produce unit variance and zero covariance.") 29 | } 30 | 31 | test("whitening with large epsilon") { 32 | assert(fitAndCompare(x, 0.1, 0.1), 33 | "Whitening the base matrix should produce unit variance and zero covariance.") 34 | 35 | assert(!fitAndCompare(x, 0.1, 1e-4), 36 | "Whitening the base matrix with a large epsilon should be somewhat noisy.") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/misc/SparseFeatureVectorizerSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.misc 2 | 3 | import keystoneml.nodes.util.{SparseFeatureVectorizer, AllSparseFeatures, CommonSparseFeatures} 4 | import org.apache.spark.SparkContext 5 | import org.scalatest.FunSuite 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.workflow.PipelineContext 8 | 9 | class SparseFeatureVectorizerSuite extends FunSuite with PipelineContext with Logging { 10 | test("sparse feature vectorization") { 11 | sc = new SparkContext("local", "test") 12 | 13 | val featureVectorizer = new SparseFeatureVectorizer(Map("First" -> 0, "Second" -> 1, "Third" -> 2)) 14 | val test = Seq(("Third", 4.0), ("Fourth", 6.0), ("First", 1.0)) 15 | val vector = featureVectorizer.apply(sc.parallelize(Seq(test))).first() 16 | 17 | assert(vector.size == 3) 18 | assert(vector(0) == 1) 19 | assert(vector(1) == 0) 20 | assert(vector(2) == 4) 21 | } 22 | 23 | test("all sparse feature selection") { 24 | sc = new SparkContext("local", "test") 25 | val train = sc.parallelize(List(Seq(("First", 0.0), 
("Second", 6.0)), Seq(("Third", 3.0), ("Second", 4.0)))) 26 | 27 | val featureVectorizer = AllSparseFeatures().fit(train.map(x => x)) 28 | // The selected features should now be "First", "Second", and "Third" 29 | 30 | val test = Seq(("Third", 4.0), ("Fourth", 6.0), ("First", 1.0)) 31 | val out = featureVectorizer.apply(sc.parallelize(Seq(test))).first().toArray 32 | 33 | assert(out === Array(1.0, 0.0, 4.0)) 34 | } 35 | 36 | test("common sparse feature selection") { 37 | sc = new SparkContext("local", "test") 38 | val train = sc.parallelize(List( 39 | Seq(("First", 0.0), ("Second", 6.0)), 40 | Seq(("Third", 3.0), ("Second", 4.8)), 41 | Seq(("Third", 7.0), ("Fourth", 5.0)), 42 | Seq(("Fifth", 5.0), ("Second", 7.3)) 43 | )) 44 | 45 | val featureVectorizer = CommonSparseFeatures(2).fit(train.map(x => x)) 46 | // The selected features should now be "Second", and "Third" 47 | 48 | val test = Seq(("Third", 4.0), ("Seventh", 8.0), ("Second", 1.3), ("Fourth", 6.0), ("First", 1.0)) 49 | val out = featureVectorizer.apply(sc.parallelize(Seq(test))).first().toArray 50 | 51 | assert(out === Array(1.3, 4.0)) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/misc/TermFrequencySuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.misc 2 | 3 | import keystoneml.nodes.stats.TermFrequency 4 | import org.apache.spark.SparkContext 5 | import org.scalatest.FunSuite 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class TermFrequencySuite extends FunSuite with PipelineContext { 9 | test("term frequency of simple strings") { 10 | sc = new SparkContext("local", "test") 11 | val in = Seq(Seq[Any]("b", "a", "c", "b", "b", "a", "b")) 12 | val out = TermFrequency().apply(sc.parallelize(in)).first().toMap 13 | assert(out === Map("a" -> 2, "b" -> 4, "c" -> 1)) 14 | } 15 | 16 | test("term frequency of varying types") { 17 | sc = new SparkContext("local", "test") 18 | val in = Seq(Seq("b", "a", "c", ("b", "b"), ("b", "b"), 12, 12, "a", "b", 12)) 19 | val out = TermFrequency().apply(sc.parallelize(in)).first().toMap 20 | assert(out === Map("a" -> 2, "b" -> 2, "c" -> 1, ("b", "b") -> 2, 12 -> 3)) 21 | } 22 | 23 | test("log term frequency") { 24 | sc = new SparkContext("local", "test") 25 | val in = Seq(Seq[Any]("b", "a", "c", "b", "b", "a", "b")) 26 | val out = TermFrequency(x => math.log(x + 1)).apply(sc.parallelize(in)).first().toMap 27 | assert(out === Map("a" -> math.log(3), "b" -> math.log(5), "c" -> math.log(2))) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/CoreNLPFeatureExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.apache.spark.SparkContext 4 | import org.scalatest.FunSuite 5 | import keystoneml.pipelines.Logging 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class CoreNLPFeatureExtractorSuite extends FunSuite with PipelineContext with Logging { 9 | test("lemmatization") { 10 | sc = new SparkContext("local", "test") 11 | 12 | val text = "jumping snakes lakes oceans hunted" 13 | val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet 14 | 15 | // Make sure at least very simple cases were lemmatized 16 | assert(tokens.contains("jump")) 17 | assert(tokens.contains("snake")) 18 | assert(tokens.contains("lake")) 19 | 
assert(tokens.contains("ocean")) 20 | assert(tokens.contains("hunt")) 21 | 22 | // Assert the unlemmatized tokens are no longer there 23 | assert(!tokens.contains("jumping")) 24 | assert(!tokens.contains("snakes")) 25 | assert(!tokens.contains("oceans")) 26 | assert(!tokens.contains("lakes")) 27 | assert(!tokens.contains("hunted")) 28 | } 29 | 30 | test("entity extraction") { 31 | sc = new SparkContext("local", "test") 32 | 33 | val text = "John likes cake and he lives in Florida" 34 | val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet 35 | 36 | // Make sure at least very simple entities were identified and extracted 37 | assert(tokens.contains("PERSON")) 38 | assert(tokens.contains("LOCATION")) 39 | 40 | // Assert the original tokens are no longer there 41 | assert(!tokens.contains("John")) 42 | assert(!tokens.contains("Florida")) 43 | } 44 | 45 | test("1-2-3-grams") { 46 | sc = new SparkContext("local", "test") 47 | 48 | val text = "a b c d" 49 | val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet 50 | 51 | // Make sure expected unigrams appear 52 | assert(tokens.contains("a")) 53 | assert(tokens.contains("b")) 54 | assert(tokens.contains("c")) 55 | assert(tokens.contains("d")) 56 | 57 | // Make sure expected bigrams appear 58 | assert(tokens.contains("a b")) 59 | assert(tokens.contains("b c")) 60 | assert(tokens.contains("c d")) 61 | 62 | // Make sure expected 3-grams appear 63 | assert(tokens.contains("a b c")) 64 | assert(tokens.contains("b c d")) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/HashingTFSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.workflow.PipelineContext 5 | 6 | class HashingTFSuite extends FunSuite with PipelineContext { 7 | 8 | test("HashingTF with no collisions") { 9 | val dims = 4000 10 | val hashingTF = HashingTF[Seq[String]](dims) 11 | 12 | val testDatum = Seq("1", "2", "4", "4", "4", "4", "2") 13 | 14 | val vector = hashingTF(testDatum) 15 | 16 | // Assert that the vector is actually sparse and has the right number of active positions 17 | assert(vector.activeSize === 3) 18 | assert(vector.length === dims) 19 | 20 | val termFrequenciesSet = vector.toArray.toSet 21 | 22 | // Assert that there are indices with all of the correct values 23 | assert(termFrequenciesSet === Set(0, 1, 2, 4)) 24 | } 25 | 26 | test("HashingTF with collisions") { 27 | val hashingTF = HashingTF[Seq[String]](2) 28 | 29 | val testDatum = Seq("1", "2", "4", "4", "4", "4", "2") 30 | 31 | val vector = hashingTF(testDatum) 32 | assert(vector.activeSize === 2) 33 | assert(vector.length === 2) 34 | 35 | // Assert that the sum of the tf's is still correct even though there were collisions 36 | assert(vector.toArray.sum === testDatum.size) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/NGramIndexerSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.scalatest.FunSuite 4 | 5 | class NGramIndexerSuite extends FunSuite { 6 | 7 | test("pack()") { 8 | require(NaiveBitPackIndexer.pack(Seq(1)) == math.pow(2, 40).toLong) 9 | 10 | require(NaiveBitPackIndexer.pack(Seq(1, 1)) == 11 | math.pow(2, 40).toLong + math.pow(2, 
20).toLong + math.pow(2, 60).toLong) 12 | 13 | require(NaiveBitPackIndexer.pack(Seq(1, 1, 1)) == 14 | 1 + math.pow(2, 40).toLong + math.pow(2, 20).toLong + math.pow(2, 61).toLong) 15 | 16 | val ngramIndexer = new NGramIndexerImpl[Int] 17 | val seq = ngramIndexer.minNgramOrder to ngramIndexer.maxNgramOrder 18 | require(ngramIndexer.pack(seq).equals(new NGram(seq))) 19 | } 20 | 21 | test("removeFarthestWord()") { 22 | def testWith[Word >: Int, Ngram](indexer: BackoffIndexer[Word, Ngram]) = { 23 | var ngramId = indexer.pack(Seq(1, 2, 3)) 24 | var context = indexer.removeFarthestWord(ngramId) 25 | var expected = indexer.pack(Seq(2, 3)) 26 | require(context == expected, s"actual $context, expected $expected") 27 | 28 | ngramId = indexer.pack(Seq(1, 2)) 29 | context = indexer.removeFarthestWord(ngramId) 30 | expected = indexer.pack(Seq(2)) 31 | require(context == expected, s"actual $context, expected $expected") 32 | } 33 | 34 | testWith(new NGramIndexerImpl[Int]) 35 | testWith(NaiveBitPackIndexer) 36 | } 37 | 38 | test("removeCurrentWord()") { 39 | def testWith[Word >: Int, Ngram](indexer: BackoffIndexer[Word, Ngram]) = { 40 | var ngramId = indexer.pack(Seq(1, 2, 3)) 41 | var context = indexer.removeCurrentWord(ngramId) 42 | var expected = indexer.pack(Seq(1, 2)) 43 | require(context == expected, s"actual $context, expected $expected") 44 | 45 | ngramId = indexer.pack(Seq(1, 2)) 46 | context = indexer.removeCurrentWord(ngramId) 47 | expected = indexer.pack(Seq(1)) 48 | require(context == expected, s"actual $context, expected $expected") 49 | } 50 | 51 | testWith(new NGramIndexerImpl[Int]) 52 | testWith(NaiveBitPackIndexer) 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/NGramsHashingTFSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.workflow.PipelineContext 5 | 6 | class NGramsHashingTFSuite extends FunSuite with PipelineContext { 7 | 8 | test("NGramsHashingTF 1 to 1") { 9 | val dims = 40000 10 | 11 | val testDatum = "this sentence is a sentence is the some there some then there some".split(" ") 12 | val ngrams = NGramsFeaturizer(1 to 1).apply(testDatum) 13 | val tfVector = HashingTF(dims).apply(ngrams) 14 | 15 | val ngramsHashingTFVector = NGramsHashingTF(1 to 1, dims).apply(testDatum) 16 | 17 | // Assert that the NGramsHashingTF node returns the same output as first getting n-grams then hashing 18 | assert(ngramsHashingTFVector === tfVector) 19 | } 20 | 21 | test("NGramsHashingTF 1 to 3") { 22 | val dims = 40000 23 | 24 | val testDatum = "this sentence is a sentence is the some there some then there some".split(" ") 25 | val ngrams = NGramsFeaturizer(1 to 3).apply(testDatum) 26 | val tfVector = HashingTF(dims).apply(ngrams) 27 | 28 | val ngramsHashingTFVector = NGramsHashingTF(1 to 3, dims).apply(testDatum) 29 | 30 | // Assert that the NGramsHashingTF node returns the same output as first getting n-grams then hashing 31 | assert(ngramsHashingTFVector === tfVector) 32 | } 33 | 34 | test("NGramsHashingTF 2 to 3") { 35 | val dims = 40000 36 | 37 | val testDatum = "this sentence is a sentence is the some there some then there some".split(" ") 38 | val ngrams = NGramsFeaturizer(2 to 3).apply(testDatum) 39 | val tfVector = HashingTF(dims).apply(ngrams) 40 | 41 | val ngramsHashingTFVector = NGramsHashingTF(2 to 3, dims).apply(testDatum) 42 | 43 | // Assert that the 
NGramsHashingTF node returns the same output as first getting n-grams then hashing 44 | assert(ngramsHashingTFVector === tfVector) 45 | } 46 | 47 | test("NGramsHashingTF with collisions 1 to 3") { 48 | val dims = 6 49 | 50 | val testDatum = "this sentence is a sentence is the some there some then there some".split(" ") 51 | val ngrams = NGramsFeaturizer(1 to 3).apply(testDatum) 52 | val tfVector = HashingTF(dims).apply(ngrams) 53 | 54 | val ngramsHashingTFVector = NGramsHashingTF(1 to 3, dims).apply(testDatum) 55 | 56 | // Assert that the NGramsHashingTF node returns the same output as first getting n-grams then hashing 57 | assert(ngramsHashingTFVector === tfVector) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/StringUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.apache.spark.SparkContext 4 | import org.scalatest.FunSuite 5 | import keystoneml.workflow.PipelineContext 6 | 7 | class StringUtilsSuite extends FunSuite with PipelineContext { 8 | val stringToManip = Array(" The quick BROWN fo.X ", " ! !.,)JumpeD. ovER the LAZy DOG.. ! ") 9 | test("trim") { 10 | sc = new SparkContext("local", "test") 11 | val out = Trim.apply(sc.parallelize(stringToManip, 1)).collect().toSeq 12 | assert(out === Seq("The quick BROWN fo.X", "! !.,)JumpeD. ovER the LAZy DOG.. !")) 13 | } 14 | 15 | test("lower case") { 16 | sc = new SparkContext("local", "test") 17 | val out = LowerCase().apply(sc.parallelize(stringToManip, 1)).collect().toSeq 18 | assert(out === Seq(" the quick brown fo.x ", " ! !.,)jumped. over the lazy dog.. ! ")) 19 | } 20 | 21 | test("tokenizer") { 22 | sc = new SparkContext("local", "test") 23 | val out = Tokenizer().apply(sc.parallelize(stringToManip, 1)).collect().toSeq 24 | assert(out === Seq(Seq("", "The", "quick", "BROWN", "fo", "X"), Seq("", "JumpeD", "ovER", "the", "LAZy", "DOG"))) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/WordFrequencyEncoderSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import org.scalatest.FunSuite 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class WordFrequencyEncoderSuite extends FunSuite with PipelineContext { 9 | 10 | val text = Seq("Winter coming", "Winter Winter is coming") 11 | 12 | test("WordFrequencyEncoder") { 13 | sc = new SparkContext("local[2]", "WordFrequencyEncoderSuite") 14 | val rdd = Tokenizer()(sc.parallelize(text, 2)) 15 | val encoder = WordFrequencyEncoder.fit(rdd) 16 | 17 | assert(encoder(rdd).collect().sameElements(Seq(Seq(0, 1), Seq(0, 0, 2, 1))), 18 | "frequency-encoded result incorrect") 19 | assert(encoder.unigramCounts === Map(0 -> 3, 1 -> 2, 2 -> 1), 20 | "fitted value unigramCounts incorrect") 21 | 22 | assert(encoder(sc.parallelize(Seq(Seq("hi")), 1)).collect() === Array(Seq(-1)), 23 | "OOV words not mapped to -1") 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/CosineRandomFeaturesSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics.cos 5 | import breeze.stats._ 6 | import 
breeze.stats.distributions.{CauchyDistribution, Rand} 7 | import org.scalatest.FunSuite 8 | import keystoneml.utils.Stats 9 | 10 | 11 | class CosineRandomFeaturesSuite extends FunSuite { 12 | val gamma = 1.34 13 | val numInputFeatures = 400 14 | val numOutputFeatures = 1000 15 | 16 | test("Guassian cosine random features") { 17 | val rf = CosineRandomFeatures(numInputFeatures, numOutputFeatures, gamma) 18 | 19 | // Check that b is uniform 20 | assert(max(rf.b) <= 2*math.Pi) 21 | assert(min(rf.b) >= 0) 22 | assert(rf.b.size == numOutputFeatures) 23 | 24 | // Check that W is gaussian 25 | assert(rf.W.rows == numOutputFeatures) 26 | assert(rf.W.cols == numInputFeatures) 27 | assert(Stats.aboutEq(mean(rf.W),0, 10e-3 * gamma)) 28 | assert(Stats.aboutEq(variance(rf.W), gamma * gamma, 10e-3 * gamma * gamma)) 29 | 30 | //check the mapping 31 | val in = DenseVector.rand(numInputFeatures, Rand.uniform) 32 | val out = cos((in.t * rf.W.t).t + rf.b) 33 | assert(Stats.aboutEq(rf(in), out, 10e-3)) 34 | } 35 | 36 | test("Cauchy cosine random features") { 37 | val rf = CosineRandomFeatures( 38 | numInputFeatures, 39 | numOutputFeatures, 40 | gamma, 41 | new CauchyDistribution(0, 1)) 42 | 43 | // Check that b is uniform 44 | assert(max(rf.b) <= 2*math.Pi) 45 | assert(min(rf.b) >= 0) 46 | assert(rf.b.size == numOutputFeatures) 47 | 48 | // Check that W is cauchy 49 | assert(rf.W.rows == numOutputFeatures) 50 | assert(rf.W.cols == numInputFeatures) 51 | assert(Stats.aboutEq(median(rf.W),0,10e-3 * gamma)) 52 | 53 | //check the mapping 54 | val in = DenseVector.rand(numInputFeatures, Rand.uniform) 55 | val out = cos((in.t * rf.W.t).t + rf.b) 56 | assert(Stats.aboutEq(rf(in), out, 10e-3)) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/LinearRectifierSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseMatrix 4 | import breeze.stats.distributions.Rand 5 | import org.apache.spark.SparkContext 6 | import org.scalatest.FunSuite 7 | import keystoneml.pipelines._ 8 | import keystoneml.utils.{TestUtils, MatrixUtils} 9 | import keystoneml.workflow.PipelineContext 10 | 11 | class LinearRectifierSuite extends FunSuite with PipelineContext with Logging { 12 | 13 | test("Test MaxVal") { 14 | sc = new SparkContext("local", "test") 15 | val matrixParts = TestUtils.createRandomMatrix(sc, 128, 16, 4).rdd.map(_.mat) 16 | 17 | val x = matrixParts.flatMap(y => MatrixUtils.matrixToRowArray(y)) 18 | val y = x.map(r => r.forall(_ >= 0.0)) 19 | 20 | val valmaxNode = LinearRectifier() 21 | val maxy = valmaxNode.apply(x).map(r => r.forall(_ >= 0.0)) 22 | 23 | //The random matrix should *not* all be >= 0 24 | assert(!y.reduce {(a,b) => a | b}) 25 | 26 | //The valmax'ed random matrix *should* all be >= 0. 
27 | assert(maxy.reduce {(a,b) => a | b}) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/PaddedFFTSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import org.apache.spark.SparkContext 5 | import org.scalatest.FunSuite 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.utils.Stats 8 | import keystoneml.workflow.PipelineContext 9 | 10 | 11 | class PaddedFFTSuite extends FunSuite with PipelineContext with Logging { 12 | test("Test PaddedFFT node") { 13 | sc = new SparkContext("local", "test") 14 | 15 | // Set up a test matrix. 16 | val ones = DenseVector.zeros[Double](100) 17 | val twos = DenseVector.zeros[Double](100) 18 | ones(0) = 1.0 19 | twos(2) = 1.0 20 | 21 | val x = sc.parallelize(Seq(twos, ones)) 22 | val fftd = PaddedFFT().apply(x).collect() 23 | 24 | val twosout = fftd(0) 25 | val onesout = fftd(1) 26 | 27 | // Proof by agreement w/ R: Re(fft(c(0, 0, 1, rep(0, 125)))) 28 | assert(twosout.length === 64) 29 | assert(Stats.aboutEq(twosout(0), 1.0)) 30 | assert(Stats.aboutEq(twosout(16), 0.0)) 31 | assert(Stats.aboutEq(twosout(32), -1.0)) 32 | assert(Stats.aboutEq(twosout(48), 0.0)) 33 | 34 | // Proof by agreement w/ R: Re(fft(c(1, rep(0, 127)))) 35 | assert(Stats.aboutEq(onesout, DenseVector.ones[Double](64))) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/RandomSignNodeSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import org.scalatest.FunSuite 5 | import org.scalatest.matchers.ShouldMatchers 6 | import keystoneml.pipelines.Logging 7 | 8 | class RandomSignNodeSuite extends FunSuite with Logging with ShouldMatchers { 9 | 10 | test("RandomSignNode") { 11 | val signs = DenseVector(1.0, -1.0, 1.0) 12 | val node = RandomSignNode(signs) 13 | val data: DenseVector[Double] = DenseVector(1.0, 2.0, 3.0) 14 | val result = node(data) 15 | Seq(result) should equal (Seq(DenseVector(1.0, -2.0, 3.0))) 16 | } 17 | 18 | test("RandomSignNode.create") { 19 | val node = RandomSignNode(1000) 20 | 21 | node.signs.foreach(elt => assert(elt == -1.0 || elt == 1.0)) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/SignedHellingerMapperSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseVector 4 | import org.scalatest.FunSuite 5 | 6 | class SignedHellingerMapperSuite extends FunSuite { 7 | test("signed hellinger mapper") { 8 | val x = DenseVector(1.0, -4.0, 0.0, -9.0, 16.0) 9 | val shmx = DenseVector(1.0, -2.0, 0.0, -3.0, 4.0) 10 | 11 | assert(SignedHellingerMapper(x) == shmx, "Result should be signed square root of input.") 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/util/ClassLabelIndicatorsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.scalatest.FunSuite 5 | 6 | class ClassLabelIndicatorsSuite extends FunSuite { 7 | test("single label indicators") { 8 | 
intercept[AssertionError] { 9 | val zerolabels = ClassLabelIndicatorsFromIntLabels(0) 10 | } 11 | 12 | intercept[AssertionError] { 13 | val onelabel = ClassLabelIndicatorsFromIntLabels(1) 14 | } 15 | 16 | 17 | val fivelabel = ClassLabelIndicatorsFromIntLabels(5) 18 | assert(fivelabel(2) === DenseVector(-1.0,-1.0,1.0,-1.0,-1.0)) 19 | 20 | intercept[RuntimeException] { 21 | fivelabel(5) 22 | } 23 | } 24 | 25 | test("multiple label indicators without validation") { 26 | intercept[AssertionError] { 27 | val zerolabels = ClassLabelIndicatorsFromIntArrayLabels(0) 28 | } 29 | 30 | intercept[AssertionError] { 31 | val onelabel = ClassLabelIndicatorsFromIntArrayLabels(1) 32 | } 33 | 34 | val fivelabel = ClassLabelIndicatorsFromIntArrayLabels(5) 35 | 36 | assert(fivelabel(Array(2,1)) === DenseVector(-1.0,1.0,1.0,-1.0,-1.0)) 37 | 38 | intercept[IndexOutOfBoundsException] { 39 | fivelabel(Array(4,6)) 40 | } 41 | 42 | assert(fivelabel(Array(-1,2)) === DenseVector(-1.0,-1.0,1.0,-1.0,1.0), 43 | "In the unchecked case, we should get weird behavior.") 44 | 45 | } 46 | 47 | test("multiple label indicators with validation") { 48 | intercept[AssertionError] { 49 | val zerolabels = ClassLabelIndicatorsFromIntArrayLabels(0, true) 50 | } 51 | 52 | intercept[AssertionError] { 53 | val onelabel = ClassLabelIndicatorsFromIntArrayLabels(1, true) 54 | } 55 | 56 | val fivelabel = ClassLabelIndicatorsFromIntArrayLabels(5, true) 57 | 58 | assert(fivelabel(Array(2,1)) === DenseVector(-1.0,1.0,1.0,-1.0,-1.0)) 59 | 60 | intercept[RuntimeException] { 61 | fivelabel(Array(4,6)) 62 | } 63 | 64 | intercept[RuntimeException] { 65 | fivelabel(Array(-1,2)) 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/util/MaxClassifierSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.scalatest.FunSuite 5 | 6 | class MaxClassifierSuite extends FunSuite { 7 | test("max classifier") { 8 | assert(MaxClassifier.apply(DenseVector(-10.0, 42.4, 335.23, -43.0)) === 2) 9 | assert(MaxClassifier.apply(DenseVector(Double.MinValue)) === 0) 10 | assert(MaxClassifier.apply(DenseVector(3.0, -23.2, 2.99)) === 0) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/util/TopKClassifierSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.SparkContext 5 | import org.scalatest.FunSuite 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class TopKClassifierSuite extends FunSuite with PipelineContext { 9 | test("top k classifier, k <= vector size") { 10 | sc = new SparkContext("local", "test") 11 | 12 | assert(TopKClassifier(2).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3)) 13 | assert(TopKClassifier(4).apply(DenseVector(Double.MinValue, Double.MaxValue, 12.0, 11.0, 10.0)) === Array(1, 2, 3, 4)) 14 | assert(TopKClassifier(3).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) 15 | } 16 | 17 | test("top k classifier, k > vector size") { 18 | sc = new SparkContext("local", "test") 19 | 20 | assert(TopKClassifier(5).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3, 0, 2)) 21 | assert(TopKClassifier(2).apply(DenseVector(Double.MinValue)) === Array(0)) 22 | 
assert(TopKClassifier(20).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/util/VectorSplitterSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg._ 4 | import org.scalatest.FunSuite 5 | 6 | class VectorSplitterSuite extends FunSuite { 7 | test("vector splitter") { 8 | for ( 9 | bs <- Array(128, 256, 512, 1024, 2048); 10 | mul <- 0 to 2; 11 | off <- 0 to 20 by 5; 12 | feats <- Array(Some(bs*mul + off), None) 13 | ) { 14 | val sp = new VectorSplitter(bs, feats) 15 | val vec = DenseVector.zeros[Double](bs*mul + off) 16 | 17 | val expectedSplits = (bs*mul + off)/bs + (if ((bs*mul + off) % bs == 0) 0 else 1) 18 | 19 | assert(sp.splitVector(vec).length === expectedSplits, 20 | s"True length is ${sp.splitVector(vec).length}, expected length is ${expectedSplits}") 21 | } 22 | } 23 | 24 | test("vector splitter maintains order") { 25 | for ( 26 | bs <- Array(128, 256, 512, 1024, 2048); 27 | mul <- 0 to 2; 28 | off <- 0 to 20 by 5; 29 | feats <- Array(Some(bs*mul + off), None) 30 | ) { 31 | val sp = new VectorSplitter(bs, feats) 32 | val vec = rand(bs*mul + off) 33 | 34 | assert(DenseVector.vertcat(sp.splitVector(vec):_*) === vec, 35 | s"Recombinded split vector of length ${bs*mul + off} with block size $bs did not match its input") 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /src/test/scala/keystoneml/utils/ImageUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils 2 | 3 | import org.scalatest.FunSuite 4 | 5 | class ImageUtilsSuite extends FunSuite { 6 | 7 | test("crop") { 8 | val imgArr = 9 | (0 until 4).flatMap { x => 10 | (0 until 4).flatMap { y => 11 | (0 until 1).map { c => 12 | (c + x * 1 + y * 4 * 1).toDouble 13 | } 14 | } 15 | }.toArray 16 | 17 | val image = new ChannelMajorArrayVectorizedImage(imgArr, ImageMetadata(4, 4, 1)) 18 | val cropped = ImageUtils.crop(image, 1, 1, 3, 3) 19 | 20 | assert(cropped.metadata.xDim == 2) 21 | assert(cropped.metadata.yDim == 2) 22 | assert(cropped.metadata.numChannels == 1) 23 | 24 | assert(cropped.get(0, 0, 0) == 5.0) 25 | assert(cropped.get(0, 1, 0) == 6.0) 26 | assert(cropped.get(1, 0, 0) == 9.0) 27 | assert(cropped.get(1, 1, 0) == 10.0) 28 | } 29 | 30 | test("flipHorizontal") { 31 | val imgArr = 32 | (0 until 4).flatMap { x => 33 | (0 until 4).flatMap { y => 34 | (0 until 1).map { c => 35 | (c + x * 1 + y * 4 * 1).toDouble 36 | } 37 | } 38 | }.toArray 39 | 40 | val image = new ChannelMajorArrayVectorizedImage(imgArr, ImageMetadata(4, 4, 1)) 41 | 42 | val flipped = ImageUtils.flipHorizontal(image) 43 | 44 | assert(flipped.metadata.xDim == 4) 45 | assert(flipped.metadata.yDim == 4) 46 | assert(flipped.metadata.numChannels == 1) 47 | 48 | (0 until 4).foreach { x => 49 | assert(flipped.get(x, 0, 0) == image.get(x, 3, 0)) 50 | assert(flipped.get(x, 1, 0) == image.get(x, 2, 0)) 51 | assert(flipped.get(x, 2, 0) == image.get(x, 1, 0)) 52 | assert(flipped.get(x, 3, 0) == image.get(x, 0, 0)) 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/utils/MLlibUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils 2 | 3 | import 
org.apache.spark.mllib.linalg._ 4 | import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} 5 | import org.scalatest.FunSuite 6 | 7 | class MLlibUtilsSuite extends FunSuite { 8 | val arr = Array(0.1, 0.2, 0.3, 0.4) 9 | val n = 20 10 | val indices = Array(0, 3, 5, 10, 13) 11 | val values = Array(0.1, 0.5, 0.3, -0.8, -1.0) 12 | 13 | test("dense vector to breeze dense") { 14 | val vec = Vectors.dense(arr) 15 | assert(MLlibUtils.mllibVectorToDenseBreeze(vec) === new BDV[Double](arr)) 16 | } 17 | 18 | test("sparse vector to breeze dense") { 19 | val vec = Vectors.sparse(n, indices, values) 20 | val breeze = new BDV[Double](n) 21 | indices.zip(values).foreach { case (x, y) => 22 | breeze(x) = y 23 | } 24 | assert(MLlibUtils.mllibVectorToDenseBreeze(vec) === breeze) 25 | } 26 | 27 | test("dense breeze to vector") { 28 | val breeze = new BDV[Double](arr) 29 | val vec = MLlibUtils.breezeVectorToMLlib(breeze).asInstanceOf[DenseVector] 30 | assert(vec.size === arr.length) 31 | assert(vec.values.eq(arr), "should not copy data") 32 | } 33 | 34 | test("sparse breeze to vector") { 35 | val breeze = new BSV[Double](indices, values, n) 36 | val vec = MLlibUtils.breezeVectorToMLlib(breeze).asInstanceOf[SparseVector] 37 | assert(vec.size === n) 38 | assert(vec.indices.eq(indices), "should not copy data") 39 | assert(vec.values.eq(values), "should not copy data") 40 | } 41 | 42 | test("sparse breeze with partially-used arrays to vector") { 43 | val activeSize = 3 44 | val breeze = new BSV[Double](indices, values, activeSize, n) 45 | val vec = MLlibUtils.breezeVectorToMLlib(breeze).asInstanceOf[SparseVector] 46 | assert(vec.size === n) 47 | assert(vec.indices === indices.slice(0, activeSize)) 48 | assert(vec.values === values.slice(0, activeSize)) 49 | } 50 | 51 | test("dense matrix to breeze dense") { 52 | val mat = Matrices.dense(3, 2, Array(0.0, 1.0, 2.0, 3.0, 4.0, 5.0)) 53 | val breeze = MLlibUtils.mllibMatrixToDenseBreeze(mat) 54 | assert(breeze.rows === mat.numRows) 55 | assert(breeze.cols === mat.numCols) 56 | assert(breeze.data.eq(mat.asInstanceOf[DenseMatrix].values), "should not copy data") 57 | } 58 | 59 | test("sparse matrix to breeze dense") { 60 | val values = Array(1.0, 2.0, 4.0, 5.0) 61 | val colPtrs = Array(0, 2, 4) 62 | val rowIndices = Array(1, 2, 1, 2) 63 | val mat = Matrices.sparse(3, 2, colPtrs, rowIndices, values) 64 | val breeze = MLlibUtils.mllibMatrixToDenseBreeze(mat) 65 | assert(breeze.rows === mat.numRows) 66 | assert(breeze.cols === mat.numCols) 67 | assert(breeze.toArray === mat.toArray) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/utils/MatrixUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils 2 | 3 | import org.scalatest.FunSuite 4 | 5 | import breeze.linalg._ 6 | import breeze.stats._ 7 | 8 | import org.apache.spark.SparkContext 9 | 10 | import keystoneml.pipelines._ 11 | import keystoneml.workflow.PipelineContext 12 | 13 | class MatrixUtilsSuite extends FunSuite with PipelineContext { 14 | 15 | test("computeMean works correctly") { 16 | val numRows = 1000 17 | val numCols = 32 18 | val numParts = 4 19 | sc = new SparkContext("local", "test") 20 | val in = DenseMatrix.rand(numRows, numCols) 21 | val inArr = MatrixUtils.matrixToRowArray(in) 22 | val rdd = sc.parallelize(inArr, numParts).mapPartitions { iter => 23 | Iterator.single(MatrixUtils.rowsToMatrix(iter)) 24 | } 25 | val expected = mean(in(::, *)).t 26 | val 
actual = MatrixUtils.computeMean(rdd) 27 | assert(Stats.aboutEq(expected, actual, 1e-6)) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/utils/external/EncEvalSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils.external 2 | 3 | import java.io.File 4 | 5 | import breeze.linalg._ 6 | import breeze.stats.distributions.Gaussian 7 | import keystoneml.nodes.learning.GaussianMixtureModel 8 | import keystoneml.nodes.learning.external.GaussianMixtureModelEstimator 9 | import org.scalatest.FunSuite 10 | import keystoneml.pipelines.Logging 11 | import keystoneml.utils.{Stats, TestUtils} 12 | 13 | class EncEvalSuite extends FunSuite with Logging { 14 | 15 | test("Load SIFT Descriptors and compute Fisher Vector Features") { 16 | 17 | val siftDescriptor = csvread(new File(TestUtils.getTestResourceFileName("images/feats.csv"))) 18 | 19 | val gmmMeans = TestUtils.getTestResourceFileName("images/voc_codebook/means.csv") 20 | val gmmVars = TestUtils.getTestResourceFileName("images/voc_codebook/variances.csv") 21 | val gmmWeights = TestUtils.getTestResourceFileName("images/voc_codebook/priors") 22 | 23 | val gmm = GaussianMixtureModel.load(gmmMeans, gmmVars, gmmWeights) 24 | 25 | val nCenters = gmm.means.cols 26 | val nDim = gmm.means.rows 27 | 28 | val extLib = new EncEval 29 | 30 | val fisherVector = extLib.calcAndGetFVs( 31 | gmm.means.toArray.map(_.toFloat), 32 | nCenters, 33 | nDim, 34 | gmm.variances.toArray.map(_.toFloat), 35 | gmm.weights.toArray.map(_.toFloat), 36 | siftDescriptor.toArray.map(_.toFloat)) 37 | 38 | log.info(s"Fisher Vector is ${fisherVector.sum}") 39 | assert(Stats.aboutEq(fisherVector.sum, 40.109097, 1e-4), "SUM of Fisher Vectors must match expected sum.") 40 | 41 | } 42 | 43 | test("Compute a GMM from scala") { 44 | val nsamps = 10000 45 | 46 | // Generate two gaussians. 47 | val x = Gaussian(-1.0, 0.5).samples.take(nsamps).toArray 48 | val y = Gaussian(5.0, 1.0).samples.take(nsamps).toArray 49 | 50 | val z = shuffle(x ++ y).map(x => DenseVector(x)) 51 | 52 | // Compute a 1-d GMM. 53 | val extLib = new EncEval 54 | val gmm = new GaussianMixtureModelEstimator(2).fit(z) 55 | 56 | logInfo(s"GMM means: ${gmm.means.toArray.mkString(",")}") 57 | logInfo(s"GMM vars: ${gmm.variances.toArray.mkString(",")}") 58 | logInfo(s"GMM weights: ${gmm.weights.toArray.mkString(",")}") 59 | 60 | // The results should be close to the distribution we set up. 
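// The samples above come from Gaussian(-1.0, 0.5) and Gaussian(5.0, 1.0), so the fitted means should land near -1.0 and 5.0, and the fitted standard deviations (square roots of the variances) near 0.5 and 1.0.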
    assert(Stats.aboutEq(min(gmm.means), -1.0, 1e-1), "Smallest mean should be close to -1.0")
    assert(Stats.aboutEq(max(gmm.means), 5.0, 1e-1), "Largest mean should be close to 5.0")
    assert(Stats.aboutEq(math.sqrt(min(gmm.variances)), 0.5, 1e-1), "Smallest SD should be close to 0.5")
    assert(Stats.aboutEq(math.sqrt(max(gmm.variances)), 1.0, 1e-1), "Largest SD should be close to 1.0")
  }
}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/utils/external/VLFeatSuite.scala:
--------------------------------------------------------------------------------
package keystoneml.utils.external

import java.io.File

import breeze.linalg._
import breeze.numerics.abs
import org.scalatest.FunSuite
import keystoneml.pipelines.Logging
import keystoneml.utils.{ImageUtils, MatrixUtils, TestUtils}

class VLFeatSuite extends FunSuite with Logging {
  test("Load an Image and compute SIFT Features") {
    val testImage = TestUtils.loadTestImage("images/000012.jpg")
    val singleImage = ImageUtils.mapPixels(testImage, _/255.0)
    val grayImage = ImageUtils.toGrayScale(singleImage)

    val extLib = new VLFeat

    val stepSize = 3
    val binSize = 4
    val scales = 4
    val descriptorLength = 128
    val scaleStep = 0

    val rawDescDataShort = extLib.getSIFTs(grayImage.metadata.xDim, grayImage.metadata.yDim,
      stepSize, binSize, scales, scaleStep, grayImage.getSingleChannelAsFloatArray())

    assert(rawDescDataShort.length % descriptorLength == 0, "Resulting SIFTs must be 128-dimensional.")

    val numCols = rawDescDataShort.length/descriptorLength
    val result = new DenseMatrix(descriptorLength, numCols, rawDescDataShort.map(_.toDouble))

    // Compare with the output of running this image through vl_phow with matlab from the enceval package:
    // featpipem_addpaths;
    // im = im2single(imread('images/000012.jpg'));
    // featextr = featpipem.features.PhowExtractor();
    // featextr.step = 3;
    // [frames feats] = featextr.compute(im);
    // csvwrite('images/feats128.csv', feats)

    val testFeatures = csvread(new File(TestUtils.getTestResourceFileName("images/feats128.csv")))

    val diff = result - testFeatures

    // Because of subtle differences in the way image smoothing works in the VLFeat C library and the VLFeat matlab
    // library (vl_imsmooth_f vs. _vl_imsmooth_f), these two matrices will not be exactly the same.
    // Instead, we check that 99.5% of the matrix entries are off by at most 1.
    val absdiff = abs(diff).toDenseVector

    assert(absdiff.findAll(_ > 1.0).length.toDouble < 0.005*absdiff.length,
      "Fewer than 0.5% of entries may be different by more than 1.")
  }
}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/utils/images/ImageSuite.scala:
--------------------------------------------------------------------------------
package keystoneml.utils.images

import org.scalatest.FunSuite
import keystoneml.pipelines.Logging
import keystoneml.utils.VectorizedImage
import keystoneml.utils.TestUtils._

class ImageSuite extends FunSuite with Logging {
  test("Vectorized Image Coordinates Should be Correct") {
    val (x,y,z) = (100,100,3)

    val images = Array[VectorizedImage](
      genChannelMajorArrayVectorizedImage(x,y,z),
      genColumnMajorArrayVectorizedImage(x,y,z),
      genRowMajorArrayVectorizedImage(x,y,z),
      genRowColumnMajorByteArrayVectorizedImage(x,y,z)
    )

    for (
      img <- images;
      idx <- 0 until x*y*z
    ) {
      val coord = img.vectorToImageCoords(idx)
      assert(img.imageToVectorCoords(coord.x,coord.y,coord.channelIdx) == idx,
        s"imageToVectorCoords(vectorToImageCoords(idx)) should be equivalent to identity(idx) for img $img")
    }

    for (
      img <- images;
      xi <- 0 until x;
      yi <- 0 until y;
      zi <- 0 until z
    ) {
      val coord = img.vectorToImageCoords(img.imageToVectorCoords(xi,yi,zi))
      assert((coord.x, coord.y, coord.channelIdx) == (xi,yi,zi),
        s"vectorToImageCoords(imageToVectorCoords(x,y,z)) should be equivalent to identity(x,y,z) for img $img")
    }
  }
}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/workflow/EstimatorSuite.scala:
--------------------------------------------------------------------------------
package keystoneml.workflow

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite
import keystoneml.pipelines.Logging

class EstimatorSuite extends FunSuite with PipelineContext with Logging {
  test("Estimator fit RDD") {
    sc = new SparkContext("local", "test")

    val intEstimator = new Estimator[Int, Int] {
      def fit(data: RDD[Int]): Transformer[Int, Int] = {
        val first = data.first()
        Transformer(x => x + first)
      }
    }

    val trainData = sc.parallelize(Seq(32, 94, 12))
    val testData = sc.parallelize(Seq(42, 58, 61))

    val pipeline = intEstimator.withData(trainData)
    assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + 32, 58 + 32, 61 + 32))
  }

  test("Estimator fit Pipeline Data") {
    sc = new SparkContext("local", "test")

    val transformer = Transformer[Int, Int](_ * 2)

    val intEstimator = new Estimator[Int, Int] {
      def fit(data: RDD[Int]): Transformer[Int, Int] = {
        val first = data.first()
        Transformer(x => x + first)
      }
    }

    val trainData = sc.parallelize(Seq(32, 94, 12))
    val testData = sc.parallelize(Seq(42, 58, 61))

    val pipeline = intEstimator.withData(transformer(trainData))
    assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + 64, 58 + 64, 61 + 64))
  }

}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/workflow/LabelEstimatorSuite.scala:
--------------------------------------------------------------------------------
package keystoneml.workflow

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite
import keystoneml.pipelines.Logging

class LabelEstimatorSuite extends FunSuite with PipelineContext with Logging {
  test("LabelEstimator fit RDD") {
    sc = new SparkContext("local", "test")

    val intEstimator = new LabelEstimator[Int, Int, String] {
      def fit(data: RDD[Int], labels: RDD[String]): Transformer[Int, Int] = {
        val first = data.first()
        val label = labels.first().hashCode
        Transformer(x => x + first + label)
      }
    }

    val trainData = sc.parallelize(Seq(32, 94, 12))
    val trainLabels = sc.parallelize(Seq("sjkfdl", "iw", "432"))
    val testData = sc.parallelize(Seq(42, 58, 61))

    val pipeline = intEstimator.withData(trainData, trainLabels)
    val offset = 32 + "sjkfdl".hashCode
    assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + offset, 58 + offset, 61 + offset))
  }

  test("LabelEstimator fit pipeline data") {
    sc = new SparkContext("local", "test")

    val dataTransformer = Transformer[Int, Int](_ * 2)
    val labelTransformer = Transformer[String, String](_ + "hi")

    val intEstimator = new LabelEstimator[Int, Int, String] {
      def fit(data: RDD[Int], labels: RDD[String]): Transformer[Int, Int] = {
        val first = data.first()
        val label = labels.first().hashCode
        Transformer(x => x + first + label)
      }
    }

    val trainData = sc.parallelize(Seq(32, 94, 12))
    val trainLabels = sc.parallelize(Seq("sjkfdl", "iw", "432"))
    val testData = sc.parallelize(Seq(42, 58, 61))

    val pipeline = intEstimator.withData(dataTransformer(trainData), labelTransformer(trainLabels))
    val offset = 64 + "sjkfdlhi".hashCode
    assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + offset, 58 + offset, 61 + offset))
  }
}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/workflow/PipelineContext.scala:
--------------------------------------------------------------------------------
package keystoneml.workflow

import org.apache.spark.SparkContext
import org.scalatest.{BeforeAndAfterEach, Suite}

// TODO: delete this file and use the version from Spark once SPARK-750 is fixed.

/** Manages a local `sc` {@link SparkContext} variable, and the PipelineEnv, correctly stopping it after each test. */
trait PipelineContext extends BeforeAndAfterEach { self: Suite =>

  @transient var sc: SparkContext = _

  override def afterEach() {
    PipelineEnv.getOrCreate.reset()
    resetSparkContext()
    super.afterEach()
  }

  def resetSparkContext() = {
    if (sc != null) {
      PipelineContext.stop(sc)
      sc = null
    }
  }
}

object PipelineContext {
  def stop(sc: SparkContext) {
    sc.stop()
    // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
    System.clearProperty("spark.driver.port")
  }

  /** Runs `f` by passing in `sc` and ensures that `sc` is stopped. */
  def withSpark[T](sc: SparkContext)(f: SparkContext => T) = {
    try {
      f(sc)
    } finally {
      stop(sc)
    }
  }

}

--------------------------------------------------------------------------------
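
The `withSpark` helper defined at the bottom of PipelineContext.scala is not exercised by the suites above. As an illustrative sketch only (the object name and the tiny job are hypothetical, not part of the repository, and it assumes the test sources are on the classpath), it could be used like this to make sure the SparkContext is stopped even if the body throws:

import org.apache.spark.SparkContext
import keystoneml.workflow.PipelineContext

// Hypothetical example: run a small job through PipelineContext.withSpark,
// which stops the context in a finally block and returns the body's result.
object WithSparkExample {
  def main(args: Array[String]): Unit = {
    val total = PipelineContext.withSpark(new SparkContext("local", "withSpark-example")) { sc =>
      // Double each element and sum them up.
      sc.parallelize(1 to 10).map(_ * 2).sum()
    }
    println(s"Sum of doubled values: $total") // expected: 110.0
  }
}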