├── .gitignore ├── CONTRIBUTORS.md ├── EC2.md ├── LICENSE ├── Makefile ├── README.md ├── RELEASE.md ├── bin ├── keystone-ec2.sh ├── run-main.sh └── run-pipeline.sh ├── build.sbt ├── examples ├── images │ ├── cifar_random_patch.sh │ ├── mnist_random_fft.sh │ └── voc_sift_fisher.sh └── text │ └── newsgroups_ngrams_tfidf.sh ├── lib └── libImageFeatures.dylib ├── project ├── build.properties └── plugins.sbt ├── sbt └── sbt ├── scripts ├── constantEstimator.R └── solver-comparisons-final.csv └── src ├── main ├── cpp │ ├── EncEval.cxx │ ├── EncEval.h │ ├── VLFeat.cxx │ └── VLFeat.h ├── resources │ └── log4j.properties └── scala │ └── keystoneml │ ├── evaluation │ ├── AugmentedExamplesEvaluator.scala │ ├── BinaryClassifierEvaluator.scala │ ├── Evaluator.scala │ ├── MeanAveragePrecisionEvaluator.scala │ └── MulticlassClassifierEvaluator.scala │ ├── loaders │ ├── AmazonReviewsDataLoader.scala │ ├── CifarLoader.scala │ ├── CsvDataLoader.scala │ ├── ImageLoaderUtils.scala │ ├── ImageNetLoader.scala │ ├── LabeledData.scala │ ├── NewsgroupsDataLoader.scala │ ├── TimitFeaturesDataLoader.scala │ └── VOCLoader.scala │ ├── nodes │ ├── images │ │ ├── CenterCornerPatcher.scala │ │ ├── Convolver.scala │ │ ├── Cropper.scala │ │ ├── DaisyExtractor.scala │ │ ├── FisherVector.scala │ │ ├── GrayScaler.scala │ │ ├── HogExtractor.scala │ │ ├── ImageVectorizer.scala │ │ ├── LCSExtractor.scala │ │ ├── LabeledImageExtractors.scala │ │ ├── PixelScaler.scala │ │ ├── Pooler.scala │ │ ├── RandomImageTransformer.scala │ │ ├── RandomPatcher.scala │ │ ├── SIFTExtractor.scala │ │ ├── SymmetricRectifier.scala │ │ ├── Windower.scala │ │ └── external │ │ │ ├── FisherVector.scala │ │ │ └── SIFTExtractor.scala │ ├── learning │ │ ├── ApproximatePCA.scala │ │ ├── BlockLinearMapper.scala │ │ ├── BlockWeightedLeastSquares.scala │ │ ├── CostModel.scala │ │ ├── DistributedPCA.scala │ │ ├── GaussianMixtureModel.scala │ │ ├── GaussianMixtureModelEstimator.scala │ │ ├── Gradient.scala │ │ ├── KMeansPlusPlus.scala │ │ ├── KernelBlockLinearMapper.scala │ │ ├── KernelGenerator.scala │ │ ├── KernelMatrix.scala │ │ ├── KernelRidgeRegression.scala │ │ ├── LBFGS.scala │ │ ├── LeastSquaresEstimator.scala │ │ ├── LinearDiscriminantAnalysis.scala │ │ ├── LinearMapper.scala │ │ ├── LocalLeastSquaresEstimator.scala │ │ ├── LogisticRegressionModel.scala │ │ ├── NaiveBayesModel.scala │ │ ├── PCA.scala │ │ ├── PerClassWeightedLeastSquares.scala │ │ ├── SparseLinearMapper.scala │ │ ├── ZCAWhitener.scala │ │ ├── external │ │ │ └── GaussianMixtureModelEstimator.scala │ │ └── internal │ │ │ └── ReWeightedLeastSquares.scala │ ├── nlp │ │ ├── CoreNLPFeatureExtractor.scala │ │ ├── HashingTF.scala │ │ ├── NGramsHashingTF.scala │ │ ├── StringUtils.scala │ │ ├── StupidBackoff.scala │ │ ├── WordFrequencyEncoder.scala │ │ ├── indexers.scala │ │ └── ngrams.scala │ ├── stats │ │ ├── CosineRandomFeatures.scala │ │ ├── LinearRectifier.scala │ │ ├── NormalizeRows.scala │ │ ├── PaddedFFT.scala │ │ ├── RandomSignNode.scala │ │ ├── Sampling.scala │ │ ├── SignedHellingerMapper.scala │ │ ├── StandardScaler.scala │ │ └── TermFrequency.scala │ └── util │ │ ├── AllSparseFeatures.scala │ │ ├── Cacher.scala │ │ ├── ClassLabelIndicators.scala │ │ ├── CommonSparseFeatures.scala │ │ ├── Densify.scala │ │ ├── FloatToDouble.scala │ │ ├── Identity.scala │ │ ├── MatrixVectorizer.scala │ │ ├── MaxClassifier.scala │ │ ├── Shuffler.scala │ │ ├── SparseFeatureVectorizer.scala │ │ ├── Sparsify.scala │ │ ├── TopKClassifier.scala │ │ ├── VectorCombiner.scala │ │ └── VectorSplitter.scala │ ├── 
pipelines │ ├── FunctionNode.scala │ ├── Logging.scala │ ├── images │ │ ├── cifar │ │ │ ├── LinearPixels.scala │ │ │ ├── RandomCifar.scala │ │ │ ├── RandomPatchCifar.scala │ │ │ ├── RandomPatchCifarAugmented.scala │ │ │ ├── RandomPatchCifarAugmentedKernel.scala │ │ │ └── RandomPatchCifarKernel.scala │ │ ├── imagenet │ │ │ └── ImageNetSiftLcsFV.scala │ │ ├── mnist │ │ │ └── MnistRandomFFT.scala │ │ └── voc │ │ │ └── VOCSIFTFisher.scala │ ├── nlp │ │ └── StupidBackoffPipeline.scala │ ├── speech │ │ └── TimitPipeline.scala │ └── text │ │ ├── AmazonReviewsPipeline.scala │ │ └── NewsgroupsPipeline.scala │ ├── utils │ ├── MLlibUtils.scala │ ├── MatrixUtils.scala │ ├── Stats.scala │ ├── external │ │ ├── EncEval.scala │ │ └── VLFeat.scala │ └── images │ │ ├── Image.scala │ │ ├── ImageConversions.scala │ │ └── ImageUtils.scala │ └── workflow │ ├── AnalysisUtils.scala │ ├── AutoCacheRule.scala │ ├── ChainUtils.scala │ ├── Chainable.scala │ ├── DefaultOptimizer.scala │ ├── EquivalentNodeMergeRule.scala │ ├── Estimator.scala │ ├── Expression.scala │ ├── ExtractSaveablePrefixes.scala │ ├── FittedPipeline.scala │ ├── GatherTransformerOperator.scala │ ├── Graph.scala │ ├── GraphExecutor.scala │ ├── GraphId.scala │ ├── Identity.scala │ ├── LabelEstimator.scala │ ├── NodeOptimizationRule.scala │ ├── Operator.scala │ ├── OptimizableNodes.scala │ ├── Pipeline.scala │ ├── PipelineDataset.scala │ ├── PipelineDatum.scala │ ├── PipelineEnv.scala │ ├── PipelineResult.scala │ ├── Prefix.scala │ ├── Rule.scala │ ├── RuleExecutor.scala │ ├── SavedStateLoadRule.scala │ ├── SparkUtilWrapper.scala │ ├── Transformer.scala │ ├── TransformerGraph.scala │ ├── UnusedBranchRemovalRule.scala │ ├── WeightedNode.scala │ ├── WeightedOperator.scala │ └── WorkflowUtils.scala └── test ├── python └── images │ └── pyconv.py ├── resources ├── aMat-1class.csv ├── aMat.csv ├── aMatShuffled.csv ├── bMat-1class.csv ├── bMat.csv ├── bMatShuffled.csv ├── gmm_data.txt ├── images │ ├── 000012.jpg │ ├── convolved.gantrycrane.csv │ ├── convolved.gantrycrane.png │ ├── feats.csv │ ├── feats128.csv │ ├── gantrycrane.png │ ├── imagenet-test-labels │ ├── imagenet │ │ └── n15075141.tar │ ├── voc │ │ └── voctest.tar │ ├── voc_codebook │ │ ├── means.csv │ │ ├── priors │ │ └── variances.csv │ └── voclabels.csv └── iris.data └── scala └── keystoneml ├── evaluation ├── BinaryClassifierEvaluatorSuite.scala ├── MeanAveragePrecisionSuite.scala └── MulticlassClassifierEvaluatorSuite.scala ├── loaders ├── ImageNetLoaderSuite.scala └── VOCLoaderSuite.scala ├── nodes ├── images │ ├── CenterCornerPatcherSuite.scala │ ├── ConvolverSuite.scala │ ├── DaisyExtractorSuite.scala │ ├── HogExtractorSuite.scala │ ├── ImageBenchMarkSuite.scala │ ├── LCSExtractorSuite.scala │ ├── PoolingSuite.scala │ ├── RandomPatcherSuite.scala │ ├── SIFTExtractorSuite.scala │ └── WindowingSuite.scala ├── learning │ ├── BlockLinearMapperSuite.scala │ ├── BlockWeightedLeastSquaresSuite.scala │ ├── GaussianMixtureModelSuite.scala │ ├── KMeansPlusPlusSuite.scala │ ├── KernelModelSuite.scala │ ├── LBFGSSuite.scala │ ├── LeastSquaresEstimatorSuite.scala │ ├── LinearDiscriminantAnalysisSuite.scala │ ├── LinearMapperSuite.scala │ ├── LogisticRegressionModelSuite.scala │ ├── NaiveBayesModelSuite.scala │ ├── PCASuite.scala │ └── ZCAWhiteningSuite.scala ├── misc │ ├── SparseFeatureVectorizerSuite.scala │ └── TermFrequencySuite.scala ├── nlp │ ├── CoreNLPFeatureExtractorSuite.scala │ ├── HashingTFSuite.scala │ ├── NGramIndexerSuite.scala │ ├── NGramSuite.scala │ ├── NGramsHashingTFSuite.scala │ ├── 
StringUtilsSuite.scala │ └── WordFrequencyEncoderSuite.scala ├── stats │ ├── CosineRandomFeaturesSuite.scala │ ├── LinearRectifierSuite.scala │ ├── PaddedFFTSuite.scala │ ├── RandomSignNodeSuite.scala │ ├── SignedHellingerMapperSuite.scala │ └── StandardScalerSuite.scala └── util │ ├── ClassLabelIndicatorsSuite.scala │ ├── MaxClassifierSuite.scala │ ├── TopKClassifierSuite.scala │ └── VectorSplitterSuite.scala ├── pipelines └── nlp │ └── StupidBackoffSuite.scala ├── utils ├── ImageUtilsSuite.scala ├── MLlibUtilsSuite.scala ├── MatrixUtilsSuite.scala ├── TestUtils.scala ├── external │ ├── EncEvalSuite.scala │ └── VLFeatSuite.scala └── images │ └── ImageSuite.scala └── workflow ├── AnalysisUtilsSuite.scala ├── AutocCacheRuleSuite.scala ├── EstimatorSuite.scala ├── GraphSuite.scala ├── LabelEstimatorSuite.scala ├── NodeOptimizationRuleSuite.scala ├── OperatorSuite.scala ├── PipelineContext.scala └── PipelineSuite.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | dist/* 6 | target/ 7 | lib_managed/ 8 | src_managed/ 9 | project/boot/ 10 | project/plugins/project/ 11 | 12 | # Scala-IDE specific 13 | .idea* 14 | .scala_dependencies 15 | 16 | # Jars 17 | *.jar 18 | 19 | # vim tmps 20 | .*sw* 21 | 22 | # Jekyll stuff 23 | _site/ 24 | 25 | # Data for running examples. 26 | example_data/ 27 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | Contributors 2 | ============ 3 | 4 | KeystoneML has been developed by the following people (alphabetically): 5 | 6 | * Daniel Bruckner 7 | * Michael J. Franklin 8 | * Nicolas Garneau 9 | * Gylfi Gudmundsson 10 | * Eric Jonas 11 | * Tomer Kaftan 12 | * Daniel Langkilde 13 | * Henry Milner 14 | * Benjamin Recht 15 | * Vaishaal Shankar 16 | * Evan R. Sparks 17 | * Stephen Tu 18 | * Shivaram Venkataraman 19 | * Zongheng Yang 20 | -------------------------------------------------------------------------------- /EC2.md: -------------------------------------------------------------------------------- 1 | # Running KeystoneML on EC2 2 | 3 | To run KeystoneML on EC2 you can use the 4 | [spark-ec2](http://spark.apache.org/docs/latest/ec2-scripts.html) scripts. 5 | 6 | ## Getting spark-ec2 7 | 8 | As the KeystoneML scripts require a recent version of spark-ec2, it is 9 | recommended that you clone the spark-ec2 master branch for this. You can do this with 10 | ``` 11 | git clone https://github.com/amplab/spark-ec2.git 12 | ``` 13 | 14 | ## Launching a Cluster 15 | 16 | You can now use the `bin/keystone-ec2.sh` to launch a cluster with KeystoneML pre-installed. 17 | To do that you can run a command which looks like 18 | 19 | ``` 20 | SPARK_EC2_DIR= ./bin/keystone-ec2.sh \ 21 | -s 4 \ 22 | -t r3.4xlarge \ 23 | -i \ 24 | -k \ 25 | launch keystone-test-cluster 26 | ``` 27 | 28 | The above command launches 4 slaves and 1 master machine of type r3.4xlarge. 29 | Note that you can pass in any spark-ec2 options (like spot-prices etc.) to this script. 30 | 31 | ## Running KeystoneML on the cluster 32 | 33 | Once the cluster launch finishes you can login to the master node and the KeystoneML 34 | repository should be present in `/root/keystone`. 
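As a sketch of what a session might look like (the `login` action comes from spark-ec2 and is forwarded by `bin/keystone-ec2.sh`; the key file, key pair, and data locations below are placeholders, and the pipeline invocation simply mirrors the MNIST example from the README, assuming the MNIST data has been fetched onto the master):

```
# Log in to the master node (assumes spark-ec2's standard `login` action).
SPARK_EC2_DIR=<path to spark-ec2> ./bin/keystone-ec2.sh \
  -i <key-file> \
  -k <key-pair> \
  login keystone-test-cluster

# On the master, run a pipeline from the pre-installed checkout.
cd /root/keystone
KEYSTONE_MEM=4g ./bin/run-pipeline.sh \
  keystoneml.pipelines.images.mnist.MnistRandomFFT \
  --trainLocation ./train-mnist-dense-with-labels.data \
  --testLocation ./test-mnist-dense-with-labels.data \
  --numFFTs 4 \
  --blockSize 2048
```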
35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KeystoneML 2 | The biggest, baddest pipelines around. 3 | 4 | 5 | # Example pipeline 6 | 7 | ### Build the KeystoneML project 8 | 9 | ``` 10 | ./sbt/sbt assembly 11 | make # This builds the native libraries used in KeystoneML 12 | ``` 13 | 14 | ### Example: MNIST pipeline 15 | 16 | ``` 17 | # Get the data from S3 18 | wget http://mnist-data.s3.amazonaws.com/train-mnist-dense-with-labels.data 19 | wget http://mnist-data.s3.amazonaws.com/test-mnist-dense-with-labels.data 20 | 21 | KEYSTONE_MEM=4g ./bin/run-pipeline.sh \ 22 | keystoneml.pipelines.images.mnist.MnistRandomFFT \ 23 | --trainLocation ./train-mnist-dense-with-labels.data \ 24 | --testLocation ./test-mnist-dense-with-labels.data \ 25 | --numFFTs 4 \ 26 | --blockSize 2048 27 | ``` 28 | 29 | ## Running with spark-submit 30 | 31 | To run KeystoneML pipelines on large datasets you will need a [Spark](http://spark.apache.org) cluster. 32 | KeystoneML pipelines run on the cluster using 33 | [spark-submit](http://spark.apache.org/docs/latest/submitting-applications.html). 34 | 35 | You need to export `SPARK_HOME` to run KeystoneML using spark-submit. Having done 36 | that you can similarly use run-pipeline.sh to launch your pipeline. 37 | 38 | ``` 39 | export SPARK_HOME=~/spark-1.3.1-bin-cdh4 # should match the version keystone is built with 40 | KEYSTONE_MEM=4g ./bin/run-pipeline.sh \ 41 | keystoneml.pipelines.images.mnist.MnistRandomFFT \ 42 | --trainLocation ./train-mnist-dense-with-labels.data \ 43 | --testLocation ./test-mnist-dense-with-labels.data \ 44 | --numFFTs 4 \ 45 | --blockSize 2048 46 | ``` 47 | -------------------------------------------------------------------------------- /bin/keystone-ec2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$SPARK_EC2_DIR" ] || [ ! -f "$SPARK_EC2_DIR"/spark-ec2 ]; then 4 | echo "SPARK_EC2_DIR is not set correctly, please set SPARK_EC2_DIR to be /ec2" 5 | exit 1 6 | fi 7 | 8 | $SPARK_EC2_DIR/spark-ec2 \ 9 | --hadoop-major-version=2 \ 10 | --spark-version=1.3.1 \ 11 | --spark-ec2-git-repo=https://github.com/shivaram/spark-ec2 \ 12 | --spark-ec2-git-branch=keystone \ 13 | --copy-aws-credentials \ 14 | $@ 15 | -------------------------------------------------------------------------------- /bin/run-main.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | SCALA_VERSION=2.10 21 | 22 | # Figure out where the Scala framework is installed 23 | FWDIR="$(cd `dirname $0`/..; pwd)" 24 | 25 | if [ -z "$1" ]; then 26 | echo "Usage: run-main.sh []" >&2 27 | exit 1 28 | fi 29 | 30 | ASSEMBLY_JAR="" 31 | if [ -e "$FWDIR"/target/scala-$SCALA_VERSION/keystoneml-assembly-*.jar ]; then 32 | export ASSEMBLY_JAR=`ls "$FWDIR"/target/scala-$SCALA_VERSION/keystoneml-assembly*.jar` 33 | fi 34 | 35 | if [[ -z $ASSEMBLY_JAR ]]; then 36 | echo "Failed to find assembly JAR in $FWDIR/target" >&2 37 | echo "You need to run sbt/sbt assembly before running this program" >&2 38 | exit 1 39 | fi 40 | CLASSPATH="$ASSEMBLY_JAR" 41 | 42 | # Find java binary 43 | if [ -n "${JAVA_HOME}" ]; then 44 | RUNNER="${JAVA_HOME}/bin/java" 45 | else 46 | if [ `command -v java` ]; then 47 | RUNNER="java" 48 | else 49 | echo "JAVA_HOME is not set" >&2 50 | exit 1 51 | fi 52 | fi 53 | 54 | # Set KEYSTONE_MEM if it isn't already set since we also use it for this process 55 | KEYSTONE_MEM=${KEYSTONE_MEM:-1g} 56 | export KEYSTONE_MEM 57 | 58 | JAVA_OPTS="$JAVA_OPTS -Xms$KEYSTONE_MEM -Xmx$KEYSTONE_MEM ""$SPARK_JAVA_OPTS" 59 | 60 | exec "$RUNNER" -Djava.library.path=$FWDIR/lib -cp "$CLASSPATH" $JAVA_OPTS "$@" 61 | -------------------------------------------------------------------------------- /bin/run-pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Figure out where we are. 4 | FWDIR="$(cd `dirname $0`; pwd)" 5 | 6 | CLASS=$1 7 | shift 8 | 9 | # Set OMP_NUM_THREADS on workers and driver to something appropriate. 10 | # This is due to OpenBLAS not handling large numbers of cores very well. 11 | # See: https://github.com/amplab/keystone/issues/198 for more information. 12 | 13 | if [[ -z "$OMP_NUM_THREADS" ]]; then 14 | # Determine number of cores. We assume that hyperthreading is enabled and thus divide cores by two. 15 | unamestr=`uname` 16 | if [[ $unamestr == "Darwin" ]]; then 17 | CORES=$((`sysctl -n hw.ncpu`/2)) 18 | elif [[ $unamestr == "Linux" ]]; then 19 | CORES=$((`cat /proc/cpuinfo | grep processor | wc -l`/2)) 20 | else # Windows,BSD? Do the safest thing. 21 | CORES=1 22 | fi 23 | 24 | # Set OMP_NUM_THREADS to MIN(32,CORES) to avoid stack smashing issues. 25 | export OMP_NUM_THREADS=$(($CORES>32?32:$CORES)) 26 | else 27 | if [[ $OMP_NUM_THREADS -gt 32 ]]; then 28 | echo 'Warning: setting OMP_NUM_THREADS > 32 may cause instability.' 
29 | fi 30 | fi 31 | 32 | EXECUTOR_OMP_NUM_THREADS=${EXECUTOR_OMP_NUM_THREADS:-1} 33 | 34 | if [[ -z "$SPARK_HOME" ]]; then 35 | echo "SPARK_HOME is not set, running pipeline locally" 36 | $FWDIR/run-main.sh $CLASS "$@" 37 | else 38 | # TODO: Figure out a way to pass in either a conf file / flags to spark-submit 39 | KEYSTONE_MEM=${KEYSTONE_MEM:-1g} 40 | export KEYSTONE_MEM 41 | 42 | # Set some commonly used config flags on the cluster 43 | $SPARK_HOME/bin/spark-submit \ 44 | --deploy-mode client \ 45 | --class $CLASS \ 46 | --driver-class-path $FWDIR/../target/scala-2.10/keystoneml-assembly-0.3.0-SNAPSHOT.jar \ 47 | --driver-library-path $FWDIR/../lib \ 48 | --conf spark.executor.extraLibraryPath=$FWDIR/../lib \ 49 | --conf spark.executor.extraClassPath=$FWDIR/../target/scala-2.10/keystoneml-assembly-0.3.0-SNAPSHOT.jar \ 50 | --conf spark.executorEnv.OMP_NUM_THREADS=$EXECUTOR_OMP_NUM_THREADS \ 51 | --driver-memory $KEYSTONE_MEM \ 52 | target/scala-2.10/keystoneml-assembly-0.3.0-SNAPSHOT.jar \ 53 | "$@" 54 | fi 55 | -------------------------------------------------------------------------------- /examples/images/cifar_random_patch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #Set environment variables 5 | : ${KEYSTONE_MEM:=4g} 6 | export KEYSTONE_MEM 7 | 8 | KEYSTONE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/../.. 9 | : ${EXAMPLE_DATA_DIR:=$KEYSTONE_DIR/example_data} 10 | 11 | if [ ! -d $EXAMPLE_DATA_DIR ]; then 12 | mkdir $EXAMPLE_DATA_DIR 13 | fi 14 | 15 | #Download data if necessary. 16 | if [[ ! ( -f $EXAMPLE_DATA_DIR/cifar_train.bin && -f $EXAMPLE_DATA_DIR/cifar_test.bin ) ]]; then 17 | #Get the data 18 | wget -O $TMPDIR/cifar-10-binary.tar.gz http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz 19 | 20 | #Decompress it 21 | tar zxvf $TMPDIR/cifar-10-binary.tar.gz -C $TMPDIR 22 | cat $TMPDIR/cifar-10-batches-bin/data_batch*.bin > $EXAMPLE_DATA_DIR/cifar_train.bin 23 | mv $TMPDIR/cifar-10-batches-bin/test_batch.bin $EXAMPLE_DATA_DIR/cifar_test.bin 24 | 25 | #Clean up. 26 | rm -rf $TMPDIR/cifar-10-batches-bin 27 | rm -rf $TMPDIR/cifar-10-binary.tar.gz 28 | fi 29 | 30 | #Run the pipeline 31 | $KEYSTONE_DIR/bin/run-pipeline.sh \ 32 | keystoneml.pipelines.images.cifar.RandomPatchCifar \ 33 | --trainLocation $EXAMPLE_DATA_DIR/cifar_train.bin \ 34 | --testLocation $EXAMPLE_DATA_DIR/cifar_test.bin \ 35 | --numFilters 10000 \ 36 | --lambda 3000 \ 37 | --whiteningEpsilon 1e-5 38 | -------------------------------------------------------------------------------- /examples/images/mnist_random_fft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #Set environment variables 5 | : ${KEYSTONE_MEM:=4g} 6 | export KEYSTONE_MEM 7 | 8 | : ${NUM_FFTS:=4} 9 | : ${BLOCK_SIZE:=2048} 10 | 11 | KEYSTONE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/../.. 12 | : ${EXAMPLE_DATA_DIR:=$KEYSTONE_DIR/example_data} 13 | 14 | if [ ! -d $EXAMPLE_DATA_DIR ]; then 15 | mkdir $EXAMPLE_DATA_DIR 16 | fi 17 | 18 | # Get the data from S3 19 | if [ ! -f $EXAMPLE_DATA_DIR/train-mnist-dense-with-labels.data ]; then 20 | wget -O $EXAMPLE_DATA_DIR/train-mnist-dense-with-labels.data \ 21 | http://mnist-data.s3.amazonaws.com/train-mnist-dense-with-labels.data 22 | fi 23 | 24 | if [ ! 
-f $EXAMPLE_DATA_DIR/test-mnist-dense-with-labels.data ]; then 25 | wget -O $EXAMPLE_DATA_DIR/test-mnist-dense-with-labels.data \ 26 | http://mnist-data.s3.amazonaws.com/test-mnist-dense-with-labels.data 27 | fi 28 | 29 | $KEYSTONE_DIR/bin/run-pipeline.sh \ 30 | keystoneml.pipelines.images.mnist.MnistRandomFFT \ 31 | --trainLocation $EXAMPLE_DATA_DIR/train-mnist-dense-with-labels.data \ 32 | --testLocation $EXAMPLE_DATA_DIR/test-mnist-dense-with-labels.data \ 33 | --numFFTs $NUM_FFTS \ 34 | --blockSize $BLOCK_SIZE 35 | -------------------------------------------------------------------------------- /examples/images/voc_sift_fisher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #Set environment variables 5 | : ${KEYSTONE_MEM:=12g} 6 | export KEYSTONE_MEM 7 | 8 | KEYSTONE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/../.. 9 | : ${EXAMPLE_DATA_DIR:=$KEYSTONE_DIR/example_data} 10 | 11 | 12 | #Get the data and copy to HDFS if necessary. 13 | if [ ! -f $EXAMPLE_DATA_DIR/VOCtrainval_06-Nov-2007.tar ]; then 14 | wget -O $EXAMPLE_DATA_DIR/VOCtrainval_06-Nov-2007.tar http://s3-us-west-2.amazonaws.com/voc-data/VOCtrainval_06-Nov-2007.tar 15 | fi 16 | 17 | if [ ! -f $EXAMPLE_DATA_DIR/VOCtest_06-Nov-2007.tar ]; then 18 | wget -O $EXAMPLE_DATA_DIR/VOCtest_06-Nov-2007.tar http://s3-us-west-2.amazonaws.com/voc-data/VOCtest_06-Nov-2007.tar 19 | fi 20 | 21 | #Run the pipeline 22 | $KEYSTONE_DIR/bin/run-pipeline.sh \ 23 | keystoneml.pipelines.images.voc.VOCSIFTFisher \ 24 | --trainLocation $EXAMPLE_DATA_DIR/VOCtrainval_06-Nov-2007.tar \ 25 | --testLocation $EXAMPLE_DATA_DIR/VOCtest_06-Nov-2007.tar \ 26 | --labelPath $KEYSTONE_DIR/src/test/resources/images/voclabels.csv \ 27 | --numParts 200 28 | -------------------------------------------------------------------------------- /examples/text/newsgroups_ngrams_tfidf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #Set environment variables 5 | : ${KEYSTONE_MEM:=4g} 6 | export KEYSTONE_MEM 7 | 8 | : ${NUM_PARTS:=256} 9 | : ${NGRAMS:=2} 10 | : ${COMMON_FEATURES:=1000} 11 | 12 | KEYSTONE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/../.. 13 | : ${EXAMPLE_DATA_DIR:=$KEYSTONE_DIR/example_data} 14 | 15 | if [ ! -d $EXAMPLE_DATA_DIR ]; then 16 | mkdir $EXAMPLE_DATA_DIR 17 | fi 18 | 19 | 20 | #Download 20 Newsgroups data if necessary. 21 | if [ ! -f $EXAMPLE_DATA_DIR/20news-bydate.tar.gz ]; then 22 | wget -O $EXAMPLE_DATA_DIR/20news-bydate.tar.gz http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz 23 | tar zxvf $EXAMPLE_DATA_DIR/20news-bydate.tar.gz -C $EXAMPLE_DATA_DIR 24 | fi 25 | 26 | #Run pipeline. 
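# A note on configuration (not part of the original invocation below): the
# `: ${VAR:=default}` lines at the top only set NGRAMS and COMMON_FEATURES when
# they are not already defined, so they can be overridden from the environment
# without editing the script, e.g. (values here are purely illustrative):
#   NGRAMS=3 COMMON_FEATURES=4000 ./examples/text/newsgroups_ngrams_tfidf.sh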
27 | $KEYSTONE_DIR/bin/run-pipeline.sh \ 28 | keystoneml.pipelines.text.NewsgroupsPipeline \ 29 | --trainLocation $EXAMPLE_DATA_DIR/20news-bydate-train \ 30 | --testLocation $EXAMPLE_DATA_DIR/20news-bydate-test \ 31 | --nGrams $NGRAMS \ 32 | --commonFeatures $COMMON_FEATURES 33 | -------------------------------------------------------------------------------- /lib/libImageFeatures.dylib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/lib/libImageFeatures.dylib -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | 2 | sbt.version=0.13.13 3 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Sonatype snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/" 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 4 | 5 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 6 | -------------------------------------------------------------------------------- /sbt/sbt: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | # This script launches sbt for this project. If present it uses the system 21 | # version of sbt. If there is no system version of sbt it attempts to download 22 | # sbt locally. 23 | SBT_VERSION=`awk -F "=" '/sbt\\.version/ {print $2}' ./project/build.properties` 24 | URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 25 | URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar 26 | JAR=sbt/sbt-launch-${SBT_VERSION}.jar 27 | 28 | # Download sbt launch jar if it hasn't been downloaded yet 29 | if [ ! -f ${JAR} ]; then 30 | # Download 31 | printf "Attempting to fetch sbt\n" 32 | if hash curl 2>/dev/null; then 33 | curl --fail --location --silent ${URL1} > ${JAR} || curl --fail --location --silent ${URL2} > ${JAR} 34 | elif hash wget 2>/dev/null; then 35 | wget --quiet ${URL1} -O ${JAR} || wget --quiet ${URL2} -O ${JAR} 36 | else 37 | printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" 38 | exit -1 39 | fi 40 | fi 41 | if [ ! -f ${JAR} ]; then 42 | # We failed to download 43 | printf "Our attempt to download sbt locally to ${JAR} failed. 
Please install sbt manually from http://www.scala-sbt.org/\n" 44 | exit -1 45 | fi 46 | printf "Launching sbt from ${JAR}\n" 47 | 48 | FWDIR="$(cd `dirname $0`/..; pwd)" 49 | 50 | java \ 51 | -Djava.library.path="$FWDIR/lib" \ 52 | -Xmx4000m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ 53 | -jar ${JAR} \ 54 | "$@" 55 | -------------------------------------------------------------------------------- /scripts/constantEstimator.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(reshape) 3 | 4 | nmachines <- 16 5 | bs <- 1024 6 | 7 | flops <- function(solver, n, d, k, sparsity) { 8 | ifelse(solver=="LS - LBFGS", 9 | 20*n*sparsity*d*k/nmachines, 10 | ifelse(solver=="Exact", 11 | n*d*(d+k)/nmachines, 12 | 3*(n*d*(bs+k)/nmachines))) 13 | } 14 | 15 | mem <- function(solver, n, d, k, sparsity) { 16 | ifelse(solver=="LS - LBFGS", 17 | 20*n*d*sparsity/nmachines, 18 | ifelse(solver=="Exact", 19 | n*d/nmachines + d**2, 20 | 3*(n*d/nmachines + d*k))) 21 | } 22 | 23 | network <- function(solver, n, d, k, sparsity) { 24 | ifelse(solver=="LS - LBFGS", 25 | 20*2*d*k*log(nmachines), 26 | ifelse(solver=="Exact", 27 | d*(d+k), 28 | 3*2*(d*(bs+k))*log(nmachines))) 29 | } 30 | 31 | 32 | main <- function () { 33 | x <- read.csv("solver-comparisons-final.csv") 34 | n=list(Amazon=65e6, TIMIT=2.2e6) 35 | k=list(Amazon=2, TIMIT=138) 36 | nnz=list(Amazon=0.005, TIMIT=1.0) 37 | colnames(x) <- c("Experiment", "solver", "d","t","train.error","loss","loss.2") 38 | 39 | 40 | x$n <- as.numeric(n[x$Experiment]) 41 | x$k <- as.numeric(k[x$Experiment]) 42 | x$s <- as.numeric(nnz[x$Experiment]) 43 | 44 | x$cpu <- with(x, flops(solver, n, d, k, s)) 45 | x$mem <- with(x, mem(solver, n, d, k, s)) 46 | x$network <- with(x, network(solver, n, d, k, s)) 47 | 48 | list(data=x, model=lm(t ~ cpu + mem + network, data=x)) 49 | } 50 | 51 | 52 | plotter <- function(res) { 53 | res$data$pred <- predict(res$model, res$data) 54 | dn <- res$data[,c("Experiment","solver","d","t","pred")] 55 | dnm <- melt(dn, id.vars=c("Experiment","solver","d")) 56 | 57 | qplot(d, value, geom='line', color=solver, shape=variable, data=dnm) + 58 | facet_grid(Experiment ~ ., scale="free_y") + 59 | theme_bw() + 60 | geom_point() 61 | } -------------------------------------------------------------------------------- /scripts/solver-comparisons-final.csv: -------------------------------------------------------------------------------- 1 | Experiment,Solver,Num Features,Time (ms),Train Error (%),Loss,Loss/2 2 | Amazon,Exact,1024,186149,15.9,0.4675666294,0.2337833147 3 | Amazon,Block,1024,894313,15.9,0.4675666294,0.2337833147 4 | Amazon,LS - LBFGS,1024,33704,15.5,0.4833497961,0.2416748981 5 | Amazon,Exact,2048,690558,14.5,0.4343235648,0.2171617824 6 | Amazon,Block,2048,1756617,14.4,0.4349389635,0.2174694817 7 | Amazon,LS - LBFGS,2048,33643,13.9,0.4512289158,0.2256144579 8 | Amazon,Block,4096,3476561,13,0.4045493394,0.2022746697 9 | Amazon,LS - LBFGS,4096,40606,12.7,0.4208100192,0.2104050096 10 | Amazon,Block,8192,6889505,12.1,0.3886991737,0.1943495868 11 | Amazon,LS - LBFGS,8192,45407,11.9,0.4045554759,0.2022777379 12 | Amazon,Block,16384,13631976,11.4,0.3761958693,0.1880979346 13 | Amazon,LS - LBFGS,16384,52290,11.4,0.3958041617,0.1979020809 14 | TIMIT,Exact,1024,7323,50.42190579,1.584064323,0.7920321614 15 | TIMIT,Block,1024,33521,50.42190579,1.584064323,0.7920321614 16 | TIMIT,LS - LBFGS,1024,70396,50.40627225,1.583816101,0.7919080506 17 | 
TIMIT,Exact,2048,17949,46.04677894,1.490036052,0.7450180261 18 | TIMIT,Block,2048,61395,46.22247864,1.499799737,0.7498998685 19 | TIMIT,LS - LBFGS,2048,98834,46.32929304,1.497731196,0.7488655978 20 | TIMIT,Exact,4096,76562,42.15900112,1.400043524,0.7000217618 21 | TIMIT,Block,4096,120998,42.52203686,1.414037154,0.7070185769 22 | TIMIT,LS - LBFGS,4096,259498,43.25974465,1.41154238,0.7057711902 23 | TIMIT,Exact,8192,315183,38.63821184,1.314324568,0.6571622842 24 | TIMIT,Block,8192,255570,39.12005362,1.336486736,0.6682433679 25 | TIMIT,LS - LBFGS,8192,810286,40.81220695,1.341040478,0.6705202388 26 | TIMIT,Block,16384,580555,35.73174973,1.265805335,0.6329026676 27 | TIMIT,LS - LBFGS,16384,1589308,39.58150961,1.293037819,0.6465189093 -------------------------------------------------------------------------------- /src/main/cpp/EncEval.h: -------------------------------------------------------------------------------- 1 | /* DO NOT EDIT THIS FILE - it is machine generated */ 2 | #include 3 | /* Header for class keystoneml_utils_external_EncEval */ 4 | 5 | #ifndef _Included_keystoneml_utils_external_EncEval 6 | #define _Included_keystoneml_utils_external_EncEval 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | /* 11 | * Class: keystoneml_utils_external_EncEval 12 | * Method: computeGMM 13 | * Signature: (II[F)[F 14 | */ 15 | JNIEXPORT jfloatArray JNICALL Java_keystoneml_utils_external_EncEval_computeGMM 16 | (JNIEnv *, jobject, jint, jint, jfloatArray); 17 | 18 | /* 19 | * Class: keystoneml_utils_external_EncEval 20 | * Method: calcAndGetFVs 21 | * Signature: ([FII[F[F[F)[F 22 | */ 23 | JNIEXPORT jfloatArray JNICALL Java_keystoneml_utils_external_EncEval_calcAndGetFVs 24 | (JNIEnv *, jobject, jfloatArray, jint, jint, jfloatArray, jfloatArray, jfloatArray); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | #endif 30 | -------------------------------------------------------------------------------- /src/main/cpp/VLFeat.h: -------------------------------------------------------------------------------- 1 | /* DO NOT EDIT THIS FILE - it is machine generated */ 2 | #include 3 | /* Header for class keystoneml_utils_external_VLFeat */ 4 | 5 | #ifndef _Included_keystoneml_utils_external_VLFeat 6 | #define _Included_keystoneml_utils_external_VLFeat 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | /* 11 | * Class: keystoneml_utils_external_VLFeat 12 | * Method: getSIFTs 13 | * Signature: (IIIIII[F)[S 14 | */ 15 | JNIEXPORT jshortArray JNICALL Java_keystoneml_utils_external_VLFeat_getSIFTs 16 | (JNIEnv *, jobject, jint, jint, jint, jint, jint, jint, jfloatArray); 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | #endif 22 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Only pay attention to INFO messages from Keystone. 
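# Other packages can be dialed up or down in the same way; for example
# (hypothetical logger name, shown commented out):
# log4j.logger.com.example.mypipeline=DEBUG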
8 | log4j.logger.keystoneml.pipelines=INFO 9 | log4j.logger.keystoneml.workflow=INFO 10 | log4j.logger.keystoneml.nodes=INFO 11 | log4j.logger.keystoneml.utils=INFO 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/evaluation/AugmentedExamplesEvaluator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.evaluation 2 | 3 | import breeze.linalg._ 4 | import keystoneml.nodes.util.MaxClassifier 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.reflect.ClassTag 8 | 9 | object AggregationPolicyType extends Enumeration { 10 | type AggregationPolicyType = Value 11 | val average, borda = Value 12 | } 13 | 14 | class AugmentedExamplesEvaluator[T : ClassTag]( 15 | names: RDD[T], 16 | numClasses: Int, 17 | policy: AggregationPolicyType.Value = AggregationPolicyType.average) 18 | extends Evaluator[DenseVector[Double], Int, MulticlassMetrics] with Serializable { 19 | 20 | def averagePolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = { 21 | preds.reduce(_ + _) :/ preds.size.toDouble 22 | } 23 | 24 | /** 25 | * Borda averaging works as follows: 26 | * Let s(k) be the ordering of patch k. 27 | * For i in images, 28 | * For k in patches, 29 | * score[i] += s(k)[i] 30 | */ 31 | def bordaPolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = { 32 | val ranks = preds.map { vec => 33 | val sortedPreds = vec.toArray.zipWithIndex.sortBy(_._1).map(_._2) 34 | val rank = DenseVector(sortedPreds.zipWithIndex.sortBy(_._1).map(x => x._2.toDouble)) 35 | rank 36 | } 37 | ranks.reduceLeft(_ + _) 38 | } 39 | 40 | def evaluate( 41 | predicted: RDD[DenseVector[Double]], 42 | actualLabels: RDD[Int]): MulticlassMetrics = { 43 | 44 | val aggFunc = policy match { 45 | case AggregationPolicyType.borda => bordaPolicy _ 46 | case _ => averagePolicy _ 47 | } 48 | 49 | // associate a name with each predicted, actual 50 | val namedPreds = names.zip(predicted.zip(actualLabels)) 51 | 52 | // group by name to get all the predicted values for a name 53 | val groupedPreds = namedPreds.groupByKey(names.partitions.length).map { case (group, iter) => 54 | val predActuals = iter.toArray // this is a array of tuples 55 | val predsForName = predActuals.map(_._1) 56 | assert(predActuals.map(_._2).distinct.size == 1) 57 | val actualForName: Int = predActuals.map(_._2).head 58 | 59 | (predsForName, actualForName) 60 | }.cache() 61 | 62 | // Averaging policy 63 | val finalPred = groupedPreds.map(x => (aggFunc(x._1), x._2) ) 64 | val finalPredictedLabels = MaxClassifier(finalPred.map(_._1)) 65 | val finalActualLabels = finalPred.map(_._2) 66 | 67 | val ret = new MulticlassClassifierEvaluator(numClasses).evaluate(finalPredictedLabels, finalActualLabels) 68 | groupedPreds.unpersist() 69 | ret 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/evaluation/Evaluator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.evaluation 2 | 3 | import org.apache.spark.rdd.RDD 4 | import keystoneml.workflow.PipelineDataset 5 | 6 | /** 7 | * An Evaluator is an object whose "evaluate" method takes a vector of Predictions and a set of Labels (of the same 8 | * length and order) and returns an "Evaluation" which is specific to the domain (binary classification, multi-label 9 | * classification, etc.). 
The Evaluation is typically a set of summary statistics designed to capture the performance 10 | * of a machine learning pipeline. 11 | * 12 | * Because evaluation typically happens at the end of a pipeline, we support the cartesian product of 13 | * {RDD, PipelineDataset} for both sets of arguments. 14 | * 15 | * @tparam P Type of Predictions. 16 | * @tparam L Type of the Labels. 17 | * @tparam E Type of the Evaluation. 18 | */ 19 | trait Evaluator[P,L,E] { 20 | 21 | /** 22 | * Generate an evaluation. 23 | * 24 | * @param predictions Predicted values. 25 | * @param labels True labels. (Same order and length and the predictions). 26 | * 27 | * @return An evaluation. 28 | */ 29 | def evaluate(predictions: RDD[P], labels: RDD[L]): E 30 | 31 | def evaluate(predictions: PipelineDataset[P], labels: RDD[L]): E = evaluate(predictions.get, labels) 32 | 33 | def evaluate(predictions: RDD[P], labels: PipelineDataset[L]): E = evaluate(predictions, labels.get) 34 | 35 | def evaluate(predictions: PipelineDataset[P], labels: PipelineDataset[L]): E = evaluate(predictions.get, labels.get) 36 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/AmazonReviewsDataLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.{SQLContext, SparkSession} 5 | 6 | 7 | object AmazonReviewsDataLoader { 8 | /** 9 | * Loads the Amazon Product Reviews dataset for binary classification. 10 | * Each review is a JSON string with (at least) two fields: "reviewText" and "overAll". 11 | * 12 | * This data loader produces an RDD of labeled reviews. 13 | * 14 | * @param spark SparkSession to use (needed for SQL) 15 | * @param dataDir Directory of the training data 16 | * @param threshold Lowest value at which to consider a review positive. 17 | * @return A Labeled Dataset that contains the data strings and labels. 18 | */ 19 | def apply(spark: SparkSession, dataDir: String, threshold: Double): LabeledData[Int, String] = { 20 | import spark.implicits._ 21 | 22 | val df = spark.read.json(dataDir) 23 | val data = df.select(df("overall"), df("reviewText")) 24 | .map(r => (if(r.getAs[Double](0) >= threshold) 1 else 0, r.getAs[String](1))).rdd 25 | 26 | LabeledData(data) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/CifarLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import java.io.FileInputStream 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.utils.{ImageMetadata, LabeledImage, RowColumnMajorByteArrayVectorizedImage} 8 | 9 | 10 | /** 11 | * Loads images from the CIFAR-10 Dataset. 12 | */ 13 | object CifarLoader { 14 | // We hardcode this because these are properties of the CIFAR-10 dataset. 15 | val nrow = 32 16 | val ncol = 32 17 | val nchan = 3 18 | 19 | val labelSize = 1 20 | 21 | def cifar10ToBufferedImage(cifar: Array[Byte]): RowColumnMajorByteArrayVectorizedImage = { 22 | val byteLen = nrow*ncol*nchan 23 | 24 | // Allocate some space for the rows. 
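    // Record layout (per the CIFAR-10 binary format description): each record is
    // one label byte followed by 3072 pixel bytes -- 1024 red, 1024 green, then
    // 1024 blue values, with each channel stored row-major. The `cifar` array
    // passed in here is only the 3072 pixel bytes; the label byte has already
    // been stripped off by the caller (loadLabeledImages below).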
25 | require(cifar.length == byteLen, "CIFAR-10 Images MUST be 32x32x3.") 26 | 27 | RowColumnMajorByteArrayVectorizedImage(cifar, ImageMetadata(nrow, ncol, nchan)) 28 | } 29 | 30 | def loadLabeledImages(path: String): Seq[LabeledImage] = { 31 | val imgCount = labelSize + nrow*ncol*nchan 32 | 33 | val imageBytes = Array.fill[Byte](imgCount)(0x00) 34 | var out = Array[LabeledImage]() 35 | 36 | val inFile = new FileInputStream(path) 37 | 38 | while(inFile.read(imageBytes, 0, imgCount) > 0) { 39 | val img = cifar10ToBufferedImage(imageBytes.tail) 40 | val label = imageBytes.head.toShort 41 | val li = LabeledImage(img, label) 42 | out = out :+ li 43 | } 44 | out 45 | } 46 | 47 | def apply(sc: SparkContext, path: String): RDD[LabeledImage] = { 48 | val images = CifarLoader.loadLabeledImages(path) 49 | 50 | sc.parallelize(images) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/CsvDataLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Data Loader that loads csv files of comma separated numbers into an RDD of DenseVectors 9 | */ 10 | object CsvDataLoader { 11 | /** 12 | * Load CSV files from the given path into an RDD of DenseVectors 13 | * @param sc The spark context to use 14 | * @param path The path to the CSV files 15 | * @return RDD of DenseVectors, one per CSV row 16 | */ 17 | def apply(sc: SparkContext, path: String): RDD[DenseVector[Double]] = { 18 | sc.textFile(path).map(row => DenseVector(row.split(",").map(_.toDouble))) 19 | } 20 | 21 | /** 22 | * Load CSV files from the given path into an RDD of DenseVectors 23 | * @param sc The spark context to use 24 | * @param path The path to the CSV files 25 | * @param minPartitions The minimum # of partitions to use 26 | * @return RDD of DenseVectors, one per CSV row 27 | */ 28 | def apply(sc: SparkContext, path: String, minPartitions: Int): RDD[DenseVector[Double]] = { 29 | sc.textFile(path, minPartitions).map(row => DenseVector(row.split(",").map(_.toDouble))) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/ImageNetLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.utils.LabeledImage 6 | 7 | /** 8 | * Helper object to loads images from ImageNet Datasets. 9 | */ 10 | 11 | object ImageNetLoader { 12 | 13 | val NUM_CLASSES = 1000 14 | 15 | /** 16 | * Loads images from @dataPath and associates images with the labels provided in @labelPath 17 | * 18 | * @param sc SparkContext to use 19 | * @param dataPath Directory containing tar files (can be a HDFS path). This classes assumes 20 | * that each tar file contains images within a directory. The name of the 21 | * directory is treated as the className. 
22 | * @param labelsPath Local file that maps classNames to a numeric value 23 | */ 24 | def apply(sc: SparkContext, dataPath: String, labelsPath: String): RDD[LabeledImage] = { 25 | val filePathsRDD = ImageLoaderUtils.getFilePathsRDD(sc, dataPath) 26 | 27 | val labelsMapFile = scala.io.Source.fromFile(labelsPath) 28 | val labelsMap = labelsMapFile.getLines().map(x => x.toString).toArray.map { line => 29 | val parts = line.split(" ") 30 | (parts(0), parts(1).toInt) 31 | }.toMap 32 | 33 | def labelsMapF(fname: String): Int = { 34 | labelsMap(fname.split('/')(0)) 35 | } 36 | 37 | ImageLoaderUtils.loadFiles(filePathsRDD, labelsMapF, LabeledImage.apply) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/LabeledData.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | /** 8 | * A case class containing an RDD of labeled data 9 | * @tparam Label The type of the labels 10 | * @tparam Datum The type of the data 11 | */ 12 | case class LabeledData[Label : ClassTag, Datum : ClassTag](labeledData: RDD[(Label, Datum)]) { 13 | val data: RDD[Datum] = labeledData.map(_._2) 14 | val labels: RDD[Label] = labeledData.map(_._1) 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/NewsgroupsDataLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.{RDD, UnionRDD} 5 | 6 | import scala.reflect.ClassTag 7 | 8 | 9 | object NewsgroupsDataLoader { 10 | /** The 20 Newsgroups class labels (and directory names) **/ 11 | val classes = Array( 12 | "comp.graphics", 13 | "comp.os.ms-windows.misc", 14 | "comp.sys.ibm.pc.hardware", 15 | "comp.sys.mac.hardware", 16 | "comp.windows.x", 17 | "rec.autos", 18 | "rec.motorcycles", 19 | "rec.sport.baseball", 20 | "rec.sport.hockey", 21 | "sci.crypt", 22 | "sci.electronics", 23 | "sci.med", 24 | "sci.space", 25 | "misc.forsale", 26 | "talk.politics.misc", 27 | "talk.politics.guns", 28 | "talk.politics.mideast", 29 | "talk.religion.misc", 30 | "alt.atheism", 31 | "soc.religion.christian" 32 | ) 33 | 34 | /** 35 | * Loads the 20 newsgroups dataset. 
36 | * Designed to load data from 20news-bydate.tar.gz from http://qwone.com/~jason/20Newsgroups/ 37 | * 38 | * The expected directory structure for the train and test dirs is: 39 | * train_or_test_dir/class_label/docs_as_separate_plaintext_files 40 | * 41 | * @param sc SparkContext to use 42 | * @param dataDir Directory of the training data 43 | * @return A NewsgroupsData object containing the loaded train & test data as RDDs 44 | */ 45 | def apply(sc: SparkContext, dataDir: String): LabeledData[Int, String] = { 46 | val data: RDD[(Int, String)] = new UnionRDD(sc, classes.zipWithIndex.map{ case (className, index) => { 47 | sc.wholeTextFiles(s"$dataDir/$className").map(index -> _._2) 48 | }}) 49 | 50 | LabeledData(data) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/TimitFeaturesDataLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.collection.mutable 8 | 9 | /** A case class containing loaded pre-featurized TIMIT train & test data */ 10 | case class TimitFeaturesData( 11 | train: LabeledData[Int, DenseVector[Double]], 12 | test: LabeledData[Int, DenseVector[Double]] 13 | ) 14 | 15 | object TimitFeaturesDataLoader { 16 | val timitDimension = 440 17 | val numClasses = 147 18 | 19 | // Assumes lines are formatted as 20 | // row col value 21 | private def parseSparseLabels(fileName: String) = { 22 | // Mapping from row number to label 23 | val ret = new mutable.HashMap[Long, Int] 24 | 25 | val lines = scala.io.Source.fromFile(fileName).getLines() 26 | lines.foreach { line => 27 | val parts = line.split(" ") 28 | ret(parts(0).toLong - 1) = parts(1).toInt 29 | } 30 | ret 31 | } 32 | 33 | private def createLabelsRDD( 34 | labelsMap: mutable.HashMap[Long, Int], 35 | featuresRDD: RDD[_]) = { 36 | val labelsMapBC = featuresRDD.context.broadcast(labelsMap) 37 | val labelsRDD = featuresRDD.zipWithIndex().map { case (item, row) => 38 | labelsMapBC.value(row) - 1 39 | } 40 | labelsRDD 41 | } 42 | 43 | /** 44 | * Loads the pre-featurized Timit data. 
45 | * Expects features data to be stored as a csv of numbers, 46 | * and labels as "row# label" where row# is the number of the row in the data csv it is 47 | * referring to (starting at row #1) 48 | * 49 | * @param sc SparkContext to use 50 | * @param trainDataLocation CSV of the training data 51 | * @param trainLabelsLocation labels of the training data 52 | * @param testDataLocation CSV of the test data 53 | * @param testLabelsLocation labels of the test data 54 | * @param numParts number of partitions per RDD 55 | * @return A TimitFeaturesData object containing the loaded train & test data as RDDs 56 | */ 57 | def apply(sc: SparkContext, 58 | trainDataLocation: String, 59 | trainLabelsLocation: String, 60 | testDataLocation: String, 61 | testLabelsLocation: String, 62 | numParts: Int = 512): TimitFeaturesData = { 63 | val trainData = CsvDataLoader(sc, trainDataLocation, numParts) 64 | val trainLabels = createLabelsRDD(parseSparseLabels(trainLabelsLocation), trainData) 65 | 66 | val testData = CsvDataLoader(sc, testDataLocation, numParts) 67 | val testLabels = createLabelsRDD(parseSparseLabels(testLabelsLocation), testData) 68 | TimitFeaturesData(LabeledData(trainLabels.zip(trainData)), LabeledData(testLabels.zip(testData))) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/loaders/VOCLoader.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines.Logging 6 | import keystoneml.utils.MultiLabeledImage 7 | 8 | 9 | case class VOCDataPath(imagesDirName: String, namePrefix: String, numParts: Option[Int]) 10 | case class VOCLabelPath(labelsFileName: String) 11 | 12 | /** 13 | * A data loader for the VOC 2007 Dataset. Expects input in a tar file. 14 | */ 15 | object VOCLoader extends Logging with Serializable { 16 | val NUM_CLASSES = 20 // This is a constant defined by the VOC 2007 dataset. 17 | 18 | /** 19 | * Loads a data path given a spark context and labels and returns an RDD[MultiLabeledImage]. 20 | * 21 | * A property of the VOC dataset is that images can have multiple labels which we 22 | * have to deal with later in the pipeline. 23 | * 24 | * @param sc A Spark Context 25 | * @param dataPath Path to image tar. 26 | * @param labelsPath Path to label csv. 
27 | * @return 28 | */ 29 | def apply(sc: SparkContext, dataPath: VOCDataPath, labelsPath: VOCLabelPath): RDD[MultiLabeledImage] = { 30 | val filePathsRDD = ImageLoaderUtils.getFilePathsRDD(sc, dataPath.imagesDirName, dataPath.numParts) 31 | 32 | val labelsMapFile = scala.io.Source.fromFile(labelsPath.labelsFileName) 33 | 34 | val labelsMap: Map[String, Array[Int]] = labelsMapFile 35 | .getLines() 36 | .drop(1) 37 | .map(x => x.toString) 38 | .map { line => 39 | val parts = line.split(",") 40 | (parts(4).replace("\"", ""), parts(1).toInt - 1) 41 | } 42 | .toArray 43 | .groupBy(_._1) 44 | .mapValues(_.map(_._2)) 45 | .map(identity) 46 | 47 | labelsMapFile.close() 48 | 49 | ImageLoaderUtils.loadFiles(filePathsRDD, labelsMap, MultiLabeledImage.apply, Some(dataPath.namePrefix)) 50 | } 51 | } 52 | 53 | 54 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/CenterCornerPatcher.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import keystoneml.utils.{ImageUtils, Image} 6 | import keystoneml.pipelines.FunctionNode 7 | 8 | /** 9 | * Extract four corner patches and the center patch of the specified size. 10 | * If flips is set to true, then horizontal flips of all 5 patches is also 11 | * returned 12 | * 13 | * @param patchSizeX size of patch along xDim 14 | * @param patchSizeY size of patch along yDim 15 | * @param horizontalFlips if horizontal flips of patches should also be returned 16 | * @return patches of size patchSizeX x patchSizeY 17 | */ 18 | case class CenterCornerPatcher( 19 | patchSizeX: Int, 20 | patchSizeY: Int, 21 | horizontalFlips: Boolean) extends FunctionNode[RDD[Image], RDD[Image]] { 22 | 23 | def apply(in: RDD[Image]): RDD[Image] = { 24 | in.flatMap { x => 25 | centerCornerPatchImage(x) 26 | } 27 | } 28 | 29 | def centerCornerPatchImage(in: Image): Iterator[Image] = { 30 | val xDim = in.metadata.xDim 31 | val yDim = in.metadata.yDim 32 | 33 | val startXs = Array(0, xDim-patchSizeX, 0, xDim-patchSizeX, (xDim-patchSizeX)/2) 34 | val startYs = Array(0, 0, yDim-patchSizeY, yDim-patchSizeY, (yDim-patchSizeY)/2) 35 | 36 | (0 until startXs.length).iterator.flatMap { idx => 37 | val endX = startXs(idx) + patchSizeX 38 | val endY = startYs(idx) + patchSizeY 39 | val im = ImageUtils.crop(in, startXs(idx), startYs(idx), endX, endY) 40 | if (horizontalFlips) { 41 | val flippedIm = ImageUtils.flipHorizontal(im) 42 | Iterator(im, flippedIm) 43 | } else { 44 | Iterator.single(im) 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/Cropper.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.utils.{ImageUtils, Image} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Crop an input image to the given bounding box described by 8 | * (startX, startY, endX, endY). 
9 | * 10 | * Wrapper for `ImageUtils.crop()` 11 | * 12 | * @param startX x-position (inclusive) to describe upper left corner of BB 13 | * @param startY y-position (inclusive) to describe upper left corner of BB 14 | * @param endX x-position (exclusive) to describe lower right corner of BB 15 | * @param endY y-position (exclusive) to describe lower right corner of BB 16 | * @return new image of size (endX - startX, endY - startY) 17 | */ 18 | case class Cropper(startX: Int, startY: Int, endX: Int, endY: Int) extends Transformer[Image,Image] { 19 | def apply(in: Image): Image = { 20 | ImageUtils.crop(in, startX, startY, endX, endY) 21 | } 22 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/GrayScaler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.workflow.Transformer 4 | import keystoneml.utils.{ImageUtils, Image} 5 | 6 | /** 7 | * Converts an input images to NTSC-standard grayscale. 8 | */ 9 | object GrayScaler extends Transformer[Image,Image] { 10 | def apply(in: Image): Image = ImageUtils.toGrayScale(in) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/ImageVectorizer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines._ 6 | import keystoneml.utils.Image 7 | import keystoneml.workflow.Transformer 8 | 9 | /** 10 | * Takes an image and converts it to a dense vector. 11 | */ 12 | object ImageVectorizer extends Transformer[Image, DenseVector[Double]] { 13 | def apply(in: Image): DenseVector[Double] = { 14 | DenseVector(in.toArray) 15 | } 16 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/LabeledImageExtractors.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.utils.{MultiLabeledImage, Image, LabeledImage} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Extracts a label from a labeled image. 8 | */ 9 | object LabelExtractor extends Transformer[LabeledImage, Int] { 10 | def apply(in: LabeledImage): Int = in.label 11 | } 12 | 13 | /** 14 | * Extracts an image from a labeled image. 15 | */ 16 | object ImageExtractor extends Transformer[LabeledImage, Image] { 17 | def apply(in: LabeledImage): Image = in.image 18 | } 19 | 20 | /** 21 | * Extracts a label from a multi-labeled image. 22 | */ 23 | object MultiLabelExtractor extends Transformer[MultiLabeledImage, Array[Int]] { 24 | override def apply(in: MultiLabeledImage): Array[Int] = in.label 25 | } 26 | 27 | /** 28 | * Extracts an image from a multi-labeled image. 29 | */ 30 | object MultiLabeledImageExtractor extends Transformer[MultiLabeledImage, Image] { 31 | def apply(in: MultiLabeledImage): Image = in.image 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/PixelScaler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.workflow.Transformer 4 | import keystoneml.utils.{ImageUtils, Image} 5 | 6 | 7 | /** 8 | * Rescales an input image from [0 .. 
255] to [0 .. 1]. Works by dividing each pixel by 255.0. 9 | */ 10 | object PixelScaler extends Transformer[Image,Image] { 11 | def apply(im: Image): Image = { 12 | ImageUtils.mapPixels(im, _/255.0) 13 | } 14 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/Pooler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.DenseVector 4 | import keystoneml.pipelines._ 5 | import keystoneml.utils.{ImageMetadata, ChannelMajorArrayVectorizedImage, Image} 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * This node takes an image and performs pooling on regions of the image. 10 | * 11 | * Divides images into fixed size pools, but when fed with images of various 12 | * sizes may produce a varying number of pools. 13 | * 14 | * NOTE: By default strides start from poolSize/2. 15 | * 16 | * @param stride x and y stride to get regions of the image 17 | * @param poolSize size of the patch to perform pooling on 18 | * @param pixelFunction function to apply on every pixel before pooling 19 | * @param poolFunction pooling function to use on every region. 20 | */ 21 | class Pooler( 22 | stride: Int, 23 | poolSize: Int, 24 | pixelFunction: Double => Double, 25 | poolFunction: DenseVector[Double] => Double) 26 | extends Transformer[Image, Image] { 27 | 28 | val strideStart = poolSize / 2 29 | 30 | def apply(image: Image) = { 31 | val xDim = image.metadata.xDim 32 | val yDim = image.metadata.yDim 33 | val numChannels = image.metadata.numChannels 34 | 35 | val numPoolsX = math.ceil((xDim - strideStart).toDouble / stride).toInt 36 | val numPoolsY = math.ceil((yDim - strideStart).toDouble / stride).toInt 37 | val patch = new Array[Double]( numPoolsX * numPoolsY * numChannels) 38 | 39 | // Start at strideStart in (x, y) and 40 | for (x <- strideStart until xDim by stride; 41 | y <- strideStart until yDim by stride) { 42 | // Extract the pool. 
Then apply the pixel and pool functions 43 | 44 | val pool = DenseVector.zeros[Double](poolSize * poolSize) 45 | val startX = x - poolSize/2 46 | val endX = math.min(x + poolSize/2, xDim) 47 | val startY = y - poolSize/2 48 | val endY = math.min(y + poolSize/2, yDim) 49 | 50 | var c = 0 51 | while (c < numChannels) { 52 | var s = startX 53 | while (s < endX) { 54 | var b = startY 55 | while (b < endY) { 56 | pool((s-startX) + (b-startY)*(endX-startX)) = 57 | pixelFunction(image.get(s, b, c)) 58 | b = b + 1 59 | } 60 | s = s + 1 61 | } 62 | patch(c + (x - strideStart)/stride * numChannels + 63 | (y - strideStart)/stride * numPoolsX * numChannels) = poolFunction(pool) 64 | c = c + 1 65 | } 66 | } 67 | ChannelMajorArrayVectorizedImage(patch, ImageMetadata(numPoolsX, numPoolsY, numChannels)) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/RandomImageTransformer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import keystoneml.utils.{ImageUtils, Image} 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Transform an image with the given probability 10 | * 11 | * @param chance probability that an image should be transformed 12 | * @param transform function to apply to image 13 | * @return transformed image or original image 14 | */ 15 | 16 | case class RandomImageTransformer( 17 | chance: Double, 18 | transform: Image => Image, 19 | seed: Long = 12334L) extends Transformer[Image, Image] { 20 | 21 | val rnd = new java.util.Random(seed) 22 | 23 | def apply(im: Image): Image = { 24 | val flip = rnd.nextDouble() 25 | if (flip < chance) { 26 | transform(im) 27 | } else { 28 | im 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/RandomPatcher.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import keystoneml.utils.{ImageUtils, Image} 6 | import keystoneml.pipelines.FunctionNode 7 | 8 | /** 9 | * Extract uniformly random patches from an image 10 | * 11 | * @param numPatches number of random patches to extract 12 | * @param patchSizeX size of each patch along xDim 13 | * @param patchSizeY size of each patch along yDim 14 | * @return numPatches images of size patchSizeX x patchSizeY 15 | */ 16 | case class RandomPatcher( 17 | numPatches: Int, 18 | patchSizeX: Int, 19 | patchSizeY: Int, 20 | seed: Long = 12334L) extends FunctionNode[RDD[Image], RDD[Image]] { 21 | 22 | val rnd = new java.util.Random(seed) 23 | 24 | def apply(in: RDD[Image]): RDD[Image] = { 25 | in.flatMap { x => 26 | randomPatchImage(x) 27 | } 28 | } 29 | 30 | def randomPatchImage(in: Image): Iterator[Image] = { 31 | val xDim = in.metadata.xDim 32 | val yDim = in.metadata.yDim 33 | 34 | (0 until numPatches).iterator.map { x => 35 | val borderSizeX = xDim - patchSizeX 36 | val borderSizeY = yDim - patchSizeY 37 | // Pick a random int between 0 and borderSize (inclusive) 38 | val startX = rnd.nextInt(borderSizeX + 1) 39 | val endX = startX + patchSizeX 40 | val startY = rnd.nextInt(borderSizeY + 1) 41 | val endY = startY + patchSizeY 42 | 43 | ImageUtils.crop(in, startX, startY, endX, endY) 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- 
/src/main/scala/keystoneml/nodes/images/SIFTExtractor.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.DenseMatrix 4 | import keystoneml.workflow.Transformer 5 | import keystoneml.utils.Image 6 | 7 | /** 8 | * Abstract interface for SIFT extractor. 9 | */ 10 | trait SIFTExtractorInterface extends Transformer[Image, DenseMatrix[Float]] -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/SymmetricRectifier.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import keystoneml.pipelines._ 4 | import keystoneml.utils.{ChannelMajorArrayVectorizedImage, Image} 5 | import keystoneml.workflow.Transformer 6 | 7 | case class SymmetricRectifier(maxVal: Double = 0.0, alpha: Double = 0.0) 8 | extends Transformer[Image, Image] { 9 | 10 | def apply(img: Image): Image = { 11 | val res = ChannelMajorArrayVectorizedImage( 12 | new Array[Double](img.metadata.xDim * img.metadata.yDim * img.metadata.numChannels * 2), 13 | img.metadata.copy(numChannels = img.metadata.numChannels * 2)) 14 | 15 | var x, y, c = 0 16 | while (x < img.metadata.xDim) { 17 | y = 0 18 | while (y < img.metadata.yDim) { 19 | c = 0 20 | while (c < img.metadata.numChannels) { 21 | res.put(x, y, c, math.max(maxVal, img.get(x, y, c) - alpha)) 22 | res.put(x, y, c + img.metadata.numChannels, math.max(maxVal, -img.get(x, y, c) - alpha)) 23 | c += 1 24 | } 25 | y += 1 26 | } 27 | x += 1 28 | } 29 | 30 | res 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/Windower.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines.FunctionNode 6 | import keystoneml.utils.{ImageMetadata, ChannelMajorArrayVectorizedImage, Image} 7 | 8 | 9 | /** 10 | * @param stride How big a step to take between patches. 11 | * @param windowSize Size of a patch. 12 | */ 13 | class Windower( 14 | stride: Int, 15 | windowSize: Int) extends FunctionNode[RDD[Image], RDD[Image]] { 16 | 17 | def apply(in: RDD[Image]) = { 18 | in.flatMap(getImageWindow) 19 | } 20 | 21 | def getImageWindow(image: Image) = { 22 | val xDim = image.metadata.xDim 23 | val yDim = image.metadata.yDim 24 | val numChannels = image.metadata.numChannels 25 | 26 | // Start at (0,0) in (x, y) and 27 | (0 until xDim - windowSize + 1 by stride).flatMap { x => 28 | (0 until yDim - windowSize + 1 by stride).map { y => 29 | // Extract the window. 
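        // The copy below is channel-major: the flat index is
        // c + (s - startX) * numChannels + (b - startY) * windowSize * numChannels,
        // so all channels of a given (x, y) position sit contiguously, matching the
        // ChannelMajorArrayVectorizedImage constructed at the end of this method.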
30 | val pool = new DenseVector[Double](windowSize * windowSize * numChannels) 31 | val startX = x 32 | val endX = x + windowSize 33 | val startY = y 34 | val endY = y + windowSize 35 | 36 | var c = 0 37 | while (c < numChannels) { 38 | var s = startX 39 | while (s < endX) { 40 | var b = startY 41 | while (b < endY) { 42 | pool(c + (s-startX)*numChannels + 43 | (b-startY)*(endX-startX)*numChannels) = image.get(s, b, c) 44 | b = b + 1 45 | } 46 | s = s + 1 47 | } 48 | c = c + 1 49 | } 50 | ChannelMajorArrayVectorizedImage(pool.toArray, 51 | ImageMetadata(windowSize, windowSize, numChannels)) 52 | } 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/external/FisherVector.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images.external 2 | 3 | import breeze.linalg._ 4 | import keystoneml.nodes.images.FisherVectorInterface 5 | import keystoneml.nodes.learning.GaussianMixtureModel 6 | import keystoneml.nodes.learning.external.GaussianMixtureModelEstimator 7 | import org.apache.spark.rdd.RDD 8 | import keystoneml.utils.MatrixUtils 9 | import keystoneml.utils.external.EncEval 10 | import keystoneml.workflow.{Transformer, Estimator} 11 | 12 | /** 13 | * Implements a wrapper for the `enceval` Fisher Vector implementation. 14 | * 15 | * @param gmm A trained Gaussian Mixture Model 16 | */ 17 | case class FisherVector( 18 | gmm: GaussianMixtureModel) 19 | extends FisherVectorInterface { 20 | 21 | @transient lazy val extLib = new EncEval() 22 | 23 | val numDims = gmm.means.rows 24 | val numCentroids = gmm.means.cols 25 | val numFeatures = numDims * numCentroids * 2 26 | 27 | override def apply(in: DenseMatrix[Float]): DenseMatrix[Float] = { 28 | val means = convert(gmm.means, Float).toArray 29 | val vars = convert(gmm.variances, Float).toArray 30 | val wts = convert(gmm.weights, Float).toArray 31 | 32 | val fisherVector = extLib.calcAndGetFVs(means, numDims, numCentroids, 33 | vars, wts, in.toArray) 34 | 35 | new DenseMatrix(numDims, numCentroids*2, fisherVector) 36 | } 37 | } 38 | 39 | /** 40 | * Trains an `enceval` Fisher Vector implementation, via 41 | * estimating a GMM by treating each column of the inputs as a separate 42 | * DenseVector input to [[GaussianMixtureModelEstimator]] 43 | * 44 | * TODO: Pending philosophical discussions on how to best make it so you can 45 | * swap in GMM, KMeans++, etc. for Fisher Vectors. For now just hard-codes GMM here 46 | * 47 | * @param k Number of centers to estimate. 
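 *
 * A minimal usage sketch (the RDD name `siftDescriptors` and k = 64 are illustrative):
 * {{{
 *   val fisherVectorizer = EncEvalGMMFisherVectorEstimator(k = 64).fit(siftDescriptors)
 * }}}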
48 | */ 49 | case class EncEvalGMMFisherVectorEstimator(k: Int) extends Estimator[DenseMatrix[Float], DenseMatrix[Float]] { 50 | def fit(data: RDD[DenseMatrix[Float]]): FisherVector = { 51 | val gmmTrainingData = data.flatMap(x => MatrixUtils.matrixToColArray(x).map(i => convert(i, Double))) 52 | val gmmEst = new GaussianMixtureModelEstimator(k) 53 | val gmm = gmmEst.fit(gmmTrainingData) 54 | FisherVector(gmm) 55 | } 56 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/images/external/SIFTExtractor.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images.external 2 | 3 | import breeze.linalg._ 4 | import keystoneml.nodes.images.SIFTExtractorInterface 5 | import org.apache.spark.rdd.RDD 6 | import keystoneml.utils.Image 7 | import keystoneml.utils.external.VLFeat 8 | 9 | /** 10 | * Extracts SIFT Descriptors at dense intervals at multiple scales using the vlfeat C library. 11 | * 12 | * @param stepSize Spacing between each sampled descriptor. 13 | * @param binSize Size of histogram bins for SIFT. 14 | * @param scales Number of scales at which to extract. 15 | */ 16 | class SIFTExtractor(val stepSize: Int = 3, val binSize: Int = 4, val scales: Int = 4, val scaleStep: Int = 1) 17 | extends SIFTExtractorInterface { 18 | @transient lazy val extLib = new VLFeat() 19 | 20 | val descriptorSize = 128 21 | 22 | /** 23 | * Extract SIFTs from an image. 24 | * @param in The input to pass into this pipeline node 25 | * @return The output for the given input 26 | */ 27 | def apply(in: Image): DenseMatrix[Float] = { 28 | val rawDescDataShort = extLib.getSIFTs(in.metadata.xDim, in.metadata.yDim, 29 | stepSize, binSize, scales, scaleStep, in.getSingleChannelAsFloatArray()) 30 | val numCols = rawDescDataShort.length/descriptorSize 31 | val rawDescData = rawDescDataShort.map(s => s.toFloat) 32 | new DenseMatrix(descriptorSize, numCols, rawDescData) 33 | } 34 | } 35 | 36 | object SIFTExtractor { 37 | def apply(stepSize: Int = 3, binSize: Int = 4, scales: Int = 4, scaleStep: Int = 1) = { 38 | new SIFTExtractor(stepSize, binSize, scales, scaleStep) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/CostModel.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | /** 4 | * A trait that represents a known system performance cost model for a solver. 
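 * Implementations estimate the cost of fitting an n x d problem with k outputs on
 * numMachines workers, combining CPU, memory, and network terms via the supplied weights.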
5 | */ 6 | trait CostModel { 7 | def cost( 8 | n: Long, 9 | d: Int, 10 | k: Int, 11 | sparsity: Double, 12 | numMachines: Int, 13 | cpuWeight: Double, 14 | memWeight: Double, 15 | networkWeight: Double) 16 | : Double 17 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/DistributedPCA.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import breeze.stats._ 6 | import com.github.fommil.netlib.LAPACK.{getInstance => lapack} 7 | import org.apache.spark.rdd.RDD 8 | import org.netlib.util.intW 9 | import keystoneml.pipelines._ 10 | import keystoneml.utils.MatrixUtils 11 | import keystoneml.workflow.{Transformer, Estimator} 12 | 13 | import edu.berkeley.cs.amplab.mlmatrix.{RowPartition, NormalEquations, RowPartitionedMatrix, TSQR} 14 | 15 | /** 16 | * Estimates a PCA model for dimensionality reduction using a distributedQR. 17 | * 18 | * @param dims Dimensions to reduce input dataset to. 19 | */ 20 | class DistributedPCAEstimator(dims: Int) extends Estimator[DenseVector[Float], DenseVector[Float]] 21 | with CostModel with Logging { 22 | 23 | /** 24 | * Adapted from the "PCA2" matlab code given in appendix B of this paper: 25 | * https://www.cs.princeton.edu/picasso/mats/PCA-Tutorial-Intuition_jp.pdf 26 | * 27 | * @param samples Features to be reduced. Logically row-major. 28 | * @return A PCA model which will perform dimensionality reduction when applied to data. 29 | */ 30 | def fit(samples: RDD[DenseVector[Float]]): PCATransformer = { 31 | new PCATransformer(computePCA(samples, dims)) 32 | } 33 | 34 | def computePCA(dataMat: RDD[DenseVector[Float]], dims: Int): DenseMatrix[Float] = { 35 | 36 | val mat = new RowPartitionedMatrix(dataMat.mapPartitions { part => 37 | val dblIter = part.map(x => convert(x, Double)) 38 | MatrixUtils.rowsToMatrixIter(dblIter).map(RowPartition(_)) 39 | }) 40 | val means = DenseVector(mat.colSums():_*) :/ mat.numRows().toDouble 41 | 42 | val meansBC = dataMat.context.broadcast(means) 43 | val zeroMeanMat = new RowPartitionedMatrix(mat.rdd.map { part => 44 | RowPartition(part.mat(*, ::) - meansBC.value) 45 | }) 46 | 47 | val rPart = new TSQR().qrR(zeroMeanMat) 48 | 49 | val svd.SVD(u, s, pcaT) = svd(rPart) 50 | 51 | val pca = convert(pcaT.t, Float) 52 | 53 | val matlabConventionPCA = PCAEstimator.enforceMatlabPCASignConvention(pca) 54 | 55 | // Return a subset of the columns. 
56 | matlabConventionPCA(::, 0 until dims) 57 | } 58 | 59 | override def cost( 60 | n: Long, 61 | d: Int, 62 | k: Int, 63 | sparsity: Double, 64 | numMachines: Int, 65 | cpuWeight: Double, 66 | memWeight: Double, 67 | networkWeight: Double): Double = { 68 | val log2NumMachines = math.log(numMachines.toDouble) / math.log(2.0) 69 | val flops = n.toDouble * d * d / numMachines + d.toDouble * d * d * log2NumMachines 70 | val bytesScanned = n.toDouble * d 71 | val network = d.toDouble * d * log2NumMachines 72 | math.max(cpuWeight * flops, memWeight * bytesScanned) + networkWeight * network 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/KernelMatrix.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import scala.collection.mutable.HashMap 4 | import scala.reflect.ClassTag 5 | 6 | import breeze.linalg._ 7 | 8 | import org.apache.spark.rdd.RDD 9 | 10 | import keystoneml.utils.{MatrixUtils, Stats} 11 | import keystoneml.workflow.{Transformer, LabelEstimator} 12 | 13 | /** 14 | * Defines a wrapper to access elements of a symmetric distributed 15 | * matrix that is generated using a kernel function. 16 | */ 17 | trait KernelMatrix { 18 | 19 | /** 20 | * Extract specified columns from the kernel matrix. 21 | * NOTE: This returns a *cached* RDD and unpersist should 22 | * be called at the end of a block. 23 | * 24 | * @param colIdxs the column indexes to extract 25 | * @return A sub-matrix of size n x idxs.size as an RDD. 26 | */ 27 | def apply(colIdxs: Seq[Int]): RDD[DenseMatrix[Double]] 28 | 29 | /** 30 | * Extract a diagonal block from the kernel matrix. 31 | * 32 | * @param idxs the column, row indexes to extract 33 | * @return A local matrix of size idxs.size x idxs.size 34 | */ 35 | def diagBlock(idxs: Seq[Int]): DenseMatrix[Double] 36 | 37 | /** 38 | * Clean up resources associated with a kernel block. 39 | * 40 | * @param colIdxs column indexes corresponding to the block. 41 | */ 42 | def unpersist(colIdxs: Seq[Int]): Unit 43 | } 44 | 45 | /** 46 | * Column-wise block implementation of a kernel matrix. 
47 | * This class uses a kernel transformer to lazily populate the column blocks 48 | * and caches them optionally 49 | */ 50 | class BlockKernelMatrix[T: ClassTag]( 51 | val kernelGen: KernelTransformer[T], 52 | val data: RDD[T], 53 | val cacheKernel: Boolean) 54 | extends KernelMatrix { 55 | 56 | val colBlockCache = HashMap.empty[Seq[Int], RDD[DenseMatrix[Double]]] 57 | val diagBlockCache = HashMap.empty[Seq[Int], DenseMatrix[Double]] 58 | 59 | def apply(colIdxs: Seq[Int]): RDD[DenseMatrix[Double]] = { 60 | if (colBlockCache.contains(colIdxs)) { 61 | colBlockCache(colIdxs) 62 | } else { 63 | val (kBlock, diagBlock) = kernelGen.computeKernel(data, colIdxs) 64 | if (cacheKernel) { 65 | colBlockCache += (colIdxs -> kBlock) 66 | diagBlockCache += (colIdxs -> diagBlock) 67 | } 68 | kBlock 69 | } 70 | } 71 | 72 | def unpersist(colIdxs: Seq[Int]): Unit = { 73 | if (colBlockCache.contains(colIdxs) && !cacheKernel) { 74 | colBlockCache(colIdxs).unpersist(true) 75 | } 76 | } 77 | 78 | def diagBlock(idxs: Seq[Int]): DenseMatrix[Double] = { 79 | if (!diagBlockCache.contains(idxs)) { 80 | val (kBlock, diagBlock) = kernelGen.computeKernel(data, idxs) 81 | if (cacheKernel) { 82 | colBlockCache += (idxs -> kBlock) 83 | diagBlockCache += (idxs -> diagBlock) 84 | } 85 | diagBlock 86 | } else { 87 | diagBlockCache(idxs) 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/LocalLeastSquaresEstimator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import breeze.stats._ 5 | import keystoneml.nodes.stats.StandardScalerModel 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.utils.MatrixUtils 8 | import keystoneml.workflow.LabelEstimator 9 | 10 | /** 11 | * Learns a linear model (OLS) based on training features and training labels. 12 | * Works well when the number of features >> number of examples, and the data fits locally. 13 | * 14 | * @param lambda regularization parameter 15 | */ 16 | class LocalLeastSquaresEstimator(lambda: Double) 17 | extends LabelEstimator[DenseVector[Double], DenseVector[Double], DenseVector[Double]] { 18 | 19 | override def fit( 20 | trainingFeatures: RDD[DenseVector[Double]], 21 | trainingLabels: RDD[DenseVector[Double]]): LinearMapper[DenseVector[Double]] = { 22 | LocalLeastSquaresEstimator.trainWithL2(trainingFeatures, trainingLabels, lambda) 23 | } 24 | } 25 | 26 | object LocalLeastSquaresEstimator { 27 | /** 28 | * Learns a linear model (OLS) based on training features and training labels. 29 | * Works well when the number of features >> number of examples. 30 | * 31 | * @param trainingFeatures Training features. 32 | * @param trainingLabels Training labels. 
33 | * @return A LinearMapper encapsulating the learned linear model. 34 | */ 35 | def trainWithL2( 36 | trainingFeatures: RDD[DenseVector[Double]], 37 | trainingLabels: RDD[DenseVector[Double]], 38 | lambda: Double): LinearMapper[DenseVector[Double]] = { 39 | 40 | val A_parts = trainingFeatures.mapPartitions { x => 41 | MatrixUtils.rowsToMatrixIter(x) 42 | }.collect() 43 | val b_parts = trainingLabels.mapPartitions { x => 44 | MatrixUtils.rowsToMatrixIter(x) 45 | }.collect() 46 | 47 | val A_local = DenseMatrix.vertcat(A_parts:_*) 48 | val b_local = DenseMatrix.vertcat(b_parts:_*) 49 | 50 | val featuresMean = mean(A_local(::, *)).t 51 | val labelsMean = mean(b_local(::, *)).t 52 | 53 | val A_zm = A_local(*, ::) - featuresMean 54 | val b_zm = b_local(*, ::) - labelsMean 55 | 56 | val AAt = A_zm * A_zm.t 57 | val model = A_zm.t * ( (AAt + (DenseMatrix.eye[Double](AAt.rows) :* lambda)) \ b_zm ) 58 | LinearMapper(model, Some(labelsMean), Some(new StandardScalerModel(featuresMean, None))) 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/NaiveBayesModel.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg.{DenseMatrix, DenseVector, Vector} 4 | import org.apache.spark.mllib.classification.NaiveBayes 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.utils.MLlibUtils.breezeVectorToMLlib 8 | import keystoneml.workflow.{Transformer, LabelEstimator} 9 | 10 | import scala.reflect.ClassTag 11 | 12 | /** 13 | * A Multinomial Naive Bayes model that transforms feature vectors to vectors containing 14 | * the log posterior probabilities of the different classes 15 | * 16 | * @param labels list of class labels, ranging from 0 to (C - 1) inclusive 17 | * @param pi log of class priors, whose dimension is C, number of labels 18 | * @param theta log of class conditional probabilities, whose dimension is C-by-D, 19 | * where D is number of features 20 | */ 21 | class NaiveBayesModel[T <: Vector[Double]]( 22 | val labels: Array[Int], 23 | val pi: Array[Double], 24 | val theta: Array[Array[Double]]) extends Transformer[T, DenseVector[Double]] { 25 | 26 | private val brzPi = new DenseVector[Double](pi.length) 27 | private val brzTheta = new DenseMatrix[Double](theta.length, theta(0).length) 28 | 29 | { 30 | // Need to put an extra pair of braces to prevent Scala treating `i` as a member. 31 | var i = 0 32 | while (i < theta.length) { 33 | brzPi(labels(i)) = pi(i) 34 | var j = 0 35 | while (j < theta(i).length) { 36 | brzTheta(labels(i), j) = theta(i)(j) 37 | j += 1 38 | } 39 | i += 1 40 | } 41 | } 42 | 43 | /** 44 | * Transforms a feature vector to a vector containing the log(posterior probabilities) of the different classes 45 | * according to this naive bayes model. 46 | 47 | * @param in The input feature vector 48 | * @return Log-posterior probabilities of the classes for the input features 49 | */ 50 | override def apply(in: T): DenseVector[Double] = { 51 | brzPi + brzTheta * in 52 | } 53 | } 54 | 55 | /** 56 | * A LabelEstimator which learns a multinomial naive bayes model from training data. 57 | * Outputs a Transformer that maps features to vectors containing the log-posterior-probabilities 58 | * of the various classes according to the learned model.
59 | * 60 | * @param lambda The lambda parameter to use for the naive bayes model 61 | */ 62 | case class NaiveBayesEstimator[T <: Vector[Double] : ClassTag](numClasses: Int, lambda: Double = 1.0) 63 | extends LabelEstimator[T, DenseVector[Double], Int] { 64 | override def fit(in: RDD[T], labels: RDD[Int]): NaiveBayesModel[T] = { 65 | val labeledPoints = labels.zip(in).map(x => LabeledPoint(x._1, breezeVectorToMLlib(x._2))) 66 | val model = NaiveBayes.train(labeledPoints, lambda) 67 | 68 | new NaiveBayesModel(model.labels.map(_.toInt), model.pi, model.theta) 69 | } 70 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/SparseLinearMapper.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * Computes A * x + b i.e. a linear map of data using a trained model. 9 | * 10 | * @param x trained model 11 | * @param bOpt optional intercept to add 12 | */ 13 | case class SparseLinearMapper( 14 | x: DenseMatrix[Double], 15 | bOpt: Option[DenseVector[Double]] = None) 16 | extends Transformer[SparseVector[Double], DenseVector[Double]] { 17 | 18 | /** 19 | * Apply a linear model to an input. 20 | * @param in Input. 21 | * @return Output. 22 | */ 23 | def apply(in: SparseVector[Double]): DenseVector[Double] = { 24 | val out = x.t * in 25 | bOpt.foreach { b => 26 | out :+= b 27 | } 28 | 29 | out 30 | } 31 | 32 | /** 33 | * Apply a linear model to a collection of inputs. 34 | * 35 | * @param in Collection of A's. 36 | * @return Collection of B's. 37 | */ 38 | override def apply(in: RDD[SparseVector[Double]]): RDD[DenseVector[Double]] = { 39 | val modelBroadcast = in.context.broadcast(x) 40 | val bBroadcast = in.context.broadcast(bOpt) 41 | in.map(row => { 42 | val out = modelBroadcast.value.t * row 43 | bBroadcast.value.foreach { b => 44 | out :+= b 45 | } 46 | 47 | out 48 | }) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/ZCAWhitener.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import breeze.stats._ 6 | import com.github.fommil.netlib.LAPACK.{getInstance => lapack} 7 | import org.apache.spark.rdd.RDD 8 | import org.netlib.util.intW 9 | import keystoneml.pipelines._ 10 | import keystoneml.workflow.{Transformer, Estimator} 11 | 12 | class ZCAWhitener(val whitener: DenseMatrix[Double], val means: DenseVector[Double]) 13 | extends Transformer[DenseMatrix[Double],DenseMatrix[Double]] { 14 | 15 | def apply(in: DenseMatrix[Double]): DenseMatrix[Double] = { 16 | (in(*, ::) - means) * whitener 17 | } 18 | } 19 | 20 | /** 21 | * Computes a ZCA Whitener, which is intended to rotate an input dataset to identity covariance. 22 | * The "Z" in ZCA Whitening means that the solution will be as close to the original dataset as possible while having 23 | * this identity covariance property. 
24 | * 25 | * See here for more details: 26 | * http://ufldl.stanford.edu/wiki/index.php/Whitening 27 | * 28 | * @param eps Regularization Parameter 29 | */ 30 | class ZCAWhitenerEstimator(val eps: Double = 0.1) 31 | extends Estimator[DenseMatrix[Double],DenseMatrix[Double]] { 32 | 33 | def fit(in: RDD[DenseMatrix[Double]]): ZCAWhitener = { 34 | fitSingle(in.first) 35 | } 36 | 37 | def fitSingle(in: DenseMatrix[Double]): ZCAWhitener = { 38 | val means = (mean(in(::, *))).t 39 | 40 | val whitener: DenseMatrix[Double] = { 41 | val inc = convert(in(*, ::) - means, Float) 42 | val rows = inc.rows 43 | val cols = inc.cols 44 | 45 | val s1 = DenseVector.zeros[Float](math.min(rows, cols)) 46 | val v1 = DenseMatrix.zeros[Float](inc.cols, inc.cols) 47 | 48 | // Get optimal workspace size 49 | // we do this by sending -1 as lwork to the lapack function 50 | val scratch, work = new Array[Float](1) 51 | val info = new intW(0) 52 | 53 | lapack.sgesvd("N", "A", rows, cols, scratch, rows, scratch, null, 1, scratch, cols, work, -1, info) 54 | 55 | val lwork1 = work(0).toInt 56 | val workspace = new Array[Float](lwork1) 57 | 58 | // Perform the SVD with sgesvd 59 | lapack.sgesvd("N", "A", rows, cols, inc.copy.data, rows, s1.data, null, 1, v1.data, cols, workspace, workspace.length, info) 60 | 61 | val s2 = pow(s1, 2.0f) / (rows - 1.0f) 62 | 63 | val sn1 = diag((s2 + eps.toFloat) :^ -0.5f) 64 | 65 | // NOTE: sgesvd returns singular values in the opposite order (when compared to eigenvalues) 66 | // Thus we need v.t * s * v here ? 67 | val svdMat = v1.t * sn1 * v1 68 | 69 | convert(svdMat, Double) 70 | } 71 | 72 | new ZCAWhitener(whitener, means) 73 | 74 | } 75 | } 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/learning/external/GaussianMixtureModelEstimator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning.external 2 | 3 | import breeze.linalg.{convert, DenseMatrix, DenseVector} 4 | import keystoneml.nodes.learning.GaussianMixtureModel 5 | import org.apache.spark.rdd.RDD 6 | import keystoneml.utils.external.EncEval 7 | import keystoneml.workflow.Estimator 8 | 9 | /** 10 | * Fit a Gaussian Mixture model to Data. 11 | * 12 | * @param k Number of centers to estimate. 13 | */ 14 | class GaussianMixtureModelEstimator(k: Int) extends Estimator[DenseVector[Double], DenseVector[Double]] { 15 | 16 | /** 17 | * Currently this model works on items that fit in local memory. 18 | * @param samples 19 | * @return A PipelineNode (Transformer) which can be called on new data. 20 | */ 21 | def fit(samples: RDD[DenseVector[Double]]): GaussianMixtureModel = { 22 | fit(samples.collect) 23 | } 24 | 25 | /** 26 | * Fit a Gaussian mixture model with `k` centers to a sample array. 27 | * 28 | * @param samples Sample Array - all elements must be the same size. 29 | * @return A Gaussian Mixture Model. 30 | */ 31 | def fit(samples: Array[DenseVector[Double]]): GaussianMixtureModel = { 32 | val extLib = new EncEval 33 | val nDim = samples(0).length 34 | 35 | // Flatten this thing out. 36 | val sampleFloats = samples.map(_.toArray.map(_.toFloat)) 37 | val res = extLib.computeGMM(k, nDim, sampleFloats.flatten) 38 | 39 | val meanSize = k*nDim 40 | val varSize = k*nDim 41 | val coefSize = k*nDim 42 | 43 | // Each array region is expected to be centroid-major. 
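    // i.e. res is laid out as [means | variances | weights]; each slice below is reshaped
    // into an nDim x k matrix (the mixing weights into a vector) and converted to Double.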
44 | val means = convert(new DenseMatrix(nDim, k, res.slice(0, meanSize)), Double) 45 | val vars = convert(new DenseMatrix(nDim, k, res.slice(meanSize, meanSize+varSize)), Double) 46 | val coefs = convert(new DenseVector(res.slice(meanSize+varSize, meanSize+varSize+coefSize)), Double) 47 | 48 | new GaussianMixtureModel(means, vars, coefs) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/nlp/CoreNLPFeatureExtractor.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import edu.arizona.sista.processors.Processor 4 | import edu.arizona.sista.processors.fastnlp.FastNLPProcessor 5 | import org.apache.spark.rdd.RDD 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Transformer that uses CoreNLP to (in order): 10 | * - Tokenize document 11 | * - Lemmatize tokens 12 | * - Replace entities w/ their type (e.g. "Jon" => "NAME", "Paris" => "PLACE") 13 | * - Return n-grams for the above (respecting sentence boundaries) 14 | * Note: Much slower than just using [[Tokenizer]] followed by [[NGramsFeaturizer]] 15 | * 16 | * @param orders The size of the n-grams to output 17 | */ 18 | case class CoreNLPFeatureExtractor(orders: Seq[Int]) extends Transformer[String, Seq[String]] { 19 | @transient lazy val proc = new FastNLPProcessor() 20 | 21 | override def apply(in: String): Seq[String] = { 22 | val doc = proc.mkDocument(in) 23 | proc.tagPartsOfSpeech(doc) 24 | proc.lemmatize(doc) 25 | proc.recognizeNamedEntities(doc) 26 | doc.clear() 27 | val out = doc.sentences.map(s => { 28 | val out = new Array[String](s.words.length) 29 | for (i <- 0 to s.words.length - 1) { 30 | out(i) = if (s.entities.get(i) != "O") s.entities.get(i) else normalize(s.lemmas.get(i)) 31 | } 32 | out 33 | }) 34 | orders.map(n => { 35 | out.map(s => { 36 | s.sliding(n).map(gram => gram.mkString(" ")).toList 37 | }).flatMap(identity).toList 38 | }).flatMap(identity).toList 39 | } 40 | 41 | def normalize(s : String): String = { 42 | val pattern = "[^a-zA-Z0-9\\s+]" 43 | pattern.r.replaceAllIn(s,pattern=>"").toLowerCase 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/nlp/HashingTF.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import breeze.linalg.SparseVector 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Converts a sequence of terms to a sparse vector representing their frequencies, 8 | * using the hashing trick: https://en.wikipedia.org/wiki/Feature_hashing 9 | * 10 | * Terms are hashed using Scala's `.##` method. We may want to convert to MurmurHash3 for strings, 11 | * as discussed for Spark's ML Pipelines in https://issues.apache.org/jira/browse/SPARK-10574 12 | * 13 | * @param numFeatures The desired feature space to convert to using the hashing trick. 
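 *
 * A minimal usage sketch (the feature-space size here is illustrative):
 * {{{
 *   val tf = HashingTF[Seq[String]](numFeatures = 1 << 18)
 *   val termCounts = tf(Seq("the", "quick", "brown", "fox"))
 * }}}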
14 | */ 15 | case class HashingTF[T <: Seq[Any]](numFeatures: Int) extends Transformer[T, SparseVector[Double]] { 16 | def nonNegativeMod(x: Int, mod: Int): Int = { 17 | val rawMod = x % mod 18 | rawMod + (if (rawMod < 0) mod else 0) 19 | } 20 | 21 | def apply(document: T): SparseVector[Double] = { 22 | val termFrequencies = scala.collection.mutable.HashMap.empty[Int, Double] 23 | document.foreach { term => 24 | val i = nonNegativeMod(term.##, numFeatures) 25 | termFrequencies.put(i, termFrequencies.getOrElse(i, 0.0) + 1.0) 26 | } 27 | 28 | SparseVector(numFeatures)(termFrequencies.toSeq:_*) 29 | } 30 | } 31 | 32 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/nlp/StringUtils.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import java.util.Locale 4 | 5 | import org.apache.spark.rdd.RDD 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Transformer that tokenizes a String into a Seq[String] by splitting on a regular expression. 10 | * @param sep the delimiting regular expression to split on. 11 | * Defaults to matching all punctuation and whitespace 12 | */ 13 | case class Tokenizer(sep: String = "[\\p{Punct}\\s]+") extends Transformer[String, Seq[String]] { 14 | override def apply(in: String): Seq[String] = in.split(sep) 15 | } 16 | 17 | /** 18 | * Transformer that trims a String of leading and trailing whitespace 19 | */ 20 | object Trim extends Transformer[String, String] { 21 | override def apply(in: String): String = in.trim 22 | } 23 | 24 | /** 25 | * Transformer that converts a String to lower case 26 | * @param locale The locale to use. Defaults to `Locale.getDefault` 27 | */ 28 | case class LowerCase(locale: Locale = Locale.getDefault) extends Transformer[String, String] { 29 | override def apply(in: String): String = in.toLowerCase(locale) 30 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/nlp/WordFrequencyEncoder.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.workflow.{Estimator, Transformer} 6 | 7 | object WordFrequencyEncoder extends Estimator[Seq[String], Seq[Int]] { 8 | private[this] def makeUnigrams(data: RDD[Seq[String]]) = 9 | NGramsCounts[String]().apply(NGramsFeaturizer[String](1 to 1).apply(data)) 10 | 11 | // TODO: alternative approach: collectAsMap once, let driver do the work. 12 | def fit(data: RDD[Seq[String]]): WordFrequencyTransformer = { 13 | val unigramCounts = makeUnigrams(data) 14 | 15 | val wordIndex = unigramCounts 16 | .zipWithIndex() // indexes respect the sorted order 17 | .map { case ((unigram, count), index) => 18 | // valid if # of word types in training data is less than Int.MaxValue 19 | (unigram.words(0), index.asInstanceOf[Int]) 20 | }.collectAsMap() 21 | 22 | val wordIndexBroadcast = unigramCounts.sparkContext.broadcast(wordIndex) 23 | 24 | val unigrams = unigramCounts.map { case (unigram, count) => 25 | (wordIndexBroadcast.value(unigram.words(0)), count) 26 | }.collectAsMap() 27 | 28 | new WordFrequencyTransformer(wordIndexBroadcast, unigrams) 29 | } 30 | 31 | } 32 | 33 | /** 34 | * Encodes string tokens as non-negative integers, which are indices of the 35 | * tokens' positions in the sorted-by-frequency order. 
Out-of-vocabulary words 36 | * are mapped to the special index -1. 37 | * 38 | * The parameters passed to this class are usually calculated by [[WordFrequencyEncoder]]. 39 | * 40 | * @param wordIndexBroadcast A mapping from token string to its frequency-ordered index 41 | * @param unigramCounts the counts of unigrams in the training corpus 42 | */ 43 | class WordFrequencyTransformer( 44 | wordIndexBroadcast: Broadcast[scala.collection.Map[String, Int]], 45 | val unigramCounts: scala.collection.Map[Int, Int]) 46 | extends Transformer[Seq[String], Seq[Int]] { 47 | 48 | final val OOV_INDEX = -1 49 | 50 | override def apply(in: RDD[Seq[String]]): RDD[Seq[Int]] = { 51 | in.mapPartitions { case part => 52 | val index = wordIndexBroadcast.value 53 | part.map(ngram => ngram.map(index.getOrElse(_, OOV_INDEX))) 54 | } 55 | } 56 | 57 | def apply(words: Seq[String]): Seq[Int] = { 58 | val index = wordIndexBroadcast.value 59 | words.map(index.getOrElse(_, OOV_INDEX)) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/CosineRandomFeatures.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import breeze.stats.distributions.Rand 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.pipelines._ 8 | import keystoneml.utils.MatrixUtils 9 | import keystoneml.workflow.Transformer 10 | 11 | /** 12 | * Transformer that extracts random cosine features from a feature vector 13 | * @param W A matrix of dimension (# output features) by (# input features) 14 | * @param b a dense vector of dimension (# output features) 15 | * 16 | * Transformer maps vector x to cos(x * transpose(W) + b). 
17 | * Kernel trick to allow Linear Solver to learn cosine interaction terms of the input 18 | */ 19 | class CosineRandomFeatures( 20 | @transient val W: DenseMatrix[Double], // should be numOutputFeatures by numInputFeatures 21 | @transient val b: DenseVector[Double]) // should be numOutputFeatures by 1 22 | extends Transformer[DenseVector[Double], DenseVector[Double]] { 23 | 24 | require(b.length == W.rows, "# of rows in W and size of b should match") 25 | override def apply(in: RDD[DenseVector[Double]]): RDD[DenseVector[Double]] = { 26 | val wBroadcast = in.sparkContext.broadcast(W) 27 | val bBroadcast = in.sparkContext.broadcast(b) 28 | in.mapPartitions { part => 29 | MatrixUtils.rowsToMatrixIter(part).flatMap { data => 30 | val features: DenseMatrix[Double] = data * wBroadcast.value.t 31 | features(*,::) :+= bBroadcast.value 32 | cos.inPlace(features) 33 | MatrixUtils.matrixToRowArray(features).iterator 34 | } 35 | } 36 | } 37 | 38 | override def apply(in: DenseVector[Double]): DenseVector[Double] = { 39 | val features = (in.t * W.t).t 40 | features :+= b 41 | cos.inPlace(features) 42 | features 43 | } 44 | } 45 | 46 | /** 47 | * Companion Object to generate random cosine features from various distributions 48 | */ 49 | object CosineRandomFeatures { 50 | /** Generate Random Cosine Features from the given distributions **/ 51 | def apply( 52 | numInputFeatures: Int, 53 | numOutputFeatures: Int, 54 | gamma: Double, 55 | wDist: Rand[Double] = Rand.gaussian, 56 | bDist: Rand[Double] = Rand.uniform) = { 57 | val W = DenseMatrix.rand(numOutputFeatures, numInputFeatures, wDist) :* gamma 58 | val b = DenseVector.rand(numOutputFeatures, bDist) :* (2*math.Pi) 59 | new CosineRandomFeatures(W, b) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/LinearRectifier.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseVector 4 | import keystoneml.pipelines._ 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * This transformer applies a Linear Rectifier, 9 | * an activation function defined as: 10 | * f(x) = max({@param maxVal}, x - {@param alpha}) 11 | */ 12 | case class LinearRectifier(maxVal: Double = 0.0, alpha: Double = 0.0) 13 | extends Transformer[DenseVector[Double], DenseVector[Double]] { 14 | def apply(in: DenseVector[Double]): DenseVector[Double] = { 15 | in.map(e => math.max(maxVal, e - alpha)) 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/NormalizeRows.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.{max, sum, DenseVector} 4 | import breeze.numerics._ 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * Divides each row by the max of its two-norm and 2.2e-16. 
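 * The 2.2e-16 floor (roughly double-precision machine epsilon) avoids division by zero
 * for all-zero rows.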
9 | */ 10 | object NormalizeRows extends Transformer[DenseVector[Double], DenseVector[Double]] { 11 | def apply(in: DenseVector[Double]): DenseVector[Double] = { 12 | val norm = max(sqrt(sum(pow(in, 2.0))), 2.2e-16) 13 | in / norm 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/PaddedFFT.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseVector 4 | import breeze.math.Complex 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * This transformer pads input vectors to the nearest power of two, 9 | * then returns the real values of the first half of the fourier transform on the padded vectors. 10 | * 11 | * Goes from vectors of size n to vectors of size nextPositivePowerOfTwo(n)/2 12 | */ 13 | case class PaddedFFT() extends Transformer[DenseVector[Double], DenseVector[Double]] { 14 | override def apply(in: DenseVector[Double]): DenseVector[Double] = { 15 | val paddedSize = nextPositivePowerOfTwo(in.length) 16 | val fft: DenseVector[Complex] = breeze.signal.fourierTr(in.padTo(paddedSize, 0.0).toDenseVector) 17 | fft(0 until (paddedSize / 2)).map(_.real) 18 | } 19 | 20 | def nextPositivePowerOfTwo(i : Int) = 1 << (32 - Integer.numberOfLeadingZeros(i - 1)) 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/RandomSignNode.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import breeze.stats.distributions._ 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * A node that takes in DenseVector[Double] and randomly flips 9 | * the sign of some of the elements 10 | */ 11 | case class RandomSignNode(signs: DenseVector[Double]) 12 | extends Transformer[DenseVector[Double], DenseVector[Double]] { 13 | 14 | def apply(in: DenseVector[Double]): DenseVector[Double] = in :* signs 15 | 16 | } 17 | 18 | object RandomSignNode { 19 | /* Create a random sign node */ 20 | def apply(size: Int, rand: RandBasis = Rand): RandomSignNode = { 21 | val signs = 2.0*convert(DenseVector.rand(size, Binomial(1, 0.5)(rand)), Double) - 1.0 22 | new RandomSignNode(signs) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/Sampling.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.{DenseVector, DenseMatrix} 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines.FunctionNode 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Given a collection of Dense Matrices, this will generate a sample of 10 | * @param numSamplesPerMatrix columns from each matrix. 11 | */ 12 | case class ColumnSampler(numSamplesPerMatrix: Int) 13 | extends Transformer[DenseMatrix[Float], DenseMatrix[Float]] { 14 | 15 | def apply(in: DenseMatrix[Float]): DenseMatrix[Float] = { 16 | val cols = Seq.fill(numSamplesPerMatrix) { 17 | scala.util.Random.nextInt(in.cols) 18 | } 19 | in(::, cols).toDenseMatrix 20 | } 21 | } 22 | 23 | 24 | /** 25 | * Takes a sample of an input RDD of size size. 26 | * @param size Number of elements to return. 
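 * @param seed Seed for the random sample; defaults to 42.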
27 | */ 28 | class Sampler[T](val size: Int, val seed: Int = 42) extends FunctionNode[RDD[T], Array[T]] { 29 | def apply(in: RDD[T]): Array[T] = { 30 | in.takeSample(false, size, seed) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/SignedHellingerMapper.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.{DenseVector, DenseMatrix} 4 | import breeze.numerics._ 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * Apply power normalization: z <- sign(z)|z|^{\rho} 9 | * with \rho = \frac{1}{2} 10 | * This a "signed square root" 11 | */ 12 | object SignedHellingerMapper extends Transformer[DenseVector[Double], DenseVector[Double]] { 13 | def apply(in: DenseVector[Double]): DenseVector[Double] = { 14 | signum(in) :* sqrt(abs(in)) 15 | } 16 | } 17 | 18 | object BatchSignedHellingerMapper extends Transformer[DenseMatrix[Float], DenseMatrix[Float]] { 19 | def apply(in: DenseMatrix[Float]): DenseMatrix[Float] = { 20 | in.map(x => (math.signum(x) * math.sqrt(math.abs(x))).toFloat) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/StandardScaler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseVector 4 | import breeze.numerics.sqrt 5 | import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer 6 | import org.apache.spark.rdd.RDD 7 | import keystoneml.utils.MLlibUtils 8 | import keystoneml.workflow.{Transformer, Estimator} 9 | 10 | /** 11 | * Represents a StandardScaler model that can transform dense vectors. 12 | * 13 | * @param mean column mean values 14 | * @param std column standard deviation values 15 | */ 16 | class StandardScalerModel(val mean: DenseVector[Double], val std: Option[DenseVector[Double]] = None) 17 | extends Transformer[DenseVector[Double], DenseVector[Double]] { 18 | /** 19 | * Applies standardization transformation on a vector. 20 | * 21 | * @param in Vector to be standardized. 22 | * @return Standardized vector. If the std of a column is zero, it will return default `0.0` 23 | * for the column with zero std. 24 | */ 25 | override def apply(in: DenseVector[Double]): DenseVector[Double] = { 26 | val out = in - mean 27 | std.foreach(x => { 28 | out :/= x 29 | }) 30 | out 31 | } 32 | } 33 | 34 | /** 35 | * Standardizes features by removing the mean and scaling to unit std using column summary 36 | * statistics on the samples in the training set. 37 | */ 38 | class StandardScaler(normalizeStdDev: Boolean = true, eps: Double = 1E-12) extends Estimator[DenseVector[Double], DenseVector[Double]]{ 39 | /** 40 | * Computes the mean and variance and stores as a model to be used for later scaling. 41 | * 42 | * @param data The data used to compute the mean and variance to build the transformation model. 
43 | * @return a StandardScalarModel 44 | */ 45 | override def fit(data: RDD[DenseVector[Double]]): StandardScalerModel = { 46 | val summary = data.treeAggregate(new MultivariateOnlineSummarizer)( 47 | (aggregator, data) => aggregator.add(MLlibUtils.breezeVectorToMLlib(data)), 48 | (aggregator1, aggregator2) => aggregator1.merge(aggregator2)) 49 | if (normalizeStdDev) { 50 | new StandardScalerModel( 51 | MLlibUtils.mllibVectorToDenseBreeze(summary.mean), 52 | Some(sqrt(MLlibUtils.mllibVectorToDenseBreeze(summary.variance)) 53 | .map(r => if (r.isNaN | r.isInfinite | math.abs(r) < eps) 1.0 else r))) 54 | } else { 55 | new StandardScalerModel( 56 | MLlibUtils.mllibVectorToDenseBreeze(summary.mean), 57 | None) 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/stats/TermFrequency.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import keystoneml.workflow.Transformer 4 | 5 | /** 6 | * Transformer that maps a Seq[Any] of objects to a Seq[(Any, Double)] of (unique object, weighting_scheme(tf)), 7 | * where tf is the number of times the unique object appeared in the original Seq[Any], 8 | * and the weighting_scheme is a lambda of Double => Double that defaults to the identity function. 9 | * 10 | * As an example, the following would return a transformer that maps a Seq[Any] 11 | * to all objects seen with the log of their count plus 1: 12 | * {{{ 13 | * TermFrequency(x => math.log(x) + 1) 14 | * }}} 15 | * 16 | * @param fun the weighting scheme to apply to the frequencies (defaults to identity) 17 | */ 18 | case class TermFrequency[T](fun: Double => Double = identity) extends Transformer[Seq[T], Seq[(T, Double)]] { 19 | override def apply(in: Seq[T]): Seq[(T, Double)] = in.groupBy(identity).mapValues(x => fun(x.size)).toSeq 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/AllSparseFeatures.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.SparseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.workflow.Estimator 6 | 7 | import scala.reflect.ClassTag 8 | 9 | /** 10 | * An Estimator that chooses all sparse features observed when training, 11 | * and produces a transformer which builds a sparse vector out of them. 
12 | * 13 | * Deterministically orders the feature mappings by earliest appearance in the RDD 14 | */ 15 | case class AllSparseFeatures[T: ClassTag]() extends Estimator[Seq[(T, Double)], SparseVector[Double]] { 16 | override def fit(data: RDD[Seq[(T, Double)]]): SparseFeatureVectorizer[T] = { 17 | val featureOccurrences = data.flatMap(_.map(_._1)) 18 | // zip with unique ids and take the smallest unique id for a given feature to get 19 | // a deterministic ordering 20 | val featuresWithUniqueId = featureOccurrences.zipWithUniqueId().reduceByKey { 21 | (x, y) => Math.min(x, y) 22 | } 23 | val featureSpace = featuresWithUniqueId.sortBy(_._2).map(_._1) 24 | .collect().zipWithIndex.toMap 25 | new SparseFeatureVectorizer(featureSpace) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Cacher.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import org.apache.spark.rdd.RDD 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.workflow.Transformer 6 | 7 | import scala.reflect.ClassTag 8 | 9 | /** 10 | * Caches an RDD at a given point within a Pipeline. Follows Spark's lazy evaluation conventions. 11 | * 12 | * @param name An optional name to set on the cached output. Useful for debugging. 13 | * @tparam T Type of the input to cache. 14 | */ 15 | case class Cacher[T: ClassTag](name: Option[String] = None) extends Transformer[T,T] with Logging { 16 | override def apply(in: RDD[T]): RDD[T] = { 17 | logInfo(s"CACHING ${name.getOrElse(in.id)}") 18 | name match { 19 | case Some(x) => in.cache().setName(x) 20 | case None => in.cache() 21 | } 22 | } 23 | 24 | override def apply(in: T): T = in 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/ClassLabelIndicators.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines._ 6 | import keystoneml.workflow.Transformer 7 | 8 | /** 9 | * Given a class label, returns a binary vector that indicates when that class is present. 10 | * 11 | * Expects labels in the range [0, numClasses) and numClasses > 1. 12 | * 13 | * @param numClasses 14 | */ 15 | case class ClassLabelIndicatorsFromIntLabels(numClasses: Int) 16 | extends Transformer[Int, DenseVector[Double]] { 17 | 18 | assert(numClasses > 1, "numClasses must be > 1.") 19 | 20 | def apply(in: Int): DenseVector[Double] = { 21 | if(in < 0 || in >= numClasses) { 22 | throw new RuntimeException("Class labels are expected to be in the range [0, numClasses)") 23 | } 24 | 25 | val indicatorVector = DenseVector.fill(numClasses, -1.0) 26 | indicatorVector(in) = 1.0 27 | indicatorVector 28 | } 29 | } 30 | 31 | /** 32 | * Given a set of class labels, returns a binary vector that indicates when each class is present. 33 | * 34 | * Expects labels in the range [0, numClasses) and numClasses > 1. 
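 * For example, with numClasses = 4 the label array Array(0, 2) maps to
 * DenseVector(1.0, -1.0, 1.0, -1.0).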
35 | * 36 | * @param numClasses 37 | */ 38 | case class ClassLabelIndicatorsFromIntArrayLabels(numClasses: Int, validate: Boolean = false) 39 | extends Transformer[Array[Int], DenseVector[Double]] { 40 | 41 | assert(numClasses > 1, "numClasses must be > 1.") 42 | 43 | def apply(in: Array[Int]): DenseVector[Double] = { 44 | if(validate && (in.max >= numClasses || in.min < 0)) { 45 | throw new RuntimeException("Class labels are expected to be in the range [0, numClasses)") 46 | } 47 | 48 | val indicatorVector = DenseVector.fill(numClasses, -1.0) 49 | var i = 0 50 | while (i < in.length) { 51 | indicatorVector(in(i)) = 1.0 52 | i += 1 53 | } 54 | indicatorVector 55 | } 56 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/CommonSparseFeatures.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.SparseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.workflow.Estimator 6 | 7 | import scala.collection.JavaConversions._ 8 | import scala.reflect.ClassTag 9 | 10 | /** 11 | * An Estimator that chooses the most frequently observed sparse features when training, 12 | * and produces a transformer which builds a sparse vector out of them 13 | * 14 | * Deterministically orders the feature mappings first by decreasing number of appearances, 15 | * then by earliest appearance in the RDD 16 | * 17 | * @param numFeatures The number of features to keep 18 | */ 19 | case class CommonSparseFeatures[T : ClassTag](numFeatures: Int) extends Estimator[Seq[(T, Double)], SparseVector[Double]] { 20 | // Ordering that compares (feature, frequency) pairs according to their frequencies 21 | val ordering = new Ordering[(T, (Int, Long))] { 22 | def compare(x: (T, (Int, Long)), y: (T, (Int, Long))): Int = { 23 | if (x._2._1 == y._2._1) { 24 | x._2._2.compare(y._2._2) 25 | } else { 26 | x._2._1.compare(y._2._1) 27 | } 28 | } 29 | } 30 | 31 | /** This method merges two seqs and keeps the top numFeatures */ 32 | def merge(a: Seq[(T, (Int, Long))], b: Seq[(T, (Int, Long))]): Seq[(T, (Int, Long))] = { 33 | (a ++ b).sorted(ordering.reverse).take(numFeatures) 34 | } 35 | 36 | override def fit(data: RDD[Seq[(T, Double)]]): SparseFeatureVectorizer[T] = { 37 | val featureOccurrences = data.flatMap(identity).zipWithUniqueId().map(x => (x._1._1, (1, x._2))) 38 | // zip with unique ids and take the smallest unique id for a given feature to get 39 | // a deterministic ordering 40 | val featureFrequenciesWithUniqueId = featureOccurrences.reduceByKey { 41 | (x, y) => (x._1 + y._1, Math.min(x._2, y._2)) 42 | } 43 | val mapRDDs = featureFrequenciesWithUniqueId mapPartitions { items => 44 | // Priority keeps the largest elements, so let's reverse the ordering. 45 | Iterator.single(takeOrdered(items, numFeatures)(ordering.reverse)) 46 | } 47 | val mostCommonFeatures = mapRDDs.treeReduce(merge).map(_._1) 48 | 49 | val featureSpace = mostCommonFeatures.zipWithIndex.toMap 50 | new SparseFeatureVectorizer(featureSpace) 51 | } 52 | 53 | /** 54 | * Returns the first K elements from the input as defined by the specified implicit Ordering[T] 55 | * and maintains the ordering. 
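 * Backed by Guava's Ordering.leastOf over the input iterator.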
56 | */ 57 | def takeOrdered[T](input: Iterator[T], num: Int)(implicit ord: Ordering[T]): Seq[T] = { 58 | val ordering = new com.google.common.collect.Ordering[T] { 59 | override def compare(l: T, r: T) = ord.compare(l, r) 60 | } 61 | ordering.leastOf(asJavaIterator(input), num) 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Densify.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import breeze.linalg.Vector 5 | import keystoneml.workflow.Transformer 6 | 7 | /** 8 | * Transformer to densify vectors into DenseVectors. 9 | */ 10 | case class Densify[T <: Vector[Double]]() extends Transformer[T, DenseVector[Double]] { 11 | /** 12 | * Apply this Transformer to a single input item 13 | * 14 | * @param in The input item to pass into this transformer 15 | * @return The output value 16 | */ 17 | override def apply(in: T): DenseVector[Double] = in match { 18 | case dense: DenseVector[Double] => dense 19 | case _ => in.toDenseVector 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/FloatToDouble.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg._ 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Converts float matrix to a double matrix. 8 | */ 9 | object FloatToDouble extends Transformer[DenseMatrix[Float], DenseMatrix[Double]] { 10 | def apply(in: DenseMatrix[Float]): DenseMatrix[Double] = convert(in, Double) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Identity.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import org.apache.spark.rdd.RDD 4 | import keystoneml.workflow.Transformer 5 | 6 | import scala.reflect.ClassTag 7 | 8 | /** 9 | * This class performs a no-op on its input. 10 | * 11 | * @tparam T Type of the input and, by definition, output. 12 | */ 13 | class Identity[T: ClassTag] extends Transformer[T,T] { 14 | def apply(in: T): T = in 15 | override def apply(in: RDD[T]): RDD[T] = in 16 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/MatrixVectorizer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{DenseMatrix, DenseVector} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Flattens a matrix into a vector. 
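 * For a standard (column-major) Breeze matrix, the result is the matrix's columns stacked end to end.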
8 | */ 9 | object MatrixVectorizer extends Transformer[DenseMatrix[Double], DenseVector[Double]] { 10 | def apply(in: DenseMatrix[Double]): DenseVector[Double] = in.toDenseVector 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/MaxClassifier.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{DenseVector, argmax} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Transformer that returns the index of the largest value in the vector 8 | */ 9 | object MaxClassifier extends Transformer[DenseVector[Double], Int] { 10 | override def apply(in: DenseVector[Double]): Int = argmax(in) 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Shuffler.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import org.apache.spark.rdd.RDD 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.workflow.Transformer 6 | 7 | import scala.reflect.ClassTag 8 | 9 | /** 10 | * Randomly shuffle the rows of an RDD within a pipeline. Uses a shuffle operation in Spark. 11 | * 12 | * @param numParts An optional parameter indicating the number of output partitions. 13 | * @tparam T Type of the input to shuffle. 14 | */ 15 | class Shuffler[T: ClassTag](numParts: Option[Int] = None) extends Transformer[T,T] with Logging { 16 | override def apply(in: RDD[T]): RDD[T] = { 17 | val numToRepartition = numParts.getOrElse(in.partitions.size) 18 | in.repartition(numToRepartition) 19 | } 20 | 21 | override def apply(in: T): T = in 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/SparseFeatureVectorizer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.SparseVector 4 | import keystoneml.workflow.Transformer 5 | 6 | /** A transformer which given a feature space, maps features of the form (feature id, value) into a sparse vector */ 7 | class SparseFeatureVectorizer[T](featureSpace: Map[T, Int]) extends Transformer[Seq[(T, Double)], SparseVector[Double]] { 8 | private def transformVector(in: Seq[(T, Double)], featureSpaceMap: Map[T, Int]): SparseVector[Double] = { 9 | val features = in.map(f => (featureSpaceMap.get(f._1), f._2)) 10 | .filter(_._1.isDefined) 11 | .map(f => (f._1.get, f._2.toDouble)) 12 | SparseVector(featureSpaceMap.size)(features:_*) 13 | } 14 | 15 | override def apply(in: Seq[(T, Double)]): SparseVector[Double] = { 16 | transformVector(in, featureSpace) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/Sparsify.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{SparseVector, DenseVector, Vector} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Transformer to convert vectors into SparseVectors. 
8 | */ 9 | case class Sparsify[T <: Vector[Double]]() extends Transformer[T, SparseVector[Double]] { 10 | /** 11 | * Apply this Transformer to a single input item 12 | * 13 | * @param in The input item to pass into this transformer 14 | * @return The output value 15 | */ 16 | override def apply(in: T): SparseVector[Double] = in match { 17 | case sparse: SparseVector[Double] => sparse 18 | case _ => SparseVector(in.toArray) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/TopKClassifier.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{DenseVector, argtopk} 4 | import keystoneml.workflow.Transformer 5 | 6 | /** 7 | * Transformer that returns the indices of the largest k values of the vector, in order 8 | */ 9 | class TopKClassifier(k: Int) extends Transformer[DenseVector[Double], Array[Int]] { 10 | override def apply(in: DenseVector[Double]): Array[Int] = { 11 | in.toArray.zipWithIndex.sortBy(-_._1).take(k).map(_._2) 12 | } 13 | } 14 | 15 | /** 16 | * Object to allow creating top k classifier w/o new 17 | */ 18 | object TopKClassifier { 19 | def apply(k: Int) = new TopKClassifier(k) 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/VectorCombiner.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.{DenseMatrix, DenseVector} 4 | import keystoneml.workflow.Transformer 5 | 6 | import scala.reflect.ClassTag 7 | 8 | /** 9 | * Concats a Seq of DenseVectors into a single DenseVector. 10 | */ 11 | case class VectorCombiner[T : ClassTag]()(implicit zero: breeze.storage.Zero[T]) 12 | extends Transformer[Seq[DenseVector[T]], DenseVector[T]] { 13 | def apply(in: Seq[DenseVector[T]]): DenseVector[T] = DenseVector.vertcat(in:_*) 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/nodes/util/VectorSplitter.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.rdd.RDD 5 | import keystoneml.pipelines.FunctionNode 6 | 7 | /** 8 | * This transformer splits the input vector into a number of blocks. 
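 *
 * Each block has length `blockSize`, except possibly the last, which may be shorter. For example
 * (illustrative), with `blockSize = 4` a 10-dimensional vector is split into blocks of length 4, 4, and 2:
 * {{{
 *   val splitter = new VectorSplitter(blockSize = 4)
 *   splitter.splitVector(DenseVector.rand(10)).map(_.length)  // => Seq(4, 4, 2)
 * }}}
 *
 * @param blockSize Maximum length of each block.
 * @param numFeaturesOpt Optional total number of features; if absent, the length of the input vector
 *                       (or of the first vector in the RDD) is used.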
9 | */ 10 | class VectorSplitter( 11 | blockSize: Int, 12 | numFeaturesOpt: Option[Int] = None) 13 | extends FunctionNode[RDD[DenseVector[Double]], Seq[RDD[DenseVector[Double]]]] { 14 | 15 | override def apply(in: RDD[DenseVector[Double]]): Seq[RDD[DenseVector[Double]]] = { 16 | val numFeatures = numFeaturesOpt.getOrElse(in.first.length) 17 | val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt 18 | (0 until numBlocks).map { blockNum => 19 | in.map { vec => 20 | // Explicitly call toArray as breeze's slice is lazy 21 | val end = math.min(numFeatures, (blockNum + 1) * blockSize) 22 | DenseVector(vec.slice(blockNum * blockSize, end).toArray) 23 | } 24 | } 25 | } 26 | 27 | def splitVector(in: DenseVector[Double]): Seq[DenseVector[Double]] = { 28 | val numFeatures = numFeaturesOpt.getOrElse(in.length) 29 | val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt 30 | (0 until numBlocks).map { blockNum => 31 | // Explicitly call toArray as breeze's slice is lazy 32 | val end = math.min(numFeatures, (blockNum + 1) * blockSize) 33 | DenseVector(in.slice(blockNum * blockSize, end).toArray) 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/pipelines/FunctionNode.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.pipelines 2 | 3 | abstract class FunctionNode[A,B] extends (A => B) with Serializable -------------------------------------------------------------------------------- /src/main/scala/keystoneml/pipelines/Logging.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.pipelines 2 | 3 | import org.slf4j.{Logger, LoggerFactory} 4 | 5 | /** 6 | * Utility trait for Logging 7 | */ 8 | trait Logging { 9 | // Make the log field transient so that objects with Logging can 10 | // be serialized and used on another machine 11 | @transient private var log_ : Logger = null 12 | 13 | // Method to get or create the logger for this object 14 | protected def log: Logger = { 15 | if (log_ == null) { 16 | var className = this.getClass.getName 17 | // Ignore trailing $'s in the class names for Scala objects 18 | if (className.endsWith("$")) { 19 | className = className.substring(0, className.length - 1) 20 | } 21 | log_ = LoggerFactory.getLogger(className) 22 | } 23 | log_ 24 | } 25 | 26 | // Log methods that take only a String 27 | protected def logInfo(msg: => String) { 28 | if (log.isInfoEnabled) log.info(msg) 29 | } 30 | 31 | protected def logDebug(msg: => String) { 32 | if (log.isDebugEnabled) log.debug(msg) 33 | } 34 | 35 | protected def logTrace(msg: => String) { 36 | if (log.isTraceEnabled) log.trace(msg) 37 | } 38 | 39 | protected def logWarning(msg: => String) { 40 | if (log.isWarnEnabled) log.warn(msg) 41 | } 42 | 43 | protected def logError(msg: => String) { 44 | if (log.isErrorEnabled) log.error(msg) 45 | } 46 | 47 | // Log methods that take Throwables (Exceptions/Errors) too 48 | protected def logInfo(msg: => String, throwable: Throwable) { 49 | if (log.isInfoEnabled) log.info(msg, throwable) 50 | } 51 | 52 | protected def logDebug(msg: => String, throwable: Throwable) { 53 | if (log.isDebugEnabled) log.debug(msg, throwable) 54 | } 55 | 56 | protected def logTrace(msg: => String, throwable: Throwable) { 57 | if (log.isTraceEnabled) log.trace(msg, throwable) 58 | } 59 | 60 | protected def logWarning(msg: => String, throwable: Throwable) { 61 | if (log.isWarnEnabled) log.warn(msg,
throwable) 62 | } 63 | 64 | protected def logError(msg: => String, throwable: Throwable) { 65 | if (log.isErrorEnabled) log.error(msg, throwable) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/pipelines/nlp/StupidBackoffPipeline.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.pipelines.nlp 2 | 3 | import keystoneml.nodes.nlp._ 4 | 5 | import org.apache.spark.{SparkContext, SparkConf} 6 | 7 | import scopt.OptionParser 8 | 9 | object StupidBackoffPipeline { 10 | 11 | val appName = "StupidBackoffPipeline" 12 | 13 | case class StupidBackoffConfig(trainData: String = "", numParts: Int = 16, n: Int = 3) 14 | 15 | def parse(args: Array[String]): StupidBackoffConfig = 16 | new OptionParser[StupidBackoffConfig](appName) { 17 | head(appName, "0.1") 18 | opt[String]("trainData") required() action { (x, c) => c.copy(trainData = x) } 19 | opt[String]("numParts") required() action { (x, c) => c.copy(numParts = x.toInt) } 20 | opt[String]("n") optional() action { (x, c) => c.copy(n = x.toInt) } 21 | }.parse(args, StupidBackoffConfig()).get 22 | 23 | def main(args: Array[String]) { 24 | val appConfig = parse(args) 25 | val conf = new SparkConf().setAppName(appName) 26 | conf.setIfMissing("spark.master", "local[4]") 27 | val sc = new SparkContext(conf) 28 | 29 | val text = Tokenizer()(sc.textFile(appConfig.trainData, appConfig.numParts)) 30 | 31 | /** Vocab generation step */ 32 | val frequencyEncode = WordFrequencyEncoder.fit(text) 33 | val unigramCounts = frequencyEncode.unigramCounts 34 | 35 | /** NGram (n >= 2) generation step */ 36 | val makeNGrams = frequencyEncode andThen NGramsFeaturizer(2 to appConfig.n) 37 | 38 | val ngramCounts = NGramsCounts[Int](NGramsCountsMode.NoAdd).apply( 39 | makeNGrams(text).get) 40 | 41 | /** Stupid backoff scoring step */ 42 | val stupidBackoff = StupidBackoffEstimator[Int](unigramCounts) 43 | val languageModel = stupidBackoff.fit(ngramCounts) 44 | 45 | /** Done: save or serve */ 46 | languageModel.scoresRDD.cache() 47 | println( 48 | s"""|number of tokens: ${languageModel.numTokens} 49 | |size of vocabulary: ${languageModel.unigramCounts.size} 50 | |number of ngrams: ${languageModel.scoresRDD.count()} 51 | |""".stripMargin) 52 | println("trained scores of 100 ngrams in the corpus:") 53 | languageModel.scoresRDD.take(100).foreach(println) 54 | 55 | sc.stop() 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/utils/MLlibUtils.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils 2 | 3 | import breeze.linalg.{SparseVector, DenseMatrix, DenseVector} 4 | 5 | /** 6 | * Provides conversions between MLlib vectors & matrices, and Breeze vectors & matrices 7 | */ 8 | object MLlibUtils { 9 | 10 | /** Convert an MLlib vector to a Breeze dense vector */ 11 | def mllibVectorToDenseBreeze(vector: org.apache.spark.mllib.linalg.Vector): DenseVector[Double] = { 12 | vector match { 13 | case dense: org.apache.spark.mllib.linalg.DenseVector => new DenseVector[Double](dense.values) 14 | case _ => new DenseVector[Double](vector.toArray) 15 | } 16 | } 17 | 18 | /** Convert an MLlib matrix to a Breeze dense matrix */ 19 | def mllibMatrixToDenseBreeze(matrix: org.apache.spark.mllib.linalg.Matrix): DenseMatrix[Double] = { 20 | matrix match { 21 | case dense: org.apache.spark.mllib.linalg.DenseMatrix => { 22 | if 
(!dense.isTransposed) { 23 | new DenseMatrix[Double](dense.numRows, dense.numCols, dense.values) 24 | } else { // Row-major data: build the transpose, then flip it back 25 | val breezeMatrix = new DenseMatrix[Double](dense.numCols, dense.numRows, dense.values) 26 | breezeMatrix.t 27 | } 28 | } 29 | 30 | case _ => new DenseMatrix[Double](matrix.numRows, matrix.numCols, matrix.toArray) 31 | } 32 | } 33 | 34 | /** Convert a Breeze vector to an MLlib vector, maintaining underlying data structure (sparse vs dense) */ 35 | def breezeVectorToMLlib(breezeVector: breeze.linalg.Vector[Double]): org.apache.spark.mllib.linalg.Vector = { 36 | breezeVector match { 37 | case v: DenseVector[Double] => 38 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { 39 | new org.apache.spark.mllib.linalg.DenseVector(v.data) 40 | } else { 41 | new org.apache.spark.mllib.linalg.DenseVector(v.toArray) // Can't use underlying array directly, so make a new one 42 | } 43 | case v: SparseVector[Double] => 44 | if (v.index.length == v.used) { 45 | new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index, v.data) 46 | } else { 47 | new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) 48 | } 49 | case v: breeze.linalg.Vector[_] => 50 | sys.error("Unsupported Breeze vector type: " + v.getClass.getName) 51 | } 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/utils/external/EncEval.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils.external 2 | 3 | class EncEval extends Serializable { 4 | System.loadLibrary("ImageFeatures") // This will load libImageFeatures.{so,dylib} from the library path. 5 | 6 | /** 7 | * Compute a mixture of Gaussians given a set of sample points. 8 | * @param nGauss Number of Gaussians to estimate. 9 | * @param nDim Number of dimensions of each sample. 10 | * @param samples The samples (in sample-major order). 11 | * @return The Gaussians, their variances, and their weights in a single flat array. (Center-major order). 12 | */ 13 | @native 14 | def computeGMM(nGauss: Int, nDim: Int, samples: Array[Float]): Array[Float] 15 | 16 | /** 17 | * Calculates Fisher Vectors for a set of descriptors given a GMM. 18 | * 19 | * @param means Means - flat array in center-major order. 20 | * @param dims Number of dimensions of each center. 21 | * @param numClusters Number of GMM cluster centers. 22 | * @param covariances The variances of the GMM centers in center-major order. 23 | * @param priors The weights of the GMM in center order. 24 | * @param dSiftDescriptors Bag of descriptors for which to compute Fisher Vectors. 25 | * @return The Fisher Vector for the input descriptors. 26 | */ 27 | @native 28 | def calcAndGetFVs(means: Array[Float], dims: Int, numClusters: Int, covariances: Array[Float], 29 | priors: Array[Float], dSiftDescriptors: Array[Float]) : Array[Float] 30 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/utils/external/VLFeat.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils.external 2 | 3 | class VLFeat extends Serializable { 4 | System.loadLibrary("ImageFeatures") // This will load libImageFeatures.{so,dylib} from the library path. 5 | 6 | /** 7 | * Gets SIFT Descriptors at Multiple Scales emulating the `vl_phow` MATLAB routine. 8 | * Under the hood it uses vl_dsift from the vlfeat library.
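 * Descriptors are returned as one flat array; with the standard SIFT geometry each descriptor
 * occupies 128 shorts, so the array length is a multiple of 128.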
9 | * 10 | * @param width Image Width. 11 | * @param height Image Height. 12 | * @param step Step size at which to sample SIFT descriptors. 13 | * @param bin SIFT Descriptor bin size. 14 | * @param numScales Number of scales to extract at. 15 | * @param image Input image as float array. 16 | * @return SIFTs as Shorts. 17 | */ 18 | @native 19 | def getSIFTs( 20 | width: Int, 21 | height: Int, 22 | step: Int, 23 | bin: Int, 24 | numScales: Int, 25 | scaleStep: Int, 26 | image: Array[Float]): Array[Short] 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/ChainUtils.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | /** 8 | * A chain of two Transformers in a row (as a Transformer) 9 | * @param first 10 | * @param second 11 | */ 12 | case class TransformerChain[A, B, C : ClassTag](first: Transformer[A, B], second: Transformer[B, C]) extends Transformer[A, C] { 13 | override def apply(in: A): C = second(first(in)) 14 | override def apply(in: RDD[A]): RDD[C] = second(first(in)) 15 | } 16 | 17 | /** 18 | * A chain of a Transformer followed by an Estimator (as an Estimator) 19 | * @param first 20 | * @param second 21 | */ 22 | case class TransformerEstimatorChain[A, B, C : ClassTag](first: Transformer[A, B], second: Estimator[B, C]) 23 | extends Estimator[A, C] { 24 | 25 | override def fit(data: RDD[A]): Transformer[A, C] = { 26 | TransformerChain(first, second.fit(first(data))) 27 | } 28 | } 29 | 30 | /** 31 | * A chain of a Transformer followed by a LabelEstimator (as a LabelEstimator) 32 | * @param first 33 | * @param second 34 | */ 35 | case class TransformerLabelEstimatorChain[A, B, C : ClassTag, L](first: Transformer[A, B], second: LabelEstimator[B, C, L]) 36 | extends LabelEstimator[A, C, L] { 37 | 38 | override def fit(data: RDD[A], labels: RDD[L]): Transformer[A, C] = { 39 | TransformerChain(first, second.fit(first(data), labels)) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/DefaultOptimizer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import keystoneml.workflow.AutoCacheRule.GreedyCache 4 | 5 | /** 6 | * The default Pipeline optimizer used when executing pipelines. 
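 * A different optimizer, such as the [[AutoCachingOptimizer]] below, can be installed globally
 * (illustrative):
 * {{{
 *   PipelineEnv.getOrCreate.setOptimizer(new AutoCachingOptimizer())
 * }}}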
7 | */ 8 | object DefaultOptimizer extends Optimizer { 9 | protected val batches: Seq[Batch] = 10 | Batch("Load Saved State", Once, ExtractSaveablePrefixes, SavedStateLoadRule, UnusedBranchRemovalRule) :: 11 | Batch("Common Sub-expression Elimination", FixedPoint(Int.MaxValue), EquivalentNodeMergeRule) :: 12 | Batch("Node Level Optimization", Once, new NodeOptimizationRule) :: 13 | Nil 14 | } 15 | 16 | /** 17 | * Optimizes a Pipeline DAG, with auto-caching 18 | */ 19 | class AutoCachingOptimizer(strategy: AutoCacheRule.CachingStrategy = GreedyCache()) extends Optimizer { 20 | protected val batches: Seq[Batch] = 21 | Batch("Load Saved State", Once, ExtractSaveablePrefixes, SavedStateLoadRule, UnusedBranchRemovalRule) :: 22 | Batch("Common Sub-expression Elimination", FixedPoint(Int.MaxValue), EquivalentNodeMergeRule) :: 23 | Batch("Node Level Optimization", Once, new NodeOptimizationRule) :: 24 | Batch("Auto Cache", Once, new AutoCacheRule(strategy)) :: 25 | Nil 26 | } 27 | 28 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/EquivalentNodeMergeRule.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A rule to merge equivalent nodes in the DAG. 5 | * Nodes are considered equivalent if: 6 | * - The operators stored within the nodes are equal, i.e. `.hashCode()` of both ops is equal AND `.equals()` is true 7 | * (when an operator is a case class, both methods will automatically be generated) 8 | * - They share the same dependencies 9 | * 10 | * This rule also merges prefixes if any of 11 | * the nodes being merged have their prefix attached. 12 | */ 13 | object EquivalentNodeMergeRule extends Rule { 14 | override def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) = { 15 | val nodeSetsToMerge = plan.nodes.groupBy(id => (plan.getOperator(id), plan.getDependencies(id))).values 16 | 17 | if (nodeSetsToMerge.size == plan.nodes.size) { 18 | // no nodes are mergeable 19 | (plan, prefixes) 20 | } else { 21 | nodeSetsToMerge.filter(_.size > 1).foldLeft((plan, prefixes)) { 22 | case ((curPlan, curPrefixes), setToMerge) => { 23 | // Construct a graph that merges all of the nodes 24 | val nodeToKeep = setToMerge.minBy(_.id) 25 | val nextGraph = (setToMerge - nodeToKeep).foldLeft(curPlan) { 26 | case (partialMergedPlan, nodeToMerge) => { 27 | partialMergedPlan 28 | .replaceDependency(nodeToMerge, nodeToKeep) 29 | .removeNode(nodeToMerge) 30 | } 31 | } 32 | 33 | // If any of the nodes being merged have been executed, update the prefixes 34 | val prefix = setToMerge.collectFirst { 35 | case node if curPrefixes.contains(node) => curPrefixes(node) 36 | } 37 | val nextPrefixes = if (prefix.nonEmpty) { 38 | (curPrefixes -- setToMerge) + (nodeToKeep -> prefix.get) 39 | } else { 40 | curPrefixes 41 | } 42 | 43 | (nextGraph, nextPrefixes) 44 | } 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Estimator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * An estimator has a `fit` method which takes an input and emits a [[Transformer]]. 7 | * @tparam A The type of input this estimator (and the resulting Transformer) takes 8 | * @tparam B The output type of the Transformer this estimator produces when being
fit 9 | */ 10 | abstract class Estimator[A, B] extends EstimatorOperator { 11 | /** 12 | * Constructs a pipeline that fits this estimator to training data, 13 | * then applies the resultant transformer to the Pipeline input. 14 | * 15 | * @param data The training data 16 | * @return A pipeline that fits this estimator and applies the result to inputs. 17 | */ 18 | final def withData(data: RDD[A]): Pipeline[A, B] = { 19 | withData(PipelineDataset(data)) 20 | } 21 | 22 | /** 23 | * Constructs a pipeline that fits this estimator to training data, 24 | * then applies the resultant transformer to the Pipeline input. 25 | * 26 | * @param data The training data 27 | * @return A pipeline that fits this estimator and applies the result to inputs. 28 | */ 29 | final def withData(data: PipelineDataset[A]): Pipeline[A, B] = { 30 | // Remove the data sink, 31 | // Then insert this estimator into the graph with the data as the input 32 | val curSink = data.executor.graph.getSinkDependency(data.sink) 33 | val (estGraph, estId) = data.executor.graph.removeSink(data.sink).addNode(this, Seq(curSink)) 34 | 35 | // Now that the estimator is attached to the data, we need to build a pipeline DAG 36 | // that applies the fit output of the estimator. We do this by creating a new Source in the DAG, 37 | val (estGraphWithNewSource, sourceId) = estGraph.addSource() 38 | 39 | // Adding a delegating transformer that depends on the source and the label estimator, 40 | val (almostFinalGraph, delegatingId) = estGraphWithNewSource.addNode(new DelegatingOperator, Seq(estId, sourceId)) 41 | 42 | // And finally adding a sink that connects to the delegating transformer. 43 | val (newGraph, sinkId) = almostFinalGraph.addSink(delegatingId) 44 | 45 | new Pipeline(new GraphExecutor(newGraph), sourceId, sinkId) 46 | } 47 | 48 | /** 49 | * The non-type-safe `fitRDDs` method of [[EstimatorOperator]] that is being overridden by the Estimator API. 50 | */ 51 | final override private[workflow] def fitRDDs(inputs: Seq[DatasetExpression]): TransformerOperator = { 52 | fit(inputs.head.get.asInstanceOf[RDD[A]]) 53 | } 54 | 55 | /** 56 | * The type-safe method that ML developers need to implement when writing new Estimators. 57 | * 58 | * @param data The estimator's training data. 59 | * @return A new transformer 60 | */ 61 | def fit(data: RDD[A]): Transformer[A, B] 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Expression.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * Output is a trait extended by everything that may be output by an [[Operator]]. 7 | * It is intended to add some extra type checking to the internal operator execution. 8 | */ 9 | private[workflow] sealed trait Expression { 10 | def get: Any 11 | } 12 | 13 | /** 14 | * This is an output that wraps around an [[RDD]]. It wraps the RDD as call-by-name, so the RDD 15 | * need not have been computed yet by the time this output is created. 16 | * 17 | * The first time the contained value is accessed using `get`, it will be computed. Every time after 18 | * that it will already be stored, and will not be computed. 19 | */ 20 | private[workflow] class DatasetExpression(compute: => RDD[_]) extends Expression { 21 | lazy override val get: RDD[_] = compute 22 | } 23 | 24 | /** 25 | * This is an output that wraps around a single untyped [[Any]] datum. 
It wraps the datum as call-by-name, 26 | * so it need not have been computed by the time this output is created. 27 | * 28 | * The first time the contained value is accessed using `get`, it will be computed. Every time after 29 | * that it will already be stored, and will not be computed. 30 | */ 31 | private[workflow] class DatumExpression(compute: => Any) extends Expression { 32 | lazy override val get: Any = compute 33 | } 34 | 35 | /** 36 | * This is an output that wraps around a [[TransformerOperator]]. It wraps the transformer as call-by-name, 37 | * so it need not have been computed by the time this output is created. 38 | * 39 | * The first time the contained value is accessed using `get`, it will be computed. Every time after 40 | * that it will already be stored, and will not be computed. 41 | */ 42 | private[workflow] class TransformerExpression(compute: => TransformerOperator) extends Expression { 43 | lazy override val get: TransformerOperator = compute 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/ExtractSaveablePrefixes.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import keystoneml.nodes.util.Cacher 4 | 5 | /** 6 | * Extract the prefixes of all Nodes whose state we want to save for reuse by other Pipeline apply and fit calls. 7 | * This is all nodes that either have a Cacher or an EstimatorOperator as the internal operator. 8 | */ 9 | object ExtractSaveablePrefixes extends Rule { 10 | override def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) = { 11 | val nodesToExtract = plan.operators.collect { 12 | case (node, _: Cacher[_]) => node 13 | case (node, _: EstimatorOperator) => node 14 | } 15 | 16 | val newPrefixes = nodesToExtract.map { 17 | node => (node, Prefix.findPrefix(plan, node)) 18 | }.toMap 19 | 20 | (plan, newPrefixes) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/FittedPipeline.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * This is the result of fitting a [[Pipeline]]. It is logically equivalent to the Pipeline it is produced by, 7 | * but with all Estimators pre-fit, and only containing Transformers in the underlying graph. 8 | * Applying a FittedPipeline to new data does not trigger any new optimization or estimator fitting. 9 | * 10 | * Unlike normal Pipelines, FittedPipelines are serializable and may be written to and from disk. 11 | * 12 | * @param transformerGraph The DAG representing the execution (only contains Transformers) 13 | * @param source The SourceId of the Pipeline 14 | * @param sink The SinkId of the Pipeline 15 | * @tparam A type of the data this FittedPipeline expects as input 16 | * @tparam B type of the data this FittedPipeline outputs 17 | */ 18 | class FittedPipeline[A, B] private[workflow] ( 19 | private[workflow] val transformerGraph: TransformerGraph, 20 | private[workflow] val source: SourceId, 21 | private[workflow] val sink: SinkId 22 | ) extends Chainable[A, B] with Serializable { 23 | 24 | /** 25 | * Converts this FittedPipeline back into a Pipeline. 
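 * Because the graph contains only already-fitted Transformers, the returned Pipeline is built with
 * optimization disabled (`optimize = false`); applying it triggers no re-fitting.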
26 | */ 27 | override def toPipeline: Pipeline[A, B] = new Pipeline( 28 | new GraphExecutor(transformerGraph.toGraph, optimize = false), 29 | source, 30 | sink) 31 | 32 | /** 33 | * The application of this FittedPipeline to a single input item. 34 | * 35 | * @param in The input item to pass into this transformer 36 | * @return The output value 37 | */ 38 | def apply(in: A): B = toPipeline.apply(in).get() 39 | 40 | /** 41 | * The application of this FittedPipeline to an RDD of input items. 42 | * 43 | * @param in The RDD input to pass into this transformer 44 | * @return The RDD output for the given input 45 | */ 46 | def apply(in: RDD[A]): RDD[B] = toPipeline.apply(in).get() 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/GatherTransformerOperator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * A [[TransformerOperator]] that gathers multiple datasets of {@tparam T} into a dataset of Seq[T] 7 | * (Or individual datums of T into a single Seq[T]) 8 | */ 9 | private[workflow] case class GatherTransformerOperator[T]() extends TransformerOperator { 10 | override private[workflow] def singleTransform(inputs: Seq[DatumExpression]): Any = { 11 | inputs.map(_.get.asInstanceOf[T]) 12 | } 13 | 14 | override private[workflow] def batchTransform(inputs: Seq[DatasetExpression]): RDD[_] = { 15 | inputs.map(_.get.asInstanceOf[RDD[T]].map(t => Seq(t))).reduceLeft((x, y) => { 16 | x.zip(y).map(z => z._1 ++ z._2) 17 | }) 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/GraphId.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * This is a unifying type for Node, Source, and Sink ids in the 5 | * internal graph data structure representing workloads. 6 | */ 7 | private[workflow] sealed trait GraphId 8 | 9 | /** 10 | * This represents the id of a Sink in the internal graph data structure. 11 | * @param id The internal value, unique to each id 12 | */ 13 | private[workflow] case class SinkId(id: Long) extends GraphId 14 | 15 | /** 16 | * This is a unifying type for Node and Source ids in the 17 | * internal graph data structure representing workloads. 18 | */ 19 | private[workflow] sealed trait NodeOrSourceId extends GraphId 20 | 21 | /** 22 | * This represents the id of a Node in the internal graph data structure. 23 | * @param id The internal value, unique to each id 24 | */ 25 | private[workflow] case class NodeId(id: Long) extends NodeOrSourceId 26 | 27 | /** 28 | * This represents the id of a Source in the internal graph data structure. 29 | * @param id The internal value, unique to each id 30 | */ 31 | private[workflow] case class SourceId(id: Long) extends NodeOrSourceId 32 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Identity.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | /** 8 | * This transformer performs a no-op on its input. 9 | * 10 | * @tparam T Type of the input and, by definition, output. 
11 | */ 12 | case class Identity[T : ClassTag]() extends Transformer[T,T] { 13 | override def apply(in: T): T = in 14 | override def apply(in: RDD[T]): RDD[T] = in 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/OptimizableNodes.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | sealed trait Optimizable 8 | 9 | /** 10 | * Represents a node-level optimizable transformer and its optimization rules 11 | */ 12 | abstract class OptimizableTransformer[A, B : ClassTag] extends Transformer[A, B] with Optimizable { 13 | val default: Transformer[A, B] 14 | override def apply(a: A): B = { 15 | default.apply(a) 16 | } 17 | override def apply(data: RDD[A]): RDD[B] = { 18 | default.apply(data) 19 | } 20 | 21 | def optimize(sample: RDD[A], numPerPartition: Map[Int, Int]): Transformer[A, B] 22 | } 23 | 24 | /** 25 | * Represents a node-level optimizable Estimator and its optimization rules 26 | */ 27 | abstract class OptimizableEstimator[A, B] extends Estimator[A, B] with Optimizable { 28 | val default: Estimator[A, B] 29 | 30 | // Fit using whatever the default is. 31 | override def fit(data: RDD[A]): Transformer[A, B] = { 32 | default.fit(data) 33 | } 34 | 35 | def optimize(sample: RDD[A], numPerPartition: Map[Int, Int]): Estimator[A, B] 36 | } 37 | 38 | /** 39 | * Represents a node-level optimizable LabelEstimator and its optimization rules 40 | */ 41 | abstract class OptimizableLabelEstimator[A, B, L] extends LabelEstimator[A, B, L] with Optimizable { 42 | val default: LabelEstimator[A, B, L] 43 | 44 | // Fit using whatever the default is. 45 | override def fit(data: RDD[A], labels: RDD[L]): Transformer[A, B] = { 46 | default.fit(data, labels) 47 | } 48 | 49 | def optimize(sample: RDD[A], sampleLabels: RDD[L], numPerPartition: Map[Int, Int]): LabelEstimator[A, B, L] 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/PipelineDataset.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | /** 6 | * This class is a lazy wrapper around the output of a pipeline that was passed an RDD as input. 7 | * 8 | * Under the hood, it extends [[PipelineResult]] and keeps track of the necessary execution plan. 9 | */ 10 | class PipelineDataset[T] private[workflow](executor: GraphExecutor, sink: SinkId) 11 | extends PipelineResult[RDD[T]]( 12 | executor, 13 | sink) 14 | 15 | object PipelineDataset { 16 | private[workflow] def apply[T](rdd: RDD[T]): PipelineDataset[T] = { 17 | val emptyGraph = Graph(Set(), Map(), Map(), Map()) 18 | val (graphWithDataset, nodeId) = emptyGraph.addNode(new DatasetOperator(rdd), Seq()) 19 | val (graph, sinkId) = graphWithDataset.addSink(nodeId) 20 | 21 | new PipelineDataset[T](new GraphExecutor(graph), sinkId) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/PipelineDatum.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * This class is a lazy wrapper around the output of a pipeline that was passed a single datum as input. 
5 | * 6 | * Under the hood, it extends [[PipelineResult]] and keeps track of the necessary execution plan. 7 | */ 8 | class PipelineDatum[T] private[workflow](executor: GraphExecutor, sink: SinkId) 9 | extends PipelineResult[T]( 10 | executor, 11 | sink) 12 | 13 | object PipelineDatum { 14 | private[workflow] def apply[T](datum: T): PipelineDatum[T] = { 15 | val emptyGraph = Graph(Set(), Map(), Map(), Map()) 16 | val (graphWithDataset, nodeId) = emptyGraph.addNode(new DatumOperator(datum), Seq()) 17 | val (graph, sinkId) = graphWithDataset.addSink(nodeId) 18 | 19 | new PipelineDatum[T](new GraphExecutor(graph), sinkId) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/PipelineEnv.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * PipelineEnv is an environment shared by multiple [[Pipeline]]s, containing variables 5 | * such as the Prefix state table and the current Pipeline [[Optimizer]]. 6 | */ 7 | class PipelineEnv { 8 | /** 9 | * This is the global execution state of Pipelines with this environment. 10 | * It is a mutable hashmap of logical prefix to the executed result at that prefix. 11 | * It is not currently thread-safe. 12 | */ 13 | private[workflow] val state: scala.collection.mutable.Map[Prefix, Expression] = scala.collection.mutable.Map() 14 | 15 | /** 16 | * The internally stored optimizer used for all Pipeline execution. Accessible using getter and setter. 17 | */ 18 | private var _optimizer: Optimizer = DefaultOptimizer 19 | 20 | /** 21 | * @return The current optimizer used during Pipeline execution. 22 | */ 23 | def getOptimizer: Optimizer = _optimizer 24 | 25 | /** 26 | * Globally set a new optimizer to use during Pipeline execution. 27 | * 28 | * @param optimizer The new optimizer to use 29 | */ 30 | def setOptimizer(optimizer: Optimizer): Unit = { 31 | _optimizer = optimizer 32 | } 33 | 34 | /** 35 | * Reset this PipelineEnv (clear state and set the Optimizer to the DefaultOptimizer) 36 | */ 37 | private [workflow] def reset(): Unit = { 38 | state.clear() 39 | setOptimizer(DefaultOptimizer) 40 | } 41 | } 42 | 43 | object PipelineEnv { 44 | lazy val getOrCreate: PipelineEnv = new PipelineEnv 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/PipelineResult.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A PipelineResult is a lazy wrapper around the result of applying a [[Pipeline]] to data. 5 | * Internally it contains the Pipeline's execution plan with data sources inserted, 6 | * and the sink that the Pipeline's output is expected to be produced by. 7 | * 8 | * @param executor The Pipeline's underlying execution plan, 9 | * with the Pipeline's sources inserted into the [[Graph]] 10 | * @param sink The Pipeline's sink 11 | * @tparam T The type of the result. 
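 * The result is computed lazily: nothing is executed until `get()` is first called, and the computed
 * value is then memoized so later calls return it without re-execution.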
12 | */ 13 | abstract class PipelineResult[T] private[workflow] ( 14 | private[workflow] val executor: GraphExecutor, 15 | private[workflow] val sink: SinkId 16 | ) { 17 | 18 | private lazy val result: T = executor.execute(sink).get.asInstanceOf[T] 19 | final def get(): T = result 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Prefix.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | 4 | private[workflow] object Prefix { 5 | /** 6 | * Given a graph and a node, output the prefix of the id. 7 | * Will error if provided a node with a source in the dependencies. 8 | * 9 | * @param graph The graph to use 10 | * @param node A node in the graph 11 | * @return The prefix of that id 12 | */ 13 | def findPrefix(graph: Graph, node: NodeId): Prefix = { 14 | val rootOp = graph.getOperator(node) 15 | val deps = graph.getDependencies(node).map { 16 | case dep: NodeId => findPrefix(graph, dep) 17 | case dep: SourceId => 18 | throw new IllegalArgumentException("May not get the prefix of a node with Sources in the dependencies.") 19 | } 20 | 21 | Prefix(rootOp, deps) 22 | } 23 | } 24 | 25 | /** 26 | * This case class represents the logical prefix of a node in a Pipeline. 27 | * @param operator The operator stored at the node 28 | * @param deps The prefixes of the operator's dependencies 29 | */ 30 | private[workflow] case class Prefix(operator: Operator, deps: Seq[Prefix]) 31 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Rule.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * Represents a DAG transformation rule: A transformation from one DAG 5 | * to a differently-executed but logically equivalent DAG. 6 | * 7 | * A rule must also produce execution state for 8 | * the new DAG, logically equivalent to the execution state 9 | * attached to the old DAG. 10 | */ 11 | abstract class Rule { 12 | /** Name for this rule, automatically inferred based on class name. */ 13 | val ruleName: String = { 14 | val className = getClass.getName 15 | if (className endsWith "$") className.dropRight(1) else className 16 | } 17 | 18 | def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/SavedStateLoadRule.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A rule to load any saved state for the [[PipelineEnv.state]] prefix state table 5 | * for nodes we want to consider either loading or saving the results of. 
6 | */ 7 | object SavedStateLoadRule extends Rule { 8 | override def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) = { 9 | val newGraph = prefixes.foldLeft(plan) { 10 | case (curGraph, (node, prefix)) => 11 | PipelineEnv.getOrCreate.state.get(prefix).map { 12 | case expression => 13 | curGraph.setOperator(node, new ExpressionOperator(expression)) 14 | .setDependencies(node, Seq()) 15 | }.getOrElse(curGraph) 16 | } 17 | 18 | (newGraph, prefixes) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/SparkUtilWrapper.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.util 2 | 3 | object SparkUtilWrapper { 4 | def estimateSize(obj: AnyRef): Long = SizeEstimator.estimate(obj) 5 | } -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/Transformer.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | 7 | /** 8 | * Transformers are operators that may be applied both to single input items and to RDDs of input items. 9 | * They may be chained together, along with [[Estimator]]s and [[LabelEstimator]]s, to produce complex 10 | * pipelines. 11 | * 12 | * Transformer extends [[Pipeline]], meaning that its publicly exposed methods for transforming data 13 | * and chaining are implemented there. 14 | * 15 | * @tparam A input item type the transformer takes 16 | * @tparam B output item type the transformer produces 17 | */ 18 | abstract class Transformer[A, B : ClassTag] extends TransformerOperator with Chainable[A, B] { 19 | override def toPipeline: Pipeline[A, B] = new Pipeline( 20 | executor = new GraphExecutor(Graph( 21 | sources = Set(SourceId(0)), 22 | sinkDependencies = Map(SinkId(0) -> NodeId(0)), 23 | operators = Map(NodeId(0) -> this), 24 | dependencies = Map(NodeId(0) -> Seq(SourceId(0))) 25 | )), 26 | source = SourceId(0), 27 | sink = SinkId(0) 28 | ) 29 | 30 | /** 31 | * The application of this Transformer to a single input item. 32 | * This method MUST be overridden by ML developers. 33 | * 34 | * @param in The input item to pass into this transformer 35 | * @return The output value 36 | */ 37 | def apply(in: A): B 38 | 39 | /** 40 | * The application of this Transformer to an RDD of input items. 41 | * This method may optionally be overridden by ML developers. 
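 * The default simply maps `apply` over the RDD; overriding is useful when bulk application can be
 * made cheaper, e.g. (illustrative) by performing expensive setup once per partition via `mapPartitions`.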
42 | * 43 | * @param in The bulk RDD input to pass into this transformer 44 | * @return The bulk RDD output for the given input 45 | */ 46 | def apply(in: RDD[A]): RDD[B] = in.map(apply) 47 | 48 | final override private[workflow] def singleTransform(inputs: Seq[DatumExpression]): Any = { 49 | apply(inputs.head.get.asInstanceOf[A]) 50 | } 51 | 52 | final override private[workflow] def batchTransform(inputs: Seq[DatasetExpression]): RDD[_] = { 53 | apply(inputs.head.get.asInstanceOf[RDD[A]]) 54 | } 55 | } 56 | 57 | object Transformer { 58 | /** 59 | * This constructor takes a function and returns a Transformer that maps it over the input RDD 60 | * 61 | * @param f The function to apply to every item in the RDD being transformed 62 | * @tparam I input type of the transformer 63 | * @tparam O output type of the transformer 64 | * @return Transformer that applies the given function to all items in the RDD 65 | */ 66 | def apply[I, O : ClassTag](f: I => O): Transformer[I, O] = new Transformer[I, O] { 67 | override def apply(in: RDD[I]): RDD[O] = in.map(f) 68 | override def apply(in: I): O = f(in) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/TransformerGraph.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * TransformerGraphs are similar to [[Graph]]s, but unlike normal Graphs they may only contain 5 | * [[TransformerOperator]]s as operators, and as a result are guaranteed to be serializable. 6 | * 7 | * @param sources The set of all [[SourceId]]s of sources in the graph 8 | * @param sinkDependencies A map of [[SinkId]] to the id of the node or source the sink depends on 9 | * @param operators A map of [[NodeId]] to the operator contained within that node 10 | * @param dependencies A map of [[NodeId]] to the node's ordered dependencies 11 | */ 12 | private[workflow] case class TransformerGraph( 13 | sources: Set[SourceId], 14 | sinkDependencies: Map[SinkId, NodeOrSourceId], 15 | operators: Map[NodeId, TransformerOperator], 16 | dependencies: Map[NodeId, Seq[NodeOrSourceId]] 17 | ) { 18 | 19 | /** 20 | * Convert this TransformerGraph into a standard [[Graph]] 21 | */ 22 | private[workflow] def toGraph: Graph = { 23 | Graph( 24 | sources = sources, 25 | sinkDependencies = sinkDependencies, 26 | operators = operators, 27 | dependencies = dependencies) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/UnusedBranchRemovalRule.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A rule to remove all nodes & sources in a graph that don't lead to any sink, 5 | * and are effectively unused. 
6 | */ 7 | object UnusedBranchRemovalRule extends Rule { 8 | override def apply(plan: Graph, prefixes: Map[NodeId, Prefix]): (Graph, Map[NodeId, Prefix]) = { 9 | val ancestorsOfSinks = plan.sinks.foldLeft(Set[GraphId]()) { 10 | case (ancestors, sink) => ancestors ++ AnalysisUtils.getAncestors(plan, sink) 11 | } 12 | 13 | val nodesToRemove = plan.nodes -- ancestorsOfSinks.collect { case node: NodeId => node } 14 | val sourcesToRemove = plan.sources -- ancestorsOfSinks.collect { case source: SourceId => source } 15 | 16 | val afterSourceRemoval = sourcesToRemove.foldLeft(plan) { 17 | case (curPlan, sourceToRemove) => curPlan.removeSource(sourceToRemove) 18 | } 19 | 20 | nodesToRemove.foldLeft((afterSourceRemoval, prefixes)) { 21 | case ((curPlan, curPrefixes), nodeToRemove) => (curPlan.removeNode(nodeToRemove), curPrefixes - nodeToRemove) 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/WeightedNode.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A mix-in that attaches a weight to a node that represents how often it must iterate 5 | * over its input. 6 | */ 7 | trait WeightedNode { 8 | val weight: Int 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/WeightedOperator.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | /** 4 | * A mix-in that attaches a weight to an operator that represents how often it must iterate 5 | * over its input. 6 | */ 7 | trait WeightedOperator { 8 | val weight: Int 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/keystoneml/workflow/WorkflowUtils.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.workflow 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | object WorkflowUtils { 6 | /** 7 | * Return the number of items in each partition in an RDD. 8 | * @param rdd Input RDD. 9 | * @tparam T RDD Type. 10 | * @return A [[Map]] keyed by partition ID containing the number of elements in each partition of the RDD. 11 | */ 12 | def numPerPartition[T](rdd: RDD[T]): Map[Int, Int] = { 13 | rdd.mapPartitionsWithIndex { 14 | case (id, partition) => Iterator.single((id, partition.length)) 15 | }.collect().toMap 16 | } 17 | 18 | } -------------------------------------------------------------------------------- /src/test/python/images/pyconv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import csv 5 | import numpy as np 6 | from scipy.misc import imread, imsave 7 | from scipy.signal import convolve 8 | 9 | #This script convolves an image in python and its output is used in 10 | #the Convolver unit tests to ensure that convolver output matches 11 | #an equivalent python call. 
12 | 13 | #This script was run from src/test/resources/images/ as: 14 | #python pyconv.py gantrycrane.png convolved.gantrycrane.png convolved.gantrycrane.csv 15 | 16 | 17 | def main(): 18 | x = imread(sys.argv[1]) 19 | k1 = np.array([i for i in range(27)]).reshape((3,3,3)) 20 | out = np.sum(convolve(x, k1, mode='valid'), 2) 21 | imsave(sys.argv[2], out) 22 | cwriter = csv.writer(open(sys.argv[3], 'w')) 23 | for x in range(out.shape[0]): 24 | for y in range(out.shape[1]): 25 | cwriter.writerow([x,y,out[x,y]]) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /src/test/resources/aMat-1class.csv: -------------------------------------------------------------------------------- 1 | 0.10266850507085126,0.4499763204326901,-0.15374850502641021,-0.015879756324382748,0.4437926700329148,0.7808071690334957,-0.08218768514428863,0.48140007039716,-0.019712057549647364,0.2009836160928337,-0.8000566853661935,0.12167303371323324 2 | 0.44732263301010217,0.9951414121993158,0.8130665381040776,-1.183012821913078,-0.7081795326278753,-0.2365018666630304,1.1966589648301693,-1.2916743784290192,-0.09425629499384529,-0.9651145207437652,-0.8953331802899065,-0.9220777634896545 3 | -0.7623817369690132,0.9257676421568312,-1.4667522264035207,0.05272020922346383,1.2149725887284197,-0.8779025816833662,-0.762795288627363,0.39898952926221504,0.40825734564162786,3.103511435086207,1.5310257139379873,-0.6868105045330928 4 | -0.5008969913101462,0.4532396861574774,-0.29393358849474976,0.5592102787356051,0.6916956616970765,-1.3004633365428844,2.019373540599413,0.3652134453707413,1.910512585516455,2.751731295807471,1.059249138315071,0.10725052982484896 5 | -0.3530558373493292,1.0070284676996972,0.31828544648906393,-0.41233492717046566,0.45555494507753697,-1.7027192789791656,-2.405329542540906,-0.4703247395781227,-0.6821969614843767,-1.065966277390593,-0.8263294641770074,0.1788389733691391 6 | -------------------------------------------------------------------------------- /src/test/resources/bMat-1class.csv: -------------------------------------------------------------------------------- 1 | 1.0,-1.0,-1.0 2 | 1.0,-1.0,-1.0 3 | 1.0,-1.0,-1.0 4 | 1.0,-1.0,-1.0 5 | 1.0,-1.0,-1.0 6 | -------------------------------------------------------------------------------- /src/test/resources/bMat.csv: -------------------------------------------------------------------------------- 1 | 1.0,-1.0,-1.0 2 | 1.0,-1.0,-1.0 3 | 1.0,-1.0,-1.0 4 | 1.0,-1.0,-1.0 5 | 1.0,-1.0,-1.0 6 | -1.0,1.0,-1.0 7 | -1.0,1.0,-1.0 8 | -1.0,1.0,-1.0 9 | -1.0,1.0,-1.0 10 | -1.0,1.0,-1.0 11 | -1.0,-1.0,1.0 12 | -1.0,-1.0,1.0 13 | -1.0,-1.0,1.0 14 | -1.0,-1.0,1.0 15 | -1.0,-1.0,1.0 16 | -------------------------------------------------------------------------------- /src/test/resources/bMatShuffled.csv: -------------------------------------------------------------------------------- 1 | 1.0,-1.0,-1.0 2 | 1.0,-1.0,-1.0 3 | 1.0,-1.0,-1.0 4 | -1.0,1.0,-1.0 5 | -1.0,1.0,-1.0 6 | -1.0,1.0,-1.0 7 | -1.0,1.0,-1.0 8 | -1.0,1.0,-1.0 9 | -1.0,-1.0,1.0 10 | -1.0,-1.0,1.0 11 | -1.0,-1.0,1.0 12 | -1.0,-1.0,1.0 13 | -1.0,-1.0,1.0 14 | 1.0,-1.0,-1.0 15 | 1.0,-1.0,-1.0 16 | -------------------------------------------------------------------------------- /src/test/resources/images/000012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/000012.jpg 
-------------------------------------------------------------------------------- /src/test/resources/images/convolved.gantrycrane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/convolved.gantrycrane.png -------------------------------------------------------------------------------- /src/test/resources/images/gantrycrane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/gantrycrane.png -------------------------------------------------------------------------------- /src/test/resources/images/imagenet-test-labels: -------------------------------------------------------------------------------- 1 | n15075141 12 2 | -------------------------------------------------------------------------------- /src/test/resources/images/imagenet/n15075141.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/imagenet/n15075141.tar -------------------------------------------------------------------------------- /src/test/resources/images/voc/voctest.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/keystone/74e2fb5efaff55675603508bd0c479bb8875901f/src/test/resources/images/voc/voctest.tar -------------------------------------------------------------------------------- /src/test/scala/keystoneml/evaluation/BinaryClassifierEvaluatorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.evaluation 2 | 3 | import org.apache.spark.SparkContext 4 | import org.scalatest.FunSuite 5 | import keystoneml.utils.Stats 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class BinaryClassifierEvaluatorSuite extends FunSuite with PipelineContext { 9 | test("Multiclass keystoneml.evaluation metrics") { 10 | /* 11 | * Contingency table for binary classification with total 12 instances: 12 | * |6|2| true label: positive 13 | * |1|3| true label: negative 14 | */ 15 | sc = new SparkContext("local", "test") 16 | 17 | val predictionAndLabels = sc.parallelize( Seq.fill(6)((true, true)) ++ Seq.fill(2)((false, true)) 18 | ++ Seq.fill(1)((true, false)) ++ Seq.fill(3)((false, false)), 2) 19 | val metrics = BinaryClassifierEvaluator.evaluate(predictionAndLabels.map(_._1), predictionAndLabels.map(_._2)) 20 | 21 | assert(metrics.tp === 6) 22 | assert(metrics.fp === 1) 23 | assert(metrics.tn === 3) 24 | assert(metrics.fn === 2) 25 | 26 | assert(Stats.aboutEq(metrics.precision, 6.0/7.0)) 27 | assert(Stats.aboutEq(metrics.recall, 6.0/8.0)) 28 | assert(Stats.aboutEq(metrics.accuracy, 9.0/12.0)) 29 | assert(Stats.aboutEq(metrics.specificity, 3.0/4.0)) 30 | assert(Stats.aboutEq(metrics.fScore(), 2.0 * 6.0 / (2.0 * 6.0 + 2.0 + 1.0))) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/evaluation/MeanAveragePrecisionSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.evaluation 2 | 3 | import breeze.linalg.DenseVector 4 | import org.scalatest.FunSuite 5 | import org.apache.spark.SparkContext 6 | import 
keystoneml.utils.Stats 7 | import keystoneml.workflow.PipelineContext 8 | 9 | class MeanAveragePrecisionSuite extends FunSuite with PipelineContext { 10 | 11 | test("random map test") { 12 | sc = new SparkContext("local", "test") 13 | 14 | // Build some random test data with 4 classes 0,1,2,3 15 | val actual = List(Array(0, 3), Array(2), Array(1, 2), Array(0)) 16 | val actualRdd = sc.parallelize(actual) 17 | 18 | val predicted = List( 19 | DenseVector(0.1, -0.05, 0.12, 0.5), 20 | DenseVector(-0.23, -0.45, 0.23, 0.1), 21 | DenseVector(-0.34, -0.32, -0.66, 1.52), 22 | DenseVector(-0.1, -0.2, 0.5, 0.8)) 23 | 24 | val predictedRdd = sc.parallelize(predicted) 25 | 26 | val map = new MeanAveragePrecisionEvaluator(4).evaluate(predictedRdd, actualRdd) 27 | 28 | // Expected values from running this in MATLAB 29 | val expected = DenseVector(1.0, 0.3333, 0.5, 0.3333) 30 | 31 | assert(Stats.aboutEq(map, expected, 1e-4)) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/loaders/ImageNetLoaderSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.scalatest.FunSuite 4 | import org.apache.spark.SparkContext 5 | import keystoneml.utils.TestUtils 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class ImageNetLoaderSuite extends FunSuite with PipelineContext { 9 | test("load a sample of imagenet data") { 10 | sc = new SparkContext("local", "test") 11 | val dataPath = TestUtils.getTestResourceFileName("images/imagenet") 12 | val labelsPath = TestUtils.getTestResourceFileName("images/imagenet-test-labels") 13 | 14 | val imgs = ImageNetLoader.apply(sc, dataPath, labelsPath).collect() 15 | // We should have 5 images 16 | assert(imgs.length === 5) 17 | 18 | // The images should all have label 12 19 | assert(imgs.map(_.label).distinct.length === 1) 20 | assert(imgs.map(_.label).distinct.head === 12) 21 | 22 | // The image filenames should begin with n15075141 23 | assert(imgs.forall(_.filename.get.startsWith("n15075141")), "Image filenames should be correct") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/loaders/VOCLoaderSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.loaders 2 | 3 | import org.scalatest.FunSuite 4 | import org.apache.spark.SparkContext 5 | import keystoneml.utils.TestUtils 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class VOCLoaderSuite extends FunSuite with PipelineContext { 9 | test("load a sample of VOC data") { 10 | sc = new SparkContext("local", "test") 11 | val dataPath = TestUtils.getTestResourceFileName("images/voc") 12 | val labelsPath = TestUtils.getTestResourceFileName("images/voclabels.csv") 13 | 14 | val imgs = VOCLoader(sc, 15 | VOCDataPath(dataPath, "VOCdevkit/VOC2007/JPEGImages/", Some(1)), 16 | VOCLabelPath(labelsPath)).collect() 17 | 18 | // We should have 10 images 19 | assert(imgs.length === 10) 20 | 21 | // There should be one file whose name ends with "000104.jpg" 22 | val personMonitor = imgs.filter(_.filename.get.endsWith("000104.jpg")) 23 | assert(personMonitor.length === 1) 24 | 25 | // It should have two labels, 14 and 19. 26 | assert(personMonitor(0).label.contains(14) && personMonitor(0).label.contains(19)) 27 | 28 | // There should be 13 labels in total, with 9 distinct values.
29 | assert(imgs.map(_.label).flatten.length === 13) 30 | assert(imgs.map(_.label).flatten.distinct.length === 9) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/CenterCornerPatcherSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.utils.{ChannelMajorArrayVectorizedImage, ImageMetadata, TestUtils} 6 | 7 | class CenterCornerPatcherSuite extends FunSuite with Logging { 8 | 9 | test("check number and dimension of patches") { 10 | val image = TestUtils.loadTestImage("images/000012.jpg") 11 | val xDim = image.metadata.xDim 12 | val yDim = image.metadata.yDim 13 | val patchSizeX = xDim / 2 14 | val patchSizeY = yDim / 2 15 | 16 | val withFlipPatcher = CenterCornerPatcher(patchSizeX, patchSizeY, true) 17 | val withFlipPatches = withFlipPatcher.centerCornerPatchImage(image).toSeq 18 | 19 | assert(withFlipPatches.map(_.metadata.xDim).forall(_ == patchSizeX) && 20 | withFlipPatches.map(_.metadata.yDim).forall(_ == patchSizeY) && 21 | withFlipPatches.map(_.metadata.numChannels).forall(_ == image.metadata.numChannels), 22 | "All patches must have right dimensions") 23 | 24 | assert(withFlipPatches.size === 10, "Number of patches must match") 25 | 26 | val noFlipPatcher = CenterCornerPatcher(patchSizeX, patchSizeY, false) 27 | val noFlipPatches = noFlipPatcher.centerCornerPatchImage(image).toSeq 28 | 29 | assert(noFlipPatches.map(_.metadata.xDim).forall(_ == patchSizeX) && 30 | noFlipPatches.map(_.metadata.yDim).forall(_ == patchSizeY) && 31 | noFlipPatches.map(_.metadata.numChannels).forall(_ == image.metadata.numChannels), 32 | "All patches must have right dimensions") 33 | 34 | assert(noFlipPatches.size === 5, "Number of patches must match") 35 | } 36 | 37 | test("1x1 image patches") { 38 | val imgArr = 39 | (0 until 5).flatMap { x => 40 | (0 until 5).flatMap { y => 41 | (0 until 1).map { c => 42 | (c + x * 1 + y * 5 * 1).toDouble 43 | } 44 | } 45 | }.toArray 46 | 47 | val image = new ChannelMajorArrayVectorizedImage(imgArr, ImageMetadata(5, 5, 1)) 48 | val patchSizeX = 1 49 | val patchSizeY = 1 50 | 51 | val noFlipPatcher = CenterCornerPatcher(patchSizeX, patchSizeY, false) 52 | val noFlipPatches = noFlipPatcher.centerCornerPatchImage(image).toSeq 53 | 54 | assert(noFlipPatches.length === 5) 55 | // NOTE(shivaram): This assumes order of patches returned stays the same. 
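// For 1x1 patches on the 5x5 single-channel image above (pixel value = x + 5*y), the five patches should be the four corner pixels and the center pixel, i.e. the values 0, 20, 4, 24 (corners) and 12 (center) checked below.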
56 | assert(noFlipPatches(0).get(0, 0, 0) === 0.0) 57 | assert(noFlipPatches(1).get(0, 0, 0) === 20.0) 58 | assert(noFlipPatches(2).get(0, 0, 0) === 4.0) 59 | assert(noFlipPatches(3).get(0, 0, 0) === 24.0) 60 | assert(noFlipPatches(4).get(0, 0, 0) === 12.0) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/DaisyExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg._ 4 | import keystoneml.nodes.images.external.SIFTExtractor 5 | import org.scalatest.FunSuite 6 | 7 | import keystoneml.pipelines.Logging 8 | import keystoneml.utils.{ImageUtils, Stats, TestUtils} 9 | 10 | class DaisyExtractorSuite extends FunSuite with Logging { 11 | test("Load an Image and compute Daisy Features") { 12 | val testImage = TestUtils.loadTestImage("images/gantrycrane.png") 13 | val grayImage = ImageUtils.toGrayScale(testImage) 14 | 15 | val df = new DaisyExtractor() 16 | val daisyDescriptors = convert(df.apply(grayImage), Double) 17 | 18 | val firstKeyPointSum = sum(daisyDescriptors(::, 0)) 19 | val fullFeatureSum = sum(daisyDescriptors) 20 | 21 | // Values found from running matlab code on same input file. 22 | val matlabFirstKeyPointSum = 55.127217737738533 23 | val matlabFullFeatureSum = 3.240635661296463E5 24 | 25 | // TODO: This should be at most 1e-8 as we are using Floats. But its 1e-5, 1e-7 right now ? 26 | assert(Stats.aboutEq( 27 | (firstKeyPointSum - matlabFirstKeyPointSum)/matlabFirstKeyPointSum, 0, 1e-5), 28 | "First keypoint sum must match for Daisy") 29 | assert(Stats.aboutEq((fullFeatureSum - matlabFullFeatureSum)/matlabFullFeatureSum, 0, 1e-7), 30 | "Sum of Daisys must match expected sum") 31 | } 32 | 33 | test("Daisy and SIFT extractors should have same row/column ordering.") { 34 | val testImage = TestUtils.loadTestImage("images/gantrycrane.png") 35 | val grayImage = ImageUtils.toGrayScale(testImage) 36 | 37 | val df = new DaisyExtractor() 38 | val daisyDescriptors = convert(df.apply(grayImage), Double) 39 | 40 | val se = SIFTExtractor(scaleStep = 2) 41 | val siftDescriptors = se.apply(grayImage) 42 | 43 | assert(daisyDescriptors.rows == df.daisyFeatureSize && siftDescriptors.rows == se.descriptorSize) 44 | 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/HogExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg._ 4 | import org.scalatest.FunSuite 5 | 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.utils.{ImageUtils, Stats, TestUtils} 8 | 9 | class HogExtractorSuite extends FunSuite with Logging { 10 | test("Load an Image and compute Hog Features") { 11 | val testImage = TestUtils.loadTestImage("images/gantrycrane.png") 12 | 13 | // NOTE: The MATLAB implementation from voc-release5 uses 14 | // images in double range -- So convert our image by rescaling 15 | val testImageScaled = ImageUtils.mapPixels(testImage, x => x/255.0) 16 | 17 | val binSize = 50 18 | val hog = new HogExtractor(binSize) 19 | val descriptors = hog.apply(testImageScaled) 20 | 21 | val ourSum = sum(descriptors) 22 | val matlabSum = 59.2162514 23 | 24 | assert(Stats.aboutEq((ourSum - matlabSum) / ourSum, 0, 1e-8), 25 | "Hog features sum should match") 26 | 27 | // With a smaller bin size 28 | val hog1 = new 
HogExtractor(binSize=8) 29 | val descriptors1 = hog1.apply(testImageScaled) 30 | 31 | val matlabSum1 = 4.5775269e+03 32 | val ourSum1 = sum(descriptors1) 33 | 34 | // TODO: Figure out why error is a bit higher here ? 35 | assert(Stats.aboutEq((ourSum1 - matlabSum1) / ourSum1, 0, 1e-4), 36 | "Hog features sum should match") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/LCSExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg._ 4 | import org.scalatest.FunSuite 5 | 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.utils.{ImageUtils, Stats, TestUtils} 8 | 9 | class LCSExtractorSuite extends FunSuite with Logging { 10 | test("Load an Image and compute LCS Features") { 11 | val testImage = TestUtils.loadTestImage("images/gantrycrane.png") 12 | 13 | val lf = new LCSExtractor(stride=4, subPatchSize=6, strideStart=16) 14 | val lcsDescriptors = convert(lf.apply(testImage), Double) 15 | 16 | val firstKeyPointSum = sum(lcsDescriptors(::, 0)) 17 | val fullFeatureSum = sum(lcsDescriptors) 18 | 19 | // Values found from running matlab code on same input file. 20 | val matlabFirstKeyPointSum = 3.786557667540610e+03 21 | val matlabFullFeatureSum = 3.171963632855949e+07 22 | 23 | assert( 24 | Stats.aboutEq((firstKeyPointSum - matlabFirstKeyPointSum)/matlabFirstKeyPointSum, 0, 1e-8), 25 | "First keypoint sum must match for LCS") 26 | assert(Stats.aboutEq((fullFeatureSum - matlabFullFeatureSum)/matlabFullFeatureSum, 0, 1e-8), 27 | "Sum of LCS must match expected sum") 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/PoolingSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import breeze.linalg.{DenseVector, sum} 4 | import keystoneml.nodes._ 5 | import org.scalatest.FunSuite 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.utils.{ChannelMajorArrayVectorizedImage, ImageMetadata} 8 | 9 | class PoolingSuite extends FunSuite with Logging { 10 | 11 | test("pooling") { 12 | val imgArr = 13 | (0 until 4).flatMap { x => 14 | (0 until 4).flatMap { y => 15 | (0 until 1).map { c => 16 | (c + x * 1 + y * 4 * 1).toDouble 17 | } 18 | } 19 | }.toArray 20 | 21 | val image = new ChannelMajorArrayVectorizedImage(imgArr, ImageMetadata(4, 4, 1)) 22 | val pooling = new Pooler(2, 2, x => x, x => x.max) 23 | 24 | val poolImage = pooling(image) 25 | 26 | assert(poolImage.get(0, 0, 0) === 5.0) 27 | assert(poolImage.get(0, 1, 0) === 7.0) 28 | assert(poolImage.get(1, 0, 0) === 13.0) 29 | assert(poolImage.get(1, 1, 0) === 15.0) 30 | } 31 | 32 | test("pooling odd") { 33 | val hogImgSize = 14 34 | val convSizes = List(1, 2, 3, 4, 6, 8) 35 | convSizes.foreach { convSize => 36 | val convResSize = hogImgSize - convSize + 1 37 | 38 | val imgArr = 39 | (0 until convResSize).flatMap { x => 40 | (0 until convResSize).flatMap { y => 41 | (0 until 1000).map { c => 42 | (c + x * 1 + y * 4 * 1).toDouble 43 | } 44 | } 45 | }.toArray 46 | 47 | val image = new ChannelMajorArrayVectorizedImage( 48 | imgArr, ImageMetadata(convResSize, convResSize, 1000)) 49 | 50 | val poolSizeReqd = math.ceil(convResSize / 2.0).toInt 51 | 52 | // We want poolSize to be even !! 
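// e.g. poolSizeReqd = 7 gives poolSize = ceil(7 / 2.0) * 2 = 8, while poolSizeReqd = 6 stays at 6.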
53 | val poolSize = (math.ceil(poolSizeReqd / 2.0) * 2).toInt 54 | // overlap as little as possible 55 | val poolStride = convResSize - poolSize 56 | 57 | 58 | println(s"VALUES: $convSize $convResSize $poolSizeReqd $poolSize $poolStride") 59 | 60 | def summ(x: DenseVector[Double]): Double = sum(x) 61 | 62 | val pooling = new Pooler(poolStride, poolSize, identity, summ) 63 | val poolImage = pooling(image) 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/RandomPatcherSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.utils.{ChannelMajorArrayVectorizedImage, ImageMetadata, TestUtils} 6 | 7 | class RandomPatcherSuite extends FunSuite with Logging { 8 | 9 | test("patch dimensions, number") { 10 | val image = TestUtils.loadTestImage("images/000012.jpg") 11 | val xDim = image.metadata.xDim 12 | val yDim = image.metadata.yDim 13 | val patchSizeX = xDim / 2 14 | val patchSizeY = yDim / 2 15 | val numPatches = 5 16 | 17 | val patcher = RandomPatcher(numPatches, patchSizeX, patchSizeY) 18 | 19 | val patches = patcher.randomPatchImage(image).toSeq 20 | 21 | assert(patches.map(_.metadata.xDim).forall(_ == patchSizeX) && 22 | patches.map(_.metadata.yDim).forall(_ == patchSizeY) && 23 | patches.map(_.metadata.numChannels).forall(_ == image.metadata.numChannels), 24 | "All patches must have right dimensions") 25 | 26 | assert(patches.size === numPatches, 27 | "Number of patches must match argument passed in") 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/images/SIFTExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.images.external 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.pipelines.Logging 5 | import keystoneml.utils.{ImageUtils, TestUtils} 6 | 7 | class SIFTExtractorSuite extends FunSuite with Logging { 8 | test("Test Sift on a single image RDD, scaleStep=1 and scaleStep=0, 0 should have more descriptors") { 9 | val testImage = TestUtils.loadTestImage("images/000012.jpg") 10 | val singleImage = ImageUtils.mapPixels(testImage, _/255.0) 11 | val grayImage = ImageUtils.toGrayScale(singleImage) 12 | 13 | val se1 = SIFTExtractor(scaleStep = 1) 14 | val res1 = se1(grayImage) 15 | 16 | val se0 = SIFTExtractor(scaleStep = 0) 17 | val res0 = se0(grayImage) 18 | 19 | logInfo(s"Scale 1 shape is: ${res1.rows}x${res1.cols}") 20 | logInfo(s"Scale 0 shape is: ${res0.rows}x${res0.cols}") 21 | 22 | assert(res1.cols < res0.cols) 23 | 24 | } 25 | 26 | } -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/learning/BlockLinearMapperSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg.{DenseVector, DenseMatrix} 4 | import breeze.stats.distributions.Rand 5 | import keystoneml.workflow.PipelineContext 6 | import scala.collection.mutable.ArrayBuffer 7 | 8 | import org.scalatest.FunSuite 9 | 10 | import org.apache.spark.SparkContext 11 | import org.apache.spark.rdd.RDD 12 | 13 | import keystoneml.pipelines._ 14 | import keystoneml.utils.Stats 15 | 16 | class BlockLinearMapperSuite extends FunSuite with 
PipelineContext with Logging { 17 | 18 | test("BlockLinearMapper transformation") { 19 | sc = new SparkContext("local", "test") 20 | 21 | val inDims = 1000 22 | val outDims = 100 23 | val numChunks = 5 24 | val numPerChunk = inDims/numChunks 25 | 26 | val mat = DenseMatrix.rand(inDims, outDims, Rand.gaussian) 27 | val vec = DenseVector.rand(inDims, Rand.gaussian) 28 | val intercept = DenseVector.rand(outDims, Rand.gaussian) 29 | 30 | val splitVec = (0 until numChunks).map(i => vec((numPerChunk*i) until (numPerChunk*i + numPerChunk))) 31 | val splitMat = (0 until numChunks).map(i => mat((numPerChunk*i) until (numPerChunk*i + numPerChunk), ::)) 32 | 33 | val linearMapper = new LinearMapper[DenseVector[Double]](mat, Some(intercept)) 34 | val blockLinearMapper = new BlockLinearMapper(splitMat, numPerChunk, Some(intercept)) 35 | 36 | val linearOut = linearMapper(vec) 37 | 38 | // Test with intercept 39 | assert(Stats.aboutEq(blockLinearMapper(vec), linearOut, 1e-4)) 40 | 41 | // Test the apply and evaluate call 42 | val blmOuts = new ArrayBuffer[RDD[DenseVector[Double]]] 43 | val splitVecRDDs = splitVec.map { vec => 44 | sc.parallelize(Seq(vec), 1) 45 | } 46 | blockLinearMapper.applyAndEvaluate(splitVecRDDs, 47 | (predictedValues: RDD[DenseVector[Double]]) => { 48 | blmOuts += predictedValues 49 | () 50 | } 51 | ) 52 | 53 | // The last blmOut should match the linear mapper's output 54 | assert(Stats.aboutEq(blmOuts.last.collect()(0), linearOut, 1e-4)) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/learning/KernelModelSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | 5 | import org.apache.spark.SparkContext 6 | import org.scalatest.FunSuite 7 | 8 | import keystoneml.workflow.PipelineContext 9 | import keystoneml.utils.{MatrixUtils, Stats} 10 | 11 | class KernelModelSuite extends FunSuite with PipelineContext { 12 | 13 | test("KernelModel XOR test") { 14 | sc = new SparkContext("local", "test") 15 | 16 | val x = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0),DenseVector(1.0, -1.0)) 17 | val xTest = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0)) 18 | val y = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0), DenseVector(1.0, 0.0)) 19 | val yTest = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0)) 20 | 21 | val xRDD = sc.parallelize(x, 2) 22 | val yRDD = sc.parallelize(y, 2) 23 | val xTestRDD = sc.parallelize(xTest, 2) 24 | 25 | val gaussian = new GaussianKernelGenerator(10) 26 | // Set block size to number of data points so no blocking happens 27 | val clf = new KernelRidgeRegression(gaussian, 0, 4, 2) 28 | 29 | val kernelModel = clf.fit(xRDD, yRDD) 30 | val yHat = kernelModel(xTestRDD).collect() 31 | // Fit should be good 32 | val delta = MatrixUtils.rowsToMatrix(yHat) - MatrixUtils.rowsToMatrix(yTest) 33 | 34 | delta :*= delta 35 | println("SUM OF DELTA1 " + sum(delta)) 36 | assert(Stats.aboutEq(sum(delta), 0, 1e-4)) 37 | } 38 | 39 | test("KernelModel XOR blocked test") { 40 | sc = new SparkContext("local", "test") 41 | 42 | val x = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0),DenseVector(1.0, -1.0)) 43 | val xTest = Array(DenseVector(-1.0, -1.0), DenseVector(1.0, 1.0), DenseVector(-1.0, 1.0)) 44 | val y = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), 
DenseVector(1.0, 0.0), DenseVector(1.0, 0.0)) 45 | val yTest = Array(DenseVector(0.0, 1.0), DenseVector(0.0, 1.0), DenseVector(1.0, 0.0)) 46 | 47 | val xRDD = sc.parallelize(x, 2) 48 | val yRDD = sc.parallelize(y, 2) 49 | val xTestRDD = sc.parallelize(xTest, 2) 50 | 51 | val gaussian = new GaussianKernelGenerator(10) 52 | 53 | // Set block size to half number of data points so blocking happens 54 | val clf = new KernelRidgeRegression(gaussian, 0, 2, 2) 55 | 56 | val kernelModel = clf.fit(xRDD, yRDD) 57 | val yHat = kernelModel(xTestRDD).collect() 58 | // Fit should be good 59 | val delta = MatrixUtils.rowsToMatrix(yHat) - MatrixUtils.rowsToMatrix(yTest) 60 | 61 | delta :*= delta 62 | assert(Stats.aboutEq(sum(delta), 0, 1e-4)) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/learning/ZCAWhiteningSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.learning 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import breeze.stats.distributions._ 6 | import org.scalatest.FunSuite 7 | import keystoneml.pipelines._ 8 | import keystoneml.workflow.PipelineContext 9 | 10 | class ZCAWhiteningSuite extends FunSuite with PipelineContext with Logging { 11 | 12 | val nrows = 10000 13 | val ndim = 10 14 | 15 | val x = DenseMatrix.rand[Double](nrows, ndim, Gaussian(0.0, 1.0)) 16 | 17 | def fitAndCompare(x: DenseMatrix[Double], eps: Double, thresh: Double): Boolean = { 18 | val whitener = new ZCAWhitenerEstimator(eps).fitSingle(x) 19 | 20 | val wx = whitener(x) 21 | 22 | //Checks max(max(abs(cov(whiten(x))) - eye(10)) < sqrt(eps) 23 | max(abs(cov(convert(wx, Double)) - DenseMatrix.eye[Double](ndim))) < thresh 24 | } 25 | 26 | test("whitening with small epsilon") { 27 | assert(fitAndCompare(x, 1e-12, 1e-4), 28 | "Whitening the base matrix should produce unit variance and zero covariance.") 29 | } 30 | 31 | test("whitening with large epsilon") { 32 | assert(fitAndCompare(x, 0.1, 0.1), 33 | "Whitening the base matrix should produce unit variance and zero covariance.") 34 | 35 | assert(!fitAndCompare(x, 0.1, 1e-4), 36 | "Whitening the base matrix with a large epsilon should be somewhat noisy.") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/misc/SparseFeatureVectorizerSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.misc 2 | 3 | import keystoneml.nodes.util.{SparseFeatureVectorizer, AllSparseFeatures, CommonSparseFeatures} 4 | import org.apache.spark.SparkContext 5 | import org.scalatest.FunSuite 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.workflow.PipelineContext 8 | 9 | class SparseFeatureVectorizerSuite extends FunSuite with PipelineContext with Logging { 10 | test("sparse feature vectorization") { 11 | sc = new SparkContext("local", "test") 12 | 13 | val featureVectorizer = new SparseFeatureVectorizer(Map("First" -> 0, "Second" -> 1, "Third" -> 2)) 14 | val test = Seq(("Third", 4.0), ("Fourth", 6.0), ("First", 1.0)) 15 | val vector = featureVectorizer.apply(sc.parallelize(Seq(test))).first() 16 | 17 | assert(vector.size == 3) 18 | assert(vector(0) == 1) 19 | assert(vector(1) == 0) 20 | assert(vector(2) == 4) 21 | } 22 | 23 | test("all sparse feature selection") { 24 | sc = new SparkContext("local", "test") 25 | val train = sc.parallelize(List(Seq(("First", 0.0), 
("Second", 6.0)), Seq(("Third", 3.0), ("Second", 4.0)))) 26 | 27 | val featureVectorizer = AllSparseFeatures().fit(train.map(x => x)) 28 | // The selected features should now be "First", "Second", and "Third" 29 | 30 | val test = Seq(("Third", 4.0), ("Fourth", 6.0), ("First", 1.0)) 31 | val out = featureVectorizer.apply(sc.parallelize(Seq(test))).first().toArray 32 | 33 | assert(out === Array(1.0, 0.0, 4.0)) 34 | } 35 | 36 | test("common sparse feature selection") { 37 | sc = new SparkContext("local", "test") 38 | val train = sc.parallelize(List( 39 | Seq(("First", 0.0), ("Second", 6.0)), 40 | Seq(("Third", 3.0), ("Second", 4.8)), 41 | Seq(("Third", 7.0), ("Fourth", 5.0)), 42 | Seq(("Fifth", 5.0), ("Second", 7.3)) 43 | )) 44 | 45 | val featureVectorizer = CommonSparseFeatures(2).fit(train.map(x => x)) 46 | // The selected features should now be "Second", and "Third" 47 | 48 | val test = Seq(("Third", 4.0), ("Seventh", 8.0), ("Second", 1.3), ("Fourth", 6.0), ("First", 1.0)) 49 | val out = featureVectorizer.apply(sc.parallelize(Seq(test))).first().toArray 50 | 51 | assert(out === Array(1.3, 4.0)) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/misc/TermFrequencySuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.misc 2 | 3 | import keystoneml.nodes.stats.TermFrequency 4 | import org.apache.spark.SparkContext 5 | import org.scalatest.FunSuite 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class TermFrequencySuite extends FunSuite with PipelineContext { 9 | test("term frequency of simple strings") { 10 | sc = new SparkContext("local", "test") 11 | val in = Seq(Seq[Any]("b", "a", "c", "b", "b", "a", "b")) 12 | val out = TermFrequency().apply(sc.parallelize(in)).first().toMap 13 | assert(out === Map("a" -> 2, "b" -> 4, "c" -> 1)) 14 | } 15 | 16 | test("term frequency of varying types") { 17 | sc = new SparkContext("local", "test") 18 | val in = Seq(Seq("b", "a", "c", ("b", "b"), ("b", "b"), 12, 12, "a", "b", 12)) 19 | val out = TermFrequency().apply(sc.parallelize(in)).first().toMap 20 | assert(out === Map("a" -> 2, "b" -> 2, "c" -> 1, ("b", "b") -> 2, 12 -> 3)) 21 | } 22 | 23 | test("log term frequency") { 24 | sc = new SparkContext("local", "test") 25 | val in = Seq(Seq[Any]("b", "a", "c", "b", "b", "a", "b")) 26 | val out = TermFrequency(x => math.log(x + 1)).apply(sc.parallelize(in)).first().toMap 27 | assert(out === Map("a" -> math.log(3), "b" -> math.log(5), "c" -> math.log(2))) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/CoreNLPFeatureExtractorSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.apache.spark.SparkContext 4 | import org.scalatest.FunSuite 5 | import keystoneml.pipelines.Logging 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class CoreNLPFeatureExtractorSuite extends FunSuite with PipelineContext with Logging { 9 | test("lemmatization") { 10 | sc = new SparkContext("local", "test") 11 | 12 | val text = "jumping snakes lakes oceans hunted" 13 | val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet 14 | 15 | // Make sure at least very simple cases were lemmatized 16 | assert(tokens.contains("jump")) 17 | assert(tokens.contains("snake")) 18 | assert(tokens.contains("lake")) 19 | 
assert(tokens.contains("ocean")) 20 | assert(tokens.contains("hunt")) 21 | 22 | // Assert the unlemmatized tokens are no longer there 23 | assert(!tokens.contains("jumping")) 24 | assert(!tokens.contains("snakes")) 25 | assert(!tokens.contains("oceans")) 26 | assert(!tokens.contains("lakes")) 27 | assert(!tokens.contains("hunted")) 28 | } 29 | 30 | test("entity extraction") { 31 | sc = new SparkContext("local", "test") 32 | 33 | val text = "John likes cake and he lives in Florida" 34 | val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet 35 | 36 | // Make sure at least very simple entities were identified and extracted 37 | assert(tokens.contains("PERSON")) 38 | assert(tokens.contains("LOCATION")) 39 | 40 | // Assert the original tokens are no longer there 41 | assert(!tokens.contains("John")) 42 | assert(!tokens.contains("Florida")) 43 | } 44 | 45 | test("1-2-3-grams") { 46 | sc = new SparkContext("local", "test") 47 | 48 | val text = "a b c d" 49 | val tokens = CoreNLPFeatureExtractor(1 to 3).apply(sc.parallelize(Seq(text))).first().toSet 50 | 51 | // Make sure expected unigrams appear 52 | assert(tokens.contains("a")) 53 | assert(tokens.contains("b")) 54 | assert(tokens.contains("c")) 55 | assert(tokens.contains("d")) 56 | 57 | // Make sure expected bigrams appear 58 | assert(tokens.contains("a b")) 59 | assert(tokens.contains("b c")) 60 | assert(tokens.contains("c d")) 61 | 62 | // Make sure expected 3-grams appear 63 | assert(tokens.contains("a b c")) 64 | assert(tokens.contains("b c d")) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/HashingTFSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.workflow.PipelineContext 5 | 6 | class HashingTFSuite extends FunSuite with PipelineContext { 7 | 8 | test("HashingTF with no collisions") { 9 | val dims = 4000 10 | val hashingTF = HashingTF[Seq[String]](dims) 11 | 12 | val testDatum = Seq("1", "2", "4", "4", "4", "4", "2") 13 | 14 | val vector = hashingTF(testDatum) 15 | 16 | // Assert that the vector is actually sparse and has the right number of active positions 17 | assert(vector.activeSize === 3) 18 | assert(vector.length === dims) 19 | 20 | val termFrequenciesSet = vector.toArray.toSet 21 | 22 | // Assert that there are indices with all of the correct values 23 | assert(termFrequenciesSet === Set(0, 1, 2, 4)) 24 | } 25 | 26 | test("HashingTF with collisions") { 27 | val hashingTF = HashingTF[Seq[String]](2) 28 | 29 | val testDatum = Seq("1", "2", "4", "4", "4", "4", "2") 30 | 31 | val vector = hashingTF(testDatum) 32 | assert(vector.activeSize === 2) 33 | assert(vector.length === 2) 34 | 35 | // Assert that the sum of the tf's is still correct even though there were collisions 36 | assert(vector.toArray.sum === testDatum.size) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/NGramIndexerSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.scalatest.FunSuite 4 | 5 | class NGramIndexerSuite extends FunSuite { 6 | 7 | test("pack()") { 8 | require(NaiveBitPackIndexer.pack(Seq(1)) == math.pow(2, 40).toLong) 9 | 10 | require(NaiveBitPackIndexer.pack(Seq(1, 1)) == 11 | math.pow(2, 40).toLong + math.pow(2, 
20).toLong + math.pow(2, 60).toLong) 12 | 13 | require(NaiveBitPackIndexer.pack(Seq(1, 1, 1)) == 14 | 1 + math.pow(2, 40).toLong + math.pow(2, 20).toLong + math.pow(2, 61).toLong) 15 | 16 | val ngramIndexer = new NGramIndexerImpl[Int] 17 | val seq = ngramIndexer.minNgramOrder to ngramIndexer.maxNgramOrder 18 | require(ngramIndexer.pack(seq).equals(new NGram(seq))) 19 | } 20 | 21 | test("removeFarthestWord()") { 22 | def testWith[Word >: Int, Ngram](indexer: BackoffIndexer[Word, Ngram]) = { 23 | var ngramId = indexer.pack(Seq(1, 2, 3)) 24 | var context = indexer.removeFarthestWord(ngramId) 25 | var expected = indexer.pack(Seq(2, 3)) 26 | require(context == expected, s"actual $context, expected $expected") 27 | 28 | ngramId = indexer.pack(Seq(1, 2)) 29 | context = indexer.removeFarthestWord(ngramId) 30 | expected = indexer.pack(Seq(2)) 31 | require(context == expected, s"actual $context, expected $expected") 32 | } 33 | 34 | testWith(new NGramIndexerImpl[Int]) 35 | testWith(NaiveBitPackIndexer) 36 | } 37 | 38 | test("removeCurrentWord()") { 39 | def testWith[Word >: Int, Ngram](indexer: BackoffIndexer[Word, Ngram]) = { 40 | var ngramId = indexer.pack(Seq(1, 2, 3)) 41 | var context = indexer.removeCurrentWord(ngramId) 42 | var expected = indexer.pack(Seq(1, 2)) 43 | require(context == expected, s"actual $context, expected $expected") 44 | 45 | ngramId = indexer.pack(Seq(1, 2)) 46 | context = indexer.removeCurrentWord(ngramId) 47 | expected = indexer.pack(Seq(1)) 48 | require(context == expected, s"actual $context, expected $expected") 49 | } 50 | 51 | testWith(new NGramIndexerImpl[Int]) 52 | testWith(NaiveBitPackIndexer) 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/NGramsHashingTFSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.scalatest.FunSuite 4 | import keystoneml.workflow.PipelineContext 5 | 6 | class NGramsHashingTFSuite extends FunSuite with PipelineContext { 7 | 8 | test("NGramsHashingTF 1 to 1") { 9 | val dims = 40000 10 | 11 | val testDatum = "this sentence is a sentence is the some there some then there some".split(" ") 12 | val ngrams = NGramsFeaturizer(1 to 1).apply(testDatum) 13 | val tfVector = HashingTF(dims).apply(ngrams) 14 | 15 | val ngramsHashingTFVector = NGramsHashingTF(1 to 1, dims).apply(testDatum) 16 | 17 | // Assert that the NGramsHashingTF node returns the same output as first getting n-grams then hashing 18 | assert(ngramsHashingTFVector === tfVector) 19 | } 20 | 21 | test("NGramsHashingTF 1 to 3") { 22 | val dims = 40000 23 | 24 | val testDatum = "this sentence is a sentence is the some there some then there some".split(" ") 25 | val ngrams = NGramsFeaturizer(1 to 3).apply(testDatum) 26 | val tfVector = HashingTF(dims).apply(ngrams) 27 | 28 | val ngramsHashingTFVector = NGramsHashingTF(1 to 3, dims).apply(testDatum) 29 | 30 | // Assert that the NGramsHashingTF node returns the same output as first getting n-grams then hashing 31 | assert(ngramsHashingTFVector === tfVector) 32 | } 33 | 34 | test("NGramsHashingTF 2 to 3") { 35 | val dims = 40000 36 | 37 | val testDatum = "this sentence is a sentence is the some there some then there some".split(" ") 38 | val ngrams = NGramsFeaturizer(2 to 3).apply(testDatum) 39 | val tfVector = HashingTF(dims).apply(ngrams) 40 | 41 | val ngramsHashingTFVector = NGramsHashingTF(2 to 3, dims).apply(testDatum) 42 | 43 | // Assert that the 
NGramsHashingTF node returns the same output as first getting n-grams then hashing 44 | assert(ngramsHashingTFVector === tfVector) 45 | } 46 | 47 | test("NGramsHashingTF with collisions 1 to 3") { 48 | val dims = 6 49 | 50 | val testDatum = "this sentence is a sentence is the some there some then there some".split(" ") 51 | val ngrams = NGramsFeaturizer(1 to 3).apply(testDatum) 52 | val tfVector = HashingTF(dims).apply(ngrams) 53 | 54 | val ngramsHashingTFVector = NGramsHashingTF(1 to 3, dims).apply(testDatum) 55 | 56 | // Assert that the NGramsHashingTF node returns the same output as first getting n-grams then hashing 57 | assert(ngramsHashingTFVector === tfVector) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/StringUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.apache.spark.SparkContext 4 | import org.scalatest.FunSuite 5 | import keystoneml.workflow.PipelineContext 6 | 7 | class StringUtilsSuite extends FunSuite with PipelineContext { 8 | val stringToManip = Array(" The quick BROWN fo.X ", " ! !.,)JumpeD. ovER the LAZy DOG.. ! ") 9 | test("trim") { 10 | sc = new SparkContext("local", "test") 11 | val out = Trim.apply(sc.parallelize(stringToManip, 1)).collect().toSeq 12 | assert(out === Seq("The quick BROWN fo.X", "! !.,)JumpeD. ovER the LAZy DOG.. !")) 13 | } 14 | 15 | test("lower case") { 16 | sc = new SparkContext("local", "test") 17 | val out = LowerCase().apply(sc.parallelize(stringToManip, 1)).collect().toSeq 18 | assert(out === Seq(" the quick brown fo.x ", " ! !.,)jumped. over the lazy dog.. ! ")) 19 | } 20 | 21 | test("tokenizer") { 22 | sc = new SparkContext("local", "test") 23 | val out = Tokenizer().apply(sc.parallelize(stringToManip, 1)).collect().toSeq 24 | assert(out === Seq(Seq("", "The", "quick", "BROWN", "fo", "X"), Seq("", "JumpeD", "ovER", "the", "LAZy", "DOG"))) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/nlp/WordFrequencyEncoderSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.nlp 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | import org.scalatest.FunSuite 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class WordFrequencyEncoderSuite extends FunSuite with PipelineContext { 9 | 10 | val text = Seq("Winter coming", "Winter Winter is coming") 11 | 12 | test("WordFrequencyEncoder") { 13 | sc = new SparkContext("local[2]", "WordFrequencyEncoderSuite") 14 | val rdd = Tokenizer()(sc.parallelize(text, 2)) 15 | val encoder = WordFrequencyEncoder.fit(rdd) 16 | 17 | assert(encoder(rdd).collect().sameElements(Seq(Seq(0, 1), Seq(0, 0, 2, 1))), 18 | "frequency-encoded result incorrect") 19 | assert(encoder.unigramCounts === Map(0 -> 3, 1 -> 2, 2 -> 1), 20 | "fitted value unigramCounts incorrect") 21 | 22 | assert(encoder(sc.parallelize(Seq(Seq("hi")), 1)).collect() === Array(Seq(-1)), 23 | "OOV words not mapped to -1") 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/CosineRandomFeaturesSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics.cos 5 | import breeze.stats._ 6 | import 
breeze.stats.distributions.{CauchyDistribution, Rand} 7 | import org.scalatest.FunSuite 8 | import keystoneml.utils.Stats 9 | 10 | 11 | class CosineRandomFeaturesSuite extends FunSuite { 12 | val gamma = 1.34 13 | val numInputFeatures = 400 14 | val numOutputFeatures = 1000 15 | 16 | test("Guassian cosine random features") { 17 | val rf = CosineRandomFeatures(numInputFeatures, numOutputFeatures, gamma) 18 | 19 | // Check that b is uniform 20 | assert(max(rf.b) <= 2*math.Pi) 21 | assert(min(rf.b) >= 0) 22 | assert(rf.b.size == numOutputFeatures) 23 | 24 | // Check that W is gaussian 25 | assert(rf.W.rows == numOutputFeatures) 26 | assert(rf.W.cols == numInputFeatures) 27 | assert(Stats.aboutEq(mean(rf.W),0, 10e-3 * gamma)) 28 | assert(Stats.aboutEq(variance(rf.W), gamma * gamma, 10e-3 * gamma * gamma)) 29 | 30 | //check the mapping 31 | val in = DenseVector.rand(numInputFeatures, Rand.uniform) 32 | val out = cos((in.t * rf.W.t).t + rf.b) 33 | assert(Stats.aboutEq(rf(in), out, 10e-3)) 34 | } 35 | 36 | test("Cauchy cosine random features") { 37 | val rf = CosineRandomFeatures( 38 | numInputFeatures, 39 | numOutputFeatures, 40 | gamma, 41 | new CauchyDistribution(0, 1)) 42 | 43 | // Check that b is uniform 44 | assert(max(rf.b) <= 2*math.Pi) 45 | assert(min(rf.b) >= 0) 46 | assert(rf.b.size == numOutputFeatures) 47 | 48 | // Check that W is cauchy 49 | assert(rf.W.rows == numOutputFeatures) 50 | assert(rf.W.cols == numInputFeatures) 51 | assert(Stats.aboutEq(median(rf.W),0,10e-3 * gamma)) 52 | 53 | //check the mapping 54 | val in = DenseVector.rand(numInputFeatures, Rand.uniform) 55 | val out = cos((in.t * rf.W.t).t + rf.b) 56 | assert(Stats.aboutEq(rf(in), out, 10e-3)) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/LinearRectifierSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseMatrix 4 | import breeze.stats.distributions.Rand 5 | import org.apache.spark.SparkContext 6 | import org.scalatest.FunSuite 7 | import keystoneml.pipelines._ 8 | import keystoneml.utils.{TestUtils, MatrixUtils} 9 | import keystoneml.workflow.PipelineContext 10 | 11 | class LinearRectifierSuite extends FunSuite with PipelineContext with Logging { 12 | 13 | test("Test MaxVal") { 14 | sc = new SparkContext("local", "test") 15 | val matrixParts = TestUtils.createRandomMatrix(sc, 128, 16, 4).rdd.map(_.mat) 16 | 17 | val x = matrixParts.flatMap(y => MatrixUtils.matrixToRowArray(y)) 18 | val y = x.map(r => r.forall(_ >= 0.0)) 19 | 20 | val valmaxNode = LinearRectifier() 21 | val maxy = valmaxNode.apply(x).map(r => r.forall(_ >= 0.0)) 22 | 23 | //The random matrix should *not* all be >= 0 24 | assert(!y.reduce {(a,b) => a | b}) 25 | 26 | //The valmax'ed random matrix *should* all be >= 0. 
27 | assert(maxy.reduce {(a,b) => a | b}) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/PaddedFFTSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import org.apache.spark.SparkContext 5 | import org.scalatest.FunSuite 6 | import keystoneml.pipelines.Logging 7 | import keystoneml.utils.Stats 8 | import keystoneml.workflow.PipelineContext 9 | 10 | 11 | class PaddedFFTSuite extends FunSuite with PipelineContext with Logging { 12 | test("Test PaddedFFT node") { 13 | sc = new SparkContext("local", "test") 14 | 15 | // Set up a test matrix. 16 | val ones = DenseVector.zeros[Double](100) 17 | val twos = DenseVector.zeros[Double](100) 18 | ones(0) = 1.0 19 | twos(2) = 1.0 20 | 21 | val x = sc.parallelize(Seq(twos, ones)) 22 | val fftd = PaddedFFT().apply(x).collect() 23 | 24 | val twosout = fftd(0) 25 | val onesout = fftd(1) 26 | 27 | // Proof by agreement w/ R: Re(fft(c(0, 0, 1, rep(0, 125)))) 28 | assert(twosout.length === 64) 29 | assert(Stats.aboutEq(twosout(0), 1.0)) 30 | assert(Stats.aboutEq(twosout(16), 0.0)) 31 | assert(Stats.aboutEq(twosout(32), -1.0)) 32 | assert(Stats.aboutEq(twosout(48), 0.0)) 33 | 34 | // Proof by agreement w/ R: Re(fft(c(1, rep(0, 127)))) 35 | assert(Stats.aboutEq(onesout, DenseVector.ones[Double](64))) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/RandomSignNodeSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg._ 4 | import org.scalatest.FunSuite 5 | import org.scalatest.matchers.ShouldMatchers 6 | import keystoneml.pipelines.Logging 7 | 8 | class RandomSignNodeSuite extends FunSuite with Logging with ShouldMatchers { 9 | 10 | test("RandomSignNode") { 11 | val signs = DenseVector(1.0, -1.0, 1.0) 12 | val node = RandomSignNode(signs) 13 | val data: DenseVector[Double] = DenseVector(1.0, 2.0, 3.0) 14 | val result = node(data) 15 | Seq(result) should equal (Seq(DenseVector(1.0, -2.0, 3.0))) 16 | } 17 | 18 | test("RandomSignNode.create") { 19 | val node = RandomSignNode(1000) 20 | 21 | node.signs.foreach(elt => assert(elt == -1.0 || elt == 1.0)) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/stats/SignedHellingerMapperSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.stats 2 | 3 | import breeze.linalg.DenseVector 4 | import org.scalatest.FunSuite 5 | 6 | class SignedHellingerMapperSuite extends FunSuite { 7 | test("signed hellinger mapper") { 8 | val x = DenseVector(1.0, -4.0, 0.0, -9.0, 16.0) 9 | val shmx = DenseVector(1.0, -2.0, 0.0, -3.0, 4.0) 10 | 11 | assert(SignedHellingerMapper(x) == shmx, "Result should be signed square root of input.") 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/util/ClassLabelIndicatorsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.scalatest.FunSuite 5 | 6 | class ClassLabelIndicatorsSuite extends FunSuite { 7 | test("single label indicators") { 8 | 
intercept[AssertionError] { 9 | val zerolabels = ClassLabelIndicatorsFromIntLabels(0) 10 | } 11 | 12 | intercept[AssertionError] { 13 | val onelabel = ClassLabelIndicatorsFromIntLabels(1) 14 | } 15 | 16 | 17 | val fivelabel = ClassLabelIndicatorsFromIntLabels(5) 18 | assert(fivelabel(2) === DenseVector(-1.0,-1.0,1.0,-1.0,-1.0)) 19 | 20 | intercept[RuntimeException] { 21 | fivelabel(5) 22 | } 23 | } 24 | 25 | test("multiple label indicators without validation") { 26 | intercept[AssertionError] { 27 | val zerolabels = ClassLabelIndicatorsFromIntArrayLabels(0) 28 | } 29 | 30 | intercept[AssertionError] { 31 | val onelabel = ClassLabelIndicatorsFromIntArrayLabels(1) 32 | } 33 | 34 | val fivelabel = ClassLabelIndicatorsFromIntArrayLabels(5) 35 | 36 | assert(fivelabel(Array(2,1)) === DenseVector(-1.0,1.0,1.0,-1.0,-1.0)) 37 | 38 | intercept[IndexOutOfBoundsException] { 39 | fivelabel(Array(4,6)) 40 | } 41 | 42 | assert(fivelabel(Array(-1,2)) === DenseVector(-1.0,-1.0,1.0,-1.0,1.0), 43 | "In the unchecked case, we should get weird behavior.") 44 | 45 | } 46 | 47 | test("multiple label indicators with validation") { 48 | intercept[AssertionError] { 49 | val zerolabels = ClassLabelIndicatorsFromIntArrayLabels(0, true) 50 | } 51 | 52 | intercept[AssertionError] { 53 | val onelabel = ClassLabelIndicatorsFromIntArrayLabels(1, true) 54 | } 55 | 56 | val fivelabel = ClassLabelIndicatorsFromIntArrayLabels(5, true) 57 | 58 | assert(fivelabel(Array(2,1)) === DenseVector(-1.0,1.0,1.0,-1.0,-1.0)) 59 | 60 | intercept[RuntimeException] { 61 | fivelabel(Array(4,6)) 62 | } 63 | 64 | intercept[RuntimeException] { 65 | fivelabel(Array(-1,2)) 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/util/MaxClassifierSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.scalatest.FunSuite 5 | 6 | class MaxClassifierSuite extends FunSuite { 7 | test("max classifier") { 8 | assert(MaxClassifier.apply(DenseVector(-10.0, 42.4, 335.23, -43.0)) === 2) 9 | assert(MaxClassifier.apply(DenseVector(Double.MinValue)) === 0) 10 | assert(MaxClassifier.apply(DenseVector(3.0, -23.2, 2.99)) === 0) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/util/TopKClassifierSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.SparkContext 5 | import org.scalatest.FunSuite 6 | import keystoneml.workflow.PipelineContext 7 | 8 | class TopKClassifierSuite extends FunSuite with PipelineContext { 9 | test("top k classifier, k <= vector size") { 10 | sc = new SparkContext("local", "test") 11 | 12 | assert(TopKClassifier(2).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3)) 13 | assert(TopKClassifier(4).apply(DenseVector(Double.MinValue, Double.MaxValue, 12.0, 11.0, 10.0)) === Array(1, 2, 3, 4)) 14 | assert(TopKClassifier(3).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) 15 | } 16 | 17 | test("top k classifier, k > vector size") { 18 | sc = new SparkContext("local", "test") 19 | 20 | assert(TopKClassifier(5).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3, 0, 2)) 21 | assert(TopKClassifier(2).apply(DenseVector(Double.MinValue)) === Array(0)) 22 | 
assert(TopKClassifier(20).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/nodes/util/VectorSplitterSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.nodes.util 2 | 3 | import breeze.linalg._ 4 | import org.scalatest.FunSuite 5 | 6 | class VectorSplitterSuite extends FunSuite { 7 | test("vector splitter") { 8 | for ( 9 | bs <- Array(128, 256, 512, 1024, 2048); 10 | mul <- 0 to 2; 11 | off <- 0 to 20 by 5; 12 | feats <- Array(Some(bs*mul + off), None) 13 | ) { 14 | val sp = new VectorSplitter(bs, feats) 15 | val vec = DenseVector.zeros[Double](bs*mul + off) 16 | 17 | val expectedSplits = (bs*mul + off)/bs + (if ((bs*mul + off) % bs == 0) 0 else 1) 18 | 19 | assert(sp.splitVector(vec).length === expectedSplits, 20 | s"True length is ${sp.splitVector(vec).length}, expected length is ${expectedSplits}") 21 | } 22 | } 23 | 24 | test("vector splitter maintains order") { 25 | for ( 26 | bs <- Array(128, 256, 512, 1024, 2048); 27 | mul <- 0 to 2; 28 | off <- 0 to 20 by 5; 29 | feats <- Array(Some(bs*mul + off), None) 30 | ) { 31 | val sp = new VectorSplitter(bs, feats) 32 | val vec = rand(bs*mul + off) 33 | 34 | assert(DenseVector.vertcat(sp.splitVector(vec):_*) === vec, 35 | s"Recombinded split vector of length ${bs*mul + off} with block size $bs did not match its input") 36 | } 37 | } 38 | } -------------------------------------------------------------------------------- /src/test/scala/keystoneml/utils/ImageUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils 2 | 3 | import org.scalatest.FunSuite 4 | 5 | class ImageUtilsSuite extends FunSuite { 6 | 7 | test("crop") { 8 | val imgArr = 9 | (0 until 4).flatMap { x => 10 | (0 until 4).flatMap { y => 11 | (0 until 1).map { c => 12 | (c + x * 1 + y * 4 * 1).toDouble 13 | } 14 | } 15 | }.toArray 16 | 17 | val image = new ChannelMajorArrayVectorizedImage(imgArr, ImageMetadata(4, 4, 1)) 18 | val cropped = ImageUtils.crop(image, 1, 1, 3, 3) 19 | 20 | assert(cropped.metadata.xDim == 2) 21 | assert(cropped.metadata.yDim == 2) 22 | assert(cropped.metadata.numChannels == 1) 23 | 24 | assert(cropped.get(0, 0, 0) == 5.0) 25 | assert(cropped.get(0, 1, 0) == 6.0) 26 | assert(cropped.get(1, 0, 0) == 9.0) 27 | assert(cropped.get(1, 1, 0) == 10.0) 28 | } 29 | 30 | test("flipHorizontal") { 31 | val imgArr = 32 | (0 until 4).flatMap { x => 33 | (0 until 4).flatMap { y => 34 | (0 until 1).map { c => 35 | (c + x * 1 + y * 4 * 1).toDouble 36 | } 37 | } 38 | }.toArray 39 | 40 | val image = new ChannelMajorArrayVectorizedImage(imgArr, ImageMetadata(4, 4, 1)) 41 | 42 | val flipped = ImageUtils.flipHorizontal(image) 43 | 44 | assert(flipped.metadata.xDim == 4) 45 | assert(flipped.metadata.yDim == 4) 46 | assert(flipped.metadata.numChannels == 1) 47 | 48 | (0 until 4).foreach { x => 49 | assert(flipped.get(x, 0, 0) == image.get(x, 3, 0)) 50 | assert(flipped.get(x, 1, 0) == image.get(x, 2, 0)) 51 | assert(flipped.get(x, 2, 0) == image.get(x, 1, 0)) 52 | assert(flipped.get(x, 3, 0) == image.get(x, 0, 0)) 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/utils/MLlibUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils 2 | 3 | import 
org.apache.spark.mllib.linalg._ 4 | import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} 5 | import org.scalatest.FunSuite 6 | 7 | class MLlibUtilsSuite extends FunSuite { 8 | val arr = Array(0.1, 0.2, 0.3, 0.4) 9 | val n = 20 10 | val indices = Array(0, 3, 5, 10, 13) 11 | val values = Array(0.1, 0.5, 0.3, -0.8, -1.0) 12 | 13 | test("dense vector to breeze dense") { 14 | val vec = Vectors.dense(arr) 15 | assert(MLlibUtils.mllibVectorToDenseBreeze(vec) === new BDV[Double](arr)) 16 | } 17 | 18 | test("sparse vector to breeze dense") { 19 | val vec = Vectors.sparse(n, indices, values) 20 | val breeze = new BDV[Double](n) 21 | indices.zip(values).foreach { case (x, y) => 22 | breeze(x) = y 23 | } 24 | assert(MLlibUtils.mllibVectorToDenseBreeze(vec) === breeze) 25 | } 26 | 27 | test("dense breeze to vector") { 28 | val breeze = new BDV[Double](arr) 29 | val vec = MLlibUtils.breezeVectorToMLlib(breeze).asInstanceOf[DenseVector] 30 | assert(vec.size === arr.length) 31 | assert(vec.values.eq(arr), "should not copy data") 32 | } 33 | 34 | test("sparse breeze to vector") { 35 | val breeze = new BSV[Double](indices, values, n) 36 | val vec = MLlibUtils.breezeVectorToMLlib(breeze).asInstanceOf[SparseVector] 37 | assert(vec.size === n) 38 | assert(vec.indices.eq(indices), "should not copy data") 39 | assert(vec.values.eq(values), "should not copy data") 40 | } 41 | 42 | test("sparse breeze with partially-used arrays to vector") { 43 | val activeSize = 3 44 | val breeze = new BSV[Double](indices, values, activeSize, n) 45 | val vec = MLlibUtils.breezeVectorToMLlib(breeze).asInstanceOf[SparseVector] 46 | assert(vec.size === n) 47 | assert(vec.indices === indices.slice(0, activeSize)) 48 | assert(vec.values === values.slice(0, activeSize)) 49 | } 50 | 51 | test("dense matrix to breeze dense") { 52 | val mat = Matrices.dense(3, 2, Array(0.0, 1.0, 2.0, 3.0, 4.0, 5.0)) 53 | val breeze = MLlibUtils.mllibMatrixToDenseBreeze(mat) 54 | assert(breeze.rows === mat.numRows) 55 | assert(breeze.cols === mat.numCols) 56 | assert(breeze.data.eq(mat.asInstanceOf[DenseMatrix].values), "should not copy data") 57 | } 58 | 59 | test("sparse matrix to breeze dense") { 60 | val values = Array(1.0, 2.0, 4.0, 5.0) 61 | val colPtrs = Array(0, 2, 4) 62 | val rowIndices = Array(1, 2, 1, 2) 63 | val mat = Matrices.sparse(3, 2, colPtrs, rowIndices, values) 64 | val breeze = MLlibUtils.mllibMatrixToDenseBreeze(mat) 65 | assert(breeze.rows === mat.numRows) 66 | assert(breeze.cols === mat.numCols) 67 | assert(breeze.toArray === mat.toArray) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/utils/MatrixUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils 2 | 3 | import org.scalatest.FunSuite 4 | 5 | import breeze.linalg._ 6 | import breeze.stats._ 7 | 8 | import org.apache.spark.SparkContext 9 | 10 | import keystoneml.pipelines._ 11 | import keystoneml.workflow.PipelineContext 12 | 13 | class MatrixUtilsSuite extends FunSuite with PipelineContext { 14 | 15 | test("computeMean works correctly") { 16 | val numRows = 1000 17 | val numCols = 32 18 | val numParts = 4 19 | sc = new SparkContext("local", "test") 20 | val in = DenseMatrix.rand(numRows, numCols) 21 | val inArr = MatrixUtils.matrixToRowArray(in) 22 | val rdd = sc.parallelize(inArr, numParts).mapPartitions { iter => 23 | Iterator.single(MatrixUtils.rowsToMatrix(iter)) 24 | } 25 | val expected = mean(in(::, *)).t 26 | val 
actual = MatrixUtils.computeMean(rdd) 27 | assert(Stats.aboutEq(expected, actual, 1e-6)) 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/test/scala/keystoneml/utils/external/EncEvalSuite.scala: -------------------------------------------------------------------------------- 1 | package keystoneml.utils.external 2 | 3 | import java.io.File 4 | 5 | import breeze.linalg._ 6 | import breeze.stats.distributions.Gaussian 7 | import keystoneml.nodes.learning.GaussianMixtureModel 8 | import keystoneml.nodes.learning.external.GaussianMixtureModelEstimator 9 | import org.scalatest.FunSuite 10 | import keystoneml.pipelines.Logging 11 | import keystoneml.utils.{Stats, TestUtils} 12 | 13 | class EncEvalSuite extends FunSuite with Logging { 14 | 15 | test("Load SIFT Descriptors and compute Fisher Vector Features") { 16 | 17 | val siftDescriptor = csvread(new File(TestUtils.getTestResourceFileName("images/feats.csv"))) 18 | 19 | val gmmMeans = TestUtils.getTestResourceFileName("images/voc_codebook/means.csv") 20 | val gmmVars = TestUtils.getTestResourceFileName("images/voc_codebook/variances.csv") 21 | val gmmWeights = TestUtils.getTestResourceFileName("images/voc_codebook/priors") 22 | 23 | val gmm = GaussianMixtureModel.load(gmmMeans, gmmVars, gmmWeights) 24 | 25 | val nCenters = gmm.means.cols 26 | val nDim = gmm.means.rows 27 | 28 | val extLib = new EncEval 29 | 30 | val fisherVector = extLib.calcAndGetFVs( 31 | gmm.means.toArray.map(_.toFloat), 32 | nCenters, 33 | nDim, 34 | gmm.variances.toArray.map(_.toFloat), 35 | gmm.weights.toArray.map(_.toFloat), 36 | siftDescriptor.toArray.map(_.toFloat)) 37 | 38 | log.info(s"Fisher Vector is ${fisherVector.sum}") 39 | assert(Stats.aboutEq(fisherVector.sum, 40.109097, 1e-4), "SUM of Fisher Vectors must match expected sum.") 40 | 41 | } 42 | 43 | test("Compute a GMM from scala") { 44 | val nsamps = 10000 45 | 46 | // Generate two gaussians. 47 | val x = Gaussian(-1.0, 0.5).samples.take(nsamps).toArray 48 | val y = Gaussian(5.0, 1.0).samples.take(nsamps).toArray 49 | 50 | val z = shuffle(x ++ y).map(x => DenseVector(x)) 51 | 52 | // Compute a 1-d GMM. 53 | val extLib = new EncEval 54 | val gmm = new GaussianMixtureModelEstimator(2).fit(z) 55 | 56 | logInfo(s"GMM means: ${gmm.means.toArray.mkString(",")}") 57 | logInfo(s"GMM vars: ${gmm.variances.toArray.mkString(",")}") 58 | logInfo(s"GMM weights: ${gmm.weights.toArray.mkString(",")}") 59 | 60 | // The results should be close to the distribution we set up. 
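// The samples above come from Gaussian(-1.0, 0.5) and Gaussian(5.0, 1.0), so the fitted means should land near -1.0 and 5.0, and the fitted standard deviations (square roots of the variances) near 0.5 and 1.0.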
    assert(Stats.aboutEq(min(gmm.means), -1.0, 1e-1), "Smallest mean should be close to -1.0")
    assert(Stats.aboutEq(max(gmm.means), 5.0, 1e-1), "Largest mean should be close to 5.0")
    assert(Stats.aboutEq(math.sqrt(min(gmm.variances)), 0.5, 1e-1), "Smallest SD should be close to 0.5")
    assert(Stats.aboutEq(math.sqrt(max(gmm.variances)), 1.0, 1e-1), "Largest SD should be close to 1.0")
  }
}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/utils/external/VLFeatSuite.scala:
--------------------------------------------------------------------------------
package keystoneml.utils.external

import java.io.File

import breeze.linalg._
import breeze.numerics.abs
import org.scalatest.FunSuite
import keystoneml.pipelines.Logging
import keystoneml.utils.{ImageUtils, MatrixUtils, TestUtils}

class VLFeatSuite extends FunSuite with Logging {
  test("Load an Image and compute SIFT Features") {
    val testImage = TestUtils.loadTestImage("images/000012.jpg")
    val singleImage = ImageUtils.mapPixels(testImage, _/255.0)
    val grayImage = ImageUtils.toGrayScale(singleImage)

    val extLib = new VLFeat

    val stepSize = 3
    val binSize = 4
    val scales = 4
    val descriptorLength = 128
    val scaleStep = 0

    val rawDescDataShort = extLib.getSIFTs(grayImage.metadata.xDim, grayImage.metadata.yDim,
      stepSize, binSize, scales, scaleStep, grayImage.getSingleChannelAsFloatArray())

    assert(rawDescDataShort.length % descriptorLength == 0, "Resulting SIFTs must be 128-dimensional.")

    val numCols = rawDescDataShort.length/descriptorLength
    val result = new DenseMatrix(descriptorLength, numCols, rawDescDataShort.map(_.toDouble))

    // Compare with the output of running this image through vl_phow with matlab from the enceval package:
    // featpipem_addpaths;
    // im = im2single(imread('images/000012.jpg'));
    // featextr = featpipem.features.PhowExtractor();
    // featextr.step = 3;
    // [frames feats] = featextr.compute(im);
    // csvwrite('images/feats128.csv', feats)

    val testFeatures = csvread(new File(TestUtils.getTestResourceFileName("images/feats128.csv")))

    val diff = result - testFeatures

    // Because of subtle differences in the way image smoothing works in the VLFeat C library and the VLFeat matlab
    // library (vl_imsmooth_f vs. _vl_imsmooth_f), these two matrices will not be exactly the same.
    // Instead, we check that 99.5% of the matrix entries are off by at most 1.
    val absdiff = abs(diff).toDenseVector

    assert(absdiff.findAll(_ > 1.0).length.toDouble < 0.005*absdiff.length,
      "Fewer than 0.5% of entries may be different by more than 1.")
  }
}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/utils/images/ImageSuite.scala:
--------------------------------------------------------------------------------
package keystoneml.utils.images

import org.scalatest.FunSuite
import keystoneml.pipelines.Logging
import keystoneml.utils.VectorizedImage
import keystoneml.utils.TestUtils._

class ImageSuite extends FunSuite with Logging {
  test("Vectorized Image Coordinates Should be Correct") {
    val (x,y,z) = (100,100,3)

    val images = Array[VectorizedImage](
      genChannelMajorArrayVectorizedImage(x,y,z),
      genColumnMajorArrayVectorizedImage(x,y,z),
      genRowMajorArrayVectorizedImage(x,y,z),
      genRowColumnMajorByteArrayVectorizedImage(x,y,z)
    )

    for (
      img <- images;
      idx <- 0 until x*y*z
    ) {
      val coord = img.vectorToImageCoords(idx)
      assert(img.imageToVectorCoords(coord.x,coord.y,coord.channelIdx) == idx,
        s"imageToVectorCoords(vectorToImageCoords(idx)) should be equivalent to identity(idx) for img $img")
    }

    for (
      img <- images;
      xi <- 0 until x;
      yi <- 0 until y;
      zi <- 0 until z
    ) {
      val coord = img.vectorToImageCoords(img.imageToVectorCoords(xi,yi,zi))
      assert((coord.x, coord.y, coord.channelIdx) == (xi,yi,zi),
        s"vectorToImageCoords(imageToVectorCoords(x,y,z)) should be equivalent to identity(x,y,z) for img $img")
    }
  }
}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/workflow/EstimatorSuite.scala:
--------------------------------------------------------------------------------
package keystoneml.workflow

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite
import keystoneml.pipelines.Logging

class EstimatorSuite extends FunSuite with PipelineContext with Logging {
  test("Estimator fit RDD") {
    sc = new SparkContext("local", "test")

    val intEstimator = new Estimator[Int, Int] {
      def fit(data: RDD[Int]): Transformer[Int, Int] = {
        val first = data.first()
        Transformer(x => x + first)
      }
    }

    val trainData = sc.parallelize(Seq(32, 94, 12))
    val testData = sc.parallelize(Seq(42, 58, 61))

    val pipeline = intEstimator.withData(trainData)
    assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + 32, 58 + 32, 61 + 32))
  }

  test("Estimator fit Pipeline Data") {
    sc = new SparkContext("local", "test")

    val transformer = Transformer[Int, Int](_ * 2)

    val intEstimator = new Estimator[Int, Int] {
      def fit(data: RDD[Int]): Transformer[Int, Int] = {
        val first = data.first()
        Transformer(x => x + first)
      }
    }

    val trainData = sc.parallelize(Seq(32, 94, 12))
    val testData = sc.parallelize(Seq(42, 58, 61))

    val pipeline = intEstimator.withData(transformer(trainData))
    assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + 64, 58 + 64, 61 + 64))
  }

}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/workflow/LabelEstimatorSuite.scala:
--------------------------------------------------------------------------------
package keystoneml.workflow

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite
import keystoneml.pipelines.Logging

class LabelEstimatorSuite extends FunSuite with PipelineContext with Logging {
  test("LabelEstimator fit RDD") {
    sc = new SparkContext("local", "test")

    val intEstimator = new LabelEstimator[Int, Int, String] {
      def fit(data: RDD[Int], labels: RDD[String]): Transformer[Int, Int] = {
        val first = data.first()
        val label = labels.first().hashCode
        Transformer(x => x + first + label)
      }
    }

    val trainData = sc.parallelize(Seq(32, 94, 12))
    val trainLabels = sc.parallelize(Seq("sjkfdl", "iw", "432"))
    val testData = sc.parallelize(Seq(42, 58, 61))

    val pipeline = intEstimator.withData(trainData, trainLabels)
    val offset = 32 + "sjkfdl".hashCode
    assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + offset, 58 + offset, 61 + offset))
  }

  test("LabelEstimator fit pipeline data") {
    sc = new SparkContext("local", "test")

    val dataTransformer = Transformer[Int, Int](_ * 2)
    val labelTransformer = Transformer[String, String](_ + "hi")

    val intEstimator = new LabelEstimator[Int, Int, String] {
      def fit(data: RDD[Int], labels: RDD[String]): Transformer[Int, Int] = {
        val first = data.first()
        val label = labels.first().hashCode
        Transformer(x => x + first + label)
      }
    }

    val trainData = sc.parallelize(Seq(32, 94, 12))
    val trainLabels = sc.parallelize(Seq("sjkfdl", "iw", "432"))
    val testData = sc.parallelize(Seq(42, 58, 61))

    val pipeline = intEstimator.withData(dataTransformer(trainData), labelTransformer(trainLabels))
    val offset = 64 + "sjkfdlhi".hashCode
    assert(pipeline.apply(testData).get().collect().toSeq === Seq(42 + offset, 58 + offset, 61 + offset))
  }
}

--------------------------------------------------------------------------------
/src/test/scala/keystoneml/workflow/PipelineContext.scala:
--------------------------------------------------------------------------------
package keystoneml.workflow

import org.apache.spark.SparkContext
import org.scalatest.{BeforeAndAfterEach, Suite}

// TODO: delete this file and use the version from Spark once SPARK-750 is fixed.

/** Manages a local `sc` {@link SparkContext} variable, and the PipelineEnv, correctly stopping it after each test. */
trait PipelineContext extends BeforeAndAfterEach { self: Suite =>

  @transient var sc: SparkContext = _

  override def afterEach() {
    PipelineEnv.getOrCreate.reset()
    resetSparkContext()
    super.afterEach()
  }

  def resetSparkContext() = {
    if (sc != null) {
      PipelineContext.stop(sc)
      sc = null
    }
  }
}

object PipelineContext {
  def stop(sc: SparkContext) {
    sc.stop()
    // To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
    System.clearProperty("spark.driver.port")
  }

  /** Runs `f` by passing in `sc` and ensures that `sc` is stopped. */
  def withSpark[T](sc: SparkContext)(f: SparkContext => T) = {
    try {
      f(sc)
    } finally {
      stop(sc)
    }
  }

}

--------------------------------------------------------------------------------
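
The `withSpark` helper defined at the bottom of PipelineContext.scala is not exercised by the suites above. As an illustrative sketch only (the object name and the tiny job are hypothetical, not part of the repository, and it assumes the test sources are on the classpath), it could be used like this to make sure the SparkContext is stopped even if the body throws:

import org.apache.spark.SparkContext
import keystoneml.workflow.PipelineContext

// Hypothetical example: run a small job through PipelineContext.withSpark,
// which stops the context in a finally block and returns the body's result.
object WithSparkExample {
  def main(args: Array[String]): Unit = {
    val total = PipelineContext.withSpark(new SparkContext("local", "withSpark-example")) { sc =>
      // Double each element and sum them up.
      sc.parallelize(1 to 10).map(_ * 2).sum()
    }
    println(s"Sum of doubled values: $total") // expected: 110.0
  }
}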