├── .envrc ├── .github └── workflows │ ├── ci.yml │ └── clean.yml ├── .gitignore ├── .jvmopts ├── .scalafix.conf ├── .scalafmt.conf ├── LICENSE ├── NOTICE ├── README.md ├── build.sbt ├── catalog-info.yaml ├── core └── src │ ├── main │ ├── scala-2.12 │ │ └── com │ │ │ └── spotify │ │ │ └── featran │ │ │ ├── FeatureSpecCompat.scala │ │ │ └── converters │ │ │ └── CaseClassConverter.scala │ ├── scala-2.13 │ │ └── com │ │ │ └── spotify │ │ │ └── featran │ │ │ ├── FeatureSpecCompat.scala │ │ │ └── converters │ │ │ └── CaseClassConverter.scala │ ├── scala-3 │ │ └── com │ │ │ └── spotify │ │ │ └── featran │ │ │ ├── FeatureSpecCompat.scala │ │ │ └── converters │ │ │ └── CaseClassConverter.scala │ └── scala │ │ └── com │ │ └── spotify │ │ └── featran │ │ ├── CanBuild.scala │ │ ├── CollectionType.scala │ │ ├── CrossingFeatureBuilder.scala │ │ ├── FeatureBuilder.scala │ │ ├── FeatureExtractor.scala │ │ ├── FeatureSpec.scala │ │ ├── FlatConverter.scala │ │ ├── FlatExtractor.scala │ │ ├── FloatingPoint.scala │ │ ├── MultiFeatureExtractor.scala │ │ ├── MultiFeatureSpec.scala │ │ ├── converters │ │ ├── DefaultTransform.scala │ │ └── package.scala │ │ ├── json │ │ ├── Implicits.scala │ │ ├── JsonOps.scala │ │ └── package.scala │ │ └── transformers │ │ ├── Binarizer.scala │ │ ├── Bucketizer.scala │ │ ├── HashNHotEncoder.scala │ │ ├── HashNHotWeightedEncoder.scala │ │ ├── HashOneHotEncoder.scala │ │ ├── HeavyHitters.scala │ │ ├── IQROutlierRejector.scala │ │ ├── Identity.scala │ │ ├── Indicator.scala │ │ ├── MDL.scala │ │ ├── MaxAbsScaler.scala │ │ ├── MinMaxScaler.scala │ │ ├── NGrams.scala │ │ ├── NHotEncoder.scala │ │ ├── NHotWeightedEncoder.scala │ │ ├── Normalizer.scala │ │ ├── OneHotEncoder.scala │ │ ├── PolynomialExpansion.scala │ │ ├── PositionEncoder.scala │ │ ├── QuantileDiscretizer.scala │ │ ├── QuantileOutlierRejector.scala │ │ ├── StandardScaler.scala │ │ ├── TopNOneHotEncoder.scala │ │ ├── Transformer.scala │ │ ├── VectorIdentity.scala │ │ ├── VonMisesEvaluator.scala │ 
│ └── mdl │ │ ├── MDLPDiscretizer.scala │ │ ├── MDLUtil.scala │ │ └── ThresholdFinder.scala │ └── test │ ├── resources │ └── cars.data │ └── scala │ └── com │ └── spotify │ └── featran │ ├── CrossingSpec.scala │ ├── FeatureBuilderSpec.scala │ ├── FeatureSpecSpec.scala │ ├── Fixtures.scala │ ├── MultiFeatureSpecSpec.scala │ ├── SerializableUtils.scala │ ├── converters │ ├── CaseClassConverterTest.scala │ └── ConverterSpec.scala │ └── transformers │ ├── BinarizerSpec.scala │ ├── BucketizerSpec.scala │ ├── HashNHotEncoderSpec.scala │ ├── HashNHotWeightedEncoderSpec.scala │ ├── HashOneHotEncoderSpec.scala │ ├── HeavyHittersSpec.scala │ ├── IQROutlierRejectorSpec.scala │ ├── IdentitySpec.scala │ ├── IndicatorSpec.scala │ ├── MDLSpec.scala │ ├── MaxAbsScalerSpec.scala │ ├── MinMaxScalerSpec.scala │ ├── NGramsSpec.scala │ ├── NHotEncoderSpec.scala │ ├── NHotWeightedEncoderSpec.scala │ ├── NormalizerSpec.scala │ ├── OneHotEncoderSpec.scala │ ├── PolynomialExpansionSpec.scala │ ├── PositionEncoderSpec.scala │ ├── QuantileDiscretizerSpec.scala │ ├── QuantileOutlierRejectorSpec.scala │ ├── StandardScalerSpec.scala │ ├── TopNOneHotEncoderSpec.scala │ ├── TransformerProp.scala │ ├── VectorIdentitySpec.scala │ ├── VonMisesEvaluatorSpec.scala │ └── mdl │ ├── MDLPDiscretizerTest.scala │ ├── TestUtility.scala │ └── ThresholdFinderTest.scala ├── docs └── README.md ├── examples └── src │ └── main │ └── scala │ └── Examples.scala ├── flink └── src │ ├── main │ └── scala │ │ └── com │ │ └── spotify │ │ └── featran │ │ └── flink │ │ └── package.scala │ └── test │ └── scala │ └── com │ └── spotify │ └── featran │ └── flink │ └── FlinkTest.scala ├── java └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── spotify │ │ │ └── featran │ │ │ └── java │ │ │ ├── JFeatureExtractor.java │ │ │ ├── JFeatureSpec.java │ │ │ ├── JRecordExtractor.java │ │ │ └── SerializableFunction.java │ └── scala │ │ └── com │ │ └── spotify │ │ └── featran │ │ └── java │ │ ├── JListFeatureExtractor.scala │ │ └── 
JavaOps.scala │ └── test │ ├── java │ └── com │ │ └── spotify │ │ └── featran │ │ └── java │ │ ├── JavaTestUtil.java │ │ └── examples │ │ └── JavaExample.java │ └── scala │ └── com │ └── spotify │ └── featran │ └── java │ └── JavaTest.scala ├── jmh └── src │ └── test │ └── scala │ └── com │ └── spotify │ └── featran │ └── jmh │ ├── ExtractorBenchmark.scala │ ├── FeatureBuilderBenchmark.scala │ └── TransformerBenchmark.scala ├── numpy └── src │ ├── main │ └── scala │ │ └── com │ │ └── spotify │ │ └── featran │ │ └── numpy │ │ └── NumPy.scala │ └── test │ ├── resources │ ├── a1d-double.npy │ ├── a1d-float.npy │ ├── a1d-int.npy │ ├── a1d-long.npy │ ├── a2d-double.npy │ ├── a2d-float.npy │ ├── a2d-int.npy │ └── a2d-long.npy │ └── scala │ └── com │ └── spotify │ └── featran │ └── numpy │ └── NumPyTest.scala ├── project ├── build.properties └── plugins.sbt ├── scalding └── src │ ├── main │ └── scala │ │ └── com │ │ └── spotify │ │ └── featran │ │ └── scalding │ │ └── package.scala │ └── test │ └── scala │ └── com │ └── spotify │ └── featran │ └── scalding │ └── ScaldingTest.scala ├── scio └── src │ ├── main │ └── scala │ │ └── com │ │ └── spotify │ │ └── featran │ │ └── scio │ │ └── package.scala │ └── test │ └── scala │ └── com │ └── spotify │ └── featran │ └── scio │ └── ScioTest.scala ├── scripts ├── make-site.sh └── npy.py ├── shell.nix ├── spark └── src │ ├── main │ └── scala │ │ ├── com │ │ └── spotify │ │ │ └── featran │ │ │ └── spark │ │ │ └── package.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── rdd │ │ └── RDDUtil.scala │ └── test │ └── scala │ └── com │ └── spotify │ └── featran │ └── spark │ └── SparkTest.scala ├── tensorflow └── src │ ├── main │ └── scala │ │ └── com │ │ └── spotify │ │ └── featran │ │ └── tensorflow │ │ ├── FeatureBuilder.scala │ │ ├── FeatureName.scala │ │ ├── TensorFlowType.scala │ │ └── package.scala │ └── test │ └── scala │ └── com │ └── spotify │ └── featran │ └── tensorflow │ ├── ExampleConverterSpec.scala │ ├── 
ExampleExtractorSpec.scala │ └── TensorFlowFeatureBuilderSpec.scala └── xgboost └── src ├── main └── scala │ ├── com │ └── spotify │ │ └── featran │ │ └── xgboost │ │ ├── FeatureBuilder.scala │ │ ├── SparseLabeledPoint.scala │ │ └── package.scala │ └── ml │ └── dmlc │ └── xgboost4j │ └── LabeledPoint.scala └── test └── scala └── com └── spotify └── featran └── xgboost └── XGBoostFeatureBuilderSpec.scala /.envrc: -------------------------------------------------------------------------------- 1 | use nix -------------------------------------------------------------------------------- /.github/workflows/clean.yml: -------------------------------------------------------------------------------- 1 | # This file was automatically generated by sbt-github-actions using the 2 | # githubWorkflowGenerate task. You should add and commit this file to 3 | # your git repository. It goes without saying that you shouldn't edit 4 | # this file by hand! Instead, if you wish to make changes, you should 5 | # change your sbt build configuration to revise the workflow description 6 | # to meet your needs, then regenerate this file. 7 | 8 | name: Clean 9 | 10 | on: push 11 | 12 | jobs: 13 | delete-artifacts: 14 | name: Delete Artifacts 15 | runs-on: ubuntu-latest 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | steps: 19 | - name: Delete artifacts 20 | run: | 21 | # Customize those three lines with your repository and credentials: 22 | REPO=${GITHUB_API_URL}/repos/${{ github.repository }} 23 | 24 | # A shortcut to call GitHub API. 25 | ghapi() { curl --silent --location --user _:$GITHUB_TOKEN "$@"; } 26 | 27 | # A temporary file which receives HTTP response headers. 28 | TMPFILE=/tmp/tmp.$$ 29 | 30 | # An associative array, key: artifact name, value: number of artifacts of that name. 31 | declare -A ARTCOUNT 32 | 33 | # Process all artifacts on this repository, loop on returned "pages". 
34 | URL=$REPO/actions/artifacts 35 | while [[ -n "$URL" ]]; do 36 | 37 | # Get current page, get response headers in a temporary file. 38 | JSON=$(ghapi --dump-header $TMPFILE "$URL") 39 | 40 | # Get URL of next page. Will be empty if we are at the last page. 41 | URL=$(grep '^Link:' "$TMPFILE" | tr ',' '\n' | grep 'rel="next"' | head -1 | sed -e 's/.*<//' -e 's/>.*//') 42 | rm -f $TMPFILE 43 | 44 | # Number of artifacts on this page: 45 | COUNT=$(( $(jq <<<$JSON -r '.artifacts | length') )) 46 | 47 | # Loop on all artifacts on this page. 48 | for ((i=0; $i < $COUNT; i++)); do 49 | 50 | # Get name of artifact and count instances of this name. 51 | name=$(jq <<<$JSON -r ".artifacts[$i].name?") 52 | ARTCOUNT[$name]=$(( $(( ${ARTCOUNT[$name]} )) + 1)) 53 | 54 | id=$(jq <<<$JSON -r ".artifacts[$i].id?") 55 | size=$(( $(jq <<<$JSON -r ".artifacts[$i].size_in_bytes?") )) 56 | printf "Deleting '%s' #%d, %'d bytes\n" $name ${ARTCOUNT[$name]} $size 57 | ghapi -X DELETE $REPO/actions/artifacts/$id 58 | done 59 | done 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | 4 | .direnv 5 | 6 | target 7 | *.swp 8 | .DS_Store 9 | .metals 10 | .bloop 11 | .bsp 12 | .java-version 13 | -------------------------------------------------------------------------------- /.jvmopts: -------------------------------------------------------------------------------- 1 | # Same JVM opts as https://github.com/typelevel/cats/blob/master/.jvmopts 2 | -Dfile.encoding=UTF8 3 | -Xms1G 4 | -Xmx6G 5 | -XX:ReservedCodeCacheSize=250M 6 | -XX:+TieredCompilation 7 | -XX:+UseParallelGC 8 | # Additional flag that might help in case of serialization failures 9 | -Dsun.io.serialization.extendedDebugInfo=true 10 | 11 | # https://github.com/spotify/scio/issues/2096 12 | # # https://github.com/sbt/sbt/issues/4794#issuecomment-500602546 13 | # -Dsbt.classloader.close=false 14 
| -------------------------------------------------------------------------------- /.scalafix.conf: -------------------------------------------------------------------------------- 1 | rules = [ 2 | RemoveUnused, 3 | LeakingImplicitClassVal 4 | ProcedureSyntax 5 | ExplicitResultTypes 6 | ] 7 | 8 | ExplicitResultTypes.memberKind = [Def, Val, Var] 9 | ExplicitResultTypes.memberVisibility = [Public] 10 | ExplicitResultTypes.skipSimpleDefinition = false 11 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = "3.6.1" 2 | maxColumn = 100 3 | binPack.literalArgumentLists = true 4 | 5 | continuationIndent { 6 | callSite = 2 7 | defnSite = 2 8 | } 9 | 10 | newlines { 11 | alwaysBeforeMultilineDef = false 12 | sometimesBeforeColonInMethodReturnType = true 13 | } 14 | 15 | docstrings = JavaDoc 16 | docstrings.oneline = fold 17 | docstrings.style = Asterisk 18 | 19 | project.git = false 20 | 21 | rewrite { 22 | rules = [ 23 | PreferCurlyFors, 24 | SortImports, 25 | RedundantBraces, 26 | RedundantParens, 27 | SortModifiers 28 | ] 29 | redundantBraces.generalExpressions = false 30 | redundantBraces.maxLines = 1 31 | } 32 | 33 | 34 | runner.dialect = scala212 35 | fileOverride { 36 | "glob:**/src/main/scala-3/**" { 37 | runner.dialect = scala3 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Featran 2 | Copyright 2017 Spotify AB 3 | -------------------------------------------------------------------------------- /catalog-info.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: backstage.io/v1alpha1 2 | kind: Resource 3 | metadata: 4 | name: featran 5 | spec: 6 | type: resource 7 | owner: flatmap 8 | 
trait FeatureSpecCompat {

  /**
   * Derives a [[FeatureSpec]] for the case class `T`, picking a default transformer for each field
   * from the type of that field.
   *
   * The implicit [[DefaultTransform]] decides which transformer is applied to continuous values;
   * when no other instance is supplied, Identity is used.
   *
   * @tparam T
   *   case class type to derive the spec for
   * @param dt
   *   default transformer factory for `Double` fields
   */
  def from[T <: Product: ClassTag: TypeTag](implicit
    dt: DefaultTransform[Double]
  ): FeatureSpec[T] = {
    val derived: FeatureSpec[T] = CaseClassConverter.toSpec[T]
    derived
  }
}
trait FeatureSpecCompat {

  /**
   * Builds the [[FeatureSpec]] of a case class `T`, defaulting the transformer of every field
   * according to the field's type.
   *
   * Continuous values use the transformer produced by the implicit [[DefaultTransform]]; Identity
   * is used when no other instance is supplied.
   *
   * @tparam T
   *   case class type to derive the spec for
   * @param dt
   *   default transformer factory for `Double` fields
   */
  def from[T <: Product: ClassTag: TypeTag](implicit
    dt: DefaultTransform[Double]
  ): FeatureSpec[T] = {
    val spec: FeatureSpec[T] = CaseClassConverter.toSpec[T]
    spec
  }
}
trait FeatureSpecCompat {

  /**
   * Generates a new [[FeatureSpec]] for case class of type `T`. This method defaults the
   * transformers based on the types of the fields.
   *
   * The implicit parameter can be used to change the default of the Transformer used for continuous
   * values. When another isn't supplied Identity will be used.
   *
   * The method is `inline` and simply forwards to `CaseClassConverter.toSpec`.
   *
   * @tparam T
   *   case class type to derive the spec for
   * @param m
   *   product mirror of `T`, resolved implicitly
   * @param dt
   *   default transformer factory for `Double` fields
   */
  inline def from[T <: Product](implicit
    m: Mirror.ProductOf[T],
    dt: DefaultTransform[Double]
  ): FeatureSpec[T] =
    CaseClassConverter.toSpec[T]
}
/**
 * Serializable factory of collection builders. Stands in for `CanBuildFrom`, which is not
 * serializable.
 *
 * @tparam T
 *   element type
 * @tparam M
 *   collection type constructor, e.g. `List`, `Array`
 */
trait CanBuild[T, M[_]] extends Serializable {

  /** Returns a builder that accumulates `T` elements into an `M[T]`. */
  def apply(): mutable.Builder[T, M[T]]
}

/** Implicit [[CanBuild]] instances for the standard collection types. */
object CanBuild {
  // Instances for collection types in _root_.scala.*

  implicit def iterableCB[T]: CanBuild[T, Iterable] =
    new CanBuild[T, Iterable] {
      override def apply(): mutable.Builder[T, Iterable[T]] = Iterable.newBuilder[T]
    }

  implicit def seqCB[T]: CanBuild[T, Seq] =
    new CanBuild[T, Seq] {
      override def apply(): mutable.Builder[T, Seq[T]] = Seq.newBuilder[T]
    }

  implicit def indexedSeqCB[T]: CanBuild[T, IndexedSeq] =
    new CanBuild[T, IndexedSeq] {
      override def apply(): mutable.Builder[T, IndexedSeq[T]] = IndexedSeq.newBuilder[T]
    }

  implicit def listCB[T]: CanBuild[T, List] =
    new CanBuild[T, List] {
      override def apply(): mutable.Builder[T, List[T]] = List.newBuilder[T]
    }

  implicit def vectorCB[T]: CanBuild[T, Vector] =
    new CanBuild[T, Vector] {
      override def apply(): mutable.Builder[T, Vector[T]] = Vector.newBuilder[T]
    }

  implicit def bufferCB[T]: CanBuild[T, mutable.Buffer] =
    new CanBuild[T, mutable.Buffer] {
      override def apply(): mutable.Builder[T, mutable.Buffer[T]] = mutable.Buffer.newBuilder[T]
    }

  // Dedicated instances for the primitive array types, declared next to the generic one below.

  implicit def floatArrayCB: CanBuild[Float, Array] =
    new CanBuild[Float, Array] {
      override def apply(): mutable.Builder[Float, Array[Float]] = Array.newBuilder[Float]
    }

  implicit def doubleArrayCB: CanBuild[Double, Array] =
    new CanBuild[Double, Array] {
      override def apply(): mutable.Builder[Double, Array[Double]] = Array.newBuilder[Double]
    }

  // Generic array instance; the ClassTag is needed to allocate an Array[T].
  implicit def arrayCB[T: ClassTag]: CanBuild[T, Array] =
    new CanBuild[T, Array] {
      override def apply(): mutable.Builder[T, Array[T]] = Array.newBuilder[T]
    }
}
*/ 43 | @inline def apply[T](implicit instance: FloatingPoint[T]): FloatingPoint[T] = instance 44 | 45 | object ops { 46 | implicit def toAllFloatingPointOps[T](target: T)(implicit tc: FloatingPoint[T]): AllOps[T] { 47 | type TypeClassType = FloatingPoint[T] 48 | } = new AllOps[T] { 49 | type TypeClassType = FloatingPoint[T] 50 | val self: T = target 51 | val typeClassInstance: TypeClassType = tc 52 | } 53 | } 54 | trait Ops[@specialized(Float, Double) T] extends Serializable { 55 | type TypeClassType <: FloatingPoint[T] 56 | def self: T 57 | val typeClassInstance: TypeClassType 58 | } 59 | trait AllOps[@specialized(Float, Double) T] extends Ops[T] 60 | trait ToFloatingPointOps extends Serializable { 61 | implicit def toFloatingPointOps[T](target: T)(implicit tc: FloatingPoint[T]): Ops[T] { 62 | type TypeClassType = FloatingPoint[T] 63 | } = new Ops[T] { 64 | type TypeClassType = FloatingPoint[T] 65 | val self: T = target 66 | val typeClassInstance: TypeClassType = tc 67 | } 68 | } 69 | object nonInheritedOps extends ToFloatingPointOps 70 | 71 | /* ======================================================================== */ 72 | /* END OF SIMULACRUM-MANAGED CODE */ 73 | /* ======================================================================== */ 74 | 75 | } 76 | -------------------------------------------------------------------------------- /core/src/main/scala/com/spotify/featran/MultiFeatureExtractor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
/**
 * Holds the features extracted through a [[MultiFeatureSpec]] and exposes them split back into the
 * individual specs, either as names or as value vectors.
 * @tparam M
 *   input collection type, e.g. `Array`, `List`
 * @tparam T
 *   input record type to extract features from
 */
class MultiFeatureExtractor[M[_]: CollectionType, T] private[featran] (
  private val fs: M[MultiFeatureSet[T]],
  @transient private val input: M[T],
  @transient private val settings: Option[M[String]]
) extends Serializable {
  import CollectionType.ops._

  // Underlying single-spec extractor; the multi feature sets are handed to it typed as plain
  // feature sets.
  private[this] val extractor =
    new FeatureExtractor(fs.asInstanceOf[M[FeatureSet[T]]], input, settings)

  /**
   * JSON settings of the [[MultiFeatureSpec]] together with the aggregated feature summary.
   *
   * Pass this to [[MultiFeatureSpec.extractWithSettings]] to skip the `reduce` step when extracting
   * new records of the same type.
   */
  @transient lazy val featureSettings: M[String] = extractor.featureSettings

  /** Feature names per spec, ordered like the values returned by [[featureValues]]. */
  @transient lazy val featureNames: M[Seq[Seq[String]]] =
    extractor.aggregate
      .cross(fs)
      .map { case (summary, featureSet) => featureSet.multiFeatureNames(summary) }

  /**
   * Feature values per spec, ordered like the names in [[featureNames]].
   * @tparam F
   *   output data type, e.g. `Array[Float]`, `Array[Double]`, `DenseVector[Float]`,
   *   `DenseVector[Double]`
   */
  def featureValues[F: FeatureBuilder: ClassTag]: M[Seq[F]] =
    featureResults[F].map { case (values, _, _) => values }

  /**
   * Feature values per spec, ordered like the names in [[featureNames]], paired with the
   * rejections keyed on feature name and with the original input record.
   * @tparam F
   *   output data type, e.g. `Array[Float]`, `Array[Double]`, `DenseVector[Float]`,
   *   `DenseVector[Double]`
   */
  def featureResults[F: FeatureBuilder: ClassTag]
    : M[(Seq[F], Seq[Map[String, FeatureRejection]], T)] =
    extractor.as
      .cross(extractor.aggregate)
      .cross(fs)
      .map { case (((record, prepared), summary), featureSet) =>
        val builders = featureSet.multiFeatureBuilders
        // Fill the builders first; results and rejections are read from them afterwards.
        featureSet.multiFeatureValues(prepared, summary, builders)
        val values = ArraySeq.unsafeWrapArray(builders.map(_.result))
        val rejections = ArraySeq.unsafeWrapArray(builders.map(_.rejections))
        (values, rejections, record)
      }
}
16 | */ 17 | 18 | package com.spotify.featran.converters 19 | 20 | import com.spotify.featran.transformers.Transformer 21 | import simulacrum.typeclass 22 | import scala.annotation.implicitNotFound 23 | 24 | /** Default Type Class used by the from generator for Case Class Conversions */ 25 | @implicitNotFound("Could not find an instance of DefaultTransform for ${T}") 26 | @typeclass trait DefaultTransform[T] extends Serializable { 27 | def apply(featureName: String): Transformer[T, _, _] 28 | } 29 | 30 | object DefaultTransform { 31 | /* ======================================================================== */ 32 | /* THE FOLLOWING CODE IS MANAGED BY SIMULACRUM; PLEASE DO NOT EDIT!!!! */ 33 | /* ======================================================================== */ 34 | 35 | /** Summon an instance of [[DefaultTransform]] for `T`. */ 36 | @inline def apply[T](implicit instance: DefaultTransform[T]): DefaultTransform[T] = instance 37 | 38 | object ops { 39 | implicit def toAllDefaultTransformOps[T]( 40 | target: T 41 | )(implicit tc: DefaultTransform[T]): AllOps[T] { 42 | type TypeClassType = DefaultTransform[T] 43 | } = new AllOps[T] { 44 | type TypeClassType = DefaultTransform[T] 45 | val self: T = target 46 | val typeClassInstance: TypeClassType = tc 47 | } 48 | } 49 | trait Ops[T] extends Serializable { 50 | type TypeClassType <: DefaultTransform[T] 51 | def self: T 52 | val typeClassInstance: TypeClassType 53 | } 54 | trait AllOps[T] extends Ops[T] 55 | trait ToDefaultTransformOps extends Serializable { 56 | implicit def toDefaultTransformOps[T](target: T)(implicit tc: DefaultTransform[T]): Ops[T] { 57 | type TypeClassType = DefaultTransform[T] 58 | } = new Ops[T] { 59 | type TypeClassType = DefaultTransform[T] 60 | val self: T = target 61 | val typeClassInstance: TypeClassType = tc 62 | } 63 | } 64 | object nonInheritedOps extends ToDefaultTransformOps 65 | 66 | /* ======================================================================== */ 67 | /* END OF 
package object converters {

  /** Extension syntax turning a `Boolean` into a numeric value. */
  implicit class RichBoolean(private val self: Boolean) extends AnyVal {

    /** `1.0` when the wrapped value is `true`, `0.0` otherwise. */
    final def asDouble: Double = self match {
      case true  => 1.0
      case false => 0.0
    }
  }

  /** Default [[DefaultTransform]] for `Double`: an `Identity` transformer named after the feature. */
  implicit val identityDefault: DefaultTransform[Double] = new DefaultTransform[Double] {
    final override def apply(featureName: String): Transformer[Double, _, _] = {
      val transformer = Identity(featureName)
      transformer
    }
  }
}
/** Mixin that encodes values to JSON through circe's `Encoder` type class. */
private[json] trait JsonEncoder {

  /** Encodes `t` with the implicit `Encoder[T]`. */
  final def encode[T: Encoder](t: T): Json = {
    val encoder = Encoder[T]
    encoder(t)
  }
}

/** Mixin that decodes JSON strings through circe's parser and `Decoder` type class. */
private[json] trait JsonDecoder {

  /** Parses `str` and decodes it as `T`; a failure is returned as the `Left` error. */
  final def decode[T: Decoder](str: String): Either[Error, T] = {
    val decoded: Either[Error, T] = parser.decode[T](str)
    decoded
  }
}
/**
 * Transform numerical features to binary features.
 *
 * A value strictly greater than `threshold` becomes 1.0; every other value becomes 0.0.
 *
 * Missing values are binarized to 0.0.
 */
object Binarizer extends SettingsBuilder {

  /**
   * Create a new [[Binarizer$]] instance.
   * @param name
   *   feature name
   * @param threshold
   *   threshold to binarize continuous features
   */
  def apply(name: String, threshold: Double = 0.0): Transformer[Double, Unit, Unit] =
    new Binarizer(name, threshold)

  /**
   * Create a new [[Binarizer$]] from a settings object
   * @param setting
   *   Settings object; its `threshold` parameter is parsed as a `Double`
   */
  def fromSettings(setting: Settings): Transformer[Double, Unit, Unit] =
    Binarizer(setting.name, setting.params("threshold").toDouble)
}

private[featran] class Binarizer(name: String, val threshold: Double) extends MapOne[Double](name) {

  /** 1.0 only when `a` is strictly above the threshold. */
  override def map(a: Double): Double = a match {
    case above if above > threshold => 1.0
    case _                          => 0.0
  }

  /** Parameters read back by `Binarizer.fromSettings`. */
  override def params: Map[String, String] = {
    val encodedThreshold = threshold.toString
    Map("threshold" -> encodedThreshold)
  }

  override def flatRead[T: FlatReader]: T => Option[Any] = {
    val reader = FlatReader[T]
    reader.readDouble(name)
  }

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Double] => fw.IF =
    fw.writeDouble(name)
}
/**
 * Transform a column of continuous features to n columns of feature buckets.
 *
 * With n+1 splits, there are n buckets. A bucket defined by splits x,y holds values in the range
 * [x,y) except the last bucket, which also includes y. Splits should be strictly increasing. Values
 * at -inf, inf must be explicitly provided to cover all double values; otherwise a
 * [[FeatureRejection.OutOfBound]] rejection is reported for values outside the specified splits.
 * Two examples of splits are `Array(Double.NegativeInfinity, 0.0, 1.0, Double.PositiveInfinity)`
 * and `Array(0.0, 1.0, 2.0)`.
 *
 * If the lower and upper bounds of the column are unknown, add `Double.NegativeInfinity` and
 * `Double.PositiveInfinity` as the outermost splits to avoid [[FeatureRejection.OutOfBound]]
 * rejections.
 *
 * The splits must be in strictly increasing order, i.e. `s0 < s1 < s2 < ... < sn`.
 *
 * Missing values are transformed to zero vectors.
 */
object Bucketizer extends SettingsBuilder {

  /**
   * Create a new [[Bucketizer$]] instance.
   * @param name
   *   feature name
   * @param splits
   *   parameter for mapping continuous features into buckets
   */
  def apply(name: String, splits: Array[Double]): Transformer[Double, Unit, Unit] =
    new Bucketizer(name, splits)

  /**
   * Create a new [[Bucketizer$]] from a settings object
   * @param setting
   *   Settings object; `splits` is expected in the `[a,b,c]` form written by `params`
   */
  def fromSettings(setting: Settings): Transformer[Double, Unit, Unit] = {
    val encoded = setting.params("splits")
    // Strip the first and last character (the surrounding brackets) before splitting.
    val inner = encoded.slice(1, encoded.length - 1)
    val splits = inner.split(",").map(_.toDouble).sorted
    Bucketizer(setting.name, splits)
  }
}

private[featran] class Bucketizer(name: String, val splits: Array[Double])
    extends Transformer[Double, Unit, Unit](name) {
  require(splits.length >= 3, "splits.length must be >= 3")
  private val lower = splits.head
  private val upper = splits.last

  // Maps the upper edge of every bucket to that bucket's index; ordering is validated on the way.
  private val map = {
    val m = new JTreeMap[Double, Int]()
    for (i <- 1 until splits.length) {
      require(splits(i) > splits(i - 1), "splits must be in increasing order")
      m.put(splits(i), i - 1)
    }
    m
  }

  override val aggregator: Aggregator[Double, Unit, Unit] =
    Aggregators.unit[Double]
  override def featureDimension(c: Unit): Int = splits.length - 1
  override def featureNames(c: Unit): Seq[String] = names(splits.length - 1)

  /**
   * Emit a one-hot vector over the buckets. Values outside `[lower, upper]` produce an all-skipped
   * vector plus an `OutOfBound` rejection; missing values produce an all-skipped vector.
   */
  override def buildFeatures(a: Option[Double], c: Unit, fb: FeatureBuilder[_]): Unit = {
    val width = splits.length - 1
    a match {
      case None =>
        fb.skip(width)
      case Some(x) if x < lower || x > upper =>
        fb.skip(width)
        fb.reject(this, FeatureRejection.OutOfBound(lower, upper, x))
      case Some(x) =>
        // No edge strictly above x means x sits on the upper bound: use the last bucket.
        val offset = Option(map.higherEntry(x)).fold(width - 1)(_.getValue)
        fb.skip(offset)
        fb.add(nameAt(offset), 1.0)
        fb.skip(width - 1 - offset)
    }
  }

  override def encodeAggregator(c: Unit): String = ""
  override def decodeAggregator(s: String): Unit = ()

  /** Splits rendered as `[s0,s1,...]`, the form parsed by `Bucketizer.fromSettings`. */
  override def params: Map[String, String] = {
    val encodedSplits = splits.mkString("[", ",", "]")
    Map("splits" -> encodedSplits)
  }

  override def flatRead[T: FlatReader]: T => Option[Any] = {
    val reader = FlatReader[T]
    reader.readDouble(name)
  }

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Double] => fw.IF =
    fw.writeDouble(name)
}
/**
 * Transform a collection of categorical features to binary columns, with at most N one-values.
 * Similar to [[NHotEncoder$]] but hashes features into buckets to reduce CPU and memory overhead.
 *
 * Missing values are transformed to zero vectors.
 *
 * If hashBucketSize is inferred with HLL, the estimate is scaled by sizeScalingFactor to reduce the
 * number of collisions.
 *
 * Rough table of relationship of scaling factor to % collisions, measured from a corpus of 466544
 * English words:
 *
 * {{{
 * sizeScalingFactor     % Collisions
 * -----------------     ------------
 *                 2     17.9934%
 *                 4     10.5686%
 *                 8      5.7236%
 *                16      3.0019%
 *                32      1.5313%
 *                64      0.7864%
 *               128      0.3920%
 *               256      0.1998%
 *               512      0.0975%
 *              1024      0.0478%
 *              2048      0.0236%
 *              4096      0.0071%
 * }}}
 */
object HashNHotEncoder extends SettingsBuilder {

  /**
   * Create a new [[HashNHotEncoder$]] instance.
   * @param name
   *   feature name
   * @param hashBucketSize
   *   number of buckets, or 0 to infer from data with HyperLogLog
   * @param sizeScalingFactor
   *   when hashBucketSize is 0, scale HLL estimate by this amount
   */
  def apply(
    name: String,
    hashBucketSize: Int = 0,
    sizeScalingFactor: Double = 8.0
  ): Transformer[Seq[String], HLL, Int] =
    new HashNHotEncoder(name, hashBucketSize, sizeScalingFactor)

  /**
   * Create a new [[HashNHotEncoder$]] from a settings object
   * @param setting
   *   Settings object providing `hashBucketSize` and `sizeScalingFactor`
   */
  def fromSettings(setting: Settings): Transformer[Seq[String], HLL, Int] = {
    val params = setting.params
    HashNHotEncoder(
      setting.name,
      params("hashBucketSize").toInt,
      params("sizeScalingFactor").toDouble
    )
  }
}

private[featran] class HashNHotEncoder(name: String, hashBucketSize: Int, sizeScalingFactor: Double)
    extends BaseHashHotEncoder[Seq[String]](name, hashBucketSize, sizeScalingFactor) {

  /** Merge the HLL sketches of every label in the row, starting from the empty sketch. */
  override def prepare(a: Seq[String]): HLL =
    a.foldLeft(hllMonoid.zero)((acc, label) => hllMonoid.plus(acc, hllMonoid.toHLL(label)))

  /**
   * Emit 1.0 at every distinct bucket hit by the row, in ascending bucket order, skipping the gaps
   * so that exactly `c` positions are produced.
   */
  override def buildFeatures(a: Option[Seq[String]], c: Int, fb: FeatureBuilder[_]): Unit =
    a match {
      case None =>
        fb.skip(c)
      case Some(xs) =>
        val buckets = SortedSet(xs.map(HashEncoder.bucket(_, c)): _*)
        // Thread the previously emitted bucket through the fold instead of mutating a var.
        val last = buckets.foldLeft(-1) { (prev, curr) =>
          val gap = curr - prev - 1
          if (gap > 0) fb.skip(gap)
          fb.add(name + '_' + curr, 1.0)
          curr
        }
        val trailing = c - last - 1
        if (trailing > 0) fb.skip(trailing)
    }

  override def flatRead[T: FlatReader]: T => Option[Any] = {
    val reader = FlatReader[T]
    reader.readStrings(name)
  }

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Seq[String]] => fw.IF =
    fw.writeStrings(name)
}
/**
 * Transform a collection of weighted categorical features to columns of weight sums, with at most N
 * values. Similar to [[NHotWeightedEncoder$]] but hashes features into buckets to reduce CPU and
 * memory overhead.
 *
 * Weights of the labels that fall in the same bucket in a row are summed, instead of emitting 1.0
 * as the normal [[NHotEncoder$]] does.
 *
 * If hashBucketSize is inferred with HLL, the estimate is scaled by sizeScalingFactor to reduce the
 * number of collisions.
 *
 * Rough table of relationship of scaling factor to % collisions, measured from a corpus of 466544
 * English words:
 *
 * {{{
 * sizeScalingFactor     % Collisions
 * -----------------     ------------
 *                 2     17.9934%
 *                 4     10.5686%
 *                 8      5.7236%
 *                16      3.0019%
 *                32      1.5313%
 *                64      0.7864%
 *               128      0.3920%
 *               256      0.1998%
 *               512      0.0975%
 *              1024      0.0478%
 *              2048      0.0236%
 *              4096      0.0071%
 * }}}
 */
object HashNHotWeightedEncoder extends SettingsBuilder {

  /**
   * Create a new [[HashNHotWeightedEncoder$]] instance.
   * @param name
   *   feature name
   * @param hashBucketSize
   *   number of buckets, or 0 to infer from data with HyperLogLog
   * @param sizeScalingFactor
   *   when hashBucketSize is 0, scale HLL estimate by this amount
   */
  def apply(
    name: String,
    hashBucketSize: Int = 0,
    sizeScalingFactor: Double = 8.0
  ): Transformer[Seq[WeightedLabel], HLL, Int] =
    new HashNHotWeightedEncoder(name, hashBucketSize, sizeScalingFactor)

  /**
   * Create a new [[HashNHotWeightedEncoder$]] from a settings object
   * @param setting
   *   Settings object providing `hashBucketSize` and `sizeScalingFactor`
   */
  def fromSettings(setting: Settings): Transformer[Seq[WeightedLabel], HLL, Int] = {
    val params = setting.params
    HashNHotWeightedEncoder(
      setting.name,
      params("hashBucketSize").toInt,
      params("sizeScalingFactor").toDouble
    )
  }
}

private[featran] class HashNHotWeightedEncoder(
  name: String,
  hashBucketSize: Int,
  sizeScalingFactor: Double
) extends BaseHashHotEncoder[Seq[WeightedLabel]](name, hashBucketSize, sizeScalingFactor) {

  /** Merge the HLL sketches of every label name in the row, starting from the empty sketch. */
  override def prepare(a: Seq[WeightedLabel]): HLL =
    a.foldLeft(hllMonoid.zero)((acc, label) => hllMonoid.plus(acc, hllMonoid.toHLL(label.name)))

  /**
   * Sum the weights per bucket and emit the sums in ascending bucket order, skipping the gaps so
   * that exactly `c` positions are produced.
   */
  override def buildFeatures(a: Option[Seq[WeightedLabel]], c: Int, fb: FeatureBuilder[_]): Unit =
    a match {
      case None =>
        fb.skip(c)
      case Some(xs) =>
        // A TreeMap keeps the buckets sorted; absent buckets read as 0.0 while accumulating.
        val weights =
          new java.util.TreeMap[Int, Double]().asScala.withDefaultValue(0.0)
        for (label <- xs) {
          val bucket = HashEncoder.bucket(label.name, c)
          weights(bucket) = weights(bucket) + label.value
        }
        val last = weights.foldLeft(-1) { case (prev, (curr, value)) =>
          val gap = curr - prev - 1
          if (gap > 0) fb.skip(gap)
          fb.add(name + '_' + curr, value)
          curr
        }
        val trailing = c - last - 1
        if (trailing > 0) fb.skip(trailing)
    }

  override def flatRead[T: FlatReader]: T => Option[Any] = {
    val reader = FlatReader[T]
    reader.readWeightedLabel(name)
  }

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Seq[WeightedLabel]] => fw.IF =
    fw.writeWeightedLabel(name)
}
/**
 * Reject values if they fall outside of either `factor * IQR` below the first quartile or
 * `factor * IQR` above the third quartile.
 *
 * IQR or inter quartile range is the range between the first and the third quartiles.
 *
 * The bin ranges are chosen using the Algebird's QTree approximate data structure. The precision of
 * the approximation can be controlled with the `k` parameter.
 *
 * All values are transformed to zeros.
 *
 * Values `factor * IQR` below the first quartile or `factor * IQR` above the third quartile are
 * rejected as [[FeatureRejection.Outlier]].
 *
 * When using aggregated feature summary from a previous session, values outside of previously seen
 * `[min, max]` will also report [[FeatureRejection.Outlier]] as rejection.
 */
object IQROutlierRejector extends SettingsBuilder {
  import BaseQuantileRejector._
  private val DefaultFactor = 1.5

  /**
   * Create a new [[IQROutlierRejector]] instance.
   *
   * @param name
   *   feature name
   * @param rejectLower
   *   whether to reject outliers `factor` * IQR below the first quartile
   * @param rejectUpper
   *   whether to reject outliers `factor` * IQR above the third quartile
   * @param k
   *   precision of the underlying Algebird QTree approximation
   * @param factor
   *   multiple of the IQR that defines the lower and upper fences (default 1.5)
   */
  def apply(
    name: String,
    rejectLower: Boolean = true,
    rejectUpper: Boolean = true,
    k: Int = QTreeAggregator.DefaultK,
    factor: Double = DefaultFactor
  ): Transformer[Double, BaseQuantileRejector.B, BaseQuantileRejector.C] =
    new IQROutlierRejector(name, rejectLower, rejectUpper, k, factor)

  /**
   * Create a new [[IQROutlierRejector]] from a settings object
   *
   * The `factor` parameter written by the transformer's `params` is restored; when it is absent
   * the default factor is used.
   *
   * @param setting
   *   Settings object
   */
  def fromSettings(setting: Settings): Transformer[Double, B, C] = {
    val factor = setting.params.get("factor").map(_.toDouble).getOrElse(DefaultFactor)
    // NOTE(review): rejectLower, rejectUpper and k still fall back to their defaults here. Their
    // parameter keys come from the superclass `params`, which is not visible in this file --
    // confirm the key names and restore them as well.
    IQROutlierRejector(setting.name, factor = factor)
  }
}

private class IQROutlierRejector(
  name: String,
  rejectLower: Boolean,
  rejectUpper: Boolean,
  k: Int,
  val factor: Double
) extends QuantileOutlierRejector(name, rejectLower, rejectUpper, 4, k) {

  /**
   * Compute the outlier fences from the first quartile `fq` and the third quartile `lq`.
   *
   * @return
   *   `(fq - factor * IQR, lq + factor * IQR)`
   */
  override def calculateBounds(fq: Double, lq: Double): (Double, Double) = {
    val iqr = lq - fq
    val l = fq - (iqr * factor)
    // The upper fence lies ABOVE the third quartile; subtracting here placed it below `fq` for
    // any factor > 1.
    val u = lq + (iqr * factor)
    (l, u)
  }

  /** Superclass parameters plus `factor`, which `IQROutlierRejector.fromSettings` reads back. */
  override def params: Map[String, String] =
    super.params ++ Map("factor" -> factor.toString)
}
/**
 * Transform features by passing them through.
 *
 * Missing values are transformed to 0.0.
 */
object Identity extends SettingsBuilder {

  /**
   * Create a new [[Identity$]] instance.
   * @param name
   *   feature name
   */
  def apply(name: String): Transformer[Double, Unit, Unit] = {
    val transformer = new Identity(name)
    transformer
  }

  /**
   * Create a new [[Identity$]] from a settings object
   * @param setting
   *   Settings object; only its name is used
   */
  def fromSettings(setting: Settings): Transformer[Double, Unit, Unit] = {
    val featureName = setting.name
    Identity(featureName)
  }
}

private[featran] class Identity(name: String) extends MapOne[Double](name) {

  /** The value is returned untouched. */
  override def map(a: Double): Double = identity(a)

  override def flatRead[T: FlatReader]: T => Option[Any] = {
    val reader = FlatReader[T]
    reader.readDouble(name)
  }

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Double] => fw.IF =
    fw.writeDouble(name)
}
/**
 * Transform an optional 1D feature to an indicator variable indicating presence.
 *
 * Missing values are mapped to 0.0. Present values are mapped to 1.0.
 */
object Indicator extends SettingsBuilder {

  /**
   * Create a new [[Indicator$]] instance.
   * @param name
   *   feature name
   */
  def apply(name: String): Transformer[Double, Unit, Unit] = {
    val transformer = new Indicator(name)
    transformer
  }

  /**
   * Create a new [[Indicator$]] from a settings object
   * @param setting
   *   Settings object; only its name is used
   */
  def fromSettings(setting: Settings): Transformer[Double, Unit, Unit] = {
    val featureName = setting.name
    Indicator(featureName)
  }
}

private[featran] class Indicator(name: String) extends MapOne[Double](name) {

  /** Any present value maps to 1.0, whatever its magnitude. */
  override def map(a: Double): Double = 1.0

  override def flatRead[T: FlatReader]: T => Option[Any] = {
    val reader = FlatReader[T]
    reader.readDouble(name)
  }

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Double] => fw.IF =
    fw.writeDouble(name)
}
/**
 * Transform features by rescaling each feature to range [-1, 1] by dividing through the maximum
 * absolute value in each feature.
 *
 * Missing values are transformed to 0.0.
 *
 * When using aggregated feature summary from a previous session, out of bound values are truncated
 * to -1.0 or 1.0 and [[FeatureRejection.OutOfBound]] rejections are reported.
 */
object MaxAbsScaler extends SettingsBuilder {

  /**
   * Create a new [[MaxAbsScaler$]] instance.
   * @param name
   *   feature name
   */
  def apply(name: String): Transformer[Double, Max[Double], Double] = {
    val transformer = new MaxAbsScaler(name)
    transformer
  }

  /**
   * Create a new [[MaxAbsScaler$]] from a settings object
   * @param setting
   *   Settings object; only its name is used
   */
  def fromSettings(setting: Settings): Transformer[Double, Max[Double], Double] = {
    val featureName = setting.name
    MaxAbsScaler(featureName)
  }
}

private[featran] class MaxAbsScaler(name: String)
    extends OneDimensional[Double, Max[Double], Double](name) {

  /** Tracks the largest absolute value seen and exposes it as a plain `Double`. */
  override val aggregator: Aggregator[Double, Max[Double], Double] =
    Aggregators.from[Double](value => Max(math.abs(value))).to(maxAbs => maxAbs.get)

  /**
   * Emit `x / c` after clipping `x` to `[-c, c]`, where `c` is the aggregated maximum absolute
   * value. Values whose magnitude exceeds `c` are additionally reported as `OutOfBound`.
   */
  override def buildFeatures(a: Option[Double], c: Double, fb: FeatureBuilder[_]): Unit = a match {
    case None =>
      fb.skip()
    case Some(x) =>
      val magnitude = math.abs(x)
      // Clip the magnitude to c, then put the sign of x back.
      val clipped = math.min(magnitude, c) * math.signum(x)
      // NOTE(review): when c is 0.0 this evaluates 0.0 / 0.0, i.e. NaN -- confirm that is intended.
      fb.add(name, clipped / c)
      if (magnitude > c) {
        fb.reject(this, FeatureRejection.OutOfBound(-c, c, x))
      }
  }

  override def encodeAggregator(c: Double): String = c.toString
  override def decodeAggregator(s: String): Double = s.toDouble

  override def flatRead[T: FlatReader]: T => Option[Any] = {
    val reader = FlatReader[T]
    reader.readDouble(name)
  }

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Double] => fw.IF =
    fw.writeDouble(name)
}
/**
 * Transform features by rescaling each feature to a specific range [`min`, `max`] (default [0, 1]).
 *
 * Missing values are transformed to `min`.
 *
 * When using aggregated feature summary from a previous session, out of bound values are truncated
 * to `min` or `max` and [[FeatureRejection.OutOfBound]] rejections are reported.
 */
object MinMaxScaler extends SettingsBuilder {

  /**
   * Create a new [[MinMaxScaler$]] instance.
   * @param name
   *   feature name
   * @param min
   *   lower bound after transformation, shared by all features
   * @param max
   *   upper bound after transformation, shared by all features
   */
  def apply(
    name: String,
    min: Double = 0.0,
    max: Double = 1.0
  ): Transformer[Double, (Min[Double], Max[Double]), C] =
    new MinMaxScaler(name, min, max)

  /**
   * Create a new [[MinMaxScaler$]] from a settings object
   * @param setting
   *   Settings object providing `min` and `max`
   */
  def fromSettings(setting: Settings): Transformer[Double, (Min[Double], Max[Double]), C] = {
    val min = setting.params("min").toDouble
    val max = setting.params("max").toDouble
    MinMaxScaler(setting.name, min, max)
  }

  /** Aggregated summary: `(observedMin / f, observedMax / f, f)`, see the class aggregator. */
  private type C = (Double, Double, Double)
}

private[featran] class MinMaxScaler(name: String, val min: Double, val max: Double)
    extends OneDimensional[Double, (Min[Double], Max[Double]), MinMaxScaler.C](name) {
  require(max > min, "max must be > min")

  import MinMaxScaler.C

  /**
   * Collects the observed minimum and maximum. When their difference overflows to infinity both
   * are halved (`f == 2.0`) so that later arithmetic stays finite; the factor is kept in the
   * summary so inputs can be scaled the same way.
   */
  override val aggregator: Aggregator[Double, (Min[Double], Max[Double]), C] =
    Aggregators.from[Double](x => (Min(x), Max(x))).to { r =>
      val (aMin, aMax) = (r._1.get, r._2.get)
      val f = if ((aMax - aMin).isInfinity) 2.0 else 1.0 // scaling factor to avoid overflow
      (aMin / f, aMax / f, f)
    }

  /**
   * Rescale `x` from the observed range to `[min, max]`, truncating values outside the observed
   * range and reporting them as `OutOfBound`. Missing values are emitted as `min`.
   */
  override def buildFeatures(a: Option[Double], c: C, fb: FeatureBuilder[_]): Unit = a match {
    case Some(x) =>
      val (aMin, aMax, f) = c
      // aMin and aMax are stored divided by f, so every comparison must use x / f as well.
      val scaled = x / f
      val truncated = math.max(math.min(scaled, aMax), aMin)
      // NOTE(review): a constant column (aMax == aMin) makes this 0.0 / 0.0, i.e. NaN -- confirm
      // whether a fixed output is wanted in that case.
      fb.add(name, (truncated - aMin) / (aMax - aMin) * (max - min) + min)
      if (scaled < aMin || scaled > aMax) {
        // Report the bounds in the caller's units by undoing the overflow scaling.
        fb.reject(this, FeatureRejection.OutOfBound(aMin * f, aMax * f, x))
      }
    case None => fb.add(name, min)
  }

  override def encodeAggregator(c: C): String = s"${c._1},${c._2},${c._3}"

  /** Inverse of `encodeAggregator`: three comma-separated doubles. */
  override def decodeAggregator(s: String): C = {
    val t = s.split(",")
    (t(0).toDouble, t(1).toDouble, t(2).toDouble)
  }

  /** Parameters read back by `MinMaxScaler.fromSettings`. */
  override def params: Map[String, String] =
    Map("min" -> min.toString, "max" -> max.toString)

  override def flatRead[T: FlatReader]: T => Option[Any] = FlatReader[T].readDouble(name)

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Double] => fw.IF =
    fw.writeDouble(name)
}
/**
 * Transform a collection of sentences, where each row is a `Seq[String]` of the words / tokens,
 * into a collection containing all the n-grams that can be constructed from each row. The feature
 * representation is an n-hot encoding (see [[NHotEncoder$]]) constructed from an expanded
 * vocabulary of all of the generated n-grams.
 *
 * N-grams are generated based on a specified range of `low` to `high` (inclusive) and are joined by
 * the given `sep` (default is " "). For example, with `low = 2`, `high = 3` and `sep = ""`, row
 * `["a", "b", "c", "d", "e"]` would produce `["ab", "bc", "cd", "de", "abc", "bcd", "cde"]`.
 *
 * As with [[NHotEncoder$]], missing values are transformed to [0.0, 0.0, ...].
 */
object NGrams extends SettingsBuilder {
  private val DefaultLow = 1
  private val DefaultHigh = -1
  private val DefaultSep = " "

  /**
   * Create a new [[NGrams$]] instance.
   *
   * @param name
   *   feature name
   * @param low
   *   the smallest size of the generated *-grams
   * @param high
   *   the largest size of the generated *-grams, or -1 for the full length of the input
   *   `Seq[String]`
   * @param sep
   *   a string separator used to join individual tokens
   */
  def apply(
    name: String,
    low: Int = DefaultLow,
    high: Int = DefaultHigh,
    sep: String = DefaultSep
  ): Transformer[Seq[String], Set[String], SortedMap[String, Int]] = {
    require(low > 0, "low must be > 0")
    require(high >= low || high == -1, "high must >= low or -1")
    new NGrams(name, low, high, sep)
  }

  /**
   * Create a new [[NGrams$]] from a settings object
   *
   * `low`, `high` and `sep` are restored from the settings parameters; a parameter that is absent
   * falls back to the same default as `apply`.
   *
   * @param setting
   *   Settings object
   */
  def fromSettings(
    setting: Settings
  ): Transformer[Seq[String], Set[String], SortedMap[String, Int]] = {
    val params = setting.params
    val low = params.get("low").map(_.toInt).getOrElse(DefaultLow)
    val high = params.get("high").map(_.toInt).getOrElse(DefaultHigh)
    val sep = params.getOrElse("sep", DefaultSep)
    NGrams(setting.name, low, high, sep)
  }
}

private[featran] class NGrams(name: String, val low: Int, val high: Int, val sep: String)
    extends NHotEncoder(name, false) {

  /** The distinct n-grams of one row. */
  override def prepare(a: Seq[String]): Set[String] = ngrams(a).toSet

  /** Expand the row into its n-grams and n-hot encode them with the parent encoder. */
  override def buildFeatures(
    a: Option[Seq[String]],
    c: SortedMap[String, Int],
    fb: FeatureBuilder[_]
  ): Unit =
    super.buildFeatures(a.map(ngrams), c, fb)

  /** Inherited parameters plus the n-gram range and separator read by `NGrams.fromSettings`. */
  override def params: Map[String, String] =
    super.params ++ Map("low" -> low.toString, "high" -> high.toString, "sep" -> sep)

  /**
   * Generate every n-gram of `a` for n from `low` to `high` inclusive (`high == -1` means up to
   * the length of `a`), ordered by n and then by position. Sizes larger than the row produce
   * nothing.
   */
  private[transformers] def ngrams(a: Seq[String]): Seq[String] = {
    val max = if (high == -1) a.length else high
    val b = Seq.newBuilder[String]
    var i = low
    while (i <= max) {
      if (i == 1) {
        // Unigrams are the tokens themselves.
        b ++= a
      } else if (i <= a.size) {
        // Sliding window of i tokens: fill it once, then shift by one token at a time.
        val q = mutable.Queue[String]()
        var j = 0
        val it = a.iterator
        while (j < i) {
          q.enqueue(it.next())
          j += 1
        }
        b += mkNGram(q, sep)
        while (it.hasNext) {
          q.dequeue()
          q.enqueue(it.next())
          b += mkNGram(q, sep)
        }
      }
      i += 1
    }
    b.result()
  }

  /** Join the tokens of a non-empty window with `sep`. */
  private def mkNGram(xs: mutable.Queue[String], sep: String): String = {
    val sb = new StringBuilder()
    val i = xs.iterator
    sb.append(i.next())
    while (i.hasNext) {
      sb.append(sep).append(i.next())
    }
    sb.mkString
  }
}
*
 * When using aggregated feature summary from a previous session, unseen labels are either
 * transformed to zero vectors or encoded as `__unknown__` (if `encodeMissingValue` is true) and
 * [[FeatureRejection.Unseen]] rejections are reported.
 */
object NHotEncoder extends SettingsBuilder {

  /** Create a new [[NHotEncoder$]] instance. */
  def apply(
    name: String,
    encodeMissingValue: Boolean = false
  ): Transformer[Seq[String], Set[String], SortedMap[String, Int]] =
    new NHotEncoder(name, encodeMissingValue)

  /**
   * Create a new [[NHotEncoder$]] from a settings object
   * @param setting
   *   Settings object; its `encodeMissingValue` parameter is parsed as a boolean
   */
  def fromSettings(
    setting: Settings
  ): Transformer[Seq[String], Set[String], SortedMap[String, Int]] =
    NHotEncoder(setting.name, setting.params("encodeMissingValue").toBoolean)
}

private[featran] class NHotEncoder(name: String, encodeMissingValue: Boolean)
    extends BaseHotEncoder[Seq[String]](name, encodeMissingValue) {
  import MissingValue.MissingValueToken

  /**
   * Fill the missing-value column: it is set to 1.0 when the row had no keys at all or contained
   * at least one unseen key, and skipped otherwise.
   */
  def addMissingValue(fb: FeatureBuilder[_], unseen: MSet[String], keys: Seq[String]): Unit =
    if (keys.isEmpty || unseen.nonEmpty) {
      fb.add(name + '_' + MissingValueToken, 1.0)
    } else {
      fb.skip()
    }

  override def prepare(a: Seq[String]): Set[String] = a.toSet

  override def buildFeatures(
    a: Option[Seq[String]],
    c: SortedMap[String, Int],
    fb: FeatureBuilder[_]
  ): Unit = a match {
    case None => addMissingItem(c, fb)
    case Some(xs) =>
      val keys = xs.distinct.sorted
      val unseen = MSet.empty[String]
      // Walk the keys in sorted order, carrying the index of the last column written so that the
      // columns between two hits can be skipped in one call.
      val last = keys.foldLeft(-1) { (prev, key) =>
        c.get(key) match {
          case Some(curr) =>
            val gap = curr - prev - 1
            if (gap > 0) fb.skip(gap)
            fb.add(name + '_' + key, 1.0)
            curr
          case None =>
            unseen += key
            prev
        }
      }
      // Skip whatever columns remain after the last hit
      val trailing = c.size - last - 1
      if (trailing > 0) fb.skip(trailing)
      if (encodeMissingValue) {
        addMissingValue(fb, unseen, keys)
      }
      if (unseen.nonEmpty) {
        fb.reject(this, FeatureRejection.Unseen(unseen.toSet))
      }
  }

  override def flatRead[T: FlatReader]: T => Option[Any] = FlatReader[T].readStrings(name)

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Seq[String]] => fw.IF =
    fw.writeStrings(name)
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/spotify/featran/transformers/NHotWeightedEncoder.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import com.spotify.featran.{FeatureBuilder, FeatureRejection, FlatReader, FlatWriter}

import scala.collection.SortedMap
import scala.collection.mutable.{Map => MMap, Set => MSet}

/** Weighted label. Also can be thought as a weighted value in a named sparse vector. */
case class WeightedLabel(name: String, value: Double)

/**
 * Transform a collection of weighted categorical features to columns of weight sums, with at most N
 * values.
*
 * Weights of the same labels in a row are summed instead of 1.0 as is the case with the normal
 * [[NHotEncoder$]].
 *
 * Missing values are either transformed to zero vectors or encoded as a missing value.
 *
 * When using aggregated feature summary from a previous session, unseen labels are either
 * transformed to zero vectors or encoded as `__unknown__` (if `encodeMissingValue` is true) and
 * [[FeatureRejection.Unseen]] rejections are reported.
 */
object NHotWeightedEncoder extends SettingsBuilder {

  /** Create a new [[NHotWeightedEncoder$]] instance. */
  def apply(
    name: String,
    encodeMissingValue: Boolean = false
  ): Transformer[Seq[WeightedLabel], Set[String], SortedMap[String, Int]] =
    new NHotWeightedEncoder(name, encodeMissingValue)

  /**
   * Create a new [[NHotWeightedEncoder$]] from a settings object
   * @param setting
   *   Settings object; its `encodeMissingValue` parameter is parsed as a boolean
   */
  def fromSettings(
    setting: Settings
  ): Transformer[Seq[WeightedLabel], Set[String], SortedMap[String, Int]] =
    NHotWeightedEncoder(setting.name, setting.params("encodeMissingValue").toBoolean)
}

private[featran] class NHotWeightedEncoder(name: String, encodeMissingValue: Boolean)
    extends BaseHotEncoder[Seq[WeightedLabel]](name, encodeMissingValue) {
  import MissingValue.MissingValueToken

  /**
   * Fill the missing-value column: 1.0 for a row without any keys, the summed weight of the unseen
   * keys when there are some, and a skip when every key was seen.
   */
  def addMissingValue(
    fb: FeatureBuilder[_],
    unseen: MSet[String],
    keys: Seq[String],
    unseenWeight: Double
  ): Unit =
    if (keys.nonEmpty && unseen.isEmpty) {
      fb.skip()
    } else {
      fb.add(name + '_' + MissingValueToken, if (keys.isEmpty) 1.0 else unseenWeight)
    }

  override def prepare(a: Seq[WeightedLabel]): Set[String] =
    a.map(_.name).toSet

  override def buildFeatures(
    a: Option[Seq[WeightedLabel]],
    c: SortedMap[String, Int],
    fb: FeatureBuilder[_]
  ): Unit = a match {
    case None => addMissingItem(c, fb)
    case Some(xs) =>
      // Sum the weights per label, in input order
      val weights = MMap.empty[String, Double].withDefaultValue(0.0)
      for (label <- xs) weights(label.name) = weights(label.name) + label.value

      val keys = weights.keys.toList.sorted
      val unseen = MSet.empty[String]
      var unseenWeight = 0.0
      // Carry the index of the last column written so that gaps are skipped in one call
      val last = keys.foldLeft(-1) { (prev, key) =>
        c.get(key) match {
          case Some(curr) =>
            val gap = curr - prev - 1
            if (gap > 0) fb.skip(gap)
            fb.add(name + '_' + key, weights(key))
            curr
          case None =>
            unseen += key
            unseenWeight += weights(key)
            prev
        }
      }
      val trailing = c.size - last - 1
      if (trailing > 0) fb.skip(trailing)
      if (encodeMissingValue) {
        addMissingValue(fb, unseen, keys, unseenWeight)
      }
      if (unseen.nonEmpty) {
        fb.reject(this, FeatureRejection.Unseen(unseen.toSet))
      }
  }

  override def flatRead[T: FlatReader]: T => Option[Any] = FlatReader[T].readWeightedLabel(name)

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Seq[WeightedLabel]] => fw.IF =
    fw.writeWeightedLabel(name)
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/spotify/featran/transformers/Normalizer.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.
See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import breeze.linalg._
import com.spotify.featran.{FeatureBuilder, FeatureRejection, FlatReader, FlatWriter}
import com.twitter.algebird.Aggregator

import scala.collection.compat.immutable.ArraySeq

/**
 * Transform vector features by normalizing each vector to have unit norm. Parameter `p` specifies
 * the p-norm used for normalization (default 2).
 *
 * Missing values are transformed to zero vectors.
 *
 * When using aggregated feature summary from a previous session, vectors of different dimensions
 * are transformed to zero vectors and [[FeatureRejection.WrongDimension]] rejections are reported.
 */
object Normalizer extends SettingsBuilder {

  /**
   * Create a new [[Normalizer$]] instance.
   * @param p
   *   normalization in L^p^ space, must be greater than or equal to 1.0
   * @param expectedLength
   *   expected length of the input vectors, or 0 to infer from data
   */
  def apply(
    name: String,
    p: Double = 2.0,
    expectedLength: Int = 0
  ): Transformer[Array[Double], Int, Int] =
    new Normalizer(name, p, expectedLength)

  /**
   * Create a new [[Normalizer$]] from a settings object
   * @param setting
   *   Settings object; its `p` and `expectedLength` parameters are parsed as numbers
   */
  def fromSettings(setting: Settings): Transformer[Array[Double], Int, Int] =
    Normalizer(
      setting.name,
      setting.params("p").toDouble,
      setting.params("expectedLength").toInt
    )
}

private[featran] class Normalizer(name: String, val p: Double, val expectedLength: Int)
    extends Transformer[Array[Double], Int, Int](name) {
  require(p >= 1.0, "p must be >= 1.0")

  override val aggregator: Aggregator[Array[Double], Int, Int] =
    Aggregators.seqLength[Double, Array](expectedLength)(ArraySeq.unsafeWrapArray)

  override def featureDimension(c: Int): Int = c

  override def featureNames(c: Int): Seq[String] = names(c)

  override def buildFeatures(a: Option[Array[Double]], c: Int, fb: FeatureBuilder[_]): Unit =
    a match {
      case None => fb.skip(c)
      case Some(x) if x.length == c =>
        val dv = DenseVector(x)
        // NOTE(review): an all-zero input has norm 0, so this division presumably yields NaN
        // entries - confirm whether that is intended.
        fb.add[Array](names(c), (dv / norm(dv, p)).data)(ArraySeq.unsafeWrapArray)
      case Some(x) =>
        // Length differs from the aggregated dimension: emit nothing and report the mismatch
        fb.skip(c)
        fb.reject(this, FeatureRejection.WrongDimension(c, x.length))
    }

  override def encodeAggregator(c: Int): String = c.toString

  override def decodeAggregator(s: String): Int = s.toInt

  override def params: Map[String, String] =
    Map("p" -> p.toString, "expectedLength" -> expectedLength.toString)

  override def flatRead[T: FlatReader]: T => Option[Any] = FlatReader[T].readDoubleArray(name)

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Array[Double]] => fw.IF =
    fw.writeDoubleArray(name)
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/spotify/featran/transformers/PositionEncoder.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2018 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

package com.spotify.featran.transformers

import com.spotify.featran.{FeatureBuilder, FeatureRejection, FlatReader, FlatWriter}

import scala.collection.SortedMap

/**
 * Transform a collection of categorical features to a single value that is the position of that
 * feature within the complete set of categories.
 *
 * Missing values are transformed to zeros so may collide with the first position. Rejections can be
 * used to remove this case.
 *
 * When using aggregated feature summary from a previous session, unseen labels are ignored and
 * [[FeatureRejection.Unseen]] rejections are reported.
 */
object PositionEncoder extends SettingsBuilder {

  /** Create a new [[PositionEncoder$]] instance. */
  def apply(name: String): Transformer[String, Set[String], SortedMap[String, Int]] =
    new PositionEncoder(name)

  /**
   * Create a new [[PositionEncoder$]] from a settings object
   * @param setting
   *   Settings object
   */
  def fromSettings(setting: Settings): Transformer[String, Set[String], SortedMap[String, Int]] =
    PositionEncoder(setting.name)
}

private[featran] class PositionEncoder(name: String) extends BaseHotEncoder[String](name, false) {
  override def prepare(a: String): Set[String] = Set(a)

  override def featureDimension(c: SortedMap[String, Int]): Int = 1

  override def featureNames(c: SortedMap[String, Int]): Seq[String] = Seq(name)

  override def buildFeatures(
    a: Option[String],
    c: SortedMap[String, Int],
    fb: FeatureBuilder[_]
  ): Unit =
    // Pair the label with its looked-up position so that all three outcomes sit side by side
    a.map(k => (k, c.get(k))) match {
      case Some((_, Some(position))) =>
        fb.add(name, position.toDouble)
      case Some((k, None)) =>
        fb.skip(1)
        fb.reject(this, FeatureRejection.Unseen(Set(k)))
      case None =>
        fb.skip(1)
        fb.reject(this, FeatureRejection.Collision)
    }

  override def flatRead[T: FlatReader]: T => Option[Any] = FlatReader[T].readString(name)

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[String] => fw.IF =
    fw.writeString(name)
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/spotify/featran/transformers/StandardScaler.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import com.spotify.featran.{FeatureBuilder, FlatReader, FlatWriter}
import com.twitter.algebird.{Aggregator, Moments}

/**
 * Transform features by normalizing each feature to have unit standard deviation and/or zero mean.
 * When `withStd` is true, it scales the data to unit standard deviation. When `withMean` is true,
 * it centers the data with mean before scaling.
 *
 * Missing values are transformed to 0.0 if `withMean` is true or population mean otherwise.
 */
object StandardScaler extends SettingsBuilder {

  /**
   * Create a new [[StandardScaler$]] instance.
   * @param withStd
   *   whether to scale the data to unit standard deviation
   * @param withMean
   *   whether to center the data with mean before scaling
   */
  def apply(
    name: String,
    withStd: Boolean = true,
    withMean: Boolean = false
  ): Transformer[Double, Moments, (Double, Double)] =
    new StandardScaler(name, withStd, withMean)

  /**
   * Create a new [[StandardScaler$]] from a settings object
   * @param setting
   *   Settings object; its `withStd` and `withMean` parameters are parsed as booleans
   */
  def fromSettings(setting: Settings): Transformer[Double, Moments, (Double, Double)] =
    StandardScaler(
      setting.name,
      setting.params("withStd").toBoolean,
      setting.params("withMean").toBoolean
    )
}

private[featran] class StandardScaler(name: String, val withStd: Boolean, val withMean: Boolean)
    extends OneDimensional[Double, Moments, (Double, Double)](name) {
  // The aggregated summary is the pair (mean, standard deviation)
  override val aggregator: Aggregator[Double, Moments, (Double, Double)] =
    Aggregators.from[Double](Moments(_)).to(m => (m.mean, m.stddev))

  override def buildFeatures(
    a: Option[Double],
    c: (Double, Double),
    fb: FeatureBuilder[_]
  ): Unit = {
    val (mean, stddev) = c
    val value = a match {
      case None => if (withMean) 0.0 else mean
      case Some(x) if withStd =>
        // NOTE(review): a zero standard deviation makes this division produce NaN or an
        // infinity - confirm whether that is intended.
        val scaled = (x - mean) / stddev
        // Without centering, the mean is added back after scaling
        if (withMean) scaled else scaled + mean
      case Some(x) => if (withMean) x - mean else x
    }
    fb.add(name, value)
  }

  override def encodeAggregator(c: (Double, Double)): String =
    Seq(c._1, c._2).mkString(",")

  override def decodeAggregator(s: String): (Double, Double) = {
    val fields = s.split(",")
    val mean = fields(0).toDouble
    val stddev = fields(1).toDouble
    (mean, stddev)
  }

  override def params: Map[String, String] =
    Map("withStd" -> withStd.toString, "withMean" -> withMean.toString)

  override def flatRead[T: FlatReader]: T => Option[Any] = FlatReader[T].readDouble(name)

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Double] => fw.IF =
    fw.writeDouble(name)
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/spotify/featran/transformers/VectorIdentity.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import com.spotify.featran.{FeatureBuilder, FeatureRejection, FlatReader, FlatWriter}
import com.twitter.algebird.Aggregator

/**
 * Takes fixed length vectors by passing them through.
 *
 * Similar to [[Identity$]] but for a sequence of doubles.
 *
 * Missing values are transformed to zero vectors.
 *
 * When using aggregated feature summary from a previous session, vectors of different dimensions
 * are transformed to zero vectors and [[FeatureRejection.WrongDimension]] rejections are reported.
 */
object VectorIdentity extends SettingsBuilder {

  /**
   * Create a new [[VectorIdentity$]] instance.
   * @param expectedLength
   *   expected length of the input vectors, or 0 to infer from data
   */
  def apply[M[_]](name: String, expectedLength: Int = 0)(implicit
    ev: M[Double] => Seq[Double]
  ): Transformer[M[Double], Int, Int] =
    new VectorIdentity(name, expectedLength)(ev)

  /**
   * Create a new [[VectorIdentity$]] from a settings object
   * @param setting
   *   Settings object; its `expectedLength` parameter is parsed as an integer
   */
  def fromSettings(setting: Settings): Transformer[Seq[Double], Int, Int] =
    VectorIdentity[Seq](setting.name, setting.params("expectedLength").toInt)
}

private[featran] class VectorIdentity[M[_]](name: String, val expectedLength: Int)(implicit
  ev: M[Double] => Seq[Double]
) extends Transformer[M[Double], Int, Int](name) {
  override val aggregator: Aggregator[M[Double], Int, Int] =
    Aggregators.seqLength(expectedLength)

  override def featureDimension(c: Int): Int = c

  override def featureNames(c: Int): Seq[String] = names(c)

  override def buildFeatures(a: Option[M[Double]], c: Int, fb: FeatureBuilder[_]): Unit =
    a match {
      case None => fb.skip(c)
      case Some(x) =>
        ev(x).length match {
          case `c` => fb.add(names(c), x)
          case actual =>
            // Length differs from the aggregated dimension: emit nothing and report it
            fb.skip(c)
            fb.reject(this, FeatureRejection.WrongDimension(c, actual))
        }
    }

  override def encodeAggregator(c: Int): String = c.toString

  override def decodeAggregator(s: String): Int = s.toInt

  override def params: Map[String, String] =
    Map("expectedLength" -> expectedLength.toString)

  override def flatRead[T: FlatReader]: T => Option[Any] = FlatReader[T].readDoubles(name)

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[M[Double]] => fw.IF =
    (v: Option[M[Double]]) => fw.writeDoubles(name)(v.map(ev))
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/spotify/featran/transformers/VonMisesEvaluator.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import breeze.stats.distributions.{Rand, VonMises}
import com.spotify.featran.{FeatureBuilder, FlatReader, FlatWriter}
import com.twitter.algebird.Aggregator

import scala.collection.compat.immutable.ArraySeq

/**
 * Transform a column of continuous features that represent the mean of a von Mises distribution to
 * n columns of continuous features. The number n represent the number of points to evaluate the von
 * Mises distribution. The von Mises pdf is given by
 *
 * f(x | mu, kappa, scale) = exp(kappa * cos(scale*(x-mu))) / (2*pi*I0(kappa))
 *
 * and is only valid for x, mu in the interval [0, 2*pi/scale].
 */
object VonMisesEvaluator extends SettingsBuilder {

  /**
   * Create a new [[VonMisesEvaluator$]] instance.
   * @param kappa
   *   measure of concentration
   * @param scale
   *   scaling factor
   * @param points
   *   points to evaluate the distribution with
   */
  def apply(
    name: String,
    kappa: Double,
    scale: Double,
    points: Array[Double]
  ): Transformer[Double, Unit, Unit] =
    new VonMisesEvaluator(name, kappa, scale, points)

  /**
   * Create a new [[VonMisesEvaluator$]] from a settings object
   * @param setting
   *   Settings object; `points` is expected in the bracketed, comma-separated form written by
   *   `params`
   */
  def fromSettings(setting: Settings): Transformer[Double, Unit, Unit] = {
    val params = setting.params
    val k = params("kappa").toDouble
    val s = params("scale").toDouble
    val str = params("points")
    // Drop the surrounding brackets before splitting the values
    val points = str.slice(1, str.length - 1).split(",").map(_.toDouble)
    VonMisesEvaluator(setting.name, k, s, points)
  }

  /**
   * Evaluate the pdf of a von Mises distribution with mean `mu * scale` and concentration `kappa`
   * at `scale * x`.
   */
  def getProbability(x: Double, mu: Double, kappa: Double, scale: Double): Double = {
    val muScaled = mu * scale
    val vm = VonMises(muScaled, kappa)(Rand)
    vm.pdf(scale * x)
  }
}

private[featran] class VonMisesEvaluator(
  name: String,
  val kappa: Double,
  val scale: Double,
  val points: Array[Double]
) extends Transformer[Double, Unit, Unit](name) {
  private val pMax = points.max
  private val pMin = points.min
  private val upperBound = 2 * math.Pi / scale
  // Validate both extremes so that every evaluation point lies in [0, 2*pi/scale]; checking the
  // maximum alone lets points below 0 through.
  // NOTE(review): `checkRange` is defined outside this file; assumed to require
  // lower <= value <= upper - confirm.
  checkRange("point", pMax, 0.0, upperBound)
  checkRange("point", pMin, 0.0, upperBound)

  override val aggregator: Aggregator[Double, Unit, Unit] =
    Aggregators.unit[Double]

  override def featureDimension(c: Unit): Int = points.length

  override def featureNames(c: Unit): Seq[String] = names(points.length)

  override def buildFeatures(a: Option[Double], c: Unit, fb: FeatureBuilder[_]): Unit = a match {
    case Some(mu) =>
      checkRange("mu", mu, 0.0, upperBound)
      // The distribution depends only on `mu`, so it is built once per row instead of once per
      // point; the values are the same as `VonMisesEvaluator.getProbability` would return.
      val vm = VonMises(mu * scale, kappa)(Rand)
      val probs = points.map(x => vm.pdf(scale * x))
      fb.add[Array](names(points.length), probs)(ArraySeq.unsafeWrapArray)
    case None => fb.skip(points.length)
  }

  override def encodeAggregator(c: Unit): String = ""

  override def decodeAggregator(s: String): Unit = ()

  override def params: Map[String, String] =
    Map(
      "kappa" -> kappa.toString,
      "scale" -> scale.toString,
      "points" -> points.mkString("[", ",", "]")
    )

  override def flatRead[T: FlatReader]: T => Option[Any] = FlatReader[T].readDouble(name)

  override def flatWriter[T](implicit fw: FlatWriter[T]): Option[Double] => fw.IF =
    fw.writeDouble(name)
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/spotify/featran/transformers/mdl/MDLPDiscretizer.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

package com.spotify.featran.transformers.mdl

import scala.annotation.nowarn
import scala.jdk.CollectionConverters._
import scala.collection.mutable
import scala.reflect.ClassTag

/**
 * Discretizes a labelled continuous feature: the data is reduced to candidate cut points, which
 * are then handed to a `ThresholdFinder` to select the thresholds.
 *
 * @param data
 *   pairs of (label, feature value)
 * @param stoppingCriterion
 *   passed through to the `ThresholdFinder`
 * @param minBinPercentage
 *   percentage of the data size used as the minimum bin weight
 */
@nowarn("msg=evidence parameter evidence.* of type scala.reflect.ClassTag\\[.\\] .* is never used")
private[transformers] class MDLPDiscretizer[T: ClassTag](
  data: Seq[(T, Double)],
  stoppingCriterion: Double = MDLPDiscretizer.DefaultStoppingCriterion,
  minBinPercentage: Double = MDLPDiscretizer.DefaultMinBinPercentage
) extends Serializable {
  // Each distinct label is numbered in order of first appearance
  private val labels = {
    val index = mutable.Map.empty[T, Int]
    data.foreach { case (label, _) => index.getOrElseUpdate(label, index.size) }
    index
  }

  /** True when the two frequency vectors together contain more than one label. */
  private def isBoundary(f1: Array[Long], f2: Array[Long]): Boolean = {
    val shared = math.min(f1.length, f2.length)
    // Stop looking as soon as a second occupied label is found
    Iterator.range(0, shared).filter(i => f1(i) + f2(i) != 0).take(2).size > 1
  }

  private def midpoint(x1: Float, x2: Float): Float = (x1 + x2) / 2.0f

  /**
   * Compute the thresholds for the feature.
   *
   * @param maxBins
   *   passed through to the `ThresholdFinder`
   */
  def discretize(maxBins: Int = MDLPDiscretizer.DefaultMaxBins): Seq[Double] = {
    // Per distinct (float) feature value, ordered by value: how often each label occurs
    val featureValues = new java.util.TreeMap[Float, Array[Long]]()
    data.foreach { case (label, value) =>
      val key = value.toFloat
      val labelIdx = labels(label)
      Option(featureValues.get(key)) match {
        case Some(freqs) => freqs(labelIdx) += 1L
        case None =>
          val freqs = new Array[Long](labels.size)
          freqs(labelIdx) = 1L
          featureValues.put(key, freqs)
      }
    }

    val cutPoint =
      if (featureValues.isEmpty) {
        Nil
      } else {
        val entries = featureValues.asScala.iterator
        val (firstX, firstFreqs) = entries.next()
        var prevX = firstX
        var prevFreqs = firstFreqs
        var candidates = List.empty[(Float, Array[Long])]
        // Starts out as the first value's own frequency array and accumulates in place
        var accum = firstFreqs
        while (entries.hasNext) {
          val (x, freqs) = entries.next()
          if (isBoundary(freqs, prevFreqs)) {
            // Close the current run at the midpoint and start a fresh accumulator
            candidates = (midpoint(x, prevX), accum) :: candidates
            accum = new Array[Long](labels.size)
          }
          prevX = x
          prevFreqs = freqs
          MDLUtil.plusI(accum, freqs)
        }
        (prevX, accum) :: candidates
      }

    val minBinWeight: Long = (minBinPercentage * data.length / 100.0).toLong
    val finder =
      new ThresholdFinder(labels.size, stoppingCriterion, maxBins, minBinWeight)
    val sortedCandidates = cutPoint.sortBy(_._1)
    finder.findThresholds(sortedCandidates).map(_.toDouble)
  }
}

private[transformers] object MDLPDiscretizer {
  val DefaultStoppingCriterion: Double = 0.0
  val DefaultMinBinPercentage: Double = 0.0
  val DefaultMaxBins: Int = 50
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/spotify/featran/transformers/mdl/MDLUtil.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2018 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

package com.spotify.featran.transformers.mdl

/** Element-wise arithmetic on frequency arrays; every result has the length of `x`. */
private object MDLUtil {

  /** Add `y` to `x` in place. */
  def plusI(x: Array[Long], y: Array[Long]): Unit =
    x.indices.foreach(i => x(i) += y(i))

  /** Return a new array holding `x(i) + y(i)`. */
  def plus(x: Array[Long], y: Array[Long]): Array[Long] =
    Array.tabulate(x.length)(i => x(i) + y(i))

  /** Return a new array holding `x(i) - y(i)`. */
  def minus(x: Array[Long], y: Array[Long]): Array[Long] =
    Array.tabulate(x.length)(i => x(i) - y(i))
}
--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/SerializableUtils.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2018 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
/** Test helpers for checking that a value survives a Java serialization round trip. */
object SerializableUtils {
  import java.{io => jio}

  /**
   * Encodes `value` with Java serialization.
   *
   * @param value object to serialize
   * @return the serialized bytes
   */
  private def serializeToByteArray(value: Serializable): Array[Byte] = {
    val buffer = new jio.ByteArrayOutputStream()
    val oos = new jio.ObjectOutputStream(buffer)
    // Close the object stream before reading the buffer so that any bytes it still
    // holds are flushed through, and so the stream is released even if writing fails.
    try oos.writeObject(value)
    finally oos.close()
    buffer.toByteArray
  }

  /**
   * Decodes a single object from bytes produced by Java serialization.
   *
   * @param encodedValue serialized bytes
   * @return the deserialized object
   */
  private def deserializeFromByteArray(encodedValue: Array[Byte]): AnyRef = {
    val ois = new jio.ObjectInputStream(new jio.ByteArrayInputStream(encodedValue))
    try ois.readObject()
    finally ois.close()
  }

  /**
   * Serializes and then deserializes `value`, returning the reconstructed copy.
   * Any serialization failure surfaces as the exception thrown by the underlying streams.
   *
   * @param value object to round-trip
   * @return the deserialized copy, cast back to `T`
   */
  def ensureSerializable[T <: Serializable](value: T): T =
    deserializeFromByteArray(serializeToByteArray(value)).asInstanceOf[T]

  /** A function type that is also `Serializable`. */
  trait SerializableFunction[A, B] extends (A => B) with Serializable
}
/** Fixture mixing numeric, string, list and boolean fields. */
case class TestData(
  num: Int,
  str: String,
  d: Double,
  l: Long,
  s: List[String],
  b: Boolean
)

/** Fixture with a single optional field. */
case class TestOpt(num: Option[Int])

/** Fixture with two optional numeric fields. */
case class TestDataOpt(num: Option[Int], d: Option[Double])

/** Fixture with plain, optional and list variants of the numeric types, all defaulting to one. */
case class TestAllNatives(
  i: Int = 1,
  s: Short = 1,
  l: Long = 1L,
  d: Double = 1.0,
  io: Option[Int] = Some(1),
  so: Option[Short] = Some(1),
  lo: Option[Long] = Some(1L),
  dopt: Option[Double] = Some(1.0),
  il: List[Int] = List(1),
  sl: List[Short] = List(1),
  ll: List[Long] = List(1L),
  dl: List[Double] = List(1.0)
)

/** Fixture with string, string-list, `MDLRecord` and `WeightedLabel` fields. */
case class TestObjects(
  str: String = "a",
  strs: List[String] = List("a"),
  mdl: MDLRecord[String] = MDLRecord("a", 1.0),
  we: List[WeightedLabel] = List(WeightedLabel("a", 1.0))
)

/** Checks the feature values extracted through specs built by `CaseClassConverter.toSpec`. */
class CaseClassConverterTest extends AnyFlatSpec with Matchers {
  it should "convert a case class to a spec" in {
    val records = List(
      TestData(1, "a", 1.0, 1L, List("c"), b = true),
      TestData(2, "b", 1.0, 1L, List("d"), b = true)
    )

    val extracted =
      CaseClassConverter.toSpec[TestData].extract(records).featureValues[Seq[Double]]

    extracted shouldBe List(
      Seq(1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0),
      Seq(2.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0)
    )
  }

  it should "convert a simple option" in {
    val records = List(TestOpt(Some(1)), TestOpt(None))

    val extracted =
      CaseClassConverter.toSpec[TestOpt].extract(records).featureValues[Seq[Double]]

    extracted shouldBe List(Seq(1.0), Seq(0.0))
  }

  it should "convert a case class to a spec with optionals" in {
    val records = List(TestDataOpt(Some(1), Some(1.0)), TestDataOpt(None, None))

    val extracted =
      CaseClassConverter.toSpec[TestDataOpt].extract(records).featureValues[Seq[Double]]

    extracted shouldBe List(Seq(1.0, 1.0), Seq(0.0, 0.0))
  }

  it should "test all native types" in {
    val records = List(TestAllNatives())

    val extracted =
      CaseClassConverter.toSpec[TestAllNatives].extract(records).featureValues[Seq[Double]]

    // one feature per field, every default being one
    extracted shouldBe List(List.fill(12)(1.0))
  }

  it should "test all object types" in {
    val records = List(TestObjects())

    val extracted =
      CaseClassConverter.toSpec[TestObjects].extract(records).featureValues[Seq[Double]]

    extracted shouldBe List(List.fill(4)(1.0))
  }
}
/** ScalaCheck properties for the `asDouble` extension on `Boolean`. */
object ConverterSpec extends Properties("converters") {
  property("boolean.asDouble") = Prop.forAll { (flag: Boolean) =>
    // true maps to 1.0, false to 0.0
    val expected = if (flag) 1.0 else 0.0
    flag.asDouble == expected
  }
}
/** Properties comparing `Binarizer` output against a strict greater-than threshold. */
object BinarizerSpec extends TransformerProp("Binarizer") {
  // Expected rows: 1.0 when the value is strictly above the threshold, 0.0 otherwise.
  private def expectedFor(xs: List[Double], threshold: Double): List[Seq[Double]] =
    xs.map(x => Seq(if (x > threshold) 1.0 else 0.0))

  property("default") = Prop.forAll { (xs: List[Double]) =>
    test(Binarizer("id"), xs, Seq("id"), expectedFor(xs, 0.0), Seq(0.0))
  }

  property("threshold") = Prop.forAll { (xs: List[Double], threshold: Double) =>
    test(Binarizer("id", threshold), xs, Seq("id"), expectedFor(xs, threshold), Seq(0.0))
  }
}
/** Properties comparing `Bucketizer` output against a one-hot bucket expectation. */
object BucketizerSpec extends TransformerProp("Bucketizer") {
  // Between 3 and 10 arbitrary split points; callers sort them before use.
  private val SplitsGen =
    Gen.choose(3, 10).flatMap(n => Gen.listOfN(n, Arbitrary.arbitrary[Double]))

  property("default") = Prop.forAll(list[Double].arbitrary, SplitsGen) { (xs, sp) =>
    test(xs, sp.toArray.sorted)
  }

  // last bucket should be inclusive
  property("inclusive") = Prop.forAll { (xs: List[Double]) =>
    val lo = xs.min
    val hi = xs.max
    val mid = lo / 2 + hi / 2 // (lo + hi) might overflow
    test(xs, Array(lo, mid, hi))
  }

  /**
   * Builds the expected one-hot rows for `xs` given sorted `splits` and delegates to the
   * inherited `test`. Values equal to the last split fall in the last bucket; values with
   * no matching bucket produce the all-zero row.
   */
  private def test(xs: List[Double], splits: Array[Double]): Prop = {
    val buckets = 0 until splits.length - 1
    val upper = splits.last
    val names = buckets.map(i => s"bucketizer_$i")
    val missing = buckets.map(_ => 0.0)

    val expected = xs.map { x =>
      val offset =
        if (x == upper) splits.length - 2
        else splits.indexWhere(x < _) - 1
      if (offset < 0) missing
      else buckets.map(i => if (i == offset) 1.0 else 0.0)
    }

    // rows for inputs outside [splits.head, splits.last]
    val rejections = xs.zip(expected).collect {
      case (x, row) if x < splits.head || x > splits.last => row
    }
    val oob = List(
      (lowerBound(splits.min), missing),
      (upperBound(splits.max), missing)
    )
    test(Bucketizer("bucketizer", splits), xs, names, expected, missing, oob, rejections)
  }
}
/** Properties comparing `HashNHotEncoder` output against buckets from `HashEncoder.bucket`. */
object HashNHotEncoderSpec extends TransformerProp("HashNHotEncoder") {
  implicit private val labelArb: Arbitrary[String] = Arbitrary(Gen.alphaStr)

  // HyperLogLog (12 bits) estimate of the number of distinct labels across all rows.
  private def estimateSize(xs: List[List[String]]): Double = {
    val hll = new HyperLogLogMonoid(12)
    val sketches = xs.flatten.map(hll.toHLL(_))
    sketches.reduce(hll.plus).estimatedSize
  }

  implicit override def list[T](implicit arb: Arbitrary[T]): Arbitrary[List[T]] = Arbitrary {
    Gen.listOfN(10, arb.arbitrary).suchThat(_.nonEmpty) // workaround for shrinking failure
  }

  property("default") = Prop.forAll { (xs: List[List[String]]) =>
    // NOTE(review): 8.0 presumably mirrors the encoder's default scaling factor; confirm.
    val expectedSize = ceil(estimateSize(xs) * 8.0).toInt
    test(HashNHotEncoder("n_hot"), expectedSize, xs)
  }

  property("size") = Prop.forAll { (xs: List[List[String]]) =>
    val fixedSize = 100
    test(HashNHotEncoder("n_hot", fixedSize), fixedSize, xs)
  }

  property("scaling") = Prop.forAll { (xs: List[List[String]]) =>
    val scalingFactor = 2.0
    val expectedSize = ceil(estimateSize(xs) * scalingFactor).toInt
    test(HashNHotEncoder("n_hot", 0, scalingFactor), expectedSize, xs)
  }

  /** Builds expected n-hot rows over `size` hash buckets and delegates to the inherited `test`. */
  private def test(
    encoder: Transformer[List[String], _, _],
    size: Int,
    xs: List[List[String]]
  ): Prop = {
    val buckets = 0 until size
    val names = buckets.map(i => s"n_hot_$i")
    val missing = buckets.map(_ => 0.0)
    val expected = xs.map { labels =>
      val hit = labels.map(HashEncoder.bucket(_, size)).toSet
      buckets.map(b => if (hit(b)) 1.0 else 0.0)
    }
    test(encoder, xs, names, expected, missing)
  }
}
/** Properties comparing `HashNHotWeightedEncoder` output against per-bucket weight sums. */
object HashNHotWeightedEncoderSpec extends TransformerProp("HashNHotWeightedEncoder") {
  // Rows of 1 to 5 labels with alphabetic names and weights in [-1.0, 1.0].
  implicit private val weightedVectors: Arbitrary[List[WeightedLabel]] = Arbitrary {
    val weightedValueGen =
      Gen.chooseNum(-1.0, 1.0).flatMap(value => Gen.alphaStr.map(n => WeightedLabel(n, value)))
    Gen.choose(1, 5).flatMap(len => Gen.listOfN(len, weightedValueGen))
  }

  // HyperLogLog (12 bits) estimate of the number of distinct label names across all rows.
  private def estimateSize(xs: List[List[WeightedLabel]]): Double = {
    val hll = new HyperLogLogMonoid(12)
    val sketches = xs.flatten.map(_.name).map(hll.toHLL(_))
    sketches.reduce(hll.plus).estimatedSize
  }

  implicit override def list[T](implicit arb: Arbitrary[T]): Arbitrary[List[T]] = Arbitrary {
    Gen.listOfN(10, arb.arbitrary).suchThat(_.nonEmpty) // workaround for shrinking failure
  }

  property("default") = Prop.forAll { (xs: List[List[WeightedLabel]]) =>
    // NOTE(review): 8.0 presumably mirrors the encoder's default scaling factor; confirm.
    val expectedSize = ceil(estimateSize(xs) * 8.0).toInt
    test(HashNHotWeightedEncoder("n_hot"), expectedSize, xs)
  }

  property("size") = Prop.forAll { (xs: List[List[WeightedLabel]]) =>
    val fixedSize = 100
    test(HashNHotWeightedEncoder("n_hot", fixedSize), fixedSize, xs)
  }

  property("scaling factor") = Prop.forAll { (xs: List[List[WeightedLabel]]) =>
    val scalingFactor = 4.0
    val expectedSize = ceil(estimateSize(xs) * scalingFactor).toInt
    test(HashNHotWeightedEncoder("n_hot", 0, scalingFactor), expectedSize, xs)
  }

  /**
   * Builds expected rows over `size` hash buckets, where each bucket holds the sum of the
   * weights hashed into it (0.0 when empty), and delegates to the inherited `test`.
   */
  private def test(
    encoder: Transformer[List[WeightedLabel], _, _],
    size: Int,
    xs: List[List[WeightedLabel]]
  ): Prop = {
    val buckets = 0 until size
    val names = buckets.map(i => s"n_hot_$i")
    val missing = buckets.map(_ => 0.0)
    val expected = xs.map { labels =>
      val weightByBucket = labels
        .map(x => (HashEncoder.bucket(x.name, size), x.value))
        .groupBy(_._1)
        .map { case (bucket, pairs) => (bucket, pairs.map(_._2).sum) }
      buckets.map(b => weightByBucket.getOrElse(b, 0.0))
    }
    test(encoder, xs, names, expected, missing)
  }
}
/** Properties comparing `HashOneHotEncoder` output against buckets from `HashEncoder.bucket`. */
object HashOneHotEncoderSpec extends TransformerProp("HashOneHotEncoder") {
  implicit private val labelArb: Arbitrary[String] = Arbitrary(Gen.alphaStr)

  // HyperLogLog (12 bits) estimate of the number of distinct labels.
  private def estimateSize(xs: List[String]): Double = {
    val hll = new HyperLogLogMonoid(12)
    val sketches = xs.map(hll.toHLL(_))
    sketches.reduce(hll.plus).estimatedSize
  }

  property("default") = Prop.forAll { (xs: List[String]) =>
    // NOTE(review): 8.0 presumably mirrors the encoder's default scaling factor; confirm.
    val expectedSize = ceil(estimateSize(xs) * 8.0).toInt
    test(HashOneHotEncoder("one_hot"), expectedSize, xs)
  }

  property("size") = Prop.forAll { (xs: List[String]) =>
    val fixedSize = 100
    test(HashOneHotEncoder("one_hot", fixedSize), fixedSize, xs)
  }

  property("scaling factor") = Prop.forAll { (xs: List[String]) =>
    val scalingFactor = 2.0
    val expectedSize = ceil(estimateSize(xs) * scalingFactor).toInt
    test(HashOneHotEncoder("one_hot", 0, scalingFactor), expectedSize, xs)
  }

  /** Builds expected one-hot rows over `size` hash buckets and delegates to the inherited `test`. */
  private def test(encoder: Transformer[String, _, _], size: Int, xs: List[String]): Prop = {
    val buckets = 0 until size
    val names = buckets.map(i => s"one_hot_$i")
    val missing = buckets.map(_ => 0.0)
    val expected = xs.map { label =>
      buckets.map(b => if (HashEncoder.bucket(label, size) == b) 1.0 else 0.0)
    }
    test(encoder, xs, names, expected, missing)
  }
}
/** Properties comparing `HeavyHitters` output against an algebird `SketchMap` built directly. */
object HeavyHittersSpec extends TransformerProp("HeavyHitters") {
  // Skewed label source: a 5-label pool is picked 50 times as often as a 50-label pool.
  implicit private val labelArb: Arbitrary[String] = Arbitrary {
    val infrequent = Gen.listOfN(50, Gen.alphaStr).flatMap(pool => Gen.oneOf(pool))
    val frequent = Gen.listOfN(5, Gen.alphaStr).flatMap(pool => Gen.oneOf(pool))
    Gen.frequency(1 -> infrequent, 50 -> frequent)
  }
  private val seed = 1

  /**
   * Aggregates `xs` into a `SketchMap` with the given parameters, derives the expected
   * (rank, frequency) row per input, and delegates to the inherited `test`. Labels that
   * are not heavy hitters map to `Seq(0.0, 0.0)`.
   */
  private def test(
    transformer: Transformer[String, _, _],
    xs: List[String],
    count: Int,
    eps: Double,
    delta: Double
  ): Prop = {
    val names = Seq("hh_rank", "hh_freq")
    val missing = Seq(0.0, 0.0)

    val params = SketchMapParams[String](seed, eps, delta, count)(_.getBytes)
    val aggregator = SketchMap.aggregator[String, Long](params)
    val sketch = xs.map(x => aggregator.prepare((x, 1L))).reduce(aggregator.monoid.plus)
    // ranks are 1-based, in heavy-hitter key order
    val rankOf = sketch.heavyHitterKeys.zipWithIndex.map { case (key, idx) =>
      key -> (idx + 1)
    }.toMap

    val expected = xs.map { x =>
      rankOf
        .get(x)
        .map(rank => Seq(rank.toDouble, params.frequency(x, sketch.valuesTable).toDouble))
        .getOrElse(Seq(0.0, 0.0))
    }
    test(transformer, xs, names, expected, missing)
  }

  property("default") = Prop.forAll { (xs: List[String]) =>
    test(HeavyHitters("hh", 10, seed = 1), xs, 10, 0.001, 0.001)
  }

  property("count") = Prop.forAll { (xs: List[String]) =>
    test(HeavyHitters("hh", 100, seed = 1), xs, 100, 0.001, 0.001)
  }

  property("eps") = Prop.forAll { (xs: List[String]) =>
    test(HeavyHitters("hh", 10, eps = 0.01, seed = 1), xs, 10, 0.01, 0.001)
  }

  property("delta") = Prop.forAll { (xs: List[String]) =>
    test(HeavyHitters("hh", 10, delta = 0.01, seed = 1), xs, 10, 0.001, 0.01)
  }
}
/** Properties comparing `IQROutlierRejector` rejections against bounds computed from a `QTree`. */
object IQROutlierRejectorSpec extends TransformerProp("IQROutlierRejector") {
  implicit private val arbPosDouble: Arbitrary[Double] = Arbitrary(Gen.posNum[Double])

  /**
   * Computes the (lower, upper) bounds used by the properties below from a `QTree` over `xs`.
   *
   * @param xs input values; must be non-empty because the trees are combined with `reduce`
   * @return the pair `(fq - 1.5 * iqr, lq - 1.5 * iqr)`
   */
  def lowerUpper(xs: List[Double]): (Double, Double) = {
    val semigroup = new QTreeSemigroup[Double](QTreeAggregator.DefaultK)
    val qt = xs.map(QTree(_)).reduce(semigroup.plus)
    val (lq, _) = qt.quantileBounds(0.75) // lower bound of the 75% quantile
    val (_, fq) = qt.quantileBounds(0.25) // upper bound of the 25% quantile
    val iqr = lq - fq
    // NOTE(review): the upper bound subtracts 1.5 * IQR from the upper quartile, whereas the
    // usual IQR rule adds it. Kept as-is because these expectations presumably mirror
    // IQROutlierRejector itself; confirm against the transformer before changing either.
    (fq - (iqr * 1.5), lq - (iqr * 1.5))
  }

  property("default") = Prop.forAll(list[Double].arbitrary) { xs =>
    val (l, u) = lowerUpper(xs)
    // nothing is expected to be rejected when all values are equal
    val hasSpread = xs.min < xs.max
    val rejected = xs.filter(x => hasSpread && (x > u || x < l)).map(_ => Seq(0d))
    // records that are not within bounds should always be rejected
    val oob = List((lowerBound(xs.min), Seq(0d)), (upperBound(xs.max), Seq(0d)))
    val rejector = IQROutlierRejector("iqr")
    test(rejector, xs, Seq("iqr"), xs.map(_ => Seq(0d)), Seq(0.0), oob, rejected)
  }

  property("rejectLower don't rejectUpper") = Prop.forAll(list[Double].arbitrary) { xs =>
    val (l, _) = lowerUpper(xs)
    val hasSpread = xs.min < xs.max
    val rejected = xs.filter(x => hasSpread && x < l).map(_ => Seq(0d))
    val rejector = IQROutlierRejector("iqr", rejectLower = true, rejectUpper = false)
    test(rejector, xs, Seq("iqr"), xs.map(_ => Seq(0d)), Seq(0.0), rejected = rejected)
  }

  property("rejectUpper don't rejectLower") = Prop.forAll(list[Double].arbitrary) { xs =>
    val (_, u) = lowerUpper(xs)
    val hasSpread = xs.min < xs.max
    val rejected = xs.filter(x => hasSpread && x > u).map(_ => Seq(0d))
    val rejector = IQROutlierRejector("iqr", rejectLower = false, rejectUpper = true)
    test(rejector, xs, Seq("iqr"), xs.map(_ => Seq(0d)), Seq(0.0), rejected = rejected)
  }
}
/** Property checking that `Identity` passes each value through as a single feature. */
object IdentitySpec extends TransformerProp("Identity") {
  property("default") = Prop.forAll { (xs: List[Double]) =>
    val expected = xs.map(x => Seq(x))
    test(Identity("id"), xs, Seq("id"), expected, Seq(0.0))
  }
}
/** Property checking that `Indicator` emits 1.0 for every present value. */
object IndicatorSpec extends TransformerProp("Indicator") {
  property("default") = Prop.forAll { (xs: List[Double]) =>
    // one Seq(1.0) row per input, regardless of the input value
    val expected = List.fill(xs.length)(Seq(1.0))
    test(Indicator("id"), xs, Seq("id"), expected, Seq(0.0))
  }
}
/** Property comparing `MDL` output against bins derived directly from `MDLPDiscretizer`. */
object MDLSpec extends TransformerProp("MDL") {
  // Records labelled "1", "2" or "3" with a positive value.
  implicit private val arbMdlRecord: Arbitrary[MDLRecord[String]] = Arbitrary {
    Gen
      .oneOf("1", "2", "3")
      .flatMap(label => Gen.posNum[Double].map(value => MDLRecord(label, value)))
  }

  property("default") = Prop.forAll { (xs: List[MDLRecord[String]]) =>
    val thresholds = new MDLPDiscretizer(xs.map(r => (r.label, r.value))).discretize()
    // the first threshold is dropped; each remaining one is the upper edge of a bin
    val slices = thresholds.tail
    val names = slices.indices.map(i => s"mdl_$i")
    val missing = Seq.fill(slices.size)(0.0)

    val expected = xs.map { case MDLRecord(_, x) =>
      // first slice strictly greater than x, falling back to the last bin
      val firstAbove = slices.indexWhere(_ > x)
      val bin = if (firstAbove >= 0) firstAbove else slices.length - 1
      val oneHot = Array.fill(slices.size)(0.0)
      oneHot(bin) = 1.0
      oneHot.toList
    }

    test(MDL[String]("mdl"), xs, names, expected, missing)
  }
}
/** Property comparing `MaxAbsScaler` output against division by the largest absolute value. */
object MaxAbsScalerSpec extends TransformerProp("MaxAbsScaler") {
  property("default") = Prop.forAll { (xs: List[Double]) =>
    val maxAbs = xs.map(x => math.abs(x)).max
    val expected = xs.map(x => Seq(x / maxAbs))
    // values beyond +/- maxAbs are expected to come out as -1.0 and 1.0
    val oob = List(
      (lowerBound(-maxAbs), Seq(-1.0)),
      (upperBound(maxAbs), Seq(1.0))
    )
    test(MaxAbsScaler("max_abs"), xs, Seq("max_abs"), expected, Seq(0.0), oob)
  }
}
/** Properties comparing `MinMaxScaler` output against a linear rescale into `[minP, maxP]`. */
object MinMaxScalerSpec extends TransformerProp("MinMaxScaler") {
  property("default") = Prop.forAll { (xs: List[Double]) => test(xs, 0.0, 1.0) }

  // limit the range of min and max to avoid overflow
  private val minMaxGen = Gen
    .choose(-1000.0, 1000.0)
    .flatMap(min => Gen.choose(1.0, 2000.0).map(range => (min, min + range)))
    .suchThat(t => t._2 > t._1)

  property("params") = Prop.forAll(list[Double].arbitrary, minMaxGen) { (xs, p) =>
    test(xs, p._1, p._2)
  }

  /**
   * Builds the expected rescaled rows for `xs` and delegates to the inherited `test`.
   * When `max - min` overflows to positive infinity, every term is halved first.
   */
  private def test(xs: List[Double], minP: Double, maxP: Double): Prop = {
    val min = xs.min
    val max = xs.max
    val f = if ((max - min).isPosInfinity) 2.0 else 1.0
    val scaledMin = min / f
    val scaledMax = max / f
    val delta = scaledMax - scaledMin

    val expected = xs.map(x => Seq((x / f - scaledMin) / delta * (maxP - minP) + minP))
    // rows for inputs outside [min / f, max / f]
    val rejected = xs.zip(expected).collect {
      case (x, row) if x < scaledMin || x > scaledMax => row
    }
    val oob = List((lowerBound(min), Seq(minP)), (upperBound(max), Seq(maxP)))
    val scaler = MinMaxScaler("min_max", minP, maxP)
    test(scaler, xs, Seq("min_max"), expected, Seq(minP), oob, rejected)
  }
}
/**
 * Property comparing `NGrams` output against an n-hot encoding of the n-grams the
 * transformer itself produces for each row.
 *
 * The property group is labelled "NGrams" so its results are not reported under the
 * same "NHotEncoder" prefix that `NHotEncoderSpec` uses.
 */
object NGramsSpec extends TransformerProp("NGrams") {
  // Rows of 1 to 5 alphabetic tokens.
  implicit private val labelArb: Arbitrary[List[String]] = Arbitrary {
    Gen.choose(1, 5).flatMap(Gen.listOfN(_, Gen.alphaStr))
  }

  property("default") = Prop.forAll { (xs: List[List[String]]) =>
    val transformer = new NGrams("n_gram", 2, 4, " ")
    val ngrams = xs.map(transformer.ngrams(_))
    // categories are the distinct n-grams over all rows, in sorted order
    val cats = ngrams.flatten.distinct.sorted
    val names = cats.map("n_gram_" + _)
    val expected =
      ngrams.map(s => cats.map(c => if (s.contains(c)) 1.0 else 0.0))
    val missing = cats.map(_ => 0.0)
    val oob = List((List("s1", "s2"), missing)) // unseen labels
    test(transformer, xs, names, expected, missing, oob)
  }
}
* You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import org.scalacheck._

/** ScalaCheck properties for `NHotEncoder`, with and without the missing-value column. */
object NHotEncoderSpec extends TransformerProp("NHotEncoder") {
  // each generated record is a list of 1 to 10 alphabetic labels
  implicit private val labelArb: Arbitrary[List[String]] = Arbitrary {
    Gen.choose(1, 10).flatMap(size => Gen.listOfN(size, Gen.alphaStr))
  }

  /** 1.0 for every category present in `labels`, 0.0 otherwise, in category order. */
  private def nHot(categories: List[String], labels: List[String]): List[Double] =
    categories.map(c => if (labels.contains(c)) 1.0 else 0.0)

  property("default") = Prop.forAll { (xs: List[List[String]]) =>
    val categories = xs.flatten.distinct.sorted
    val featureNames = categories.map(c => "n_hot_" + c)
    val encoded = xs.map(labels => nHot(categories, labels))
    val allZero = categories.map(_ => 0.0)
    // unseen labels
    val unseen = List((List("s1", "s2"), allZero))
    test(NHotEncoder("n_hot"), xs, featureNames, encoded, allZero, unseen)
  }

  property("encodeMissingValue") = Prop.forAll { (xs: List[List[String]]) =>
    import MissingValue.MissingValueToken
    // the missing-value token is appended as the last category
    val categories = xs.flatten.distinct.sorted :+ MissingValueToken
    val featureNames = categories.map(c => "n_hot_" + c)
    val missingRow = categories.map(c => if (c == MissingValueToken) 1.0 else 0.0)
    val encoded = xs.map {
      case Nil    => missingRow
      case labels => nHot(categories, labels)
    }
    // first record's encoding plus the missing-value column
    val partiallyUnseen = encoded.head.zip(missingRow).map { case (a, b) => a + b }

    // unseen or partially unseen labels
    val oob = List(
      (List("s1", "s2"), missingRow),
      (List("s1", "s2") ++ xs.head, partiallyUnseen)
    )
    test(NHotEncoder("n_hot", encodeMissingValue = true), xs, featureNames, encoded, missingRow, oob)
  }
}
--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/NHotWeightedEncoderSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import org.scalacheck.{Arbitrary, Gen, Prop}

/** ScalaCheck properties for `NHotWeightedEncoder`, with and without the missing-value column. */
object NHotWeightedEncoderSpec extends TransformerProp("NHotWeightedEncoder") {
  // each generated record is a list of 1 to 5 labels with a weight drawn from [-1.0, 1.0]
  implicit private val weightedVectors: Arbitrary[List[WeightedLabel]] = Arbitrary {
    val weightedValueGen =
      Gen.chooseNum(-1.0, 1.0).flatMap(w => Gen.alphaStr.map(n => WeightedLabel(n, w)))

    Gen.choose(1, 5).flatMap(size => Gen.listOfN(size, weightedValueGen))
  }

  /** For each category, the summed weight of the labels in `labels` carrying that name. */
  private def weights(categories: List[String], labels: List[WeightedLabel]): List[Double] =
    categories.map(c => labels.filter(_.name == c).map(_.value).sum)

  // labels whose names are not produced from the training input
  private def unseenLabels: List[WeightedLabel] =
    List(WeightedLabel("s1", 0.2), WeightedLabel("s2", 0.1))

  property("default") = Prop.forAll { (xs: List[List[WeightedLabel]]) =>
    val categories = xs.flatten.map(_.name).distinct.sorted
    val featureNames = categories.map(c => "n_hot_" + c)
    val encoded = xs.map(labels => weights(categories, labels))
    val allZero = categories.map(_ => 0.0)
    val oob = List((unseenLabels, allZero))
    test(NHotWeightedEncoder("n_hot"), xs, featureNames, encoded, allZero, oob)
  }

  property("encodeMissingValue") = Prop.forAll { (xs: List[List[WeightedLabel]]) =>
    import MissingValue.MissingValueToken
    // the missing-value token is appended as the last category
    val categories = xs.flatten.map(_.name).distinct.sorted :+ MissingValueToken
    val featureNames = categories.map(c => "n_hot_" + c)
    val encoded = xs.map(labels => weights(categories, labels))
    val missingBase = categories.map(c => if (c == MissingValueToken) 1.0 else 0.0)

    // the unseen labels are expected to put 0.3 on the missing-value column
    val oob = List((unseenLabels, missingBase.map(v => v * 0.3)))
    test(
      NHotWeightedEncoder("n_hot", encodeMissingValue = true),
      xs,
      featureNames,
      encoded,
      missingBase,
      oob
    )
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/NormalizerSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

package com.spotify.featran.transformers

import breeze.linalg._
import org.scalacheck._

/** ScalaCheck properties for `Normalizer`: p-norm scaling and input-length validation. */
object NormalizerSpec extends TransformerProp("Normalizer") {
  property("default") = Prop.forAll(list[Array[Double]].arbitrary, Gen.choose(1.0, 3.0)) {
    (xs, p) =>
      // NOTE(review): 10 presumably matches the array length generated by TransformerProp — confirm
      val indices = 0 until 10
      val names = indices.map("norm_" + _)
      val missing = indices.map(_ => 0.0)
      // reference result: divide each vector by its p-norm using breeze
      val expected = xs.map { x =>
        val vector = DenseVector(x)
        val scaled = vector / norm(vector, p)
        scaled.data.toSeq
      }
      // vector of different dimension
      val oob = List((xs.head :+ 1.0, missing))
      test(Normalizer("norm", p), xs, names, expected, missing, oob)
  }

  property("length") = Prop.forAll { (xs: List[Array[Double]]) =>
    val msg = "requirement failed: Invalid input length, " +
      s"expected: ${xs.head.length + 1}, actual: ${xs.head.length}"
    // configure an expected length that is one larger than the actual input
    val transformer = Normalizer("norm", 2.0, xs.head.length + 1)
    testException[Array[Double]](transformer, xs) { e =>
      e match {
        case iae: IllegalArgumentException => iae.getMessage == msg
        case _                             => false
      }
    }
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/OneHotEncoderSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

package com.spotify.featran.transformers

import org.scalacheck._

/** ScalaCheck properties for `OneHotEncoder`, with and without the missing-value column. */
object OneHotEncoderSpec extends TransformerProp("OneHotEncoder") {
  implicit private val labelArb: Arbitrary[String] = Arbitrary(Gen.alphaStr)

  /** One row per input label: 1.0 in the column of the matching category, 0.0 elsewhere. */
  private def oneHot(categories: List[String], labels: List[String]): List[List[Double]] =
    labels.map(label => categories.map(c => if (label == c) 1.0 else 0.0))

  property("default") = Prop.forAll { (xs: List[String]) =>
    val categories = xs.distinct.sorted
    val featureNames = categories.map(c => "one_hot_" + c)
    val allZero = categories.map(_ => 0.0)
    // unseen labels
    val oob = List("s1", "s2").map(label => (label, allZero))
    test(OneHotEncoder("one_hot"), xs, featureNames, oneHot(categories, xs), allZero, oob)
  }

  property("encodeMissingValue") = Prop.forAll { (xs: List[String]) =>
    import MissingValue.MissingValueToken
    // the missing-value token is appended as the last category
    val categories = xs.distinct.sorted :+ MissingValueToken
    val featureNames = categories.map(c => "one_hot_" + c)
    val missingRow = categories.map(c => if (c == MissingValueToken) 1.0 else 0.0)
    // unseen labels
    val oob = List("s1", "s2").map(label => (label, missingRow))
    test(
      OneHotEncoder("one_hot", encodeMissingValue = true),
      xs,
      featureNames,
      oneHot(categories, xs),
      missingRow,
      oob
    )
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/PolynomialExpansionSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.
See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import org.scalacheck._

import scala.util.Try

/**
 * ScalaCheck properties for `PolynomialExpansion` and for the `CombinatoricsUtils` helpers, which
 * are compared against their commons-math3 counterparts.
 */
object PolynomialExpansionSpec extends TransformerProp("PolynomialExpansion") {
  property("default") = Prop.forAll(list[Array[Double]].arbitrary, Gen.choose(2, 4)) {
    (xs, degree) =>
      // output dimension is taken from expanding the first input vector
      val dim = PolynomialExpansion.expand(xs.head, degree).length
      val indices = 0 until dim
      val names = indices.map("poly_" + _)
      val missing = indices.map(_ => 0.0)
      val expected = xs.map(v => PolynomialExpansion.expand(v, degree).toSeq)
      // vector of different dimension
      val oob = List((xs.head :+ 1.0, missing))
      test(PolynomialExpansion("poly", degree), xs, names, expected, missing, oob)
  }

  property("length") = Prop.forAll { (xs: List[Array[Double]]) =>
    val msg = "requirement failed: Invalid input length, " +
      s"expected: ${xs.head.length + 1}, actual: ${xs.head.length}"
    // configure an expected length that is one larger than the actual input
    val transformer = PolynomialExpansion("id", 2, xs.head.length + 1)
    testException[Array[Double]](transformer, xs) { e =>
      e match {
        case iae: IllegalArgumentException => iae.getMessage == msg
        case _                             => false
      }
    }
  }

  import org.apache.commons.math3.{util => cmu}

  /**
   * True when both computations yield the same value, or when both throw. `actual` is evaluated
   * before `expected`.
   */
  private def sameOutcome[A](actual: => A, expected: => A): Boolean = {
    val got = Try(actual).toOption
    val want = Try(expected).toOption
    got == want
  }

  private val genNK = {
    // cover all overflow scenarios
    val genN =
      Gen.frequency((10, Gen.choose(0, 61)), (5, Gen.choose(62, 66)), (1, Gen.choose(67, 70)))
    // n must be >= k
    genN.flatMap(n => Gen.choose(0, n).map(k => (n, k)))
  }

  property("binomial") = Prop.forAll(genNK) { case (n, k) =>
    sameOutcome(
      CombinatoricsUtils.binomialCoefficient(n, k),
      cmu.CombinatoricsUtils.binomialCoefficient(n, k)
    )
  }

  property("gcd") = Prop.forAll { (x: Int, y: Int) =>
    sameOutcome(CombinatoricsUtils.gcd(x, y), cmu.ArithmeticUtils.gcd(x, y))
  }

  property("mulAndCheck") = Prop.forAll { (x: Long, y: Long) =>
    sameOutcome(CombinatoricsUtils.mulAndCheck(x, y), cmu.ArithmeticUtils.mulAndCheck(x, y))
  }

  property("abs") = Prop.forAll((x: Int) => CombinatoricsUtils.abs(x) == math.abs(x))
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/PositionEncoderSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2018 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

package com.spotify.featran.transformers

import org.scalacheck.{Arbitrary, Gen, Prop}

/** ScalaCheck properties for `PositionEncoder`. */
object PositionEncoderSpec extends TransformerProp("PositionEncoder") {
  implicit private val labelArb: Arbitrary[String] = Arbitrary(Gen.alphaStr)

  property("default") = Prop.forAll { (xs: List[String]) =>
    // position of each distinct label in sorted order; labels are distinct, so the map is 1:1
    val positionOf = xs.distinct.sorted.zipWithIndex.toMap
    val expected = xs.map(label => Seq(positionOf.getOrElse(label, 0).toDouble))
    // unseen labels
    val oob = List("s1", "s2").map(label => (label, Seq(0.0)))
    test(PositionEncoder("position"), xs, List("position"), expected, Seq(0.0), oob)
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/QuantileDiscretizerSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

package com.spotify.featran.transformers

import java.util.{TreeMap => JTreeMap}

import com.twitter.algebird.{QTree, QTreeAggregator, QTreeSemigroup}
import org.scalacheck._

/** ScalaCheck properties for `QuantileDiscretizer`, checked against a QTree built in the test. */
object QuantileDiscretizerSpec extends TransformerProp("QuantileDiscretizer") {
  // only positive doubles are generated for this suite
  implicit private val arbPosDouble: Arbitrary[Double] = Arbitrary(Gen.posNum[Double])

  property("default") = Prop.forAll(list[Double].arbitrary, Gen.oneOf(2, 4, 5)) {
    (xs, numBuckets) =>
      // FIXME: make this a black box
      val semigroup = new QTreeSemigroup[Double](QTreeAggregator.DefaultK)
      val qt = xs.map(QTree(_)).reduce(semigroup.plus)

      // split point -> bucket index; the first bucket registered for a split point wins
      val splits = new JTreeMap[Double, Int]()
      val interval = 1.0 / numBuckets
      (1 until numBuckets).foreach { i =>
        val (l, u) = qt.quantileBounds(interval * i)
        val k = l / 2 + u / 2 // (l + u) might overflow
        if (!splits.containsKey(k)) {
          splits.put(k, i - 1)
        }
      }
      splits.put(qt.upperBound, numBuckets - 1)

      val buckets = 0 until numBuckets
      val expected = xs.map { x =>
        // bucket of the smallest split point strictly greater than x
        buckets.map(i => if (i == splits.higherEntry(x).getValue) 1.0 else 0.0)
      }
      val names = buckets.map("quantile_" + _)
      val missing = buckets.map(_ => 0.0)
      val zeros = (0 until numBuckets - 1).map(_ => 0.0)
      // below the minimum goes to the first bucket, above the maximum to the last
      val oob = List(
        (lowerBound(xs.min), 1.0 +: zeros),
        (upperBound(xs.max), zeros :+ 1.0)
      )
      test(QuantileDiscretizer("quantile", numBuckets), xs, names, expected, missing, oob)
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/QuantileOutlierRejectorSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import com.twitter.algebird.{QTree, QTreeAggregator, QTreeSemigroup}
import org.scalacheck._

/** ScalaCheck properties for `QuantileOutlierRejector` with both, lower-only and upper-only rejection. */
object QuantileOutlierRejectorSpec extends TransformerProp("QuantileOutlierRejector") {
  // only positive doubles are generated for this suite
  implicit private val arbPosDouble: Arbitrary[Double] = Arbitrary(Gen.posNum[Double])

  /**
   * Reference bounds computed from a QTree over `xs`.
   *
   * @param xs         input values (must be non-empty, the QTrees are combined with `reduce`)
   * @param numBuckets number of quantile buckets
   * @return `(l, u)`: the upper quantile bound at `1 / numBuckets` and the lower quantile bound at
   *         `(numBuckets - 1) / numBuckets`
   */
  def lowerUpper(xs: List[Double], numBuckets: Int): (Double, Double) = {
    val qt = xs.map(QTree(_)).reduce(new QTreeSemigroup[Double](QTreeAggregator.DefaultK).plus)
    val lq = (numBuckets - 1.0) / numBuckets
    val fq = 1.0 / numBuckets
    val (u, _) = qt.quantileBounds(lq)
    val (_, l) = qt.quantileBounds(fq)
    (l, u)
  }

  /**
   * One `Seq(0d)` per element of `xs` satisfying `isOutlier`, or nothing when all values are equal.
   * The min/max guard is evaluated once rather than once per element.
   */
  private def rejectedWhere(xs: List[Double])(isOutlier: Double => Boolean): List[Seq[Double]] =
    if (xs.nonEmpty && xs.min < xs.max) xs.filter(isOutlier).map(_ => Seq(0d)) else Nil

  property("default") = Prop.forAll(list[Double].arbitrary, Gen.oneOf(3 to 20)) {
    (xs, numBuckets) =>
      val (l, u) = lowerUpper(xs, numBuckets)
      val rejected = rejectedWhere(xs)(x => x > u || x < l)
      // records that are not within bounds should always be rejected
      val oob =
        List((lowerBound(xs.min), Seq(0d)), (upperBound(xs.max), Seq(0d)))
      val r = QuantileOutlierRejector("quantile", numBuckets = numBuckets)
      test(r, xs, Seq("quantile"), xs.map(_ => Seq(0d)), Seq(0.0), oob, rejected)
  }

  property("rejectLower don't rejectUpper") =
    Prop.forAll(list[Double].arbitrary, Gen.oneOf(3 to 20)) { (xs, numBuckets) =>
      val (l, _) = lowerUpper(xs, numBuckets)
      val rejected = rejectedWhere(xs)(_ < l)
      val r = QuantileOutlierRejector(
        "quantile",
        rejectLower = true,
        rejectUpper = false,
        numBuckets = numBuckets
      )
      test(r, xs, Seq("quantile"), xs.map(_ => Seq(0d)), Seq(0.0), rejected = rejected)
    }

  property("rejectUpper don't rejectLower") =
    Prop.forAll(list[Double].arbitrary, Gen.oneOf(3 to 20)) { (xs, numBuckets) =>
      val (_, u) = lowerUpper(xs, numBuckets)
      val rejected = rejectedWhere(xs)(_ > u)
      val r = QuantileOutlierRejector(
        "quantile",
        rejectLower = false,
        rejectUpper = true,
        numBuckets = numBuckets
      )
      test(r, xs, Seq("quantile"), xs.map(_ => Seq(0d)), Seq(0.0), rejected = rejected)
    }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/StandardScalerSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

package com.spotify.featran.transformers

import org.scalacheck._

/** ScalaCheck properties for `StandardScaler` across all `withStd` / `withMean` combinations. */
object StandardScalerSpec extends TransformerProp("StandardScaler") {

  /**
   * Mean and population standard deviation of `xs`.
   *
   * Each element is divided by the count before summing, as in the original computation.
   *
   * @return `(mean, stddev)`
   */
  def meanAndStddev(xs: List[Double]): (Double, Double) = {
    // breeze.stats.stddev is sample stddev
    // List.length is linear: evaluate it once, not once per element
    val n = xs.length
    val mean = xs.map(_ / n).sum
    (mean, math.sqrt(xs.map(x => math.pow(x - mean, 2)).sum / n))
  }

  property("default") = Prop.forAll { (xs: List[Double]) =>
    val (mean, stddev) = meanAndStddev(xs)
    val expected = xs.map(x => Seq((x - mean) / stddev + mean))
    test(StandardScaler("std"), xs, Seq("std"), expected, Seq(mean))
  }

  property("withStd withMean") = Prop.forAll { (xs: List[Double]) =>
    val (mean, stddev) = meanAndStddev(xs)
    val expected = xs.map(x => Seq((x - mean) / stddev))
    val (withStd, withMean) = (true, true)
    test(StandardScaler("std", withStd, withMean), xs, Seq("std"), expected, Seq(0.0))
  }

  property("withStd withoutMean") = Prop.forAll { (xs: List[Double]) =>
    val (mean, stddev) = meanAndStddev(xs)
    val expected = xs.map(x => Seq((x - mean) / stddev + mean))
    val (withStd, withMean) = (true, false)
    test(StandardScaler("std", withStd, withMean), xs, Seq("std"), expected, Seq(mean))
  }

  property("withoutStd withMean") = Prop.forAll { (xs: List[Double]) =>
    val (mean, _) = meanAndStddev(xs)
    val expected = xs.map(x => Seq(x - mean))
    val (withStd, withMean) = (false, true)
    test(StandardScaler("std", withStd, withMean), xs, Seq("std"), expected, Seq(0.0))
  }

  property("withoutStd withoutMean") = Prop.forAll { (xs: List[Double]) =>
    val (mean, _) = meanAndStddev(xs)
    val expected = xs.map(Seq(_))
    val (withStd, withMean) = (false, false)
    test(StandardScaler("std", withStd, withMean), xs, Seq("std"), expected, Seq(mean))
  }
}
--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/TopNOneHotEncoderSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2018 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import com.twitter.algebird._
import org.scalacheck._

/**
 * ScalaCheck properties for `TopNOneHotEncoder`. Expected categories are the heavy hitters of a
 * `SketchMap` built in the test from the encoder's own `n`, `eps` and `delta`.
 */
object TopNOneHotEncoderSpec extends TransformerProp("TopNOneHotEncoder") {
  // labels are drawn mostly (50:1) from a pool of 5 strings, rarely from a pool of 50
  implicit private val labelArb: Arbitrary[String] = Arbitrary {
    val infrequent = Gen.listOfN(50, Gen.alphaStr).flatMap(xs => Gen.oneOf(xs))
    val frequent = Gen.listOfN(5, Gen.alphaStr).flatMap(xs => Gen.oneOf(xs))
    Gen.frequency((1, infrequent), (50, frequent))
  }

  // single seed used both for the encoders under test and for the reference sketch below
  private val Seed = 1

  import MissingValue.MissingValueToken

  /**
   * One-hot encoding of `s` over `cats`.
   *
   * @param s                  label to encode
   * @param cats               categories, in output column order
   * @param encodeMissingValue when true and `s` matches no category, the column of
   *                           `MissingValueToken` is set instead
   */
  def getExpectedOutputVector(
    s: String,
    cats: List[String],
    encodeMissingValue: Boolean
  ): Seq[Double] = {
    val v = cats.map(c => if (s == c) 1.0 else 0.0)
    if (encodeMissingValue && v.sum == 0.0) {
      cats.map(c => if (c == MissingValueToken) 1.0 else 0.0)
    } else {
      v
    }
  }

  /**
   * Derives names, expected vectors, the missing vector and the rejected rows for `xs`, then
   * delegates to the inherited `test` helper. `transformer` must be a `TopNOneHotEncoder`.
   */
  private def test(transformer: Transformer[String, _, _], xs: List[String]): Prop = {
    val encoder = transformer.asInstanceOf[TopNOneHotEncoder]
    val (n, eps, delta) = (encoder.n, encoder.eps, encoder.delta)
    val encodeMissingValue = encoder.encodeMissingValue

    val params = SketchMapParams[String](Seed, eps, delta, n)(_.getBytes)
    val aggregator = SketchMap.aggregator[String, Long](params)
    val sm =
      xs.map(x => aggregator.prepare((x, 1L))).reduce(aggregator.monoid.plus)
    val keys = sm.heavyHitterKeys.sorted
    val cats = if (encodeMissingValue) keys :+ MissingValueToken else keys
    val names = cats.map("tn1h_" + _)
    val expected =
      xs.map(s => getExpectedOutputVector(s, cats, encodeMissingValue))
    val missing = if (encodeMissingValue) {
      cats.map(c => if (c == MissingValueToken) 1.0 else 0.0)
    } else {
      cats.map(_ => 0.0)
    }
    val oob = List(("s1", missing), ("s2", missing)) // unseen labels
    // inputs that are not among the categories contribute one `missing` row each
    val rejected =
      xs.flatMap(x => if (cats.contains(x)) None else Some(missing))

    test(transformer, xs, names, expected, missing, oob, rejected)
  }

  property("default") = Prop.forAll { (xs: List[String]) =>
    test(TopNOneHotEncoder("tn1h", 10, seed = Seed), xs)
  }

  property("count") = Prop.forAll { (xs: List[String]) =>
    test(TopNOneHotEncoder("tn1h", 100, seed = Seed), xs)
  }

  property("eps") = Prop.forAll { (xs: List[String]) =>
    test(TopNOneHotEncoder("tn1h", 10, eps = 0.01, seed = Seed), xs)
  }

  property("delta") = Prop.forAll { (xs: List[String]) =>
    test(TopNOneHotEncoder("tn1h", 10, delta = 0.01, seed = Seed), xs)
  }

  property("encodeMissingValue") = Prop.forAll { (xs: List[String]) =>
    test(TopNOneHotEncoder("tn1h", 10, delta = 0.01, seed = Seed, encodeMissingValue = true), xs)
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/VectorIdentitySpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import org.scalacheck.Prop

/** ScalaCheck properties for `VectorIdentity`: pass-through values and input-length validation. */
object VectorIdentitySpec extends TransformerProp("VectorIdentity") {
  property("default") = Prop.forAll { (xs: List[List[Double]]) =>
    // output dimension is taken from the first input vector
    val first = xs.head
    val indices = 0 until first.length
    val names = indices.map("id_" + _)
    val missing = indices.map(_ => 0.0)
    val expected = xs.map(_.toSeq)
    // vector of different dimension
    val oob = List((first :+ 1.0, missing))
    test[List[Double]](VectorIdentity("id"), xs, names, expected, missing, oob)
  }

  property("length") = Prop.forAll { (xs: List[List[Double]]) =>
    val msg = "requirement failed: Invalid input length, " +
      s"expected: ${xs.head.length + 1}, actual: ${xs.head.length}"
    // configure an expected length that is one larger than the actual input
    val transformer = VectorIdentity("id", xs.head.length + 1)
    testException[List[Double]](transformer, xs) { e =>
      e match {
        case iae: IllegalArgumentException => iae.getMessage == msg
        case _                             => false
      }
    }
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/VonMisesEvaluatorSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers

import breeze.stats.distributions.{Rand, VonMises}
import org.scalacheck._

/** ScalaCheck properties for `VonMisesEvaluator`, checked against breeze's `VonMises` pdf. */
object VonMisesEvaluatorSpec extends TransformerProp("VonMisesEvaluator") {
  private val MinPoint = 0.0
  private val MaxPoint = 1000.0
  // maps the [MinPoint, MaxPoint] range onto [0, 2 * Pi]
  private val Scale = 2 * math.Pi / MaxPoint

  private val MuGen = Gen.nonEmptyListOf(Gen.choose(MinPoint, MaxPoint))
  // between 3 and 10 evaluation points
  private val PointGen =
    Gen.choose(3, 10).flatMap(size => Gen.listOfN(size, Gen.choose(MinPoint, MaxPoint)))
  private val KappaGen = Gen.choose(0.0, 100.0)

  property("default") = Prop.forAll(MuGen, PointGen, KappaGen) { (xs, points, kappa) =>
    val indices = 0 until points.size
    val names = indices.map("vm_" + _)
    val missing = indices.map(_ => 0.0)
    // for every mu, the density at every scaled point
    val expected = xs.map { mu =>
      points.map(point => VonMises(mu * Scale, kappa)(Rand).pdf(Scale * point))
    }
    val transformer = VonMisesEvaluator("vm", kappa, Scale, points.toArray)
    test(transformer, xs, names, expected, missing)
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/mdl/MDLPDiscretizerTest.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers.mdl

import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

/** Tests `MDLPDiscretizer` split points on the cars fixture and on empty input. */
class MDLPDiscretizerTest extends AnyFlatSpec with Matchers {
  // brings `Cars` and the implicit tolerant `Equality[Double]` into scope
  import com.spotify.featran.transformers.mdl.TestUtility._

  "MDLPDiscretizer" should "work with cars data (maxBins = 10)" in {
    val data = Cars.map(v => (v.origin, v.mpg))
    val result = new MDLPDiscretizer(data).discretize(10).sorted
    val expected =
      List(Double.NegativeInfinity, 16.1, 21.05, 30.95, Double.PositiveInfinity)
    result.length shouldBe expected.length
    // foreach: the assertions run for their effect, no result collection is needed
    result.zip(expected).foreach { case (r, e) => r shouldEqual e }
  }

  it should "work with cars data (maxBins = 2)" in {
    val data = Cars.map(v => (v.origin, v.mpg))
    val result = new MDLPDiscretizer(data).discretize(2).sorted
    val expected = List(Double.NegativeInfinity, 21.05, Double.PositiveInfinity)
    result.length shouldBe expected.length
    result.zip(expected).foreach { case (r, e) => r shouldEqual e }
  }

  it should "work with empty data" in {
    val data = List.empty[(String, Double)]
    val result = new MDLPDiscretizer(data).discretize(2).sorted
    val expected = List(Double.NegativeInfinity, Double.PositiveInfinity)
    // only the number of split points is checked here
    result.length shouldBe expected.length
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/mdl/TestUtility.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.transformers.mdl

import org.scalactic._

import scala.io.Source

/** One row of the `/cars.data` test resource; fields follow the comma-separated column order. */
case class CarRecord(
  mpg: Double,
  cylinders: Int,
  cubicInches: Int,
  horsePower: Double,
  weightLbs: Double,
  timeToSixty: Double,
  year: Int,
  brand: String,
  origin: String
)

/** Shared fixtures for the MDL tests. */
object TestUtility {
  /** Doubles compare equal when they differ by at most 0.01. */
  implicit val doubleEquality: Equality[Double] =
    TolerantNumerics.tolerantDoubleEquality(0.01)

  /**
   * All records parsed from the `/cars.data` classpath resource.
   *
   * The lines are fully materialised before the underlying source is closed, so the stream is
   * released even if parsing fails.
   */
  lazy val Cars: List[CarRecord] = {
    val source = Source.fromInputStream(this.getClass.getResourceAsStream("/cars.data"))
    try source.getLines().map(parseCar).toList
    finally source.close()
  }

  /** Parses one comma-separated line; every field is trimmed before conversion. */
  private def parseCar(line: String): CarRecord = {
    val x = line.split(",").map(elem => elem.trim)
    CarRecord(
      x(0).toDouble,
      x(1).toInt,
      x(2).toInt,
      x(3).toDouble,
      x(4).toDouble,
      x(5).toDouble,
      x(6).toInt,
      x(7),
      x(8)
    )
  }
}

--------------------------------------------------------------------------------
/core/src/test/scala/com/spotify/featran/transformers/mdl/ThresholdFinderTest.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 16 | */

package com.spotify.featran.transformers.mdl

import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

/** Exercises `ThresholdFinder.findThresholds` and `ThresholdFinder.bestThreshold`. */
class ThresholdFinderTest extends AnyFlatSpec with Matchers {

  /** A fresh finder with the configuration shared by every case in this suite. */
  private def newFinder: ThresholdFinder =
    new ThresholdFinder(nLabels = 3, stoppingCriterion = 0, maxBins = 100, minBinWeight = 1)

  "ThresholdFinder" should "work with nLabels = 3 and feature size = 4" in {
    // (feature value, per-label counts)
    val points = Seq(
      (5.0f, Array(1L, 2L, 3L)),
      (4.0f, Array(5L, 4L, 20L)),
      (3.5f, Array(3L, 20L, 12L)),
      (3.0f, Array(8L, 18L, 2L))
    )

    val thresholds = newFinder.findThresholds(points)
    thresholds shouldBe Seq(Float.NegativeInfinity, 4.0, Float.PositiveInfinity)
  }

  it should "work with duplicates" in {
    val candidate =
      newFinder.bestThreshold(
        List((1.0f, Array.empty, Array.empty, Array.empty)),
        Some(1.0f),
        Array.empty
      )
    candidate shouldBe empty
  }
}

52 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ../README.md --------------------------------------------------------------------------------
/flink/src/main/scala/com/spotify/featran/flink/package.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran

import com.esotericsoftware.kryo.serializers.JavaSerializer
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala.DataSet

import scala.reflect.ClassTag

package object flink {

  /** [[CollectionType]] for extraction from Apache Flink `DataSet` type. */
  implicit object FlinkCollectionType extends CollectionType[DataSet] {
    // force fallback to default serializer
    private val Ti = TypeInformation.of(classOf[Any])

    /** Maps `f` over `ma`, supplying the generic `Any` type information for `B`. */
    override def map[A, B: ClassTag](ma: DataSet[A])(f: A => B): DataSet[B] = {
      // Explicitly typed so the implicit is resolved predictably at the call below.
      implicit val tib: TypeInformation[B] = Ti.asInstanceOf[TypeInformation[B]]
      ma.map(f)
    }

    /** Delegates to `DataSet.reduce`. */
    override def reduce[A](ma: DataSet[A])(f: (A, A) => A): DataSet[A] =
      ma.reduce(f)

    /** Pairs every element of `ma` with every element of `mb` via `crossWithTiny`. */
    override def cross[A, B: ClassTag](ma: DataSet[A])(mb: DataSet[B]): DataSet[(A, B)] =
      ma.crossWithTiny(mb)

    /** Creates a single-element `DataSet` holding `b` in the execution environment of `ma`. */
    override def pure[A, B: ClassTag](ma: DataSet[A])(b: B): DataSet[B] = {
      implicit val tib: TypeInformation[B] = Ti.asInstanceOf[TypeInformation[B]]
      val env = ma.getExecutionEnvironment
      // Kryo throws NPE on `Feature`, use Java serialization instead
      env.addDefaultKryoSerializer(classOf[FeatureSet[Any]], classOf[JavaSerializer])
      env.fromElements(b)
    }
  }
}

52 | -------------------------------------------------------------------------------- /flink/src/test/scala/com/spotify/featran/flink/FlinkTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License.
16 | */

package com.spotify.featran.flink

import com.spotify.featran._
import org.apache.flink.api.scala._
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

/** Runs the shared feature-spec fixtures through the Flink `DataSet` collection type. */
class FlinkTest extends AnyFlatSpec with Matchers {
  import Fixtures._

  "Flink" should "work with FeatureSpec" in {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val extractor = TestSpec.extract(env.fromCollection(TestData))
    extractor.featureNames.collect() shouldBe Seq(ExpectedNames)
    extractor
      .featureValues[Seq[Double]]
      .collect() should contain theSameElementsAs ExpectedValues
  }

  it should "work with MultiFeatureSpec" in {
    // Only checks that extraction and collection complete without throwing.
    noException shouldBe thrownBy {
      val env = ExecutionEnvironment.getExecutionEnvironment
      val extractor = RecordSpec.extract(env.fromCollection(Records))
      extractor.featureNames.collect()
      extractor.featureValues[Seq[Double]].collect()
    }
  }
}

44 | -------------------------------------------------------------------------------- /java/src/main/java/com/spotify/featran/java/JFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License.
16 | */ 17 | 18 | package com.spotify.featran.java; 19 | 20 | import com.spotify.featran.FeatureBuilder; 21 | import com.spotify.featran.FeatureExtractor; 22 | import com.spotify.featran.xgboost.SparseLabeledPoint; 23 | import ml.dmlc.xgboost4j.LabeledPoint; 24 | import org.tensorflow.proto.example.Example; 25 | import scala.reflect.ClassTag; 26 | 27 | import java.util.List; 28 | 29 | /** 30 | * Java wrapper for {@link FeatureExtractor}. 31 | * 32 | * Note that {@code float[]}, {@code double[]}, {@link FloatSparseArray} and 33 | * {@link DoubleSparseArray} are the only supported as output type. 34 | */ 35 | public class JFeatureExtractor { 36 | 37 | private final JListFeatureExtractor self; 38 | 39 | JFeatureExtractor(JListFeatureExtractor self) { 40 | this.self = self; 41 | } 42 | 43 | /** 44 | * Java wrapper for {@link FeatureExtractor#featureSettings()}. 45 | */ 46 | public String featureSettings() { 47 | return JavaOps.featureSettings(self); 48 | } 49 | 50 | /** 51 | * Java wrapper for {@link FeatureExtractor#featureNames()}. 52 | */ 53 | public List featureNames() { 54 | return JavaOps.featureNames(self); 55 | } 56 | 57 | /** 58 | * Java wrapper for {@link FeatureExtractor#featureValues(FeatureBuilder, ClassTag)}. 59 | */ 60 | public List featureValuesFloat() { 61 | return JavaOps.featureValuesFloat(self); 62 | } 63 | 64 | /** 65 | * Java wrapper for {@link FeatureExtractor#featureValues(FeatureBuilder, ClassTag)}. 66 | */ 67 | public List featureValuesDouble() { 68 | return JavaOps.featureValuesDouble(self); 69 | } 70 | 71 | /** 72 | * Java wrapper for {@link FeatureExtractor#featureValues(FeatureBuilder, ClassTag)}. 73 | */ 74 | public List featureValuesFloatSparse() { 75 | return JavaOps.featureValuesFloatSparseArray(self); 76 | } 77 | 78 | /** 79 | * Java wrapper for {@link FeatureExtractor#featureValues(FeatureBuilder, ClassTag)}. 
80 | */ 81 | public List featureValuesDoubleSparse() { 82 | return JavaOps.featureValuesDoubleSparseArray(self); 83 | } 84 | 85 | /** 86 | * Java wrapper for {@link FeatureExtractor#featureValues(FeatureBuilder, ClassTag)}. 87 | */ 88 | public List featureValuesFloatNamedSparse() { 89 | return JavaOps.featureValuesFloatNamedSparseArray(self); 90 | } 91 | 92 | /** 93 | * Java wrapper for {@link FeatureExtractor#featureValues(FeatureBuilder, ClassTag)}. 94 | */ 95 | public List featureValuesDoubleNamedSparse() { 96 | return JavaOps.featureValuesDoubleNamedSparseArray(self); 97 | } 98 | 99 | /** 100 | * Java wrapper for {@link FeatureExtractor#featureValues(FeatureBuilder, ClassTag)}. 101 | */ 102 | public List featureValuesExample() { 103 | return JavaOps.featureValuesExample(self); 104 | } 105 | 106 | /** 107 | * Java wrapper for {@link FeatureExtractor#featureValues(FeatureBuilder, ClassTag)}. 108 | */ 109 | public List featureValuesLabeledPoint() { 110 | return JavaOps.featureValuesLabeledPoint(self); 111 | } 112 | 113 | /** 114 | * Java wrapper for {@link FeatureExtractor#featureValues(FeatureBuilder, ClassTag)}. 115 | */ 116 | public List featureValuesSparseLabeledPoint() { 117 | return JavaOps.featureValuesSparseLabeledPoint(self); 118 | } 119 | 120 | } 121 | -------------------------------------------------------------------------------- /java/src/main/java/com/spotify/featran/java/JRecordExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 16 | */ 17 | 18 | package com.spotify.featran.java; 19 | 20 | import com.spotify.featran.RecordExtractor; 21 | 22 | import java.util.List; 23 | 24 | /** 25 | * Java wrapper for {@link RecordExtractor}. 26 | */ 27 | public class JRecordExtractor { 28 | 29 | private final RecordExtractor self; 30 | 31 | JRecordExtractor(RecordExtractor self) { 32 | this.self = self; 33 | } 34 | 35 | /** 36 | * Java wrapper for {@link RecordExtractor#featureNames()}. 37 | */ 38 | public List featureNames() { 39 | return JavaOps.featureNames(self); 40 | } 41 | 42 | /** 43 | * Java wrapper for {@link RecordExtractor#featureValue(Object)}. 44 | */ 45 | public F featureValue(T record) { 46 | return self.featureValue(record); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /java/src/main/java/com/spotify/featran/java/SerializableFunction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. 
See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 16 | */ 17 | 18 | package com.spotify.featran.java; 19 | 20 | import java.io.Serializable; 21 | 22 | public interface SerializableFunction extends Serializable { 23 | OutputT apply(InputT input); 24 | } 25 | -------------------------------------------------------------------------------- /java/src/main/scala/com/spotify/featran/java/JListFeatureExtractor.scala: -------------------------------------------------------------------------------- 1 | package com.spotify.featran.java 2 | 3 | import com.spotify.featran.FeatureExtractor 4 | 5 | import java.util.{List => JList} 6 | 7 | // java does not support higher-kind types 8 | // specialize FeatureExtractor for java.util.List in the scala code 9 | private class JListFeatureExtractor[T](other: FeatureExtractor[JList, T]) 10 | extends FeatureExtractor[JList, T](other)(JavaOps.jListCollectionType) 11 | -------------------------------------------------------------------------------- /java/src/test/java/com/spotify/featran/java/JavaTestUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 
16 | */ 17 | 18 | package com.spotify.featran.java; 19 | 20 | import com.spotify.featran.transformers.MinMaxScaler; 21 | import com.spotify.featran.transformers.OneHotEncoder; 22 | import scala.Tuple2; 23 | 24 | import java.util.Optional; 25 | 26 | public class JavaTestUtil { 27 | 28 | private JavaTestUtil() {} 29 | 30 | public static JFeatureSpec> spec() { 31 | return JFeatureSpec.>create() 32 | .required(t -> t._1, OneHotEncoder.apply("one_hot", false)) 33 | .required(t -> t._2.doubleValue(), MinMaxScaler.apply("min_max", 0.0, 1.0)); 34 | } 35 | 36 | public static JFeatureSpec optionalSpec() { 37 | return JFeatureSpec.create() 38 | .optional(Optional::ofNullable, OneHotEncoder.apply("one_hot", false)); 39 | } 40 | 41 | public static JFeatureSpec> crossSpec() { 42 | return JFeatureSpec.>create() 43 | .required(t -> t._1, OneHotEncoder.apply("one_hot_a", false)) 44 | .required(t -> t._2, OneHotEncoder.apply("one_hot_b", false)) 45 | .cross("one_hot_a", "one_hot_b", (a, b) -> a * b); 46 | } 47 | 48 | public static int[] getIndicies(FloatSparseArray a) { 49 | return a.indices(); 50 | } 51 | 52 | public static int[] getIndicies(DoubleSparseArray a) { 53 | return a.indices(); 54 | } 55 | 56 | public static float[] getValues(FloatSparseArray a) { 57 | return a.values(); 58 | } 59 | 60 | public static double[] getValues(DoubleSparseArray a) { 61 | return a.values(); 62 | } 63 | 64 | public static float[] getDense(FloatSparseArray a) { 65 | return a.toDense(); 66 | } 67 | 68 | public static double[] getDense(DoubleSparseArray a) { 69 | return a.toDense(); 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /java/src/test/java/com/spotify/featran/java/examples/JavaExample.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 16 | */ 17 | 18 | package com.spotify.featran.java.examples; 19 | 20 | import com.spotify.featran.java.DoubleSparseArray; 21 | import com.spotify.featran.java.JFeatureExtractor; 22 | import com.spotify.featran.java.JFeatureSpec; 23 | import com.spotify.featran.transformers.MinMaxScaler; 24 | import com.spotify.featran.transformers.OneHotEncoder; 25 | 26 | import java.util.*; 27 | 28 | public class JavaExample { 29 | 30 | private static class Record { 31 | private final double d; 32 | private final Optional s; 33 | 34 | Record(double d, Optional s) { 35 | this.d = d; 36 | this.s = s; 37 | } 38 | } 39 | 40 | private static final Random rand = new Random(); 41 | 42 | private static List randomRecords() { 43 | List records = new ArrayList<>(); 44 | for (int i = 0; i < 20; i++) { 45 | Optional s = i % 5 == 0 ? 
Optional.empty() : Optional.of("s" + rand.nextInt(5)); 46 | records.add(new Record(rand.nextDouble(), s)); 47 | } 48 | return records; 49 | } 50 | 51 | public static void main(String[] args) { 52 | // Random input 53 | List records = randomRecords(); 54 | 55 | // Start building a feature specification 56 | JFeatureSpec fs = JFeatureSpec.create() 57 | .required(r -> r.d, MinMaxScaler.apply("min-max", 0.0, 1.0)) 58 | .optional(r -> r.s, OneHotEncoder.apply("one-hot", false)); 59 | 60 | // Extract features from List 61 | JFeatureExtractor f1 = fs.extract(records); 62 | 63 | System.out.println(f1.featureNames()); 64 | 65 | // Get feature values as double[] 66 | for (double[] f : f1.featureValuesDouble()) { 67 | System.out.println(Arrays.toString(f)); 68 | } 69 | 70 | // Get feature values as DoubleSparseArray 71 | for (DoubleSparseArray f : f1.featureValuesDoubleSparse()) { 72 | String s = String.format( 73 | "indices: [%s], values: [%s], length: %d", 74 | Arrays.toString(f.indices()), Arrays.toString(f.values()), f.length()); 75 | System.out.println(s); 76 | } 77 | 78 | // Extract settings as a JSON string 79 | String settings = f1.featureSettings(); 80 | System.out.println(settings); 81 | 82 | // Extract features from new records ,but reuse previously saved settings 83 | fs.extractWithSettings(randomRecords(), settings); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /jmh/src/test/scala/com/spotify/featran/jmh/ExtractorBenchmark.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 16 | */

package com.spotify.featran.jmh

import java.util.concurrent.TimeUnit

import com.spotify.featran._
import com.spotify.featran.transformers._
import org.openjdk.jmh.annotations._

/** Compares collection-based extraction with per-record extraction, both driven by settings
  * computed once from `input`.
  */
@BenchmarkMode(Array(Mode.AverageTime))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
class ExtractorBenchmark {

  type A = (Double, String)

  /** Spec under test: a standard-scaled double and a one-hot encoded string. */
  val fs: FeatureSpec[A] =
    FeatureSpec
      .of[A]
      .required(_._1)(StandardScaler("std"))
      .required(_._2)(OneHotEncoder("onehot"))

  /** Ten records, `(1.0, "1")` through `(10.0, "10")`. */
  val input: Seq[A] = (1 to 10).map(x => (x.toDouble, x.toString))

  /** Settings obtained from a full extraction over `input`. */
  val settings: Seq[String] = fs.extract(input).featureSettings

  /** Record-level extractor built from the first settings entry. */
  val re: RecordExtractor[A, Seq[Double]] = fs.extractWithSettings(settings.head)

  @Benchmark
  def collection: Seq[Seq[Double]] =
    fs.extractWithSettings(input, settings).featureValues[Seq[Double]]

  @Benchmark
  def collection1: Seq[Double] =
    fs.extractWithSettings(Seq((1.0, "1.0")), settings).featureValues[Seq[Double]].head

  @Benchmark
  def record: Seq[Seq[Double]] = input.map(re.featureValue)

  @Benchmark
  def record1: Seq[Double] = re.featureValue((1.0, "1.0"))

}

48 | -------------------------------------------------------------------------------- /jmh/src/test/scala/com/spotify/featran/jmh/FeatureBuilderBenchmark.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB.
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 16 | */

package com.spotify.featran.jmh

import java.util.concurrent.TimeUnit

import breeze.linalg._
import com.spotify.featran._
import com.spotify.featran.tensorflow._
import org.openjdk.jmh.annotations._
import org.tensorflow.proto.example.Example

/** Measures how long each [[FeatureBuilder]] takes to assemble one 1000-dimension result. */
@BenchmarkMode(Array(Mode.AverageTime))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
class FeatureBuilderBenchmark {

  private val names = (750 until 1000).map(_.toString)
  private val values = (750 until 1000).map(_.toDouble)

  /** Drives a builder through a fixed sequence of calls and returns what it built:
    * 250 alternating add/skip pairs, one bulk skip of 250, then one bulk add of 250.
    */
  def benchmark[T: FeatureBuilder]: T = {
    val fb = FeatureBuilder[T]
    fb.init(1000)
    var i = 0
    while (i < 500) {
      fb.add(i.toString, i.toDouble)
      fb.skip()
      i += 2
    }
    fb.skip(250)
    fb.add(names, values)
    fb.result
  }

  // Each benchmark returns the built value rather than discarding it, so JMH consumes the
  // result and the work cannot be treated as dead code.
  @Benchmark def array: Array[Double] = benchmark[Array[Double]]
  @Benchmark def seq: Seq[Double] = benchmark[Seq[Double]]
  @Benchmark def sparseArray: SparseArray[Double] = benchmark[SparseArray[Double]]
  @Benchmark def denseVector: DenseVector[Double] = benchmark[DenseVector[Double]]
  @Benchmark def sparseVector: SparseVector[Double] = benchmark[SparseVector[Double]]
  @Benchmark def map: Map[String, Double] = benchmark[Map[String, Double]]
  @Benchmark def tensorflow: Example = benchmark[Example]

}

59 |
--------------------------------------------------------------------------------
/jmh/src/test/scala/com/spotify/featran/jmh/TransformerBenchmark.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.jmh

import java.util.concurrent.TimeUnit

import com.spotify.featran.transformers._
import com.spotify.featran._
import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole

/** One benchmark per transformer, each run over the implicit fixture matching its input type. */
@BenchmarkMode(Array(Mode.AverageTime))
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
class TransformerBenchmark {

  import Fixtures._

  /** Extracts `fixture` through a single-feature spec built around `transformer`, sending
    * every produced name and value to `bh` via [[NoOpFeatureBuilder]].
    */
  def benchmark[A](transformer: Transformer[A, _, _], bh: Blackhole)(implicit
    fixture: Seq[A]
  ): Seq[Unit] = {
    implicit val fb: FeatureBuilder[Unit] = new NoOpFeatureBuilder(bh)
    val fe = FeatureSpec.of[A].required(identity)(transformer).extract(fixture)
    fe.featureValues[Unit]
  }

  // TODO: figure out how to verify that all transformers are covered

  @Benchmark def binarizer(bh: Blackhole): Seq[Unit] = benchmark(Binarizer("t"), bh)
  @Benchmark def bucketizer(bh: Blackhole): Seq[Unit] =
    benchmark(Bucketizer("t", Array(0.0, 250.0, 500.0, 750.0, 1000.0)), bh)
  @Benchmark def hashNHotEncoder(bh: Blackhole): Seq[Unit] = benchmark(HashNHotEncoder("t"), bh)
  @Benchmark def hashNHotWeightedEncoder(bh: Blackhole): Seq[Unit] =
    benchmark(HashNHotWeightedEncoder("t"), bh)
  @Benchmark def hashOneHotEncoder(bh: Blackhole): Seq[Unit] = benchmark(HashOneHotEncoder("t"), bh)
  @Benchmark def heavyHitters(bh: Blackhole): Seq[Unit] = benchmark(HeavyHitters("t", 100), bh)
  @Benchmark def identityB(bh: Blackhole): Seq[Unit] = benchmark(Identity("t"), bh)
  @Benchmark def maxAbsScaler(bh: Blackhole): Seq[Unit] = benchmark(MaxAbsScaler("t"), bh)
  @Benchmark def mdl(bh: Blackhole): Seq[Unit] = benchmark(MDL[String]("t"), bh)
  @Benchmark def minMaxScaler(bh: Blackhole): Seq[Unit] = benchmark(MinMaxScaler("t"), bh)
  @Benchmark def nGrams(bh: Blackhole): Seq[Unit] = benchmark(NGrams("t"), bh)
  @Benchmark def nHotEncoder(bh: Blackhole): Seq[Unit] = benchmark(NHotEncoder("t"), bh)
  @Benchmark def nHotWeightedEncoder(bh: Blackhole): Seq[Unit] =
    benchmark(NHotWeightedEncoder("t"), bh)
  @Benchmark def normalizer(bh: Blackhole): Seq[Unit] = benchmark(Normalizer("t"), bh)
  @Benchmark def oneHotEncoder(bh: Blackhole): Seq[Unit] = benchmark(OneHotEncoder("t"), bh)
  @Benchmark def polynomialExpansion(bh: Blackhole): Seq[Unit] =
    benchmark(PolynomialExpansion("t"), bh)
  @Benchmark def quantileDiscretizer(bh: Blackhole): Seq[Unit] =
    benchmark(QuantileDiscretizer("t"), bh)
  @Benchmark def standardScaler(bh: Blackhole): Seq[Unit] = benchmark(StandardScaler("t"), bh)
  @Benchmark def topNOneHotEncoder(bh: Blackhole): Seq[Unit] =
    benchmark(TopNOneHotEncoder("t", 100), bh)
  @Benchmark def vectorIdentity(bh: Blackhole): Seq[Unit] =
    benchmark(VectorIdentity[Array]("t"), bh)
  @Benchmark def vonMisesEvaluator(bh: Blackhole): Seq[Unit] =
    benchmark(VonMisesEvaluator("t", 100.0, 0.001, Array(1.0, 2.0, 3.0, 4.0, 5.0)), bh)

}

/** Implicit input fixtures, 1000 elements each, selected by element type in `benchmark`. */
private object Fixtures {
  implicit val doubles: Seq[Double] = (0 until 1000).map(_.toDouble)
  implicit val labels: Seq[String] = (0 until 1000).map(x => "l" + (x % 50))
  implicit val mdlRecords: Seq[MDLRecord[String]] =
    (0 until 1000).map(x => MDLRecord((x % 3).toString, x.toDouble))
  implicit val nLabels: Seq[Seq[String]] =
    (0 until 1000).map(x => (0 until (x % 50 + 1)).map("l" + _))
  implicit val nWeightedLabels: Seq[Seq[WeightedLabel]] = nLabels.map(_.map(WeightedLabel(_, 1.0)))
  implicit val vectors: Seq[Array[Double]] = (0 until 1000).map(x => Array.fill(10)(x / 1000.0))
}

/** [[FeatureBuilder]] that builds nothing: every argument it receives is handed to `bh`. */
private class NoOpFeatureBuilder(val bh: Blackhole) extends FeatureBuilder[Unit] {
  override def init(dimension: Int): Unit = bh.consume(dimension)
  // `()` is the unit value; the bare name `Unit` refers to the companion object instead.
  override def result: Unit = bh.consume(())
  override def add(name: String, value: Double): Unit = {
    bh.consume(name)
    bh.consume(value)
  }
  override def skip(): Unit = bh.consume(())
  override def newBuilder: FeatureBuilder[Unit] = new NoOpFeatureBuilder(bh)
}

97 | -------------------------------------------------------------------------------- /numpy/src/test/resources/a1d-double.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spotify/featran/6359cc941c95c3d4574f8a36c07dd7664d3b1bd0/numpy/src/test/resources/a1d-double.npy -------------------------------------------------------------------------------- /numpy/src/test/resources/a1d-float.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spotify/featran/6359cc941c95c3d4574f8a36c07dd7664d3b1bd0/numpy/src/test/resources/a1d-float.npy -------------------------------------------------------------------------------- /numpy/src/test/resources/a1d-int.npy: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/spotify/featran/6359cc941c95c3d4574f8a36c07dd7664d3b1bd0/numpy/src/test/resources/a1d-int.npy -------------------------------------------------------------------------------- /numpy/src/test/resources/a1d-long.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spotify/featran/6359cc941c95c3d4574f8a36c07dd7664d3b1bd0/numpy/src/test/resources/a1d-long.npy -------------------------------------------------------------------------------- /numpy/src/test/resources/a2d-double.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spotify/featran/6359cc941c95c3d4574f8a36c07dd7664d3b1bd0/numpy/src/test/resources/a2d-double.npy -------------------------------------------------------------------------------- /numpy/src/test/resources/a2d-float.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spotify/featran/6359cc941c95c3d4574f8a36c07dd7664d3b1bd0/numpy/src/test/resources/a2d-float.npy -------------------------------------------------------------------------------- /numpy/src/test/resources/a2d-int.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spotify/featran/6359cc941c95c3d4574f8a36c07dd7664d3b1bd0/numpy/src/test/resources/a2d-int.npy -------------------------------------------------------------------------------- /numpy/src/test/resources/a2d-long.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spotify/featran/6359cc941c95c3d4574f8a36c07dd7664d3b1bd0/numpy/src/test/resources/a2d-long.npy -------------------------------------------------------------------------------- /numpy/src/test/scala/com/spotify/featran/numpy/NumPyTest.scala: 
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.numpy

import java.io.{ByteArrayOutputStream, OutputStream}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import scala.annotation.nowarn

/** Compares the bytes written by `NumPy.write` with reference `.npy` files on the classpath. */
class NumPyTest extends AnyFlatSpec with Matchers {

  /** Runs `f` against an in-memory stream and asserts that the bytes it wrote equal the
    * contents of the classpath resource `expectedFile`.
    *
    * `f` runs before the resource is opened, so an exception thrown by `f` propagates
    * without the resource being touched. A missing resource fails the test.
    */
  private def test(f: OutputStream => Unit)(expectedFile: String): Unit = {
    val actual = {
      val baos = new ByteArrayOutputStream()
      f(baos)
      baos.toByteArray
    }

    val expected = {
      val in = Option(this.getClass.getResourceAsStream(expectedFile))
        .getOrElse(fail(s"Test resource $expectedFile not found on classpath"))
      try {
        val out = new ByteArrayOutputStream(math.max(32, in.available()))
        val buf = new Array[Byte](8192)
        var r = in.read(buf)
        while (r != -1) {
          out.write(buf, 0, r)
          r = in.read(buf)
        }
        out.toByteArray
      } finally in.close() // release the resource stream even if reading fails
    }

    actual shouldBe expected: @nowarn
  }

  "NumPy" should "work with 1-dimensional arrays" in {
    val a1d = (0 until 10).toArray
    test(NumPy.write(_, a1d))("/a1d-int.npy")
    test(NumPy.write(_, a1d.map(_.toLong)))("/a1d-long.npy")
    test(NumPy.write(_, a1d.map(_.toFloat)))("/a1d-float.npy")
    test(NumPy.write(_, a1d.map(_.toDouble)))("/a1d-double.npy")

    the[IllegalArgumentException] thrownBy {
      test(NumPy.write(_, a1d, Seq(20)))("/a1d-int.npy")
    } should have message "requirement failed: Invalid shape, 20 != 10"

  }

  it should "work with 2-dimensional arrays" in {
    val a2d = (for {
      i <- 0 until 10
      j <- 0 until 5
    } yield i * 10 + j).toArray
    test(NumPy.write(_, a2d, Seq(10, 5)))("/a2d-int.npy")
    test(NumPy.write(_, a2d.map(_.toLong), Seq(10, 5)))("/a2d-long.npy")
    test(NumPy.write(_, a2d.map(_.toFloat), Seq(10, 5)))("/a2d-float.npy")
    test(NumPy.write(_, a2d.map(_.toDouble), Seq(10, 5)))("/a2d-double.npy")

    the[IllegalArgumentException] thrownBy {
      test(NumPy.write(_, a2d, Seq(20, 5)))("/a1d-int.npy")
    } should have message "requirement failed: Invalid shape, 20 * 5 != 50"

  }

  it should "work with iterators" in {
    val a2d = (0 until 10).map(i => (0 until 5).map(j => i * 10 + j).toArray)
    test(NumPy.write(_, a2d.iterator, 10, 5))("/a2d-int.npy")
    test(NumPy.write(_, a2d.iterator.map(_.map(_.toLong)), 10, 5))("/a2d-long.npy")
    test(NumPy.write(_, a2d.iterator.map(_.map(_.toFloat)), 10, 5))("/a2d-float.npy")
    test(NumPy.write(_, a2d.iterator.map(_.map(_.toDouble)), 10, 5))("/a2d-double.npy")

    the[IllegalArgumentException] thrownBy {
      test(NumPy.write(_, a2d.iterator, 10, 10))("/a2d-int.npy")
    } should have message "requirement failed: Invalid row size, expected: 10, actual: 5"

    the[IllegalArgumentException] thrownBy {
      test(NumPy.write(_, a2d.iterator, 20, 5))("/a2d-int.npy")
    } should have message "requirement failed: Invalid number of rows, expected: 20, actual: 10"

    // hit the header.length % 16 == 0 condition
    the[IllegalArgumentException] thrownBy {
      test(NumPy.write(_, a2d.iterator, 1000000000, 50))("/a2d-int.npy")
    } should have message "requirement failed: Invalid row size, expected: 50, actual: 5"

  }
}

100 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
sbt.version=1.8.2

--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("org.typelevel" % "sbt-typelevel" % "0.4.18")
addSbtPlugin("org.typelevel" % "sbt-typelevel-site" % "0.4.18")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.6")
addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.3")

--------------------------------------------------------------------------------
/scalding/src/main/scala/com/spotify/featran/scalding/package.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran

import com.twitter.algebird.Semigroup
import com.twitter.scalding.typed.TypedPipe

import scala.reflect.ClassTag

package object scalding {

  /** [[CollectionType]] for extraction from Scalding `TypedPipe` type. */
  implicit object ScaldingCollectionType extends CollectionType[TypedPipe] {
    // Element-wise transformation; delegates to `TypedPipe.map`.
    override def map[A, B: ClassTag](ma: TypedPipe[A])(f: A => B): TypedPipe[B] = ma.map(f)

    // Combines all elements by summing the pipe with a `Semigroup` built from `f`.
    override def reduce[A](ma: TypedPipe[A])(f: (A, A) => A): TypedPipe[A] =
      ma.sum(Semigroup.from(f))

    // Pairs elements of `ma` with elements of `mb`; delegates to `TypedPipe.cross`.
    override def cross[A, B: ClassTag](ma: TypedPipe[A])(mb: TypedPipe[B]): TypedPipe[(A, B)] =
      ma.cross(mb)

    // Builds a fresh single-element pipe holding `b`; `ma` itself is not used.
    override def pure[A, B: ClassTag](ma: TypedPipe[A])(b: B): TypedPipe[B] =
      TypedPipe.from(Iterable(b))
  }
}

--------------------------------------------------------------------------------
/scalding/src/test/scala/com/spotify/featran/scalding/ScaldingTest.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
16 | */ 17 | 18 | package com.spotify.featran.scalding 19 | 20 | import com.spotify.featran._ 21 | import com.twitter.scalding._ 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest.matchers.should.Matchers 24 | 25 | class ScaldingTest extends AnyFlatSpec with Matchers { 26 | import Fixtures._ 27 | 28 | def materialize[T](p: TypedPipe[T]): Iterable[T] = 29 | p.toIterableExecution.waitFor(Config.default, Local(true)).get 30 | 31 | "FeatureSpec" should "work with Scalding" in { 32 | val f = TestSpec.extract(TypedPipe.from(TestData)) 33 | materialize(f.featureNames) shouldBe Iterable(ExpectedNames) 34 | materialize(f.featureValues[Seq[Double]]) should contain theSameElementsAs ExpectedValues 35 | } 36 | 37 | it should "work with MultiFeatureSpec" in { 38 | noException shouldBe thrownBy { 39 | val f = RecordSpec.extract(TypedPipe.from(Records)) 40 | materialize(f.featureNames) 41 | materialize(f.featureValues[Seq[Double]]) 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /scio/src/main/scala/com/spotify/featran/scio/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 
 */

package com.spotify.featran

import com.spotify.scio.coders.Coder
import com.spotify.scio.values.SCollection

import scala.reflect.ClassTag

package object scio {

  /** [[CollectionType]] for extraction from Scio `SCollection` type. */
  implicit object ScioCollectionType extends CollectionType[SCollection] {
    // Element-wise transformation. Only a `ClassTag[B]` is available here, so the `Coder[B]`
    // is supplied explicitly via `Coder.kryo` as a local implicit.
    override def map[A, B: ClassTag](ma: SCollection[A])(f: A => B): SCollection[B] = {
      implicit val coder: Coder[B] = Coder.kryo
      ma.map(f)
    }

    // Combines all elements with `f`; delegates to `SCollection.reduce`.
    override def reduce[A](ma: SCollection[A])(f: (A, A) => A): SCollection[A] =
      ma.reduce(f)

    // Pairs elements of `ma` with elements of `mb`; delegates to `SCollection.cross`.
    override def cross[A, B: ClassTag](
      ma: SCollection[A]
    )(mb: SCollection[B]): SCollection[(A, B)] =
      ma.cross(mb)

    // Creates a single-element collection holding `b` in the same context as `ma`, again
    // using a Kryo-based coder for `B`.
    override def pure[A, B: ClassTag](ma: SCollection[A])(b: B): SCollection[B] = {
      implicit val coder: Coder[B] = Coder.kryo
      ma.context.parallelize(Seq(b))
    }
  }
}

--------------------------------------------------------------------------------
/scio/src/test/scala/com/spotify/featran/scio/ScioTest.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
16 | */ 17 | 18 | package com.spotify.featran.scio 19 | 20 | import com.spotify.featran._ 21 | import com.spotify.featran.transformers._ 22 | import com.spotify.scio.testing._ 23 | import com.spotify.featran.json._ 24 | import com.spotify.scio.values.SCollection 25 | 26 | class ScioTest extends PipelineSpec { 27 | import Fixtures._ 28 | 29 | "Scio" should "work with FeatureSpec" in { 30 | runWithContext { sc => 31 | val f = TestSpec.extract(sc.parallelize(TestData)) 32 | f.featureNames should containSingleValue(ExpectedNames) 33 | f.featureValues[Seq[Double]] should containInAnyOrder(ExpectedValues) 34 | } 35 | } 36 | 37 | it should "work with MultiFeatureSpec" in { 38 | noException shouldBe thrownBy { 39 | runWithContext { sc => 40 | val f = RecordSpec.extract(sc.parallelize(Records)) 41 | f.featureNames 42 | f.featureValues[Seq[Double]] 43 | } 44 | } 45 | } 46 | 47 | it should "work with FlatConverter on FeatureSpec" in { 48 | runWithContext { sc => 49 | FlatConverter[(String, Int), String](TestSpec) 50 | .convert(sc.parallelize(TestData)) 51 | } 52 | } 53 | 54 | it should "work with FlatConverter on MultiFeatureSpec" in { 55 | noException shouldBe thrownBy { 56 | runWithContext { sc => 57 | FlatConverter 58 | .multiSpec[Record, String](RecordSpec) 59 | .convert(sc.parallelize(Records)) 60 | } 61 | } 62 | } 63 | 64 | it should "work with FlatExtractor on FeatureSpec" in { 65 | runWithContext { sc => 66 | val json = FlatConverter[(String, Int), String](TestSpec) 67 | .convert(sc.parallelize(TestData)) 68 | FlatExtractor.flatSpec(TestSpec).extract(json) 69 | } 70 | } 71 | 72 | it should "work with FlatExtractor on MuiltiFeatureSpec" in { 73 | runWithContext { sc => 74 | val json = FlatConverter 75 | .multiSpec[Record, String](RecordSpec) 76 | .convert(sc.parallelize(Records)) 77 | FlatExtractor.multiFlatSpec(RecordSpec).extract(json) 78 | } 79 | } 80 | 81 | it should "work with FlatExtractor on Settings" in { 82 | runWithContext { sc => 83 | val settings = TestSpec 
84 | .extract(sc.parallelize(TestData)) 85 | .featureSettings 86 | 87 | val json = FlatConverter[(String, Int), String](TestSpec) 88 | .convert(sc.parallelize(TestData)) 89 | 90 | FlatExtractor[SCollection, String](settings) 91 | .featureValues[Seq[Double]](json) should containInAnyOrder(ExpectedValues) 92 | } 93 | } 94 | 95 | private class NonSerializable { 96 | def method(a: String): Double = a.toDouble 97 | } 98 | 99 | it should "fail on serialization error" in { 100 | an[Exception] should be thrownBy { 101 | runWithContext { sc => 102 | val foo = new NonSerializable() 103 | val f = FeatureSpec 104 | .of[(String, Int)] 105 | .required(e => foo.method(e._1))(Identity("foo")) 106 | .extract(sc.parallelize(TestData)) 107 | 108 | f.featureValues[Seq[Double]] 109 | } 110 | } 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /scripts/make-site.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SOCCO=true sbt ++2.12.8 examples/clean examples/compile 4 | sbt ghpagesPushSite 5 | -------------------------------------------------------------------------------- /scripts/npy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | 5 | path = 'numpy/src/test/resources/' 6 | 7 | a1d = range(0, 10) 8 | np.save(open(path + 'a1d-int.npy', 'w'), np.array(a1d, np.int32)) 9 | np.save(open(path + 'a1d-long.npy', 'w'), np.array(a1d, np.int64)) 10 | np.save(open(path + 'a1d-float.npy', 'w'), np.array(a1d, np.float32)) 11 | np.save(open(path + 'a1d-double.npy', 'w'), np.array(a1d, np.float64)) 12 | 13 | a2d = [[i * 10 + j for j in range(0, 5)] for i in range(0, 10)] 14 | np.save(open(path + 'a2d-int.npy', 'w'), np.array(a2d, np.int32)) 15 | np.save(open(path + 'a2d-long.npy', 'w'), np.array(a2d, np.int64)) 16 | np.save(open(path + 'a2d-float.npy', 'w'), np.array(a2d, np.float32)) 17 | 
# Pass the path itself so np.save opens the file in binary mode and closes it when done.
np.save(path + 'a2d-double.npy', np.array(a2d, np.float64))

--------------------------------------------------------------------------------
/shell.nix:
--------------------------------------------------------------------------------
let

  # use a pinned version of nixpkgs for reproducibility
  nixpkgs-version = "22.11";
  pkgs = import
    (builtins.fetchTarball {
      # Descriptive name to make the store path easier to identify
      name = "nixpkgs-${nixpkgs-version}";
      url = "https://github.com/nixos/nixpkgs/archive/${nixpkgs-version}.tar.gz";
      # Hash obtained using `nix-prefetch-url --unpack <url>`
      sha256 = "11w3wn2yjhaa5pv20gbfbirvjq6i3m7pqrq2msf0g7cv44vijwgw";
    })
    { };
in
with pkgs;
stdenv.mkDerivation {
  name = "featran-dev-env";

  # sbt is overridden to run on the same JDK 11 that is exposed in the shell.
  buildInputs = [
    (sbt.override { jre = jdk11; })
    jdk11
  ];
}

--------------------------------------------------------------------------------
/spark/src/main/scala/com/spotify/featran/spark/package.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran

import org.apache.spark.rdd.{RDD, RDDUtil}

import scala.reflect.ClassTag

package object spark {

  /** [[CollectionType]] for extraction from Apache Spark `RDD` type. */
  implicit object SparkCollectionType extends CollectionType[RDD] {
    // Element-wise transformation; delegates to `RDD.map`.
    override def map[A, B: ClassTag](ma: RDD[A])(f: A => B): RDD[B] =
      ma.map(f)

    // `ma.reduce(f)` yields a plain value, which is wrapped back into a single-element RDD.
    // No `ClassTag[A]` is in scope here, so it is recovered from `ma` through `RDDUtil`.
    override def reduce[A](ma: RDD[A])(f: (A, A) => A): RDD[A] =
      ma.context.parallelize(Seq(ma.reduce(f)))(RDDUtil.classTag(ma))

    // Pairs every element of `ma` with the FIRST element of `mb` only; this is not a full
    // cartesian product.
    // NOTE(review): presumably `mb` is always a single-element RDD (e.g. the output of
    // `reduce` or `pure`) - confirm against callers.
    override def cross[A, B: ClassTag](ma: RDD[A])(mb: RDD[B]): RDD[(A, B)] = {
      val b = mb.first()
      ma.map((_, b))
    }

    // Creates a single-element RDD holding `b` on the same SparkContext as `ma`.
    override def pure[A, B: ClassTag](ma: RDD[A])(b: B): RDD[B] = ma.context.parallelize(Seq(b))
  }
}

--------------------------------------------------------------------------------
/spark/src/main/scala/org/apache/spark/rdd/RDDUtil.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.spark.rdd

import scala.reflect.ClassTag

/**
 * Helper exposing the element `ClassTag` of an `RDD`.
 *
 * NOTE(review): presumably declared in the `org.apache.spark.rdd` package because
 * `elementClassTag` is not accessible from outside Spark's own packages - confirm.
 */
object RDDUtil {
  /** Returns the `ClassTag` that `rdd` carries for its element type. */
  def classTag[T](rdd: RDD[T]): ClassTag[T] = rdd.elementClassTag
}

--------------------------------------------------------------------------------
/spark/src/test/scala/com/spotify/featran/spark/SparkTest.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
16 | */ 17 | 18 | package com.spotify.featran.spark 19 | 20 | import com.spotify.featran._ 21 | import org.apache.spark.SparkContext 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest.matchers.should.Matchers 24 | 25 | class SparkTest extends AnyFlatSpec with Matchers { 26 | import Fixtures._ 27 | 28 | "FeatureSpec" should "work with Spark" in { 29 | val sc = new SparkContext("local[4]", "test") 30 | sc.setLogLevel("ERROR") 31 | val f = TestSpec.extract(sc.parallelize(TestData)) 32 | f.featureNames.collect() shouldBe Array(ExpectedNames) 33 | f.featureValues[Seq[Double]].collect() should contain theSameElementsAs ExpectedValues 34 | f.featureValues[Map[String, Double]] 35 | .collect() should contain theSameElementsAs ExpectedMapValues 36 | 37 | sc.stop() 38 | } 39 | 40 | it should "work with MultiFeatureSpec" in { 41 | noException shouldBe thrownBy { 42 | val sc = new SparkContext("local[4]", "test") 43 | sc.setLogLevel("ERROR") 44 | val f = RecordSpec.extract(sc.parallelize(Records)) 45 | f.featureNames.collect() 46 | f.featureValues[Seq[Double]].collect() 47 | sc.stop() 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /tensorflow/src/main/scala/com/spotify/featran/tensorflow/FeatureBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.spotify.featran.tensorflow 2 | 3 | import com.spotify.featran.FeatureBuilder 4 | import org.tensorflow.proto.{example => tf} 5 | 6 | import scala.annotation.nowarn 7 | 8 | final case class TensorFlowFeatureBuilder( 9 | @transient private var underlying: tf.Features.Builder = tf.Features.newBuilder() 10 | ) extends FeatureBuilder[tf.Example] { 11 | override def init(dimension: Int): Unit = { 12 | if (underlying == null) { 13 | underlying = tf.Features.newBuilder() 14 | } 15 | underlying.clear(): @nowarn 16 | } 17 | 18 | override def add(name: String, value: Double): Unit = { 19 | val feature = 
tf.Feature 20 | .newBuilder() 21 | .setFloatList(tf.FloatList.newBuilder().addValue(value.toFloat)) 22 | .build() 23 | val normalized = FeatureNameNormalization.normalize(name) 24 | underlying.putFeature(normalized, feature): @nowarn 25 | } 26 | 27 | override def skip(): Unit = () 28 | 29 | override def skip(n: Int): Unit = () 30 | 31 | override def result: tf.Example = 32 | tf.Example.newBuilder().setFeatures(underlying).build() 33 | 34 | override def newBuilder: FeatureBuilder[tf.Example] = TensorFlowFeatureBuilder() 35 | } 36 | -------------------------------------------------------------------------------- /tensorflow/src/main/scala/com/spotify/featran/tensorflow/FeatureName.scala: -------------------------------------------------------------------------------- 1 | package com.spotify.featran.tensorflow 2 | 3 | import org.tensorflow.proto.{example => tf} 4 | 5 | import java.util.concurrent.ConcurrentHashMap 6 | import java.util.regex.Pattern 7 | 8 | final case class NamedTFFeature(name: String, f: tf.Feature) 9 | 10 | private object FeatureNameNormalization { 11 | private val NamePattern = Pattern.compile("[^A-Za-z0-9_]") 12 | 13 | val normalize: String => String = { 14 | lazy val cache = new ConcurrentHashMap[String, String]() 15 | fn => 16 | cache.computeIfAbsent( 17 | fn, 18 | (n: String) => NamePattern.matcher(n).replaceAll("_") 19 | ) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tensorflow/src/main/scala/com/spotify/featran/tensorflow/TensorFlowType.scala: -------------------------------------------------------------------------------- 1 | package com.spotify.featran.tensorflow 2 | 3 | import com.google.protobuf.ByteString 4 | import org.tensorflow.proto.{example => tf} 5 | 6 | private object TensorFlowType { 7 | 8 | import scala.jdk.CollectionConverters._ 9 | 10 | def toFeature(name: String, ex: tf.Example): Option[tf.Feature] = { 11 | val fm = ex.getFeatures.getFeatureMap 12 | if 
(fm.containsKey(name)) { 13 | Some(fm.get(name)) 14 | } else { 15 | None 16 | } 17 | } 18 | 19 | def toFloats(f: tf.Feature): Seq[Float] = 20 | f.getFloatList.getValueList.asScala.toSeq.asInstanceOf[Seq[Float]] 21 | 22 | def toDoubles(f: tf.Feature): Seq[Double] = toFloats(f).map(_.toDouble) 23 | 24 | def toByteStrings(f: tf.Feature): Seq[ByteString] = f.getBytesList.getValueList.asScala.toSeq 25 | 26 | def toStrings(f: tf.Feature): Seq[String] = toByteStrings(f).map(_.toStringUtf8) 27 | 28 | def fromFloats(xs: Seq[Float]): tf.Feature.Builder = 29 | tf.Feature 30 | .newBuilder() 31 | .setFloatList(xs.foldLeft(tf.FloatList.newBuilder())(_.addValue(_)).build()) 32 | 33 | def fromDoubles(xs: Seq[Double]): tf.Feature.Builder = fromFloats(xs.map(_.toFloat)) 34 | 35 | def fromByteStrings(xs: Seq[ByteString]): tf.Feature.Builder = 36 | tf.Feature.newBuilder().setBytesList(tf.BytesList.newBuilder().addAllValue(xs.asJava)) 37 | 38 | def fromStrings(xs: Seq[String]): tf.Feature.Builder = 39 | fromByteStrings(xs.map(ByteString.copyFromUtf8)) 40 | 41 | } 42 | -------------------------------------------------------------------------------- /tensorflow/src/test/scala/com/spotify/featran/tensorflow/ExampleConverterSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 
16 | */ 17 | 18 | package com.spotify.featran.tensorflow 19 | 20 | import com.spotify.featran.{FeatureSpec, FlatConverter} 21 | import com.spotify.featran.transformers.{MDLRecord, WeightedLabel} 22 | import org.scalacheck._ 23 | import com.spotify.featran.transformers._ 24 | import org.tensorflow.proto.example.Example 25 | 26 | import scala.jdk.CollectionConverters._ 27 | 28 | class ExampleConverterSpec extends Properties("ExampleConverterSpec") { 29 | import TensorFlowType._ 30 | 31 | case class Record(d: Double, optD: Option[Double]) 32 | case class TransformerTypes( 33 | d: Double, 34 | s: String, 35 | ds: List[Double], 36 | ss: List[String], 37 | we: List[WeightedLabel], 38 | mdl: MDLRecord[String] 39 | ) 40 | 41 | implicit val arbRecords: Arbitrary[List[Record]] = Arbitrary { 42 | Gen.listOfN(100, Arbitrary.arbitrary[(Double, Option[Double])].map(r => Record(r._1, r._2))) 43 | } 44 | 45 | implicit val arbTypes: Arbitrary[List[TransformerTypes]] = Arbitrary { 46 | Gen.listOfN( 47 | 100, 48 | Arbitrary 49 | .arbitrary[(Float, String)] 50 | .map { case (num, str) => 51 | TransformerTypes( 52 | num.toDouble, 53 | str, 54 | List(num.toDouble), 55 | List(str), 56 | List(WeightedLabel(str, num.toDouble)), 57 | MDLRecord(str, num.toDouble) 58 | ) 59 | } 60 | ) 61 | } 62 | 63 | property("converter") = Prop.forAll { (xs: List[Record]) => 64 | val spec = FeatureSpec.of[Record].required(_.d)(Identity("id")) 65 | val f = FlatConverter[Record, Example](spec).convert(xs) 66 | Prop.all( 67 | f.map(_.getFeatures.getFeatureMap.get("id").getFloatList.getValue(0)) == xs.map(_.d.toFloat) 68 | ) 69 | } 70 | 71 | property("converter all types") = Prop.forAll { (xs: List[TransformerTypes]) => 72 | val spec = FeatureSpec 73 | .of[TransformerTypes] 74 | .required(_.d)(Identity("d")) 75 | .required(_.s)(OneHotEncoder("s.with$pecial characters")) 76 | .required(_.ds)(VectorIdentity("ds")) 77 | .required(_.ss)(NHotEncoder("ss")) 78 | .required(_.we)(NHotWeightedEncoder("we")) 79 | 
.required(_.mdl)(MDL("mdl")) 80 | 81 | val f = FlatConverter[TransformerTypes, Example](spec).convert(xs) 82 | 83 | val results = f.map { ex => 84 | val fm = ex.getFeatures.getFeatureMap.asScala 85 | TransformerTypes( 86 | toDoubles(fm("d")).head, 87 | toStrings(fm("s_with_pecial_characters")).head, 88 | toDoubles(fm("ds")).toList, 89 | toStrings(fm("ss")).toList, 90 | List(WeightedLabel(toStrings(fm("we_key")).head, toDoubles(fm("we_value")).head)), 91 | MDLRecord(toStrings(fm("mdl_label")).head, toDoubles(fm("mdl_value")).head) 92 | ) 93 | } 94 | 95 | Prop.all(results == xs) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /tensorflow/src/test/scala/com/spotify/featran/tensorflow/TensorFlowFeatureBuilderSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Spotify AB. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, 11 | * software distributed under the License is distributed on an 12 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 13 | * KIND, either express or implied. See the License for the 14 | * specific language governing permissions and limitations 15 | * under the License. 
16 | */ 17 | 18 | package com.spotify.featran.tensorflow 19 | 20 | import com.spotify.featran.{FeatureBuilder, SerializableUtils} 21 | import org.scalacheck._ 22 | import org.tensorflow.proto.example.{Example, Feature, Features, FloatList} 23 | 24 | object TensorFlowFeatureBuilderSpec extends Properties("TensorFlowFeatureBuilder") { 25 | private def list[T](implicit arb: Arbitrary[Option[T]]): Gen[List[Option[T]]] = 26 | Gen.listOfN(100, arb.arbitrary) 27 | 28 | property("TensorFlow Example") = Prop.forAll(list[Double]) { xs => 29 | val fb = SerializableUtils.ensureSerializable(FeatureBuilder[Example]) 30 | fb.init(xs.size + 4) 31 | val b = Features.newBuilder() 32 | xs.zipWithIndex.foreach { 33 | case (Some(x), i) => 34 | val key = "key" + i.toString 35 | fb.add(key, x) 36 | b.putFeature( 37 | key, 38 | Feature.newBuilder().setFloatList(FloatList.newBuilder().addValue(x.toFloat)).build() 39 | ) 40 | case (None, _) => fb.skip() 41 | } 42 | fb.add(Iterable("x", "y"), Seq(0.0, 0.0)) 43 | fb.skip(2) 44 | val actual = fb.result 45 | b.putFeature( 46 | "x", 47 | Feature.newBuilder().setFloatList(FloatList.newBuilder().addValue(0.0f)).build() 48 | ) 49 | b.putFeature( 50 | "y", 51 | Feature.newBuilder().setFloatList(FloatList.newBuilder().addValue(0.0f)).build() 52 | ) 53 | val expected = Example.newBuilder().setFeatures(b).build() 54 | actual == expected 55 | } 56 | 57 | property("feature names") = Prop.forAll { (key: String) => 58 | val fb = SerializableUtils.ensureSerializable(FeatureBuilder[Example]) 59 | fb.init(1) 60 | fb.add(key, 0.0) 61 | val actual = fb.result.getFeatures.getFeatureMap.keySet().iterator().next() 62 | Prop.all(actual.length == key.length, actual.replaceAll("[^A-Za-z0-9_]", "_") == actual) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /xgboost/src/main/scala/com/spotify/featran/xgboost/FeatureBuilder.scala: -------------------------------------------------------------------------------- 1 | 
package com.spotify.featran.xgboost

import com.spotify.featran.{FeatureBuilder, SparseArray}
import ml.dmlc.xgboost4j.LabeledPoint

/**
 * [[FeatureBuilder]] producing a dense `LabeledPoint`.
 *
 * All accumulation is delegated to an `Array[Float]` builder; this class only wraps the final
 * array.
 *
 * @param underlying
 *   builder that collects the feature values
 */
final private case class LabeledPointFB(
  private val underlying: FeatureBuilder[Array[Float]] = FeatureBuilder[Array[Float]].newBuilder
) extends FeatureBuilder[LabeledPoint] {
  override def init(dimension: Int): Unit =
    underlying.init(dimension)

  /**
   * @return
   *   a point with label `0.0f`, size equal to the number of collected values and `null`
   *   indices (the `LabeledPoint` marker for dense data)
   */
  override def result: LabeledPoint = {
    val values = underlying.result
    LabeledPoint(0.0f, values.length, null, values)
  }

  override def add(name: String, value: Double): Unit =
    underlying.add(name, value)

  override def skip(): Unit = underlying.skip()

  override def newBuilder: FeatureBuilder[LabeledPoint] = LabeledPointFB()
}

/**
 * [[FeatureBuilder]] producing a [[SparseLabeledPoint]].
 *
 * All accumulation is delegated to a `SparseArray[Float]` builder; this class only wraps the
 * final sparse array.
 *
 * @param underlying
 *   builder that collects the sparse feature values
 */
final private case class SparseLabeledPointFB(
  private val underlying: FeatureBuilder[SparseArray[Float]] =
    FeatureBuilder[SparseArray[Float]].newBuilder
) extends FeatureBuilder[SparseLabeledPoint] {
  override def init(dimension: Int): Unit = underlying.init(dimension)

  /**
   * @return
   *   a point with label `0.0f` whose size, indices and values all come from one sparse array
   */
  override def result: SparseLabeledPoint = {
    // Read the underlying result exactly once so that length, indices and values are taken
    // from the same array and the delegate is not evaluated three times.
    val sparse = underlying.result
    new SparseLabeledPoint(0.0f, sparse.length, sparse.indices, sparse.values)
  }

  override def add(name: String, value: Double): Unit =
    underlying.add(name, value)

  override def skip(): Unit = underlying.skip()

  override def newBuilder: FeatureBuilder[SparseLabeledPoint] = SparseLabeledPointFB()

}
--------------------------------------------------------------------------------
/xgboost/src/main/scala/com/spotify/featran/xgboost/SparseLabeledPoint.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2018 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.xgboost

import ml.dmlc.xgboost4j.LabeledPoint

/**
 * Wrapper type that tells a sparse `LabeledPoint` apart from a dense one.
 *
 * Construction fails with `IllegalArgumentException` when `indices` is `null`. The constructor
 * arguments are forwarded unchanged to `LabeledPoint`; see that class for their meaning.
 */
final class SparseLabeledPoint private[xgboost] (
  label: Float,
  size: Int,
  indices: Array[Int],
  values: Array[Float],
  weight: Float = 1f,
  group: Int = -1,
  baseMargin: Float = Float.NaN
) extends Serializable {
  require(indices != null, "Indices can't be null")

  /** The wrapped point, built once from the constructor arguments. */
  val labeledPoint: LabeledPoint =
    LabeledPoint(
      label = label,
      size = size,
      indices = indices,
      values = values,
      weight = weight,
      group = group,
      baseMargin = baseMargin
    )
}
--------------------------------------------------------------------------------
/xgboost/src/main/scala/com/spotify/featran/xgboost/package.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2018 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.
See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran

import ml.dmlc.xgboost4j.LabeledPoint

package object xgboost {

  /**
   * Implicit [[FeatureBuilder]] that emits XGBoost's dense `LabeledPoint`.
   *
   * NOTE: `LabeledPoint` stores its values as `Float`s, so converting from `Double`s may lose
   * precision.
   */
  implicit def denseXGBoostLabeledPointFeatureBuilder: FeatureBuilder[LabeledPoint] =
    LabeledPointFB()

  /**
   * Implicit [[FeatureBuilder]] that emits the sparse variant, [[SparseLabeledPoint]].
   *
   * NOTE: `LabeledPoint` stores its values as `Float`s, so converting from `Double`s may lose
   * precision.
   */
  implicit def sparseXGBoostLabeledPointFeatureBuilder: FeatureBuilder[SparseLabeledPoint] =
    SparseLabeledPointFB()
}
--------------------------------------------------------------------------------
/xgboost/src/main/scala/ml/dmlc/xgboost4j/LabeledPoint.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2017 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/

package ml.dmlc.xgboost4j

// XGBoost does not cross build for multiple Scala versions.
// This class is copied here to decouple the XGBoost dependency and enable Featran cross builds.
/**
 * A single labeled training example.
 *
 * When `indices` is non-null, construction fails with `IllegalArgumentException` unless
 * `indices` and `values` have the same length and `size` is at least that length.
 *
 * @param label
 *   Label of this point.
 * @param size
 *   Feature dimensionality.
 * @param indices
 *   Feature indices of this point, or `null` if the data is dense.
 * @param values
 *   Feature values of this point.
 * @param weight
 *   Weight of this point.
 * @param group
 *   Group of this point (used for ranking), or -1.
 * @param baseMargin
 *   Initial prediction on this point, or `Float.NaN`.
 */
case class LabeledPoint(
  label: Float,
  size: Int,
  indices: Array[Int],
  values: Array[Float],
  weight: Float = 1f,
  group: Int = -1,
  baseMargin: Float = Float.NaN
) extends Serializable {
  // Both invariants concern the sparse layout only; a null index array marks a dense point
  // and skips them.
  if (indices != null) {
    require(
      indices.length == values.length,
      "indices and values must have the same number of elements"
    )
    require(
      size >= indices.length,
      "feature dimensionality must be greater equal than size of indices"
    )
  }

  /** Four-argument constructor; `weight`, `group` and `baseMargin` take their defaults. */
  def this(label: Float, size: Int, indices: Array[Int], values: Array[Float]) =
    // The default of [[weight]] is repeated so the call resolves to the primary constructor.
    this(label, size, indices, values, 1.0f)
}
--------------------------------------------------------------------------------
/xgboost/src/test/scala/com/spotify/featran/xgboost/XGBoostFeatureBuilderSpec.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2018 Spotify AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.featran.xgboost

import com.spotify.featran.{FeatureBuilder, SerializableUtils, SparseArray}
import ml.dmlc.xgboost4j.LabeledPoint
import org.scalacheck.{Arbitrary, Gen, Prop, Properties}

import scala.annotation.nowarn
import scala.reflect.ClassTag

/**
 * ScalaCheck properties for the dense and sparse XGBoost [[FeatureBuilder]] instances.
 */
object XGBoostFeatureBuilderSpec extends Properties("XGBoostFeatureBuilder") {

  /** Generator of lists holding exactly 100 optional values. */
  private def list[T](implicit arb: Arbitrary[Option[T]]): Gen[List[Option[T]]] =
    Gen.listOfN(100, arb.arbitrary)

  /**
   * Drives `builder` with `xs` and checks the values it produced.
   *
   * Present elements are added under the name "key<index>", absent ones are skipped. Two named
   * zero features and two further skips follow, so the builder is initialised with
   * `xs.size + 4` dimensions. The expected output is `xs` with absent elements replaced by
   * zero, padded with four zeros and converted to `Float`.
   *
   * @param xs
   *   input values; `None` means the feature is skipped
   * @param builder
   *   builder under test (serialization-checked before use)
   * @param toSeq
   *   extracts the dense float values from the builder's result
   * @return
   *   a property that holds when the extracted values equal the expected ones
   */
  @nowarn(
    "msg=evidence parameter evidence.* of type scala.reflect.ClassTag\\[.\\] .* is never used"
  )
  private def test[T: ClassTag: Numeric, F](xs: List[Option[T]], builder: FeatureBuilder[F])(
    toSeq: F => Seq[Float]
  ): Prop = {
    val num = implicitly[Numeric[T]]
    val fb = SerializableUtils.ensureSerializable(builder)
    fb.init(xs.size + 4)
    fb.prepare(null)
    xs.zipWithIndex.foreach {
      case (Some(x), i) => fb.add("key" + i.toString, num.toDouble(x))
      case (None, _)    => fb.skip()
    }
    fb.add(Iterable("x", "y"), Seq(0.0, 0.0))
    fb.skip(2)
    // keep in mind that we force the RHS to be floats because that is what LabeledPoint stores
    toSeq(fb.result) == (xs.map(_.getOrElse(num.zero)) ++ List.fill(4)(num.zero)).map(num.toFloat)
  }

  /**
   * Checks the sparse builder on `xs` and on a repetition of `xs` longer than 1024 elements.
   *
   * Both checks must hold. Evaluating the first check as a bare statement would throw its
   * result away, so the two are combined with `Prop.all`.
   */
  private def sparseRoundTrip[T: ClassTag: Numeric](xs: List[Option[T]]): Prop = {
    // Expands a sparse result back into a dense sequence of the given dimension.
    def densify(dimension: Int): SparseLabeledPoint => Seq[Float] = { point =>
      val lp = point.labeledPoint
      SparseArray(lp.indices, lp.values, dimension).toDense.toSeq
    }

    val small = test(xs, FeatureBuilder[SparseLabeledPoint])(densify(4 + xs.size))

    // ScalaCheck may shrink a failing input down to the empty list; guarding the divisor keeps
    // the repetition count defined so a shrunk counterexample is not an ArithmeticException.
    val repeats = 1024 / math.max(xs.size, 1) + 1
    val xs2 = List.fill(repeats)(xs).flatten
    val large = test(xs2, FeatureBuilder[SparseLabeledPoint])(densify(4 + xs2.size))

    Prop.all(small, large)
  }

  property("LabeledPoint on Float input") = Prop.forAll(list[Float]) { xs =>
    test(xs, FeatureBuilder[LabeledPoint])(_.values.toSeq)
  }

  property("LabeledPoint on Double input") = Prop.forAll(list[Double]) { xs =>
    test(xs, FeatureBuilder[LabeledPoint])(_.values.toSeq)
  }

  property("Sparse LabeledPoint on Float input") = Prop.forAll(list[Float]) { xs =>
    sparseRoundTrip(xs)
  }

  property("Sparse LabeledPoint on Double input") = Prop.forAll(list[Double]) { xs =>
    sparseRoundTrip(xs)
  }
}
--------------------------------------------------------------------------------