├── .github ├── CODEOWNERS ├── FUNDING.yml ├── dependabot.yml └── workflows │ ├── scorecard.yml │ └── tests.yml ├── .gitignore ├── docs ├── bench_lin.png ├── bench_log.png ├── benchmark_compare │ └── python-sklearn │ │ ├── requirements.txt │ │ ├── Makefile │ │ ├── macbook_2017 │ │ └── bench.py ├── codegen_transform_cpu_profile.png ├── reflect_transform_cpu_profile.png ├── codegen_transform_cpu_profile_selected.png └── benchmarks │ └── macbook_2017 ├── SECURITY.md ├── go.mod ├── transformers ├── common.go ├── discretization.go ├── samplenormalizers.go ├── discretization_test.go ├── categorical.go ├── scalers.go ├── samplenormalizers_test.go ├── categorical_test.go ├── textprocesors.go ├── scalers_test.go └── textprocessors_test.go ├── CITATION.cff ├── cmd └── generate │ ├── tests │ ├── readme.go │ ├── examplefile.go │ ├── weirdtagsfp.go │ ├── largememorytransformerfp.go │ ├── employeefp.go │ ├── alltransformersfp.go │ ├── readme_test.go │ ├── with32fieldsfp_test.go │ ├── employeefp_test.go │ ├── weirdtagsfp_test.go │ ├── alltransformersfp_test.go │ ├── with32fieldsfp.go │ └── largememorytransformerfp_test.go │ ├── main.go │ ├── templatecode.go │ ├── parser.go │ └── templatetests.go ├── LICENSE ├── go.sum └── structtransformer ├── structtransformer.go └── structtransformer_test.go /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @nikolaydubina 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: nikolaydubina 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.test 2 | docs/benchmark_profiles/* 3 | -------------------------------------------------------------------------------- /docs/bench_lin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikolaydubina/go-featureprocessing/HEAD/docs/bench_lin.png -------------------------------------------------------------------------------- /docs/bench_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikolaydubina/go-featureprocessing/HEAD/docs/bench_log.png -------------------------------------------------------------------------------- /docs/benchmark_compare/python-sklearn/requirements.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | pandas 3 | numpy 4 | sklearn 5 | argparse -------------------------------------------------------------------------------- /docs/codegen_transform_cpu_profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikolaydubina/go-featureprocessing/HEAD/docs/codegen_transform_cpu_profile.png -------------------------------------------------------------------------------- /docs/reflect_transform_cpu_profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikolaydubina/go-featureprocessing/HEAD/docs/reflect_transform_cpu_profile.png -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 
4 | 5 | Contact [@nikolaydubina](https://github.com/nikolaydubina) over email or linkedin. 6 | -------------------------------------------------------------------------------- /docs/codegen_transform_cpu_profile_selected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikolaydubina/go-featureprocessing/HEAD/docs/codegen_transform_cpu_profile_selected.png -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/nikolaydubina/go-featureprocessing 2 | 3 | go 1.15 4 | 5 | require ( 6 | github.com/google/gofuzz v1.2.0 7 | github.com/stretchr/testify v1.10.0 8 | go.uber.org/multierr v1.9.0 9 | ) 10 | -------------------------------------------------------------------------------- /transformers/common.go: -------------------------------------------------------------------------------- 1 | package transformers 2 | 3 | import "math" 4 | 5 | func std(vals []float64, mean float64) float64 { 6 | sum := 0. 7 | for _, v := range vals { 8 | sum += math.Abs(v-mean) * math.Abs(v-mean) 9 | } 10 | return math.Sqrt(sum / (float64(len(vals)) - 1)) 11 | } 12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "gomod" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: If you reference this library in publication, please cite it as below. 
3 | title: Feature Pre-processing in Go 4 | abstract: High-performance machine learning feature preprocessing in Go 5 | authors: 6 | - family-names: Dubina 7 | given-names: Nikolay 8 | version: 2.1 9 | date-released: 2020-12-21 10 | license: MIT 11 | repository-code: https://github.com/nikolaydubina/go-featureprocessing 12 | url: https://github.com/nikolaydubina/go-featureprocessing 13 | -------------------------------------------------------------------------------- /cmd/generate/tests/readme.go: -------------------------------------------------------------------------------- 1 | package examplemodule 2 | 3 | //go:generate go run github.com/nikolaydubina/go-featureprocessing/cmd/generate -struct=Employee 4 | 5 | // Employee is example from readme 6 | type Employee struct { 7 | Age int `feature:"identity"` 8 | Salary float64 `feature:"minmax"` 9 | Kids int `feature:"maxabs"` 10 | Weight float64 `feature:"standard"` 11 | Height float64 `feature:"quantile"` 12 | City string `feature:"onehot"` 13 | Car string `feature:"ordinal"` 14 | Income float64 `feature:"kbins"` 15 | Description string `feature:"tfidf"` 16 | SecretValue float64 17 | } 18 | -------------------------------------------------------------------------------- /transformers/discretization.go: -------------------------------------------------------------------------------- 1 | package transformers 2 | 3 | import "sort" 4 | 5 | // KBinsDiscretizer based on quantile strategy 6 | type KBinsDiscretizer struct { 7 | QuantileScaler 8 | } 9 | 10 | // Fit fits quantile scaler 11 | func (t *KBinsDiscretizer) Fit(vals []float64) { 12 | t.QuantileScaler.Fit(vals) 13 | } 14 | 15 | // Transform finds index of matched quantile for input 16 | func (t *KBinsDiscretizer) Transform(v float64) float64 { 17 | if len(t.QuantileScaler.Quantiles) == 0 { 18 | return 0 19 | } 20 | i := sort.SearchFloat64s(t.Quantiles[:], v) 21 | if i >= len(t.Quantiles) { 22 | return float64(len(t.Quantiles)) + 1 23 | } 24 | return float64(i) + 1 25 | } 26 | -------------------------------------------------------------------------------- /docs/benchmark_compare/python-sklearn/Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | pip3 install -r requirements.txt 3 | 4 | clean: 5 | rm -rf macbook_2017 6 | 7 | bench: install clean 8 | python3 bench.py --nsamples=1 --ntrials=10 --ntrialsgroup=100 >> macbook_2017 9 | python3 bench.py --nsamples=10 --ntrials=10 --ntrialsgroup=100 >> macbook_2017 10 | python3 bench.py --nsamples=100 --ntrials=10 --ntrialsgroup=100 >> macbook_2017 11 | python3 bench.py --nsamples=1000 --ntrials=10 --ntrialsgroup=100 >> macbook_2017 12 | python3 bench.py --nsamples=10000 --ntrials=10 --ntrialsgroup=10 >> macbook_2017 13 | python3 bench.py --nsamples=100000 --ntrials=10 --ntrialsgroup=10 >> macbook_2017 14 | python3 bench.py --nsamples=1000000 --ntrials=10 --ntrialsgroup=1 >> macbook_2017 15 | python3 bench.py --nsamples=5000000 --ntrials=10 --ntrialsgroup=1 >> macbook_2017 16 | python3 bench.py --nsamples=15000000 --ntrials=10 --ntrialsgroup=1 >> macbook_2017 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Nikolay Dubina 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, 
including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/scorecard.yml: -------------------------------------------------------------------------------- 1 | name: Scorecard supply-chain security 2 | on: 3 | branch_protection_rule: 4 | schedule: 5 | - cron: '42 6 * * 2' 6 | push: 7 | branches: [ "main" ] 8 | 9 | permissions: read-all 10 | 11 | jobs: 12 | analysis: 13 | name: Scorecard analysis 14 | runs-on: ubuntu-latest 15 | permissions: 16 | security-events: write 17 | id-token: write 18 | 19 | steps: 20 | - name: "Checkout code" 21 | uses: actions/checkout@v3.1.0 22 | with: 23 | persist-credentials: false 24 | 25 | - name: "Run analysis" 26 | uses: ossf/scorecard-action@v2.3.1 27 | with: 28 | results_file: results.sarif 29 | results_format: sarif 30 | publish_results: true 31 | 32 | - name: "Upload artifact" 33 | uses: actions/upload-artifact@v3.1.0 34 | with: 35 | name: SARIF file 36 | path: results.sarif 37 | retention-days: 5 38 | 39 | - name: "Upload to code-scanning" 40 | uses: github/codeql-action/upload-sarif@v2.2.4 41 | with: 42 | sarif_file: results.sarif 43 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | permissions: read-all 10 | 11 | jobs: 12 | build: 13 | name: Tests 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Go 1.x 20 | uses: actions/setup-go@v5 21 | with: 22 | go-version: ^1.15 23 | 24 | - name: Test 25 | run: | 26 | go generate ./... 27 | go get -v -t -d ./... 28 | go install github.com/jstemmer/go-junit-report/v2@latest 29 | go test -coverprofile=coverage.out -covermode=atomic -cover -json -v ./... 
2>&1 | go-junit-report -set-exit-code > tests.xml 30 | 31 | - name: Upload test results to Codecov 32 | uses: codecov/test-results-action@v1 33 | with: 34 | token: ${{ secrets.CODECOV_TOKEN }} 35 | files: tests.xml 36 | 37 | - name: Upload coverage to Codecov 38 | uses: codecov/codecov-action@v4.1.1 39 | with: 40 | token: ${{ secrets.CODECOV_TOKEN }} 41 | files: coverage.out 42 | -------------------------------------------------------------------------------- /docs/benchmark_compare/python-sklearn/macbook_2017: -------------------------------------------------------------------------------- 1 | nsamples=1 ntrials=10 ntrialsgroup=100 avg=12824253 ns min=12219147 ns max=13943499 ns samples_dataframe_size=8 B setuptook=27610184 ns 2 | nsamples=10 ntrials=10 ntrialsgroup=100 avg=13809201 ns min=12688076 ns max=14746466 ns samples_dataframe_size=80 B setuptook=27280819 ns 3 | nsamples=100 ntrials=10 ntrialsgroup=100 avg=14324627 ns min=13311803 ns max=15129684 ns samples_dataframe_size=800 B setuptook=25503670 ns 4 | nsamples=1000 ntrials=10 ntrialsgroup=100 avg=15042673 ns min=13605346 ns max=17810513 ns samples_dataframe_size=8000 B setuptook=32386977 ns 5 | nsamples=10000 ntrials=10 ntrialsgroup=10 avg=20092639 ns min=18415227 ns max=22949523 ns samples_dataframe_size=80000 B setuptook=98518650 ns 6 | nsamples=100000 ntrials=10 ntrialsgroup=10 avg=73354263 ns min=71922718 ns max=75853612 ns samples_dataframe_size=800000 B setuptook=758389751 ns 7 | nsamples=1000000 ntrials=10 ntrialsgroup=1 avg=660746274 ns min=645929252 ns max=697522591 ns samples_dataframe_size=8000000 B setuptook=6992992088 ns 8 | nsamples=5000000 ntrials=10 ntrialsgroup=1 avg=3839594987 ns min=3557765533 ns max=4383723914 ns samples_dataframe_size=40000000 B setuptook=37772393178 ns 9 | nsamples=15000000 ntrials=10 ntrialsgroup=1 avg=19546411996 ns min=15810273557 ns max=21872775279 ns samples_dataframe_size=120000000 B setuptook=197243552642 ns 10 | -------------------------------------------------------------------------------- /transformers/samplenormalizers.go: -------------------------------------------------------------------------------- 1 | package transformers 2 | 3 | import "math" 4 | 5 | // SampleNormalizerL1 transforms features for single sample to have norm L1=1 6 | type SampleNormalizerL1 struct{} 7 | 8 | // Fit is empty, kept only to keep same interface 9 | func (t *SampleNormalizerL1) Fit(_ []float64) {} 10 | 11 | // Transform returns L1 normalized vector 12 | func (t *SampleNormalizerL1) Transform(vs []float64) []float64 { 13 | if t == nil || vs == nil { 14 | return nil 15 | } 16 | vsnorm := make([]float64, len(vs)) 17 | t.TransformInplace(vsnorm, vs) 18 | return vsnorm 19 | } 20 | 21 | // TransformInplace returns L1 normalized vector, inplace 22 | func (t *SampleNormalizerL1) TransformInplace(dest []float64, vs []float64) { 23 | if t == nil || vs == nil || dest == nil || len(dest) != len(vs) { 24 | return 25 | } 26 | 27 | sum := 0. 
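// sum accumulates the L1 norm of the sample (the sum of absolute values), used as the divisor below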
28 | for _, v := range vs { 29 | sum += math.Abs(v) 30 | } 31 | 32 | for i := range dest { 33 | if sum == 0 { 34 | dest[i] = 0 35 | } else { 36 | dest[i] = vs[i] / sum 37 | } 38 | } 39 | } 40 | 41 | // SampleNormalizerL2 transforms features for single sample to have norm L2=1 42 | type SampleNormalizerL2 struct{} 43 | 44 | // Fit is empty, kept only to keep same interface 45 | func (t *SampleNormalizerL2) Fit(_ []float64) {} 46 | 47 | // Transform returns L2 normalized vector 48 | func (t *SampleNormalizerL2) Transform(vs []float64) []float64 { 49 | if t == nil || vs == nil { 50 | return nil 51 | } 52 | vsnorm := make([]float64, len(vs)) 53 | t.TransformInplace(vsnorm, vs) 54 | return vsnorm 55 | } 56 | 57 | // TransformInplace returns L2 normalized vector, inplace 58 | func (t *SampleNormalizerL2) TransformInplace(dest []float64, vs []float64) { 59 | if t == nil || vs == nil || dest == nil || len(dest) != len(vs) { 60 | return 61 | } 62 | 63 | sum := 0. 64 | for _, v := range vs { 65 | sum += v * v 66 | } 67 | sum = math.Sqrt(sum) 68 | 69 | for i := range dest { 70 | if sum == 0 { 71 | dest[i] = 0 72 | } else { 73 | dest[i] = vs[i] / sum 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 4 | github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= 5 | github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 6 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 7 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 8 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 9 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 10 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 11 | github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= 12 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 13 | github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 14 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 15 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 16 | github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= 17 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 18 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 19 | go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= 20 | go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= 21 | go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= 22 | go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= 23 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 24 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 25 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 26 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 27 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 28 | -------------------------------------------------------------------------------- /transformers/discretization_test.go: -------------------------------------------------------------------------------- 1 | package transformers_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/nikolaydubina/go-featureprocessing/transformers" 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestKBinsDiscretizerTransform(t *testing.T) { 11 | samples := []struct { 12 | name string 13 | quantiles []float64 14 | input float64 15 | output float64 16 | }{ 17 | {"basic1", []float64{25, 50, 75, 100}, 0, 1}, 18 | {"basic2", []float64{25, 50, 75, 100}, 11, 1}, 19 | {"basic3", []float64{25, 50, 75, 100}, 25, 1}, 20 | {"basic4", []float64{25, 50, 75, 100}, 40, 2}, 21 | {"basic5", []float64{25, 50, 75, 100}, 50, 2}, 22 | {"basic6", []float64{25, 50, 75, 100}, 80, 4}, 23 | {"above_max", []float64{25, 50, 75, 100}, 101, 5}, 24 | {"empty", nil, 10, 0}, 25 | } 26 | for _, s := range samples { 27 | t.Run(s.name, func(t *testing.T) { 28 | encoder := KBinsDiscretizer{QuantileScaler{Quantiles: s.quantiles}} 29 | features := encoder.Transform((s.input)) 30 | assert.Equal(t, s.output, features) 31 | }) 32 | } 33 | } 34 | 35 | func TestKBinsDiscretizerTransformFit(t *testing.T) { 36 | samples := []struct { 37 | name string 38 | quantiles []float64 39 | vals []float64 40 | }{ 41 | {"noinput", nil, nil}, 42 | {"basic", []float64{25, 50, 75, 100}, []float64{25, 50, 75, 100}}, 43 | {"reverse_order", []float64{25, 50, 75, 100}, []float64{100, 75, 50, 25}}, 44 | {"negative", []float64{-100, -75, -50, -25}, []float64{-25, -50, -75, -100}}, 45 | {"one_element", []float64{10}, []float64{10}}, 46 | {"less_elements_than_quantiles", []float64{1, 2, 3}, []float64{1, 2, 3}}, 47 | {"less_elements_than_quantiles_negative", []float64{-3, -2, -1}, []float64{-1, -3, -2}}, 48 | } 49 | for _, s := range samples { 50 | t.Run(s.name, func(t *testing.T) { 51 | encoder := KBinsDiscretizer{QuantileScaler{}} 52 | encoder.Fit(s.vals) 53 | assert.Equal(t, KBinsDiscretizer{QuantileScaler{Quantiles: s.quantiles}}, encoder) 54 | }) 55 | } 56 | 57 | t.Run("number of quantiles is larger than num input vals", func(t *testing.T) { 58 | encoder := KBinsDiscretizer{QuantileScaler{Quantiles: []float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}}} 59 | encoder.Fit([]float64{1, 2, 3}) 60 | assert.Equal(t, KBinsDiscretizer{QuantileScaler{Quantiles: []float64{1, 2, 3}}}, encoder) 61 | }) 62 | 63 | t.Run("when fit on nil data not zero value", func(t *testing.T) { 64 | encoder := KBinsDiscretizer{} 65 | encoder.Fit(nil) 66 | assert.Equal(t, KBinsDiscretizer{}, encoder) 67 | }) 68 | } 69 | -------------------------------------------------------------------------------- /cmd/generate/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "fmt" 7 | "go/format" 8 | "io/ioutil" 9 | "log" 10 | "os" 11 | "path/filepath" 12 | "strings" 13 | "text/template" 14 | 15 | "go.uber.org/multierr" 16 | ) 17 | 18 | func run() error { 19 | structName := "" 20 | fileName := os.Getenv("GOFILE") 21 | packageName := os.Getenv("GOPACKAGE") 22 | 23 | flag.StringVar(&structName, 
"struct", "", "struct to be generated for") 24 | flag.Parse() 25 | 26 | if structName == "" || fileName == "" || packageName == "" { 27 | return fmt.Errorf("missing arguments or environment variables") 28 | } 29 | 30 | log.Printf("go-featureprocessing is writing struct transfomer for struct '%s' $GOFILE=%s $GOPACKAGE=%s ", structName, fileName, packageName) 31 | 32 | inputCode, err := ioutil.ReadFile(fileName) 33 | if err != nil { 34 | return fmt.Errorf("can not open input file: %w", err) 35 | } 36 | 37 | params, err := parseCode(fileName, inputCode, structName, packageName) 38 | if err != nil { 39 | return fmt.Errorf("can not parse code: %w", err) 40 | } 41 | 42 | codeFilePath := fmt.Sprintf("%sfp.go", strings.ToLower(structName)) 43 | testFilePath := fmt.Sprintf("%sfp_test.go", strings.ToLower(structName)) 44 | 45 | if err := generate(params, codeFilePath, "templateCode", templateCode); err != nil { 46 | return fmt.Errorf("can not make code: %w", err) 47 | } 48 | if err := generate(params, testFilePath, "templateTests", templateTests); err != nil { 49 | return fmt.Errorf("can not make tests: %w", err) 50 | } 51 | 52 | return nil 53 | } 54 | 55 | func generate(params *TemplateParams, outfilepath string, templateName string, templateVal string) error { 56 | code := bytes.NewBufferString("") 57 | parsedTemplate, err := template.New(templateName).Parse(templateVal) 58 | if err != nil { 59 | return fmt.Errorf("can not initialize template: %w", err) 60 | } 61 | if err := parsedTemplate.Execute(code, params); err != nil { 62 | return fmt.Errorf("can not execute template: %w", err) 63 | } 64 | 65 | if err := writeCodeToFile(code.Bytes(), outfilepath); err != nil { 66 | return fmt.Errorf("can not write code: %w", err) 67 | } 68 | return nil 69 | } 70 | 71 | func writeCodeToFile(code []byte, outfilepath string) (err error) { 72 | formattedCode, err := format.Source(code) 73 | if err != nil { 74 | return fmt.Errorf("can not format code: %w, code: %s", err, code) 75 | } 76 | 77 | if err := os.MkdirAll(filepath.Dir(outfilepath), 0700); err != nil { 78 | return fmt.Errorf("can not make dir for output file: %w", err) 79 | } 80 | 81 | file, err := os.Create(outfilepath) 82 | if err != nil { 83 | return fmt.Errorf("can not create file: %w", err) 84 | } 85 | defer func() { err = multierr.Combine(err, file.Close()) }() 86 | 87 | if _, err := file.Write(formattedCode); err != nil { 88 | return fmt.Errorf("can not write code to file: %w", err) 89 | } 90 | return nil 91 | } 92 | 93 | func main() { 94 | if err := run(); err != nil { 95 | log.Fatalf(fmt.Errorf("go-featureprocessing encountered error: %w", err).Error()) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /structtransformer/structtransformer.go: -------------------------------------------------------------------------------- 1 | package structtransformer 2 | 3 | import ( 4 | "reflect" 5 | ) 6 | 7 | type numericalTransformer interface { 8 | Fit(vals []float64) 9 | Transform(val float64) float64 10 | } 11 | 12 | type stringTransformer interface { 13 | Fit(vals []string) 14 | Transform(val string) float64 15 | } 16 | 17 | type stringExpandingTransformer interface { 18 | Fit(vals []string) 19 | NumFeatures() int 20 | Transform(val string) []float64 21 | } 22 | 23 | // StructTransformer uses reflection to encode struct into feature vector. 24 | // It uses struct tags to create feature transformers for each field. 
25 | // Since it is using reflection, there is a slight overhead for large structs, which can be seen in benchmarks. 26 | // For better performance, use codegen version for your struct, refer to README of this repo. 27 | type StructTransformer struct { 28 | Transformers []interface{} 29 | } 30 | 31 | // Fit will fit all field transformers 32 | func (s *StructTransformer) Fit(_ []interface{}) { 33 | // TODO: go through encoders, make slice for each with data, call fit on that data 34 | panic("not implemented") 35 | } 36 | 37 | // Transform applies all field transformers 38 | func (s *StructTransformer) Transform(v interface{}) []float64 { 39 | if v == nil || s == nil { 40 | return nil 41 | } 42 | 43 | if s.getNumFeatures() == 0 { 44 | return nil 45 | } 46 | 47 | features := make([]float64, 0, s.getNumFeatures()) 48 | 49 | val := reflect.ValueOf(v) 50 | for i := 0; i < val.NumField() && i < len(s.Transformers); i++ { 51 | transformer := s.Transformers[i] 52 | if transformer == nil || reflect.ValueOf(transformer).IsNil() { 53 | continue 54 | } 55 | 56 | field := val.Field(i) 57 | switch field.Type().Kind() { 58 | case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: 59 | features = append(features, s.transformNumerical(transformer, float64(field.Int()))...) 60 | case reflect.Float32, reflect.Float64: 61 | features = append(features, s.transformNumerical(transformer, field.Float())...) 62 | case reflect.String: 63 | features = append(features, s.transformString(transformer, field.String())...) 64 | default: 65 | panic("unsupported type in struct") 66 | } 67 | } 68 | 69 | return features 70 | } 71 | 72 | func (s *StructTransformer) getNumFeatures() int { 73 | count := 0 74 | for _, tr := range s.Transformers { 75 | if tr, ok := tr.(stringExpandingTransformer); ok { 76 | count += tr.NumFeatures() 77 | } else { 78 | count++ 79 | } 80 | } 81 | return count 82 | } 83 | 84 | func (s *StructTransformer) transformNumerical(transformer interface{}, val float64) []float64 { 85 | if transformer, ok := transformer.(numericalTransformer); ok { 86 | return []float64{transformer.Transform(val)} 87 | } 88 | return nil 89 | } 90 | 91 | func (s *StructTransformer) transformString(transformer interface{}, val string) []float64 { 92 | if transformer, ok := transformer.(stringTransformer); ok { 93 | return []float64{transformer.Transform(val)} 94 | } 95 | if transformer, ok := transformer.(stringExpandingTransformer); ok { 96 | return transformer.Transform(val) 97 | } 98 | return nil 99 | } 100 | -------------------------------------------------------------------------------- /transformers/categorical.go: -------------------------------------------------------------------------------- 1 | package transformers 2 | 3 | // OneHotEncoder encodes string value to corresponding index 4 | // 5 | // Mapping should contain all values from 0 to N where N is len(Mapping). 6 | // Responsibility to ensure this is on caller. 7 | // If some index is higher than N or lower than 0, then code will panic. 8 | // If some index is not set, then that index will be skipped. 9 | // If some index is set twice, then index will have effect of either of words. 10 | type OneHotEncoder struct { 11 | Mapping map[string]uint // word to index 12 | } 13 | 14 | // Fit assigns each value from inputs a number 15 | // based on order of occurrence in input data. 16 | // Ignoring empty strings in input. 
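// For example, Fit([]string{"a", "b", "a"}) yields Mapping = map[string]uint{"a": 0, "b": 1}.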
17 | func (t *OneHotEncoder) Fit(vs []string) { 18 | if t == nil || len(vs) == 0 { 19 | return 20 | } 21 | t.Mapping = make(map[string]uint) 22 | for _, v := range vs { 23 | if v == "" { 24 | continue 25 | } 26 | if _, ok := t.Mapping[v]; !ok { 27 | t.Mapping[v] = uint(len(t.Mapping)) 28 | } 29 | } 30 | } 31 | 32 | // NumFeatures returns number of features one field is expanded 33 | func (t *OneHotEncoder) NumFeatures() int { 34 | return len(t.Mapping) 35 | } 36 | 37 | // Transform assigns 1 to value that is found 38 | func (t *OneHotEncoder) Transform(v string) []float64 { 39 | if t == nil || len(t.Mapping) == 0 { 40 | return nil 41 | } 42 | features := make([]float64, t.NumFeatures()) 43 | t.TransformInplace(features, v) 44 | return features 45 | } 46 | 47 | // TransformInplace assigns 1 to value that is found, inplace. 48 | // It is responsibility of a caller to reset destination to 0. 49 | func (t *OneHotEncoder) TransformInplace(dest []float64, v string) { 50 | if t == nil || len(t.Mapping) == 0 || len(dest) != t.NumFeatures() { 51 | return 52 | } 53 | if idx, ok := t.Mapping[v]; ok { 54 | dest[idx] = 1 55 | } 56 | } 57 | 58 | // FeatureNames returns names of each produced value. 59 | func (t *OneHotEncoder) FeatureNames() []string { 60 | if t == nil || len(t.Mapping) == 0 { 61 | return nil 62 | } 63 | names := make([]string, t.NumFeatures()) 64 | for w, i := range t.Mapping { 65 | names[i] = w 66 | } 67 | return names 68 | } 69 | 70 | // OrdinalEncoder returns 0 for string that is not found, or else a number for that string 71 | // 72 | // Mapping should contain all values from 0 to N where N is len(Mapping). 73 | // Responsibility to ensure this is on caller. 74 | // If some index is higher than N or lower than 0, then code will panic. 75 | // If some index is not set, then that index will be skipped. 76 | // If some index is set twice, then index will have effect of either of words. 77 | type OrdinalEncoder struct { 78 | Mapping map[string]uint 79 | } 80 | 81 | // Fit assigns each word value from 1 to N 82 | // Ignoring empty strings in input. 
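// For example, Fit([]string{"bmw", "tesla", "bmw"}) yields Mapping = map[string]uint{"bmw": 1, "tesla": 2}; values not seen during Fit transform to 0.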
83 | func (t *OrdinalEncoder) Fit(vals []string) { 84 | if t == nil || len(vals) == 0 { 85 | return 86 | } 87 | t.Mapping = make(map[string]uint) 88 | for _, v := range vals { 89 | if v == "" { 90 | continue 91 | } 92 | if _, ok := t.Mapping[v]; !ok { 93 | t.Mapping[v] = uint(len(t.Mapping) + 1) 94 | } 95 | } 96 | } 97 | 98 | // Transform returns number of input, if not found returns zero value which is 0 99 | func (t *OrdinalEncoder) Transform(v string) float64 { 100 | if t == nil { 101 | return 0 102 | } 103 | return float64(t.Mapping[v]) 104 | } 105 | -------------------------------------------------------------------------------- /cmd/generate/tests/examplefile.go: -------------------------------------------------------------------------------- 1 | package examplemodule 2 | 3 | // SomeOther is ignored since there is no gencode command in source file 4 | type SomeOther struct { 5 | Name1 float64 6 | Name2 float64 7 | Name3 string 8 | } 9 | 10 | // SomeOtherWithTags is ignored since there is no gencode command in source file, even though it has correct feature tags 11 | type SomeOtherWithTags struct { 12 | Name1 float64 `feature:"minmax"` 13 | Name2 float64 `feature:"maxabs"` 14 | Name3 string `feature:"onehot"` 15 | Name4 string `feature:""` 16 | } 17 | 18 | //go:generate go run github.com/nikolaydubina/go-featureprocessing/cmd/generate -struct=AllTransformers 19 | 20 | // AllTransformers has all transformer 21 | type AllTransformers struct { 22 | Name0 int `feature:"identity"` 23 | Name1 int32 `feature:"minmax"` 24 | Name2 float32 `feature:"maxabs"` 25 | Name3 float64 `feature:"standard"` 26 | Name4 float64 `feature:"quantile"` 27 | Name5 string `feature:"onehot"` 28 | Name6 string `feature:"ordinal"` 29 | Name7 float64 `feature:"kbins"` 30 | Name8 string `feature:"countvectorizer"` 31 | Name9 string `feature:"tfidf"` 32 | } 33 | 34 | //go:generate go run github.com/nikolaydubina/go-featureprocessing/cmd/generate -struct=With32Fields 35 | 36 | // With32Fields has many fields 37 | type With32Fields struct { 38 | Name1 float64 `feature:"minmax"` 39 | Name2 float64 `feature:"minmax"` 40 | Name3 float64 `feature:"minmax"` 41 | Name4 float64 `feature:"minmax"` 42 | Name5 float64 `feature:"minmax"` 43 | Name6 float64 `feature:"minmax"` 44 | Name7 float64 `feature:"minmax"` 45 | Name8 float64 `feature:"minmax"` 46 | Name9 float64 `feature:"minmax"` 47 | Name10 float64 `feature:"minmax"` 48 | Name11 float64 `feature:"minmax"` 49 | Name12 float64 `feature:"minmax"` 50 | Name13 float64 `feature:"minmax"` 51 | Name14 float64 `feature:"minmax"` 52 | Name15 float64 `feature:"minmax"` 53 | Name16 float64 `feature:"minmax"` 54 | Name17 float64 `feature:"minmax"` 55 | Name18 float64 `feature:"minmax"` 56 | Name19 float64 `feature:"minmax"` 57 | Name21 float64 `feature:"minmax"` 58 | Name22 float64 `feature:"minmax"` 59 | Name23 float64 `feature:"minmax"` 60 | Name24 float64 `feature:"minmax"` 61 | Name25 float64 `feature:"minmax"` 62 | Name26 float64 `feature:"minmax"` 63 | Name27 float64 `feature:"minmax"` 64 | Name28 float64 `feature:"minmax"` 65 | Name29 float64 `feature:"minmax"` 66 | Name30 float64 `feature:"minmax"` 67 | Name31 float64 `feature:"minmax"` 68 | Name32 float64 `feature:"minmax"` 69 | } 70 | 71 | //go:generate go run github.com/nikolaydubina/go-featureprocessing/cmd/generate -struct=LargeMemoryTransformer 72 | 73 | // LargeMemoryTransformer has large memory footprint since each transformer is large 74 | type LargeMemoryTransformer struct { 75 | Name1 string `feature:"onehot"` 76 | 
Name2 string `feature:"onehot"` 77 | Name3 string `feature:"ordinal"` 78 | Name4 string `feature:"ordinal"` 79 | Name5 float64 `feature:"quantile"` 80 | Name6 float64 `feature:"quantile"` 81 | Name7 float64 `feature:"kbins"` 82 | Name8 float64 `feature:"kbins"` 83 | } 84 | 85 | //go:generate go run github.com/nikolaydubina/go-featureprocessing/cmd/generate -struct=WeirdTags 86 | 87 | // WeirdTags has unusual but valid tags 88 | type WeirdTags struct { 89 | OnlyFeature float64 `feature:"minmax"` 90 | FeatureNotFirst float64 `json:"name2" feature:"maxabs"` 91 | FirstFeature string `feature:"onehot" json:"some_json_tag"` 92 | Multiline float64 `json:"multiline" feature:"maxabs"` 93 | WithoutFeatureTag string `json:"with_tag"` 94 | 95 | WithoutTag string 96 | 97 | // UTF-8 is allowed 98 | A안녕하세요 int `feature:"minmax"` 99 | B안녕하세요1 string `feature:"onehot"` 100 | C안녕하세요0 string `feature:"tfidf"` 101 | } 102 | -------------------------------------------------------------------------------- /docs/benchmark_compare/python-sklearn/bench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import time 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from sklearn.pipeline import * 9 | from sklearn.compose import * 10 | from sklearn.preprocessing import * 11 | from sklearn.feature_extraction.text import * 12 | 13 | """ 14 | Example from Go: 15 | 16 | // Employee is example from readme 17 | type Employee struct { 18 | Age int `feature:"identity"` 19 | Salary float64 `feature:"minmax"` 20 | Kids int `feature:"maxabs"` 21 | Weight float64 `feature:"standard"` 22 | Height float64 `feature:"quantile"` 23 | City string `feature:"onehot"` 24 | Car string `feature:"ordinal"` 25 | Income float64 `feature:"kbins"` 26 | Description string `feature:"tfidf"` 27 | SecretValue float64 28 | } 29 | """ 30 | 31 | parser = argparse.ArgumentParser(description='Benchmarking feature preprocessing from structs for sklearn') 32 | parser.add_argument('--nsamples', type=int, default=100000, help='Number of samples') 33 | parser.add_argument('--ntrials', type=int, default=20, help='Number of trials') 34 | parser.add_argument('--ntrialsgroup', type=int, default=20, help='Number of trials') 35 | args = parser.parse_args() 36 | 37 | nsamples = args.nsamples 38 | ntrials = args.ntrials 39 | 40 | setupstartt = time.perf_counter_ns() 41 | 42 | samples = [ 43 | { 44 | 'age': int(random.uniform(1, 100)), 45 | 'salary': random.uniform(0, 9000), 46 | 'kids': int(random.uniform(1, 10)), 47 | 'weight': random.uniform(1, 200), 48 | 'height': random.uniform(1, 200), 49 | 'city': random.choice(["seoul", "pangyo", "daejeon", "busan", "something_else"]), 50 | 'car': random.choice(["bmw", "tesla", "volvo", "hyndai", "something_else"]), 51 | 'income': random.uniform(1, 200), 52 | 'description': "some very long description here some very long description here some very long description here some very long description here ", 53 | 'secret': 42.1, 54 | } 55 | for i in range(nsamples) 56 | ] 57 | df = pd.DataFrame.from_records(samples, nrows=nsamples) 58 | 59 | corpus = ['this is the first document', 'this document is the second document', 'and this is the third one', 'is this the first document'] 60 | vocabulary = ['this', 'document', 'first', 'is', 'second', 'the', 'and', 'one'] 61 | pipeTfidf = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)), ('tfid', TfidfTransformer())]) 62 | 63 | preprocessor = ColumnTransformer( 64 | transformers=[ 65 | ('age', 
StandardScaler(), ["age"]), 66 | ('salary', MinMaxScaler(), ["salary"]), 67 | ('kids', MaxAbsScaler(), ["kids"]), 68 | ('weight', StandardScaler(), ["weight"]), 69 | ('height', Normalizer(), ["height"]), 70 | ('city', OneHotEncoder(), ["city"]), 71 | ('car', OrdinalEncoder(), ["car"]), 72 | ('income', KBinsDiscretizer(), ["income"]), 73 | #('description', pipeTfidf, ["description"]), #cant not run it 74 | ], 75 | ) 76 | tr = preprocessor.fit(df) 77 | 78 | setupendt = time.perf_counter_ns() 79 | 80 | def benchmark(): 81 | data = tr.transform(df) 82 | 83 | # evaluate 84 | # perf_counter_ns ~ 83ns precision 85 | # monotonic_ns ~ 83ns precision 86 | # process_time_ns ~ 2ms precision 87 | # https://www.python.org/dev/peps/pep-0564/ 88 | runs = np.zeros(ntrials) 89 | for i in range(ntrials): 90 | tic = time.perf_counter_ns() 91 | for j in range(args.ntrialsgroup): 92 | data = tr.transform(df) 93 | toc = time.perf_counter_ns() 94 | runs[i] = (toc - tic) / args.ntrialsgroup 95 | 96 | print(f"nsamples={nsamples}\t ntrials={ntrials}\t ntrialsgroup={args.ntrialsgroup}\t avg={int(np.mean(runs))} ns\t min={int(np.min(runs))} ns\t max={int(np.max(runs))} ns\t samples_dataframe_size={df.memory_usage(index=False, deep=True)[1].sum()} B setuptook={int(setupendt - setupstartt)} ns ") 97 | -------------------------------------------------------------------------------- /transformers/scalers.go: -------------------------------------------------------------------------------- 1 | package transformers 2 | 3 | import ( 4 | "math" 5 | "sort" 6 | ) 7 | 8 | // Identity is a transformer that returns unmodified input value 9 | type Identity struct{} 10 | 11 | // Fit is not used, it is here only to keep same interface as rest of transformers 12 | func (t *Identity) Fit(_ []float64) {} 13 | 14 | // Transform returns same value as input 15 | func (t *Identity) Transform(v float64) float64 { 16 | return v 17 | } 18 | 19 | // MinMaxScaler is a transformer that rescales value into range between min and max 20 | type MinMaxScaler struct { 21 | Min float64 22 | Max float64 23 | } 24 | 25 | // Fit findx min and max value in range 26 | func (t *MinMaxScaler) Fit(vals []float64) { 27 | for i, v := range vals { 28 | if i == 0 { 29 | t.Min = v 30 | t.Max = v 31 | } 32 | if v < t.Min { 33 | t.Min = v 34 | } 35 | if v > t.Max { 36 | t.Max = v 37 | } 38 | } 39 | } 40 | 41 | // Transform scales value from 0 to 1 linearly 42 | func (t *MinMaxScaler) Transform(v float64) float64 { 43 | if t.Min == t.Max { 44 | return 0 45 | } 46 | if v < t.Min { 47 | return 0. 48 | } 49 | if v > t.Max { 50 | return 1. 51 | } 52 | return (v - t.Min) / (t.Max - t.Min) 53 | } 54 | 55 | // MaxAbsScaler transforms value into -1 to +1 range linearly 56 | type MaxAbsScaler struct { 57 | Max float64 58 | } 59 | 60 | // Fit finds maximum abssolute value 61 | func (t *MaxAbsScaler) Fit(vals []float64) { 62 | for i, v := range vals { 63 | if i == 0 { 64 | t.Max = v 65 | } 66 | if math.Abs(v) > t.Max { 67 | t.Max = math.Abs(v) 68 | } 69 | } 70 | } 71 | 72 | // Transform scales value into -1 to +1 range 73 | func (t *MaxAbsScaler) Transform(v float64) float64 { 74 | if t.Max == 0 { 75 | return 0 76 | } 77 | if v > math.Abs(t.Max) { 78 | return 1. 79 | } 80 | if v < -math.Abs(t.Max) { 81 | return -1. 82 | } 83 | return v / math.Abs(t.Max) 84 | } 85 | 86 | // StandardScaler transforms feature into normal standard distribution. 
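// Transform computes z = (v - Mean) / STD, where Mean and STD are estimated by Fit from the training values.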
87 | type StandardScaler struct { 88 | Mean float64 89 | STD float64 90 | } 91 | 92 | // Fit computes mean and standard deviation 93 | func (t *StandardScaler) Fit(vals []float64) { 94 | sum := 0. 95 | for _, v := range vals { 96 | sum += v 97 | } 98 | if len(vals) > 0 { 99 | t.Mean = sum / float64(len(vals)) 100 | t.STD = std(vals, t.Mean) 101 | } 102 | } 103 | 104 | // Transform centralizes and scales based on standard deviation and mean 105 | func (t *StandardScaler) Transform(v float64) float64 { 106 | return (v - t.Mean) / t.STD 107 | } 108 | 109 | // QuantileScaler transforms any distribution to uniform distribution 110 | // This is done by mapping values to quantiles they belong to. 111 | type QuantileScaler struct { 112 | Quantiles []float64 113 | } 114 | 115 | // Fit sets parameters for quantiles based on input. 116 | // Number of quantiles are specified by size of Quantiles slice. 117 | // If it is empty or nil, then 100 is used as default. 118 | // If input is smaller than number of quantiles, then using length of input. 119 | func (t *QuantileScaler) Fit(vals []float64) { 120 | if len(vals) == 0 { 121 | return 122 | } 123 | if len(t.Quantiles) == 0 { 124 | t.Quantiles = make([]float64, 100) 125 | } 126 | if len(vals) < len(t.Quantiles) { 127 | t.Quantiles = t.Quantiles[:len(vals)] 128 | } 129 | 130 | sorted := make([]float64, len(vals)) 131 | copy(sorted, vals) 132 | sort.Float64s(sorted) 133 | 134 | f := float64(len(sorted)) / float64(len(t.Quantiles)) 135 | for i := range t.Quantiles { 136 | idx := int(float64(i) * f) 137 | t.Quantiles[i] = sorted[idx] 138 | } 139 | } 140 | 141 | // Transform changes distribution into uniform one from 0 to 1 142 | func (t *QuantileScaler) Transform(v float64) float64 { 143 | if t == nil || len(t.Quantiles) == 0 { 144 | return 0 145 | } 146 | i := sort.SearchFloat64s(t.Quantiles[:], v) 147 | if i >= len(t.Quantiles) { 148 | return 1. 149 | } 150 | return float64(i+1) / float64(len(t.Quantiles)) 151 | } 152 | -------------------------------------------------------------------------------- /transformers/samplenormalizers_test.go: -------------------------------------------------------------------------------- 1 | package transformers_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . 
"github.com/nikolaydubina/go-featureprocessing/transformers" 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestSampleNormalizserL1(t *testing.T) { 11 | samples := []struct { 12 | name string 13 | input []float64 14 | output []float64 15 | }{ 16 | {"basic", []float64{1, 2, 3, 4}, []float64{0.1, 0.2, 0.3, 0.4}}, 17 | {"empty", []float64{}, []float64{}}, 18 | {"nil", nil, nil}, 19 | {"zeros", []float64{0, 0, 0}, []float64{0, 0, 0}}, 20 | {"zeros_single", []float64{0}, []float64{0}}, 21 | {"single", []float64{5}, []float64{1}}, 22 | {"single_negative", []float64{-5}, []float64{-1}}, 23 | {"negative", []float64{1, 2, 3, -4}, []float64{0.1, 0.2, 0.3, -0.4}}, 24 | } 25 | 26 | for _, s := range samples { 27 | t.Run(s.name, func(t *testing.T) { 28 | encoder := SampleNormalizerL1{} 29 | features := encoder.Transform((s.input)) 30 | assert.Equal(t, s.output, features) 31 | 32 | // inplace 33 | if len(s.output) > 0 { 34 | features := make([]float64, len(s.input)) 35 | encoder.TransformInplace(features, s.input) 36 | assert.Equal(t, s.output, features) 37 | 38 | features = make([]float64, len(s.input)+100) 39 | features[0] = 11223344556677 40 | features[1] = 10101010110101 41 | features[99] = 223312112233 42 | copy(features[10:], s.output) 43 | expected := make([]float64, len(features)) 44 | copy(expected, features) 45 | 46 | encoder.TransformInplace(features[10:10+len(s.input)], s.input) 47 | assert.Equal(t, expected, features) 48 | } 49 | }) 50 | } 51 | 52 | t.Run("fit", func(t *testing.T) { 53 | encoder := SampleNormalizerL1{} 54 | encoder.Fit(nil) 55 | assert.Equal(t, SampleNormalizerL1{}, encoder) 56 | }) 57 | 58 | t.Run("inplace does not run when input mismatches", func(t *testing.T) { 59 | encoder := SampleNormalizerL1{} 60 | f := []float64{1, 2} 61 | encoder.TransformInplace(f, []float64{1, 2, 3, 4}) 62 | assert.Equal(t, []float64{1, 2}, f) 63 | }) 64 | } 65 | 66 | func TestSampleNormalizserL2(t *testing.T) { 67 | samples := []struct { 68 | name string 69 | input []float64 70 | output []float64 71 | }{ 72 | {"basic", []float64{1, 1, 3, 5, 8}, []float64{0.1, 0.1, 0.3, 0.5, 0.8}}, 73 | {"empty", []float64{}, []float64{}}, 74 | {"nil", nil, nil}, 75 | {"zeros", []float64{0, 0, 0}, []float64{0, 0, 0}}, 76 | {"zeros_single", []float64{0}, []float64{0}}, 77 | {"single", []float64{5}, []float64{1}}, 78 | {"single_negative", []float64{-5}, []float64{-1}}, 79 | {"basic", []float64{1, 1, -3, 5, -8}, []float64{0.1, 0.1, -0.3, 0.5, -0.8}}, 80 | } 81 | 82 | for _, s := range samples { 83 | t.Run(s.name, func(t *testing.T) { 84 | encoder := SampleNormalizerL2{} 85 | features := encoder.Transform((s.input)) 86 | assert.Equal(t, s.output, features) 87 | }) 88 | 89 | if len(s.output) > 0 { 90 | t.Run(s.name+"_inplace", func(t *testing.T) { 91 | encoder := SampleNormalizerL2{} 92 | 93 | features := make([]float64, len(s.input)) 94 | encoder.TransformInplace(features, s.input) 95 | assert.Equal(t, s.output, features) 96 | 97 | features = make([]float64, len(s.input)+100) 98 | features[0] = 1 99 | features[1] = 2 100 | features[10] = 12312 // has to overwrite this 101 | features[99] = 5 102 | 103 | expected := make([]float64, len(features)) 104 | copy(expected, features) 105 | copy(expected[10:], s.output) 106 | 107 | encoder.TransformInplace(features[10:10+len(s.input)], s.input) 108 | assert.Equal(t, expected, features) 109 | }) 110 | } 111 | } 112 | 113 | t.Run("fit", func(t *testing.T) { 114 | encoder := SampleNormalizerL2{} 115 | encoder.Fit(nil) 116 | assert.Equal(t, SampleNormalizerL2{}, 
encoder) 117 | }) 118 | 119 | t.Run("inplace does not run when input mismatches", func(t *testing.T) { 120 | encoder := SampleNormalizerL2{} 121 | f := []float64{1, 2} 122 | encoder.TransformInplace(f, []float64{1, 2, 3, 4}) 123 | assert.Equal(t, []float64{1, 2}, f) 124 | }) 125 | } 126 | -------------------------------------------------------------------------------- /docs/benchmarks/macbook_2017: -------------------------------------------------------------------------------- 1 | GOMAXPROCS=8 go test -timeout=1h -bench=. -benchtime=10s -benchmem ./... 2 | ? github.com/nikolaydubina/go-featureprocessing/cmd/generate [no test files] 3 | goos: darwin 4 | goarch: amd64 5 | pkg: github.com/nikolaydubina/go-featureprocessing/cmd/generate/tests 6 | BenchmarkAllTransformersFeatureTransformer_Fit_100elements-8 84565 154175 ns/op 129252 B/op 343 allocs/op 7 | BenchmarkAllTransformersFeatureTransformer_Fit_1000elements-8 3060 3842749 ns/op 8362158 B/op 3122 allocs/op 8 | BenchmarkAllTransformersFeatureTransformer_Fit_10000elements-8 58 198200932 ns/op 785001236 B/op 30662 allocs/op 9 | BenchmarkAllTransformersFeatureTransformer_Transform-8 48809920 227 ns/op 288 B/op 1 allocs/op 10 | BenchmarkAllTransformersFeatureTransformer_Transform_Inplace-8 81078682 167 ns/op 0 B/op 0 allocs/op 11 | BenchmarkAllTransformersFeatureTransformer_TransformAll_10elems-8 5265136 2400 ns/op 3072 B/op 1 allocs/op 12 | BenchmarkAllTransformersFeatureTransformer_TransformAll_100elems-8 525598 25025 ns/op 32768 B/op 1 allocs/op 13 | BenchmarkAllTransformersFeatureTransformer_TransformAll_1000elems-8 39865 304406 ns/op 303105 B/op 1 allocs/op 14 | BenchmarkAllTransformersFeatureTransformer_TransformAll_10000elems-8 4161 3034926 ns/op 3047428 B/op 1 allocs/op 15 | BenchmarkAllTransformersFeatureTransformer_TransformAll_100000elems-8 388 31737672 ns/op 29605890 B/op 1 allocs/op 16 | BenchmarkAllTransformersFeatureTransformer_TransformAll_1000000elems-8 34 306209484 ns/op 296001536 B/op 1 allocs/op 17 | BenchmarkAllTransformersFeatureTransformer_TransformAll_10elems_8workers-8 1508948 7915 ns/op 3088 B/op 2 allocs/op 18 | BenchmarkAllTransformersFeatureTransformer_TransformAll_100elems_8workers-8 362482 32626 ns/op 32784 B/op 2 allocs/op 19 | BenchmarkAllTransformersFeatureTransformer_TransformAll_1000elems_8workers-8 58651 205823 ns/op 303121 B/op 2 allocs/op 20 | BenchmarkAllTransformersFeatureTransformer_TransformAll_10000elems_8workers-8 6986 1947879 ns/op 2801686 B/op 2 allocs/op 21 | BenchmarkAllTransformersFeatureTransformer_TransformAll_100000elems_8workers-8 668 17773264 ns/op 30400529 B/op 2 allocs/op 22 | BenchmarkAllTransformersFeatureTransformer_TransformAll_1000000elems_8workers-8 67 174749358 ns/op 280002579 B/op 2 allocs/op 23 | BenchmarkAllTransformersFeatureTransformer_TransformAll_5000000elems_8workers-8 6 2465129104 ns/op 1480007696 B/op 2 allocs/op 24 | BenchmarkAllTransformersFeatureTransformer_TransformAll_15000000elems_8workers-8 1 24361808755 ns/op 4560003088 B/op 2 allocs/op 25 | BenchmarkAllTransformersFeatureTransformer_Transform_LargeComposites_100elements-8 8935842 1397 ns/op 2688 B/op 1 allocs/op 26 | BenchmarkAllTransformersFeatureTransformer_Transform_LargeComposites_1000elements-8 960136 11899 ns/op 24576 B/op 1 allocs/op 27 | BenchmarkAllTransformersFeatureTransformer_Transform_LargeComposites_10000elements-8 139093 74061 ns/op 229376 B/op 1 allocs/op 28 | BenchmarkAllTransformersFeatureTransformer_Transform_LargeComposites_100000elements-8 13260 809367 ns/op 2252800 B/op 1 allocs/op 29 | 
BenchmarkEmployeeFeatureTransformer_Fit_100elements-8 92524 130821 ns/op 117716 B/op 232 allocs/op 30 | BenchmarkEmployeeFeatureTransformer_Fit_1000elements-8 2898 4025417 ns/op 8296921 B/op 2090 allocs/op 31 | BenchmarkEmployeeFeatureTransformer_Fit_10000elements-8 51 217919326 ns/op 784843594 B/op 20396 allocs/op 32 | BenchmarkEmployeeFeatureTransformer_Transform-8 signal: interrupt 33 | FAIL github.com/nikolaydubina/go-featureprocessing/cmd/generate/tests 668.566s 34 | -------------------------------------------------------------------------------- /cmd/generate/templatecode.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | const templateCode = ` 4 | // Code generated by go-featureprocessing DO NOT EDIT 5 | 6 | package {{$.PackageName}} 7 | 8 | import ( 9 | "sync" 10 | 11 | fp "github.com/nikolaydubina/go-featureprocessing/transformers" 12 | ) 13 | 14 | // {{$.StructName}}FeatureTransformer is a feature processor for {{$.StructName}}. 15 | // It was automatically generated by go-featureprocessing tool. 16 | type {{$.StructName}}FeatureTransformer struct { 17 | {{range $i, $tr := $.Fields}}{{$tr.Name}} fp.{{$tr.Transformer}} ` + "`" + `json:"{{$tr.Name}}_{{$tr.TransformerTag}}"` + "`" + ` 18 | {{end}} 19 | } 20 | 21 | // Fit fits transformer for each field 22 | func (e *{{$.StructName}}FeatureTransformer) Fit(s []{{$.StructName}}) { 23 | if e == nil || len(s) == 0 { 24 | return 25 | } 26 | 27 | {{if $.HasNumericalTransformers}}dataNum := make([]float64, len(s)){{end}} 28 | {{if $.HasStringTransformers}}dataStr := make([]string, len(s)){{end}} 29 | 30 | {{range $i, $tr := $.Fields}} 31 | 32 | for i, v := range s { 33 | {{if $tr.NumericalInput }}dataNum[i] = float64(v.{{$tr.Name}}){{else}}dataStr[i] = v.{{$tr.Name}}{{end}} 34 | } 35 | 36 | e.{{$tr.Name}}.Fit({{if $tr.NumericalInput }}dataNum{{else}}dataStr{{end}}) 37 | 38 | {{end}} 39 | } 40 | 41 | // Transform transforms struct into feature vector accordingly to transformers 42 | func (e *{{$.StructName}}FeatureTransformer) Transform(s *{{$.StructName}}) []float64 { 43 | if s == nil || e == nil { 44 | return nil 45 | } 46 | features := make([]float64, e.NumFeatures()) 47 | e.TransformInplace(features, s) 48 | return features 49 | } 50 | 51 | // TransformInplace transforms struct into feature vector accordingly to transformers, and does so inplace 52 | func (e *{{$.StructName}}FeatureTransformer) TransformInplace(dst []float64, s *{{$.StructName}}) { 53 | if s == nil || e == nil || len(dst) != e.NumFeatures() { 54 | return 55 | } 56 | idx := 0 57 | {{range $i, $tr := $.Fields}} 58 | {{if $tr.Expanding }}e.{{$tr.Name}}.TransformInplace(dst[idx:idx + e.{{$tr.Name}}.NumFeatures()], s.{{$tr.Name}}) 59 | idx += e.{{$tr.Name}}.NumFeatures() 60 | {{else}}dst[idx] = e.{{$tr.Name}}.Transform( {{if $tr.NumericalInput }}float64{{end}}( s.{{$tr.Name}} )) 61 | idx++ 62 | {{end}} 63 | {{end}} 64 | } 65 | 66 | // TransformAll transforms a slice of {{$.StructName}} 67 | func (e *{{$.StructName}}FeatureTransformer) TransformAll(s []{{$.StructName}}) []float64 { 68 | if e == nil { 69 | return nil 70 | } 71 | features := make([]float64, len(s) * e.NumFeatures()) 72 | e.TransformAllInplace(features, s) 73 | return features 74 | } 75 | 76 | // TransformAllInplace transforms a slice of {{$.StructName}} inplace 77 | func (e *{{$.StructName}}FeatureTransformer) TransformAllInplace(dst []float64, s []{{$.StructName}}) { 78 | if e == nil { 79 | return 80 | } 81 | n := e.NumFeatures() 82 | if len(dst) != n 
* len(s) { 83 | return 84 | } 85 | for i := range s { 86 | e.TransformInplace(dst[i * n: (i + 1) * n], &s[i]) 87 | } 88 | } 89 | 90 | // TransformAllParallel transforms a slice of {{$.StructName}} in parallel 91 | func (e *{{$.StructName}}FeatureTransformer) TransformAllParallel(s []{{$.StructName}}, nworkers uint) []float64 { 92 | if e == nil { 93 | return nil 94 | } 95 | features := make([]float64, len(s) * e.NumFeatures()) 96 | e.TransformAllInplaceParallel(features, s, nworkers) 97 | return features 98 | } 99 | 100 | // TransformAllInplaceParallel transforms a slice of {{$.StructName}} inplace parallel 101 | // Useful for very large slices. 102 | func (e *{{$.StructName}}FeatureTransformer) TransformAllInplaceParallel(dst []float64, s []{{$.StructName}}, nworkers uint) { 103 | if e == nil || nworkers == 0 { 104 | return 105 | } 106 | ns := uint(len(s)) 107 | nf := uint(e.NumFeatures()) 108 | if uint(len(dst)) != nf * ns { 109 | return 110 | } 111 | 112 | nbatch := ns / nworkers 113 | var wg sync.WaitGroup 114 | 115 | for i := uint(0); i < nworkers; i++ { 116 | wg.Add(1) 117 | go func (i uint) { 118 | defer wg.Done() 119 | iStart := nbatch * i 120 | iEnd := nbatch * (i + 1) 121 | if i == (nworkers - 1) { 122 | iEnd = ns 123 | } 124 | e.TransformAllInplace(dst[iStart * nf: iEnd * nf], s[iStart:iEnd]) 125 | } (i); 126 | } 127 | 128 | wg.Wait() 129 | } 130 | 131 | // NumFeatures returns number of features in output feature vector 132 | func (e *{{$.StructName}}FeatureTransformer) NumFeatures() int { 133 | if e == nil { 134 | return 0 135 | } 136 | 137 | count := {{$.NumFieldsFlat}} 138 | {{range $i, $tr := $.Fields}}{{if $tr.Expanding}}count += e.{{$tr.Name}}.NumFeatures(){{end}} 139 | {{end}} 140 | return count 141 | } 142 | 143 | // FeatureNames provides names of features that match output of transform 144 | func (e *{{$.StructName}}FeatureTransformer) FeatureNames() []string { 145 | if e == nil { 146 | return nil 147 | } 148 | 149 | idx := 0 150 | names := make([]string, e.NumFeatures()) 151 | 152 | {{range $i, $tr := $.Fields}} 153 | {{if $tr.Expanding }} 154 | for _, w := range e.{{$tr.Name}}.FeatureNames() { 155 | names[idx] = "{{$tr.Name}}_" + w 156 | idx++ 157 | } 158 | {{else}} 159 | names[idx] = "{{$tr.Name}}" 160 | idx++ 161 | {{end}} 162 | {{end}} 163 | 164 | return names 165 | } 166 | ` 167 | -------------------------------------------------------------------------------- /transformers/categorical_test.go: -------------------------------------------------------------------------------- 1 | package transformers_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . 
"github.com/nikolaydubina/go-featureprocessing/transformers" 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestOneHotEncoderFit(t *testing.T) { 11 | samples := []struct { 12 | name string 13 | input []string 14 | output map[string]uint 15 | n int 16 | }{ 17 | {"basic", []string{"a", "b", "a", "a", "a"}, map[string]uint{"a": 0, "b": 1}, 2}, 18 | {"empty", []string{}, nil, 0}, 19 | {"nil", nil, nil, 0}, 20 | {"same_string", []string{"a", "a", "a"}, map[string]uint{"a": 0}, 1}, 21 | {"empty_string", []string{"", "", ""}, map[string]uint{}, 0}, 22 | {"zeros_single", []string{""}, map[string]uint{}, 0}, 23 | {"single", []string{"a"}, map[string]uint{"a": 0}, 1}, 24 | } 25 | for _, s := range samples { 26 | t.Run(s.name, func(t *testing.T) { 27 | encoder := OneHotEncoder{} 28 | encoder.Fit(s.input) 29 | assert.Equal(t, OneHotEncoder{Mapping: s.output}, encoder) 30 | assert.Equal(t, s.n, encoder.NumFeatures()) 31 | }) 32 | } 33 | } 34 | 35 | func TestOneHotEncoderTransform(t *testing.T) { 36 | samples := []struct { 37 | name string 38 | mapping map[string]uint 39 | input string 40 | output []float64 41 | }{ 42 | {"basic", map[string]uint{"a": 0, "b": 1}, "a", []float64{1, 0}}, 43 | {"basic", map[string]uint{"a": 0, "b": 1}, "b", []float64{0, 1}}, 44 | {"none", map[string]uint{"a": 0, "b": 1}, "c", []float64{0, 0}}, 45 | {"empty_input", map[string]uint{"a": 0, "b": 1}, "", []float64{0, 0}}, 46 | {"empty_vals", nil, "a", nil}, 47 | {"nil_vals", nil, "a", nil}, 48 | {"zeros_single", map[string]uint{"": 0}, "", []float64{1}}, 49 | {"single", map[string]uint{"a": 0}, "a", []float64{1}}, 50 | } 51 | 52 | for _, s := range samples { 53 | t.Run(s.name, func(t *testing.T) { 54 | encoder := OneHotEncoder{Mapping: s.mapping} 55 | assert.Equal(t, s.output, encoder.Transform(s.input)) 56 | }) 57 | 58 | if len(s.output) > 0 { 59 | t.Run(s.name+"_inplace", func(t *testing.T) { 60 | encoder := OneHotEncoder{Mapping: s.mapping} 61 | assert.Equal(t, s.output, encoder.Transform(s.input)) 62 | 63 | features := make([]float64, encoder.NumFeatures()) 64 | encoder.TransformInplace(features, s.input) 65 | assert.Equal(t, s.output, features) 66 | 67 | features = make([]float64, encoder.NumFeatures()+100) 68 | features[0] = 11223344556677 69 | features[1] = 10101010110101 70 | features[99] = 12312312312312 71 | 72 | expected := make([]float64, len(features)) 73 | copy(expected, features) 74 | copy(expected[10:], s.output) 75 | 76 | encoder.TransformInplace(features[10:10+encoder.NumFeatures()], s.input) 77 | assert.Equal(t, expected, features) 78 | }) 79 | } 80 | } 81 | 82 | t.Run("inplace does not compute when input is wrong", func(t *testing.T) { 83 | encoder := OneHotEncoder{Mapping: map[string]uint{"a": 0, "b": 1}} 84 | features := []float64{1.1, 2.1, 3.1, 4.1} 85 | encoder.TransformInplace(features, "a") 86 | assert.Equal(t, []float64{1.1, 2.1, 3.1, 4.1}, features) 87 | }) 88 | 89 | t.Run("transform when encoder is nil", func(t *testing.T) { 90 | var encoder *OneHotEncoder 91 | assert.Equal(t, []float64(nil), encoder.Transform("abcd")) 92 | }) 93 | } 94 | 95 | func TestOneHotEncoderFeatureNames(t *testing.T) { 96 | t.Run("feature names on empty transformer", func(t *testing.T) { 97 | var encoder *OneHotEncoder 98 | assert.Equal(t, []string(nil), encoder.FeatureNames()) 99 | }) 100 | 101 | t.Run("feature names", func(t *testing.T) { 102 | encoder := OneHotEncoder{Mapping: map[string]uint{"a": 0, "b": 1}} 103 | assert.Equal(t, []string{"a", "b"}, encoder.FeatureNames()) 104 | }) 105 | } 106 | 107 | func 
TestOrdinalEncoderFit(t *testing.T) { 108 | samples := []struct { 109 | name string 110 | input []string 111 | output map[string]uint 112 | }{ 113 | {"basic", []string{"a", "b", "a", "a", "a"}, map[string]uint{"a": 1, "b": 2}}, 114 | {"empty", []string{}, nil}, 115 | {"nil", nil, nil}, 116 | {"same_string", []string{"a", "a", "a"}, map[string]uint{"a": 1}}, 117 | {"empty_string", []string{"", "", ""}, map[string]uint{}}, 118 | {"zeros_single", []string{""}, map[string]uint{}}, 119 | {"single", []string{"a"}, map[string]uint{"a": 1}}, 120 | } 121 | 122 | for _, s := range samples { 123 | t.Run(s.name, func(t *testing.T) { 124 | encoder := OrdinalEncoder{} 125 | encoder.Fit(s.input) 126 | assert.Equal(t, OrdinalEncoder{Mapping: s.output}, encoder) 127 | }) 128 | } 129 | } 130 | 131 | func TestOrdinalEncoderTransform(t *testing.T) { 132 | samples := []struct { 133 | name string 134 | vals map[string]uint 135 | input string 136 | output float64 137 | }{ 138 | {"basic", map[string]uint{"a": 1, "b": 3}, "a", 1}, 139 | {"basic", map[string]uint{"a": 1, "b": 3}, "b", 3}, 140 | {"none", map[string]uint{"a": 1, "b": 3}, "c", 0}, 141 | {"empty_input", map[string]uint{"a": 1, "b": 3}, "", 0}, 142 | {"empty_vals", map[string]uint{}, "a", 0}, 143 | {"nil_vals", nil, "a", 0}, 144 | {"zero_single", map[string]uint{"": 1}, "", 1}, 145 | {"single", map[string]uint{"a": 1}, "a", 1}, 146 | } 147 | for _, s := range samples { 148 | t.Run(s.name, func(t *testing.T) { 149 | encoder := OrdinalEncoder{Mapping: s.vals} 150 | assert.Equal(t, s.output, encoder.Transform(s.input)) 151 | }) 152 | } 153 | 154 | t.Run("transform when encoder is nil", func(t *testing.T) { 155 | var encoder *OrdinalEncoder 156 | assert.Equal(t, 0., encoder.Transform("abcd")) 157 | }) 158 | } 159 | -------------------------------------------------------------------------------- /transformers/textprocesors.go: -------------------------------------------------------------------------------- 1 | package transformers 2 | 3 | import ( 4 | "math" 5 | "strings" 6 | ) 7 | 8 | // CountVectorizer performs bag of words encoding of text. 9 | // 10 | // Separator should not be a part of any word. 11 | // Responsibility to ensure this is on the caller. 12 | // Words that have the separator as a substring will be omitted. 13 | // 14 | // Mapping should contain all values from 0 to N-1, where N is len(Mapping). 15 | // Responsibility to ensure this is on the caller. 16 | // If some index is N or larger, or lower than 0, then the code will panic. 17 | // If some index is not set, then that index will be skipped. 18 | // If some index is set twice, then that index will hold the sum of counts for both words.
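// Example (illustrative sketch; the mapping and counts follow from Fit and Transform below):
//
//	cv := CountVectorizer{}              // Separator defaults to " "
//	cv.Fit([]string{"a b", "b a c"})     // Mapping: {"a": 0, "b": 1, "c": 2}
//	counts := cv.Transform("a a c")      // -> []float64{2, 0, 1}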
19 | type CountVectorizer struct { 20 | Mapping map[string]uint // word to index 21 | Separator string // default space 22 | } 23 | 24 | // Fit assigns a number from 0 to N-1 to each distinct word in input, where N is the number of distinct words 25 | func (t *CountVectorizer) Fit(vals []string) { 26 | if t.Separator == "" { 27 | t.Separator = " " 28 | } 29 | if len(vals) == 0 { 30 | return 31 | } 32 | t.Mapping = make(map[string]uint) 33 | var count uint = 0 34 | for _, v := range vals { 35 | ws := strings.Split(v, t.Separator) 36 | for _, w := range ws { 37 | if w == "" { 38 | continue 39 | } 40 | if _, ok := t.Mapping[w]; !ok { 41 | t.Mapping[w] = count 42 | count++ 43 | } 44 | } 45 | } 46 | } 47 | 48 | // NumFeatures returns the number of features produced for a single input field 49 | func (t *CountVectorizer) NumFeatures() int { 50 | if t == nil { 51 | return 0 52 | } 53 | return len(t.Mapping) 54 | } 55 | 56 | // Transform counts how many times each word appeared in input 57 | func (t *CountVectorizer) Transform(v string) []float64 { 58 | if t == nil || v == "" || len(t.Mapping) == 0 { 59 | return nil 60 | } 61 | counts := make([]float64, t.NumFeatures()) 62 | t.TransformInplace(counts, v) 63 | return counts 64 | } 65 | 66 | // FeatureNames returns a slice with the produced feature names 67 | func (t *CountVectorizer) FeatureNames() []string { 68 | if t == nil || len(t.Mapping) == 0 { 69 | return nil 70 | } 71 | 72 | names := make([]string, t.NumFeatures()) 73 | for w, i := range t.Mapping { 74 | names[i] = w 75 | } 76 | return names 77 | } 78 | 79 | // TransformInplace counts how many times each word appeared in input, inplace version. 80 | // It is the responsibility of the caller to zero out the destination. 81 | // Uses a zero-allocation scanning algorithm instead of `strings.Split`. 82 | // Relies on the fact that a string is a slice of bytes. 83 | // Works fine with UTF-8. 84 | func (t *CountVectorizer) TransformInplace(dest []float64, v string) { 85 | if t == nil || t.Separator == "" || len(t.Mapping) == 0 || len(dest) != t.NumFeatures() { 86 | return 87 | } 88 | sep := t.Separator 89 | 90 | n := strings.Count(v, sep) 91 | if n == 0 { 92 | // no separators, try to match whole string 93 | if idx, ok := t.Mapping[v]; ok { 94 | dest[idx] = 1 95 | } 96 | return 97 | } 98 | 99 | j := 0 // looking for position of separator in v starting from here 100 | for i := 0; i < n; i++ { 101 | // we are guaranteed to find next separator, m >= 0 102 | m := strings.Index(v[j:], sep) 103 | 104 | // word between separators 105 | if idx, ok := t.Mapping[v[j:j+m]]; ok { 106 | dest[idx]++ 107 | } 108 | 109 | // increment by current word length and separator length 110 | j += m + len(sep) 111 | } 112 | if j != len(v) { 113 | // if string did not end with separator, it ended with word 114 | if idx, ok := t.Mapping[v[j:]]; ok { 115 | dest[idx]++ 116 | } 117 | } 118 | } 119 | 120 | // TFIDFVectorizer performs tf-idf vectorization on top of count vectorization. 121 | // Based on: https://scikit-learn.org/stable/modules/feature_extraction.html 122 | // Uses the non-smooth version, adding 1 to the log instead of to the denominator in idf. 123 | // 124 | // DocCount should have length equal to len(CountVectorizer.Mapping). 125 | // It is the responsibility of the caller to ensure this is so.
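// In other words, TransformInplace below computes, for the i-th word with term count tf_i:
//
//	tfidf_i = tf_i * (ln(NumDocuments/DocCount[i]) + 1)   if tf_i > 0 and DocCount[i] > 0
//	tfidf_i = 0                                           otherwise
//
// and the resulting vector is then L2-normalized by Normalizer.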
126 | type TFIDFVectorizer struct { 127 | CountVectorizer 128 | DocCount []uint // number of documents in which the i-th word from CountVectorizer appeared 129 | NumDocuments int 130 | Normalizer SampleNormalizerL2 131 | } 132 | 133 | // Fit fits CountVectorizer and gathers extra information for tf-idf computation 134 | func (t *TFIDFVectorizer) Fit(vals []string) { 135 | t.CountVectorizer.Fit(vals) 136 | if len(vals) == 0 { 137 | return 138 | } 139 | 140 | t.NumDocuments = len(vals) 141 | t.DocCount = make([]uint, t.NumFeatures()) 142 | 143 | // second pass over the whole input to count how many documents each word appeared in 144 | for _, v := range vals { 145 | counts := t.CountVectorizer.Transform(v) 146 | for i, v := range counts { 147 | if v > 0 { 148 | t.DocCount[i]++ 149 | } 150 | } 151 | } 152 | } 153 | 154 | // NumFeatures returns the number of features for a single field 155 | func (t *TFIDFVectorizer) NumFeatures() int { 156 | if t == nil { 157 | return 0 158 | } 159 | return len(t.CountVectorizer.Mapping) 160 | } 161 | 162 | // Transform performs tf-idf computation 163 | func (t *TFIDFVectorizer) Transform(v string) []float64 { 164 | if t == nil { 165 | return nil 166 | } 167 | features := make([]float64, t.NumFeatures()) 168 | t.TransformInplace(features, v) 169 | return features 170 | } 171 | 172 | // TransformInplace performs tf-idf computation, inplace. 173 | // It is the responsibility of the caller to zero out the destination. 174 | func (t *TFIDFVectorizer) TransformInplace(dest []float64, v string) { 175 | if t == nil || dest == nil || len(dest) != t.NumFeatures() { 176 | return 177 | } 178 | t.CountVectorizer.TransformInplace(dest, v) 179 | 180 | for i, tf := range dest { 181 | if tf > 0 && t.DocCount[i] > 0 { 182 | dest[i] = tf * (math.Log(float64(t.NumDocuments)/float64(t.DocCount[i])) + 1) 183 | } else { 184 | dest[i] = 0 185 | } 186 | } 187 | 188 | t.Normalizer.TransformInplace(dest, dest) 189 | } 190 | 191 | // FeatureNames returns a slice with the produced feature names.
192 | func (t *TFIDFVectorizer) FeatureNames() []string { 193 | if t == nil { 194 | return nil 195 | } 196 | return t.CountVectorizer.FeatureNames() 197 | } 198 | -------------------------------------------------------------------------------- /cmd/generate/parser.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "go/ast" 6 | "go/parser" 7 | "go/token" 8 | "strings" 9 | "unicode" 10 | "unicode/utf8" 11 | ) 12 | 13 | // Field represents a single transformer and the field it transforms, for internal use only 14 | type Field struct { 15 | Name string 16 | Transformer string 17 | Expanding bool 18 | NumericalInput bool 19 | TransformerTag string 20 | } 21 | 22 | // TemplateParams represents all parameters for the template, for internal use only 23 | type TemplateParams struct { 24 | PackageName string 25 | StructName string 26 | NumFieldsFlat int 27 | Fields []Field 28 | HasLargeTransformers bool 29 | HasNumericalTransformers bool 30 | HasStringTransformers bool 31 | } 32 | 33 | var tagToTransformer = map[string]string{ 34 | "identity": "Identity", 35 | "minmax": "MinMaxScaler", 36 | "maxabs": "MaxAbsScaler", 37 | "standard": "StandardScaler", 38 | "quantile": "QuantileScaler", 39 | "onehot": "OneHotEncoder", 40 | "ordinal": "OrdinalEncoder", 41 | "kbins": "KBinsDiscretizer", 42 | "countvectorizer": "CountVectorizer", 43 | "tfidf": "TFIDFVectorizer", 44 | } 45 | 46 | var isTransformerExpanding = map[string]bool{ 47 | "onehot": true, 48 | "countvectorizer": true, 49 | "tfidf": true, 50 | } 51 | 52 | var isTransformerLarge = map[string]bool{ 53 | "quantile": true, 54 | "onehot": true, 55 | "ordinal": true, 56 | "kbins": true, 57 | "countvectorizer": true, 58 | "tfidf": true, 59 | } 60 | 61 | var isTypeSupported = map[string]bool{ 62 | "int": true, 63 | "int8": true, 64 | "int16": true, 65 | "int32": true, 66 | "float32": true, 67 | "float64": true, 68 | "string": true, 69 | } 70 | 71 | var isTypeNumerical = map[string]bool{ 72 | "int": true, 73 | "int8": true, 74 | "int16": true, 75 | "int32": true, 76 | "float32": true, 77 | "float64": true, 78 | } 79 | 80 | // parseCode parses the source provided via filename or code into an AST. 81 | // It looks for struct declarations matching structName and collects field information 82 | // that is then used to fill in all details necessary for constructing the StructTransformer.
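// For example (hypothetical input; the struct and field names are illustrative only), given
//
//	type Employee struct {
//		Salary float64 `feature:"minmax"`
//		City   string  `feature:"onehot"`
//		Note   string  // no feature tag, skipped
//	}
//
// parseCode collects two Fields, {Name: "Salary", Transformer: "MinMaxScaler", NumericalInput: true, TransformerTag: "minmax"}
// and {Name: "City", Transformer: "OneHotEncoder", Expanding: true, TransformerTag: "onehot"},
// and fills TemplateParams with NumFieldsFlat: 1, HasLargeTransformers: true,
// HasNumericalTransformers: true, HasStringTransformers: true.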
83 | func parseCode(filename string, code []byte, structName string, packageName string) (*TemplateParams, error) { 84 | var err error 85 | var fields []Field 86 | numFieldsFlat := 0 87 | numLargeTransformers := 0 88 | numNumericalTransformers := 0 89 | numStringTransformers := 0 90 | 91 | f, err := parser.ParseFile(token.NewFileSet(), filename, code, parser.ParseComments) 92 | if err != nil { 93 | return nil, fmt.Errorf("can not parse input file: %w", err) 94 | } 95 | 96 | ast.Inspect(f, func(node ast.Node) bool { 97 | decl, ok := node.(*ast.GenDecl) 98 | if !ok { 99 | return true 100 | } 101 | 102 | for _, spec := range decl.Specs { 103 | typeSpec, ok := spec.(*ast.TypeSpec) 104 | if !ok { 105 | continue 106 | } 107 | 108 | if typeSpec.Name == nil { 109 | continue 110 | } 111 | 112 | if typeSpec.Name.Name != structName { 113 | continue 114 | } 115 | 116 | structSpec, ok := typeSpec.Type.(*ast.StructType) 117 | if !ok { 118 | continue 119 | } 120 | 121 | for _, field := range structSpec.Fields.List { 122 | if field == nil { 123 | continue 124 | } 125 | 126 | // name 127 | if len(field.Names) == 0 { 128 | continue 129 | } 130 | name := field.Names[0].Name 131 | 132 | // Field name has to start with a UTF-8 letter. 133 | // This is a constraint of the Go language spec. 134 | firstRune, _ := utf8.DecodeRuneInString(name) 135 | if !unicode.IsLetter(firstRune) { 136 | continue 137 | } 138 | 139 | // Should start with a Latin letter, 140 | // otherwise field inclusion misbehaves. 141 | if !unicode.In(firstRune, unicode.Scripts["Latin"]) { 142 | continue 143 | } 144 | 145 | // type 146 | fieldType := field.Type 147 | if fieldType == nil { 148 | continue 149 | } 150 | fieldTypeIdent, ok := fieldType.(*ast.Ident) 151 | if !ok { 152 | continue 153 | } 154 | fieldTypeVal := fieldTypeIdent.Name 155 | 156 | // tag 157 | tagsLit := field.Tag 158 | if tagsLit == nil { 159 | continue 160 | } 161 | tags := tagsLit.Value 162 | 163 | var tag string 164 | for _, t := range strings.Fields(strings.Trim(tags, "`")) { 165 | if strings.HasPrefix(t, "feature:") { 166 | tag = t 167 | } 168 | } 169 | if tag == "" { 170 | continue 171 | } 172 | tag = strings.Trim(strings.TrimPrefix(tag, "feature:"), "\"") 173 | 174 | if _, ok := tagToTransformer[tag]; !ok { 175 | err = fmt.Errorf("unexpected value of struct tag \"%s\"", tag) 176 | return false 177 | } 178 | 179 | if !isTypeSupported[fieldTypeVal] { 180 | err = fmt.Errorf("unsupported type %s, supported field types: %#v, note it has to be raw", fieldTypeVal, isTypeSupported) 181 | return false 182 | } 183 | 184 | field := Field{ 185 | Name: name, 186 | Transformer: tagToTransformer[tag], 187 | Expanding: isTransformerExpanding[tag], 188 | NumericalInput: isTypeNumerical[fieldTypeVal], 189 | TransformerTag: tag, 190 | } 191 | if !isTransformerExpanding[tag] { 192 | numFieldsFlat++ 193 | } 194 | if isTransformerLarge[tag] { 195 | numLargeTransformers++ 196 | } 197 | fields = append(fields, field) 198 | 199 | if isTypeNumerical[fieldTypeVal] { 200 | numNumericalTransformers++ 201 | } else { 202 | numStringTransformers++ 203 | } 204 | } 205 | 206 | } 207 | return true 208 | }) 209 | 210 | params := TemplateParams{ 211 | PackageName: packageName, 212 | StructName: structName, 213 | NumFieldsFlat: numFieldsFlat, 214 | HasLargeTransformers: numLargeTransformers > 0, 215 | Fields: fields, 216 | HasNumericalTransformers: numNumericalTransformers > 0, 217 | HasStringTransformers: numStringTransformers > 0, 218 | } 219 | 220 | return &params, err 221 | }
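// A typical call might look like this (illustrative only; the file name, struct name,
// and package name are examples, and the caller reads the source file itself):
//
//	src, _ := os.ReadFile("employee.go")
//	params, err := parseCode("employee.go", src, "Employee", "examplemodule")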
222 | -------------------------------------------------------------------------------- /cmd/generate/tests/weirdtagsfp.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "sync" 7 | 8 | fp "github.com/nikolaydubina/go-featureprocessing/transformers" 9 | ) 10 | 11 | // WeirdTagsFeatureTransformer is a feature processor for WeirdTags. 12 | // It was automatically generated by go-featureprocessing tool. 13 | type WeirdTagsFeatureTransformer struct { 14 | OnlyFeature fp.MinMaxScaler `json:"OnlyFeature_minmax"` 15 | FeatureNotFirst fp.MaxAbsScaler `json:"FeatureNotFirst_maxabs"` 16 | FirstFeature fp.OneHotEncoder `json:"FirstFeature_onehot"` 17 | Multiline fp.MaxAbsScaler `json:"Multiline_maxabs"` 18 | A안녕하세요 fp.MinMaxScaler `json:"A안녕하세요_minmax"` 19 | B안녕하세요1 fp.OneHotEncoder `json:"B안녕하세요1_onehot"` 20 | C안녕하세요0 fp.TFIDFVectorizer `json:"C안녕하세요0_tfidf"` 21 | } 22 | 23 | // Fit fits transformer for each field 24 | func (e *WeirdTagsFeatureTransformer) Fit(s []WeirdTags) { 25 | if e == nil || len(s) == 0 { 26 | return 27 | } 28 | 29 | dataNum := make([]float64, len(s)) 30 | dataStr := make([]string, len(s)) 31 | 32 | for i, v := range s { 33 | dataNum[i] = float64(v.OnlyFeature) 34 | } 35 | 36 | e.OnlyFeature.Fit(dataNum) 37 | 38 | for i, v := range s { 39 | dataNum[i] = float64(v.FeatureNotFirst) 40 | } 41 | 42 | e.FeatureNotFirst.Fit(dataNum) 43 | 44 | for i, v := range s { 45 | dataStr[i] = v.FirstFeature 46 | } 47 | 48 | e.FirstFeature.Fit(dataStr) 49 | 50 | for i, v := range s { 51 | dataNum[i] = float64(v.Multiline) 52 | } 53 | 54 | e.Multiline.Fit(dataNum) 55 | 56 | for i, v := range s { 57 | dataNum[i] = float64(v.A안녕하세요) 58 | } 59 | 60 | e.A안녕하세요.Fit(dataNum) 61 | 62 | for i, v := range s { 63 | dataStr[i] = v.B안녕하세요1 64 | } 65 | 66 | e.B안녕하세요1.Fit(dataStr) 67 | 68 | for i, v := range s { 69 | dataStr[i] = v.C안녕하세요0 70 | } 71 | 72 | e.C안녕하세요0.Fit(dataStr) 73 | 74 | } 75 | 76 | // Transform transforms struct into feature vector accordingly to transformers 77 | func (e *WeirdTagsFeatureTransformer) Transform(s *WeirdTags) []float64 { 78 | if s == nil || e == nil { 79 | return nil 80 | } 81 | features := make([]float64, e.NumFeatures()) 82 | e.TransformInplace(features, s) 83 | return features 84 | } 85 | 86 | // TransformInplace transforms struct into feature vector accordingly to transformers, and does so inplace 87 | func (e *WeirdTagsFeatureTransformer) TransformInplace(dst []float64, s *WeirdTags) { 88 | if s == nil || e == nil || len(dst) != e.NumFeatures() { 89 | return 90 | } 91 | idx := 0 92 | 93 | dst[idx] = e.OnlyFeature.Transform(float64(s.OnlyFeature)) 94 | idx++ 95 | 96 | dst[idx] = e.FeatureNotFirst.Transform(float64(s.FeatureNotFirst)) 97 | idx++ 98 | 99 | e.FirstFeature.TransformInplace(dst[idx:idx+e.FirstFeature.NumFeatures()], s.FirstFeature) 100 | idx += e.FirstFeature.NumFeatures() 101 | 102 | dst[idx] = e.Multiline.Transform(float64(s.Multiline)) 103 | idx++ 104 | 105 | dst[idx] = e.A안녕하세요.Transform(float64(s.A안녕하세요)) 106 | idx++ 107 | 108 | e.B안녕하세요1.TransformInplace(dst[idx:idx+e.B안녕하세요1.NumFeatures()], s.B안녕하세요1) 109 | idx += e.B안녕하세요1.NumFeatures() 110 | 111 | e.C안녕하세요0.TransformInplace(dst[idx:idx+e.C안녕하세요0.NumFeatures()], s.C안녕하세요0) 112 | idx += e.C안녕하세요0.NumFeatures() 113 | 114 | } 115 | 116 | // TransformAll transforms a slice of WeirdTags 117 | func (e *WeirdTagsFeatureTransformer) TransformAll(s []WeirdTags) []float64 
{ 118 | if e == nil { 119 | return nil 120 | } 121 | features := make([]float64, len(s)*e.NumFeatures()) 122 | e.TransformAllInplace(features, s) 123 | return features 124 | } 125 | 126 | // TransformAllInplace transforms a slice of WeirdTags inplace 127 | func (e *WeirdTagsFeatureTransformer) TransformAllInplace(dst []float64, s []WeirdTags) { 128 | if e == nil { 129 | return 130 | } 131 | n := e.NumFeatures() 132 | if len(dst) != n*len(s) { 133 | return 134 | } 135 | for i := range s { 136 | e.TransformInplace(dst[i*n:(i+1)*n], &s[i]) 137 | } 138 | } 139 | 140 | // TransformAllParallel transforms a slice of WeirdTags in parallel 141 | func (e *WeirdTagsFeatureTransformer) TransformAllParallel(s []WeirdTags, nworkers uint) []float64 { 142 | if e == nil { 143 | return nil 144 | } 145 | features := make([]float64, len(s)*e.NumFeatures()) 146 | e.TransformAllInplaceParallel(features, s, nworkers) 147 | return features 148 | } 149 | 150 | // TransformAllInplaceParallel transforms a slice of WeirdTags inplace parallel 151 | // Useful for very large slices. 152 | func (e *WeirdTagsFeatureTransformer) TransformAllInplaceParallel(dst []float64, s []WeirdTags, nworkers uint) { 153 | if e == nil || nworkers == 0 { 154 | return 155 | } 156 | ns := uint(len(s)) 157 | nf := uint(e.NumFeatures()) 158 | if uint(len(dst)) != nf*ns { 159 | return 160 | } 161 | 162 | nbatch := ns / nworkers 163 | var wg sync.WaitGroup 164 | 165 | for i := uint(0); i < nworkers; i++ { 166 | wg.Add(1) 167 | go func(i uint) { 168 | defer wg.Done() 169 | iStart := nbatch * i 170 | iEnd := nbatch * (i + 1) 171 | if i == (nworkers - 1) { 172 | iEnd = ns 173 | } 174 | e.TransformAllInplace(dst[iStart*nf:iEnd*nf], s[iStart:iEnd]) 175 | }(i) 176 | } 177 | 178 | wg.Wait() 179 | } 180 | 181 | // NumFeatures returns number of features in output feature vector 182 | func (e *WeirdTagsFeatureTransformer) NumFeatures() int { 183 | if e == nil { 184 | return 0 185 | } 186 | 187 | count := 4 188 | 189 | count += e.FirstFeature.NumFeatures() 190 | 191 | count += e.B안녕하세요1.NumFeatures() 192 | count += e.C안녕하세요0.NumFeatures() 193 | 194 | return count 195 | } 196 | 197 | // FeatureNames provides names of features that match output of transform 198 | func (e *WeirdTagsFeatureTransformer) FeatureNames() []string { 199 | if e == nil { 200 | return nil 201 | } 202 | 203 | idx := 0 204 | names := make([]string, e.NumFeatures()) 205 | 206 | names[idx] = "OnlyFeature" 207 | idx++ 208 | 209 | names[idx] = "FeatureNotFirst" 210 | idx++ 211 | 212 | for _, w := range e.FirstFeature.FeatureNames() { 213 | names[idx] = "FirstFeature_" + w 214 | idx++ 215 | } 216 | 217 | names[idx] = "Multiline" 218 | idx++ 219 | 220 | names[idx] = "A안녕하세요" 221 | idx++ 222 | 223 | for _, w := range e.B안녕하세요1.FeatureNames() { 224 | names[idx] = "B안녕하세요1_" + w 225 | idx++ 226 | } 227 | 228 | for _, w := range e.C안녕하세요0.FeatureNames() { 229 | names[idx] = "C안녕하세요0_" + w 230 | idx++ 231 | } 232 | 233 | return names 234 | } 235 | -------------------------------------------------------------------------------- /cmd/generate/tests/largememorytransformerfp.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "sync" 7 | 8 | fp "github.com/nikolaydubina/go-featureprocessing/transformers" 9 | ) 10 | 11 | // LargeMemoryTransformerFeatureTransformer is a feature processor for LargeMemoryTransformer. 
12 | // It was automatically generated by go-featureprocessing tool. 13 | type LargeMemoryTransformerFeatureTransformer struct { 14 | Name1 fp.OneHotEncoder `json:"Name1_onehot"` 15 | Name2 fp.OneHotEncoder `json:"Name2_onehot"` 16 | Name3 fp.OrdinalEncoder `json:"Name3_ordinal"` 17 | Name4 fp.OrdinalEncoder `json:"Name4_ordinal"` 18 | Name5 fp.QuantileScaler `json:"Name5_quantile"` 19 | Name6 fp.QuantileScaler `json:"Name6_quantile"` 20 | Name7 fp.KBinsDiscretizer `json:"Name7_kbins"` 21 | Name8 fp.KBinsDiscretizer `json:"Name8_kbins"` 22 | } 23 | 24 | // Fit fits transformer for each field 25 | func (e *LargeMemoryTransformerFeatureTransformer) Fit(s []LargeMemoryTransformer) { 26 | if e == nil || len(s) == 0 { 27 | return 28 | } 29 | 30 | dataNum := make([]float64, len(s)) 31 | dataStr := make([]string, len(s)) 32 | 33 | for i, v := range s { 34 | dataStr[i] = v.Name1 35 | } 36 | 37 | e.Name1.Fit(dataStr) 38 | 39 | for i, v := range s { 40 | dataStr[i] = v.Name2 41 | } 42 | 43 | e.Name2.Fit(dataStr) 44 | 45 | for i, v := range s { 46 | dataStr[i] = v.Name3 47 | } 48 | 49 | e.Name3.Fit(dataStr) 50 | 51 | for i, v := range s { 52 | dataStr[i] = v.Name4 53 | } 54 | 55 | e.Name4.Fit(dataStr) 56 | 57 | for i, v := range s { 58 | dataNum[i] = float64(v.Name5) 59 | } 60 | 61 | e.Name5.Fit(dataNum) 62 | 63 | for i, v := range s { 64 | dataNum[i] = float64(v.Name6) 65 | } 66 | 67 | e.Name6.Fit(dataNum) 68 | 69 | for i, v := range s { 70 | dataNum[i] = float64(v.Name7) 71 | } 72 | 73 | e.Name7.Fit(dataNum) 74 | 75 | for i, v := range s { 76 | dataNum[i] = float64(v.Name8) 77 | } 78 | 79 | e.Name8.Fit(dataNum) 80 | 81 | } 82 | 83 | // Transform transforms struct into feature vector accordingly to transformers 84 | func (e *LargeMemoryTransformerFeatureTransformer) Transform(s *LargeMemoryTransformer) []float64 { 85 | if s == nil || e == nil { 86 | return nil 87 | } 88 | features := make([]float64, e.NumFeatures()) 89 | e.TransformInplace(features, s) 90 | return features 91 | } 92 | 93 | // TransformInplace transforms struct into feature vector accordingly to transformers, and does so inplace 94 | func (e *LargeMemoryTransformerFeatureTransformer) TransformInplace(dst []float64, s *LargeMemoryTransformer) { 95 | if s == nil || e == nil || len(dst) != e.NumFeatures() { 96 | return 97 | } 98 | idx := 0 99 | 100 | e.Name1.TransformInplace(dst[idx:idx+e.Name1.NumFeatures()], s.Name1) 101 | idx += e.Name1.NumFeatures() 102 | 103 | e.Name2.TransformInplace(dst[idx:idx+e.Name2.NumFeatures()], s.Name2) 104 | idx += e.Name2.NumFeatures() 105 | 106 | dst[idx] = e.Name3.Transform((s.Name3)) 107 | idx++ 108 | 109 | dst[idx] = e.Name4.Transform((s.Name4)) 110 | idx++ 111 | 112 | dst[idx] = e.Name5.Transform(float64(s.Name5)) 113 | idx++ 114 | 115 | dst[idx] = e.Name6.Transform(float64(s.Name6)) 116 | idx++ 117 | 118 | dst[idx] = e.Name7.Transform(float64(s.Name7)) 119 | idx++ 120 | 121 | dst[idx] = e.Name8.Transform(float64(s.Name8)) 122 | idx++ 123 | 124 | } 125 | 126 | // TransformAll transforms a slice of LargeMemoryTransformer 127 | func (e *LargeMemoryTransformerFeatureTransformer) TransformAll(s []LargeMemoryTransformer) []float64 { 128 | if e == nil { 129 | return nil 130 | } 131 | features := make([]float64, len(s)*e.NumFeatures()) 132 | e.TransformAllInplace(features, s) 133 | return features 134 | } 135 | 136 | // TransformAllInplace transforms a slice of LargeMemoryTransformer inplace 137 | func (e *LargeMemoryTransformerFeatureTransformer) TransformAllInplace(dst []float64, s 
[]LargeMemoryTransformer) { 138 | if e == nil { 139 | return 140 | } 141 | n := e.NumFeatures() 142 | if len(dst) != n*len(s) { 143 | return 144 | } 145 | for i := range s { 146 | e.TransformInplace(dst[i*n:(i+1)*n], &s[i]) 147 | } 148 | } 149 | 150 | // TransformAllParallel transforms a slice of LargeMemoryTransformer in parallel 151 | func (e *LargeMemoryTransformerFeatureTransformer) TransformAllParallel(s []LargeMemoryTransformer, nworkers uint) []float64 { 152 | if e == nil { 153 | return nil 154 | } 155 | features := make([]float64, len(s)*e.NumFeatures()) 156 | e.TransformAllInplaceParallel(features, s, nworkers) 157 | return features 158 | } 159 | 160 | // TransformAllInplaceParallel transforms a slice of LargeMemoryTransformer inplace parallel 161 | // Useful for very large slices. 162 | func (e *LargeMemoryTransformerFeatureTransformer) TransformAllInplaceParallel(dst []float64, s []LargeMemoryTransformer, nworkers uint) { 163 | if e == nil || nworkers == 0 { 164 | return 165 | } 166 | ns := uint(len(s)) 167 | nf := uint(e.NumFeatures()) 168 | if uint(len(dst)) != nf*ns { 169 | return 170 | } 171 | 172 | nbatch := ns / nworkers 173 | var wg sync.WaitGroup 174 | 175 | for i := uint(0); i < nworkers; i++ { 176 | wg.Add(1) 177 | go func(i uint) { 178 | defer wg.Done() 179 | iStart := nbatch * i 180 | iEnd := nbatch * (i + 1) 181 | if i == (nworkers - 1) { 182 | iEnd = ns 183 | } 184 | e.TransformAllInplace(dst[iStart*nf:iEnd*nf], s[iStart:iEnd]) 185 | }(i) 186 | } 187 | 188 | wg.Wait() 189 | } 190 | 191 | // NumFeatures returns number of features in output feature vector 192 | func (e *LargeMemoryTransformerFeatureTransformer) NumFeatures() int { 193 | if e == nil { 194 | return 0 195 | } 196 | 197 | count := 6 198 | count += e.Name1.NumFeatures() 199 | count += e.Name2.NumFeatures() 200 | 201 | return count 202 | } 203 | 204 | // FeatureNames provides names of features that match output of transform 205 | func (e *LargeMemoryTransformerFeatureTransformer) FeatureNames() []string { 206 | if e == nil { 207 | return nil 208 | } 209 | 210 | idx := 0 211 | names := make([]string, e.NumFeatures()) 212 | 213 | for _, w := range e.Name1.FeatureNames() { 214 | names[idx] = "Name1_" + w 215 | idx++ 216 | } 217 | 218 | for _, w := range e.Name2.FeatureNames() { 219 | names[idx] = "Name2_" + w 220 | idx++ 221 | } 222 | 223 | names[idx] = "Name3" 224 | idx++ 225 | 226 | names[idx] = "Name4" 227 | idx++ 228 | 229 | names[idx] = "Name5" 230 | idx++ 231 | 232 | names[idx] = "Name6" 233 | idx++ 234 | 235 | names[idx] = "Name7" 236 | idx++ 237 | 238 | names[idx] = "Name8" 239 | idx++ 240 | 241 | return names 242 | } 243 | -------------------------------------------------------------------------------- /cmd/generate/tests/employeefp.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "sync" 7 | 8 | fp "github.com/nikolaydubina/go-featureprocessing/transformers" 9 | ) 10 | 11 | // EmployeeFeatureTransformer is a feature processor for Employee. 12 | // It was automatically generated by go-featureprocessing tool. 
13 | type EmployeeFeatureTransformer struct { 14 | Age fp.Identity `json:"Age_identity"` 15 | Salary fp.MinMaxScaler `json:"Salary_minmax"` 16 | Kids fp.MaxAbsScaler `json:"Kids_maxabs"` 17 | Weight fp.StandardScaler `json:"Weight_standard"` 18 | Height fp.QuantileScaler `json:"Height_quantile"` 19 | City fp.OneHotEncoder `json:"City_onehot"` 20 | Car fp.OrdinalEncoder `json:"Car_ordinal"` 21 | Income fp.KBinsDiscretizer `json:"Income_kbins"` 22 | Description fp.TFIDFVectorizer `json:"Description_tfidf"` 23 | } 24 | 25 | // Fit fits transformer for each field 26 | func (e *EmployeeFeatureTransformer) Fit(s []Employee) { 27 | if e == nil || len(s) == 0 { 28 | return 29 | } 30 | 31 | dataNum := make([]float64, len(s)) 32 | dataStr := make([]string, len(s)) 33 | 34 | for i, v := range s { 35 | dataNum[i] = float64(v.Age) 36 | } 37 | 38 | e.Age.Fit(dataNum) 39 | 40 | for i, v := range s { 41 | dataNum[i] = float64(v.Salary) 42 | } 43 | 44 | e.Salary.Fit(dataNum) 45 | 46 | for i, v := range s { 47 | dataNum[i] = float64(v.Kids) 48 | } 49 | 50 | e.Kids.Fit(dataNum) 51 | 52 | for i, v := range s { 53 | dataNum[i] = float64(v.Weight) 54 | } 55 | 56 | e.Weight.Fit(dataNum) 57 | 58 | for i, v := range s { 59 | dataNum[i] = float64(v.Height) 60 | } 61 | 62 | e.Height.Fit(dataNum) 63 | 64 | for i, v := range s { 65 | dataStr[i] = v.City 66 | } 67 | 68 | e.City.Fit(dataStr) 69 | 70 | for i, v := range s { 71 | dataStr[i] = v.Car 72 | } 73 | 74 | e.Car.Fit(dataStr) 75 | 76 | for i, v := range s { 77 | dataNum[i] = float64(v.Income) 78 | } 79 | 80 | e.Income.Fit(dataNum) 81 | 82 | for i, v := range s { 83 | dataStr[i] = v.Description 84 | } 85 | 86 | e.Description.Fit(dataStr) 87 | 88 | } 89 | 90 | // Transform transforms struct into feature vector accordingly to transformers 91 | func (e *EmployeeFeatureTransformer) Transform(s *Employee) []float64 { 92 | if s == nil || e == nil { 93 | return nil 94 | } 95 | features := make([]float64, e.NumFeatures()) 96 | e.TransformInplace(features, s) 97 | return features 98 | } 99 | 100 | // TransformInplace transforms struct into feature vector accordingly to transformers, and does so inplace 101 | func (e *EmployeeFeatureTransformer) TransformInplace(dst []float64, s *Employee) { 102 | if s == nil || e == nil || len(dst) != e.NumFeatures() { 103 | return 104 | } 105 | idx := 0 106 | 107 | dst[idx] = e.Age.Transform(float64(s.Age)) 108 | idx++ 109 | 110 | dst[idx] = e.Salary.Transform(float64(s.Salary)) 111 | idx++ 112 | 113 | dst[idx] = e.Kids.Transform(float64(s.Kids)) 114 | idx++ 115 | 116 | dst[idx] = e.Weight.Transform(float64(s.Weight)) 117 | idx++ 118 | 119 | dst[idx] = e.Height.Transform(float64(s.Height)) 120 | idx++ 121 | 122 | e.City.TransformInplace(dst[idx:idx+e.City.NumFeatures()], s.City) 123 | idx += e.City.NumFeatures() 124 | 125 | dst[idx] = e.Car.Transform((s.Car)) 126 | idx++ 127 | 128 | dst[idx] = e.Income.Transform(float64(s.Income)) 129 | idx++ 130 | 131 | e.Description.TransformInplace(dst[idx:idx+e.Description.NumFeatures()], s.Description) 132 | idx += e.Description.NumFeatures() 133 | 134 | } 135 | 136 | // TransformAll transforms a slice of Employee 137 | func (e *EmployeeFeatureTransformer) TransformAll(s []Employee) []float64 { 138 | if e == nil { 139 | return nil 140 | } 141 | features := make([]float64, len(s)*e.NumFeatures()) 142 | e.TransformAllInplace(features, s) 143 | return features 144 | } 145 | 146 | // TransformAllInplace transforms a slice of Employee inplace 147 | func (e *EmployeeFeatureTransformer) 
TransformAllInplace(dst []float64, s []Employee) { 148 | if e == nil { 149 | return 150 | } 151 | n := e.NumFeatures() 152 | if len(dst) != n*len(s) { 153 | return 154 | } 155 | for i := range s { 156 | e.TransformInplace(dst[i*n:(i+1)*n], &s[i]) 157 | } 158 | } 159 | 160 | // TransformAllParallel transforms a slice of Employee in parallel 161 | func (e *EmployeeFeatureTransformer) TransformAllParallel(s []Employee, nworkers uint) []float64 { 162 | if e == nil { 163 | return nil 164 | } 165 | features := make([]float64, len(s)*e.NumFeatures()) 166 | e.TransformAllInplaceParallel(features, s, nworkers) 167 | return features 168 | } 169 | 170 | // TransformAllInplaceParallel transforms a slice of Employee inplace parallel 171 | // Useful for very large slices. 172 | func (e *EmployeeFeatureTransformer) TransformAllInplaceParallel(dst []float64, s []Employee, nworkers uint) { 173 | if e == nil || nworkers == 0 { 174 | return 175 | } 176 | ns := uint(len(s)) 177 | nf := uint(e.NumFeatures()) 178 | if uint(len(dst)) != nf*ns { 179 | return 180 | } 181 | 182 | nbatch := ns / nworkers 183 | var wg sync.WaitGroup 184 | 185 | for i := uint(0); i < nworkers; i++ { 186 | wg.Add(1) 187 | go func(i uint) { 188 | defer wg.Done() 189 | iStart := nbatch * i 190 | iEnd := nbatch * (i + 1) 191 | if i == (nworkers - 1) { 192 | iEnd = ns 193 | } 194 | e.TransformAllInplace(dst[iStart*nf:iEnd*nf], s[iStart:iEnd]) 195 | }(i) 196 | } 197 | 198 | wg.Wait() 199 | } 200 | 201 | // NumFeatures returns number of features in output feature vector 202 | func (e *EmployeeFeatureTransformer) NumFeatures() int { 203 | if e == nil { 204 | return 0 205 | } 206 | 207 | count := 7 208 | 209 | count += e.City.NumFeatures() 210 | 211 | count += e.Description.NumFeatures() 212 | 213 | return count 214 | } 215 | 216 | // FeatureNames provides names of features that match output of transform 217 | func (e *EmployeeFeatureTransformer) FeatureNames() []string { 218 | if e == nil { 219 | return nil 220 | } 221 | 222 | idx := 0 223 | names := make([]string, e.NumFeatures()) 224 | 225 | names[idx] = "Age" 226 | idx++ 227 | 228 | names[idx] = "Salary" 229 | idx++ 230 | 231 | names[idx] = "Kids" 232 | idx++ 233 | 234 | names[idx] = "Weight" 235 | idx++ 236 | 237 | names[idx] = "Height" 238 | idx++ 239 | 240 | for _, w := range e.City.FeatureNames() { 241 | names[idx] = "City_" + w 242 | idx++ 243 | } 244 | 245 | names[idx] = "Car" 246 | idx++ 247 | 248 | names[idx] = "Income" 249 | idx++ 250 | 251 | for _, w := range e.Description.FeatureNames() { 252 | names[idx] = "Description_" + w 253 | idx++ 254 | } 255 | 256 | return names 257 | } 258 | -------------------------------------------------------------------------------- /cmd/generate/tests/alltransformersfp.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "sync" 7 | 8 | fp "github.com/nikolaydubina/go-featureprocessing/transformers" 9 | ) 10 | 11 | // AllTransformersFeatureTransformer is a feature processor for AllTransformers. 12 | // It was automatically generated by go-featureprocessing tool. 
13 | type AllTransformersFeatureTransformer struct { 14 | Name0 fp.Identity `json:"Name0_identity"` 15 | Name1 fp.MinMaxScaler `json:"Name1_minmax"` 16 | Name2 fp.MaxAbsScaler `json:"Name2_maxabs"` 17 | Name3 fp.StandardScaler `json:"Name3_standard"` 18 | Name4 fp.QuantileScaler `json:"Name4_quantile"` 19 | Name5 fp.OneHotEncoder `json:"Name5_onehot"` 20 | Name6 fp.OrdinalEncoder `json:"Name6_ordinal"` 21 | Name7 fp.KBinsDiscretizer `json:"Name7_kbins"` 22 | Name8 fp.CountVectorizer `json:"Name8_countvectorizer"` 23 | Name9 fp.TFIDFVectorizer `json:"Name9_tfidf"` 24 | } 25 | 26 | // Fit fits transformer for each field 27 | func (e *AllTransformersFeatureTransformer) Fit(s []AllTransformers) { 28 | if e == nil || len(s) == 0 { 29 | return 30 | } 31 | 32 | dataNum := make([]float64, len(s)) 33 | dataStr := make([]string, len(s)) 34 | 35 | for i, v := range s { 36 | dataNum[i] = float64(v.Name0) 37 | } 38 | 39 | e.Name0.Fit(dataNum) 40 | 41 | for i, v := range s { 42 | dataNum[i] = float64(v.Name1) 43 | } 44 | 45 | e.Name1.Fit(dataNum) 46 | 47 | for i, v := range s { 48 | dataNum[i] = float64(v.Name2) 49 | } 50 | 51 | e.Name2.Fit(dataNum) 52 | 53 | for i, v := range s { 54 | dataNum[i] = float64(v.Name3) 55 | } 56 | 57 | e.Name3.Fit(dataNum) 58 | 59 | for i, v := range s { 60 | dataNum[i] = float64(v.Name4) 61 | } 62 | 63 | e.Name4.Fit(dataNum) 64 | 65 | for i, v := range s { 66 | dataStr[i] = v.Name5 67 | } 68 | 69 | e.Name5.Fit(dataStr) 70 | 71 | for i, v := range s { 72 | dataStr[i] = v.Name6 73 | } 74 | 75 | e.Name6.Fit(dataStr) 76 | 77 | for i, v := range s { 78 | dataNum[i] = float64(v.Name7) 79 | } 80 | 81 | e.Name7.Fit(dataNum) 82 | 83 | for i, v := range s { 84 | dataStr[i] = v.Name8 85 | } 86 | 87 | e.Name8.Fit(dataStr) 88 | 89 | for i, v := range s { 90 | dataStr[i] = v.Name9 91 | } 92 | 93 | e.Name9.Fit(dataStr) 94 | 95 | } 96 | 97 | // Transform transforms struct into feature vector accordingly to transformers 98 | func (e *AllTransformersFeatureTransformer) Transform(s *AllTransformers) []float64 { 99 | if s == nil || e == nil { 100 | return nil 101 | } 102 | features := make([]float64, e.NumFeatures()) 103 | e.TransformInplace(features, s) 104 | return features 105 | } 106 | 107 | // TransformInplace transforms struct into feature vector accordingly to transformers, and does so inplace 108 | func (e *AllTransformersFeatureTransformer) TransformInplace(dst []float64, s *AllTransformers) { 109 | if s == nil || e == nil || len(dst) != e.NumFeatures() { 110 | return 111 | } 112 | idx := 0 113 | 114 | dst[idx] = e.Name0.Transform(float64(s.Name0)) 115 | idx++ 116 | 117 | dst[idx] = e.Name1.Transform(float64(s.Name1)) 118 | idx++ 119 | 120 | dst[idx] = e.Name2.Transform(float64(s.Name2)) 121 | idx++ 122 | 123 | dst[idx] = e.Name3.Transform(float64(s.Name3)) 124 | idx++ 125 | 126 | dst[idx] = e.Name4.Transform(float64(s.Name4)) 127 | idx++ 128 | 129 | e.Name5.TransformInplace(dst[idx:idx+e.Name5.NumFeatures()], s.Name5) 130 | idx += e.Name5.NumFeatures() 131 | 132 | dst[idx] = e.Name6.Transform((s.Name6)) 133 | idx++ 134 | 135 | dst[idx] = e.Name7.Transform(float64(s.Name7)) 136 | idx++ 137 | 138 | e.Name8.TransformInplace(dst[idx:idx+e.Name8.NumFeatures()], s.Name8) 139 | idx += e.Name8.NumFeatures() 140 | 141 | e.Name9.TransformInplace(dst[idx:idx+e.Name9.NumFeatures()], s.Name9) 142 | idx += e.Name9.NumFeatures() 143 | 144 | } 145 | 146 | // TransformAll transforms a slice of AllTransformers 147 | func (e *AllTransformersFeatureTransformer) TransformAll(s []AllTransformers) 
[]float64 { 148 | if e == nil { 149 | return nil 150 | } 151 | features := make([]float64, len(s)*e.NumFeatures()) 152 | e.TransformAllInplace(features, s) 153 | return features 154 | } 155 | 156 | // TransformAllInplace transforms a slice of AllTransformers inplace 157 | func (e *AllTransformersFeatureTransformer) TransformAllInplace(dst []float64, s []AllTransformers) { 158 | if e == nil { 159 | return 160 | } 161 | n := e.NumFeatures() 162 | if len(dst) != n*len(s) { 163 | return 164 | } 165 | for i := range s { 166 | e.TransformInplace(dst[i*n:(i+1)*n], &s[i]) 167 | } 168 | } 169 | 170 | // TransformAllParallel transforms a slice of AllTransformers in parallel 171 | func (e *AllTransformersFeatureTransformer) TransformAllParallel(s []AllTransformers, nworkers uint) []float64 { 172 | if e == nil { 173 | return nil 174 | } 175 | features := make([]float64, len(s)*e.NumFeatures()) 176 | e.TransformAllInplaceParallel(features, s, nworkers) 177 | return features 178 | } 179 | 180 | // TransformAllInplaceParallel transforms a slice of AllTransformers inplace parallel 181 | // Useful for very large slices. 182 | func (e *AllTransformersFeatureTransformer) TransformAllInplaceParallel(dst []float64, s []AllTransformers, nworkers uint) { 183 | if e == nil || nworkers == 0 { 184 | return 185 | } 186 | ns := uint(len(s)) 187 | nf := uint(e.NumFeatures()) 188 | if uint(len(dst)) != nf*ns { 189 | return 190 | } 191 | 192 | nbatch := ns / nworkers 193 | var wg sync.WaitGroup 194 | 195 | for i := uint(0); i < nworkers; i++ { 196 | wg.Add(1) 197 | go func(i uint) { 198 | defer wg.Done() 199 | iStart := nbatch * i 200 | iEnd := nbatch * (i + 1) 201 | if i == (nworkers - 1) { 202 | iEnd = ns 203 | } 204 | e.TransformAllInplace(dst[iStart*nf:iEnd*nf], s[iStart:iEnd]) 205 | }(i) 206 | } 207 | 208 | wg.Wait() 209 | } 210 | 211 | // NumFeatures returns number of features in output feature vector 212 | func (e *AllTransformersFeatureTransformer) NumFeatures() int { 213 | if e == nil { 214 | return 0 215 | } 216 | 217 | count := 7 218 | 219 | count += e.Name5.NumFeatures() 220 | 221 | count += e.Name8.NumFeatures() 222 | count += e.Name9.NumFeatures() 223 | 224 | return count 225 | } 226 | 227 | // FeatureNames provides names of features that match output of transform 228 | func (e *AllTransformersFeatureTransformer) FeatureNames() []string { 229 | if e == nil { 230 | return nil 231 | } 232 | 233 | idx := 0 234 | names := make([]string, e.NumFeatures()) 235 | 236 | names[idx] = "Name0" 237 | idx++ 238 | 239 | names[idx] = "Name1" 240 | idx++ 241 | 242 | names[idx] = "Name2" 243 | idx++ 244 | 245 | names[idx] = "Name3" 246 | idx++ 247 | 248 | names[idx] = "Name4" 249 | idx++ 250 | 251 | for _, w := range e.Name5.FeatureNames() { 252 | names[idx] = "Name5_" + w 253 | idx++ 254 | } 255 | 256 | names[idx] = "Name6" 257 | idx++ 258 | 259 | names[idx] = "Name7" 260 | idx++ 261 | 262 | for _, w := range e.Name8.FeatureNames() { 263 | names[idx] = "Name8_" + w 264 | idx++ 265 | } 266 | 267 | for _, w := range e.Name9.FeatureNames() { 268 | names[idx] = "Name9_" + w 269 | idx++ 270 | } 271 | 272 | return names 273 | } 274 | -------------------------------------------------------------------------------- /transformers/scalers_test.go: -------------------------------------------------------------------------------- 1 | package transformers_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . 
"github.com/nikolaydubina/go-featureprocessing/transformers" 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestIdentity(t *testing.T) { 11 | samples := []struct { 12 | name string 13 | input float64 14 | output float64 15 | }{ 16 | {"basic", 42, 42}, 17 | {"negative", -42, -42}, 18 | {"zero", 0, 0}, 19 | {"fraction", 0.5, 0.5}, 20 | } 21 | for _, s := range samples { 22 | t.Run(s.name, func(t *testing.T) { 23 | encoder := Identity{} 24 | features := encoder.Transform((s.input)) 25 | assert.Equal(t, s.output, features) 26 | }) 27 | } 28 | 29 | t.Run("fit", func(t *testing.T) { 30 | encoder := Identity{} 31 | encoder.Fit(nil) 32 | assert.Equal(t, Identity{}, encoder) 33 | }) 34 | } 35 | 36 | func TestMinMaxScalerTransform(t *testing.T) { 37 | samples := []struct { 38 | name string 39 | min float64 40 | max float64 41 | input float64 42 | output float64 43 | }{ 44 | {"basic", 1, 101, 51, 0.5}, 45 | {"basic", 1, 101, 71, 0.7}, 46 | {"bellow", 1, 101, 0.5, 0}, 47 | {"above", 1, 101, 102, 1}, 48 | {"negative", 1, 101, -1, 0}, 49 | {"zero", 1, 101, 0, 0}, 50 | {"same1", 1, 1, 1, 0}, 51 | {"same2", 1, 1, 0.5, 0}, 52 | {"same2", 1, 1, 2, 0}, 53 | } 54 | for _, s := range samples { 55 | t.Run(s.name, func(t *testing.T) { 56 | encoder := MinMaxScaler{Min: s.min, Max: s.max} 57 | features := encoder.Transform((s.input)) 58 | assert.Equal(t, s.output, features) 59 | }) 60 | } 61 | } 62 | 63 | func TestMinMaxScalerFit(t *testing.T) { 64 | samples := []struct { 65 | name string 66 | min float64 67 | max float64 68 | vals []float64 69 | }{ 70 | {"noinput", 0, 0, nil}, 71 | {"basic", 1, 101, []float64{1, 101}}, 72 | {"negative_1", -1, 101, []float64{-1, 101}}, 73 | {"negative_2", -10, -1, []float64{-10, -1}}, 74 | {"zero", 0, 0, []float64{0, 0}}, 75 | {"same", 1, 1, []float64{1, 1}}, 76 | {"reverse_order", 1, 10, []float64{10, 1}}, 77 | {"reverse_order_negative", -10, -1, []float64{-1, -10}}, 78 | } 79 | for _, s := range samples { 80 | t.Run(s.name, func(t *testing.T) { 81 | encoder := MinMaxScaler{} 82 | encoder.Fit(s.vals) 83 | assert.Equal(t, MinMaxScaler{Min: s.min, Max: s.max}, encoder) 84 | }) 85 | } 86 | } 87 | 88 | func TestMaxAbsScalerTransform(t *testing.T) { 89 | samples := []struct { 90 | name string 91 | max float64 92 | input float64 93 | output float64 94 | }{ 95 | {"basic", 100, 50, 0.5}, 96 | {"basic", 100, 70, 0.7}, 97 | {"above", 100, 102, 1}, 98 | {"above_negative", 100, -102, -1}, 99 | {"negative", 100, -50, -0.5}, 100 | {"zero1", 100, 0, 0}, 101 | {"zero2", 0, 0, 0}, 102 | } 103 | for _, s := range samples { 104 | t.Run(s.name, func(t *testing.T) { 105 | encoder := MaxAbsScaler{Max: s.max} 106 | features := encoder.Transform((s.input)) 107 | assert.Equal(t, s.output, features) 108 | }) 109 | } 110 | } 111 | 112 | func TestMaxAbsScalerFit(t *testing.T) { 113 | samples := []struct { 114 | name string 115 | max float64 116 | vals []float64 117 | }{ 118 | {"noinput", 0, nil}, 119 | {"basic", 100, []float64{1, 100}}, 120 | {"negative", 100, []float64{-1, -100}}, 121 | {"zero", 0, []float64{0, 0}}, 122 | {"same", 1, []float64{1, 1}}, 123 | {"reverse_order", 10, []float64{10, 1}}, 124 | {"reverse_order_negative", 10, []float64{-1, -10}}, 125 | } 126 | for _, s := range samples { 127 | t.Run(s.name, func(t *testing.T) { 128 | encoder := MaxAbsScaler{} 129 | encoder.Fit(s.vals) 130 | assert.Equal(t, MaxAbsScaler{Max: s.max}, encoder) 131 | }) 132 | } 133 | } 134 | 135 | func TestStandardScalerTransform(t *testing.T) { 136 | samples := []struct { 137 | name string 138 | mean 
float64 139 | std float64 140 | input float64 141 | output float64 142 | }{ 143 | {"basic_0", 100, 50, 100, 0}, 144 | {"basic_-0.5", 100, 50, 75, -0.5}, 145 | {"basic_0.5", 100, 50, 125, 0.5}, 146 | {"basic_-1", 100, 50, 50, -1}, 147 | {"basic_+1", 100, 50, 150, 1}, 148 | {"basic_-2", 100, 50, 0, -2}, 149 | {"basic_+2", 100, 50, 200, 2}, 150 | {"basic_-3", 100, 50, -50, -3}, 151 | {"basic_+3", 100, 50, 250, 3}, 152 | } 153 | for _, s := range samples { 154 | t.Run(s.name, func(t *testing.T) { 155 | encoder := StandardScaler{Mean: s.mean, STD: s.std} 156 | assert.Equal(t, s.output, encoder.Transform(s.input)) 157 | }) 158 | } 159 | } 160 | 161 | func TestStandardScalerFit(t *testing.T) { 162 | samples := []struct { 163 | name string 164 | mean float64 165 | std float64 166 | vals []float64 167 | }{ 168 | {"noinput", 0, 0, nil}, 169 | {"basic", 50.5, 70.0035713374682, []float64{1, 100}}, 170 | {"negative", -50.5, 70.0035713374682, []float64{-1, -100}}, 171 | {"zero", 0, 0, []float64{0, 0}}, 172 | {"same", 1, 0, []float64{1, 1, 1, 1}}, 173 | } 174 | for _, s := range samples { 175 | t.Run(s.name, func(t *testing.T) { 176 | encoder := StandardScaler{} 177 | encoder.Fit(s.vals) 178 | assert.Equal(t, StandardScaler{Mean: s.mean, STD: s.std}, encoder) 179 | }) 180 | } 181 | } 182 | 183 | func TestQuantileScalerTransform(t *testing.T) { 184 | samples := []struct { 185 | name string 186 | quantiles []float64 187 | input float64 188 | output float64 189 | }{ 190 | {"basic1", []float64{25, 50, 75, 100}, 0, 0.25}, 191 | {"basic2", []float64{25, 50, 75, 100}, 11, 0.25}, 192 | {"basic3", []float64{25, 50, 75, 100}, 25, 0.25}, 193 | {"basic4", []float64{25, 50, 75, 100}, 40, 0.5}, 194 | {"basic5", []float64{25, 50, 75, 100}, 50, 0.5}, 195 | {"basic6", []float64{25, 50, 75, 100}, 80, 1}, 196 | {"basic7", []float64{25, 50, 75, 100}, 101, 1}, 197 | {"empty", nil, 10, 0}, 198 | } 199 | for _, s := range samples { 200 | t.Run(s.name, func(t *testing.T) { 201 | encoder := QuantileScaler{Quantiles: s.quantiles} 202 | features := encoder.Transform((s.input)) 203 | assert.Equal(t, s.output, features) 204 | }) 205 | } 206 | } 207 | 208 | func TestQuantileScalerFit(t *testing.T) { 209 | samples := []struct { 210 | name string 211 | n int 212 | quantiles []float64 213 | vals []float64 214 | }{ 215 | {"basic", 4, []float64{25, 50, 75, 100}, []float64{25, 50, 75, 100}}, 216 | {"reverse_order", 4, []float64{25, 50, 75, 100}, []float64{100, 75, 50, 25}}, 217 | {"negative", 4, []float64{-100, -75, -50, -25}, []float64{-25, -50, -75, -100}}, 218 | {"one_quantile", 1, []float64{1}, []float64{1, 2, 3, 4, 5}}, 219 | {"one_value", 4, []float64{1}, []float64{1}}, 220 | {"less_elements_than_quantiles", 6, []float64{1, 2, 3}, []float64{1, 2, 3}}, 221 | {"more_inputs_than_quantiles", 3, []float64{1, 6, 11}, []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}, 222 | {"more_inputs_than_quantiles_reverse", 3, []float64{1, 6, 11}, []float64{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}}, 223 | {"more_inputs_than_quantiles_shape", 3, []float64{1, 6, 8}, []float64{1, 1, 1, 1, 1, 6, 6, 7, 8, 7, 8, 12, 6, 8, 15}}, 224 | } 225 | for _, s := range samples { 226 | t.Run(s.name, func(t *testing.T) { 227 | encoder := QuantileScaler{Quantiles: make([]float64, s.n)} 228 | encoder.Fit(s.vals) 229 | assert.Equal(t, QuantileScaler{Quantiles: s.quantiles}, encoder) 230 | }) 231 | } 232 | 233 | t.Run("no input", func(t *testing.T) { 234 | encoder := QuantileScaler{} 235 | encoder.Fit(nil) 236 | assert.Equal(t, QuantileScaler{}, 
encoder) 237 | }) 238 | 239 | t.Run("nquantiles is zero in beginning", func(t *testing.T) { 240 | encoder := QuantileScaler{} 241 | encoder.Fit(nil) 242 | assert.Equal(t, QuantileScaler{}, encoder) 243 | }) 244 | } 245 | -------------------------------------------------------------------------------- /transformers/textprocessors_test.go: -------------------------------------------------------------------------------- 1 | package transformers_test 2 | 3 | import ( 4 | "testing" 5 | 6 | . "github.com/nikolaydubina/go-featureprocessing/transformers" 7 | "github.com/stretchr/testify/assert" 8 | ) 9 | 10 | func TestCountVectorizer(t *testing.T) { 11 | samplesFit := []struct { 12 | name string 13 | input []string 14 | output map[string]uint 15 | }{ 16 | {"basic", []string{"a b", "b a", "a", "b", ""}, map[string]uint{"a": 0, "b": 1}}, 17 | {"same_string", []string{"a", "a", "a"}, map[string]uint{"a": 0}}, 18 | {"empty_string", []string{"", "", ""}, map[string]uint{}}, 19 | {"zeros_single", []string{""}, map[string]uint{}}, 20 | {"single", []string{"a"}, map[string]uint{"a": 0}}, 21 | {"empty", nil, nil}, 22 | } 23 | 24 | for _, s := range samplesFit { 25 | t.Run(s.name, func(t *testing.T) { 26 | encoder := CountVectorizer{} 27 | encoder.Fit(s.input) 28 | assert.Equal(t, CountVectorizer{Mapping: s.output, Separator: " "}, encoder) 29 | }) 30 | } 31 | 32 | t.Run("num features is zero for nil encoder", func(t *testing.T) { 33 | var encoder *CountVectorizer 34 | assert.Equal(t, 0, encoder.NumFeatures()) 35 | }) 36 | 37 | t.Run("transform returns nil on nil encoder", func(t *testing.T) { 38 | var encoder *CountVectorizer 39 | assert.Equal(t, []float64(nil), encoder.Transform("asdf")) 40 | }) 41 | 42 | t.Run("feature names on empty transformer", func(t *testing.T) { 43 | var encoder *CountVectorizer 44 | assert.Equal(t, []string(nil), encoder.FeatureNames()) 45 | }) 46 | 47 | t.Run("feature names", func(t *testing.T) { 48 | encoder := CountVectorizer{Mapping: map[string]uint{"a": 1, "b": 0}} 49 | assert.Equal(t, []string{"b", "a"}, encoder.FeatureNames()) 50 | }) 51 | 52 | samplesTransform := []struct { 53 | name string 54 | sep string 55 | mapping map[string]uint 56 | input string 57 | output []float64 58 | }{ 59 | {"empty string", "", map[string]uint{"a": 0, "b": 1, "c": 2}, "a b c", []float64{0, 0, 0}}, 60 | {"no separator", " ", map[string]uint{"a": 0, "b": 1, "c": 2}, "a", []float64{1, 0, 0}}, 61 | {"no separator repeating not counted", " ", map[string]uint{"a": 0, "b": 1, "c": 2}, "aaa", []float64{0, 0, 0}}, 62 | {"no separator utf-8", " ", map[string]uint{"안녕": 0, "b": 1, "c": 2}, "안녕", []float64{1, 0, 0}}, 63 | {"no separator utf-8 repeating not counted", " ", map[string]uint{"a": 0, "b": 1, "c": 2}, "안녕안녕안녕", []float64{0, 0, 0}}, 64 | {"basic", " ", map[string]uint{"a": 0, "b": 1, "c": 2}, "a b c", []float64{1, 1, 1}}, 65 | {"ending with separator", " ", map[string]uint{"a": 0, "b": 1, "c": 2}, "a b c ", []float64{1, 1, 1}}, 66 | {"separators continuosly", " ", map[string]uint{"a": 0, "b": 1, "c": 2}, " a b c ", []float64{1, 1, 1}}, 67 | {"counting", " ", map[string]uint{"a": 0, "b": 1, "c": 2}, "a a a b b c", []float64{3, 2, 1}}, 68 | } 69 | 70 | for _, s := range samplesTransform { 71 | t.Run("transform_inplace_"+s.name, func(t *testing.T) { 72 | tr := CountVectorizer{Separator: s.sep, Mapping: s.mapping} 73 | assert.Equal(t, s.output, tr.Transform(s.input)) 74 | }) 75 | } 76 | } 77 | 78 | func TestTFIDFVectorizerFit(t *testing.T) { 79 | samples := []struct { 80 | name string 81 | ndocs int 
82 | doccount []uint 83 | mapping map[string]uint 84 | input []string 85 | numFeatures int 86 | }{ 87 | {"basic", 6, []uint{6, 1, 2}, map[string]uint{"a": 0, "b": 1, "c": 2}, []string{"a a a b b", "a a a c", "a a", "a a a", "a a a a", "a a a c c"}, 3}, 88 | {"empty encoder empty input", 0, []uint(nil), map[string]uint(nil), nil, 0}, 89 | } 90 | 91 | for _, s := range samples { 92 | t.Run(s.name, func(t *testing.T) { 93 | encoder := TFIDFVectorizer{} 94 | expectedEncoder := TFIDFVectorizer{ 95 | CountVectorizer: CountVectorizer{Mapping: s.mapping, Separator: " "}, 96 | NumDocuments: s.ndocs, 97 | DocCount: s.doccount, 98 | } 99 | encoder.Fit(s.input) 100 | assert.Equal(t, expectedEncoder, encoder) 101 | assert.Equal(t, s.numFeatures, encoder.NumFeatures()) 102 | }) 103 | } 104 | 105 | t.Run("transofmer is nil", func(t *testing.T) { 106 | var encoder *TFIDFVectorizer 107 | assert.Equal(t, []float64(nil), encoder.Transform("asdf asdf")) 108 | assert.Equal(t, 0, encoder.NumFeatures()) 109 | }) 110 | } 111 | 112 | // test is based on data from: https://scikit-learn.org/stable/modules/feature_extraction.html 113 | func TestTFIDFVectorizerTransform(t *testing.T) { 114 | samples := []struct { 115 | name string 116 | ndocs int 117 | doccount []uint 118 | mapping map[string]uint 119 | input string 120 | output []float64 121 | }{ 122 | {"basic_1", 6, []uint{6, 1, 2}, map[string]uint{"a": 0, "b": 1, "c": 2}, "a a a c", []float64{0.8194099510753755, 0, 0.5732079309279058}}, 123 | {"basic_2", 6, []uint{6, 1, 2}, map[string]uint{"a": 0, "b": 1, "c": 2}, "a a", []float64{1, 0, 0}}, 124 | {"basic_3", 6, []uint{6, 1, 2}, map[string]uint{"a": 0, "b": 1, "c": 2}, "a a a", []float64{1, 0, 0}}, 125 | {"basic_4", 6, []uint{6, 1, 2}, map[string]uint{"a": 0, "b": 1, "c": 2}, "a a a a", []float64{1, 0, 0}}, 126 | {"basic_5", 6, []uint{6, 1, 2}, map[string]uint{"a": 0, "b": 1, "c": 2}, "a a a b b", []float64{0.47330339145578754, 0.8808994832762984, 0}}, 127 | {"basic_6", 6, []uint{6, 1, 2}, map[string]uint{"a": 0, "b": 1, "c": 2}, "a a a c c", []float64{0.58149260706886, 0, 0.8135516873095773}}, 128 | {"not found", 6, []uint{6, 1, 2}, map[string]uint{"a": 0, "b": 1, "c": 2}, "dddd", []float64{0, 0, 0}}, 129 | {"empty input", 2, []uint{1, 2}, map[string]uint{"a": 0, "b": 1}, " ", []float64{0, 0}}, 130 | {"empty vals", 2, []uint{1, 2}, map[string]uint{}, " b a ", []float64{}}, 131 | {"nil input", 2, []uint{1, 2}, map[string]uint{}, "", []float64{}}, 132 | } 133 | 134 | for _, s := range samples { 135 | t.Run(s.name, func(t *testing.T) { 136 | encoder := TFIDFVectorizer{ 137 | CountVectorizer: CountVectorizer{Mapping: s.mapping, Separator: " "}, 138 | NumDocuments: s.ndocs, 139 | DocCount: s.doccount, 140 | } 141 | assert.Equal(t, s.output, encoder.Transform(s.input)) 142 | }) 143 | 144 | if len(s.output) > 0 { 145 | t.Run(s.name+"_inplace", func(t *testing.T) { 146 | encoder := TFIDFVectorizer{ 147 | CountVectorizer: CountVectorizer{Mapping: s.mapping, Separator: " "}, 148 | NumDocuments: s.ndocs, 149 | DocCount: s.doccount, 150 | } 151 | 152 | features := make([]float64, encoder.NumFeatures()) 153 | encoder.TransformInplace(features, s.input) 154 | assert.Equal(t, s.output, features) 155 | 156 | // note, values in copied range should be zero 157 | features = make([]float64, encoder.NumFeatures()+100) 158 | features[0] = 11223344556677 159 | features[1] = 10101010110101 160 | features[99] = 1231231231 161 | 162 | expected := make([]float64, len(features)) 163 | copy(expected, features) 164 | copy(expected[10:], 
s.output) 165 | 166 | encoder.TransformInplace(features[10:10+encoder.NumFeatures()], s.input) 167 | assert.Equal(t, expected, features) 168 | }) 169 | } 170 | } 171 | 172 | t.Run("inplace does not run when dest len is not equal num features", func(t *testing.T) { 173 | encoder := TFIDFVectorizer{ 174 | CountVectorizer: CountVectorizer{Mapping: map[string]uint{"a": 0, "b": 1}, Separator: " "}, 175 | NumDocuments: 5, 176 | DocCount: []uint{2, 5}, 177 | } 178 | 179 | features := []float64{1, 2, 3, 4} 180 | encoder.TransformInplace(features, "a b c d") 181 | assert.Equal(t, []float64{1, 2, 3, 4}, features) 182 | }) 183 | } 184 | 185 | func TestTFIDFVectorizerFeatureNames(t *testing.T) { 186 | t.Run("feature names on empty transformer", func(t *testing.T) { 187 | var encoder *TFIDFVectorizer 188 | assert.Equal(t, []string(nil), encoder.FeatureNames()) 189 | }) 190 | 191 | t.Run("feature names", func(t *testing.T) { 192 | encoder := TFIDFVectorizer{CountVectorizer: CountVectorizer{Mapping: map[string]uint{"a": 1, "b": 0}}} 193 | assert.Equal(t, []string{"b", "a"}, encoder.FeatureNames()) 194 | }) 195 | } 196 | -------------------------------------------------------------------------------- /cmd/generate/tests/readme_test.go: -------------------------------------------------------------------------------- 1 | package examplemodule 2 | 3 | import ( 4 | "encoding/json" 5 | "testing" 6 | 7 | . "github.com/nikolaydubina/go-featureprocessing/transformers" 8 | "github.com/stretchr/testify/assert" 9 | ) 10 | 11 | func TestEmployeeFeatureTransformerReadme(t *testing.T) { 12 | t.Run("transform", func(t *testing.T) { 13 | employee := Employee{ 14 | Age: 22, 15 | Salary: 1000.0, 16 | Kids: 2, 17 | Weight: 85.1, 18 | Height: 160.0, 19 | City: "Pangyo", 20 | Car: "Tesla", 21 | Income: 9000.1, 22 | SecretValue: 42, 23 | Description: "large text fields are not a problem neither, tf-idf can help here too! more advanced NLP will be added later!", 24 | } 25 | 26 | tr := EmployeeFeatureTransformer{ 27 | Salary: MinMaxScaler{Min: 500, Max: 900}, 28 | Kids: MaxAbsScaler{Max: 4}, 29 | Weight: StandardScaler{Mean: 60, STD: 25}, 30 | Height: QuantileScaler{Quantiles: []float64{20, 100, 110, 120, 150}}, 31 | City: OneHotEncoder{Mapping: map[string]uint{"Pangyo": 0, "Seoul": 1, "Daejeon": 2, "Busan": 3}}, 32 | Car: OrdinalEncoder{Mapping: map[string]uint{"Tesla": 1, "BMW": 90000}}, 33 | Income: KBinsDiscretizer{QuantileScaler: QuantileScaler{Quantiles: []float64{1000, 1100, 2000, 3000, 10000}}}, 34 | Description: TFIDFVectorizer{ 35 | NumDocuments: 2, 36 | DocCount: []uint{1, 2, 2}, 37 | CountVectorizer: CountVectorizer{Mapping: map[string]uint{"text": 0, "problem": 1, "help": 2}, Separator: " "}, 38 | }, 39 | } 40 | 41 | features := tr.Transform(&employee) 42 | expected := []float64{22, 1, 0.5, 1.0039999999999998, 1, 1, 0, 0, 0, 1, 5, 0.7674945674619879, 0.4532946552278861, 0.4532946552278861} 43 | assert.Equal(t, expected, features) 44 | }) 45 | 46 | t.Run("transform_all", func(t *testing.T) { 47 | employee := Employee{ 48 | Age: 22, 49 | Salary: 1000.0, 50 | Kids: 2, 51 | Weight: 85.1, 52 | Height: 160.0, 53 | City: "Pangyo", 54 | Car: "Tesla", 55 | Income: 9000.1, 56 | SecretValue: 42, 57 | Description: "large text fields are not a problem neither, tf-idf can help here too! 
more advanced NLP will be added later!", 58 | } 59 | 60 | employees := []Employee{ 61 | employee, 62 | employee, 63 | employee, 64 | } 65 | 66 | tr := EmployeeFeatureTransformer{ 67 | Salary: MinMaxScaler{Min: 500, Max: 900}, 68 | Kids: MaxAbsScaler{Max: 4}, 69 | Weight: StandardScaler{Mean: 60, STD: 25}, 70 | Height: QuantileScaler{Quantiles: []float64{20, 100, 110, 120, 150}}, 71 | City: OneHotEncoder{Mapping: map[string]uint{"Pangyo": 0, "Seoul": 1, "Daejeon": 2, "Busan": 3}}, 72 | Car: OrdinalEncoder{Mapping: map[string]uint{"Tesla": 1, "BMW": 90000}}, 73 | Income: KBinsDiscretizer{QuantileScaler: QuantileScaler{Quantiles: []float64{1000, 1100, 2000, 3000, 10000}}}, 74 | Description: TFIDFVectorizer{ 75 | NumDocuments: 2, 76 | DocCount: []uint{1, 2, 2}, 77 | CountVectorizer: CountVectorizer{Mapping: map[string]uint{"text": 0, "problem": 1, "help": 2}, Separator: " "}, 78 | }, 79 | } 80 | 81 | features := tr.TransformAll(employees) 82 | expectedOne := []float64{22, 1, 0.5, 1.0039999999999998, 1, 1, 0, 0, 0, 1, 5, 0.7674945674619879, 0.4532946552278861, 0.4532946552278861} 83 | var expected []float64 84 | expected = append(expected, expectedOne...) 85 | expected = append(expected, expectedOne...) 86 | expected = append(expected, expectedOne...) 87 | assert.Equal(t, expected, features) 88 | }) 89 | 90 | t.Run("transform_all_parallel", func(t *testing.T) { 91 | employee := Employee{ 92 | Age: 22, 93 | Salary: 1000.0, 94 | Kids: 2, 95 | Weight: 85.1, 96 | Height: 160.0, 97 | City: "Pangyo", 98 | Car: "Tesla", 99 | Income: 9000.1, 100 | SecretValue: 42, 101 | Description: "large text fields are not a problem neither, tf-idf can help here too! more advanced NLP will be added later!", 102 | } 103 | 104 | employees := []Employee{ 105 | employee, 106 | employee, 107 | employee, 108 | employee, 109 | employee, 110 | employee, 111 | } 112 | 113 | tr := EmployeeFeatureTransformer{ 114 | Salary: MinMaxScaler{Min: 500, Max: 900}, 115 | Kids: MaxAbsScaler{Max: 4}, 116 | Weight: StandardScaler{Mean: 60, STD: 25}, 117 | Height: QuantileScaler{Quantiles: []float64{20, 100, 110, 120, 150}}, 118 | City: OneHotEncoder{Mapping: map[string]uint{"Pangyo": 0, "Seoul": 1, "Daejeon": 2, "Busan": 3}}, 119 | Car: OrdinalEncoder{Mapping: map[string]uint{"Tesla": 1, "BMW": 90000}}, 120 | Income: KBinsDiscretizer{QuantileScaler: QuantileScaler{Quantiles: []float64{1000, 1100, 2000, 3000, 10000}}}, 121 | Description: TFIDFVectorizer{ 122 | NumDocuments: 2, 123 | DocCount: []uint{1, 2, 2}, 124 | CountVectorizer: CountVectorizer{Mapping: map[string]uint{"text": 0, "problem": 1, "help": 2}, Separator: " "}, 125 | }, 126 | } 127 | 128 | features := tr.TransformAllParallel(employees, 3) 129 | expectedOne := []float64{22, 1, 0.5, 1.0039999999999998, 1, 1, 0, 0, 0, 1, 5, 0.7674945674619879, 0.4532946552278861, 0.4532946552278861} 130 | var expected []float64 131 | expected = append(expected, expectedOne...) 132 | expected = append(expected, expectedOne...) 133 | expected = append(expected, expectedOne...) 134 | expected = append(expected, expectedOne...) 135 | expected = append(expected, expectedOne...) 136 | expected = append(expected, expectedOne...) 
137 | assert.Equal(t, expected, features) 138 | }) 139 | 140 | t.Run("feature names", func(t *testing.T) { 141 | tr := EmployeeFeatureTransformer{ 142 | Salary: MinMaxScaler{Min: 500, Max: 900}, 143 | Kids: MaxAbsScaler{Max: 4}, 144 | Weight: StandardScaler{Mean: 60, STD: 25}, 145 | Height: QuantileScaler{Quantiles: []float64{20, 100, 110, 120, 150}}, 146 | City: OneHotEncoder{Mapping: map[string]uint{"Pangyo": 0, "Seoul": 1, "Daejeon": 2, "Busan": 3}}, 147 | Car: OrdinalEncoder{Mapping: map[string]uint{"Tesla": 1, "BMW": 90000}}, 148 | Income: KBinsDiscretizer{QuantileScaler: QuantileScaler{Quantiles: []float64{1000, 1100, 2000, 3000, 10000}}}, 149 | Description: TFIDFVectorizer{ 150 | NumDocuments: 2, 151 | DocCount: []uint{1, 2, 2}, 152 | CountVectorizer: CountVectorizer{Mapping: map[string]uint{"text": 0, "problem": 1, "help": 2}, Separator: " "}, 153 | }, 154 | } 155 | names := tr.FeatureNames() 156 | expected := []string{"Age", "Salary", "Kids", "Weight", "Height", "City_Pangyo", "City_Seoul", "City_Daejeon", "City_Busan", "Car", "Income", "Description_text", "Description_problem", "Description_help"} 157 | assert.Equal(t, expected, names) 158 | }) 159 | 160 | t.Run("feature names empty categorical skipped", func(t *testing.T) { 161 | tr := EmployeeFeatureTransformer{} 162 | names := tr.FeatureNames() 163 | expected := []string{"Age", "Salary", "Kids", "Weight", "Height", "Car", "Income"} 164 | assert.Equal(t, expected, names) 165 | }) 166 | 167 | t.Run("fit", func(t *testing.T) { 168 | employee := []Employee{ 169 | { 170 | Age: 22, 171 | Salary: 500.0, 172 | Kids: 2, 173 | Weight: 50, 174 | Height: 160.0, 175 | City: "Pangyo", 176 | Car: "Tesla", 177 | Income: 9000.1, 178 | SecretValue: 42, 179 | Description: "text problem help", 180 | }, 181 | { 182 | Age: 10, 183 | Salary: 900.0, 184 | Kids: 0, 185 | Weight: 10, 186 | Height: 120.0, 187 | City: "Seoul", 188 | Car: "BMW", 189 | Income: 420.1, 190 | Description: "problem help", 191 | }, 192 | } 193 | 194 | tr := EmployeeFeatureTransformer{} 195 | tr.Fit(employee) 196 | 197 | trExpected := EmployeeFeatureTransformer{ 198 | Salary: MinMaxScaler{Min: 500, Max: 900}, 199 | Kids: MaxAbsScaler{Max: 2}, 200 | Weight: StandardScaler{Mean: 30, STD: 28.284271247461902}, 201 | Height: QuantileScaler{Quantiles: []float64{120, 160}}, 202 | City: OneHotEncoder{Mapping: map[string]uint{"Pangyo": 0, "Seoul": 1}}, 203 | Car: OrdinalEncoder{Mapping: map[string]uint{"Tesla": 1, "BMW": 2}}, 204 | Income: KBinsDiscretizer{QuantileScaler: QuantileScaler{Quantiles: []float64{420.1, 9000.1}}}, 205 | Description: TFIDFVectorizer{ 206 | NumDocuments: 2, 207 | DocCount: []uint{1, 2, 2}, 208 | CountVectorizer: CountVectorizer{Mapping: map[string]uint{"text": 0, "problem": 1, "help": 2}, Separator: " "}, 209 | }, 210 | } 211 | 212 | assert.Equal(t, trExpected, tr) 213 | }) 214 | 215 | t.Run("serialize transformer", func(t *testing.T) { 216 | tr := EmployeeFeatureTransformer{ 217 | Salary: MinMaxScaler{Min: 500, Max: 900}, 218 | Kids: MaxAbsScaler{Max: 4}, 219 | Weight: StandardScaler{Mean: 60, STD: 25}, 220 | Height: QuantileScaler{Quantiles: []float64{20, 100, 110, 120, 150}}, 221 | City: OneHotEncoder{Mapping: map[string]uint{"Pangyo": 0, "Seoul": 1, "Daejeon": 2, "Busan": 3}}, 222 | Car: OrdinalEncoder{Mapping: map[string]uint{"Tesla": 1, "BMW": 90000}}, 223 | Income: KBinsDiscretizer{QuantileScaler: QuantileScaler{Quantiles: []float64{1000, 1100, 2000, 3000, 10000}}}, 224 | Description: TFIDFVectorizer{ 225 | NumDocuments: 2, 226 | DocCount: []uint{1, 2, 
2}, 227 | CountVectorizer: CountVectorizer{Mapping: map[string]uint{"text": 0, "problem": 1, "help": 2}, Separator: " "}, 228 | }, 229 | } 230 | 231 | output, err := json.MarshalIndent(tr, "", " ") 232 | outputStr := string(output) 233 | expected := `{ 234 | "Age_identity": {}, 235 | "Salary_minmax": { 236 | "Min": 500, 237 | "Max": 900 238 | }, 239 | "Kids_maxabs": { 240 | "Max": 4 241 | }, 242 | "Weight_standard": { 243 | "Mean": 60, 244 | "STD": 25 245 | }, 246 | "Height_quantile": { 247 | "Quantiles": [ 248 | 20, 249 | 100, 250 | 110, 251 | 120, 252 | 150 253 | ] 254 | }, 255 | "City_onehot": { 256 | "Mapping": { 257 | "Busan": 3, 258 | "Daejeon": 2, 259 | "Pangyo": 0, 260 | "Seoul": 1 261 | } 262 | }, 263 | "Car_ordinal": { 264 | "Mapping": { 265 | "BMW": 90000, 266 | "Tesla": 1 267 | } 268 | }, 269 | "Income_kbins": { 270 | "Quantiles": [ 271 | 1000, 272 | 1100, 273 | 2000, 274 | 3000, 275 | 10000 276 | ] 277 | }, 278 | "Description_tfidf": { 279 | "Mapping": { 280 | "help": 2, 281 | "problem": 1, 282 | "text": 0 283 | }, 284 | "Separator": " ", 285 | "DocCount": [ 286 | 1, 287 | 2, 288 | 2 289 | ], 290 | "NumDocuments": 2, 291 | "Normalizer": {} 292 | } 293 | }` 294 | assert.Nil(t, err) 295 | assert.Equal(t, expected, outputStr) 296 | }) 297 | } 298 | -------------------------------------------------------------------------------- /cmd/generate/tests/with32fieldsfp_test.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/google/gofuzz" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | // makeMock creates some valid With32FieldsFeatureTransformer by fitting on fuzzy data. 14 | // This function is handy for tests. 
15 | func makeMockWith32FieldsFeatureTransformer() *With32FieldsFeatureTransformer { 16 | s := make([]With32Fields, 10) 17 | fuzz.New().NilChance(0).NumElements(10, 10).Fuzz(&s) 18 | 19 | tr := With32FieldsFeatureTransformer{} 20 | tr.Fit(s) 21 | return &tr 22 | } 23 | 24 | func TestWith32FieldsFeatureTransformerFeatureNames(t *testing.T) { 25 | tr := makeMockWith32FieldsFeatureTransformer() 26 | 27 | t.Run("feature names", func(t *testing.T) { 28 | names := tr.FeatureNames() 29 | assert.True(t, len(names) > 0) 30 | assert.Equal(t, len(names), tr.NumFeatures()) 31 | }) 32 | 33 | t.Run("feature name transformer is empty", func(t *testing.T) { 34 | tr := With32FieldsFeatureTransformer{} 35 | names := tr.FeatureNames() 36 | assert.True(t, len(names) > 0) 37 | assert.Equal(t, len(names), tr.NumFeatures()) 38 | }) 39 | 40 | t.Run("feature name transformer is nil", func(t *testing.T) { 41 | var tr *With32FieldsFeatureTransformer 42 | names := tr.FeatureNames() 43 | assert.Nil(t, names) 44 | }) 45 | } 46 | 47 | func TestWith32FieldsFeatureTransformerTransform(t *testing.T) { 48 | tr := makeMockWith32FieldsFeatureTransformer() 49 | 50 | t.Run("empty struct", func(t *testing.T) { 51 | s := With32Fields{} 52 | features := tr.Transform(&s) 53 | 54 | assert.NotNil(t, features) 55 | assert.True(t, len(features) > 0) 56 | assert.Equal(t, tr.NumFeatures(), len(features)) 57 | }) 58 | 59 | t.Run("fuzzy struct", func(t *testing.T) { 60 | var s With32Fields 61 | fuzz.New().Fuzz(&s) 62 | 63 | tr := With32FieldsFeatureTransformer{} 64 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&tr) 65 | 66 | features := tr.Transform(&s) 67 | 68 | assert.NotNil(t, features) 69 | assert.True(t, len(features) > 0) 70 | assert.Equal(t, tr.NumFeatures(), len(features)) 71 | }) 72 | 73 | t.Run("struct is nil", func(t *testing.T) { 74 | var s *With32Fields 75 | features := tr.Transform(s) 76 | assert.Nil(t, features) 77 | assert.True(t, tr.NumFeatures() > 0) 78 | }) 79 | 80 | t.Run("transformer is nil", func(t *testing.T) { 81 | var s With32Fields 82 | fuzz.New().Fuzz(&s) 83 | 84 | var tr *With32FieldsFeatureTransformer 85 | features := tr.Transform(&s) 86 | 87 | assert.Nil(t, features) 88 | assert.Equal(t, tr.NumFeatures(), 0) 89 | }) 90 | 91 | t.Run("serialize and deserialize transformer", func(t *testing.T) { 92 | output, err := json.Marshal(tr) 93 | assert.Nil(t, err) 94 | assert.NotEmpty(t, output) 95 | 96 | var tr2 With32FieldsFeatureTransformer 97 | err = json.Unmarshal(output, &tr2) 98 | assert.Nil(t, err) 99 | assert.Equal(t, *tr, tr2) 100 | }) 101 | 102 | t.Run("inplace transform does not run when destination does not match num features", func(t *testing.T) { 103 | var s With32Fields 104 | fuzz.New().Fuzz(&s) 105 | 106 | tr := With32FieldsFeatureTransformer{} 107 | 108 | features := make([]float64, 1000) 109 | features[0] = 123456789.0 110 | tr.TransformInplace(features, &s) 111 | 112 | assert.Equal(t, 123456789.0, features[0]) 113 | }) 114 | } 115 | 116 | func TestWith32FieldsFeatureTransformerTransformAll(t *testing.T) { 117 | t.Run("when transformer is nil", func(t *testing.T) { 118 | s := make([]With32Fields, 100) 119 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 120 | 121 | dst := make([]float64, 100*100) 122 | 123 | var tr *With32FieldsFeatureTransformer 124 | assert.Nil(t, tr.TransformAll(s)) 125 | assert.Nil(t, tr.TransformAllParallel(s, 4)) 126 | 127 | // does not panic 128 | tr.TransformAllInplace(dst, s) 129 | tr.TransformAllInplaceParallel(dst, s, 4) 130 | }) 131 | 132 | t.Run("inplace with 
wrong output dimensions, output is smaller", func(t *testing.T) { 133 | s := make([]With32Fields, 100) 134 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 135 | 136 | dst := make([]float64, 100) 137 | 138 | tr := makeMockWith32FieldsFeatureTransformer() 139 | 140 | // does not panic 141 | tr.TransformAllInplace(dst, s) 142 | tr.TransformAllInplaceParallel(dst, s, 4) 143 | }) 144 | 145 | t.Run("inplace with wrong output dimensions, output is bigger", func(t *testing.T) { 146 | s := make([]With32Fields, 100) 147 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 148 | 149 | dst := make([]float64, 100*120) 150 | 151 | tr := makeMockWith32FieldsFeatureTransformer() 152 | 153 | // does not panic 154 | tr.TransformAllInplace(dst, s) 155 | tr.TransformAllInplaceParallel(dst, s, 4) 156 | }) 157 | 158 | t.Run("transform all", func(t *testing.T) { 159 | s := make([]With32Fields, 100) 160 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 161 | 162 | tr := makeMockWith32FieldsFeatureTransformer() 163 | 164 | features := tr.TransformAll(s) 165 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 166 | }) 167 | 168 | t.Run("transform all parallel 1 worker", func(t *testing.T) { 169 | s := make([]With32Fields, 100) 170 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 171 | 172 | tr := makeMockWith32FieldsFeatureTransformer() 173 | 174 | features := tr.TransformAllParallel(s, 1) 175 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 176 | }) 177 | 178 | t.Run("transform all parallel 4 workers", func(t *testing.T) { 179 | s := make([]With32Fields, 100) 180 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 181 | 182 | tr := makeMockWith32FieldsFeatureTransformer() 183 | 184 | features := tr.TransformAllParallel(s, 4) 185 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 186 | }) 187 | } 188 | 189 | func TestWith32FieldsFeatureTransformerFit(t *testing.T) { 190 | t.Run("fuzzy input", func(t *testing.T) { 191 | s := make([]With32Fields, 10) 192 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&s) 193 | 194 | trEmpty := With32FieldsFeatureTransformer{} 195 | tr := With32FieldsFeatureTransformer{} 196 | tr.Fit(s) 197 | 198 | assert.NotNil(t, tr) 199 | assert.NotEqual(t, tr, trEmpty) 200 | }) 201 | 202 | t.Run("not nil transformer nil input", func(t *testing.T) { 203 | trEmpty := With32FieldsFeatureTransformer{} 204 | tr := With32FieldsFeatureTransformer{} 205 | tr.Fit(nil) 206 | 207 | assert.Equal(t, trEmpty, tr) 208 | }) 209 | 210 | t.Run("nil transformer not nil input", func(t *testing.T) { 211 | s := make([]With32Fields, 10) 212 | 213 | var tr *With32FieldsFeatureTransformer 214 | tr.Fit(s) 215 | 216 | assert.Nil(t, tr) 217 | }) 218 | } 219 | 220 | func fitTransformerWith32Fields(b *testing.B, numelem int) { 221 | s := make([]With32Fields, numelem) 222 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 223 | 224 | var tr With32FieldsFeatureTransformer 225 | 226 | b.ResetTimer() 227 | for n := 0; n < b.N; n++ { 228 | tr.Fit(s) 229 | } 230 | } 231 | 232 | func BenchmarkWith32FieldsFeatureTransformer_Fit_100elements(b *testing.B) { 233 | fitTransformerWith32Fields(b, 100) 234 | } 235 | 236 | func BenchmarkWith32FieldsFeatureTransformer_Fit_1000elements(b *testing.B) { 237 | fitTransformerWith32Fields(b, 1000) 238 | } 239 | 240 | func BenchmarkWith32FieldsFeatureTransformer_Fit_10000elements(b *testing.B) { 241 | fitTransformerWith32Fields(b, 10000) 242 | } 243 | 244 | func BenchmarkWith32FieldsFeatureTransformer_Transform(b *testing.B) { 245 
| var s With32Fields 246 | fuzz.New().Fuzz(&s) 247 | 248 | tr := makeMockWith32FieldsFeatureTransformer() 249 | 250 | b.ResetTimer() 251 | for n := 0; n < b.N; n++ { 252 | tr.Transform(&s) 253 | } 254 | } 255 | 256 | func BenchmarkWith32FieldsFeatureTransformer_Transform_Inplace(b *testing.B) { 257 | var s With32Fields 258 | fuzz.New().Fuzz(&s) 259 | 260 | tr := makeMockWith32FieldsFeatureTransformer() 261 | 262 | features := make([]float64, tr.NumFeatures()) 263 | 264 | b.ResetTimer() 265 | for n := 0; n < b.N; n++ { 266 | tr.TransformInplace(features, &s) 267 | } 268 | } 269 | 270 | func benchTransformAllWith32Fields(b *testing.B, numelem int) { 271 | s := make([]With32Fields, numelem) 272 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 273 | 274 | tr := makeMockWith32FieldsFeatureTransformer() 275 | 276 | b.ResetTimer() 277 | for n := 0; n < b.N; n++ { 278 | tr.TransformAll(s) 279 | } 280 | } 281 | 282 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_10elems(b *testing.B) { 283 | benchTransformAllWith32Fields(b, 10) 284 | } 285 | 286 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_100elems(b *testing.B) { 287 | benchTransformAllWith32Fields(b, 100) 288 | } 289 | 290 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_1000elems(b *testing.B) { 291 | benchTransformAllWith32Fields(b, 1000) 292 | } 293 | 294 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_10000elems(b *testing.B) { 295 | benchTransformAllWith32Fields(b, 10000) 296 | } 297 | 298 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_100000elems(b *testing.B) { 299 | benchTransformAllWith32Fields(b, 100000) 300 | } 301 | 302 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_1000000elems(b *testing.B) { 303 | benchTransformAllWith32Fields(b, 1000000) 304 | } 305 | 306 | func benchTransformAllParallelWith32Fields(b *testing.B, numelem int, nworkers uint) { 307 | s := make([]With32Fields, numelem) 308 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 309 | 310 | tr := makeMockWith32FieldsFeatureTransformer() 311 | 312 | b.ResetTimer() 313 | for n := 0; n < b.N; n++ { 314 | tr.TransformAllParallel(s, nworkers) 315 | } 316 | } 317 | 318 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_10elems_8workers(b *testing.B) { 319 | benchTransformAllParallelWith32Fields(b, 10, 8) 320 | } 321 | 322 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_100elems_8workers(b *testing.B) { 323 | benchTransformAllParallelWith32Fields(b, 100, 8) 324 | } 325 | 326 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_1000elems_8workers(b *testing.B) { 327 | benchTransformAllParallelWith32Fields(b, 1000, 8) 328 | } 329 | 330 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_10000elems_8workers(b *testing.B) { 331 | benchTransformAllParallelWith32Fields(b, 10000, 8) 332 | } 333 | 334 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_100000elems_8workers(b *testing.B) { 335 | benchTransformAllParallelWith32Fields(b, 100000, 8) 336 | } 337 | 338 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_1000000elems_8workers(b *testing.B) { 339 | benchTransformAllParallelWith32Fields(b, 1000000, 8) 340 | } 341 | 342 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_5000000elems_8workers(b *testing.B) { 343 | benchTransformAllParallelWith32Fields(b, 5000000, 8) 344 | } 345 | 346 | func BenchmarkWith32FieldsFeatureTransformer_TransformAll_15000000elems_8workers(b *testing.B) { 347 | 
benchTransformAllParallelWith32Fields(b, 15000000, 8) 348 | } 349 | -------------------------------------------------------------------------------- /cmd/generate/tests/employeefp_test.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/google/gofuzz" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | // makeMock creates some valid EmployeeFeatureTransformer by fitting on fuzzy data. 14 | // This function is handy for tests. 15 | func makeMockEmployeeFeatureTransformer() *EmployeeFeatureTransformer { 16 | s := make([]Employee, 10) 17 | fuzz.New().NilChance(0).NumElements(10, 10).Fuzz(&s) 18 | 19 | tr := EmployeeFeatureTransformer{} 20 | tr.Fit(s) 21 | return &tr 22 | } 23 | 24 | func TestEmployeeFeatureTransformerFeatureNames(t *testing.T) { 25 | tr := makeMockEmployeeFeatureTransformer() 26 | 27 | t.Run("feature names", func(t *testing.T) { 28 | names := tr.FeatureNames() 29 | assert.True(t, len(names) > 0) 30 | assert.Equal(t, len(names), tr.NumFeatures()) 31 | }) 32 | 33 | t.Run("feature name transformer is empty", func(t *testing.T) { 34 | tr := EmployeeFeatureTransformer{} 35 | names := tr.FeatureNames() 36 | assert.True(t, len(names) > 0) 37 | assert.Equal(t, len(names), tr.NumFeatures()) 38 | }) 39 | 40 | t.Run("feature name transformer is nil", func(t *testing.T) { 41 | var tr *EmployeeFeatureTransformer 42 | names := tr.FeatureNames() 43 | assert.Nil(t, names) 44 | }) 45 | } 46 | 47 | func TestEmployeeFeatureTransformerTransform(t *testing.T) { 48 | tr := makeMockEmployeeFeatureTransformer() 49 | 50 | t.Run("empty struct", func(t *testing.T) { 51 | s := Employee{} 52 | features := tr.Transform(&s) 53 | 54 | assert.NotNil(t, features) 55 | assert.True(t, len(features) > 0) 56 | assert.Equal(t, tr.NumFeatures(), len(features)) 57 | }) 58 | 59 | t.Run("fuzzy struct", func(t *testing.T) { 60 | var s Employee 61 | fuzz.New().Fuzz(&s) 62 | 63 | tr := EmployeeFeatureTransformer{} 64 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&tr) 65 | 66 | features := tr.Transform(&s) 67 | 68 | assert.NotNil(t, features) 69 | assert.True(t, len(features) > 0) 70 | assert.Equal(t, tr.NumFeatures(), len(features)) 71 | }) 72 | 73 | t.Run("struct is nil", func(t *testing.T) { 74 | var s *Employee 75 | features := tr.Transform(s) 76 | assert.Nil(t, features) 77 | assert.True(t, tr.NumFeatures() > 0) 78 | }) 79 | 80 | t.Run("transformer is nil", func(t *testing.T) { 81 | var s Employee 82 | fuzz.New().Fuzz(&s) 83 | 84 | var tr *EmployeeFeatureTransformer 85 | features := tr.Transform(&s) 86 | 87 | assert.Nil(t, features) 88 | assert.Equal(t, tr.NumFeatures(), 0) 89 | }) 90 | 91 | t.Run("serialize and deserialize transformer", func(t *testing.T) { 92 | output, err := json.Marshal(tr) 93 | assert.Nil(t, err) 94 | assert.NotEmpty(t, output) 95 | 96 | var tr2 EmployeeFeatureTransformer 97 | err = json.Unmarshal(output, &tr2) 98 | assert.Nil(t, err) 99 | assert.Equal(t, *tr, tr2) 100 | }) 101 | 102 | t.Run("inplace transform does not run when destination does not match num features", func(t *testing.T) { 103 | var s Employee 104 | fuzz.New().Fuzz(&s) 105 | 106 | tr := EmployeeFeatureTransformer{} 107 | 108 | features := make([]float64, 1000) 109 | features[0] = 123456789.0 110 | tr.TransformInplace(features, &s) 111 | 112 | assert.Equal(t, 123456789.0, features[0]) 113 | }) 114 | } 115 | 116 | func 
TestEmployeeFeatureTransformerTransformAll(t *testing.T) { 117 | t.Run("when transformer is nil", func(t *testing.T) { 118 | s := make([]Employee, 100) 119 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 120 | 121 | dst := make([]float64, 100*100) 122 | 123 | var tr *EmployeeFeatureTransformer 124 | assert.Nil(t, tr.TransformAll(s)) 125 | assert.Nil(t, tr.TransformAllParallel(s, 4)) 126 | 127 | // does not panic 128 | tr.TransformAllInplace(dst, s) 129 | tr.TransformAllInplaceParallel(dst, s, 4) 130 | }) 131 | 132 | t.Run("inplace with wrong output dimensions, output is smaller", func(t *testing.T) { 133 | s := make([]Employee, 100) 134 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 135 | 136 | dst := make([]float64, 100) 137 | 138 | tr := makeMockEmployeeFeatureTransformer() 139 | 140 | // does not panic 141 | tr.TransformAllInplace(dst, s) 142 | tr.TransformAllInplaceParallel(dst, s, 4) 143 | }) 144 | 145 | t.Run("inplace with wrong output dimensions, output is bigger", func(t *testing.T) { 146 | s := make([]Employee, 100) 147 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 148 | 149 | dst := make([]float64, 100*120) 150 | 151 | tr := makeMockEmployeeFeatureTransformer() 152 | 153 | // does not panic 154 | tr.TransformAllInplace(dst, s) 155 | tr.TransformAllInplaceParallel(dst, s, 4) 156 | }) 157 | 158 | t.Run("transform all", func(t *testing.T) { 159 | s := make([]Employee, 100) 160 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 161 | 162 | tr := makeMockEmployeeFeatureTransformer() 163 | 164 | features := tr.TransformAll(s) 165 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 166 | }) 167 | 168 | t.Run("transform all parallel 1 worker", func(t *testing.T) { 169 | s := make([]Employee, 100) 170 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 171 | 172 | tr := makeMockEmployeeFeatureTransformer() 173 | 174 | features := tr.TransformAllParallel(s, 1) 175 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 176 | }) 177 | 178 | t.Run("transform all parallel 4 workers", func(t *testing.T) { 179 | s := make([]Employee, 100) 180 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 181 | 182 | tr := makeMockEmployeeFeatureTransformer() 183 | 184 | features := tr.TransformAllParallel(s, 4) 185 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 186 | }) 187 | } 188 | 189 | func TestEmployeeFeatureTransformerFit(t *testing.T) { 190 | t.Run("fuzzy input", func(t *testing.T) { 191 | s := make([]Employee, 10) 192 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&s) 193 | 194 | trEmpty := EmployeeFeatureTransformer{} 195 | tr := EmployeeFeatureTransformer{} 196 | tr.Fit(s) 197 | 198 | assert.NotNil(t, tr) 199 | assert.NotEqual(t, tr, trEmpty) 200 | }) 201 | 202 | t.Run("not nil transformer nil input", func(t *testing.T) { 203 | trEmpty := EmployeeFeatureTransformer{} 204 | tr := EmployeeFeatureTransformer{} 205 | tr.Fit(nil) 206 | 207 | assert.Equal(t, trEmpty, tr) 208 | }) 209 | 210 | t.Run("nil transformer not nil input", func(t *testing.T) { 211 | s := make([]Employee, 10) 212 | 213 | var tr *EmployeeFeatureTransformer 214 | tr.Fit(s) 215 | 216 | assert.Nil(t, tr) 217 | }) 218 | } 219 | 220 | func fitTransformerEmployee(b *testing.B, numelem int) { 221 | s := make([]Employee, numelem) 222 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 223 | 224 | var tr EmployeeFeatureTransformer 225 | 226 | b.ResetTimer() 227 | for n := 0; n < b.N; n++ { 228 | tr.Fit(s) 229 | } 230 | } 231 | 232 | func 
BenchmarkEmployeeFeatureTransformer_Fit_100elements(b *testing.B) { 233 | fitTransformerEmployee(b, 100) 234 | } 235 | 236 | func BenchmarkEmployeeFeatureTransformer_Fit_1000elements(b *testing.B) { 237 | fitTransformerEmployee(b, 1000) 238 | } 239 | 240 | func BenchmarkEmployeeFeatureTransformer_Fit_10000elements(b *testing.B) { 241 | fitTransformerEmployee(b, 10000) 242 | } 243 | 244 | func BenchmarkEmployeeFeatureTransformer_Transform(b *testing.B) { 245 | var s Employee 246 | fuzz.New().Fuzz(&s) 247 | 248 | tr := makeMockEmployeeFeatureTransformer() 249 | 250 | b.ResetTimer() 251 | for n := 0; n < b.N; n++ { 252 | tr.Transform(&s) 253 | } 254 | } 255 | 256 | func BenchmarkEmployeeFeatureTransformer_Transform_Inplace(b *testing.B) { 257 | var s Employee 258 | fuzz.New().Fuzz(&s) 259 | 260 | tr := makeMockEmployeeFeatureTransformer() 261 | 262 | features := make([]float64, tr.NumFeatures()) 263 | 264 | b.ResetTimer() 265 | for n := 0; n < b.N; n++ { 266 | tr.TransformInplace(features, &s) 267 | } 268 | } 269 | 270 | func benchTransformAllEmployee(b *testing.B, numelem int) { 271 | s := make([]Employee, numelem) 272 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 273 | 274 | tr := makeMockEmployeeFeatureTransformer() 275 | 276 | b.ResetTimer() 277 | for n := 0; n < b.N; n++ { 278 | tr.TransformAll(s) 279 | } 280 | } 281 | 282 | func BenchmarkEmployeeFeatureTransformer_TransformAll_10elems(b *testing.B) { 283 | benchTransformAllEmployee(b, 10) 284 | } 285 | 286 | func BenchmarkEmployeeFeatureTransformer_TransformAll_100elems(b *testing.B) { 287 | benchTransformAllEmployee(b, 100) 288 | } 289 | 290 | func BenchmarkEmployeeFeatureTransformer_TransformAll_1000elems(b *testing.B) { 291 | benchTransformAllEmployee(b, 1000) 292 | } 293 | 294 | func BenchmarkEmployeeFeatureTransformer_TransformAll_10000elems(b *testing.B) { 295 | benchTransformAllEmployee(b, 10000) 296 | } 297 | 298 | func BenchmarkEmployeeFeatureTransformer_TransformAll_100000elems(b *testing.B) { 299 | benchTransformAllEmployee(b, 100000) 300 | } 301 | 302 | func BenchmarkEmployeeFeatureTransformer_TransformAll_1000000elems(b *testing.B) { 303 | benchTransformAllEmployee(b, 1000000) 304 | } 305 | 306 | func benchTransformAllParallelEmployee(b *testing.B, numelem int, nworkers uint) { 307 | s := make([]Employee, numelem) 308 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 309 | 310 | tr := makeMockEmployeeFeatureTransformer() 311 | 312 | b.ResetTimer() 313 | for n := 0; n < b.N; n++ { 314 | tr.TransformAllParallel(s, nworkers) 315 | } 316 | } 317 | 318 | func BenchmarkEmployeeFeatureTransformer_TransformAll_10elems_8workers(b *testing.B) { 319 | benchTransformAllParallelEmployee(b, 10, 8) 320 | } 321 | 322 | func BenchmarkEmployeeFeatureTransformer_TransformAll_100elems_8workers(b *testing.B) { 323 | benchTransformAllParallelEmployee(b, 100, 8) 324 | } 325 | 326 | func BenchmarkEmployeeFeatureTransformer_TransformAll_1000elems_8workers(b *testing.B) { 327 | benchTransformAllParallelEmployee(b, 1000, 8) 328 | } 329 | 330 | func BenchmarkEmployeeFeatureTransformer_TransformAll_10000elems_8workers(b *testing.B) { 331 | benchTransformAllParallelEmployee(b, 10000, 8) 332 | } 333 | 334 | func BenchmarkEmployeeFeatureTransformer_TransformAll_100000elems_8workers(b *testing.B) { 335 | benchTransformAllParallelEmployee(b, 100000, 8) 336 | } 337 | 338 | func BenchmarkEmployeeFeatureTransformer_TransformAll_1000000elems_8workers(b *testing.B) { 339 | benchTransformAllParallelEmployee(b, 1000000, 8) 340 | } 
341 | 342 | func BenchmarkEmployeeFeatureTransformer_TransformAll_5000000elems_8workers(b *testing.B) { 343 | benchTransformAllParallelEmployee(b, 5000000, 8) 344 | } 345 | 346 | func BenchmarkEmployeeFeatureTransformer_TransformAll_15000000elems_8workers(b *testing.B) { 347 | benchTransformAllParallelEmployee(b, 15000000, 8) 348 | } 349 | 350 | func benchLargeTransformerEmployee(b *testing.B, numelem int) { 351 | var s []Employee 352 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 353 | 354 | tr := EmployeeFeatureTransformer{} 355 | tr.Fit(s) 356 | 357 | b.ResetTimer() 358 | for n := 0; n < b.N; n++ { 359 | tr.Transform(&s[0]) 360 | } 361 | } 362 | 363 | func BenchmarkEmployeeFeatureTransformer_Transform_LargeComposites_100elements(b *testing.B) { 364 | benchLargeTransformerEmployee(b, 100) 365 | } 366 | 367 | func BenchmarkEmployeeFeatureTransformer_Transform_LargeComposites_1000elements(b *testing.B) { 368 | benchLargeTransformerEmployee(b, 1000) 369 | } 370 | 371 | func BenchmarkEmployeeFeatureTransformer_Transform_LargeComposites_10000elements(b *testing.B) { 372 | benchLargeTransformerEmployee(b, 10000) 373 | } 374 | 375 | func BenchmarkEmployeeFeatureTransformer_Transform_LargeComposites_100000elements(b *testing.B) { 376 | benchLargeTransformerEmployee(b, 100000) 377 | } 378 | -------------------------------------------------------------------------------- /cmd/generate/tests/weirdtagsfp_test.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/google/gofuzz" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | // makeMock creates some valid WeirdTagsFeatureTransformer by fitting on fuzzy data. 14 | // This function is handy for tests. 
15 | func makeMockWeirdTagsFeatureTransformer() *WeirdTagsFeatureTransformer { 16 | s := make([]WeirdTags, 10) 17 | fuzz.New().NilChance(0).NumElements(10, 10).Fuzz(&s) 18 | 19 | tr := WeirdTagsFeatureTransformer{} 20 | tr.Fit(s) 21 | return &tr 22 | } 23 | 24 | func TestWeirdTagsFeatureTransformerFeatureNames(t *testing.T) { 25 | tr := makeMockWeirdTagsFeatureTransformer() 26 | 27 | t.Run("feature names", func(t *testing.T) { 28 | names := tr.FeatureNames() 29 | assert.True(t, len(names) > 0) 30 | assert.Equal(t, len(names), tr.NumFeatures()) 31 | }) 32 | 33 | t.Run("feature name transformer is empty", func(t *testing.T) { 34 | tr := WeirdTagsFeatureTransformer{} 35 | names := tr.FeatureNames() 36 | assert.True(t, len(names) > 0) 37 | assert.Equal(t, len(names), tr.NumFeatures()) 38 | }) 39 | 40 | t.Run("feature name transformer is nil", func(t *testing.T) { 41 | var tr *WeirdTagsFeatureTransformer 42 | names := tr.FeatureNames() 43 | assert.Nil(t, names) 44 | }) 45 | } 46 | 47 | func TestWeirdTagsFeatureTransformerTransform(t *testing.T) { 48 | tr := makeMockWeirdTagsFeatureTransformer() 49 | 50 | t.Run("empty struct", func(t *testing.T) { 51 | s := WeirdTags{} 52 | features := tr.Transform(&s) 53 | 54 | assert.NotNil(t, features) 55 | assert.True(t, len(features) > 0) 56 | assert.Equal(t, tr.NumFeatures(), len(features)) 57 | }) 58 | 59 | t.Run("fuzzy struct", func(t *testing.T) { 60 | var s WeirdTags 61 | fuzz.New().Fuzz(&s) 62 | 63 | tr := WeirdTagsFeatureTransformer{} 64 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&tr) 65 | 66 | features := tr.Transform(&s) 67 | 68 | assert.NotNil(t, features) 69 | assert.True(t, len(features) > 0) 70 | assert.Equal(t, tr.NumFeatures(), len(features)) 71 | }) 72 | 73 | t.Run("struct is nil", func(t *testing.T) { 74 | var s *WeirdTags 75 | features := tr.Transform(s) 76 | assert.Nil(t, features) 77 | assert.True(t, tr.NumFeatures() > 0) 78 | }) 79 | 80 | t.Run("transformer is nil", func(t *testing.T) { 81 | var s WeirdTags 82 | fuzz.New().Fuzz(&s) 83 | 84 | var tr *WeirdTagsFeatureTransformer 85 | features := tr.Transform(&s) 86 | 87 | assert.Nil(t, features) 88 | assert.Equal(t, tr.NumFeatures(), 0) 89 | }) 90 | 91 | t.Run("serialize and deserialize transformer", func(t *testing.T) { 92 | output, err := json.Marshal(tr) 93 | assert.Nil(t, err) 94 | assert.NotEmpty(t, output) 95 | 96 | var tr2 WeirdTagsFeatureTransformer 97 | err = json.Unmarshal(output, &tr2) 98 | assert.Nil(t, err) 99 | assert.Equal(t, *tr, tr2) 100 | }) 101 | 102 | t.Run("inplace transform does not run when destination does not match num features", func(t *testing.T) { 103 | var s WeirdTags 104 | fuzz.New().Fuzz(&s) 105 | 106 | tr := WeirdTagsFeatureTransformer{} 107 | 108 | features := make([]float64, 1000) 109 | features[0] = 123456789.0 110 | tr.TransformInplace(features, &s) 111 | 112 | assert.Equal(t, 123456789.0, features[0]) 113 | }) 114 | } 115 | 116 | func TestWeirdTagsFeatureTransformerTransformAll(t *testing.T) { 117 | t.Run("when transformer is nil", func(t *testing.T) { 118 | s := make([]WeirdTags, 100) 119 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 120 | 121 | dst := make([]float64, 100*100) 122 | 123 | var tr *WeirdTagsFeatureTransformer 124 | assert.Nil(t, tr.TransformAll(s)) 125 | assert.Nil(t, tr.TransformAllParallel(s, 4)) 126 | 127 | // does not panic 128 | tr.TransformAllInplace(dst, s) 129 | tr.TransformAllInplaceParallel(dst, s, 4) 130 | }) 131 | 132 | t.Run("inplace with wrong output dimensions, output is smaller", func(t *testing.T) { 133 
| s := make([]WeirdTags, 100) 134 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 135 | 136 | dst := make([]float64, 100) 137 | 138 | tr := makeMockWeirdTagsFeatureTransformer() 139 | 140 | // does not panic 141 | tr.TransformAllInplace(dst, s) 142 | tr.TransformAllInplaceParallel(dst, s, 4) 143 | }) 144 | 145 | t.Run("inplace with wrong output dimensions, output is bigger", func(t *testing.T) { 146 | s := make([]WeirdTags, 100) 147 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 148 | 149 | dst := make([]float64, 100*120) 150 | 151 | tr := makeMockWeirdTagsFeatureTransformer() 152 | 153 | // does not panic 154 | tr.TransformAllInplace(dst, s) 155 | tr.TransformAllInplaceParallel(dst, s, 4) 156 | }) 157 | 158 | t.Run("transform all", func(t *testing.T) { 159 | s := make([]WeirdTags, 100) 160 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 161 | 162 | tr := makeMockWeirdTagsFeatureTransformer() 163 | 164 | features := tr.TransformAll(s) 165 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 166 | }) 167 | 168 | t.Run("transform all parallel 1 worker", func(t *testing.T) { 169 | s := make([]WeirdTags, 100) 170 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 171 | 172 | tr := makeMockWeirdTagsFeatureTransformer() 173 | 174 | features := tr.TransformAllParallel(s, 1) 175 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 176 | }) 177 | 178 | t.Run("transform all parallel 4 workers", func(t *testing.T) { 179 | s := make([]WeirdTags, 100) 180 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 181 | 182 | tr := makeMockWeirdTagsFeatureTransformer() 183 | 184 | features := tr.TransformAllParallel(s, 4) 185 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 186 | }) 187 | } 188 | 189 | func TestWeirdTagsFeatureTransformerFit(t *testing.T) { 190 | t.Run("fuzzy input", func(t *testing.T) { 191 | s := make([]WeirdTags, 10) 192 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&s) 193 | 194 | trEmpty := WeirdTagsFeatureTransformer{} 195 | tr := WeirdTagsFeatureTransformer{} 196 | tr.Fit(s) 197 | 198 | assert.NotNil(t, tr) 199 | assert.NotEqual(t, tr, trEmpty) 200 | }) 201 | 202 | t.Run("not nil transformer nil input", func(t *testing.T) { 203 | trEmpty := WeirdTagsFeatureTransformer{} 204 | tr := WeirdTagsFeatureTransformer{} 205 | tr.Fit(nil) 206 | 207 | assert.Equal(t, trEmpty, tr) 208 | }) 209 | 210 | t.Run("nil transformer not nil input", func(t *testing.T) { 211 | s := make([]WeirdTags, 10) 212 | 213 | var tr *WeirdTagsFeatureTransformer 214 | tr.Fit(s) 215 | 216 | assert.Nil(t, tr) 217 | }) 218 | } 219 | 220 | func fitTransformerWeirdTags(b *testing.B, numelem int) { 221 | s := make([]WeirdTags, numelem) 222 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 223 | 224 | var tr WeirdTagsFeatureTransformer 225 | 226 | b.ResetTimer() 227 | for n := 0; n < b.N; n++ { 228 | tr.Fit(s) 229 | } 230 | } 231 | 232 | func BenchmarkWeirdTagsFeatureTransformer_Fit_100elements(b *testing.B) { 233 | fitTransformerWeirdTags(b, 100) 234 | } 235 | 236 | func BenchmarkWeirdTagsFeatureTransformer_Fit_1000elements(b *testing.B) { 237 | fitTransformerWeirdTags(b, 1000) 238 | } 239 | 240 | func BenchmarkWeirdTagsFeatureTransformer_Fit_10000elements(b *testing.B) { 241 | fitTransformerWeirdTags(b, 10000) 242 | } 243 | 244 | func BenchmarkWeirdTagsFeatureTransformer_Transform(b *testing.B) { 245 | var s WeirdTags 246 | fuzz.New().Fuzz(&s) 247 | 248 | tr := makeMockWeirdTagsFeatureTransformer() 249 | 250 | b.ResetTimer() 251 | for n := 0; n < b.N; 
n++ { 252 | tr.Transform(&s) 253 | } 254 | } 255 | 256 | func BenchmarkWeirdTagsFeatureTransformer_Transform_Inplace(b *testing.B) { 257 | var s WeirdTags 258 | fuzz.New().Fuzz(&s) 259 | 260 | tr := makeMockWeirdTagsFeatureTransformer() 261 | 262 | features := make([]float64, tr.NumFeatures()) 263 | 264 | b.ResetTimer() 265 | for n := 0; n < b.N; n++ { 266 | tr.TransformInplace(features, &s) 267 | } 268 | } 269 | 270 | func benchTransformAllWeirdTags(b *testing.B, numelem int) { 271 | s := make([]WeirdTags, numelem) 272 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 273 | 274 | tr := makeMockWeirdTagsFeatureTransformer() 275 | 276 | b.ResetTimer() 277 | for n := 0; n < b.N; n++ { 278 | tr.TransformAll(s) 279 | } 280 | } 281 | 282 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_10elems(b *testing.B) { 283 | benchTransformAllWeirdTags(b, 10) 284 | } 285 | 286 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_100elems(b *testing.B) { 287 | benchTransformAllWeirdTags(b, 100) 288 | } 289 | 290 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_1000elems(b *testing.B) { 291 | benchTransformAllWeirdTags(b, 1000) 292 | } 293 | 294 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_10000elems(b *testing.B) { 295 | benchTransformAllWeirdTags(b, 10000) 296 | } 297 | 298 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_100000elems(b *testing.B) { 299 | benchTransformAllWeirdTags(b, 100000) 300 | } 301 | 302 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_1000000elems(b *testing.B) { 303 | benchTransformAllWeirdTags(b, 1000000) 304 | } 305 | 306 | func benchTransformAllParallelWeirdTags(b *testing.B, numelem int, nworkers uint) { 307 | s := make([]WeirdTags, numelem) 308 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 309 | 310 | tr := makeMockWeirdTagsFeatureTransformer() 311 | 312 | b.ResetTimer() 313 | for n := 0; n < b.N; n++ { 314 | tr.TransformAllParallel(s, nworkers) 315 | } 316 | } 317 | 318 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_10elems_8workers(b *testing.B) { 319 | benchTransformAllParallelWeirdTags(b, 10, 8) 320 | } 321 | 322 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_100elems_8workers(b *testing.B) { 323 | benchTransformAllParallelWeirdTags(b, 100, 8) 324 | } 325 | 326 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_1000elems_8workers(b *testing.B) { 327 | benchTransformAllParallelWeirdTags(b, 1000, 8) 328 | } 329 | 330 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_10000elems_8workers(b *testing.B) { 331 | benchTransformAllParallelWeirdTags(b, 10000, 8) 332 | } 333 | 334 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_100000elems_8workers(b *testing.B) { 335 | benchTransformAllParallelWeirdTags(b, 100000, 8) 336 | } 337 | 338 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_1000000elems_8workers(b *testing.B) { 339 | benchTransformAllParallelWeirdTags(b, 1000000, 8) 340 | } 341 | 342 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_5000000elems_8workers(b *testing.B) { 343 | benchTransformAllParallelWeirdTags(b, 5000000, 8) 344 | } 345 | 346 | func BenchmarkWeirdTagsFeatureTransformer_TransformAll_15000000elems_8workers(b *testing.B) { 347 | benchTransformAllParallelWeirdTags(b, 15000000, 8) 348 | } 349 | 350 | func benchLargeTransformerWeirdTags(b *testing.B, numelem int) { 351 | var s []WeirdTags 352 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 353 | 354 | tr := WeirdTagsFeatureTransformer{} 355 | tr.Fit(s) 356 | 
357 | b.ResetTimer() 358 | for n := 0; n < b.N; n++ { 359 | tr.Transform(&s[0]) 360 | } 361 | } 362 | 363 | func BenchmarkWeirdTagsFeatureTransformer_Transform_LargeComposites_100elements(b *testing.B) { 364 | benchLargeTransformerWeirdTags(b, 100) 365 | } 366 | 367 | func BenchmarkWeirdTagsFeatureTransformer_Transform_LargeComposites_1000elements(b *testing.B) { 368 | benchLargeTransformerWeirdTags(b, 1000) 369 | } 370 | 371 | func BenchmarkWeirdTagsFeatureTransformer_Transform_LargeComposites_10000elements(b *testing.B) { 372 | benchLargeTransformerWeirdTags(b, 10000) 373 | } 374 | 375 | func BenchmarkWeirdTagsFeatureTransformer_Transform_LargeComposites_100000elements(b *testing.B) { 376 | benchLargeTransformerWeirdTags(b, 100000) 377 | } 378 | -------------------------------------------------------------------------------- /cmd/generate/tests/alltransformersfp_test.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/google/gofuzz" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | // makeMock creates some valid AllTransformersFeatureTransformer by fitting on fuzzy data. 14 | // This function is handy for tests. 15 | func makeMockAllTransformersFeatureTransformer() *AllTransformersFeatureTransformer { 16 | s := make([]AllTransformers, 10) 17 | fuzz.New().NilChance(0).NumElements(10, 10).Fuzz(&s) 18 | 19 | tr := AllTransformersFeatureTransformer{} 20 | tr.Fit(s) 21 | return &tr 22 | } 23 | 24 | func TestAllTransformersFeatureTransformerFeatureNames(t *testing.T) { 25 | tr := makeMockAllTransformersFeatureTransformer() 26 | 27 | t.Run("feature names", func(t *testing.T) { 28 | names := tr.FeatureNames() 29 | assert.True(t, len(names) > 0) 30 | assert.Equal(t, len(names), tr.NumFeatures()) 31 | }) 32 | 33 | t.Run("feature name transformer is empty", func(t *testing.T) { 34 | tr := AllTransformersFeatureTransformer{} 35 | names := tr.FeatureNames() 36 | assert.True(t, len(names) > 0) 37 | assert.Equal(t, len(names), tr.NumFeatures()) 38 | }) 39 | 40 | t.Run("feature name transformer is nil", func(t *testing.T) { 41 | var tr *AllTransformersFeatureTransformer 42 | names := tr.FeatureNames() 43 | assert.Nil(t, names) 44 | }) 45 | } 46 | 47 | func TestAllTransformersFeatureTransformerTransform(t *testing.T) { 48 | tr := makeMockAllTransformersFeatureTransformer() 49 | 50 | t.Run("empty struct", func(t *testing.T) { 51 | s := AllTransformers{} 52 | features := tr.Transform(&s) 53 | 54 | assert.NotNil(t, features) 55 | assert.True(t, len(features) > 0) 56 | assert.Equal(t, tr.NumFeatures(), len(features)) 57 | }) 58 | 59 | t.Run("fuzzy struct", func(t *testing.T) { 60 | var s AllTransformers 61 | fuzz.New().Fuzz(&s) 62 | 63 | tr := AllTransformersFeatureTransformer{} 64 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&tr) 65 | 66 | features := tr.Transform(&s) 67 | 68 | assert.NotNil(t, features) 69 | assert.True(t, len(features) > 0) 70 | assert.Equal(t, tr.NumFeatures(), len(features)) 71 | }) 72 | 73 | t.Run("struct is nil", func(t *testing.T) { 74 | var s *AllTransformers 75 | features := tr.Transform(s) 76 | assert.Nil(t, features) 77 | assert.True(t, tr.NumFeatures() > 0) 78 | }) 79 | 80 | t.Run("transformer is nil", func(t *testing.T) { 81 | var s AllTransformers 82 | fuzz.New().Fuzz(&s) 83 | 84 | var tr *AllTransformersFeatureTransformer 85 | features := tr.Transform(&s) 86 | 87 | 
assert.Nil(t, features) 88 | assert.Equal(t, tr.NumFeatures(), 0) 89 | }) 90 | 91 | t.Run("serialize and deserialize transformer", func(t *testing.T) { 92 | output, err := json.Marshal(tr) 93 | assert.Nil(t, err) 94 | assert.NotEmpty(t, output) 95 | 96 | var tr2 AllTransformersFeatureTransformer 97 | err = json.Unmarshal(output, &tr2) 98 | assert.Nil(t, err) 99 | assert.Equal(t, *tr, tr2) 100 | }) 101 | 102 | t.Run("inplace transform does not run when destination does not match num features", func(t *testing.T) { 103 | var s AllTransformers 104 | fuzz.New().Fuzz(&s) 105 | 106 | tr := AllTransformersFeatureTransformer{} 107 | 108 | features := make([]float64, 1000) 109 | features[0] = 123456789.0 110 | tr.TransformInplace(features, &s) 111 | 112 | assert.Equal(t, 123456789.0, features[0]) 113 | }) 114 | } 115 | 116 | func TestAllTransformersFeatureTransformerTransformAll(t *testing.T) { 117 | t.Run("when transformer is nil", func(t *testing.T) { 118 | s := make([]AllTransformers, 100) 119 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 120 | 121 | dst := make([]float64, 100*100) 122 | 123 | var tr *AllTransformersFeatureTransformer 124 | assert.Nil(t, tr.TransformAll(s)) 125 | assert.Nil(t, tr.TransformAllParallel(s, 4)) 126 | 127 | // does not panic 128 | tr.TransformAllInplace(dst, s) 129 | tr.TransformAllInplaceParallel(dst, s, 4) 130 | }) 131 | 132 | t.Run("inplace with wrong output dimensions, output is smaller", func(t *testing.T) { 133 | s := make([]AllTransformers, 100) 134 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 135 | 136 | dst := make([]float64, 100) 137 | 138 | tr := makeMockAllTransformersFeatureTransformer() 139 | 140 | // does not panic 141 | tr.TransformAllInplace(dst, s) 142 | tr.TransformAllInplaceParallel(dst, s, 4) 143 | }) 144 | 145 | t.Run("inplace with wrong output dimensions, output is bigger", func(t *testing.T) { 146 | s := make([]AllTransformers, 100) 147 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 148 | 149 | dst := make([]float64, 100*120) 150 | 151 | tr := makeMockAllTransformersFeatureTransformer() 152 | 153 | // does not panic 154 | tr.TransformAllInplace(dst, s) 155 | tr.TransformAllInplaceParallel(dst, s, 4) 156 | }) 157 | 158 | t.Run("transform all", func(t *testing.T) { 159 | s := make([]AllTransformers, 100) 160 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 161 | 162 | tr := makeMockAllTransformersFeatureTransformer() 163 | 164 | features := tr.TransformAll(s) 165 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 166 | }) 167 | 168 | t.Run("transform all parallel 1 worker", func(t *testing.T) { 169 | s := make([]AllTransformers, 100) 170 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 171 | 172 | tr := makeMockAllTransformersFeatureTransformer() 173 | 174 | features := tr.TransformAllParallel(s, 1) 175 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 176 | }) 177 | 178 | t.Run("transform all parallel 4 workers", func(t *testing.T) { 179 | s := make([]AllTransformers, 100) 180 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 181 | 182 | tr := makeMockAllTransformersFeatureTransformer() 183 | 184 | features := tr.TransformAllParallel(s, 4) 185 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 186 | }) 187 | } 188 | 189 | func TestAllTransformersFeatureTransformerFit(t *testing.T) { 190 | t.Run("fuzzy input", func(t *testing.T) { 191 | s := make([]AllTransformers, 10) 192 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&s) 193 | 194 | trEmpty := 
AllTransformersFeatureTransformer{} 195 | tr := AllTransformersFeatureTransformer{} 196 | tr.Fit(s) 197 | 198 | assert.NotNil(t, tr) 199 | assert.NotEqual(t, tr, trEmpty) 200 | }) 201 | 202 | t.Run("not nil transformer nil input", func(t *testing.T) { 203 | trEmpty := AllTransformersFeatureTransformer{} 204 | tr := AllTransformersFeatureTransformer{} 205 | tr.Fit(nil) 206 | 207 | assert.Equal(t, trEmpty, tr) 208 | }) 209 | 210 | t.Run("nil transformer not nil input", func(t *testing.T) { 211 | s := make([]AllTransformers, 10) 212 | 213 | var tr *AllTransformersFeatureTransformer 214 | tr.Fit(s) 215 | 216 | assert.Nil(t, tr) 217 | }) 218 | } 219 | 220 | func fitTransformerAllTransformers(b *testing.B, numelem int) { 221 | s := make([]AllTransformers, numelem) 222 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 223 | 224 | var tr AllTransformersFeatureTransformer 225 | 226 | b.ResetTimer() 227 | for n := 0; n < b.N; n++ { 228 | tr.Fit(s) 229 | } 230 | } 231 | 232 | func BenchmarkAllTransformersFeatureTransformer_Fit_100elements(b *testing.B) { 233 | fitTransformerAllTransformers(b, 100) 234 | } 235 | 236 | func BenchmarkAllTransformersFeatureTransformer_Fit_1000elements(b *testing.B) { 237 | fitTransformerAllTransformers(b, 1000) 238 | } 239 | 240 | func BenchmarkAllTransformersFeatureTransformer_Fit_10000elements(b *testing.B) { 241 | fitTransformerAllTransformers(b, 10000) 242 | } 243 | 244 | func BenchmarkAllTransformersFeatureTransformer_Transform(b *testing.B) { 245 | var s AllTransformers 246 | fuzz.New().Fuzz(&s) 247 | 248 | tr := makeMockAllTransformersFeatureTransformer() 249 | 250 | b.ResetTimer() 251 | for n := 0; n < b.N; n++ { 252 | tr.Transform(&s) 253 | } 254 | } 255 | 256 | func BenchmarkAllTransformersFeatureTransformer_Transform_Inplace(b *testing.B) { 257 | var s AllTransformers 258 | fuzz.New().Fuzz(&s) 259 | 260 | tr := makeMockAllTransformersFeatureTransformer() 261 | 262 | features := make([]float64, tr.NumFeatures()) 263 | 264 | b.ResetTimer() 265 | for n := 0; n < b.N; n++ { 266 | tr.TransformInplace(features, &s) 267 | } 268 | } 269 | 270 | func benchTransformAllAllTransformers(b *testing.B, numelem int) { 271 | s := make([]AllTransformers, numelem) 272 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 273 | 274 | tr := makeMockAllTransformersFeatureTransformer() 275 | 276 | b.ResetTimer() 277 | for n := 0; n < b.N; n++ { 278 | tr.TransformAll(s) 279 | } 280 | } 281 | 282 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_10elems(b *testing.B) { 283 | benchTransformAllAllTransformers(b, 10) 284 | } 285 | 286 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_100elems(b *testing.B) { 287 | benchTransformAllAllTransformers(b, 100) 288 | } 289 | 290 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_1000elems(b *testing.B) { 291 | benchTransformAllAllTransformers(b, 1000) 292 | } 293 | 294 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_10000elems(b *testing.B) { 295 | benchTransformAllAllTransformers(b, 10000) 296 | } 297 | 298 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_100000elems(b *testing.B) { 299 | benchTransformAllAllTransformers(b, 100000) 300 | } 301 | 302 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_1000000elems(b *testing.B) { 303 | benchTransformAllAllTransformers(b, 1000000) 304 | } 305 | 306 | func benchTransformAllParallelAllTransformers(b *testing.B, numelem int, nworkers uint) { 307 | s := make([]AllTransformers, numelem) 308 | 
fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 309 | 310 | tr := makeMockAllTransformersFeatureTransformer() 311 | 312 | b.ResetTimer() 313 | for n := 0; n < b.N; n++ { 314 | tr.TransformAllParallel(s, nworkers) 315 | } 316 | } 317 | 318 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_10elems_8workers(b *testing.B) { 319 | benchTransformAllParallelAllTransformers(b, 10, 8) 320 | } 321 | 322 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_100elems_8workers(b *testing.B) { 323 | benchTransformAllParallelAllTransformers(b, 100, 8) 324 | } 325 | 326 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_1000elems_8workers(b *testing.B) { 327 | benchTransformAllParallelAllTransformers(b, 1000, 8) 328 | } 329 | 330 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_10000elems_8workers(b *testing.B) { 331 | benchTransformAllParallelAllTransformers(b, 10000, 8) 332 | } 333 | 334 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_100000elems_8workers(b *testing.B) { 335 | benchTransformAllParallelAllTransformers(b, 100000, 8) 336 | } 337 | 338 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_1000000elems_8workers(b *testing.B) { 339 | benchTransformAllParallelAllTransformers(b, 1000000, 8) 340 | } 341 | 342 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_5000000elems_8workers(b *testing.B) { 343 | benchTransformAllParallelAllTransformers(b, 5000000, 8) 344 | } 345 | 346 | func BenchmarkAllTransformersFeatureTransformer_TransformAll_15000000elems_8workers(b *testing.B) { 347 | benchTransformAllParallelAllTransformers(b, 15000000, 8) 348 | } 349 | 350 | func benchLargeTransformerAllTransformers(b *testing.B, numelem int) { 351 | var s []AllTransformers 352 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 353 | 354 | tr := AllTransformersFeatureTransformer{} 355 | tr.Fit(s) 356 | 357 | b.ResetTimer() 358 | for n := 0; n < b.N; n++ { 359 | tr.Transform(&s[0]) 360 | } 361 | } 362 | 363 | func BenchmarkAllTransformersFeatureTransformer_Transform_LargeComposites_100elements(b *testing.B) { 364 | benchLargeTransformerAllTransformers(b, 100) 365 | } 366 | 367 | func BenchmarkAllTransformersFeatureTransformer_Transform_LargeComposites_1000elements(b *testing.B) { 368 | benchLargeTransformerAllTransformers(b, 1000) 369 | } 370 | 371 | func BenchmarkAllTransformersFeatureTransformer_Transform_LargeComposites_10000elements(b *testing.B) { 372 | benchLargeTransformerAllTransformers(b, 10000) 373 | } 374 | 375 | func BenchmarkAllTransformersFeatureTransformer_Transform_LargeComposites_100000elements(b *testing.B) { 376 | benchLargeTransformerAllTransformers(b, 100000) 377 | } 378 | -------------------------------------------------------------------------------- /cmd/generate/tests/with32fieldsfp.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "sync" 7 | 8 | fp "github.com/nikolaydubina/go-featureprocessing/transformers" 9 | ) 10 | 11 | // With32FieldsFeatureTransformer is a feature processor for With32Fields. 12 | // It was automatically generated by go-featureprocessing tool. 
13 | type With32FieldsFeatureTransformer struct { 14 | Name1 fp.MinMaxScaler `json:"Name1_minmax"` 15 | Name2 fp.MinMaxScaler `json:"Name2_minmax"` 16 | Name3 fp.MinMaxScaler `json:"Name3_minmax"` 17 | Name4 fp.MinMaxScaler `json:"Name4_minmax"` 18 | Name5 fp.MinMaxScaler `json:"Name5_minmax"` 19 | Name6 fp.MinMaxScaler `json:"Name6_minmax"` 20 | Name7 fp.MinMaxScaler `json:"Name7_minmax"` 21 | Name8 fp.MinMaxScaler `json:"Name8_minmax"` 22 | Name9 fp.MinMaxScaler `json:"Name9_minmax"` 23 | Name10 fp.MinMaxScaler `json:"Name10_minmax"` 24 | Name11 fp.MinMaxScaler `json:"Name11_minmax"` 25 | Name12 fp.MinMaxScaler `json:"Name12_minmax"` 26 | Name13 fp.MinMaxScaler `json:"Name13_minmax"` 27 | Name14 fp.MinMaxScaler `json:"Name14_minmax"` 28 | Name15 fp.MinMaxScaler `json:"Name15_minmax"` 29 | Name16 fp.MinMaxScaler `json:"Name16_minmax"` 30 | Name17 fp.MinMaxScaler `json:"Name17_minmax"` 31 | Name18 fp.MinMaxScaler `json:"Name18_minmax"` 32 | Name19 fp.MinMaxScaler `json:"Name19_minmax"` 33 | Name21 fp.MinMaxScaler `json:"Name21_minmax"` 34 | Name22 fp.MinMaxScaler `json:"Name22_minmax"` 35 | Name23 fp.MinMaxScaler `json:"Name23_minmax"` 36 | Name24 fp.MinMaxScaler `json:"Name24_minmax"` 37 | Name25 fp.MinMaxScaler `json:"Name25_minmax"` 38 | Name26 fp.MinMaxScaler `json:"Name26_minmax"` 39 | Name27 fp.MinMaxScaler `json:"Name27_minmax"` 40 | Name28 fp.MinMaxScaler `json:"Name28_minmax"` 41 | Name29 fp.MinMaxScaler `json:"Name29_minmax"` 42 | Name30 fp.MinMaxScaler `json:"Name30_minmax"` 43 | Name31 fp.MinMaxScaler `json:"Name31_minmax"` 44 | Name32 fp.MinMaxScaler `json:"Name32_minmax"` 45 | } 46 | 47 | // Fit fits transformer for each field 48 | func (e *With32FieldsFeatureTransformer) Fit(s []With32Fields) { 49 | if e == nil || len(s) == 0 { 50 | return 51 | } 52 | 53 | dataNum := make([]float64, len(s)) 54 | 55 | for i, v := range s { 56 | dataNum[i] = float64(v.Name1) 57 | } 58 | 59 | e.Name1.Fit(dataNum) 60 | 61 | for i, v := range s { 62 | dataNum[i] = float64(v.Name2) 63 | } 64 | 65 | e.Name2.Fit(dataNum) 66 | 67 | for i, v := range s { 68 | dataNum[i] = float64(v.Name3) 69 | } 70 | 71 | e.Name3.Fit(dataNum) 72 | 73 | for i, v := range s { 74 | dataNum[i] = float64(v.Name4) 75 | } 76 | 77 | e.Name4.Fit(dataNum) 78 | 79 | for i, v := range s { 80 | dataNum[i] = float64(v.Name5) 81 | } 82 | 83 | e.Name5.Fit(dataNum) 84 | 85 | for i, v := range s { 86 | dataNum[i] = float64(v.Name6) 87 | } 88 | 89 | e.Name6.Fit(dataNum) 90 | 91 | for i, v := range s { 92 | dataNum[i] = float64(v.Name7) 93 | } 94 | 95 | e.Name7.Fit(dataNum) 96 | 97 | for i, v := range s { 98 | dataNum[i] = float64(v.Name8) 99 | } 100 | 101 | e.Name8.Fit(dataNum) 102 | 103 | for i, v := range s { 104 | dataNum[i] = float64(v.Name9) 105 | } 106 | 107 | e.Name9.Fit(dataNum) 108 | 109 | for i, v := range s { 110 | dataNum[i] = float64(v.Name10) 111 | } 112 | 113 | e.Name10.Fit(dataNum) 114 | 115 | for i, v := range s { 116 | dataNum[i] = float64(v.Name11) 117 | } 118 | 119 | e.Name11.Fit(dataNum) 120 | 121 | for i, v := range s { 122 | dataNum[i] = float64(v.Name12) 123 | } 124 | 125 | e.Name12.Fit(dataNum) 126 | 127 | for i, v := range s { 128 | dataNum[i] = float64(v.Name13) 129 | } 130 | 131 | e.Name13.Fit(dataNum) 132 | 133 | for i, v := range s { 134 | dataNum[i] = float64(v.Name14) 135 | } 136 | 137 | e.Name14.Fit(dataNum) 138 | 139 | for i, v := range s { 140 | dataNum[i] = float64(v.Name15) 141 | } 142 | 143 | e.Name15.Fit(dataNum) 144 | 145 | for i, v := range s { 146 | dataNum[i] = float64(v.Name16) 147 | } 148 | 
149 | e.Name16.Fit(dataNum) 150 | 151 | for i, v := range s { 152 | dataNum[i] = float64(v.Name17) 153 | } 154 | 155 | e.Name17.Fit(dataNum) 156 | 157 | for i, v := range s { 158 | dataNum[i] = float64(v.Name18) 159 | } 160 | 161 | e.Name18.Fit(dataNum) 162 | 163 | for i, v := range s { 164 | dataNum[i] = float64(v.Name19) 165 | } 166 | 167 | e.Name19.Fit(dataNum) 168 | 169 | for i, v := range s { 170 | dataNum[i] = float64(v.Name21) 171 | } 172 | 173 | e.Name21.Fit(dataNum) 174 | 175 | for i, v := range s { 176 | dataNum[i] = float64(v.Name22) 177 | } 178 | 179 | e.Name22.Fit(dataNum) 180 | 181 | for i, v := range s { 182 | dataNum[i] = float64(v.Name23) 183 | } 184 | 185 | e.Name23.Fit(dataNum) 186 | 187 | for i, v := range s { 188 | dataNum[i] = float64(v.Name24) 189 | } 190 | 191 | e.Name24.Fit(dataNum) 192 | 193 | for i, v := range s { 194 | dataNum[i] = float64(v.Name25) 195 | } 196 | 197 | e.Name25.Fit(dataNum) 198 | 199 | for i, v := range s { 200 | dataNum[i] = float64(v.Name26) 201 | } 202 | 203 | e.Name26.Fit(dataNum) 204 | 205 | for i, v := range s { 206 | dataNum[i] = float64(v.Name27) 207 | } 208 | 209 | e.Name27.Fit(dataNum) 210 | 211 | for i, v := range s { 212 | dataNum[i] = float64(v.Name28) 213 | } 214 | 215 | e.Name28.Fit(dataNum) 216 | 217 | for i, v := range s { 218 | dataNum[i] = float64(v.Name29) 219 | } 220 | 221 | e.Name29.Fit(dataNum) 222 | 223 | for i, v := range s { 224 | dataNum[i] = float64(v.Name30) 225 | } 226 | 227 | e.Name30.Fit(dataNum) 228 | 229 | for i, v := range s { 230 | dataNum[i] = float64(v.Name31) 231 | } 232 | 233 | e.Name31.Fit(dataNum) 234 | 235 | for i, v := range s { 236 | dataNum[i] = float64(v.Name32) 237 | } 238 | 239 | e.Name32.Fit(dataNum) 240 | 241 | } 242 | 243 | // Transform transforms struct into feature vector accordingly to transformers 244 | func (e *With32FieldsFeatureTransformer) Transform(s *With32Fields) []float64 { 245 | if s == nil || e == nil { 246 | return nil 247 | } 248 | features := make([]float64, e.NumFeatures()) 249 | e.TransformInplace(features, s) 250 | return features 251 | } 252 | 253 | // TransformInplace transforms struct into feature vector accordingly to transformers, and does so inplace 254 | func (e *With32FieldsFeatureTransformer) TransformInplace(dst []float64, s *With32Fields) { 255 | if s == nil || e == nil || len(dst) != e.NumFeatures() { 256 | return 257 | } 258 | idx := 0 259 | 260 | dst[idx] = e.Name1.Transform(float64(s.Name1)) 261 | idx++ 262 | 263 | dst[idx] = e.Name2.Transform(float64(s.Name2)) 264 | idx++ 265 | 266 | dst[idx] = e.Name3.Transform(float64(s.Name3)) 267 | idx++ 268 | 269 | dst[idx] = e.Name4.Transform(float64(s.Name4)) 270 | idx++ 271 | 272 | dst[idx] = e.Name5.Transform(float64(s.Name5)) 273 | idx++ 274 | 275 | dst[idx] = e.Name6.Transform(float64(s.Name6)) 276 | idx++ 277 | 278 | dst[idx] = e.Name7.Transform(float64(s.Name7)) 279 | idx++ 280 | 281 | dst[idx] = e.Name8.Transform(float64(s.Name8)) 282 | idx++ 283 | 284 | dst[idx] = e.Name9.Transform(float64(s.Name9)) 285 | idx++ 286 | 287 | dst[idx] = e.Name10.Transform(float64(s.Name10)) 288 | idx++ 289 | 290 | dst[idx] = e.Name11.Transform(float64(s.Name11)) 291 | idx++ 292 | 293 | dst[idx] = e.Name12.Transform(float64(s.Name12)) 294 | idx++ 295 | 296 | dst[idx] = e.Name13.Transform(float64(s.Name13)) 297 | idx++ 298 | 299 | dst[idx] = e.Name14.Transform(float64(s.Name14)) 300 | idx++ 301 | 302 | dst[idx] = e.Name15.Transform(float64(s.Name15)) 303 | idx++ 304 | 305 | dst[idx] = e.Name16.Transform(float64(s.Name16)) 306 | 
idx++ 307 | 308 | dst[idx] = e.Name17.Transform(float64(s.Name17)) 309 | idx++ 310 | 311 | dst[idx] = e.Name18.Transform(float64(s.Name18)) 312 | idx++ 313 | 314 | dst[idx] = e.Name19.Transform(float64(s.Name19)) 315 | idx++ 316 | 317 | dst[idx] = e.Name21.Transform(float64(s.Name21)) 318 | idx++ 319 | 320 | dst[idx] = e.Name22.Transform(float64(s.Name22)) 321 | idx++ 322 | 323 | dst[idx] = e.Name23.Transform(float64(s.Name23)) 324 | idx++ 325 | 326 | dst[idx] = e.Name24.Transform(float64(s.Name24)) 327 | idx++ 328 | 329 | dst[idx] = e.Name25.Transform(float64(s.Name25)) 330 | idx++ 331 | 332 | dst[idx] = e.Name26.Transform(float64(s.Name26)) 333 | idx++ 334 | 335 | dst[idx] = e.Name27.Transform(float64(s.Name27)) 336 | idx++ 337 | 338 | dst[idx] = e.Name28.Transform(float64(s.Name28)) 339 | idx++ 340 | 341 | dst[idx] = e.Name29.Transform(float64(s.Name29)) 342 | idx++ 343 | 344 | dst[idx] = e.Name30.Transform(float64(s.Name30)) 345 | idx++ 346 | 347 | dst[idx] = e.Name31.Transform(float64(s.Name31)) 348 | idx++ 349 | 350 | dst[idx] = e.Name32.Transform(float64(s.Name32)) 351 | idx++ 352 | 353 | } 354 | 355 | // TransformAll transforms a slice of With32Fields 356 | func (e *With32FieldsFeatureTransformer) TransformAll(s []With32Fields) []float64 { 357 | if e == nil { 358 | return nil 359 | } 360 | features := make([]float64, len(s)*e.NumFeatures()) 361 | e.TransformAllInplace(features, s) 362 | return features 363 | } 364 | 365 | // TransformAllInplace transforms a slice of With32Fields inplace 366 | func (e *With32FieldsFeatureTransformer) TransformAllInplace(dst []float64, s []With32Fields) { 367 | if e == nil { 368 | return 369 | } 370 | n := e.NumFeatures() 371 | if len(dst) != n*len(s) { 372 | return 373 | } 374 | for i := range s { 375 | e.TransformInplace(dst[i*n:(i+1)*n], &s[i]) 376 | } 377 | } 378 | 379 | // TransformAllParallel transforms a slice of With32Fields in parallel 380 | func (e *With32FieldsFeatureTransformer) TransformAllParallel(s []With32Fields, nworkers uint) []float64 { 381 | if e == nil { 382 | return nil 383 | } 384 | features := make([]float64, len(s)*e.NumFeatures()) 385 | e.TransformAllInplaceParallel(features, s, nworkers) 386 | return features 387 | } 388 | 389 | // TransformAllInplaceParallel transforms a slice of With32Fields inplace parallel 390 | // Useful for very large slices. 
391 | func (e *With32FieldsFeatureTransformer) TransformAllInplaceParallel(dst []float64, s []With32Fields, nworkers uint) { 392 | if e == nil || nworkers == 0 { 393 | return 394 | } 395 | ns := uint(len(s)) 396 | nf := uint(e.NumFeatures()) 397 | if uint(len(dst)) != nf*ns { 398 | return 399 | } 400 | 401 | nbatch := ns / nworkers 402 | var wg sync.WaitGroup 403 | 404 | for i := uint(0); i < nworkers; i++ { 405 | wg.Add(1) 406 | go func(i uint) { 407 | defer wg.Done() 408 | iStart := nbatch * i 409 | iEnd := nbatch * (i + 1) 410 | if i == (nworkers - 1) { 411 | iEnd = ns 412 | } 413 | e.TransformAllInplace(dst[iStart*nf:iEnd*nf], s[iStart:iEnd]) 414 | }(i) 415 | } 416 | 417 | wg.Wait() 418 | } 419 | 420 | // NumFeatures returns number of features in output feature vector 421 | func (e *With32FieldsFeatureTransformer) NumFeatures() int { 422 | if e == nil { 423 | return 0 424 | } 425 | 426 | count := 31 427 | 428 | return count 429 | } 430 | 431 | // FeatureNames provides names of features that match output of transform 432 | func (e *With32FieldsFeatureTransformer) FeatureNames() []string { 433 | if e == nil { 434 | return nil 435 | } 436 | 437 | idx := 0 438 | names := make([]string, e.NumFeatures()) 439 | 440 | names[idx] = "Name1" 441 | idx++ 442 | 443 | names[idx] = "Name2" 444 | idx++ 445 | 446 | names[idx] = "Name3" 447 | idx++ 448 | 449 | names[idx] = "Name4" 450 | idx++ 451 | 452 | names[idx] = "Name5" 453 | idx++ 454 | 455 | names[idx] = "Name6" 456 | idx++ 457 | 458 | names[idx] = "Name7" 459 | idx++ 460 | 461 | names[idx] = "Name8" 462 | idx++ 463 | 464 | names[idx] = "Name9" 465 | idx++ 466 | 467 | names[idx] = "Name10" 468 | idx++ 469 | 470 | names[idx] = "Name11" 471 | idx++ 472 | 473 | names[idx] = "Name12" 474 | idx++ 475 | 476 | names[idx] = "Name13" 477 | idx++ 478 | 479 | names[idx] = "Name14" 480 | idx++ 481 | 482 | names[idx] = "Name15" 483 | idx++ 484 | 485 | names[idx] = "Name16" 486 | idx++ 487 | 488 | names[idx] = "Name17" 489 | idx++ 490 | 491 | names[idx] = "Name18" 492 | idx++ 493 | 494 | names[idx] = "Name19" 495 | idx++ 496 | 497 | names[idx] = "Name21" 498 | idx++ 499 | 500 | names[idx] = "Name22" 501 | idx++ 502 | 503 | names[idx] = "Name23" 504 | idx++ 505 | 506 | names[idx] = "Name24" 507 | idx++ 508 | 509 | names[idx] = "Name25" 510 | idx++ 511 | 512 | names[idx] = "Name26" 513 | idx++ 514 | 515 | names[idx] = "Name27" 516 | idx++ 517 | 518 | names[idx] = "Name28" 519 | idx++ 520 | 521 | names[idx] = "Name29" 522 | idx++ 523 | 524 | names[idx] = "Name30" 525 | idx++ 526 | 527 | names[idx] = "Name31" 528 | idx++ 529 | 530 | names[idx] = "Name32" 531 | idx++ 532 | 533 | return names 534 | } 535 | -------------------------------------------------------------------------------- /cmd/generate/templatetests.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | const templateTests = ` 4 | // Code generated by go-featureprocessing DO NOT EDIT 5 | 6 | package {{$.PackageName}} 7 | 8 | import ( 9 | "encoding/json" 10 | "testing" 11 | 12 | "github.com/google/gofuzz" 13 | "github.com/stretchr/testify/assert" 14 | ) 15 | 16 | // makeMock creates some valid {{$.StructName}}FeatureTransformer by fitting on fuzzy data. 17 | // This function is handy for tests. 
18 | func makeMock{{$.StructName}}FeatureTransformer() *{{$.StructName}}FeatureTransformer { 19 | s := make([]{{$.StructName}}, 10) 20 | fuzz.New().NilChance(0).NumElements(10, 10).Fuzz(&s) 21 | 22 | tr := {{$.StructName}}FeatureTransformer{} 23 | tr.Fit(s) 24 | return &tr 25 | } 26 | 27 | func Test{{$.StructName}}FeatureTransformerFeatureNames(t *testing.T) { 28 | tr := makeMock{{$.StructName}}FeatureTransformer() 29 | 30 | t.Run("feature names", func(t *testing.T) { 31 | names := tr.FeatureNames() 32 | assert.True(t, len(names) > 0) 33 | assert.Equal(t, len(names), tr.NumFeatures()) 34 | }) 35 | 36 | t.Run("feature name transformer is empty", func(t *testing.T) { 37 | tr := {{$.StructName}}FeatureTransformer{} 38 | names := tr.FeatureNames() 39 | assert.True(t, len(names) > 0) 40 | assert.Equal(t, len(names), tr.NumFeatures()) 41 | }) 42 | 43 | t.Run("feature name transformer is nil", func(t *testing.T) { 44 | var tr *{{$.StructName}}FeatureTransformer 45 | names := tr.FeatureNames() 46 | assert.Nil(t, names) 47 | }) 48 | } 49 | 50 | func Test{{$.StructName}}FeatureTransformerTransform(t *testing.T) { 51 | tr := makeMock{{$.StructName}}FeatureTransformer() 52 | 53 | t.Run("empty struct", func(t *testing.T) { 54 | s := {{$.StructName}}{} 55 | features := tr.Transform(&s) 56 | 57 | assert.NotNil(t, features) 58 | assert.True(t, len(features) > 0) 59 | assert.Equal(t, tr.NumFeatures(), len(features)) 60 | }) 61 | 62 | t.Run("fuzzy struct", func(t *testing.T) { 63 | var s {{$.StructName}} 64 | fuzz.New().Fuzz(&s) 65 | 66 | tr := {{$.StructName}}FeatureTransformer{} 67 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&tr) 68 | 69 | features := tr.Transform(&s) 70 | 71 | assert.NotNil(t, features) 72 | assert.True(t, len(features) > 0) 73 | assert.Equal(t, tr.NumFeatures(), len(features)) 74 | }) 75 | 76 | t.Run("struct is nil", func(t *testing.T) { 77 | var s *{{$.StructName}} 78 | features := tr.Transform(s) 79 | assert.Nil(t, features) 80 | assert.True(t, tr.NumFeatures() > 0) 81 | }) 82 | 83 | t.Run("transformer is nil", func(t *testing.T) { 84 | var s {{$.StructName}} 85 | fuzz.New().Fuzz(&s) 86 | 87 | var tr *{{$.StructName}}FeatureTransformer 88 | features := tr.Transform(&s) 89 | 90 | assert.Nil(t, features) 91 | assert.Equal(t, tr.NumFeatures(), 0) 92 | }) 93 | 94 | t.Run("serialize and deserialize transformer", func(t *testing.T) { 95 | output, err := json.Marshal(tr) 96 | assert.Nil(t, err) 97 | assert.NotEmpty(t, output) 98 | 99 | var tr2 {{$.StructName}}FeatureTransformer 100 | err = json.Unmarshal(output, &tr2) 101 | assert.Nil(t, err) 102 | assert.Equal(t, *tr, tr2) 103 | }) 104 | 105 | t.Run("inplace transform does not run when destination does not match num features", func(t *testing.T) { 106 | var s {{$.StructName}} 107 | fuzz.New().Fuzz(&s) 108 | 109 | tr := {{$.StructName}}FeatureTransformer{} 110 | 111 | features := make([]float64, 1000) 112 | features[0] = 123456789.0 113 | tr.TransformInplace(features, &s) 114 | 115 | assert.Equal(t, 123456789.0, features[0]) 116 | }) 117 | } 118 | 119 | func Test{{$.StructName}}FeatureTransformerTransformAll(t *testing.T) { 120 | t.Run("when transformer is nil", func(t *testing.T) { 121 | s := make([]{{$.StructName}}, 100) 122 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 123 | 124 | dst := make([]float64, 100 * 100) 125 | 126 | var tr *{{$.StructName}}FeatureTransformer 127 | assert.Nil(t, tr.TransformAll(s)) 128 | assert.Nil(t, tr.TransformAllParallel(s, 4)) 129 | 130 | // does not panic 131 | tr.TransformAllInplace(dst, s) 
132 | tr.TransformAllInplaceParallel(dst, s, 4) 133 | }) 134 | 135 | t.Run("inplace with wrong output dimensions, output is smaller", func(t *testing.T) { 136 | s := make([]{{$.StructName}}, 100) 137 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 138 | 139 | dst := make([]float64, 100) 140 | 141 | tr := makeMock{{$.StructName}}FeatureTransformer() 142 | 143 | // does not panic 144 | tr.TransformAllInplace(dst, s) 145 | tr.TransformAllInplaceParallel(dst, s, 4) 146 | }) 147 | 148 | t.Run("inplace with wrong output dimensions, output is bigger", func(t *testing.T) { 149 | s := make([]{{$.StructName}}, 100) 150 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 151 | 152 | dst := make([]float64, 100 * 120) 153 | 154 | tr := makeMock{{$.StructName}}FeatureTransformer() 155 | 156 | // does not panic 157 | tr.TransformAllInplace(dst, s) 158 | tr.TransformAllInplaceParallel(dst, s, 4) 159 | }) 160 | 161 | t.Run("transform all", func(t *testing.T) { 162 | s := make([]{{$.StructName}}, 100) 163 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 164 | 165 | tr := makeMock{{$.StructName}}FeatureTransformer() 166 | 167 | features := tr.TransformAll(s) 168 | assert.Equal(t, len(s) * tr.NumFeatures(), len(features)) 169 | }) 170 | 171 | t.Run("transform all parallel 1 worker", func(t *testing.T) { 172 | s := make([]{{$.StructName}}, 100) 173 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 174 | 175 | tr := makeMock{{$.StructName}}FeatureTransformer() 176 | 177 | features := tr.TransformAllParallel(s, 1) 178 | assert.Equal(t, len(s) * tr.NumFeatures(), len(features)) 179 | }) 180 | 181 | t.Run("transform all parallel 4 workers", func(t *testing.T) { 182 | s := make([]{{$.StructName}}, 100) 183 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 184 | 185 | tr := makeMock{{$.StructName}}FeatureTransformer() 186 | 187 | features := tr.TransformAllParallel(s, 4) 188 | assert.Equal(t, len(s) * tr.NumFeatures(), len(features)) 189 | }) 190 | } 191 | 192 | func Test{{$.StructName}}FeatureTransformerFit(t *testing.T) { 193 | t.Run("fuzzy input", func(t *testing.T) { 194 | s := make([]{{$.StructName}}, 10) 195 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&s) 196 | 197 | trEmpty := {{$.StructName}}FeatureTransformer{} 198 | tr := {{$.StructName}}FeatureTransformer{} 199 | tr.Fit(s) 200 | 201 | assert.NotNil(t, tr) 202 | assert.NotEqual(t, tr, trEmpty) 203 | }) 204 | 205 | t.Run("not nil transformer nil input", func(t *testing.T) { 206 | trEmpty := {{$.StructName}}FeatureTransformer{} 207 | tr := {{$.StructName}}FeatureTransformer{} 208 | tr.Fit(nil) 209 | 210 | assert.Equal(t, trEmpty, tr) 211 | }) 212 | 213 | t.Run("nil transformer not nil input", func(t *testing.T) { 214 | s := make([]{{$.StructName}}, 10) 215 | 216 | var tr *{{$.StructName}}FeatureTransformer 217 | tr.Fit(s) 218 | 219 | assert.Nil(t, tr) 220 | }) 221 | } 222 | 223 | func fitTransformer{{$.StructName}}(b *testing.B, numelem int) { 224 | s := make([]{{$.StructName}}, numelem) 225 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 226 | 227 | var tr {{$.StructName}}FeatureTransformer 228 | 229 | b.ResetTimer() 230 | for n := 0; n < b.N; n++ { 231 | tr.Fit(s) 232 | } 233 | } 234 | 235 | func Benchmark{{$.StructName}}FeatureTransformer_Fit_100elements(b *testing.B) { 236 | fitTransformer{{$.StructName}}(b, 100) 237 | } 238 | 239 | func Benchmark{{$.StructName}}FeatureTransformer_Fit_1000elements(b *testing.B) { 240 | fitTransformer{{$.StructName}}(b, 1000) 241 | } 242 | 243 | func 
Benchmark{{$.StructName}}FeatureTransformer_Fit_10000elements(b *testing.B) { 244 | fitTransformer{{$.StructName}}(b, 10000) 245 | } 246 | 247 | func Benchmark{{$.StructName}}FeatureTransformer_Transform(b *testing.B) { 248 | var s {{$.StructName}} 249 | fuzz.New().Fuzz(&s) 250 | 251 | tr := makeMock{{$.StructName}}FeatureTransformer() 252 | 253 | b.ResetTimer() 254 | for n := 0; n < b.N; n++ { 255 | tr.Transform(&s) 256 | } 257 | } 258 | 259 | func Benchmark{{$.StructName}}FeatureTransformer_Transform_Inplace(b *testing.B) { 260 | var s {{$.StructName}} 261 | fuzz.New().Fuzz(&s) 262 | 263 | tr := makeMock{{$.StructName}}FeatureTransformer() 264 | 265 | features := make([]float64, tr.NumFeatures()) 266 | 267 | b.ResetTimer() 268 | for n := 0; n < b.N; n++ { 269 | tr.TransformInplace(features, &s) 270 | } 271 | } 272 | 273 | func benchTransformAll{{$.StructName}}(b *testing.B, numelem int) { 274 | s := make([]{{$.StructName}}, numelem) 275 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 276 | 277 | tr := makeMock{{$.StructName}}FeatureTransformer() 278 | 279 | b.ResetTimer() 280 | for n := 0; n < b.N; n++ { 281 | tr.TransformAll(s) 282 | } 283 | } 284 | 285 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_10elems(b *testing.B) { 286 | benchTransformAll{{$.StructName}}(b, 10) 287 | } 288 | 289 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_100elems(b *testing.B) { 290 | benchTransformAll{{$.StructName}}(b, 100) 291 | } 292 | 293 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_1000elems(b *testing.B) { 294 | benchTransformAll{{$.StructName}}(b, 1000) 295 | } 296 | 297 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_10000elems(b *testing.B) { 298 | benchTransformAll{{$.StructName}}(b, 10000) 299 | } 300 | 301 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_100000elems(b *testing.B) { 302 | benchTransformAll{{$.StructName}}(b, 100000) 303 | } 304 | 305 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_1000000elems(b *testing.B) { 306 | benchTransformAll{{$.StructName}}(b, 1000000) 307 | } 308 | 309 | func benchTransformAllParallel{{$.StructName}}(b *testing.B, numelem int, nworkers uint) { 310 | s := make([]{{$.StructName}}, numelem) 311 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 312 | 313 | tr := makeMock{{$.StructName}}FeatureTransformer() 314 | 315 | b.ResetTimer() 316 | for n := 0; n < b.N; n++ { 317 | tr.TransformAllParallel(s, nworkers) 318 | } 319 | } 320 | 321 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_10elems_8workers(b *testing.B) { 322 | benchTransformAllParallel{{$.StructName}}(b, 10, 8) 323 | } 324 | 325 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_100elems_8workers(b *testing.B) { 326 | benchTransformAllParallel{{$.StructName}}(b, 100, 8) 327 | } 328 | 329 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_1000elems_8workers(b *testing.B) { 330 | benchTransformAllParallel{{$.StructName}}(b, 1000, 8) 331 | } 332 | 333 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_10000elems_8workers(b *testing.B) { 334 | benchTransformAllParallel{{$.StructName}}(b, 10000, 8) 335 | } 336 | 337 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_100000elems_8workers(b *testing.B) { 338 | benchTransformAllParallel{{$.StructName}}(b, 100000, 8) 339 | } 340 | 341 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_1000000elems_8workers(b *testing.B) { 342 | 
benchTransformAllParallel{{$.StructName}}(b, 1000000, 8) 343 | } 344 | 345 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_5000000elems_8workers(b *testing.B) { 346 | benchTransformAllParallel{{$.StructName}}(b, 5000000, 8) 347 | } 348 | 349 | func Benchmark{{$.StructName}}FeatureTransformer_TransformAll_15000000elems_8workers(b *testing.B) { 350 | benchTransformAllParallel{{$.StructName}}(b, 15000000, 8) 351 | } 352 | 353 | {{if $.HasLargeTransformers}} 354 | 355 | func benchLargeTransformer{{$.StructName}}(b *testing.B, numelem int) { 356 | var s []{{$.StructName}} 357 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 358 | 359 | tr := {{$.StructName}}FeatureTransformer{} 360 | tr.Fit(s) 361 | 362 | b.ResetTimer() 363 | for n := 0; n < b.N; n++ { 364 | tr.Transform(&s[0]) 365 | } 366 | } 367 | 368 | func Benchmark{{$.StructName}}FeatureTransformer_Transform_LargeComposites_100elements(b *testing.B) { 369 | benchLargeTransformer{{$.StructName}}(b, 100) 370 | } 371 | 372 | func Benchmark{{$.StructName}}FeatureTransformer_Transform_LargeComposites_1000elements(b *testing.B) { 373 | benchLargeTransformer{{$.StructName}}(b, 1000) 374 | } 375 | 376 | func Benchmark{{$.StructName}}FeatureTransformer_Transform_LargeComposites_10000elements(b *testing.B) { 377 | benchLargeTransformer{{$.StructName}}(b, 10000) 378 | } 379 | 380 | func Benchmark{{$.StructName}}FeatureTransformer_Transform_LargeComposites_100000elements(b *testing.B) { 381 | benchLargeTransformer{{$.StructName}}(b, 100000) 382 | } 383 | 384 | {{end}} 385 | ` 386 | -------------------------------------------------------------------------------- /cmd/generate/tests/largememorytransformerfp_test.go: -------------------------------------------------------------------------------- 1 | // Code generated by go-featureprocessing DO NOT EDIT 2 | 3 | package examplemodule 4 | 5 | import ( 6 | "encoding/json" 7 | "testing" 8 | 9 | "github.com/google/gofuzz" 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | // makeMock creates some valid LargeMemoryTransformerFeatureTransformer by fitting on fuzzy data. 14 | // This function is handy for tests. 
15 | func makeMockLargeMemoryTransformerFeatureTransformer() *LargeMemoryTransformerFeatureTransformer { 16 | s := make([]LargeMemoryTransformer, 10) 17 | fuzz.New().NilChance(0).NumElements(10, 10).Fuzz(&s) 18 | 19 | tr := LargeMemoryTransformerFeatureTransformer{} 20 | tr.Fit(s) 21 | return &tr 22 | } 23 | 24 | func TestLargeMemoryTransformerFeatureTransformerFeatureNames(t *testing.T) { 25 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 26 | 27 | t.Run("feature names", func(t *testing.T) { 28 | names := tr.FeatureNames() 29 | assert.True(t, len(names) > 0) 30 | assert.Equal(t, len(names), tr.NumFeatures()) 31 | }) 32 | 33 | t.Run("feature name transformer is empty", func(t *testing.T) { 34 | tr := LargeMemoryTransformerFeatureTransformer{} 35 | names := tr.FeatureNames() 36 | assert.True(t, len(names) > 0) 37 | assert.Equal(t, len(names), tr.NumFeatures()) 38 | }) 39 | 40 | t.Run("feature name transformer is nil", func(t *testing.T) { 41 | var tr *LargeMemoryTransformerFeatureTransformer 42 | names := tr.FeatureNames() 43 | assert.Nil(t, names) 44 | }) 45 | } 46 | 47 | func TestLargeMemoryTransformerFeatureTransformerTransform(t *testing.T) { 48 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 49 | 50 | t.Run("empty struct", func(t *testing.T) { 51 | s := LargeMemoryTransformer{} 52 | features := tr.Transform(&s) 53 | 54 | assert.NotNil(t, features) 55 | assert.True(t, len(features) > 0) 56 | assert.Equal(t, tr.NumFeatures(), len(features)) 57 | }) 58 | 59 | t.Run("fuzzy struct", func(t *testing.T) { 60 | var s LargeMemoryTransformer 61 | fuzz.New().Fuzz(&s) 62 | 63 | tr := LargeMemoryTransformerFeatureTransformer{} 64 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&tr) 65 | 66 | features := tr.Transform(&s) 67 | 68 | assert.NotNil(t, features) 69 | assert.True(t, len(features) > 0) 70 | assert.Equal(t, tr.NumFeatures(), len(features)) 71 | }) 72 | 73 | t.Run("struct is nil", func(t *testing.T) { 74 | var s *LargeMemoryTransformer 75 | features := tr.Transform(s) 76 | assert.Nil(t, features) 77 | assert.True(t, tr.NumFeatures() > 0) 78 | }) 79 | 80 | t.Run("transformer is nil", func(t *testing.T) { 81 | var s LargeMemoryTransformer 82 | fuzz.New().Fuzz(&s) 83 | 84 | var tr *LargeMemoryTransformerFeatureTransformer 85 | features := tr.Transform(&s) 86 | 87 | assert.Nil(t, features) 88 | assert.Equal(t, tr.NumFeatures(), 0) 89 | }) 90 | 91 | t.Run("serialize and deserialize transformer", func(t *testing.T) { 92 | output, err := json.Marshal(tr) 93 | assert.Nil(t, err) 94 | assert.NotEmpty(t, output) 95 | 96 | var tr2 LargeMemoryTransformerFeatureTransformer 97 | err = json.Unmarshal(output, &tr2) 98 | assert.Nil(t, err) 99 | assert.Equal(t, *tr, tr2) 100 | }) 101 | 102 | t.Run("inplace transform does not run when destination does not match num features", func(t *testing.T) { 103 | var s LargeMemoryTransformer 104 | fuzz.New().Fuzz(&s) 105 | 106 | tr := LargeMemoryTransformerFeatureTransformer{} 107 | 108 | features := make([]float64, 1000) 109 | features[0] = 123456789.0 110 | tr.TransformInplace(features, &s) 111 | 112 | assert.Equal(t, 123456789.0, features[0]) 113 | }) 114 | } 115 | 116 | func TestLargeMemoryTransformerFeatureTransformerTransformAll(t *testing.T) { 117 | t.Run("when transformer is nil", func(t *testing.T) { 118 | s := make([]LargeMemoryTransformer, 100) 119 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 120 | 121 | dst := make([]float64, 100*100) 122 | 123 | var tr *LargeMemoryTransformerFeatureTransformer 124 | assert.Nil(t, 
tr.TransformAll(s)) 125 | assert.Nil(t, tr.TransformAllParallel(s, 4)) 126 | 127 | // does not panic 128 | tr.TransformAllInplace(dst, s) 129 | tr.TransformAllInplaceParallel(dst, s, 4) 130 | }) 131 | 132 | t.Run("inplace with wrong output dimensions, output is smaller", func(t *testing.T) { 133 | s := make([]LargeMemoryTransformer, 100) 134 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 135 | 136 | dst := make([]float64, 100) 137 | 138 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 139 | 140 | // does not panic 141 | tr.TransformAllInplace(dst, s) 142 | tr.TransformAllInplaceParallel(dst, s, 4) 143 | }) 144 | 145 | t.Run("inplace with wrong output dimensions, output is bigger", func(t *testing.T) { 146 | s := make([]LargeMemoryTransformer, 100) 147 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 148 | 149 | dst := make([]float64, 100*120) 150 | 151 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 152 | 153 | // does not panic 154 | tr.TransformAllInplace(dst, s) 155 | tr.TransformAllInplaceParallel(dst, s, 4) 156 | }) 157 | 158 | t.Run("transform all", func(t *testing.T) { 159 | s := make([]LargeMemoryTransformer, 100) 160 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 161 | 162 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 163 | 164 | features := tr.TransformAll(s) 165 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 166 | }) 167 | 168 | t.Run("transform all parallel 1 worker", func(t *testing.T) { 169 | s := make([]LargeMemoryTransformer, 100) 170 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 171 | 172 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 173 | 174 | features := tr.TransformAllParallel(s, 1) 175 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 176 | }) 177 | 178 | t.Run("transform all parallel 4 workers", func(t *testing.T) { 179 | s := make([]LargeMemoryTransformer, 100) 180 | fuzz.New().NilChance(0).NumElements(100, 100).Fuzz(&s) 181 | 182 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 183 | 184 | features := tr.TransformAllParallel(s, 4) 185 | assert.Equal(t, len(s)*tr.NumFeatures(), len(features)) 186 | }) 187 | } 188 | 189 | func TestLargeMemoryTransformerFeatureTransformerFit(t *testing.T) { 190 | t.Run("fuzzy input", func(t *testing.T) { 191 | s := make([]LargeMemoryTransformer, 10) 192 | fuzz.New().NilChance(0).NumElements(1, 1).Fuzz(&s) 193 | 194 | trEmpty := LargeMemoryTransformerFeatureTransformer{} 195 | tr := LargeMemoryTransformerFeatureTransformer{} 196 | tr.Fit(s) 197 | 198 | assert.NotNil(t, tr) 199 | assert.NotEqual(t, tr, trEmpty) 200 | }) 201 | 202 | t.Run("not nil transformer nil input", func(t *testing.T) { 203 | trEmpty := LargeMemoryTransformerFeatureTransformer{} 204 | tr := LargeMemoryTransformerFeatureTransformer{} 205 | tr.Fit(nil) 206 | 207 | assert.Equal(t, trEmpty, tr) 208 | }) 209 | 210 | t.Run("nil transformer not nil input", func(t *testing.T) { 211 | s := make([]LargeMemoryTransformer, 10) 212 | 213 | var tr *LargeMemoryTransformerFeatureTransformer 214 | tr.Fit(s) 215 | 216 | assert.Nil(t, tr) 217 | }) 218 | } 219 | 220 | func fitTransformerLargeMemoryTransformer(b *testing.B, numelem int) { 221 | s := make([]LargeMemoryTransformer, numelem) 222 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 223 | 224 | var tr LargeMemoryTransformerFeatureTransformer 225 | 226 | b.ResetTimer() 227 | for n := 0; n < b.N; n++ { 228 | tr.Fit(s) 229 | } 230 | } 231 | 232 | func 
BenchmarkLargeMemoryTransformerFeatureTransformer_Fit_100elements(b *testing.B) { 233 | fitTransformerLargeMemoryTransformer(b, 100) 234 | } 235 | 236 | func BenchmarkLargeMemoryTransformerFeatureTransformer_Fit_1000elements(b *testing.B) { 237 | fitTransformerLargeMemoryTransformer(b, 1000) 238 | } 239 | 240 | func BenchmarkLargeMemoryTransformerFeatureTransformer_Fit_10000elements(b *testing.B) { 241 | fitTransformerLargeMemoryTransformer(b, 10000) 242 | } 243 | 244 | func BenchmarkLargeMemoryTransformerFeatureTransformer_Transform(b *testing.B) { 245 | var s LargeMemoryTransformer 246 | fuzz.New().Fuzz(&s) 247 | 248 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 249 | 250 | b.ResetTimer() 251 | for n := 0; n < b.N; n++ { 252 | tr.Transform(&s) 253 | } 254 | } 255 | 256 | func BenchmarkLargeMemoryTransformerFeatureTransformer_Transform_Inplace(b *testing.B) { 257 | var s LargeMemoryTransformer 258 | fuzz.New().Fuzz(&s) 259 | 260 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 261 | 262 | features := make([]float64, tr.NumFeatures()) 263 | 264 | b.ResetTimer() 265 | for n := 0; n < b.N; n++ { 266 | tr.TransformInplace(features, &s) 267 | } 268 | } 269 | 270 | func benchTransformAllLargeMemoryTransformer(b *testing.B, numelem int) { 271 | s := make([]LargeMemoryTransformer, numelem) 272 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 273 | 274 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 275 | 276 | b.ResetTimer() 277 | for n := 0; n < b.N; n++ { 278 | tr.TransformAll(s) 279 | } 280 | } 281 | 282 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_10elems(b *testing.B) { 283 | benchTransformAllLargeMemoryTransformer(b, 10) 284 | } 285 | 286 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_100elems(b *testing.B) { 287 | benchTransformAllLargeMemoryTransformer(b, 100) 288 | } 289 | 290 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_1000elems(b *testing.B) { 291 | benchTransformAllLargeMemoryTransformer(b, 1000) 292 | } 293 | 294 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_10000elems(b *testing.B) { 295 | benchTransformAllLargeMemoryTransformer(b, 10000) 296 | } 297 | 298 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_100000elems(b *testing.B) { 299 | benchTransformAllLargeMemoryTransformer(b, 100000) 300 | } 301 | 302 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_1000000elems(b *testing.B) { 303 | benchTransformAllLargeMemoryTransformer(b, 1000000) 304 | } 305 | 306 | func benchTransformAllParallelLargeMemoryTransformer(b *testing.B, numelem int, nworkers uint) { 307 | s := make([]LargeMemoryTransformer, numelem) 308 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 309 | 310 | tr := makeMockLargeMemoryTransformerFeatureTransformer() 311 | 312 | b.ResetTimer() 313 | for n := 0; n < b.N; n++ { 314 | tr.TransformAllParallel(s, nworkers) 315 | } 316 | } 317 | 318 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_10elems_8workers(b *testing.B) { 319 | benchTransformAllParallelLargeMemoryTransformer(b, 10, 8) 320 | } 321 | 322 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_100elems_8workers(b *testing.B) { 323 | benchTransformAllParallelLargeMemoryTransformer(b, 100, 8) 324 | } 325 | 326 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_1000elems_8workers(b *testing.B) { 327 | benchTransformAllParallelLargeMemoryTransformer(b, 1000, 
8) 328 | } 329 | 330 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_10000elems_8workers(b *testing.B) { 331 | benchTransformAllParallelLargeMemoryTransformer(b, 10000, 8) 332 | } 333 | 334 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_100000elems_8workers(b *testing.B) { 335 | benchTransformAllParallelLargeMemoryTransformer(b, 100000, 8) 336 | } 337 | 338 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_1000000elems_8workers(b *testing.B) { 339 | benchTransformAllParallelLargeMemoryTransformer(b, 1000000, 8) 340 | } 341 | 342 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_5000000elems_8workers(b *testing.B) { 343 | benchTransformAllParallelLargeMemoryTransformer(b, 5000000, 8) 344 | } 345 | 346 | func BenchmarkLargeMemoryTransformerFeatureTransformer_TransformAll_15000000elems_8workers(b *testing.B) { 347 | benchTransformAllParallelLargeMemoryTransformer(b, 15000000, 8) 348 | } 349 | 350 | func benchLargeTransformerLargeMemoryTransformer(b *testing.B, numelem int) { 351 | var s []LargeMemoryTransformer 352 | fuzz.New().NilChance(0).NumElements(numelem, numelem).Fuzz(&s) 353 | 354 | tr := LargeMemoryTransformerFeatureTransformer{} 355 | tr.Fit(s) 356 | 357 | b.ResetTimer() 358 | for n := 0; n < b.N; n++ { 359 | tr.Transform(&s[0]) 360 | } 361 | } 362 | 363 | func BenchmarkLargeMemoryTransformerFeatureTransformer_Transform_LargeComposites_100elements(b *testing.B) { 364 | benchLargeTransformerLargeMemoryTransformer(b, 100) 365 | } 366 | 367 | func BenchmarkLargeMemoryTransformerFeatureTransformer_Transform_LargeComposites_1000elements(b *testing.B) { 368 | benchLargeTransformerLargeMemoryTransformer(b, 1000) 369 | } 370 | 371 | func BenchmarkLargeMemoryTransformerFeatureTransformer_Transform_LargeComposites_10000elements(b *testing.B) { 372 | benchLargeTransformerLargeMemoryTransformer(b, 10000) 373 | } 374 | 375 | func BenchmarkLargeMemoryTransformerFeatureTransformer_Transform_LargeComposites_100000elements(b *testing.B) { 376 | benchLargeTransformerLargeMemoryTransformer(b, 100000) 377 | } 378 | -------------------------------------------------------------------------------- /structtransformer/structtransformer_test.go: -------------------------------------------------------------------------------- 1 | package structtransformer_test 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | 7 | . "github.com/nikolaydubina/go-featureprocessing/structtransformer" 8 | . 
"github.com/nikolaydubina/go-featureprocessing/transformers" 9 | 10 | "github.com/stretchr/testify/assert" 11 | ) 12 | 13 | func TestStructTransformer(t *testing.T) { 14 | t.Run("test transform basic", func(t *testing.T) { 15 | type S struct { 16 | Age int `feature:"minmax"` 17 | Salary float64 `feature:"standard"` 18 | Gender string `feature:"onehot"` 19 | City string `feature:"ordinal"` 20 | } 21 | 22 | tr := StructTransformer{Transformers: []interface{}{ 23 | &MinMaxScaler{Min: 1, Max: 10}, 24 | &StandardScaler{Mean: 15, STD: 2.5}, 25 | &OneHotEncoder{Mapping: map[string]uint{"male": 0, "female": 1}}, 26 | &OrdinalEncoder{Mapping: map[string]uint{"city-A": 1, "city-B": 2}}, 27 | }} 28 | 29 | assert.Equal(t, []float64{1, 1, 0, 1, 2}, tr.Transform(S{Age: 23, Salary: 17.5, Gender: "female", City: "city-B"})) 30 | }) 31 | 32 | t.Run("test transform struct has fields but transformers missing", func(t *testing.T) { 33 | type S struct { 34 | Age int `feature:"minmax"` 35 | Salary float64 `feature:"standard"` 36 | Gender string `feature:"onehot"` 37 | City string `feature:"ordinal"` 38 | } 39 | 40 | tr := StructTransformer{} 41 | assert.Equal(t, []float64(nil), tr.Transform(S{Age: 23, Salary: 17.5, Gender: "female", City: "city-B"})) 42 | }) 43 | 44 | t.Run("test transform nil", func(t *testing.T) { 45 | tr := StructTransformer{} 46 | assert.Equal(t, []float64(nil), tr.Transform(nil)) 47 | }) 48 | 49 | t.Run("test transform nil pointer to struct", func(t *testing.T) { 50 | type S struct { 51 | Age int `feature:"minmax"` 52 | Salary float64 `feature:"standard"` 53 | Gender string `feature:"onehot"` 54 | City string `feature:"ordinal"` 55 | } 56 | var s S 57 | tr := StructTransformer{} 58 | assert.Equal(t, []float64(nil), tr.Transform(&s)) 59 | }) 60 | 61 | t.Run("test transform unexpected type panics", func(t *testing.T) { 62 | type T int 63 | type S struct { 64 | Age T `feature:"minmax"` 65 | Salary bool `feature:"standard"` 66 | Gender string `feature:"onehot"` 67 | City string `feature:"ordinal"` 68 | } 69 | s := S{} 70 | tr := StructTransformer{Transformers: []interface{}{ 71 | &MinMaxScaler{Min: 1, Max: 10}, 72 | &StandardScaler{Mean: 15, STD: 2.5}, 73 | &OneHotEncoder{Mapping: map[string]uint{"male": 0, "female": 1}}, 74 | &OrdinalEncoder{Mapping: map[string]uint{"city-A": 1, "city-B": 2}}, 75 | }} 76 | assert.PanicsWithValue(t, "unsupported type in struct", func() { tr.Transform(s) }) 77 | }) 78 | 79 | t.Run("test transform nil transformer skipped", func(t *testing.T) { 80 | type S struct { 81 | Age int `feature:"minmax"` 82 | Salary float64 `feature:"standard"` 83 | Gender string `feature:"onehot"` 84 | City string `feature:"ordinal"` 85 | } 86 | 87 | tr := StructTransformer{Transformers: []interface{}{ 88 | &MinMaxScaler{Min: 1, Max: 10}, 89 | nil, 90 | &OneHotEncoder{Mapping: map[string]uint{"male": 0, "female": 1}}, 91 | &OrdinalEncoder{Mapping: map[string]uint{"city-A": 1, "city-B": 2}}, 92 | }} 93 | 94 | assert.Equal(t, []float64{1, 0, 1, 2}, tr.Transform(S{Age: 23, Salary: 17.5, Gender: "female", City: "city-B"})) 95 | }) 96 | 97 | t.Run("test transform unexpected numerical transformer skipped", func(t *testing.T) { 98 | type S struct { 99 | Age int `feature:"minmax"` 100 | Salary float64 `feature:"standard"` 101 | Gender string `feature:"onehot"` 102 | City string `feature:"ordinal"` 103 | } 104 | type T struct{} 105 | 106 | tr := StructTransformer{Transformers: []interface{}{ 107 | &MinMaxScaler{Min: 1, Max: 10}, 108 | &T{}, 109 | &OneHotEncoder{Mapping: map[string]uint{"male": 
0, "female": 1}}, 110 | &OrdinalEncoder{Mapping: map[string]uint{"city-A": 1, "city-B": 2}}, 111 | }} 112 | 113 | assert.Equal(t, []float64{1, 0, 1, 2}, tr.Transform(S{Age: 23, Salary: 17.5, Gender: "female", City: "city-B"})) 114 | }) 115 | 116 | t.Run("test transform unexpected string transformer skipped", func(t *testing.T) { 117 | type S struct { 118 | Age int `feature:"minmax"` 119 | Salary float64 `feature:"standard"` 120 | Gender string `feature:"onehot"` 121 | City string `feature:"ordinal"` 122 | } 123 | type T struct{} 124 | 125 | tr := StructTransformer{Transformers: []interface{}{ 126 | &MinMaxScaler{Min: 1, Max: 10}, 127 | &StandardScaler{Mean: 15, STD: 2.5}, 128 | &OneHotEncoder{Mapping: map[string]uint{"male": 0, "female": 1}}, 129 | &T{}, 130 | }} 131 | 132 | assert.Equal(t, []float64{1, 1, 0, 1}, tr.Transform(S{Age: 23, Salary: 17.5, Gender: "female", City: "city-B"})) 133 | }) 134 | 135 | t.Run("test transform nil interface", func(t *testing.T) { 136 | type S interface { 137 | Get() int 138 | } 139 | var s S 140 | tr := StructTransformer{} 141 | assert.Equal(t, []float64(nil), tr.Transform(&s)) 142 | assert.Equal(t, []float64(nil), tr.Transform(s)) 143 | }) 144 | 145 | t.Run("test fit not implemented", func(t *testing.T) { 146 | type S struct { 147 | Age int `feature:"minmax"` 148 | Salary float64 `feature:"standard"` 149 | Gender string `feature:"onehot"` 150 | City string `feature:"ordinal"` 151 | } 152 | s := []interface{}{&S{}, &S{}} 153 | tr := StructTransformer{} 154 | assert.PanicsWithValue(t, "not implemented", func() { tr.Fit(s) }) 155 | 156 | }) 157 | } 158 | 159 | func BenchmarkStructTransformer_Transform_Small(b *testing.B) { 160 | type S struct { 161 | Age int `feature:"minmax"` 162 | Salary float64 `feature:"standard"` 163 | Gender string `feature:"onehot"` 164 | City string `feature:"ordinal"` 165 | } 166 | 167 | tr := StructTransformer{Transformers: []interface{}{ 168 | &MinMaxScaler{Min: 1, Max: 10}, 169 | &StandardScaler{Mean: 15, STD: 2.5}, 170 | &OneHotEncoder{Mapping: map[string]uint{"male": 0, "female": 1}}, 171 | &OrdinalEncoder{Mapping: map[string]uint{"city-A": 1, "city-B": 2}}, 172 | }} 173 | 174 | s := S{ 175 | Age: 23, 176 | Salary: 17.5, 177 | Gender: "female", 178 | City: "city-B", 179 | } 180 | 181 | b.ResetTimer() 182 | for n := 0; n < b.N; n++ { 183 | tr.Transform(s) 184 | } 185 | } 186 | 187 | func randomInt(min, max int) int { 188 | return min + rand.Intn(max-min) 189 | } 190 | 191 | func randomString(len int) string { 192 | bytes := make([]byte, len) 193 | for i := 0; i < len; i++ { 194 | bytes[i] = byte(randomInt(65, 90)) 195 | } 196 | return string(bytes) 197 | } 198 | 199 | func randomSliceFloat64(num int) []float64 { 200 | ret := make([]float64, num) 201 | for i := 0; i < num; i++ { 202 | ret[i] = rand.Float64() 203 | } 204 | return ret 205 | } 206 | 207 | func randomMappingString(num int, strlen int) map[string]uint { 208 | ret := make(map[string]uint) 209 | for i := 0; i < num; i++ { 210 | ret[randomString(strlen)] = uint(i) 211 | } 212 | return ret 213 | } 214 | 215 | func getAnyKeyFromMap(mp map[string]uint) string { 216 | for k := range mp { 217 | return k 218 | } 219 | return "" 220 | } 221 | 222 | func benchLargeTransformer(b *testing.B, numelem int) { 223 | type S struct { 224 | Name1 string `feature:"onehot"` 225 | Name2 string `feature:"onehot"` 226 | Name3 string `feature:"ordinal"` 227 | Name4 string `feature:"ordinal"` 228 | Name5 float64 `feature:"quantile"` 229 | Name6 float64 `feature:"quantile"` 230 | Name7 float64 
`feature:"kbins"` 231 | Name8 float64 `feature:"kbins"` 232 | } 233 | 234 | tr := StructTransformer{Transformers: []interface{}{ 235 | &OneHotEncoder{Mapping: randomMappingString(numelem, 20)}, 236 | &OneHotEncoder{Mapping: randomMappingString(numelem, 20)}, 237 | &OrdinalEncoder{Mapping: randomMappingString(numelem, 20)}, 238 | &OrdinalEncoder{Mapping: randomMappingString(numelem, 20)}, 239 | &QuantileScaler{Quantiles: randomSliceFloat64(numelem)}, 240 | &QuantileScaler{Quantiles: randomSliceFloat64(numelem)}, 241 | &KBinsDiscretizer{QuantileScaler: QuantileScaler{Quantiles: randomSliceFloat64(numelem)}}, 242 | &KBinsDiscretizer{QuantileScaler: QuantileScaler{Quantiles: randomSliceFloat64(numelem)}}, 243 | }} 244 | 245 | s := S{ 246 | Name1: getAnyKeyFromMap(tr.Transformers[0].(*OneHotEncoder).Mapping), 247 | Name2: getAnyKeyFromMap(tr.Transformers[1].(*OneHotEncoder).Mapping), 248 | Name3: getAnyKeyFromMap(tr.Transformers[2].(*OrdinalEncoder).Mapping), 249 | Name4: getAnyKeyFromMap(tr.Transformers[3].(*OrdinalEncoder).Mapping), 250 | Name5: tr.Transformers[4].(*QuantileScaler).Quantiles[randomInt(1, numelem-1)], 251 | Name6: tr.Transformers[5].(*QuantileScaler).Quantiles[randomInt(1, numelem-1)], 252 | Name7: tr.Transformers[6].(*KBinsDiscretizer).Quantiles[randomInt(1, numelem-1)], 253 | Name8: tr.Transformers[7].(*KBinsDiscretizer).Quantiles[randomInt(1, numelem-1)], 254 | } 255 | 256 | b.ResetTimer() 257 | for n := 0; n < b.N; n++ { 258 | tr.Transform(s) 259 | } 260 | } 261 | 262 | func BenchmarkStructTransformer_Transform_LargeComposites_100elements(b *testing.B) { 263 | benchLargeTransformer(b, 100) 264 | } 265 | 266 | func BenchmarkStructTransformer_Transform_LargeComposites_1000elements(b *testing.B) { 267 | benchLargeTransformer(b, 1000) 268 | } 269 | 270 | func BenchmarkStructTransformer_Transform_LargeComposites_10000elements(b *testing.B) { 271 | benchLargeTransformer(b, 10000) 272 | } 273 | 274 | func BenchmarkStructTransformer_Transform_LargeComposites_100000elements(b *testing.B) { 275 | benchLargeTransformer(b, 100000) 276 | } 277 | 278 | func BenchmarkStructTransformer_Transform_32fields(b *testing.B) { 279 | 280 | type S struct { 281 | F1 float64 `feature:"minmax"` 282 | F2 float64 `feature:"standard"` 283 | F3 float64 `feature:"minmax"` 284 | F4 float64 `feature:"standard"` 285 | F5 float64 `feature:"minmax"` 286 | F6 float64 `feature:"standard"` 287 | F7 float64 `feature:"minmax"` 288 | F8 float64 `feature:"standard"` 289 | F9 float64 `feature:"minmax"` 290 | F10 float64 `feature:"standard"` 291 | F11 float64 `feature:"minmax"` 292 | F12 float64 `feature:"standard"` 293 | F13 float64 `feature:"minmax"` 294 | F14 float64 `feature:"standard"` 295 | F15 float64 `feature:"minmax"` 296 | F16 float64 `feature:"standard"` 297 | F17 float64 `feature:"minmax"` 298 | F18 float64 `feature:"standard"` 299 | F19 float64 `feature:"minmax"` 300 | F20 float64 `feature:"standard"` 301 | F21 float64 `feature:"minmax"` 302 | F22 float64 `feature:"standard"` 303 | F23 float64 `feature:"minmax"` 304 | F24 float64 `feature:"standard"` 305 | F25 float64 `feature:"minmax"` 306 | F26 float64 `feature:"standard"` 307 | F27 float64 `feature:"minmax"` 308 | F28 float64 `feature:"standard"` 309 | F29 float64 `feature:"minmax"` 310 | F30 float64 `feature:"standard"` 311 | F31 float64 `feature:"minmax"` 312 | F32 float64 `feature:"standard"` 313 | } 314 | 315 | tr := StructTransformer{Transformers: []interface{}{ 316 | &MinMaxScaler{Min: 1, Max: 10}, 317 | &StandardScaler{Mean: 15, STD: 2.5}, 318 
| &MinMaxScaler{Min: 1, Max: 10}, 319 | &StandardScaler{Mean: 15, STD: 2.5}, 320 | &MinMaxScaler{Min: 1, Max: 10}, 321 | &StandardScaler{Mean: 15, STD: 2.5}, 322 | &MinMaxScaler{Min: 1, Max: 10}, 323 | &StandardScaler{Mean: 15, STD: 2.5}, 324 | &MinMaxScaler{Min: 1, Max: 10}, 325 | &StandardScaler{Mean: 15, STD: 2.5}, 326 | &MinMaxScaler{Min: 1, Max: 10}, 327 | &StandardScaler{Mean: 15, STD: 2.5}, 328 | &MinMaxScaler{Min: 1, Max: 10}, 329 | &StandardScaler{Mean: 15, STD: 2.5}, 330 | &MinMaxScaler{Min: 1, Max: 10}, 331 | &StandardScaler{Mean: 15, STD: 2.5}, 332 | &MinMaxScaler{Min: 1, Max: 10}, 333 | &StandardScaler{Mean: 15, STD: 2.5}, 334 | &MinMaxScaler{Min: 1, Max: 10}, 335 | &StandardScaler{Mean: 15, STD: 2.5}, 336 | &MinMaxScaler{Min: 1, Max: 10}, 337 | &StandardScaler{Mean: 15, STD: 2.5}, 338 | &MinMaxScaler{Min: 1, Max: 10}, 339 | &StandardScaler{Mean: 15, STD: 2.5}, 340 | &MinMaxScaler{Min: 1, Max: 10}, 341 | &StandardScaler{Mean: 15, STD: 2.5}, 342 | &MinMaxScaler{Min: 1, Max: 10}, 343 | &StandardScaler{Mean: 15, STD: 2.5}, 344 | &MinMaxScaler{Min: 1, Max: 10}, 345 | &StandardScaler{Mean: 15, STD: 2.5}, 346 | &MinMaxScaler{Min: 1, Max: 10}, 347 | &StandardScaler{Mean: 15, STD: 2.5}, 348 | }} 349 | 350 | s := S{ 351 | F1: 1231231.123, 352 | F2: 1231231.123, 353 | F3: 1231231.123, 354 | F4: 1231231.123, 355 | F5: 1231231.123, 356 | F6: 1231231.123, 357 | F7: 1231231.123, 358 | F8: 1231231.123, 359 | F9: 1231231.123, 360 | F10: 1231231.123, 361 | F11: 1231231.123, 362 | F12: 1231231.123, 363 | F13: 1231231.123, 364 | F14: 1231231.123, 365 | F15: 1231231.123, 366 | F16: 1231231.123, 367 | F17: 1231231.123, 368 | F18: 1231231.123, 369 | F19: 1231231.123, 370 | F20: 1231231.123, 371 | F21: 1231231.123, 372 | F22: 1231231.123, 373 | F23: 1231231.123, 374 | F24: 1231231.123, 375 | F25: 1231231.123, 376 | F26: 1231231.123, 377 | F27: 1231231.123, 378 | F28: 1231231.123, 379 | F29: 1231231.123, 380 | F30: 1231231.123, 381 | F31: 1231231.123, 382 | F32: 1231231.123, 383 | } 384 | 385 | b.ResetTimer() 386 | for n := 0; n < b.N; n++ { 387 | tr.Transform(s) 388 | } 389 | } 390 | --------------------------------------------------------------------------------
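The reflection-based StructTransformer exercised in structtransformer_test.go above can also be used directly, without code generation. Below is a minimal usage sketch, not part of the repository listing: it assumes the import paths used by the test file (github.com/nikolaydubina/go-featureprocessing/structtransformer and .../transformers), and the struct name Record and its field values are hypothetical, mirroring the "test transform basic" case.

package main

import (
	"fmt"

	"github.com/nikolaydubina/go-featureprocessing/structtransformer"
	"github.com/nikolaydubina/go-featureprocessing/transformers"
)

// Record is a hypothetical input struct; its `feature` tags mirror the ones in structtransformer_test.go.
type Record struct {
	Age    int     `feature:"minmax"`
	Salary float64 `feature:"standard"`
	Gender string  `feature:"onehot"`
	City   string  `feature:"ordinal"`
}

func main() {
	// One transformer per tagged field, in field order, as in the tests above.
	tr := structtransformer.StructTransformer{Transformers: []interface{}{
		&transformers.MinMaxScaler{Min: 1, Max: 10},
		&transformers.StandardScaler{Mean: 15, STD: 2.5},
		&transformers.OneHotEncoder{Mapping: map[string]uint{"male": 0, "female": 1}},
		&transformers.OrdinalEncoder{Mapping: map[string]uint{"city-A": 1, "city-B": 2}},
	}}

	// Transform flattens the tagged fields into a single feature vector.
	features := tr.Transform(Record{Age: 23, Salary: 17.5, Gender: "female", City: "city-B"})
	fmt.Println(features) // per the test expectation above: [1 1 0 1 2]
}

The generated *FeatureTransformer types shown earlier in the listing expose the same Transform-style API without reflection, which is what the benchmark functions in those files measure.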