├── .gitignore
├── CHANGELOG.md
├── CITATION.cff
├── LICENSE
├── Makefile
├── README.md
├── bin
│   └── kxy
├── docker
│   └── kxy
│       └── Dockerfile
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── kxy.css
│   │   ├── matomo.js
│   │   └── theme_override.css
│   ├── _templates
│   │   └── layout.html
│   ├── conf.py
│   ├── images
│   │   ├── bn_importance.png
│   │   ├── bn_incremental_importance.png
│   │   ├── bn_separability.png
│   │   ├── classification_accuracy_frontier.png
│   │   ├── classification_accuracy_frontier_2.png
│   │   ├── entropy_venn.png
│   │   ├── favicon.png
│   │   ├── gm_separability.png
│   │   ├── gm_separability_mov.gif
│   │   ├── incremental_input_importance.png
│   │   ├── logo.png
│   │   └── logo.svg
│   ├── index.rst
│   ├── latest
│   │   ├── api
│   │   │   └── index.rst
│   │   ├── applications
│   │   │   ├── case_studies
│   │   │   │   ├── conditionally_useful_features.ipynb
│   │   │   │   ├── empirical_validation_classification.ipynb
│   │   │   │   ├── empirical_validation_regression.ipynb
│   │   │   │   ├── features_pruning.ipynb
│   │   │   │   ├── index.rst
│   │   │   │   └── unbalanced_datasets.ipynb
│   │   │   ├── cheat_sheet
│   │   │   │   └── index.rst
│   │   │   ├── illustrations
│   │   │   │   ├── abalone.ipynb
│   │   │   │   ├── adult.ipynb
│   │   │   │   ├── air_quality.ipynb
│   │   │   │   ├── airfoil.ipynb
│   │   │   │   ├── aps.ipynb
│   │   │   │   ├── avila.ipynb
│   │   │   │   ├── bank_marketing.ipynb
│   │   │   │   ├── bank_note.ipynb
│   │   │   │   ├── bike_sharing.ipynb
│   │   │   │   ├── blog_feedback.ipynb
│   │   │   │   ├── card_default.ipynb
│   │   │   │   ├── concrete.ipynb
│   │   │   │   ├── ct_slices.ipynb
│   │   │   │   ├── diabetic.ipynb
│   │   │   │   ├── eeg.ipynb
│   │   │   │   ├── empirical_validation_regression.ipynb
│   │   │   │   ├── energy_efficiency.ipynb
│   │   │   │   ├── facebook_comments.ipynb
│   │   │   │   ├── heart_attack.ipynb
│   │   │   │   ├── house_prices_advanced.ipynb
│   │   │   │   ├── index.rst
│   │   │   │   ├── landsat.ipynb
│   │   │   │   ├── letter_recognition.ipynb
│   │   │   │   ├── magic_gamma.ipynb
│   │   │   │   ├── naval_propulsion.ipynb
│   │   │   │   ├── online_news.ipynb
│   │   │   │   ├── parkinson.ipynb
│   │   │   │   ├── power_plant.ipynb
│   │   │   │   ├── real_estate.ipynb
│   │   │   │   ├── sensorless_drive.ipynb
│   │   │   │   ├── shuttle.ipynb
│   │   │   │   ├── skin_segmentation.ipynb
│   │   │   │   ├── social_media_buzz.ipynb
│   │   │   │   ├── superconductivity.ipynb
│   │   │   │   ├── titanic.ipynb
│   │   │   │   ├── water_quality.ipynb
│   │   │   │   ├── white_wine_quality.ipynb
│   │   │   │   ├── yacht.ipynb
│   │   │   │   └── year_prediction_msd.ipynb
│   │   │   └── index.rst
│   │   ├── data_transfer
│   │   │   └── index.rst
│   │   ├── data_valuation
│   │   │   └── index.rst
│   │   ├── index
│   │   │   └── index.rst
│   │   ├── learning
│   │   │   └── index.rst
│   │   ├── model_explanation
│   │   │   └── index.rst
│   │   ├── model_free_variable_selection
│   │   │   └── index.rst
│   │   ├── model_improvability
│   │   │   └── index.rst
│   │   ├── model_wrapped_feature_selection
│   │   │   └── index.rst
│   │   ├── pandas
│   │   │   └── index.rst
│   │   ├── quickstart
│   │   │   └── getting_started.ipynb
│   │   ├── theoretical_foundation
│   │   │   ├── memoryful
│   │   │   │   └── index.rst
│   │   │   └── memoryless
│   │   │       ├── applications.rst
│   │   │       ├── estimation.rst
│   │   │       ├── index.rst
│   │   │       ├── problem_formulation.rst
│   │   │       └── quantifying_informativeness.rst
│   │   └── utilities
│   │       └── index.rst
│   └── make.bat
├── kxy
│   ├── __init__.py
│   ├── api
│   │   ├── __init__.py
│   │   ├── client.py
│   │   ├── data_transfer.py
│   │   ├── decorators.py
│   │   └── utils.py
│   ├── billing
│   │   ├── __init__.py
│   │   └── billing_details.py
│   ├── examples
│   │   ├── autogluon_compression.ipynb
│   │   ├── feature_selection_benchmark.py
│   │   ├── feature_selection_example.py
│   │   ├── lightgbm_model_compression.ipynb
│   │   ├── numerai_example.py
│   │   ├── random_forest_model_compression.ipynb
│   │   └── xgboost_model_compression.ipynb
│   ├── finance
│   │   ├── __init__.py
│   │   └── corr.py
│   ├── learning
│   │   ├── __init__.py
│   │   ├── base_learners.py
│   │   ├── leanml_predictor.py
│   │   ├── pytorch_early_termination.py
│   │   └── tensorflow_early_termination.py
│   ├── misc
│   │   ├── __init__.py
│   │   ├── boruta.py
│   │   ├── exceptions.py
│   │   ├── mind.py
│   │   ├── naive.py
│   │   ├── predictors.py
│   │   ├── rfe.py
│   │   └── tf
│   │       ├── __init__.py
│   │       ├── config.py
│   │       ├── generators.py
│   │       ├── initializers.py
│   │       ├── layers.py
│   │       ├── learners.py
│   │       ├── losses.py
│   │       ├── models.py
│   │       └── ops.py
│   ├── pandas_extension
│   │   ├── __init__.py
│   │   ├── accessor.py
│   │   ├── base_accessor.py
│   │   ├── features_accessor.py
│   │   ├── features_utils.py
│   │   ├── finance_accessor.py
│   │   ├── learning_accessor.py
│   │   ├── post_learning_accessor.py
│   │   └── pre_learning_accessor.py
│   ├── pfs
│   │   ├── __init__.py
│   │   ├── pfs_predictor.py
│   │   └── pfs_selector.py
│   ├── post_learning
│   │   ├── __init__.py
│   │   ├── improvability.py
│   │   └── model_explanation.py
│   └── pre_learning
│       ├── __init__.py
│       ├── achievable_performance.py
│       └── variable_selection.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    ├── test_boruta.py
    ├── test_data_valuation.py
    ├── test_features.py
    ├── test_finance.py
    ├── test_flow.py
    ├── test_learning.py
    ├── test_load_save_base_learners.py
    ├── test_load_save_predictors.py
    ├── test_misc.py
    ├── test_pca.py
    ├── test_pfs.py
    └── test_rfe.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # local files
132 | local/
133 | lambdas/
134 | learners/
135 |
136 | .DS_Store
137 | UCI*/
138 |
139 | *.csv
140 | *.pkl
141 | *.sav
142 | *.sav-*
143 | *.json
144 | *.parquet
145 | *.h5
146 | *.png
147 | local_*.py
148 | *do-not-commit*
149 | AutogluonModels*
150 | *-PFSPredictor
151 | *-PCAPredictor
152 | *-LeanMLPredictor
153 | *-NaivePredictor
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 |
2 | # Change Log
3 |
4 | ## v.1.4.10 Changes
5 |
6 | * Added a function to construct features that, based on PFS mutual information estimation, are expected to be linearly related to the target.
7 | * Fixed a global name conflict in `kxy.learning.base_learners`.
8 |
9 |
10 | ## v.1.4.9 Changes
11 |
12 | * Changed the activation function used by PFS from ReLU to swish/SiLU.
13 | * Left it to the user to set the logging level.
14 |
15 |
16 | ## v.1.4.8 Changes
17 |
18 | * Froze the versions of all python packages in the docker file.
19 |
20 |
21 | ## v.1.4.7 Changes
22 |
23 | Changes related to optimizing Principal Feature Selection.
24 |
25 | * Made it easy to change PFS' default learning parameters.
26 | * Changed PFS' default learning parameters (learning rate is now 0.005 and epsilon 1e-04)
27 | * Added a seed parameter to PFS' fit method for reproducibility.
28 |
29 | To globally change the learning rate to 0.003, change Adam's epsilon to 1e-5, and the number of epochs to 25, do
30 |
31 | ```Python
32 | from kxy.misc.tf import set_default_parameter
33 | set_default_parameter('lr', 0.003)
34 | set_default_parameter('epsilon', 1e-5)
35 | set_default_parameter('epochs', 25)
36 | ```
37 |
38 | To change the number of epochs for a single iteration of PFS, use the `epochs` argument of the `fit` method of your `PFS` object. The `fit` method now also has a `seed` parameter you may use to make the PFS implementation deterministic.
39 |
40 | Example:
41 | ```Python
42 | from kxy.pfs import PFS
43 | selector = PFS()
44 | selector.fit(x, y, epochs=25, seed=123)
45 | ```
46 |
47 | Alternatively, you may also use the `kxy.misc.tf.set_seed` method to make PFS deterministic.
48 |
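For instance (a minimal sketch, assuming `set_seed` takes the seed value as its sole argument):

```Python
from kxy.misc.tf import set_seed
set_seed(123)  # subsequent PFS fits become deterministic
```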
49 |
50 | ## v.1.4.6 Changes
51 |
52 | Minor PFS improvements.
53 |
54 | * Added more (robust) mutual information loss functions.
55 | * Exposed the learned total mutual information between principal features and the target as an attribute of PFS.
56 | * Exposed the number of epochs as a parameter of PFS' fit method.
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: If you use this software, please cite it using these metadata.
3 | authors:
4 | - family-names: Kom Samo
5 | given-names: Yves-Laurent
6 | orcid: "https://orcid.org/0000-0003-2901-6930"
7 | title: "KXY: A Seamless API to 10x The Productivity of Machine Learning Engineers."
8 | version: 1.4.3
9 | date-released: "2021-10-12"
10 | abstract: KXY is a powerful serverless analysis toolkit that takes trial-and-error out of machine learning projects.
11 | url: "https://github.com/kxytechnologies/kxy-python"
12 | license: GPL-3.0
13 |
14 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | VERSION = 1.4.11
2 |
3 | # Update the s3 bucket of the docs website
4 | deploy_docs:
5 | aws s3 sync docs/_build/html s3://www.kxy.ai/reference/ --acl public-read --metadata-directive REPLACE --cache-control max-age=86400 --profile kxy
6 |
7 | # Invalidate certain cached files in the cloudfront distribution
8 | refresh_web:
9 | aws cloudfront create-invalidation --distribution-id EJZS9SM07YXKX --paths $(PATHS) --profile kxy
10 |
11 | # Cut a PyPi release
12 | pypi_release:
13 | python setup.py sdist bdist_wheel
14 | twine check dist/*
15 | twine upload --skip-existing dist/*
16 |
17 | install:
18 | pip install .
19 |
20 |
21 | docker_release:
22 | docker build -t kxytechnologies/kxy:latest ./docker/kxy/
23 | docker login --username drylnks && docker push kxytechnologies/kxy:latest
24 |
25 |
26 | docker_release_github:
27 | docker build -t ghcr.io/kxytechnologies/kxy-python:latest ./docker/kxy/
28 | # echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:latest
29 | docker push ghcr.io/kxytechnologies/kxy-python:latest
30 | docker build -t ghcr.io/kxytechnologies/kxy-python:$(VERSION) ./docker/kxy/
31 | # echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION)
32 | docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION)
33 |
34 |
35 | one_shot_release:
36 | make clean
37 | make html
38 | make deploy_docs
39 | make refresh_web PATHS=/reference/*
40 | make docker_release
41 |
42 |
43 | update_docs:
44 | make clean
45 | make html
46 | make deploy_docs
47 | make refresh_web PATHS=/reference/*
48 |
49 |
50 | github_release:
51 | gh release create v$(VERSION) -F CHANGELOG.md
52 |
53 |
54 | package_release:
55 | make pypi_release
56 | make github_release
57 | sleep 5
58 | make docker_release_github
59 | make docker_release
60 |
61 |
62 | osr:
63 | make one_shot_release
64 |
65 |
66 | # Route any other make target to Sphinx
67 | # You can set these variables from the command line, and also
68 | # from the environment for the first two.
69 | SPHINXOPTS ?=
70 | SPHINXBUILD ?= sphinx-build
71 | SOURCEDIR = docs
72 | BUILDDIR = docs/_build
73 |
74 | help:
75 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
76 |
77 | .PHONY: help Makefile
78 |
79 | # Catch-all target: route all unknown targets to Sphinx using the new
80 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
81 | %: Makefile
82 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
83 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | -----------------
6 |
7 | # Boosting The Productivity of Machine Learning Engineers
8 | [![License](https://img.shields.io/badge/license-GPLv3%2B-blue)](https://github.com/kxytechnologies/kxy-python/blob/master/LICENSE)
9 | [![PyPI Latest Release](https://img.shields.io/pypi/v/kxy.svg)](https://www.kxy.ai/)
10 | [![Downloads](https://pepy.tech/badge/kxy)](https://www.kxy.ai/)
11 |
12 |
13 | ## Documentation
14 | https://www.kxy.ai/reference/
15 |
16 | ## Blog
17 | https://blog.kxy.ai
18 |
19 |
20 | ## Installation
21 | From PyPi:
22 | ```Bash
23 | pip install kxy -U
24 | ```
25 | From GitHub:
26 | ```Bash
27 | git clone https://github.com/kxytechnologies/kxy-python.git && cd ./kxy-python && pip install .
28 | ```
29 | ## Authentication
30 | All heavy-duty computations are run on our serverless infrastructure and require an API key. To configure the package with your API key, run
31 | ```Bash
32 | kxy configure
33 | ```
34 | and follow the instructions. To get your own API key you need an account; you can sign up [here](https://www.kxy.ai/signup/). You'll then be automatically given an API key which you can find [here](https://www.kxy.ai/portal/profile/identity/).
35 |
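The `configure` command also accepts the key directly as an argument (per the `bin/kxy` script), which is handy for scripted setups:
```Bash
kxy configure <YOUR API KEY>
```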
36 |
37 | ## Docker
38 | The Docker image [kxytechnologies/kxy](https://hub.docker.com/repository/docker/kxytechnologies/kxy) has been built for your convenience, and comes with anaconda, auto-sklearn, and the kxy package.
39 |
40 | To start a Jupyter Notebook server from a sandboxed Docker environment, run
41 | ```Bash
42 | docker run -i -t -p 5555:8888 kxytechnologies/kxy:latest /bin/bash -c "kxy configure <YOUR API KEY> && /opt/conda/bin/jupyter notebook --notebook-dir=/opt/notebooks --ip='*' --port=8888 --no-browser --allow-root --NotebookApp.token=''"
43 | ```
44 | where you should replace `<YOUR API KEY>` with your API key and navigate to [http://localhost:5555](http://localhost:5555) in your browser. This docker environment comes with [all examples available on the documentation website](https://www.kxy.ai/reference/latest/examples/).
45 |
46 | To start a Jupyter Notebook server from an existing directory of notebooks, run
47 | ```Bash
48 | docker run -i -t --mount src=<LOCAL NOTEBOOK FOLDER>,target=/opt/notebooks,type=bind -p 5555:8888 kxytechnologies/kxy:latest /bin/bash -c "kxy configure <YOUR API KEY> && /opt/conda/bin/jupyter notebook --notebook-dir=/opt/notebooks --ip='*' --port=8888 --no-browser --allow-root --NotebookApp.token=''"
49 | ```
50 | where you should replace `<LOCAL NOTEBOOK FOLDER>` with the path to your local notebook folder and `<YOUR API KEY>` with your API key, then navigate to [http://localhost:5555](http://localhost:5555) in your browser.
51 |
52 | You can also get the same Docker image from GitHub [here](https://github.com/kxytechnologies/kxy-python/pkgs/container/kxy-python).
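
For instance, using the image name from this repo's Makefile:
```Bash
docker pull ghcr.io/kxytechnologies/kxy-python:latest
```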
53 |
54 | ## Other Programming Languages
55 | We plan to release friendly API clients in more programming languages.
56 |
57 | In the meantime, you can directly issue requests to our [RESTful API](https://www.kxy.ai/reference/latest/api/index.html) using your favorite programming language.
58 |
59 | ## Pricing
60 | All API keys are given a free quota (a few dozen backend tasks) that should be enough to try out the package and see if you love it. Beyond the free quota you will be billed a small fee per task.
61 |
62 | KXY is free for academic use; simply sign up with your university email.
63 |
64 | KXY is also free for Kaggle competitions; sign up and email kaggle@kxy.ai to get a promotional code.
65 |
--------------------------------------------------------------------------------
/bin/kxy:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import os
4 | if os.environ.get('LC_CTYPE', '') == 'UTF-8':
5 | os.environ['LC_CTYPE'] = 'en_US.UTF-8'
6 |
7 | import json
8 |
9 | def main(api_key=None):
10 | home = os.path.expanduser("~")
11 | path = os.path.join(home, '.kxy')
12 | os.makedirs(path, exist_ok=True)
13 | file_name = os.path.join(path, 'config')
14 |
15 | if not os.path.exists(file_name):
16 | with open(file_name, 'w') as f:
17 | json.dump({}, f)
18 |
19 | with open(file_name, 'r') as f:
20 | config = json.load(f)
21 | existing_key = config.get('KXY_API_KEY', '')
22 |
23 | if existing_key != '':
24 | existing_key = '(' + existing_key[:4] + '*' * (len(existing_key)-4) + ') '
25 |
26 | if api_key is None:
27 | api_key = input('KXY API Key: %s' % existing_key)
28 | if api_key is None or api_key == '':
29 | api_key = config.get('KXY_API_KEY', '')
30 |
31 | config['KXY_API_KEY'] = api_key
32 |
33 | with open(file_name, 'w') as f:
34 | json.dump(config, f)
35 |
36 | return
37 |
38 |
39 | if __name__ == '__main__':
40 | if len(sys.argv) > 1 and sys.argv[1] == 'configure':
41 | api_key = sys.argv[2] if len(sys.argv) > 2 else None
42 | sys.exit(main(api_key=api_key))
--------------------------------------------------------------------------------
/docker/kxy/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM continuumio/anaconda3
2 |
3 | RUN apt-get update
4 | RUN apt-get install build-essential -y
5 | RUN apt-get install swig -y
6 |
7 | RUN /opt/conda/bin/conda install gxx_linux-64 gcc_linux-64
8 | RUN /opt/conda/bin/conda install jupyter -y --quiet
9 | RUN pip install --upgrade pip
10 | RUN pip install pyarrow==7.0.0
11 | RUN pip install fastparquet==0.8.0
12 | RUN pip install emcee==3.1.1 scikit-optimize==0.9.0 pyDOE==0.3.8
13 | RUN pip install auto-sklearn==0.14.6
14 |
15 | # Install other ML open source librairies
16 | RUN pip install xgboost==1.5.2
17 | RUN pip install lightgbm==3.3.2
18 | RUN pip install tensorflow==2.8.0
19 | RUN pip install tensorflow_probability==0.16.0
20 | RUN pip install botocore==1.24.27
21 | RUN pip install boto3==1.21.27
22 | RUN pip install tqdm==4.62.3
23 |
24 | # Install kxy
25 | RUN pip install kxy==1.4.10
26 |
27 | # Copy examples into the Notebooks folder
28 | RUN git clone https://github.com/kxytechnologies/kxy-python.git /opt/kxy-python
29 | RUN mkdir /opt/notebooks
30 | RUN cp -R /opt/kxy-python/docs/latest/applications/case_studies/* /opt/notebooks/
31 | RUN cp -R /opt/kxy-python/docs/latest/applications/illustrations/* /opt/notebooks/
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 |
18 | # Catch-all target: route all unknown targets to Sphinx using the new
19 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
20 | %: Makefile
21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/docs/_static/kxy.css:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | .rst-content .admonition-summary .admonition-title {
5 | background: #2c3d5e;
6 | }
7 |
8 | .rst-content .admonition-summary .admonition-title:before {
9 | content: "\f0c6";
10 | }
11 |
12 | .rst-content .admonition-theoretical-foundation .admonition-title {
13 | background: #2c3d5e;
14 | }
15 |
16 | .rst-content .admonition-theoretical-foundation .admonition-title:before {
17 | content: "\f140";
18 | }
19 |
20 |
21 | .rst-content .note .admonition-title {
22 | background: #2c3d5e;
23 | }
24 |
25 | .rst-content .note .admonition-title:before {
26 | content: "\f0c6";
27 | }
28 |
29 | .rst-content .seealso .admonition-title {
30 | background: #343131;
31 | }
32 |
33 | .rst-content .seealso .admonition-title:before {
34 | content: "\f140";
35 | }
36 |
37 | .rst-content .admonition, .rst-content .admonition-title {
38 | border-radius: 15px;
39 | }
40 |
41 | .rst-content .important .admonition-title:before {
42 | content: "\f071";
43 | }
44 |
45 | .rst-content .important .admonition-title {
46 | background: #ffa98f;
47 | }
48 |
49 | .rst-content .admonition-important-equation .admonition-title {
50 | background: #1abc9c;
51 | }
52 |
53 | .rst-content .admonition-important-equation {
54 | background: #eeffcc;
55 | }
56 |
57 | .rst-content .important {
58 | background: #ffe4dc;
59 | }
60 |
61 | .underline {
62 | text-decoration: underline;
63 | }
64 |
65 | .wy-menu .caption-text {
66 | color: #02e966;
67 | }
68 |
69 | .rst-content .admonition-properties .admonition-title:before,
70 | .rst-content .admonition-property .admonition-title:before,
71 | .rst-content .admonition-important-equation .admonition-title:before {
72 | content: "\f08d";
73 | }
74 |
75 | .rst-content .admonition-properties {
76 | counter-reset: properties;
77 |
78 | }
79 |
80 | .rst-content .admonition-properties ol>li {
81 | list-style-type: none;
82 | }
83 |
84 | .rst-content .admonition-properties ol>li:before {
85 | counter-increment: properties;
86 | content: "P" counter(properties) ".\00a0\00a0";
87 | font-weight: bold;
88 | }
89 |
90 | span.eqno {
91 | float: right;
92 | }
93 |
94 |
95 | .rst-content .footnote-reference, .rst-content .citation-reference {
96 | top: 0;
97 | }
98 |
99 |
100 | .math .eqno a.headerlink {
101 | visibility: hidden;
102 | }
103 |
104 |
105 | mark.kxy-blue{
106 | color: #2C3960;
107 | background: rgba(2,233,102, 0.3);
108 | border-radius: 2px;
109 | font-weight: normal;
110 | }
111 |
112 | i {
113 | font-style: italic;
114 | }
115 |
116 |
117 | @media only screen
118 | and (max-device-width : 767px) {
119 | .wy-nav-top {
120 | background: #2C3960;
121 | }
122 | }
--------------------------------------------------------------------------------
/docs/_static/matomo.js:
--------------------------------------------------------------------------------
1 |
2 | var _paq = window._paq || [];
3 | /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
4 | _paq.push(['trackPageView']);
5 | _paq.push(['enableLinkTracking']);
6 | (function() {
7 | var u="https://kxyai.matomo.cloud/";
8 | _paq.push(['setTrackerUrl', u+'matomo.php']);
9 | _paq.push(['setSiteId', '1']);
10 | var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
11 | g.type='text/javascript'; g.async=true; g.defer=true; g.src='//cdn.matomo.cloud/kxyai.matomo.cloud/matomo.js'; s.parentNode.insertBefore(g,s);
12 | })();
13 |
--------------------------------------------------------------------------------
/docs/_static/theme_override.css:
--------------------------------------------------------------------------------
1 |
2 | @media screen and (min-width: 1500px) {
3 | .wy-nav-content {
4 | max-width: calc(100vw - 600px);
5 | }
6 | }
7 |
8 | @media screen and (max-width: 768px) {
9 | .wy-nav-content-wrap .wy-nav-content {
10 | max-width: 100%;
11 | }
12 | }
--------------------------------------------------------------------------------
/docs/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 | {% set css_files = css_files + ["_static/kxy.css", "_static/theme_override.css"] %}
3 | {% set script_files = script_files + ["_static/matomo.js"] %}
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('..'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = 'KXY (Lean AutoML, As A Service)'
21 | copyright = '2021, KXY Technologies, Inc'
22 | author = 'Dr. Yves-Laurent Kom Samo'
23 | version = 'latest'
24 | autodoc_inherit_docstrings = False
25 |
26 |
27 | # -- General configuration ---------------------------------------------------
28 |
29 | # Add any Sphinx extension module names here, as strings. They can be
30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
31 | # ones.
32 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon', \
33 | 'sphinx.ext.todo', 'sphinx.ext.githubpages', 'sphinxcontrib.bibtex', \
34 | 'sphinx.ext.mathjax', 'sphinx.ext.autosectionlabel', 'nbsphinx', \
35 | 'sphinx_copybutton', 'sphinxcontrib.googleanalytics', 'sphinx_sitemap', \
36 | 'sphinxcontrib.httpdomain']
37 |
38 | # imgmath_image_format = 'svg'
39 | # imgmath_font_size = 13
40 |
41 | # Add any paths that contain templates here, relative to this directory.
42 | templates_path = ['_templates']
43 |
44 | # List of patterns, relative to source directory, that match files and
45 | # directories to ignore when looking for source files.
46 | # This pattern also affects html_static_path and html_extra_path.
47 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
48 |
49 |
50 | # -- Options for HTML output -------------------------------------------------
51 |
52 | # The theme to use for HTML and HTML Help pages. See the documentation for
53 | # a list of builtin themes.
54 | #
55 | html_context = {
56 | # Enable the "Edit in GitHub" link within the header of each page.
57 | 'display_github': True,
58 | # Set the following variables to generate the resulting github URL for each page.
59 | # Format Template: https://{{ github_host|default("github.com") }}/{{ github_user }}/{{ github_repo }}/blob/{{ github_version }}{{ conf_py_path }}{{ pagename }}{{ suffix }}
60 | 'github_user': 'kxytechnologies',
61 | 'github_repo': 'kxy-python',
62 | 'github_version': 'master/docs/'
63 | }
64 |
65 | html_theme = 'sphinx_rtd_theme'
66 | html_logo = 'images/logo.png'
67 | html_favicon = 'images/favicon.png'
68 | html_theme_options = {'logo_only': True, 'style_nav_header_background': '#2c3d5e'}
69 |
70 | # Add any paths that contain custom static files (such as style sheets) here,
71 | # relative to this directory. They are copied after the builtin static files,
72 | # so a file named "default.css" will overwrite the builtin "default.css".
73 | html_static_path = ['_static']
74 |
75 | # Notebook
76 | nbsphinx_execute = 'never'
77 | nbsphinx_allow_errors = True
78 | nbsphinx_input_prompt = 'In [%s]:'
79 | nbsphinx_output_prompt = 'Out[%s]:'
80 | source_suffix = ['.rst', '.md', '.ipynb']
81 |
82 | # Google Analytics
83 | googleanalytics_id = 'UA-167632834-2'
84 | googleanalytics_enabled = True
85 |
86 |
87 | # Sitemap
88 | html_baseurl = 'https://www.kxy.ai/reference/'
89 | html_title = 'The KXY Platform: Lean AutoML, As A Service'
90 | # html_extra_path = ['robots.txt']
91 |
92 |
--------------------------------------------------------------------------------
/docs/images/bn_importance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/bn_importance.png
--------------------------------------------------------------------------------
/docs/images/bn_incremental_importance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/bn_incremental_importance.png
--------------------------------------------------------------------------------
/docs/images/bn_separability.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/bn_separability.png
--------------------------------------------------------------------------------
/docs/images/classification_accuracy_frontier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/classification_accuracy_frontier.png
--------------------------------------------------------------------------------
/docs/images/classification_accuracy_frontier_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/classification_accuracy_frontier_2.png
--------------------------------------------------------------------------------
/docs/images/entropy_venn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/entropy_venn.png
--------------------------------------------------------------------------------
/docs/images/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/favicon.png
--------------------------------------------------------------------------------
/docs/images/gm_separability.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/gm_separability.png
--------------------------------------------------------------------------------
/docs/images/gm_separability_mov.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/gm_separability_mov.gif
--------------------------------------------------------------------------------
/docs/images/incremental_input_importance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/incremental_input_importance.png
--------------------------------------------------------------------------------
/docs/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/logo.png
--------------------------------------------------------------------------------
/docs/images/logo.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/logo.svg
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: Documentation of the KXY Lean AutoML platform.
3 | :keywords: AutoML, Lean AutoML, KXY AutoML, Pre-Learning, Post-Learning
4 | :http-equiv=content-language: en
5 |
6 |
7 |
8 | A Powerful Serverless Analysis Toolkit That Takes *Trial And Error* Out of Machine Learning Projects
9 | ====================================================================================================
10 | .. image:: https://img.shields.io/badge/license-GPLv3%2B-blue
11 | :alt: License
12 | :target: https://www.gnu.org/licenses/agpl-3.0.en.html
13 | .. image:: https://img.shields.io/pypi/v/kxy.svg
14 | :alt: PyPI Latest Release
15 | :target: https://pypi.org/project/kxy/
16 | .. image:: https://pepy.tech/badge/kxy
17 | :alt: Downloads
18 | :target: https://github.com/kxytechnologies/kxy-python/
19 |
20 |
21 | ==============
22 | Get An API Key
23 | ==============
24 | To get an API key, simply open an account with us `here <https://www.kxy.ai/signup/>`_. As soon as you have an account, you may retrieve your API key `here <https://www.kxy.ai/portal/profile/identity/>`_.
25 |
26 |
27 | ============================================================
28 | Boost The Productivity Of Your ML Teams Tenfold With Lean ML
29 | ============================================================
30 | The :code:`kxy` package utilizes information theory to take *trial and error* out of machine learning projects.
31 |
32 | -------------------
33 | Project Feasibility
34 | -------------------
35 | From the get-go, the **data valuation** analysis of the :code:`kxy` package tells data scientists whether their datasets are sufficiently informative to achieve a performance (e.g. :math:`R^2`, RMSE, maximum log-likelihood, or classification error) to their liking in a classification or regression problem and, if so, the best performance that can be achieved using said datasets. *Only spend time and compute resources on a project once you know it can yield the desired business impact*.
36 |
37 | ----------------------------------------
38 | Automatic (Model-Free) Feature Selection
39 | ----------------------------------------
40 | The **model-free variable selection** analysis provided by the :code:`kxy` package allows data scientists to train smaller models, faster, cheaper, and to achieve a higher performance than throwing all inputs in a big model or proceeding by trial-and-error.
41 |
42 |
43 | ---------------------------------------
44 | Production Model Improvability Analyses
45 | ---------------------------------------
46 | **Model-Driven Improvability:** Once a model has been trained, the :code:`kxy` *model-driven improvability* analysis quantifies the extent to which the trained model can be improved without resorting to additional features. This allows data scientists to focus their modeling efforts on high ROI initiatives. *Only throw the might of your ML team and platform at improving the fit of your production model when you know it can be improved. Never again will you spend weeks, if not months, and thousands of dollars in cloud compute, implementing the latest models on specialized hardware to improve your production model, only to find out its fit cannot be improved*.
47 |
48 | **Data-Driven Improvability:** Once the fit of a production model is optimal (i.e. it has successfully extracted all the value in using a given set of features to predict the label), the :code:`kxy` *data-driven improvability* analysis allows data scientists to quickly quantify the performance increase (e.g. :math:`R^2`, RMSE, maximum log-likelihood, or classification error) that a new dataset may bring about. *Only retrain models with additional features when you know they can bring about a meaningful performance boost*.
49 |
50 |
51 | ------------------------------------------------------
52 | Reducing Time and Resources Spent on Overfitted Models
53 | ------------------------------------------------------
54 | We provide callbacks for the major Python machine learning libraries that terminate training as soon as the running best performance seems unrealistic (i.e. it far exceeds the theoretical-best achievable). Our callbacks save time and compute resources on models that we can reliably determine will overfit once fully trained, well before training ends. This is a cost-effective alternative to cross-validation.
55 |
56 |
57 |
58 |
59 |
60 | .. toctree::
61 | :hidden:
62 | :caption: QUICKSTART
63 |
64 | latest/quickstart/getting_started
65 |
66 |
67 | .. toctree::
68 | :hidden:
69 | :caption: ILLUSTRATIONS
70 |
71 | latest/applications/cheat_sheet/index
72 |
73 | latest/applications/illustrations/index
74 |
75 | latest/applications/case_studies/index
76 |
77 |
78 | .. toctree::
79 | :hidden:
80 | :caption: THEORETICAL FOUNDATION
81 |
82 | latest/theoretical_foundation/memoryless/index
83 |
84 | latest/theoretical_foundation/memoryful/index
85 |
86 |
87 | .. toctree::
88 | :hidden:
89 | :caption: PYTHON CODE DOCUMENTATION
90 |
91 | latest/data_valuation/index
92 |
93 | latest/model_free_variable_selection/index
94 |
95 | latest/model_wrapped_feature_selection/index
96 |
97 | latest/learning/index
98 |
99 | latest/model_explanation/index
100 |
101 | latest/model_improvability/index
102 |
103 |
104 | .. toctree::
105 | :hidden:
106 | :caption: MISCELLANEOUS
107 |
108 | latest/data_transfer/index
109 |
110 | latest/pandas/index
111 |
112 |
113 | .. toctree::
114 | :hidden:
115 | :caption: OTHER LANGUAGES
116 |
117 | latest/api/index
118 |
119 |
120 | .. toctree::
121 | :hidden:
122 | :caption: INDEX
123 |
124 | latest/index/index
125 |
--------------------------------------------------------------------------------
/docs/latest/applications/case_studies/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: Case studies illustrating how the KXY platform may help customers.
3 | :keywords: KXY Tutorials, KXY Case Studies.
4 | :http-equiv=content-language: en
5 |
6 |
7 |
8 |
9 | ------------
10 | Case Studies
11 | ------------
12 | On this page, we illustrate what KXY can do for you.
13 |
14 |
15 | Model Compression
16 | -----------------
17 | * `LightGBM, XGBoost or Random Forest: Same Performance With 80% Fewer Features `_
18 |
19 |
20 | Data Valuation
21 | --------------
22 |
23 | * :ref:`The KXY Data Valuation Function Works (Regression)`
24 | * :ref:`The KXY Data Valuation Function Works (Classification)`
25 |
26 | Model-Free Feature Selection
27 | ----------------------------
28 | * :ref:`Automatically Pruning Redundant Features With KXY`
29 | * :ref:`Detecting Features That Are Only Useful In Conjunction With Others`
30 |
31 |
32 | Better Fraud & Attack Detection
33 | -------------------------------
34 | * :ref:`Better Solving Heavily Unbalanced Classification Problems With KXY`
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/docs/latest/applications/cheat_sheet/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: Description of KXY's main functions, and how to access them in Python.
3 | :keywords: KXY Tutorials, KXY Cheatsheet.
4 | :http-equiv=content-language: en
5 |
6 |
7 |
8 | ----------
9 | Cheatsheet
10 | ----------
11 |
12 | Imports
13 | -------
14 |
15 | .. code-block:: python
16 |
17 | import pandas as pd
18 | import kxy
19 |
20 | From now on, :code:`df` refers to a Pandas dataframe object and :code:`y_column` is the column of :code:`df` to be used as target. All columns in :code:`df` but :code:`y_column` are treated as explanatory variables. :code:`problem_type` is a variable taking value :code:`'regression'` for regression problems and :code:`'classification'` for classification problems.
21 |
22 | Data Valuation
23 | --------------
24 |
25 | .. code-block:: python
26 |
27 | df.kxy.data_valuation(y_column, problem_type=problem_type)
28 |
29 |
30 | By default, your data is transmitted to our backend in the clear. To anonymize your data before performing data valuation, simply set :code:`anonymize=True`.
31 |
32 | .. code-block:: python
33 |
34 | df.kxy.data_valuation(y_column, problem_type=problem_type, anonymize=True) # Data valuation using anonymized data.
35 |
36 |
37 |
38 | Automatic (Model-Free) Feature Selection
39 | ----------------------------------------
40 |
41 | .. code-block:: python
42 |
43 | df.kxy.variable_selection(y_column, problem_type=problem_type)
44 |
45 | By default, your data is transmitted to our backend in the clear. To anonymize your data before performing automatic feature selection, simply set :code:`anonymize=True`.
46 |
47 | .. code-block:: python
48 |
49 | df.kxy.variable_selection(y_column, problem_type=problem_type, anonymize=True) # Variable selection using anonymized data.
50 |
51 |
52 |
53 | Model Compression
54 | -----------------
55 | Here's how to wrap feature selection around LightGBM in Python.
56 |
57 | .. code-block:: python
58 |
59 | from kxy.learning import get_lightgbm_learner_learning_api
60 |
61 | params = {
62 | 'objective': 'rmse',
63 | 'boosting_type': 'gbdt',
64 | 'num_leaves': 100,
65 | 'n_jobs': -1,
66 | 'learning_rate': 0.1,
67 | 'verbose': -1,
68 | }
69 | learner_func = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \
70 | early_stopping_rounds=50, verbose_eval=50, feature_selection_method='leanml')
71 | results = df.kxy.fit(y_column, learner_func, problem_type=problem_type)
72 |
73 | # The trained model
74 | predictor = results['predictor']
75 |
76 | # Feature columns selected
77 | selected_variables = predictor.selected_variables
78 |
79 | # To make predictions out of a dataframe of test data.
80 | predictions = predictor.predict(test_df)
81 |
82 | Parameters of :code:`get_lightgbm_learner_learning_api` should be the same as those of :code:`lightgbm.train`. See the `LightGBM documentation <https://lightgbm.readthedocs.io/>`_.
83 |
84 |
85 | Wrapping feature selection around another model in Python is identical except for :code:`learner_func`. Here's how to create :code:`learner_func` for other models.
86 |
87 | For XGBoost:
88 |
89 | .. code-block:: python
90 |
91 | from kxy.learning import get_xgboost_learner
92 | # Use 'xgboost.XGBClassifier' for classification problems.
93 | xgboost_learner_func = get_xgboost_learner('xgboost.XGBRegressor')
94 |
95 |
96 | Parameters of :code:`get_xgboost_learner` should be those you'd pass to instantiate :code:`xgboost.XGBRegressor` or :code:`xgboost.XGBClassifier`. See the `XGBoost documentation <https://xgboost.readthedocs.io/>`_.
97 |
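For instance, a hypothetical configuration (the hyper-parameters below are illustrative, assuming keyword arguments are forwarded to the model as with :code:`get_sklearn_learner`):

.. code-block:: python

    from kxy.learning import get_xgboost_learner
    # Keyword arguments are passed on when instantiating xgboost.XGBRegressor.
    xgboost_learner_func = get_xgboost_learner('xgboost.XGBRegressor', \
        n_estimators=200, max_depth=5)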
98 |
99 | For Scikit-Learn models:
100 |
101 | .. code-block:: python
102 |
103 | from kxy.learning import get_sklearn_learner
104 | # Replace 'sklearn.ensemble.RandomForestRegressor' with the import path of the sklearn model you want to use.
105 | rf_learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', \
106 | min_samples_split=0.01, max_samples=0.5, n_estimators=100)
107 | df.kxy.fit(y_column, rf_learner_func, problem_type=problem_type)
108 |
109 |
110 | Parameters of :code:`get_sklearn_learner` should be those you'd pass to instantiate the scikit-learn model.
111 |
112 |
113 |
114 | Model-Driven Improvability
115 | --------------------------
116 | For the model-driven improvability analysis, predictions made by the production model should be contained in a column of the :code:`df`. The variable :code:`prediction_column` refers to said column. All columns in :code:`df` but :code:`y_column` and :code:`prediction_column` are considered to be the explanatory variables/features used to train the production model.
117 |
118 |
119 | .. code-block:: python
120 |
121 | anonymize = False # Set to True to anonymize your data before model-driven improvability
122 | df.kxy.model_driven_improvability(y_column, prediction_column, problem_type=problem_type, anonymize=anonymize)
123 |
124 |
125 |
126 | Data-Driven Improvability
127 | -------------------------
128 | For the data-driven improvability analysis, the list of columns representing new features/explanatory variables to consider (:code:`new_variables`) should be provided. All columns in :code:`df` that are neither :code:`y_column` nor contained in :code:`new_variables` are assumed to be the explanatory variables/features used to train the production model.
129 |
130 |
131 | .. code-block:: python
132 |
133 | anonymize = False # Set to True to anonymize your data before model-driven improvability
134 | df.kxy.data_driven_improvability(y_column, new_variables, problem_type=problem_type, anonymize=anonymize)
135 |
136 |
137 |
138 |
--------------------------------------------------------------------------------
/docs/latest/applications/illustrations/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: Application of KXY's analyses to popular machine learning datasets
3 | :keywords: KXY Tutorials, KXY Examples.
4 | :http-equiv=content-language: en
5 |
6 |
7 |
8 | ------------------------
9 | Kaggle & UCI Experiments
10 | ------------------------
11 |
12 | Below you'll find the results of applying KXY's data-valuation and model-free variable selection analyses to dozens of the most popular Kaggle and UCI regression and classification experiments.
13 |
14 |
15 | Kaggle Experiments
16 | ------------------
17 |
18 | * :ref:`Heart Attack (Kaggle, Classification, n=303, d=13, 2 classes)`
19 | * :ref:`House Prices Advanced (Kaggle, Regression, n=1460, d=79)`
20 | * :ref:`Titanic (Kaggle, Classification, n=891, d=11, 2 classes)`
21 | * :ref:`Water Quality (Kaggle, Classification, n=3276, d=9, 2 classes)`
22 |
23 |
24 |
25 | UCI Experiments
26 | ---------------
27 |
28 | * :ref:`APS Failure (UCI, Classification, n=76000, d=170, 2 classes)`
29 | * :ref:`Abalone (UCI, Regression, n=4177, d=8)`
30 | * :ref:`Adult (UCI, Classification, n=48843, d=14, 3 classes)`
31 | * :ref:`Air Foil (UCI, Regression, n=1503, d=5)`
32 | * :ref:`Air Quality (UCI, Regression, n=8991, d=14)`
33 | * :ref:`Avila (UCI, Classification, n=20867, d=10, 12 classes)`
34 | * :ref:`Bank Marketing (UCI, Classification, n=41188, d=20, 2 classes)`
35 | * :ref:`Bank Note (UCI, Classification, n=1372, d=4, 2 classes)`
36 | * :ref:`Bike Sharing (UCI, Regression, n=17379, d=18)`
37 | * :ref:`Blog Feedback (UCI, Regression, n=60021, d=280)`
38 | * :ref:`Card Default (UCI, Classification, n=30000, d=23, 2 classes)`
39 | * :ref:`Concrete (UCI, Regression, n=1030, d=8)`
40 | * :ref:`CT Slices (UCI, Regression, n=53500, d=385)`
41 | * :ref:`Diabetic Retinopathy (UCI, Classification, n=1151, d=19, 2 classes)`
42 | * :ref:`EEG Eye State (UCI, Classification, n=14980, d=14, 2 classes)`
43 | * :ref:`Energy Efficiency (UCI, Regression, n=768, d=8)`
44 | * :ref:`Facebook Comments (UCI, Regression, n=209074, d=53)`
45 | * :ref:`Landsat (UCI, Classification, n=6435, d=36, 6 classes)`
46 | * :ref:`Letter Recognition (UCI, Classification, n=20000, d=16, 26 classes)`
47 | * :ref:`Magic Gamma (UCI, Classification, n=19020, d=10, 2 classes)`
48 | * :ref:`Naval Propulsion (UCI, Regression, n=11934, d=16)`
49 | * :ref:`Online News (UCI, Regression, n=39644, d=58)`
50 | * :ref:`Parkinson (UCI, Regression, n=5875, d=20)`
51 | * :ref:`Power Plant (UCI, Regression, n=9568, d=4)`
52 | * :ref:`Real Estate (UCI, Regression, n=414, d=6)`
53 | * :ref:`Sensorless Drive (UCI, Classification, n=58509, d=48, 11 classes)`
54 | * :ref:`Shuttle (UCI, Classification, n=58000, d=9, 7 classes)`
55 | * :ref:`Skin Segmentation (UCI, Classification, n=245057, d=3, 2 classes)`
56 | * :ref:`Social Media Buzz (UCI, Regression, n=583250, d=77)`
57 | * :ref:`Superconductivity (UCI, Regression, n=21263, d=81)`
58 | * :ref:`White Wine Quality (UCI, Regression, n=4898, d=11)`
59 | * :ref:`Yacht Hydrodynamics (UCI, Regression, n=308, d=6)`
60 | * :ref:`Year Prediction MSD (UCI, Regression, n=515345, d=90)`
61 |
--------------------------------------------------------------------------------
/docs/latest/applications/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: Examples and tutorials illustrating how the KXY AutoML platform works, and what can be done with it.
3 | :keywords: KXY Tutorials, KXY Examples.
4 | :http-equiv=content-language: en
5 |
6 |
7 |
8 | ----------
9 | Cheatsheet
10 | ----------
11 |
12 | Imports
13 | -------
14 |
15 | .. code-block:: python
16 |
17 | import pandas as pd
18 | import kxy
19 |
20 | From now on, :code:`df` refers to a Pandas dataframe object and :code:`y_column` is the column of :code:`df` to be used as target. All columns in :code:`df` but :code:`y_column` are treated as explanatory variables. :code:`problem_type` is a variable taking value :code:`'regression'` for regression problems and :code:`'classification'` for classification problems.
21 |
22 | Data Valuation
23 | --------------
24 |
25 | .. code-block:: python
26 |
27 | df.kxy.data_valuation(y_column, problem_type=problem_type)
28 |
29 |
30 | By default, your data is transmitted to our backend in the clear. To anonymize your data before performing data valuation, simply set :code:`anonymize=True`.
31 |
32 | .. code-block:: python
33 |
34 | df.kxy.data_valuation(y_column, problem_type=problem_type, anonymize=True) # Data valuation using anonymized data.
35 |
36 |
37 |
38 | Automatic (Model-Free) Feature Selection
39 | ----------------------------------------
40 |
41 | .. code-block:: python
42 |
43 | df.kxy.variable_selection(y_column, problem_type=problem_type)
44 |
45 | By default, your data is transmitted to our backend in the clear. To anonymize your data before performing automatic feature selection, simply set :code:`anonymize=True`.
46 |
47 | .. code-block:: python
48 |
49 | df.kxy.variable_selection(y_column, problem_type=problem_type, anonymize=True) # Variable selection using anonymized data.
50 |
51 |
52 |
53 |
54 | Model-Driven Improvability
55 | --------------------------
56 | For the model-driven improvability analysis, predictions made by the production model should be contained in a column of the :code:`df`. The variable :code:`prediction_column` refers to said column. All columns in :code:`df` but :code:`y_column` and :code:`prediction_column` are considered to be the explanatory variables/features used to train the production model.
57 |
58 |
59 | .. code-block:: python
60 |
61 | anonymize = False # Set to True to anonymize your data before model-driven improvability
62 | df.kxy.model_driven_improvability(y_column, prediction_column, problem_type=problem_type, anonymize=anonymize)
63 |
64 |
65 |
66 | Data-Driven Improvability
67 | -------------------------
68 | For the data-driven improvability analysis, the list of columns representing new features/explanatory variables to consider (:code:`new_variables`) should be provided. All columns in :code:`df` that are neither :code:`y_column` nor contained in :code:`new_variables` are assumed to be the explanatory variables/features used to train the production model.
69 |
70 |
71 | .. code-block:: python
72 |
73 | anonymize = False # Set to True to anonymize your data before model-driven improvability
74 | df.kxy.data_driven_improvability(y_column, new_variables, problem_type=problem_type, anonymize=anonymize)
75 |
76 |
77 |
78 | -----------------------
79 | Examples (Kaggle & UCI)
80 | -----------------------
81 |
82 | * :ref:`APS Failure (UCI, Classification, n=76000, d=170, 2 classes)`
83 | * :ref:`Abalone (UCI, Regression, n=4177, d=8)`
84 | * :ref:`Adult (UCI, Classification, n=48843, d=14, 3 classes)`
85 | * :ref:`Air Foil (UCI, Regression, n=1503, d=5)`
86 | * :ref:`Air Quality (UCI, Regression, n=8991, d=14)`
87 | * :ref:`Avila (UCI, Classification, n=20867, d=10, 12 classes)`
88 | * :ref:`Bank Marketing (UCI, Classification, n=41188, d=20, 2 classes)`
89 | * :ref:`Bank Note (UCI, Classification, n=1372, d=4, 2 classes)`
90 | * :ref:`Bike Sharing (UCI, Regression, n=17379, d=18)`
91 | * :ref:`Blog Feedback (UCI, Regression, n=60021, d=280)`
92 | * :ref:`CT Slices (UCI, Regression, n=53500, d=385)`
93 | * :ref:`Card Default (UCI, Classification, n=30000, d=23, 2 classes)`
94 | * :ref:`Concrete (UCI, Regression, n=1030, d=8)`
95 | * :ref:`Diabetic Retinopathy (UCI, Classification, n=1151, d=19, 2 classes)`
96 | * :ref:`EEG Eye State (UCI, Classification, n=14980, d=14, 2 classes)`
97 | * :ref:`Energy Efficiency (UCI, Regression, n=768, d=8)`
98 | * :ref:`Facebook Comments (UCI, Regression, n=209074, d=53)`
99 | * :ref:`Heart Attack (Kaggle, Classification, n=303, d=13, 2 classes)`
100 | * :ref:`Heart Disease (Kaggle, Classification, n=303, d=13, 2 classes)`
101 | * :ref:`House Prices Advanced (Kaggle, Regression, n=1460, d=79)`
102 | * :ref:`Landsat (UCI, Classification, n=6435, d=36, 6 classes)`
103 | * :ref:`Letter Recognition (UCI, Classification, n=20000, d=16, 26 classes)`
104 | * :ref:`Magic Gamma (UCI, Classification, n=19020, d=10, 2 classes)`
105 | * :ref:`Naval Propulsion (UCI, Regression, n=11934, d=16)`
106 | * :ref:`Online News (UCI, Regression, n=39644, d=58)`
107 | * :ref:`Parkinson (UCI, Regression, n=5875, d=20)`
108 | * :ref:`Power Plant (UCI, Regression, n=9568, d=4)`
109 | * :ref:`Real Estate (UCI, Regression, n=414, d=6)`
110 | * :ref:`Sensor Less Drive (UCI, Classification, n=58509, d=48, 11 classes)`
111 | * :ref:`Shuttle (UCI, Classification, n=58000, d=9, 7 classes)`
112 | * :ref:`Skin Segmentation (UCI, Classification, n=245057, d=3, 2 classes)`
113 | * :ref:`Social Media Buzz (UCI, Regression, n=583250, d=77)`
114 | * :ref:`Superconductivity (UCI, Regression, n=21263, d=81)`
115 | * :ref:`Titanic (Kaggle, Classification, n=891, d=11, 2 classes)`
116 | * :ref:`Water Quality (Kaggle, Classification, n=3276, d=9, 2 classes)`
117 | * :ref:`White Wine Quality (UCI, Regression, n=4898, d=11)`
118 | * :ref:`Yacht Hydrodynamics (UCI, Regression, n=308, d=6)`
119 | * :ref:`Year Prediction MSD (UCI, Regression, n=515345, d=90)`
120 |
121 |
122 |
123 | ------------
124 | Case Studies
125 | ------------
126 |
127 | * :ref:`Evaluating KXY's Data Valuation Function (Classification)`
128 | * :ref:`Evaluating KXY's Data Valuation Function (Regression)`
129 | * :ref:`Automatically Pruning Redundant Features With KXY`
130 | * :ref:`Detecting Features That Are Only Useful In Conjunction With Others`
131 | * :ref:`Better Solving Heavily Unbalanced Classification Problems With KXY`
132 |
133 |
134 |
135 | Classification
136 | --------------
137 |
138 | * :ref:`Toy Visual Classification Example`
139 | * :ref:`Classification Problem With Some Useless Variables`
140 | * :ref:`Complex Classification Example`
141 |
142 |
143 | Regression
144 | ----------
145 | * :ref:`Toy 1D Regression Examples`
146 | * :ref:`Toy Multivariate Regression Examples`
147 | * :ref:`Regression Problem With Some Useless Variables`
148 | * :ref:`Complex Regression Example`
149 |
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/docs/latest/data_transfer/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: How we use your data.
3 | :keywords: Pandas Dataframe, Lean ML, KXY.
4 | :http-equiv=content-language: en
5 |
6 | =========
7 | Your Data
8 | =========
9 |
10 | How We Use Your Data
11 | --------------------
12 |
13 | .. automodule:: kxy.api.data_transfer
14 | :members:
15 |
16 |
17 | Anonymizing Your Data
18 | ---------------------
19 | Fortunately, our analyses are invariant under various transformations that can completely anonymize your data.
20 |
21 | You may simply run :code:`df_anonymized = df.kxy.anonymize()` on any dataframe :code:`df` to anonymize it, and work with :code:`df_anonymized` instead of :code:`df`.
22 |
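A minimal sketch (:code:`training_data.csv` and the :code:`'target'` column are hypothetical):

.. code-block:: python

    import pandas as pd
    import kxy

    df = pd.read_csv('training_data.csv')
    y_column = 'target'  # hypothetical target column name
    df_anonymized = df.kxy.anonymize()
    # Downstream analyses run on the anonymized dataframe as usual.
    df_anonymized.kxy.data_valuation(y_column, problem_type='regression')
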
23 | Check out the function below for more information on how we anonymize your data.
24 |
25 | .. automethod:: kxy.pandas_extension.base_accessor.BaseAccessor.anonymize
26 |
27 |
28 |
--------------------------------------------------------------------------------
/docs/latest/data_valuation/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: How to perform data valuation using the kxy Python package.
3 | :keywords: Data Valuation, Lean ML, AutoML, Lean Machine Learning, KXY.
4 | :http-equiv=content-language: en
5 |
6 |
7 | ==============
8 | Data Valuation
9 | ==============
10 |
11 | .. automodule:: kxy.pre_learning.achievable_performance
12 | :members:
13 |
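14 | As an illustration, here is a minimal sketch of valuing a dataset through the :code:`kxy` pandas accessor. The file and column names are hypothetical, and the authoritative signatures are the ones documented above:
15 | 
16 | .. code-block:: python
17 | 
18 |     import pandas as pd
19 |     import kxy  # Registers the df.kxy accessor.
20 | 
21 |     df = pd.read_csv('my_training_data.csv')  # hypothetical file
22 |     # Estimate the best performance achievable when using the other
23 |     # columns of df to predict the target column 'y'.
24 |     value_df = df.kxy.data_valuation('y', problem_type='classification')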
--------------------------------------------------------------------------------
/docs/latest/index/index.rst:
--------------------------------------------------------------------------------
1 |
2 | .. meta::
3 | :http-equiv=content-language: en
4 |
5 | Indices and tables
6 | ==================
7 |
8 | * :ref:`genindex`
9 | * :ref:`modindex`
10 | * :ref:`search`
11 |
--------------------------------------------------------------------------------
/docs/latest/learning/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: How to use the kxy Python package to terminate training as soon as overfitting can be reliably determined.
3 | :keywords: Early-Termination, Overfitting Mitigation.
4 | :http-equiv=content-language: en
5 |
6 |
7 | =================
8 | Early Termination
9 | =================
10 | Callbacks and event handlers used to terminate training as soon as the running loss becomes lower than the smallest theoretically achievable loss.
11 |
12 |
13 | Tensorflow v2
14 | -------------
15 |
16 | .. automodule:: kxy.learning.tensorflow_early_termination
17 | :members:
18 |
19 |
20 | PyTorch
21 | -------
22 |
23 | .. automodule:: kxy.learning.pytorch_early_termination
24 | :members:
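25 | 
26 | To illustrate the principle, here is a generic sketch of the idea in plain Keras (not the :code:`kxy` API itself; the :code:`loss_floor` argument is a hypothetical stand-in for the theoretical smallest loss):
27 | 
28 | .. code-block:: python
29 | 
30 |     import tensorflow as tf
31 | 
32 |     class LossFloorEarlyTermination(tf.keras.callbacks.Callback):
33 |         """Stop training once the running loss drops below a known floor."""
34 |         def __init__(self, loss_floor):
35 |             super().__init__()
36 |             self.loss_floor = loss_floor
37 | 
38 |         def on_epoch_end(self, epoch, logs=None):
39 |             loss = (logs or {}).get('loss')
40 |             if loss is not None and loss < self.loss_floor:
41 |                 # Training any further could only overfit the noise.
42 |                 self.model.stop_training = True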
--------------------------------------------------------------------------------
/docs/latest/model_explanation/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: How to use the kxy Python package to explain a trained supervised learning model.
3 | :keywords: Model Explanation, Interpretable AI, Post-Learning, KXY.
4 | :http-equiv=content-language: en
5 |
6 |
7 | =================
8 | Model Explanation
9 | =================
10 |
11 | .. automodule:: kxy.post_learning.model_explanation
12 | :members:
--------------------------------------------------------------------------------
/docs/latest/model_free_variable_selection/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: How to perform model-free variable selection (or feature importance) using the kxy Python package.
3 | :keywords: Model-Free Variable Selection, Model-Free Feature Importance, AutoML, Lean ML, Lean Machine Learning, KXY.
4 | :http-equiv=content-language: en
5 |
6 |
7 | =============================
8 | Model-Free Variable Selection
9 | =============================
10 |
11 | .. automodule:: kxy.pre_learning.variable_selection
12 | :members:
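13 | 
14 | As an illustration, a minimal sketch through the :code:`kxy` pandas accessor is shown below. The file and column names are hypothetical, and the authoritative signatures are the ones documented above:
15 | 
16 | .. code-block:: python
17 | 
18 |     import pandas as pd
19 |     import kxy  # Registers the df.kxy accessor.
20 | 
21 |     df = pd.read_csv('my_training_data.csv')  # hypothetical file
22 |     # Rank candidate explanatory variables by the incremental value
23 |     # they add to predicting 'y', without training a single model.
24 |     selection_df = df.kxy.variable_selection('y', problem_type='regression')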
--------------------------------------------------------------------------------
/docs/latest/model_improvability/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 |    :description: How to use the kxy Python package to quantify by how much the performance of a trained supervised learning model may be improved in a model-driven fashion (i.e. by simply looking for a better model, and without resorting to additional explanatory variables), or in a data-driven fashion (i.e. how much incremental value a specific new set of explanatory variables may bring about).
3 | :keywords: Model-Driven Improvability, Data-Driven Improvability, Post-Learning, KXY.
4 | :http-equiv=content-language: en
5 |
6 |
7 | ===================
8 | Model Improvability
9 | ===================
10 | Estimation of the amount by which the performance of a trained supervised learning model can be increased, either in a model-driven fashion or in a data-driven fashion.
11 |
12 | Model-Driven Improvability
13 | --------------------------
14 |
15 | .. autofunction:: kxy.post_learning.improvability.model_driven_improvability
16 |
17 |
18 | Data-Driven Improvability
19 | -------------------------
20 |
21 | .. autofunction:: kxy.post_learning.improvability.data_driven_improvability
22 |
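23 | As an illustration, here is a minimal sketch using the two functions above. The file and column names are hypothetical, and the argument lists are assumptions; the authoritative signatures are the ones documented above:
24 | 
25 | .. code-block:: python
26 | 
27 |     import pandas as pd
28 |     from kxy.post_learning.improvability import (
29 |         model_driven_improvability, data_driven_improvability)
30 | 
31 |     df = pd.read_csv('my_training_data.csv')  # hypothetical file
32 |     # 'y_pred' holds the trained model's predictions of 'y'.
33 |     # How much lift could a better model extract from the same inputs?
34 |     mdi = model_driven_improvability(df, 'y', 'y_pred', 'regression')
35 |     # How much lift would the new column 'x_new' bring about?
36 |     ddi = data_driven_improvability(df, 'y', ['x_new'], 'regression')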
--------------------------------------------------------------------------------
/docs/latest/model_wrapped_feature_selection/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: How to seamlessly add feature selection to any predictive model in Python.
3 | :keywords: Feature Selection, LeanML Feature Selection, Boruta Feature Selection, RFE Feature Selection, KXY.
4 | :http-equiv=content-language: en
5 |
6 |
7 | =================
8 | Model Compression
9 | =================
10 |
11 | How to seamlessly add feature selection to any predictive model in Python, so as to achieve the same performance with far fewer features.
12 |
13 |
14 | LeanML Feature Selection
15 | ------------------------
16 |
17 | .. automodule:: kxy.learning.leanml_predictor
18 | :members:
19 |
20 |
21 | Boruta and Recursive Feature Elimination
22 | ----------------------------------------
23 |
24 | .. automodule:: kxy.misc.boruta
25 | :members:
26 |
27 | .. automodule:: kxy.misc.rfe
28 | :members:
29 |
30 | .. automodule:: kxy.misc.predictors
31 | :members:
32 |
33 |
34 | Principal Feature Selection
35 | ---------------------------
36 |
37 | .. automodule:: kxy.pfs.pfs_selector
38 | :members:
39 |
40 | .. automodule:: kxy.pfs.pfs_predictor
41 | :members:
42 |
43 |
44 |
45 | Utilities Generating Learner Functions
46 | --------------------------------------
47 |
48 | .. automodule:: kxy.learning.base_learners
49 | :members:
50 |
51 |
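52 | As an illustration, here is a minimal end-to-end sketch mirroring :code:`kxy/examples/feature_selection_example.py` elsewhere in this repository; :code:`train_features_df` and :code:`test_features_df` are placeholder dataframes of training and testing features, with :code:`'y'` the target column:
53 | 
54 | .. code-block:: python
55 | 
56 |     import kxy  # Registers the df.kxy accessor.
57 |     from kxy.learning import get_lightgbm_learner_learning_api
58 | 
59 |     # A learner function returning LightGBM binary classifiers.
60 |     learner_func = get_lightgbm_learner_learning_api(
61 |         {'objective': 'binary', 'metric': ['auc']}, num_boost_round=1000)
62 | 
63 |     # Fit a classifier wrapped around LeanML feature selection.
64 |     results = train_features_df.kxy.fit(
65 |         'y', learner_func, problem_type='classification',
66 |         feature_selection_method='leanml')
67 |     predictor = results['predictor']
68 |     predictions_df = predictor.predict(test_features_df)
69 |     print(predictor.selected_variables)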
--------------------------------------------------------------------------------
/docs/latest/pandas/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: Extension of pandas with analyses from the KXY Lean AutoML platform.
3 | :keywords: Pandas Dataframe, AutoML, KXY.
4 | :http-equiv=content-language: en
5 |
6 | =============================
7 | DataFrame Extension Deep Dive
8 | =============================
9 |
10 | .. automodule:: kxy.pandas_extension.accessor
11 | :members:
12 | :show-inheritance:
13 |
14 | .. automodule:: kxy.pandas_extension.base_accessor
15 | :members:
16 |
17 | .. automodule:: kxy.pandas_extension.pre_learning_accessor
18 | :members:
19 | :show-inheritance:
20 |
21 | .. automodule:: kxy.pandas_extension.learning_accessor
22 | :members:
23 | :show-inheritance:
24 |
25 | .. automodule:: kxy.pandas_extension.post_learning_accessor
26 | :members:
27 | :show-inheritance:
28 |
29 | .. automodule:: kxy.pandas_extension.finance_accessor
30 | :members:
31 | :show-inheritance:
32 |
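33 | All of the accessor methods documented above become available on any dataframe as soon as :code:`kxy` is imported alongside :code:`pandas`, for instance:
34 | 
35 | .. code-block:: python
36 | 
37 |     import pandas as pd
38 |     import kxy  # Registers the .kxy accessor on all dataframes.
39 | 
40 |     df = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [0, 1, 0]})
41 |     df_anonymized = df.kxy.anonymize()  # e.g. pre-learning anonymization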
--------------------------------------------------------------------------------
/docs/latest/quickstart/getting_started.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "Getting Started\n",
9 | "============\n",
10 | "\n",
11 | "Click [here](https://github.com/kxytechnologies/kxy-python/blob/master/docs/latest/quickstart/getting_started.ipynb) to download this page as a Jupyter Notebook.\n",
12 | "\n",
13 | "\n",
14 | "Installation\n",
15 | "-------------\n",
16 | "\n",
17 | "From PyPi:\n",
18 | "\n",
19 | "```bash\n",
20 | "pip install kxy\n",
21 | "```\n",
22 | "\n",
23 | "From GitHub:\n",
24 | "\n",
25 | "```bash\n",
26 | "git clone https://github.com/kxytechnologies/kxy-python.git & cd ./kxy-python & pip install .\n",
27 | "```\n",
28 | "\n",
29 |     "The `kxy` package only supports Python 3. Replace ``pip`` with ``pip3`` in the commands above, as needed."
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "Authentication\n",
37 | "-----------------\n",
38 | "All heavy computations are run on the KXY backend and require an API key. The API key is set by running the command \n",
39 | "```bash\n",
40 | "kxy configure\n",
41 | "```\n",
42 | "as a one-off, and following the instructions. \n",
43 | "\n",
44 | "You may also set your API key as the `KXY_API_KEY` environment variable, for instance by running \n",
45 |     "```python\n",
46 | "import os\n",
47 |     "os.environ['KXY_API_KEY'] = '<YOUR API KEY>'\n",
48 | "```\n",
49 | "To get an API key, you need to have an account with us. You can create an account [here](https://www.kxy.ai/signup/). \n",
50 | "\n",
51 |     "By default, you will start on the free `Starter` plan, which lets you experience the KXY backend from the portal for free, without providing a payment method, by uploading your data as CSV files.\n",
52 | "\n",
53 | "Once you have an account and are ready to use our API, you can find your API key on the KXY portal [here](https://www.kxy.ai/portal/profile/identity/). \n",
54 | "\n",
55 | "API access to our backend is billed on a per-request basis, and requires that you provide a valid payment method in the KXY portal.\n"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "Docker\n",
63 | "---------\n",
64 |     "The public Docker image [kxytechnologies/kxy](https://hub.docker.com/repository/docker/kxytechnologies/kxy) has been built for your convenience, and comes with anaconda, the kxy package, and various popular machine learning packages (e.g. TensorFlow, XGBoost, LightGBM, scikit-learn, and more).\n",
65 | "\n",
66 | "To start a Jupyter Notebook server from a sandboxed Docker, run\n",
67 | "```bash\n",
68 |     "docker run -i -t -p 5555:8888 kxytechnologies/kxy /bin/bash -c \"kxy configure <YOUR API KEY> && /opt/conda/bin/jupyter notebook --notebook-dir=/opt/notebooks --ip='*' --port=8888 --no-browser --allow-root --NotebookApp.token=''\"\n",
69 | "```\n",
70 |     "where you should replace `<YOUR API KEY>` with your API key, and navigate to [http://localhost:5555](http://localhost:5555) in your browser.\n",
71 | "\n",
72 | "To start a Jupyter Notebook server from an existing directory of notebooks, run\n",
73 | "```bash\n",
74 |     "docker run -i -t --mount src=<LOCAL NOTEBOOK FOLDER>,target=/opt/notebooks,type=bind -p 5555:8888 kxytechnologies/kxy /bin/bash -c \"kxy configure <YOUR API KEY> && /opt/conda/bin/jupyter notebook --notebook-dir=/opt/notebooks --ip='*' --port=8888 --no-browser --allow-root --NotebookApp.token=''\"\n",
75 | "```\n",
76 |     "where you should replace `<LOCAL NOTEBOOK FOLDER>` with the path to your local notebook folder, and navigate to [http://localhost:5555](http://localhost:5555) in your browser."
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "Serverless Applications\n",
84 | "----------------------------\n",
85 | "For serverless applications running on AWS, we provide a kxy [AWS Lambda layer](https://docs.aws.amazon.com/lambda/latest/dg/configuration-layers.html) so that you may simply import the `kxy` package from within your AWS Python 3 lambda functions. No additional requirement is needed, other than specifying your API key as the environment variable `KXY_API_KEY`."
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "Working with Pandas DataFrames\n",
93 | "----------------------------------------\n"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 |     "The most convenient way of using the `kxy` package is through pandas DataFrame objects. All our analyses are available as methods of pandas DataFrame objects, under the `kxy` [accessor](https://pandas.pydata.org/pandas-docs/stable/development/extending.html) (i.e. as `df.kxy.<method_name>`). To access these, all you need to do is import the `kxy` package alongside `pandas`."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 1,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "import pandas as pd\n",
110 | "import kxy"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 |     "Check out the [Cheat Sheet](https://www.kxy.ai/reference/latest/applications/cheat_sheet/index.html) section for code snippets, the [Case Studies](https://www.kxy.ai/reference/latest/applications/case_studies/index.html) section for interesting applications, and the [Kaggle & UCI Experiments](https://www.kxy.ai/reference/latest/applications/illustrations/index.html) section for experiments on dozens of UCI and Kaggle datasets."
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 |     "Working with the Low-Level RESTful API\n",
125 | "------------------------------------------------\n",
126 | "\n",
127 |     "We intend to provide user-friendly API clients in other programming languages (e.g. R, Ruby and JavaScript). For now, if you are working in a programming language other than Python, you may directly access our serverless compute infrastructure through its RESTful API. Take a look at our [RESTful API documentation page](https://www.kxy.ai/reference/latest/api/index.html) for more details."
128 | ]
129 | }
130 | ],
131 | "metadata": {
132 | "kernelspec": {
133 | "display_name": "kxy",
134 | "language": "python",
135 | "name": "kxy"
136 | },
137 | "language_info": {
138 | "codemirror_mode": {
139 | "name": "ipython",
140 | "version": 3
141 | },
142 | "file_extension": ".py",
143 | "mimetype": "text/x-python",
144 | "name": "python",
145 | "nbconvert_exporter": "python",
146 | "pygments_lexer": "ipython3",
147 | "version": "3.7.10"
148 | }
149 | },
150 | "nbformat": 4,
151 | "nbformat_minor": 4
152 | }
153 |
--------------------------------------------------------------------------------
/docs/latest/theoretical_foundation/memoryful/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: The theoretical foundation of the KXY Lean AutoML platform, for time series.
3 | :http-equiv=content-language: en
4 |
5 | **********************
6 | Memoryful Observations
7 | **********************
8 |
9 | The **lean ML** approach we advocated for memoryless problems in section :ref:`Memoryless Observations` holds just as much potential when applied to time series forecasting problems. We consider predicting a business outcome :math:`y_t` using past values :math:`y_{t-1}, \dots, y_1`, as well as present and past values of an exogenous explanatory vector-valued time series :math:`\{x_t\}`.
10 |
11 | The time series approach to modeling captures two key points: (i) like random variables, we are uncertain about the value of the phenomenon we are modeling until it is observed; (ii) but unlike random variables, the phenomenon of interest may exhibit some memory, in that observations drawn at different times may be related.
12 |
13 |
14 | I - From Memoryful to Memoryless
15 | --------------------------------
16 | In practice, we do not have the luxury of being able to replay time so as to gather multiple samples of a phenomenon corresponding to the same time, which would be the equivalent of having multiple draws from the same random variable in the memoryless setting. We need to learn from a single finite-length path :math:`\{(y_1, x_1), \dots, (y_T, x_T) \}`. Consequently, instead of using all past values :math:`(y_{t-1}, x_{t-1}), \dots, (y_1, x_1)` to predict :math:`y_t`, we might have to settle for a shorter time window :math:`(y_{t-1}, x_{t-1}), \dots, (y_{t-q}, x_{t-q})` in the interest of forming low-variance estimates, where the window size :math:`q` can be as large as allowed by our sample size :math:`T`.
17 |
18 | The natural question that arises is: can we simply define :math:`Y_i=y_t` and :math:`X_i=\left(x_t, y_{t-1}, x_{t-1}, \dots, y_{t-q}, x_{t-q}\right)`, and apply all the results developed in section :ref:`Memoryless Observations` to the dataset :math:`(Y_i, X_i)_{i \in [1, T]}`? (A concrete construction of such a lagged dataset is sketched at the end of this page.)
19 |
20 | The answer is yes, but we need to be cautious! The main difference with the memoryless setting is that :math:`\left(Y_i, X_i \right)` are not necessarily i.i.d. However, so long as the time series :math:`\{z_t\} = \{y_t, x_t\}` is assumed to be `stationary ergodic <https://en.wikipedia.org/wiki/Stationary_ergodic_process>`_, all population metrics we previously introduced are well-defined, make as much sense as in the memoryless case, and the associated sample estimates remain consistent.
21 |
22 | More generally, when :math:`\{z_t\}` can be assumed to be trend-stationary and ergodic (i.e. :math:`\{y_t-f(t), x_t-g(t)\}` is stationary ergodic for some deterministic functions :math:`f, g`), we do not need to remove trends explicitly. We may simply add time as an explanatory variable, and apply results from the :ref:`Memoryless Observations` section to :math:`(Y_i, X_i^\prime)_{i \in [1, T]}`, with :math:`X_i^\prime = \left(t, x_t, y_{t-1}, x_{t-1}, \dots, y_{t-q}, x_{t-q}\right)`.
23 |
24 |
25 | In the event (trend-)stationarity is too unrealistic an assumption, the time series may be assumed locally stationary. In other words, we do not assume (trend-adjusted) marginals to be invariant under every translation; rather, the magnitude of the change in marginals resulting from a translation depends on the norm of the translation vector. The smaller the norm of the translation vector, the smaller the magnitude of the changes to the marginals.
26 |
27 | Here too, results from the memoryless section apply, but with two caveats. First, we may not use as large a sample size :math:`T` as we want. :math:`T` has to be large enough that we may achieve low-variance estimates, yet small enough that the path used for training only contains the prevailing *local dynamics*. Second, all estimates from the memoryless section should only be considered valid in a limited time window following the last training timestamp [*]_, and should be regenerated with new data on a rolling basis.
28 |
29 |
30 | II - Choosing the Window Size
31 | -----------------------------
32 | It is important to note that :math:`q` is only a function of the length :math:`T` of the path we use for training. It is not necessarily chosen so that all lags are relevant. For a given choice of :math:`q`, we will have :math:`m=T-q` distinct samples and, owing to serial dependence, an even smaller `effective sample size <https://en.wikipedia.org/wiki/Effective_sample_size>`_ :math:`n`.
35 |
36 | Once :math:`q` is chosen, section :ref:`2 - Variable Selection Analysis` can be used to determine which lags are actually insightful and should be included in your predictive model.
37 |
38 |
39 |
40 |
41 |
42 |
43 | .. rubric:: References
44 |
45 | .. [1] Kom Samo, Y.-L., Inductive Mutual Information Estimation: A Convex Maximum-Entropy Copula Approach. Proceedings of the 24th International Conference on Artificial Intelligence and Statistics (AISTATS) 2021, San Diego, California, USA. PMLR: Volume 130.
46 |
47 |
48 | .. rubric:: Footnotes
49 |
50 | .. [*] Of size not exceeding :math:`T`.
51 | .. [*] Given the time series memory.
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
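61 | As a complement, the construction of the memoryless dataset :math:`(Y_i, X_i)` sketched in section I can be written in a few lines of pandas; the file name, column names and window size below are hypothetical:
62 | 
63 | .. code-block:: python
64 | 
65 |     import pandas as pd
66 | 
67 |     # One row per time t, with a target column 'y' and exogenous columns.
68 |     df = pd.read_csv('my_time_series.csv')  # hypothetical file
69 |     q = 5  # window size, as large as the sample size T allows
70 | 
71 |     columns = {'Y': df['y']}
72 |     for lag in range(1, q + 1):
73 |         columns['y_lag_%d' % lag] = df['y'].shift(lag)
74 |     for col in df.columns.drop('y'):
75 |         for lag in range(0, q + 1):
76 |             columns['%s_lag_%d' % (col, lag)] = df[col].shift(lag)
77 | 
78 |     # Drop the first q rows, whose lag windows are incomplete.
79 |     memoryless_df = pd.DataFrame(columns).dropna()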
--------------------------------------------------------------------------------
/docs/latest/theoretical_foundation/memoryless/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: The theoretical foundation of the KXY Lean AutoML platform.
3 | :keywords: Pre-Learning, Post-Learning, Maximum-Entropy Principle, Input Importance, Feature Importance, KXY, Model Explanation, Dataset Valuation, Input Importance, Feature Importance, Model Suboptimality, Model Optimality.
4 | :http-equiv=content-language: en
5 | ***********************
6 | Memoryless Observations
7 | ***********************
8 | In this section, we provide an in-depth discussion of what makes the KxY platform work. We begin with predictive problems where input and output variables do not exhibit temporal structures, or where their temporal structures are of negligible importance. For time series problems, refer to our :ref:`Memoryful Observations` section.
9 |
10 | The KxY platform aims at **Democratizing Lean AI**. But what is *lean AI*, you might wonder?
11 |
12 | Our estimate is that *1-in-10* machine learning experiments fail, resulting in a tremendous amount of avoidable waste (e.g. productivity, compute power, carbon footprint, etc.). *Lean AI* is all about developing machine learning techniques to detect experiments in data science projects, or entire data science projects, that are likely to result in dead-ends and, as such, should be avoided. Done right, this can increase the productivity of your data science teams tenfold, while slashing costs.
13 |
14 | *We are pioneers in this space, and our work is published in top-tier machine learning conferences.*
15 |
16 | Real-life predictive modeling needs are primarily of two types. An organization could be starting a predictive modeling project from scratch, and might be interested in predicting a new business outcome using available data as potential explanatory variables. Alternatively, the organization might be looking to improve a predictive model that was previously trained and released to production.
17 |
18 | We refer to problems arising from attempting to determine whether projects of the former kind (resp. latter kind) would result in a dead-end as **pre-learning** problems (resp. **post-learning** problems).
19 |
20 |
21 |
22 |
23 |
24 | .. toctree::
25 | :hidden:
26 |
27 | problem_formulation
28 | quantifying_informativeness
29 | applications
30 | estimation
31 |
--------------------------------------------------------------------------------
/docs/latest/theoretical_foundation/memoryless/problem_formulation.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :description: Definition of pre-learning and post-learning in supervised learning problems
3 | :keywords: Pre-Learning Explained, Post-Learning Explained, Model Audit, Model Explanation
4 | :http-equiv=content-language: en
5 |
6 | .. role:: raw-html(raw)
7 | :format: html
8 |
9 | I - Problem Formulation
10 | =======================
11 |
12 | .. admonition:: Summary
13 |
14 | We introduce **pre-learning** and **post-learning** problems, and discuss their importance.
15 |
16 |
17 | A supervised learning problem (i.e. regression or classification) aims at reliably learning an association between
18 | a vector of inputs :math:`x` and a label :math:`y` that is either categorical or real-valued. The association is learned using a training dataset, with the hope that, given a value of the inputs vector never seen before, the associated label can be predicted with high enough accuracy.
19 |
20 | While the adequacy of the learned association between :math:`x` and :math:`y` depends solely on the model used, the overall accuracy achieved is bound by how informative the inputs are about the label. If :math:`x` and :math:`y` are unrelated, no model, no matter
21 | how fancy or deep, can infer :math:`y` from :math:`x`, and any attempt to do so would be futile and result in a waste of time and money.
22 |
23 | 1 - Pre-Learning
24 | ----------------
25 |
26 | What Is Pre-Learning?
27 | ^^^^^^^^^^^^^^^^^^^^^
28 | A good analogy to understand **pre-learning** is that pre-learning is to supervised learning what exploration is to oil production.
29 |
30 | It would never occur to an oil company to build a production well first, and then determine whether the site has oil by trying to extract some from the ground. Setting up an oil production site without exploration would be inefficient and very costly. The `exploration phase <https://en.wikipedia.org/wiki/Hydrocarbon_exploration>`_ ought to come first, and is critical to planning and the overall success of operations. In the exploration phase, inference techniques are used to find sites that are likely to be rich in oil, prior to, and independently from, oil extraction, a field known as `exploration geophysics <https://en.wikipedia.org/wiki/Exploration_geophysics>`_.
31 |
32 | In a supervised learning setting, **the site is the data used** to predict the business outcome, **the oil is the business value created** through the improvement of decision making, and **the oil extraction is the training of machine learning models**. Starting to train machine learning models on datasets without any expectation of what performance could be achieved is like setting up an oil extraction site without knowing in advance whether the site is rich in oil.
33 |
34 | Selecting and training great predictive models only affects the amount of value *extracted* from the inputs; it does not change the amount of value *intrinsically present* in those inputs. The same way the amount of oil that can be produced at a site is bound by the amount of oil accessible in the ground, the performance of a predictive model is bound by the intrinsic value that can be found in the inputs :math:`x` about the outcome of interest :math:`y`.
35 |
36 | .. admonition:: Definition
37 |
38 | **Pre-learning** is the study and selection of datasets to use to solve a supervised learning problem, prior to, and independently from any modeling.
39 |
40 |
41 | Why Is Pre-Learning Important?
42 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
43 |
44 | To solve a supervised learning problem, choosing inputs that are collectively insightful about the outcome of interest has as big an impact on overall performance as the machine learning model used to extract such insights, if not a bigger one.
45 |
46 | Additionally, by quantifying the performance that can be achieved in a supervised learning problem, prior to and independently from modeling, the **pre-learning** phase empowers data scientists to know what to aim for, and to focus their efforts and resources accordingly.
47 |
48 |
49 |
50 |
51 | 2 - Post-Learning
52 | -----------------
53 | Once a set of informative inputs have been selected and a model has been trained, overall accuracy can be improved by either looking for a better supervised learning model, or looking for additional complementary datasets to use. Determining which action would result in the highest ROI is one of the objects of **post-learning**.
54 |
55 | That the learned model did not yield a satisfactory predictive accuracy does not necessarily mean that a more elaborate model could do better using the same datasets. It is very possible that, although it has an unsatisfactory predictive accuracy, the learned model already factors in everything the input datasets can tell us about our label. In such an event, the only possible course of action would be
56 | to look for additional datasets to use.
57 |
58 | Even then, that a new dataset is sufficiently informative about the label to predict does not necessarily mean it can be used to improve the performance of our trained model. It is important to choose a dataset that is not only informative about the label to predict,
59 | but informative in a way that is complementary to the datasets used to train the existing model.
60 |
61 | Another object of **post-learning** is *model audit*, which entails understanding the decisions made by a trained machine learning model, and detecting any bias it might encode, to name but a couple of aims.
62 |
63 |
64 | .. admonition:: Definition
65 |
66 | **Post-learning** is the study and audit of a trained supervised learning model, as well as courses of action to take to improve its predictive accuracy.
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/docs/latest/utilities/index.rst:
--------------------------------------------------------------------------------
1 | .. meta::
2 | :http-equiv=content-language: en
3 | ====
4 | Misc
5 | ====
6 |
7 | .. automodule:: kxy.api.client
8 | :members:
9 |
10 | .. automodule:: kxy.api.data_transfer
11 | :members:
12 |
13 | .. automodule:: kxy.api.utils
14 | :members:
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/kxy/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | __author__ = "Dr. Yves-Laurent Kom Samo"
5 | __copyright__ = "Copyright (C) 2022 KXY Technologies, Inc."
6 | __license__ = """
7 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
8 |
9 | This program is free software: you can redistribute it and/or modify
10 | it under the terms of the GNU General Public License as published by
11 | the Free Software Foundation, either version 3 of the License, or
12 | (at your option) any later version.
13 |
14 | This program is distributed in the hope that it will be useful,
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | GNU General Public License for more details.
18 |
19 | You should have received a copy of the GNU Affero General Public License
20 | along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | """
22 | __version__ = "1.4.11"
23 |
24 | from kxy.api import *
25 | from kxy.pre_learning import *
26 | from kxy.post_learning import *
27 | from kxy.finance import *
28 | from kxy.pandas_extension import *
29 | from kxy.billing import *
--------------------------------------------------------------------------------
/kxy/api/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU Affero General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | from .utils import *
21 | from .decorators import *
22 | from .client import *
23 | from .data_transfer import *
24 |
--------------------------------------------------------------------------------
/kxy/api/client.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | Python client for the KXY RESTful API.
6 | """
7 |
8 | from functools import lru_cache
9 | import os
10 | import requests
11 |
12 | from .decorators import requires_api_key, get_api_key, log_backend_warnings
13 | from .. import __version__ as client_version
14 |
15 |
16 | class APIClient(object):
17 | """
18 | Python client for the RESTful KXY API. All API methods require an API key. The API key must be set by running :code:`kxy configure` from the terminal.
19 | """
20 | @staticmethod
21 | def stage():
22 | """
23 | Defines the deployment stage of the RESTful API the client should talk to.
24 |
25 | Returns
26 | -------
27 | v : str
28 | The API stage to use.
29 | """
30 | return 'v2'
31 |
32 | @staticmethod
33 | def url(path):
34 | """
35 | Turns a relative path into a full API endpoint url.
36 |
37 | Parameters
38 | ----------
39 | path: str
40 | The relative path of the API resource.
41 |
42 | Returns
43 | -------
44 | u : str
45 | The full URL of the API resource.
46 | """
47 | path = path.strip('/')
48 |
49 | return 'https://api.kxy.ai/%s/' % APIClient.stage() + path
50 |
51 |
52 | @staticmethod
53 | @requires_api_key
54 | @log_backend_warnings
55 | def get(path, **params):
56 | """
57 | .. important:: This method requires a valid API key.
58 |
59 | Issues a GET request to the API resource identified by the input path.
60 |
61 | Parameters
62 | ----------
63 | path: str
64 | The relative path of the API resource.
65 | params: dict, optional
66 | The query parameters of the GET request. Any keyword argument is
67 | automatically interpreted as a request parameter, its name is used
68 | as the parameter name, and its value as the parameter value.
69 |
70 | Returns
71 | -------
72 | response: requests.Response
73 | The response of the API. The request HTTP status code can be accessed
74 | 			through `response.status_code`. To check if the request was successful,
75 | inspect `response.ok`. When the API returned data, they can be accessed
76 | through `response.json()`. Supported status codes are:
77 |
78 | 200:
79 | The request was successful and the API returned some data accessible through
80 | `response.json()`.
81 | 402:
82 | The request failed because your account does not have a valid payment method.
83 | Check `response.json()['reason']` for more information.
84 | 403:
85 | 				The request failed because some parameters are either invalid or missing.
86 | Check `response.json()['reason']` for more information.
87 | 404:
88 | The request failed because the API couldn't yet solve the problem of interest.
89 | You should typically try again another time. Check `response.json()['reason']`
90 | for more information.
91 | """
92 | url = APIClient.url(path)
93 | api_key = get_api_key()
94 | if 'client_version' not in params:
95 | params['client_version'] = client_version
96 | response = requests.get(url, params=params, headers={'x-api-key': api_key, \
97 | 'content-type': 'application/json'})
98 |
99 | return response
100 |
101 |
102 | @staticmethod
103 | @requires_api_key
104 | @log_backend_warnings
105 | def post(path, **params):
106 | """
107 | .. important:: This method requires a valid API key.
108 |
109 | Issues a POST request to the API resource identified by the input path.
110 |
111 | Parameters
112 | ----------
113 | path: str
114 | The relative path of the API resource.
115 | params: dict, optional
116 | The data to be submitted to the API as part of the POST request, as
117 | a JSON. Any keyword argument is automatically interpreted as a
118 | key of the JSON data that will be submitted to the API,
119 | and its value the associated value in the JSON.
120 |
121 | Returns
122 | -------
123 | response: requests.Response
124 | The response of the API. The request HTTP status code can be accessed
125 | 			through `response.status_code`. To check if the request was successful,
126 | inspect `response.ok`. When the API returned data, they can be accessed
127 | through `response.json()`.
128 |
129 | Supported status codes are:
130 |
131 | 200:
132 | The request was successful and the API returned some data accessible through
133 | `response.json()`.
134 | 402:
135 | The request failed because your account does not have a valid payment method.
136 | Check `response.json()['reason']` for more information.
137 | 403:
138 | 				The request failed because some parameters are either invalid or missing.
139 | Check `response.json()['reason']` for more information.
140 | 404:
141 | The request failed because the API couldn't yet solve the problem of interest.
142 | You should typically try again another time. Check `response.json()['reason']`
143 | for more information.
144 | """
145 | url = APIClient.url(path)
146 | api_key = get_api_key()
147 | if 'client_version' not in params:
148 | params['client_version'] = client_version
149 | response = requests.post(url, json=params, headers={'x-api-key': api_key, \
150 | 'content-type': 'application/json'})
151 |
152 | return response
153 |
154 |
155 | @staticmethod
156 | @lru_cache(maxsize=32)
157 | def route(path=None, method=None, **params):
158 | """
159 | .. important:: This method requires a valid API key.
160 |
161 | Generic method to issue a GET or a POST request to the API resource identified
162 | by the input path.
163 |
164 | Parameters
165 | ----------
166 | path: str
167 | The relative path of the API resource.
168 |
169 | method: str
170 | The REST method. Should be either `'GET'` or `'POST'`.
171 |
172 | params: dict, optional
173 | The data to be submitted to the API as a JSON for POST requests, or
174 | query parameters in the case of GET requests.
175 |
176 | Returns
177 | -------
178 | response: requests.Response
179 | The response of the API. The request HTTP status code can be accessed
180 | 			through `response.status_code`. To check if the request was successful,
181 | inspect `response.ok`. When the API returned data, they can be accessed
182 | through `response.json()`.
183 |
184 | Supported status codes are:
185 |
186 | 200:
187 | The request was successful and the API returned some data accessible through
188 | `response.json()`.
189 | 402:
190 | The request failed because your account does not have a valid payment method.
191 | Check `response.json()['reason']` for more information.
192 | 403:
193 | 				The request failed because some parameters are either invalid or missing.
194 | Check `response.json()['reason']` for more information.
195 | 404:
196 | The request failed because the API couldn't yet solve the problem of interest.
197 | You should typically try again another time. Check `response.json()['reason']`
198 | for more information.
199 |
200 | Raises
201 | ------
202 | ValueError
203 | 			If path is None or method is neither 'GET' nor 'POST'.
204 | """
205 | if path is None or method is None or \
206 | method.upper() not in ('GET', 'POST'):
207 | 			raise ValueError("path must not be None, and method must be either 'GET' or 'POST'.")
208 |
209 | if method.upper() == 'GET':
210 | return APIClient.get(path, **params)
211 |
212 | if method.upper() == 'POST':
213 | return APIClient.post(path, **params)
214 |
215 |
216 |
217 |
218 |
--------------------------------------------------------------------------------
/kxy/api/data_transfer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | To run our analyses, the KXY backend needs your data. The methods below are the only methods involved in sharing your data with us. The :code:`kxy` package only uploads your data `if` and `when` needed.
6 | """
7 | import hashlib
8 | import logging
9 | from time import time
10 | import requests
11 |
12 | import pandas as pd
13 | import numpy as np
14 | try:
15 | get_ipython().__class__.__name__
16 | from halo import HaloNotebook as Halo
17 | except:
18 | from halo import Halo
19 |
20 | from .client import APIClient
21 |
22 |
23 | UPLOADED_FILES = {}
24 |
25 | def generate_upload_url(file_name):
26 | """
27 | Requests a pre-signed URL to upload a dataset.
28 |
29 | Parameters
30 | ----------
31 | file_name: str
32 | A string that uniquely identifies the content of the file.
33 |
34 | Returns
35 | -------
36 | d : dict or None
37 | The dictionary containing the pre-signed url.
38 | """
39 | api_response = APIClient.route(
40 | path='/wk/generate-signed-upload-url', method='POST',\
41 | file_name=file_name, timestamp=int(time()))
42 |
43 | if api_response.status_code == requests.codes.ok:
44 | api_response = api_response.json()
45 | if 'presigned_url' in api_response:
46 | presigned_url = api_response['presigned_url']
47 | return presigned_url
48 |
49 | elif api_response.get('file_already_exists', False):
50 | logging.debug('This file was previously uploaded.')
51 | return {}
52 |
53 | else:
54 | return None
55 |
56 | else:
57 | api_response = api_response.json()
58 | if 'message' in api_response:
59 | logging.warning("\n%s" % api_response['message'])
60 | return None
61 |
62 |
63 | def upload_data(df, file_name=None):
64 | """
65 | 	Uploads a dataframe to kxy servers.
66 |
67 | Parameters
68 | ----------
69 | df: pd.DataFrame
70 | The dataframe to upload.
71 |
72 | Returns
73 | -------
74 | 	d : str or None
75 | 		The name the file was uploaded under, or None if the upload failed.
76 | """
77 | if file_name is None:
78 | logging.debug('')
79 | logging.debug('Hashing the data to form the file name')
80 | content = pd.util.hash_pandas_object(df).to_string()
81 | data_identifier = hashlib.sha256(content.encode()).hexdigest()
82 | columns = str(sorted([col for col in df.columns]))
83 | columns_identifier = hashlib.sha256(columns.encode()).hexdigest()
84 | identifier = hashlib.sha256((data_identifier+columns_identifier).encode()).hexdigest()
85 | memory_usage = df.memory_usage(index=False).sum()/(1024.0*1024.0*1024.0)
86 | file_name = identifier + '.parquet.gzip' if memory_usage > 1.5 else identifier + '.parquet' if memory_usage > 0.5 else identifier + '.csv'
87 | logging.debug('Done hashing the data')
88 | else:
89 | identifier = file_name.split('.')[0]
90 |
91 | if UPLOADED_FILES.get(identifier, False):
92 | logging.debug('The file with identifier %s was previously uploaded' % identifier)
93 | return file_name
94 |
95 | logging.debug('Requesting a signed upload URL')
96 | presigned_url = generate_upload_url(file_name)
97 |
98 | if presigned_url is None:
99 | logging.warning('Failed to retrieve the signed upload URL')
100 | return None
101 | else:
102 | logging.debug('Signed upload URL retrieved')
103 |
104 | if presigned_url == {}:
105 | logging.debug('This file was previously uploaded')
106 | UPLOADED_FILES[identifier] = True
107 | return file_name
108 |
109 |
110 | logging.debug('Preparing data for upload')
111 | spinner = Halo(text='Preparing data upload', spinner='dots')
112 | spinner.start()
113 | if file_name.endswith('.parquet.gzip'):
114 | 		# Parquet requires column names to be strings.
115 | df.columns = df.columns.astype(str)
116 | _bytes = df.to_parquet(index=False, compression='gzip')
117 | elif file_name.endswith('.parquet'):
118 | 		# Parquet requires column names to be strings.
119 | df.columns = df.columns.astype(str)
120 | _bytes = df.to_parquet(index=False)
121 | else:
122 | _bytes = df.to_csv(index=False)
123 | spinner.succeed()
124 |
125 | files = {'file': _bytes}
126 | url = presigned_url['url']
127 | data = presigned_url['fields']
128 | logging.debug('Done preparing the data to upload')
129 | logging.debug('Uploading the data')
130 | spinner.start('Uploading data')
131 | upload_response = requests.post(url, data=data, files=files)
132 | spinner.succeed()
133 |
134 | if upload_response.status_code in [requests.codes.ok, requests.codes.created, requests.codes.accepted, requests.codes.no_content]:
135 | logging.debug('Data successfully uploaded')
136 | UPLOADED_FILES[identifier] = True
137 | return file_name
138 | else:
139 | logging.warning('Failed to upload the file. Received status code %s.' % (upload_response.status_code))
140 |
141 | return None
142 |
143 |
144 |
--------------------------------------------------------------------------------
/kxy/api/decorators.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | ==================
6 | kxy.api.decorators
7 | ==================
8 | """
9 |
10 | from functools import wraps
11 | import json
12 | import logging
13 | import os
14 | import requests
15 |
16 | TRIAL_API_KEY = 'SZiRisvhzC7KBgROZG5dE1VQIlE8Jk4DbQ1YZdZ0'
17 |
18 | def get_api_key():
19 | """
20 | 	Retrieves the stored API key, or None if none was provided.
21 | """
22 | home = os.path.expanduser("~")
23 | path = os.path.join(home, '.kxy')
24 | file_name = os.path.join(path, 'config')
25 | try:
26 | with open(file_name, 'r') as f:
27 | config = json.load(f)
28 | existing_key = config.get('KXY_API_KEY', TRIAL_API_KEY)
29 | return existing_key
30 | except:
31 | return os.environ.get('KXY_API_KEY', TRIAL_API_KEY)
32 |
33 | return None
34 |
35 |
36 |
37 | def has_api_key():
38 | """
39 | Returns whether or not an API key was provided as a result of running :code:`kxy configure`.
40 | """
41 | return get_api_key() is not None
42 |
43 |
44 |
45 | def requires_api_key(method):
46 | """
47 | 	Decorator used to make function and method calls fail
48 | 	when they require an API key and the user did not provide one
49 | by running :code:`kxy configure`. The decorated function or method
50 | is otherwise not affected.
51 |
52 | Raises
53 | ------
54 | AssertionError
55 | If an API key was not previously recorded.
56 | """
57 | @wraps(method)
58 | def wrapper(*args, **kw):
59 | assert has_api_key(), "An API key should be provided. Please run 'kxy configure'"
60 | return method(*args, **kw)
61 |
62 | return wrapper
63 |
64 |
65 |
66 | def log_backend_warnings(method):
67 | """
68 | Decorator used to make requests hitting the backend log backend warnings.
69 | """
70 | @wraps(method)
71 | def wrapper(*args, **kw):
72 | response = method(*args, **kw)
73 | try:
74 | if response.status_code == requests.codes.ok:
75 | response_json = response.json()
76 | if 'warning' in response_json:
77 | logging.warning('%s' % response_json['warning'])
78 | except:
79 | pass
80 | return response
81 |
82 | return wrapper
83 |
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/kxy/api/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
--------------------------------------------------------------------------------
/kxy/billing/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU Affero General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | from .billing_details import *
21 |
--------------------------------------------------------------------------------
/kxy/billing/billing_details.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Everything billing.
5 | """
6 | import logging
7 | import requests
8 | from time import time
9 |
10 | from kxy.api import APIClient
11 |
12 |
13 | def get_upcoming_invoice():
14 | """
15 | Retrieves all items that will show up in your next invoice.
16 |
17 | Returns
18 | -------
19 | d : dict
20 | The dictionary containing all items that will appear in your next invoice.
21 | E.g. :code:`{'Type of charge': {'total_usd': ..., 'quantity': ..., 'description': ..., 'billing_period_start_timestamp': ..., 'billing_period_end_timestamp': ...}, ... }`
22 | """
23 | api_response = APIClient.route(
24 | path='/wk/billing/upcoming-invoice', method='POST',\
25 | timestamp=int(time()))
26 | try:
27 | if api_response.status_code == requests.codes.ok:
28 | api_response = api_response.json()
29 | return api_response
30 | else:
31 | return {}
32 | except:
33 | logging.exception('Failed to retrieve your upcoming invoice.')
34 | return {}
35 |
--------------------------------------------------------------------------------
/kxy/examples/feature_selection_example.py:
--------------------------------------------------------------------------------
1 | # 0. As a one-off, run 'pip install kxy', then 'kxy configure'
2 | # This import is necessary to get all df.kxy.* methods
3 | import kxy
4 |
5 | # 1. Load your data
6 | # pip install kxy_datasets
7 | from kxy_datasets.classifications import BankMarketing
8 | dataset = BankMarketing()
9 | target_column = dataset.y_column
10 | df = dataset.df
11 |
12 | # 2. Generate candidate features
13 | features_df = df.kxy.generate_features(entity=None, max_lag=None,\
14 | entity_name='*', exclude=[target_column])
15 | features_df = features_df.drop('y_yes', axis=1)
16 | target_column = 'y_no'
17 |
18 | # 3. Training/Testing split
19 | # pip install scikit-learn
20 | from sklearn.model_selection import train_test_split
21 | train_features_df, test_features_df = train_test_split(features_df, \
22 | test_size=0.2, random_state=0)
23 | test_labels_df = test_features_df.loc[:, [target_column]]
24 | test_features_df = test_features_df.drop(target_column, axis=1)
25 |
26 | # 4. Create a LightGBM learner function.
27 |
28 | # A learner function is a function that expects up to two optional
29 | # variables: n_vars and path. When called it returns an instance of
30 | # 'predictive model' expecting n_vars features. The path parameter,
31 | # when provided, allows the learner function to load a saved model
32 | # from disk.
33 |
34 | # A 'predictive model' here is any class with a fit(self, x, y) method
35 | # and predict(self, x) method. To use the path argument of the learner
36 | # function, the class should also define a save(self, path) method to
37 | # save a model to disk, and a load(cls, path) class method to load a
38 | # saved model from disk.
39 |
40 | # See kxy.learning.base_learners for helper functions that allow you
41 | # create learner functions that return instances of popular predictive
42 | # models (e.g. lightgbm, xgboost, sklearn, tensorflow, pytorch models
43 | # etc.).
44 |
45 | from kxy.learning import get_lightgbm_learner_learning_api
46 | params = {
47 | 'objective': 'binary',
48 | 'metric': ['auc', 'binary_logloss'],
49 | }
50 | lightgbm_learner_func = get_lightgbm_learner_learning_api(params, \
51 | num_boost_round=10000, early_stopping_rounds=50, verbose_eval=50, \
52 | split_random_seed=0)
53 |
54 | # 5. Fit a LightGBM classifier wrapped around LeanML feature selection
55 | results = train_features_df.kxy.fit(target_column, \
56 | lightgbm_learner_func, problem_type='classification', \
57 | feature_selection_method='leanml')
58 | predictor = results['predictor']
59 |
60 | # 6. Make predictions from a dataframe of test features
61 | test_predictions_df = predictor.predict(test_features_df)
62 |
63 | # 7. Compute out-of-sample accuracy and AUC
64 | from sklearn.metrics import accuracy_score, roc_auc_score
65 | accuracy = accuracy_score(
66 | test_labels_df[target_column].values, \
67 | test_predictions_df[target_column].values, \
68 | )
69 | auc = roc_auc_score( \
70 | test_labels_df[target_column].values, \
71 | test_predictions_df[target_column].values, \
72 | multi_class='ovr'
73 | )
74 |
75 | print('LeanML -- Testing Accuracy: %.2f, AUC: %.2f' % (accuracy, auc))
76 | selected_features = predictor.selected_variables
77 | print('LeanML -- Selected Variables:')
78 | import pprint as pp
79 | pp.pprint(selected_features)
80 |
81 | # 8. (Optional) Save the trained model.
82 | path = './lightgbm_uci_bank_marketing.sav'
83 | predictor.save(path)
84 |
85 | # 9. (Optional) Load the saved model.
86 | from kxy.learning.leanml_predictor import LeanMLPredictor
87 | loaded_predictor = LeanMLPredictor.load(path, lightgbm_learner_func)
88 |
89 |
90 |
91 | # 10.a Fit a LightGBM classifier wrapped around RFE feature selection
92 | n_leanml_features = len(selected_features)
93 | rfe_results = train_features_df.kxy.fit(target_column, \
94 | lightgbm_learner_func, problem_type='classification', \
95 | feature_selection_method='rfe', rfe_n_features=n_leanml_features)
96 | rfe_predictor = rfe_results['predictor']
97 |
98 | # 10.b Fit a LightGBM classifier wrapped around Boruta feature
99 | # selection.
100 | boruta_results = train_features_df.kxy.fit(target_column, \
101 | lightgbm_learner_func, problem_type='classification', \
102 | feature_selection_method='boruta', boruta_n_evaluations= 20, \
103 | boruta_pval=0.95)
104 | boruta_predictor = boruta_results['predictor']
105 |
106 | # 10.c Fit a LightGBM classifier without any feature
107 | # selection.
108 | none_results = train_features_df.kxy.fit(target_column, \
109 | lightgbm_learner_func, problem_type='classification', \
110 | feature_selection_method=None)
111 | none_predictor = none_results['predictor']
112 |
113 | # 11. Make predictions from a dataframe of test features
114 | rfe_test_predictions_df = rfe_predictor.predict(test_features_df)
115 | boruta_test_predictions_df = boruta_predictor.predict(test_features_df)
116 | none_test_predictions_df = none_predictor.predict(test_features_df)
117 |
118 | # 12. Compute out-of-sample accuracy and AUC
119 | rfe_accuracy = accuracy_score(
120 | test_labels_df[target_column].values, \
121 | rfe_test_predictions_df[target_column].values, \
122 | )
123 | rfe_auc = roc_auc_score( \
124 | test_labels_df[target_column].values, \
125 | rfe_test_predictions_df[target_column].values, \
126 | multi_class='ovr'
127 | )
128 |
129 | boruta_accuracy = accuracy_score(
130 | test_labels_df[target_column].values, \
131 | boruta_test_predictions_df[target_column].values, \
132 | )
133 | boruta_auc = roc_auc_score( \
134 | test_labels_df[target_column].values, \
135 | boruta_test_predictions_df[target_column].values, \
136 | multi_class='ovr'
137 | )
138 |
139 | none_accuracy = accuracy_score(
140 | test_labels_df[target_column].values, \
141 | none_test_predictions_df[target_column].values, \
142 | )
143 | none_auc = roc_auc_score( \
144 | test_labels_df[target_column].values, \
145 | none_test_predictions_df[target_column].values, \
146 | multi_class='ovr'
147 | )
148 |
149 | print('RFE -- Accuracy: %.2f, AUC: %.2f' % (rfe_accuracy, rfe_auc))
150 | rfe_selected_features = rfe_predictor.selected_variables
151 | print('RFE -- Selected Variables:')
152 | pp.pprint(rfe_selected_features)
153 | print()
154 |
155 | print('Boruta -- Accuracy: %.2f, AUC: %.2f' % (boruta_accuracy, \
156 | boruta_auc))
157 | boruta_selected_features = boruta_predictor.selected_variables
158 | print('Boruta -- Selected Variables:')
159 | pp.pprint(boruta_selected_features)
160 | print()
161 |
162 | print('No Feature Selection -- Accuracy: %.2f, AUC: %.2f' % (none_accuracy, \
163 | none_auc))
164 | all_features = none_predictor.selected_variables
165 | print('No Feature Selection -- Selected Variables:')
166 | pp.pprint(all_features)
167 |
168 |
169 |
--------------------------------------------------------------------------------
/kxy/examples/numerai_example.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import r2_score
2 | import kxy
3 | import pandas as pd
4 | from kxy.learning import get_lightgbm_learner_sklearn_api
5 |
6 | ########
7 | # Data #
8 | ########
9 | ## Uncomment to download Numerai data
10 | # from numerapi import NumerAPI
11 | # napi = NumerAPI()
12 | # current_round = napi.get_current_round(tournament=8)
13 | # napi.download_dataset("numerai_training_data_int8.parquet", "numerai_training_data_int8.parquet")
14 |
15 | df = pd.read_parquet('numerai_training_data_int8.parquet')
16 | target_column, problem_type = 'target', 'regression'
17 | feature_columns = [_ for _ in df.columns if _.startswith('feature_')]
18 | columns = feature_columns + [target_column]
19 | df = df[columns]
20 |
21 |
22 | ####################
23 | # Train/Test Split #
24 | ####################
25 | random_seed = 2
26 | test_df = df.sample(frac=0.7, random_state=random_seed)
27 | train_df = df.drop(test_df.index)
28 | train_features = train_df[feature_columns]
29 | train_labels = train_df[[target_column]]
30 | test_features = test_df[feature_columns]
31 | test_labels = test_df[[target_column]]
32 |
33 | x_train = train_features.values
34 | x_test = test_features.values
35 | y_train = train_labels.values
36 | y_test = test_labels.values
37 |
38 |
39 | # Run PFS
40 | from kxy.misc.tf import set_default_parameter
41 | from kxy.pfs import PFS
42 | set_default_parameter('lr', 0.001)
43 | selector = PFS()
44 | selector.fit(x_train, y_train, epochs=10, seed=random_seed, expand_y=False)
45 |
46 | # Extract the features
47 | fx_train = selector.max_ent_features_x(x_train)
48 |
49 | # Run a linear regression relating learned features to y
50 | # (r2_score is already imported at the top of the script)
51 | from sklearn.linear_model import LinearRegression
52 |
53 | # PFS
54 | # Training
55 | m = LinearRegression()
56 | m.fit(fx_train, y_train)
57 |
58 | # Testing accuracy
59 | fx_test = selector.max_ent_features_x(x_test)
60 |
61 | y_test_predicted = m.predict(fx_test)
62 | testing_r2 = r2_score(y_test, y_test_predicted)  # r2_score expects (y_true, y_pred)
63 |
64 | y_train_predicted = m.predict(fx_train)
65 | training_r2 = r2_score(y_train, y_train_predicted)
66 |
67 | print('R^2 -- PFS -- Training: %.4f, Testing: %.4f' % (training_r2, testing_r2))
68 |
69 |
70 | # No PFS
71 | m = LinearRegression()
72 | m.fit(x_train, y_train)
73 |
74 | y_test_predicted_n = m.predict(x_test)
75 | y_train_predicted_n = m.predict(x_train)
76 |
77 | testing_r2_n = r2_score(y_test, y_test_predicted_n)
78 | training_r2_n = r2_score(y_train, y_train_predicted_n)
79 |
80 | print('R^2 -- No PFS -- Training: %.4f, Testing: %.4f' % (training_r2_n, testing_r2_n))
81 |
82 |
83 |
84 |
85 |
86 | # ##########################
87 | # # With Feature Selection #
88 | # ##########################
89 | # # LightGBM model factory
90 | # lightgbm_regressor_learner_cls = get_lightgbm_learner_sklearn_api('lightgbm.LGBMRegressor', \
91 | # n_jobs=-1, colsample_bytree=0.1, learning_rate=0.01, n_estimators=2000, max_depth=5)
92 |
93 | # # Lean boosting fit
94 | # results = train_df.kxy.fit(target_column, lightgbm_regressor_learner_cls, \
95 | # problem_type=problem_type, feature_selection_method='pfs', pfs_p=100, \
96 | # data_identifier='numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed)
97 |
98 | # predictor = results['predictor']
99 | # p = predictor.feature_directions.shape[0]
100 | # print('Number of features: %d' % p)
101 |
102 | # # selected_features = predictor.selected_variables
103 | # # print('Selected Variables')
104 | # # print(selected_features)
105 |
106 | # # Training/Testing Predictions
107 | # train_predictions = predictor.predict(train_features)
108 | # test_predictions = predictor.predict(test_features)
109 |
110 | # # Training/Testing Performance
111 | # train_r2 = r2_score(train_labels, train_predictions)
112 | # test_r2 = r2_score(test_labels, test_predictions)
113 |
114 | # print('Compressed LightGBM: Training R^2: %.4f, Testing R^2: %.4f' % (train_r2, test_r2))
115 |
116 |
117 | # #################################
118 | # # Fit Without Feature Selection #
119 | # #################################
120 | # results = train_df.kxy.fit(target_column, lightgbm_regressor_learner_cls, \
121 | # problem_type=problem_type, feature_selection_method=None)
122 | # naive_predictor = results['predictor']
123 |
124 | # # Training/Testing Predictions
125 | # naive_train_predictions = naive_predictor.predict(train_features)
126 | # naive_test_predictions = naive_predictor.predict(test_features)
127 |
128 | # # Training/Testing Performance
129 | # naive_train_r2 = r2_score(train_labels, naive_train_predictions)
130 | # naive_test_r2 = r2_score(test_labels, naive_test_predictions)
131 |
132 | # print('Naive LightGBM: Training R^2: %.4f, Testing R^2: %.4f' % (naive_train_r2, naive_test_r2))
133 |
134 |
135 |
--------------------------------------------------------------------------------
/kxy/finance/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU Affero General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | from .corr import *
--------------------------------------------------------------------------------
/kxy/finance/corr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import logging
4 | import requests
5 | import sys
6 | from time import time, sleep
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 | try:
12 | get_ipython().__class__.__name__
13 | from halo import HaloNotebook as Halo
14 | except:
15 | from halo import Halo
16 |
17 | from kxy.api import APIClient, upload_data
18 |
19 | # Cache old job ids to avoid being charged twice for the same job.
20 | IACORR_JOB_IDS = {}
21 |
22 | def information_adjusted_correlation(data_df, market_column, asset_column):
23 | """
24 | Estimate the information-adjusted correlation between an asset return :math:`r` and the market return :math:`r_m`: :math:`\\text{IA-Corr}\\left(r, r_m \\right) := \\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right) \\left[1 - e^{-2I(r, r_m)} \\right]`, where :math:`\\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right)` is the sign of the Pearson correlation coefficient.
25 |
26 | Unlike Pearson's correlation coefficient, which is 0 if and only if asset return and market return are **decorrelated** (i.e. they exhibit no linear relation), information-adjusted correlation is 0 if and only if market and asset returns are **statistically independent** (i.e. they exhibit no relation, linear or nonlinear).
27 |
28 |
29 | Parameters
30 | ----------
31 | data_df : pandas.DataFrame
32 | The pandas DataFrame containing the data.
33 | market_column : str
34 | The name of the column containing market returns.
35 | asset_column : str
36 | The name of the column containing asset returns.
37 |
38 |
39 | Returns
40 | -------
41 | result : float
42 | The information-adjusted correlation.
43 |
44 | """
45 | assert market_column in data_df.columns, 'The market column should be a column of the dataframe.'
46 | assert asset_column in data_df.columns, 'The asset column should be a column of the dataframe.'
47 | assert np.can_cast(data_df[market_column], float), 'The market return column should be numeric'
48 | assert np.can_cast(data_df[asset_column], float), 'The asset return column should be numeric'
49 |
50 | k = 0
51 | kp = 0
52 | max_k = 100
53 | spinner = Halo(text='Waiting for results from the backend.', spinner='dots')
54 | spinner.start()
55 |
56 | df = data_df[[market_column, asset_column]]
57 | file_name = upload_data(df)
58 | if file_name:
59 | job_id = IACORR_JOB_IDS.get(file_name, None)
60 |
61 | if job_id:
62 | api_response = APIClient.route(
63 | path='/wk/ia-corr', method='POST',
64 | file_name=file_name, market_column=market_column, \
65 | asset_column=asset_column, \
66 | timestamp=int(time()), job_id=job_id)
67 | else:
68 | api_response = APIClient.route(
69 | path='/wk/ia-corr', method='POST', \
70 | file_name=file_name, market_column=market_column, \
71 | asset_column=asset_column, \
72 | timestamp=int(time()))
73 |
74 | initial_time = time()
75 | while api_response.status_code == requests.codes.ok and k < max_k:
76 | if kp%2 != 0:
77 | sleep(2 if kp<5 else 5 if k < max_k-4 else 300)
78 | kp += 4
79 | k = kp//2
80 | else:
81 | try:
82 | response = api_response.json()
83 | if 'job_id' in response:
84 | job_id = response['job_id']
85 | IACORR_JOB_IDS[file_name] = job_id
86 | sleep(2 if kp<5 else 5 if k < max_k-4 else 300)
87 | kp += 4
88 | k = kp//2
89 |
90 | # Note: it is important to pass the job_id to avoid being charged twice for the same work.
91 | api_response = APIClient.route(
92 | path='/wk/ia-corr', method='POST',
93 | file_name=file_name, market_column=market_column, \
94 | asset_column=asset_column, \
95 | timestamp=int(time()), job_id=job_id)
96 |
97 | try:
98 | response = api_response.json()
99 | if 'eta' in response:
100 | progress_text = '%s%% Completed.' % response['progress_pct'] if 'progress_pct' in response else ''
101 | spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (response['eta'], progress_text)
102 | except:
103 | pass
104 |
105 | if 'job_id' not in response:
106 | duration = int(time()-initial_time)
107 | duration = str(duration) + 's' if duration < 60 else str(duration//60) + 'min'
108 | spinner.text = 'Received results from the backend in %s' % duration
109 | spinner.succeed()
110 |
111 | if 'ia-corr' in response:
112 | return response['ia-corr']
113 | else:
114 | return np.nan
115 |
116 | except:
117 | spinner.text = 'The backend encountered an unexpected error. We are looking into it; please try again later.'
118 | spinner.fail()
119 | logging.exception('\nInformation-adjusted correlation failed. Last HTTP code: %s' % api_response.status_code)
120 | return None
121 |
122 |
123 | if api_response.status_code != requests.codes.ok:
124 | spinner.text = 'The backend is taking longer than expected. Please try again later.'
125 | spinner.fail()
126 | try:
127 | response = api_response.json()
128 | if 'message' in response:
129 | logging.error('\n%s' % response['message'])
130 | except:
131 | logging.error('\nInformation-adjusted correlation failed. Last HTTP code: %s' % api_response.status_code)
132 |
133 | return None
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
/kxy/learning/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU Affero General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | from .base_learners import *
21 | from .leanml_predictor import *
--------------------------------------------------------------------------------
/kxy/learning/pytorch_early_termination.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | PyTorch code to terminate training of a deep learning regressor or classifier when the running loss is much
5 | lower than a threshold, typically the theoretical-best.
6 | """
7 | import logging
8 | import numpy as np
9 |
10 | class TerminateIfOverfittedPT(object):
11 | '''
12 | PyTorch event handler that terminates training when the running loss is smaller than the theoretical best, which is a strong indication that the model will end up overfitting.
13 |
14 | Parameters
15 | ----------
16 | loss_key : str
17 | Which loss to base early-termination on. Example values are: :code:`'loss'`, :code:`'classification_error'`, and any other registered loss metrics.
18 | theoretical_best : float
19 | The smallest theoretical loss achievable without overfitting, obtained using :code:`df.kxy.data_valuation`.
20 |
21 |
22 | .. seealso::
23 |
24 | :ref:`kxy.pre_learning.achievable_performance.data_valuation `
25 |
26 | '''
27 | def __init__(self, theoretical_best, loss_key):
28 | self.theoretical_best = theoretical_best
29 | self.loss_key = loss_key
30 |
31 |
32 | def __call__(self, engine):
33 | ''' '''
34 | logs = engine.state.metrics or {}
35 | if 'accuracy' in logs:
36 | logs['classification_error'] = 1.-logs['accuracy']
37 |
38 | loss = logs.get(self.loss_key, np.inf)  # A missing metric should never trigger termination.
39 | if loss < self.theoretical_best:
40 | logging.warning('Loss %s (%.4f) is much smaller than the theoretical best %.4f' % (self.loss_key, loss, self.theoretical_best))
41 | engine.terminate()
42 |
43 |
44 |
--------------------------------------------------------------------------------
/kxy/learning/tensorflow_early_termination.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Tensorflow v2 code to terminate training of a deep learning regressor or classifier when the running loss is much
5 | lower than a threshold, typically the theoretical-best.
6 | """
7 | import logging
8 | import numpy as np
9 | from tensorflow.keras.callbacks import Callback
10 |
11 |
12 | class TerminateIfOverfittedTF(Callback):
13 | '''
14 | Tensorflow callback that terminates training at the end of a batch when the running loss is smaller than the theoretical best, which is a strong indication that the model will end up overfitting.
15 |
16 | Parameters
17 | ----------
18 | loss_key : str
19 | Which loss to base early-termination on. Example values are: :code:`'loss'`, :code:`'classification_error'`, and any other registered loss metrics.
20 | theoretical_best : float
21 | The smallest theoretical loss achievable without overfitting, obtained using :code:`df.kxy.data_valuation`.
22 |
23 |
24 |
25 | .. seealso::
26 |
27 | :ref:`kxy.pre_learning.achievable_performance.data_valuation `.
28 |
29 |
30 | '''
31 | def __init__(self, theoretical_best, loss_key):
32 | super(TerminateIfOverfittedTF, self).__init__()
33 | self._supports_tf_logs = True
34 | self.theoretical_best = theoretical_best
35 | self.loss_key = loss_key
36 |
37 | def on_batch_end(self, batch, logs=None):
38 | ''' '''
39 | logs = logs or {}
40 | if 'accuracy' in logs:
41 | logs['classification_error'] = 1.-logs['accuracy']
42 |
43 | loss = logs.get(self.loss_key, np.inf)  # A missing metric should never trigger termination.
44 | if loss < self.theoretical_best:
45 | logging.warning('Loss %s (%.4f) is much smaller than the theoretical best %.4f' % (self.loss_key, loss, self.theoretical_best))
46 | self.model.stop_training = True
47 |
--------------------------------------------------------------------------------
/kxy/misc/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU Affero General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | from .boruta import *
21 | from .rfe import *
22 | from .predictors import *
23 | from .exceptions import *
--------------------------------------------------------------------------------
/kxy/misc/exceptions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | class LongerThanExpectedException(Exception):
4 | pass
--------------------------------------------------------------------------------
/kxy/misc/mind.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | TensorFlow Implementation of MIND ([1]) under Spearman rank correlation constraints.
5 |
6 | [1] Kom Samo, Y. (2021). Inductive Mutual Information Estimation: A Convex Maximum-Entropy Copula Approach . Proceedings of The 24th International Conference on Artificial Intelligence and Statistics, in Proceedings of Machine Learning Research 130:2242-2250 Available from https://proceedings.mlr.press/v130/kom-samo21a.html.
7 | """
8 | import numpy as np
9 |
10 | from kxy.misc.tf import CopulaLearner
11 |
12 | def copula_entropy(z, subsets=[]):
13 | '''
14 | Estimate the entropy of the copula distribution of a d-dimensional random vector using MIND ([1]) with Spearman rank correlation constraints.
15 |
16 |
17 | Parameters
18 | ----------
19 | z : np.array
20 | Array whose rows are samples of the d-dimensional random vector and whose columns are its coordinates.
21 |
22 |
23 | Returns
24 | -------
25 | ent : float
26 | The estimated copula entropy.
27 | '''
28 | if len(z.shape)==1 or z.shape[1]==1:
29 | return 0.0
30 |
31 | d = z.shape[1]
32 | cl = CopulaLearner(d, subsets=subsets)
33 | cl.fit(z)
34 | ent = min(cl.copula_entropy, 0.0)
35 |
36 | return ent
37 |
38 |
39 |
40 | def mutual_information(y, x):
41 | '''
42 | Estimate the mutual information between two random vectors using MIND ([1]) with Spearman rank correlation constraints.
43 |
44 |
45 | Parameters
46 | ----------
47 | y : np.array
48 | Array whose rows are samples of the first random vector and whose columns are its coordinates.
49 | x : np.array
50 | Array whose rows are samples of the second random vector and whose columns are its coordinates.
51 |
52 |
53 | Returns
54 | -------
55 | mi : float
56 | The estimated mutual information.
57 | '''
58 | y = y[:, None] if len(y.shape)==1 else y
59 | x = x[:, None] if len(x.shape)==1 else x
60 | z = np.concatenate([y, x], axis=1)
61 | huy = copula_entropy(y)
62 | hux = copula_entropy(x)
63 | huz = copula_entropy(z)
64 | mi = max(huy+hux-huz, 0.0)
65 |
66 | return mi
67 |
68 |
69 | def run_d_dimensional_gaussian_experiment(d, rho, n=1000):
70 | '''
71 | Compare the MIND estimate of I(y; x) to its closed-form value for d i.i.d. bivariate Gaussian pairs with correlation rho, using n samples.'''
72 | # Cholesky decomposition of corr = np.array([[1., rho], [rho, 1.]])
73 | L = np.array([[1., 0.], [rho, np.sqrt(1.-rho*rho)]])
74 | y = np.empty((n, d))
75 | x = np.empty((n, d))
76 | for i in range(d):
77 | u = np.random.randn(n, 2)
78 | z = np.dot(L, u.T).T
79 | y[:, i] = z[:, 0].copy()
80 | x[:, i] = z[:, 1].copy()
81 |
82 | estimated_mi = mutual_information(y, x)
83 | true_mi = -d*0.5*np.log(1.-rho*rho)
84 |
85 | return estimated_mi, true_mi
86 |
87 |
88 |
89 | if __name__ == '__main__':
90 | rho = 0.95
91 | d = 20
92 | estimated_mi, true_mi = run_d_dimensional_gaussian_experiment(d, rho)
93 | print('%dd Gaussian Mutual Information: Estimated %.4f, True (theoretical) %.4f' % (\
94 | d, estimated_mi, true_mi))
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/kxy/misc/naive.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import logging
5 | from time import time
6 | import numpy as np
7 |
8 | class NaiveLearner(object):
9 | """
10 | Baseline learner that fits a model using all the features provided,
11 | without performing any feature selection. Useful as a point of
12 | comparison when benchmarking feature selection algorithms.
13 | """
14 | def __init__(self, learner_func, path=None):
15 | """
16 | Constructor.
17 |
18 | Parameters
19 | ----------
20 | learner_func : func | callable
21 | Function or callable that expects one optional argument :code:`n_vars` and returns an instance of a supervised learner (regressor or classifier) following the scikit-learn convention, and expecting :code:`n_vars` features.
22 |
23 | Specifically, the learner should have a :code:`fit(x_train, y_train)` method. The learner should also have a :code:`feature_importances_` property or attribute, which is an array or a list containing feature importances once the model has been trained.
24 |
25 | There should be as many importance scores in :code:`feature_importances_` as columns in :code:`x_train`.
26 |
27 | """
28 | self.selected_variables = []
29 | self.learner_func = learner_func
30 | self.path = path
31 |
32 |
33 | def fit(self, x_df, y_df):
34 | """
35 | Fit the model without feature selection.
36 |
37 | Parameters
38 | ----------
39 | x_df : pd.DataFrame
40 | A dataframe containing all features.
41 | y_df : pd.DataFrame
42 | A dataframe containing the target.
43 |
44 | Attributes
45 | ----------
46 | selected_variables : list
47 | The list of features.
48 |
49 | Returns
50 | -------
51 | m : sklearn-like model (an instance returned by :code:`learner_func`)
52 | An instance returned by :code:`learner_func` trained with all features.
53 |
54 | """
55 | columns = [_ for _ in x_df.columns]
56 | y = y_df.values
57 | x = x_df[columns].values
58 | n_vars = len(columns)
59 | m = self.learner_func(n_vars=n_vars, path=self.path)
60 | m.fit(x, y)
61 | self.selected_variables = columns
62 |
63 | return m
64 |
65 |
66 |
--------------------------------------------------------------------------------
/kxy/misc/rfe.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import logging
5 | from time import time
6 | import numpy as np
7 |
8 | try:
9 | from tqdm import tqdm
10 | except:
11 | logging.warning('RFE requires tqdm, which does not seem to be installed.')
12 |
13 |
14 | class RFE(object):
15 | """
16 | Implementation of the Recursive Feature Elimination (RFE) feature selection algorithm.
17 |
18 | Reference:
19 | """
20 | def __init__(self, learner_func, path=None):
21 | """
22 | Constructor.
23 |
24 | Parameters
25 | ----------
26 | learner_func : func | callable
27 | Function or callable that expects one optional argument :code:`n_vars` and returns an instance of a supervised learner (regressor or classifier) following the scikit-learn convention, and expecting :code:`n_vars` features.
28 |
29 | Specifically, the learner should have a :code:`fit(x_train, y_train)` method. The learner should also have a :code:`feature_importances_` property or attribute, which is an array or a list containing feature importances once the model has been trained.
30 |
31 | There should be as many importance scores in :code:`feature_importances_` as columns in :code:`x_train`.
32 |
33 | """
34 | self.selected_variables = []
35 | self.learner_func = learner_func
36 | self.path = path
37 |
38 |
39 | def fit(self, x_df, y_df, n_vars, max_duration=None):
40 | """
41 | Performs a run of the Recursive Feature Elimination (RFE) feature selection algorithm.
42 |
43 | Starting with all features, we recursively train a learner, calculate all feature importance scores, remove the least important feature, and repeat until we are left with :code:`n_vars` features.
44 |
45 | Parameters
46 | ----------
47 | x_df : pd.DataFrame
48 | A dataframe containing all features.
49 | y_df : pd.DataFrame
50 | A dataframe containing the target.
51 | n_vars : int
52 | The number of features to keep.
53 | max_duration : float | None (default)
54 | If not None, then feature elimination will stop after this many seconds.
55 |
56 | Attributes
57 | ----------
58 | selected_variables : list
59 | The list of the :code:`n_vars` features we kept.
60 |
61 |
62 | Returns
63 | -------
64 | m : sklearn-like model (an instance returned by :code:`learner_func`)
65 | An instance returned by :code:`learner_func` trained with the :code:`n_vars` features we kept.
66 |
67 | """
68 | columns = [_ for _ in x_df.columns]
69 | y = y_df.values
70 |
71 | # Fit the model
72 | x = x_df[columns].values
73 | current_n_vars = len(columns)
74 | start_time = time()
75 | m = self.learner_func(n_vars=current_n_vars)
76 | m.fit(x, y)
77 | importances = [_ for _ in m.feature_importances_]
78 |
79 | n_rounds = max(current_n_vars-n_vars, 0)
80 | for _ in tqdm(range(n_rounds)):
81 | duration = time()-start_time
82 | if max_duration and duration > max_duration:
83 | logging.warning('We have exceeded the configured maximum duration %.2fs: exiting...' % max_duration)
84 | break
85 |
86 | # Remove the least important variable
87 | importances = [_ for _ in m.feature_importances_]
88 | least_important_ix = np.argmin(np.abs(importances))
89 | importances.pop(least_important_ix)
90 | least_important_feature = columns[least_important_ix]
91 | logging.info('Deleting feature %s' % least_important_feature)
92 | columns.remove(least_important_feature)
93 | current_n_vars = len(columns)
94 |
95 | # Re-fit the model
96 | x = x_df[columns].values
97 | m = self.learner_func(n_vars=current_n_vars, path=self.path)
98 | m.fit(x, y)
99 |
100 | self.selected_variables = [col for _, col in sorted(zip(importances, columns), reverse=True)]
101 |
102 | return m
103 |
104 |
105 |
--------------------------------------------------------------------------------
/kxy/misc/tf/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU Affero General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | try:
21 | from pkg_resources import parse_version
22 | import tensorflow as tf
23 | assert parse_version(tf.__version__) >= parse_version('2.4.1')
24 | except:
25 | import logging
26 | logging.warning('You need tensorflow version 2.4.1 or higher to estimate mutual information or copula entropy locally.')
27 |
28 | from .generators import *
29 | from .ops import *
30 | from .config import *
31 | from .initializers import *
32 | from .layers import *
33 | from .losses import *
34 | from .models import *
35 | from .learners import *
--------------------------------------------------------------------------------
/kxy/misc/tf/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Global default training configs
5 | """
6 | # LEARNING PARAMETERS
7 | LR = 0.005
8 | EPOCHS = 20
9 |
10 | # ADAM PARAMETERS
11 | BETA_1 = 0.9
12 | BETA_2 = 0.999
13 | EPSILON = 1e-04
14 | AMSGRAD = False
15 | BATCH_SIZE = 500
16 |
17 |
18 | def set_default_parameter(name, value):
19 | '''
20 | Utility function to change any of the training parameters above at runtime.
21 | '''
22 | # Parameters are stored as module-level globals, keyed by their upper-cased names.
23 | globals()[name.upper()] = value
24 | return
25 |
26 | def get_default_parameter(name):
27 | return globals()[name.upper()]
--------------------------------------------------------------------------------
/kxy/misc/tf/generators.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Custom Tensorflow generators.
5 | """
6 | import numpy as np
7 | import tensorflow as tf
8 | tf.keras.backend.set_floatx('float64')
9 | tf.config.threading.set_inter_op_parallelism_threads(2)
10 | tf.config.threading.set_intra_op_parallelism_threads(8)
11 | tf.config.set_soft_device_placement(True)
12 | from tensorflow.keras.utils import Sequence
13 |
14 | LOCAL_SEED = None
15 |
16 | def set_generators_seed(seed):
17 | globals()['LOCAL_SEED'] = seed
18 |
19 |
20 | rankdata = lambda x: 1.+np.argsort(np.argsort(x, axis=0), axis=0)
21 | class CopulaBatchGenerator(Sequence):
22 | '''
23 | Random batch generator of maximum-entropy copula learning.
24 | '''
25 | def __init__(self, z, batch_size=1000, steps_per_epoch=100):
26 | self.batch_size = batch_size
27 | self.d = z.shape[1]
28 | self.n = z.shape[0]
29 | self.z = z
30 | self.steps_per_epoch = steps_per_epoch
31 | self.emp_u = rankdata(self.z)/(self.n + 1.)
32 | self.emp_u[np.isnan(self.z)] = 0.5
33 | self.rnd_gen = np.random.default_rng(LOCAL_SEED)
34 |
35 | if self.n < 200*self.d:
36 | dn = 200*self.d - self.n
37 | selected_rows = self.rnd_gen.choice(self.n, dn, replace=True)
38 | emp_u = self.emp_u[selected_rows, :].copy()
39 | scale = 1./(100.*self.n)
40 | emp_u += (scale*self.rnd_gen.uniform(size=emp_u.shape) - 0.5*scale)
41 | self.emp_u = np.concatenate([self.emp_u, emp_u], axis=0)
42 | self.n = self.emp_u.shape[0]
43 |
44 | self.batch_selector = self.rnd_gen.choice(self.n, self.batch_size*self.steps_per_epoch, replace=True)
45 | self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size))
46 |
47 |
48 | def getitem_ndarray(self, idx):
49 | ''' '''
50 | i = idx % self.steps_per_epoch
51 | selected_rows = self.batch_selector[i]
52 | emp_u_ = self.emp_u[selected_rows, :]
53 | z_p = emp_u_.copy()
54 | z_q = self.rnd_gen.uniform(size=emp_u_.shape)
55 |
56 | z = np.empty((self.batch_size, self.d, 2))
57 | z[:, :, 0] = z_p
58 | z[:, :, 1] = z_q
59 | batch_x = z
60 | batch_y = np.ones((self.batch_size, 2)) # Not used
61 | return batch_x, batch_y
62 |
63 |
64 | def __getitem__(self, idx):
65 | ''' '''
66 | batch_x, batch_y = self.getitem_ndarray(idx)
67 | return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y)
68 |
69 |
70 | def __len__(self):
71 | return self.steps_per_epoch
72 |
73 |
74 |
75 | class PFSBatchGenerator(Sequence):
76 | '''
77 | Random batch generator.
78 | '''
79 | def __init__(self, x, y, ox=None, oy=None, batch_size=1000, steps_per_epoch=100, n_shuffle=5):
80 | self.rnd_gen = np.random.default_rng(LOCAL_SEED)
81 | assert x.shape[0] == y.shape[0]
82 | self.batch_size = batch_size
83 | self.n_shuffle = n_shuffle
84 | self.n = x.shape[0]
85 |
86 | x = x if len(x.shape) > 1 else x[:, None]
87 | y = y if len(y.shape) > 1 else y[:, None]
88 | ox = ox if ox is None or len(ox.shape) > 1 else ox[:, None]
89 | oy = oy if oy is None or len(oy.shape) > 1 else oy[:, None]
90 |
91 | self.x = x
92 | self.y = y
93 | self.ox = ox
94 | self.oy = oy
95 | self.z = np.concatenate([self.x, self.y, self.ox, self.oy], axis=1) if (not self.ox is None and not self.oy is None) else \
96 | np.concatenate([self.x, self.y, self.ox], axis=1) if (not self.ox is None) else \
97 | np.concatenate([self.x, self.y], axis=1)
98 | self.d = self.z.shape[1]
99 |
100 | self.steps_per_epoch = steps_per_epoch
101 | replace = False if self.n > self.batch_size*self.steps_per_epoch else True
102 | self.batch_selector = self.rnd_gen.choice(self.n, self.batch_size*self.steps_per_epoch, replace=replace)
103 | self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size))
104 |
105 |
106 | def getitem_ndarray(self, idx):
107 | ''' '''
108 | i = idx % self.steps_per_epoch
109 | selected_rows = self.batch_selector[i]
110 | x_ = self.x[selected_rows, :]
111 | y_ = self.y[selected_rows, :]
112 | z_ = self.z[selected_rows, :]
113 | if not self.ox is None:
114 | ox_ = self.ox[selected_rows, :]
115 | if not self.oy is None:
116 | oy_ = self.oy[selected_rows, :]
117 |
118 | z_p = None
119 | z_q = None
120 | for _ in range(self.n_shuffle):
121 | z_p = z_.copy() if z_p is None else np.concatenate([z_p, z_.copy()], axis=0)
122 | y_q = y_.copy()
123 | randomize = np.arange(y_q.shape[0])
124 | self.rnd_gen.shuffle(randomize)
125 | y_q = y_q[randomize]
126 | if not self.oy is None:
127 | oy_q = oy_.copy()
128 | oy_q = oy_q[randomize]
129 | z_q_ = np.concatenate([x_, y_q.copy(), ox_, oy_q], axis=1) if (not self.ox is None and not self.oy is None) else \
130 | np.concatenate([x_, y_q.copy(), ox_], axis=1) if not self.ox is None else \
131 | np.concatenate([x_, y_q.copy()], axis=1)
132 | z_q = z_q_.copy() if z_q is None else np.concatenate([z_q, z_q_.copy()], axis=0)
133 |
134 | z = np.empty((self.batch_size*self.n_shuffle, self.d, 2))
135 | z[:, :, 0] = z_p
136 | z[:, :, 1] = z_q
137 | batch_x = z
138 | batch_y = np.ones((self.batch_size*self.n_shuffle, 2)) # Not used
139 | return batch_x, batch_y
140 |
141 |
142 | def __getitem__(self, idx):
143 | ''' '''
144 | batch_x, batch_y = self.getitem_ndarray(idx)
145 | return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y)
146 |
147 | def __len__(self):
148 | return self.steps_per_epoch
149 |
150 |
151 |
152 |
153 |
--------------------------------------------------------------------------------
/kxy/misc/tf/initializers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Custom Tensorflow initializers.
5 | """
6 | import logging
7 |
8 | from tensorflow.keras.initializers import GlorotUniform
9 |
10 | LOCAL_SEED = None
11 | INITIALIZER_COUNT = 0
12 |
13 | def frozen_glorot_uniform():
14 | '''
15 | Deterministic GlorotUniform initializer.
16 | '''
17 | if LOCAL_SEED is not None:
18 | initializer = GlorotUniform(LOCAL_SEED+INITIALIZER_COUNT)
19 | globals()['INITIALIZER_COUNT'] = INITIALIZER_COUNT + 1
20 | return initializer
21 | else:
22 | return GlorotUniform()
23 |
24 | def set_initializers_seed(seed):
25 | globals()['LOCAL_SEED'] = seed
--------------------------------------------------------------------------------
/kxy/misc/tf/layers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Custom tensorflow layers.
5 | """
6 | import tensorflow as tf
7 | tf.keras.backend.set_floatx('float64')
8 | tf.config.threading.set_inter_op_parallelism_threads(2)
9 | tf.config.threading.set_intra_op_parallelism_threads(8)
10 | tf.config.set_soft_device_placement(True)
11 | from tensorflow.keras.layers import Layer
12 |
13 |
14 | class InitializableDense(Layer):
15 | '''
16 | Dense layer whose weight matrix (and optional bias) can be initialized to user-supplied values.'''
17 | def __init__(self, units, initial_w=None, initial_b=None, bias=False):
18 | '''
19 | initial_w should be None or a 2D numpy array.
20 | initial_b should be None or a 1D numpy array.
21 | '''
22 | super(InitializableDense, self).__init__()
23 | self.units = units
24 | self.with_bias = bias
25 | self.w_initializer = 'zeros' if initial_w is None else tf.constant_initializer(initial_w)
26 |
27 | if self.with_bias:
28 | self.b_initializer = 'zeros' if initial_b is None else tf.constant_initializer(initial_b)
29 |
30 |
31 | def build(self, input_shape):
32 | ''' '''
33 | self.w = self.add_weight(shape=(input_shape[-1], self.units), \
34 | initializer=self.w_initializer, trainable=True, name='quad_w')
35 |
36 | if self.with_bias:
37 | self.b = self.add_weight(shape=(self.units,), \
38 | initializer=self.b_initializer, trainable=True, name='quad_b')
39 |
40 |
41 | def call(self, inputs):
42 | ''' '''
43 | return tf.matmul(inputs, self.w)+self.b if self.with_bias else tf.matmul(inputs, self.w)
44 |
--------------------------------------------------------------------------------
/kxy/misc/tf/losses.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Custom Tensorflow losses.
5 | """
6 | from multiprocessing import Pool, cpu_count
7 | import numpy as np
8 |
9 | import tensorflow as tf
10 | tf.keras.backend.set_floatx('float64')
11 | tf.config.threading.set_inter_op_parallelism_threads(2)
12 | tf.config.threading.set_intra_op_parallelism_threads(8)
13 | tf.config.set_soft_device_placement(True)
14 | from tensorflow.python.ops import math_ops
15 | from tensorflow.keras.losses import Loss
16 |
17 | from .ops import rectified_exp, d_rectified_exp
18 |
19 |
20 | class MINDLoss(Loss):
21 | '''
22 | MIND loss function: :math:`-E_P\\left[T(x, y)^T\\theta\\right] + \\log E_Q\\left[e^{T(x, y)^T\\theta}\\right]`.
23 | '''
24 | def call(self, y_true, y_pred):
25 | ''' '''
26 | p_samples = y_pred[:, 0]
27 | q_samples = y_pred[:, 1]
28 | mi = -tf.reduce_mean(p_samples) + math_ops.log(tf.reduce_mean(math_ops.exp(q_samples)))
29 | return mi
30 |
31 |
32 | class ApproximateMINDLoss(Loss):
33 | '''
34 | MIND loss function with a gentler version of the exponential: :math:`-E_P\\left[T(x, y)^T\\theta\\right] + \\log E_Q\\left[r\\_exp\\left(T(x, y)^T\\theta\\right)\\right]`, where :math:`r\\_exp(t) = e^t` for :math:`t \\leq 0` and :math:`r\\_exp(t) = 1+t+(1/2)t^2+(1/6)t^3` otherwise.
35 | '''
36 | def call(self, y_true, y_pred):
37 | ''' '''
38 | p_samples = y_pred[:, 0]
39 | q_samples = y_pred[:, 1]
40 | mi = -tf.reduce_mean(p_samples) + math_ops.log(tf.reduce_mean(rectified_exp(q_samples)))
41 | return mi
42 |
43 |
44 | class RectifiedMINDLoss(Loss):
45 | '''
46 | Rectified-MIND loss function: :math:`-E_P\\left[\\log dr\\_exp\\left(T(x, y)^T\\theta\\right)\\right] + \\log E_Q\\left[dr\\_exp\\left(T(x, y)^T\\theta\\right)\\right]`, where :math:`dr\\_exp(t) = e^t` for :math:`t \\leq 0` and :math:`dr\\_exp(t) = 1+t+(1/2)t^2` otherwise.
47 | '''
48 | def call(self, y_true, y_pred):
49 | ''' '''
50 | p_samples = y_pred[:, 0]
51 | q_samples = y_pred[:, 1]
52 | mi = -tf.reduce_mean(math_ops.log(d_rectified_exp(p_samples))) + math_ops.log(tf.reduce_mean(d_rectified_exp(q_samples)))
53 | return mi
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/kxy/misc/tf/ops.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Custom math operations.
5 | """
6 | from multiprocessing import Pool, cpu_count
7 | import numpy as np
8 |
9 | import tensorflow as tf
10 | tf.keras.backend.set_floatx('float64')
11 | tf.config.threading.set_inter_op_parallelism_threads(2)
12 | tf.config.threading.set_intra_op_parallelism_threads(8)
13 | tf.config.set_soft_device_placement(True)
14 | from tensorflow.python.ops import math_ops
15 |
16 | def rectified_exp(t):
17 | '''
18 | :math:`r\\_exp(t) = e^t` if :math:`t \\leq 0` and :math:`r\\_exp(t) = 1+t+(1/2)t^2+(1/6)t^3` otherwise.
19 | '''
20 | exp = math_ops.exp(t)
21 | approx_exp = 1.+t+(1./2.)*tf.math.pow(t, 2.)+(1./6.)*tf.math.pow(t, 3.)
22 | condition = tf.greater(t, 0.0)
23 | r_exp = tf.where(condition, x=approx_exp, y=exp)
24 | return r_exp
25 |
26 |
27 | def d_rectified_exp(t):
28 | '''
29 | :math:`dr\\_exp(t) = e^t` if :math:`t \\leq 0` and :math:`dr\\_exp(t) = 1+t+(1/2)t^2` otherwise.
30 | '''
31 | dexp = math_ops.exp(t)
32 | approx_dexp = 1.+t+(1./2.)*tf.math.pow(t, 2.)
33 | condition = tf.greater(t, 0.0)
34 | dr_exp = tf.where(condition, x=approx_dexp, y=dexp)
35 | return dr_exp
--------------------------------------------------------------------------------
/kxy/pandas_extension/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU Affero General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | from .finance_accessor import *
21 | from .features_accessor import *
22 | from .learning_accessor import *
23 | from .post_learning_accessor import *
24 | from .pre_learning_accessor import *
25 | from .accessor import *
--------------------------------------------------------------------------------
/kxy/pandas_extension/accessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | We define a custom :code:`kxy` `pandas accessor <https://pandas.pydata.org/pandas-docs/stable/development/extending.html>`_ below,
6 | namely the class :code:`Accessor`, that extends the pandas DataFrame class with all our analyses, thereby allowing data scientists to tap into
7 | the power of the :code:`kxy` toolkit within the comfort of their favorite data structure.
8 |
9 | All methods defined in the :code:`Accessor` class are accessible from any DataFrame instance as :code:`df.kxy.<method_name>`, so long as the :code:`kxy` python
10 | package is imported alongside :code:`pandas`.
11 | """
12 |
13 |
14 | import pandas as pd
15 |
16 | from .features_accessor import FeaturesAccessor
17 | from .finance_accessor import FinanceAccessor
18 | from .learning_accessor import LearningAccessor
19 | from .post_learning_accessor import PostLearningAccessor
20 | from .pre_learning_accessor import PreLearningAccessor
21 |
22 |
23 | @pd.api.extensions.register_dataframe_accessor("kxy")
24 | class Accessor(PreLearningAccessor, LearningAccessor, PostLearningAccessor, FinanceAccessor, FeaturesAccessor):
25 | """
26 | Extension of the pandas.DataFrame class with the full capabilities of the :code:`kxy` platform.
27 | """
28 | pass
--------------------------------------------------------------------------------
/kxy/pandas_extension/base_accessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import logging
4 | import hashlib
5 | import numpy as np
6 | from scipy.stats import norm
7 | import pandas as pd
8 |
9 | try:
10 | get_ipython().__class__.__name__
11 | from halo import HaloNotebook as Halo
12 | except:
13 | from halo import Halo
14 |
15 |
16 | class BaseAccessor(object):
17 | """
18 | Base class inherited by our custom accessors.
19 | """
20 | def __init__(self, pandas_obj):
21 | self._obj = pandas_obj
22 |
23 |
24 | def check_problem_type(self, problem_type, target_column):
25 | if problem_type == 'regression':
26 | try:
27 | y = self._obj[target_column].astype(float)
28 | except:
29 | raise ValueError('You specified regression as problem_type but the target column is not numeric')
30 |
31 |
32 | def is_discrete(self, column):
33 | """
34 | Determine whether the input column contains discrete (i.e. as opposed to continuous) observations.
35 | """
36 | if self.is_categorical(column):
37 | return True
38 |
39 | n = self._obj.shape[0]
40 | values, counts = np.unique(self._obj[column].values, return_counts=True)
41 | unique_n = len(values)
42 |
43 | if unique_n < 0.05*n:
44 | return True
45 |
46 | counts = np.array(list(sorted(counts)))
47 | if np.sum(counts[-10:]) > 0.8*n:
48 | return True
49 |
50 | return False
51 |
52 |
53 | def is_categorical(self, column):
54 | """
55 | Determine whether the input column contains categorical (i.e. non-ordinal) observations.
56 | """
57 | if self._obj[column].dtype in [float, int, np.float32, np.float64, np.int32, np.int64]:
58 | return False
59 |
60 | try:
61 | casted = self._obj[column].values.astype(float)
62 | return False
63 | except:
64 | return True
65 |
66 |
67 | @property
68 | def is_too_large(self):
69 | return self._obj.memory_usage(index=False).sum()/(1024.0*1024.0*1024.0) > 1.5
70 |
71 |
72 | def describe(self,):
73 | for col in sorted(self._obj.columns):
74 | print(' ')
75 | print('---------' + '-'.join(['' for c in str(col)]))
76 | print('Column: %s' % col)
77 | print('---------' + '-'.join(['' for c in str(col)]))
78 | if self._obj.kxy.is_categorical(col):
79 | print('Type: Categorical')
80 | labels, counts = np.unique(self._obj[col].values.astype(str), return_counts=True)
81 | labels_with_counts = [(labels[i], 100.*counts[i]/self._obj.shape[0]) \
82 | for i in range(len(labels))]
83 | labels_with_counts = sorted(labels_with_counts, key=lambda x: -x[1])
84 | tot = 0.0
85 | for label, freq in labels_with_counts:
86 | print('Frequency: %s%%, Label: %s' % (('%.2f' % freq).rjust(5, ' '), label))
87 | tot += freq
88 | if tot > 90. and tot < 100.:
89 | print('Other Labels: %.2f%%' % (100.-tot))
90 | break
91 | else:
92 | if self._obj[col].isna().all():
93 | raise ValueError('Column %s only contains NaN' % col)
94 |
95 | m = self._obj[col].min(skipna=True)
96 | M = self._obj[col].max(skipna=True)
97 | mn = self._obj[col].mean(skipna=True)
98 | q50 = self._obj[col].median(skipna=True)
99 | q25 = self._obj[col].quantile(0.25)
100 | q75 = self._obj[col].quantile(0.75)
101 |
102 | print('Type: Continuous')
103 | print('Max: %s' % ('%.1f' % M if M < 10. else '{:,}'.format(int(M))))
104 | print('p75: %s' % ('%.1f' % q75 if q75 < 10. else '{:,}'.format(int(q75))))
105 | print('Mean: %s' % ('%.1f' % mn if mn < 10. else '{:,}'.format(int(mn))))
106 | print('Median: %s' % ('%.1f' % q50 if q50 < 10. else '{:,}'.format(int(q50))))
107 | print('p25: %s' % ('%.1f' % q25 if q25 < 10. else '{:,}'.format(int(q25))))
108 | print('Min: %s' % ('%.1f' % m if m < 10. else '{:,}'.format(int(m))))
109 |
110 |
111 | def anonymize(self, columns_to_exclude=[]):
112 | """
113 | Anonymize the dataframe in a manner that leaves all pre-learning and post-learning analyses (including data valuation, variable selection, model-driven improvability, data-driven improvability and model explanation) invariant.
114 |
115 | Any transformation on continuous variables that preserves ranks will not change our pre-learning and post-learning analyses. The same holds for any 1-to-1 transformation on categorical variables.
116 |
117 | This implementation replaces ordinal values (i.e. any column that can be cast as a float) with their within-column Gaussian score. For each non-ordinal column, we form the set of all possible values, we assign a unique integer index to each value in the set, and we systematically replace said value appearing in the dataframe by the hexadecimal code of its associated integer index.
118 |
119 | For regression problems, accurate estimation of RMSE-related metrics requires the target column (and the prediction column for post-learning analyses) not to be anonymized.
120 |
121 |
122 | Parameters
123 | ----------
124 | columns_to_exclude: list (optional)
125 | List of columns not to anonymize (e.g. target and prediction columns for regression problems).
126 |
127 |
128 | Returns
129 | -------
130 | result : pandas.DataFrame
131 | The anonymized dataframe.
132 | """
133 | spinner = Halo(text='Preparing data upload', spinner='dots')
134 | spinner.start()
135 | df = self._obj.copy()
136 | for col in df.columns:
137 | if col in columns_to_exclude:
138 | continue
139 |
140 | if df.kxy.is_categorical(col) or df[col].dtype.name == 'category':
141 | # Note: By using 'category' as dtype you are implicitly telling us that the 'natural'
142 | # order of values does not matter.
143 | unique_values = list(sorted(set(list(df[col].values))))
144 | mapping = {unique_values[i]: "0x{:03x}".format(i) for i in range(len(unique_values))}
145 | df[col] = df[col].apply(lambda x: mapping.get(x))
146 | else:
147 | # Note: Any monotonic transformation applied to any continuous column would work.
148 | # The gaussian scoring below makes no assumption on marginals whatsoever.
149 | x = df[col].values.astype(float)
150 | x = x - np.nanmean(x)
151 | s = np.nanstd(x)
152 | if s > 0.0:
153 | x = x/s
154 | x = norm.cdf(x)
155 | df[col] = np.around(x.copy(), 3)
156 | spinner.succeed()
157 |
158 | return df
159 |
160 |
161 |
162 | def __hash__(self):
163 | return int(hashlib.sha256(self._obj.to_string().encode()).hexdigest(), 16)
164 |
165 |
166 |
167 |
168 |
--------------------------------------------------------------------------------
/kxy/pandas_extension/features_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import numpy as np
4 | from scipy.stats import kurtosis, skew
5 |
6 | def nanskew(a, axis=0, bias=True):
7 | ''' '''
8 | return skew(a, axis=axis, bias=bias, nan_policy='omit')
9 |
10 | def nankurtosis(a, axis=0, fisher=True, bias=True):
11 | ''' '''
12 | return kurtosis(a, axis=axis, bias=bias, nan_policy='omit')
13 |
14 | def nanmin(a, axis=None, out=None):
15 | ''' '''
16 | try:
17 | return np.nanmin(a, axis=axis, out=out)
18 | except:
19 | return np.nan
20 |
21 | def nanmax(a, axis=None, out=None):
22 | ''' '''
23 | try:
24 | return np.nanmax(a, axis=axis, out=out)
25 | except:
26 | return np.nan
27 |
28 | def nanmaxmmin(a, axis=None, out=None):
29 | ''' '''
30 | return nanmax(a, axis=axis, out=out)-nanmin(a, axis=axis, out=out)
31 |
32 | def nanmean(a, axis=None, out=None):
33 | ''' '''
34 | try:
35 | return np.nanmean(a, axis=axis, out=out)
36 | except:
37 | return np.nan
38 |
39 | def nansum(a, axis=None, out=None):
40 | ''' '''
41 | try:
42 | return np.nansum(a, axis=axis, out=out)
43 | except:
44 | return np.nan
45 |
46 |
47 | def nanstd(a, axis=None, dtype=None, out=None):
48 | ''' '''
49 | try:
50 | return np.nanstd(a, axis=axis, out=out)
51 | except:
52 | return np.nan
53 |
54 | def nanmedian(a, axis=None, out=None, overwrite_input=False):
55 | ''' '''
56 | try:
57 | return np.nanmedian(a, axis=axis, out=out, overwrite_input=overwrite_input)
58 | except:
59 | return np.nan
60 |
61 | def q25(x):
62 | ''' '''
63 | return x.quantile(0.25)
64 |
65 | def q75(x):
66 | ''' '''
67 | return x.quantile(0.75)
68 |
69 | def nanskewabs(a, axis=0, bias=True):
70 | ''' '''
71 | return skew(np.abs(a), axis=axis, bias=bias, nan_policy='omit')
72 |
73 | def nankurtosisabs(a, axis=0, fisher=True, bias=True):
74 | ''' '''
75 | return kurtosis(np.abs(a), axis=axis, bias=bias, nan_policy='omit')
76 |
77 | def nanminabs(a, axis=None, out=None):
78 | ''' '''
79 | try:
80 | return np.nanmin(np.abs(a), axis=axis, out=out)
81 | except:
82 | return np.nan
83 |
84 | def nanmaxabs(a, axis=None, out=None):
85 | ''' '''
86 | try:
87 | return np.nanmax(np.abs(a), axis=axis, out=out)
88 | except:
89 | return np.nan
90 |
91 | def nanmaxmminabs(a, axis=None, out=None):
92 | ''' '''
93 | return nanmax(np.abs(a), axis=axis, out=out)-nanmin(np.abs(a), axis=axis, out=out)
94 |
95 | def nanmeanabs(a, axis=None, out=None):
96 | ''' '''
97 | try:
98 | return np.nanmean(np.abs(a), axis=axis, out=out)
99 | except:
100 | return np.nan
101 |
102 | def nansumabs(a, axis=None, out=None):
103 | ''' '''
104 | try:
105 | return np.nansum(np.abs(a), axis=axis, out=out)
106 | except:
107 | return np.nan
108 |
109 | def nanstdabs(a, axis=None, dtype=None, out=None):
110 | ''' '''
111 | try:
112 | return np.nanstd(np.abs(a), axis=axis, out=out)
113 | except:
114 | return np.nan
115 |
116 | def nanmedianabs(a, axis=None, out=None, overwrite_input=False):
117 | ''' '''
118 | try:
119 | return np.nanmedian(np.abs(a), axis=axis, out=out, overwrite_input=overwrite_input)
120 | except:
121 | return np.nan
122 |
123 | def q25abs(x):
124 | ''' '''
125 | return np.abs(x).quantile(0.25)
126 |
127 | def q75abs(x):
128 | ''' '''
129 | return np.abs(x).quantile(0.75)
130 |
131 | def n_unique(x):
132 | ''' '''
133 | vc = x.value_counts(normalize=True, sort=True, ascending=False)
134 | return len(vc.index)
135 |
136 | def mode(x):
137 | ''' '''
138 | vc = x.value_counts(normalize=True, sort=True, ascending=False)
139 | return vc.index[0] if len(vc.index) > 0 else np.nan
140 |
141 | def modefreq(x):
142 | ''' '''
143 | vc = x.value_counts(normalize=True, sort=True, ascending=False)
144 | return vc.values[0] if len(vc.index) > 0 else np.nan
145 |
146 | def lastmode(x):
147 | ''' '''
148 | vc = x.value_counts(normalize=True, sort=True, ascending=False)
149 | return vc.index[-1] if len(vc.index) > 0 else np.nan
150 |
151 | def lastmodefreq(x):
152 | ''' '''
153 | vc = x.value_counts(normalize=True, sort=True, ascending=False)
154 | return vc.values[-1] if len(vc.index) > 0 else np.nan
155 |
156 | def nextmode(x):
157 | ''' '''
158 | vc = x.value_counts(normalize=True, sort=True, ascending=False)
159 | return vc.index[1] if len(vc.index) > 1 else vc.index[0] if len(vc.index) > 0 else np.nan
160 |
161 | def nextmodefreq(x):
162 | ''' '''
163 | vc = x.value_counts(normalize=True, sort=True, ascending=False)
164 | return vc.values[1] if len(vc.values) > 1 else vc.values[0] if len(vc.index) > 0 else np.nan
165 |
166 | def rmspe_score(y_true, y_pred):
167 | ''' '''
168 | return np.sqrt(np.nanmean(np.square((y_true.flatten() - y_pred.flatten()) / y_true.flatten())))
169 |
170 | def neg_rmspe_score(y_true, y_pred):
171 | ''' '''
172 | return -rmspe_score(y_true, y_pred)
173 |
174 |
175 | def neg_mae_score(y_true, y_pred):
176 | ''' '''
177 | return -np.nanmean(np.abs(y_true.flatten()-y_pred.flatten()))
178 |
179 |
180 | def neg_rmse_score(y_true, y_pred):
181 | ''' '''
182 | return -np.sqrt(np.nanmean((y_true.flatten()-y_pred.flatten())**2))
183 |
184 |
185 |
--------------------------------------------------------------------------------
/kxy/pandas_extension/finance_accessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import numpy as np
4 | import pandas as pd
5 |
6 | from kxy.finance import information_adjusted_correlation as ia_corr
7 |
8 | from .base_accessor import BaseAccessor
9 |
10 | @pd.api.extensions.register_dataframe_accessor("kxy_finance")
11 | class FinanceAccessor(BaseAccessor):
12 | """
13 | Extension of the pandas.DataFrame class with various finance-specific analytics.
14 |
15 | This class defines the :code:`kxy_finance` `pandas accessor <https://pandas.pydata.org/pandas-docs/stable/development/extending.html>`_.
16 |
17 | All the methods it defines are accessible from any DataFrame instance as :code:`df.kxy_finance.<method_name>`, so long as the :code:`kxy` python package is imported alongside :code:`pandas`.
18 | """
19 | def information_adjusted_beta(self, market_column, asset_column, anonymize=False):
20 | """
21 | Estimate the information-adjusted beta of an asset return :math:`r` relative to the market return :math:`r_m`: :math:`\\text{IA-}\\beta := \\text{IA-Corr}\\left(r, r_m \\right) \\sqrt{\\frac{\\text{Var}(r)}{\\text{Var}(r_m)}}`,
22 | where :math:`\\text{IA-Corr}\\left(r, r_m \\right) := \\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right) \\left[1 - e^{-2I(r, r_m)} \\right]` denotes the information-adjusted correlation coefficient, with :math:`\\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right)` the sign of the Pearson correlation coefficient.
23 |
24 | Unlike the traditional beta coefficient, namely :math:`\\beta := \\text{Corr}\\left(r, r_m \\right) \\sqrt{\\frac{\\text{Var}(r)}{\\text{Var}(r_m)}}`, that only captures linear relations between market and asset returns, and that is 0 if and only if the two are **decorrelated**, :math:`\\text{IA-}\\beta` captures any relationship between asset return and market return, linear or nonlinear, and is 0 if and only if the two variables are **statistically independent**.
25 |
26 | Parameters
27 | ----------
28 | market_column : str
29 | The name of the column containing market returns.
30 | asset_column : str
31 | The name of the column containing asset returns.
32 | anonymize : bool
33 | When set to true, your explanatory variables will never be shared with KXY (at no performance cost).
34 |
35 |
36 | Returns
37 | -------
38 | result : float
39 | The information-adjusted beta coefficient.
40 |
41 | """
42 | assert market_column in self._obj.columns, 'The market column should be a column'
43 | assert asset_column in self._obj.columns, 'The asset column should be a column'
44 |
45 | m_std = np.nanstd(self._obj[market_column].values)
46 | a_std = np.nanstd(self._obj[asset_column].values)
47 |
48 | return self.information_adjusted_correlation(market_column, asset_column, anonymize=anonymize)*a_std/m_std
49 |
50 |
51 |
52 | def information_adjusted_correlation(self, market_column, asset_column, anonymize=False):
53 | """
54 | Estimate the information-adjusted correlation between an asset return :math:`r` and the market return :math:`r_m`: :math:`\\text{IA-Corr}\\left(r, r_m \\right) := \\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right) \\left[1 - e^{-2I(r, r_m)} \\right]`, where :math:`\\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right)` is the sign of the Pearson correlation coefficient.
55 |
56 | Unlike Pearson's correlation coefficient, which is 0 if and only if asset return and market return are **decorrelated** (i.e. they exhibit no linear relation), information-adjusted correlation is 0 if and only if market and asset returns are **statistically independent** (i.e. they exhibit no relation, linear or nonlinear).
57 |
58 |
59 | Parameters
60 | ----------
61 | market_column : str
62 | The name of the column containing market returns.
63 | asset_column : str
64 | The name of the column containing asset returns.
65 | anonymize : bool
66 | When set to true, your explanatory variables will never be shared with KXY (at no performance cost).
67 |
68 |
69 | Returns
70 | -------
71 | result : float
72 | The information-adjusted correlation.
73 |
74 | """
75 | assert market_column in self._obj.columns, 'The market column should be a column of the dataframe.'
76 | assert asset_column in self._obj.columns, 'The asset column should be a column of the dataframe.'
77 |
78 | _obj = self.anonymize(columns_to_exclude=[]) if anonymize else self._obj
79 |
80 | return ia_corr(_obj, market_column, asset_column)
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
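A minimal usage sketch for this accessor (the column names and random data below are illustrative, and a configured KXY API key is assumed):

    import numpy as np
    import pandas as pd
    import kxy  # importing kxy registers the pandas accessors

    returns = pd.DataFrame(np.random.randn(1000, 2), columns=['market', 'asset'])

    # Nonlinear analogue of Pearson's correlation between the two return series.
    ia_corr = returns.kxy_finance.information_adjusted_correlation('market', 'asset')

    # Nonlinear analogue of the CAPM beta of the asset relative to the market.
    ia_beta = returns.kxy_finance.information_adjusted_beta('market', 'asset')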
--------------------------------------------------------------------------------
/kxy/pandas_extension/pre_learning_accessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import pandas as pd
4 |
5 | from kxy.pre_learning import data_valuation as dv
6 | from kxy.pre_learning import variable_selection as vs
7 |
8 | from .base_accessor import BaseAccessor
9 |
10 | @pd.api.extensions.register_dataframe_accessor("kxy_pre_learning")
11 | class PreLearningAccessor(BaseAccessor):
12 | """
13 | Extension of the pandas.DataFrame class with various analytics for **pre-learning** in supervised learning problems.
14 |
15 | This class defines the :code:`kxy_pre_learning` pandas accessor.
16 |
17 | All its methods are accessible from any DataFrame instance as :code:`df.kxy_pre_learning.<method_name>`, so long as the :code:`kxy` python package is imported alongside :code:`pandas`.
18 | """
19 | def data_valuation(self, target_column, problem_type=None, anonymize=None, snr='auto', include_mutual_information=False, file_name=None):
20 | """
21 | Estimate the highest performance metrics achievable when predicting the :code:`target_column` using all other columns.
22 |
23 | When :code:`problem_type=None`, the nature of the supervised learning problem (i.e. regression or classification) is inferred from whether or not :code:`target_column` is categorical.
24 |
25 |
26 | Parameters
27 | ----------
28 | target_column : str
29 | The name of the column containing true labels.
30 | problem_type : None | 'classification' | 'regression'
31 | The type of supervised learning problem. When None, it is inferred from the column type and the number of distinct values.
32 | anonymize : None | bool
33 | When set to true, your explanatory variables will never be shared with KXY (at no performance cost). When set to None (the default), your data will be anonymized when it is too big.
34 | include_mutual_information : bool
35 | Whether to include the mutual information between target and explanatory variables in the result.
36 |
37 |
38 | Returns
39 | -------
40 | achievable_performance : pandas.DataFrame
41 | The result is a pandas.DataFrame with columns (where applicable):
42 |
43 | * :code:`'Achievable Accuracy'`: The highest classification accuracy that can be achieved by a model using provided inputs to predict the label.
44 | * :code:`'Achievable R^2'`: The highest :math:`R^2` that can be achieved by a model using provided inputs to predict the label.
45 | * :code:`'Achievable RMSE'`: The lowest Root Mean Square Error that can be achieved by a model using provided inputs to predict the label.
46 | * :code:`'Achievable Log-Likelihood Per Sample'`: The highest true log-likelihood per sample that can be achieved by a model using provided inputs to predict the label.
47 |
48 |
49 |
50 | .. admonition:: Theoretical Foundation
51 |
52 | Section :ref:`1 - Achievable Performance`.
53 |
54 |
55 | .. seealso::
56 |
57 | :ref:`kxy.pre_learning.achievable_performance.data_valuation <data-valuation>`
58 |
59 | """
60 | assert target_column in self._obj.columns, 'The target_column should be a column of the dataframe.'
61 | if problem_type is None:
62 | problem_type = 'classification' if self.is_discrete(target_column) else 'regression'
63 | self.check_problem_type(problem_type, target_column)
64 |
65 | _obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj
66 |
67 | return dv(_obj, target_column, problem_type, snr=snr, include_mutual_information=include_mutual_information, \
68 | file_name=file_name)
69 |
70 |
71 | def variable_selection(self, target_column, problem_type=None, anonymize=None, snr='auto', file_name=None):
72 | """
73 | Runs the model-free variable selection analysis.
74 |
75 | When :code:`problem_type=None`, the nature of the supervised learning problem (i.e. regression or classification) is inferred from whether or not :code:`target_column` is categorical.
76 |
77 |
78 | Parameters
79 | ----------
80 | target_column : str
81 | The name of the column containing true labels.
82 | problem_type : None | 'classification' | 'regression'
83 | The type of supervised learning problem. When None, it is inferred from the column type and the number of distinct values.
84 | anonymize : None | bool
85 | When set to true, your explanatory variables will never be shared with KXY (at no performance cost). When set to None (the default), your data will be anonymized when it is too big.
86 |
87 | Returns
88 | -------
89 | result : pandas.DataFrame
90 | The result is a pandas.DataFrame with columns (where applicable):
91 |
92 | * :code:`'Selection Order'`: The order in which the associated variable was selected, starting at 1 for the most important variable.
93 | * :code:`'Variable'`: The column name corresponding to the input variable.
94 | * :code:`'Running Achievable R^2'`: The highest :math:`R^2` that can be achieved by a regression model using all variables selected so far, including this one.
95 | * :code:`'Running Achievable Accuracy'`: The highest classification accuracy that can be achieved by a classification model using all variables selected so far, including this one.
96 | * :code:`'Running Achievable RMSE'`: The lowest Root Mean Square Error that can be achieved by a regression model using all variables selected so far, including this one.
97 |
98 |
99 | .. admonition:: Theoretical Foundation
100 |
101 | Section :ref:`2 - Variable Selection Analysis`.
102 |
103 | .. seealso::
104 |
105 | :ref:`kxy.pre_learning.variable_selection.variable_selection <variable-selection>`
106 | """
107 | assert target_column in self._obj.columns, 'The target_column should be a column of the dataframe.'
108 | if problem_type is None:
109 | problem_type = 'classification' if self.is_discrete(target_column) else 'regression'
110 | self.check_problem_type(problem_type, target_column)
111 |
112 | _obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj
113 |
114 | return vs(_obj, target_column, problem_type, snr=snr, file_name=file_name)
115 |
116 |
117 |
118 |
119 |
120 |
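A short sketch of how these two methods are typically invoked (the file name and target column below are placeholders, and a configured KXY API key is assumed):

    import pandas as pd
    import kxy  # registers the kxy_pre_learning accessor

    df = pd.read_csv('training_data.csv')  # hypothetical dataset with a 'target' column

    # Highest performance achievable using all other columns to predict 'target'.
    value_df = df.kxy_pre_learning.data_valuation('target', problem_type='regression')

    # Model-free ranking of candidate explanatory variables.
    selection_df = df.kxy_pre_learning.variable_selection('target', problem_type='regression')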
--------------------------------------------------------------------------------
/kxy/pfs/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | try:
21 | from .pfs_selector import *
22 | from .pfs_predictor import *
23 | except Exception:
24 | import logging
25 | logging.warning('Importing the PFS submodule failed: Principal Feature Selector might not be available.')
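Because the imports above are wrapped in a try/except, downstream code can probe availability at runtime; a small sketch:

    try:
        from kxy.pfs import PFS  # fails if e.g. tensorflow is unavailable
        PFS_AVAILABLE = True
    except ImportError:
        PFS_AVAILABLE = False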
--------------------------------------------------------------------------------
/kxy/pfs/pfs_predictor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import pickle as pkl
7 |
8 | from .pfs_selector import PFS, PCA
9 |
10 |
11 | class PFSPredictor(object):
12 | """
13 | Principal Feature Selection Predictor.
14 | """
15 | def _predict(self, obj):
16 | assert hasattr(self, 'models'), 'The model should first be fitted'
17 | assert hasattr(self, 'feature_directions'), 'The model should first be fitted'
18 | assert hasattr(self, 'x_columns'), 'The model should first be fitted'
19 | assert self.feature_directions.shape[0] > 0, 'There should be at least one feature selected'
20 |
21 | z = np.dot(obj[self.x_columns].values, self.feature_directions.T)
22 | y = self.models[0].predict(z)
23 | predictions = pd.DataFrame(index=obj.index)
24 | predictions[self.target_column] = y
25 |
26 | return predictions
27 |
28 |
29 | def predict(self, obj, memory_bound=False):
30 | """
31 | Make predictions using the fitted model.
32 |
33 |
34 | Parameters
35 | ----------
36 | obj : pandas.DataFrame
37 | A dataframe containing test explanatory variables/features about which we want to make predictions.
38 | memory_bound : bool (Default False)
39 | Whether we should try to save memory.
40 |
41 |
42 | Returns
43 | -------
44 | result : pandas.DataFrame
45 | A dataframe with the same index as :code:`obj`, and with one column whose name is the :code:`target_column` used for training.
46 | """
47 | if memory_bound:
48 | n = obj.shape[0]
49 | max_n = 1000000
50 | res = pd.DataFrame(index=obj.index)
51 | res[self.target_column] = np.nan
52 | i = 0
53 | while i < n:
54 | res.iloc[i:i+max_n] = self._predict(obj.iloc[i:i+max_n])
55 | i += max_n
56 | return res
57 |
58 | else:
59 | return self._predict(obj)
60 |
61 |
62 | def save(self, path):
63 | """
64 | Cache the predictor to disk.
65 | """
66 | meta_path = path + '-meta-' + self.__class__.__name__
67 | meta = {'target_column': self.target_column, 'feature_directions': self.feature_directions, 'x_columns': self.x_columns}
68 | with open(meta_path, 'wb') as f:
69 | pkl.dump(meta, f)
70 | self.models[0].save(path + '-' + self.__class__.__name__)
71 |
72 |
73 | @classmethod
74 | def load(cls, path, learner_func):
75 | """
76 | Load the predictor from disk.
77 | """
78 | meta_path = path + '-meta-' + cls.__name__
79 | with open(meta_path, 'rb') as f:
80 | meta = pkl.load(f)
81 | target_column = meta['target_column']
82 | feature_directions = meta['feature_directions']
83 | x_columns = meta['x_columns']
84 |
85 | n_vars = feature_directions.shape[0]
86 | model = learner_func(n_vars=n_vars, path=path + '-' + cls.__name__, safe=False)
87 |
88 | predictor = cls()
89 | predictor.models = [model]
90 | predictor.feature_directions = feature_directions
91 | predictor.target_column = target_column
92 | predictor.x_columns = x_columns
93 |
94 | return predictor
95 |
96 |
97 | def get_feature_selector(self):
98 | """
99 | """
100 | return PFS()
101 |
102 | @property
103 | def p(self):
104 | return self.feature_directions.shape[0]
105 |
106 |
107 | def fit(self, obj, target_column, learner_func, max_duration=None, path=None, p=None):
108 | """
109 | Fits a supervised learner enriched with feature selection using the Principal Feature Selection (PFS) algorithm.
110 |
111 |
112 | Parameters
113 | ----------
114 | obj : pandas.DataFrame
115 | A dataframe containing training explanatory variables/features as well as the target.
116 | target_column : str
117 | The name of the column in :code:`obj` containing targets.
118 | learner_func : func | callable
119 | Function or callable that expects one optional argument :code:`n_vars` and returns an instance of a supervised learner (regressor or classifier) following the scikit-learn convention, and expecting :code:`n_vars` features. Specifically, the learner should have a :code:`fit(x_train, y_train)` method. The learner should also have a :code:`feature_importances_` property or attribute, which is an array or a list containing feature importances once the model has been trained. There should be as many importance scores in :code:`feature_importances_` as there are columns in the :code:`x_train` passed to :code:`fit`.
120 | max_duration : float | None (default)
121 | If not None, then feature selection will stop after this many seconds.
122 | p : int | None (default)
123 | The number of principal features to learn when using one-shot PFS.
124 |
125 |
126 | Attributes
127 | ----------
128 | feature_directions : np.array
129 | The matrix whose rows are the directions in which to project the original features to get principal features.
130 | target_column : str
131 | The name of the column used as target.
132 | models : list
133 | An array whose first entry is the fitted model.
134 | x_columns : list
135 | The list of columns used for PFS sorted alphabetically.
136 |
137 |
138 | Returns
139 | -------
140 | results : dict
141 | A dictionary containing, among other things, feature directions.
142 |
143 | """
144 | if path:
145 | try:
146 | predictor = PFSPredictor.load(path, learner_func)
147 | self.models = predictor.models
148 | self.feature_directions = predictor.feature_directions
149 | self.target_column = predictor.target_column
150 | self.x_columns = predictor.x_columns
151 | return {'Feature Directions': self.feature_directions}
152 | except:
153 | pass
154 | self.target_column = target_column
155 | self.x_columns = sorted([_ for _ in obj.columns if _ != target_column])
156 |
157 | x = obj[self.x_columns].values
158 | y = obj[[target_column]].values
159 |
160 | # Construct principal features
161 | principal_feature_selector = self.get_feature_selector()
162 | self.feature_directions = principal_feature_selector.fit(x, y, max_duration=max_duration, p=p)
163 | z = np.dot(x, self.feature_directions.T) # Principal features
164 |
165 | # Train the learner
166 | n_vars = self.feature_directions.shape[0]
167 | m = learner_func(n_vars=n_vars)
168 | m.fit(z, y)
169 | self.models = [m]
170 | if path:
171 | self.save(path)
172 |
173 | results = {'Feature Directions': self.feature_directions}
174 | return results
175 |
176 |
177 | class PCAPredictor(PFSPredictor):
178 | """
179 | Principal Component Analysis Predictor.
180 | """
181 | def __init__(self, energy_loss_frac=0.05):
182 | self.energy_loss_frac = energy_loss_frac
183 |
184 | def get_feature_selector(self):
185 | return PCA(energy_loss_frac=self.energy_loss_frac)
186 |
187 |
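A condensed sketch of the fit/predict/save/load cycle, mirroring the usage in tests/test_pfs.py (:code:`features_df`, :code:`target_column` and the :code:`'my_model'` path are placeholders):

    from kxy.pfs import PFSPredictor
    from kxy.learning import get_sklearn_learner

    learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0)

    # Fit: learns feature directions W with PFS, then trains the learner on z = x W^T.
    predictor = PFSPredictor()
    results = predictor.fit(features_df, target_column, learner_func, path='my_model')

    # Predict on (possibly new) data with the same columns.
    predictions = predictor.predict(features_df)

    # Restore the predictor from disk later.
    restored = PFSPredictor.load('my_model', learner_func)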
--------------------------------------------------------------------------------
/kxy/pfs/pfs_selector.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import copy
4 | from time import time
5 | import logging
6 | import numpy as np
7 |
8 | import tensorflow as tf
9 | from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN
10 | from tensorflow.keras.optimizers import Adam
11 |
12 | from kxy.misc.tf import PFSLearner, PFSOneShotLearner
13 |
14 |
15 |
16 |
17 | def learn_principal_direction(y, x, ox=None, oy=None, epochs=None, expand_y=True):
18 | """
19 | Learn the i-th principal feature when using :math:`x` to predict :math:`y`.
20 |
21 | Parameters
22 | ----------
23 | x : np.array
24 | 2D array of shape :math:`(n, d)` containing original features.
25 | y : np.array
26 | Array of shape :math:`(n)` or :math:`(n, 1)` containing targets.
27 |
28 | Returns
29 | -------
30 | w : np.array
31 | The i-th principal direction.
32 | mi: float
33 | The mutual information :math:`I(y; w_i^Tx, \\dots, w_1^Tx)`.
34 | """
35 | dx = 1 if len(x.shape) == 1 else x.shape[1]
36 | dy = 1 if len(y.shape) == 1 else y.shape[1]
37 | dox = 0 if ox is None else 1 if len(ox.shape) == 1 else ox.shape[1]
38 | doy = 0 if oy is None else 1 if len(oy.shape) == 1 else oy.shape[1]
39 |
40 | learner = PFSLearner(dx, dy=dy, dox=dox, doy=doy, expand_y=expand_y)
41 | learner.fit(x, y, ox=ox, oy=oy, epochs=epochs)
42 |
43 | mi = learner.mutual_information
44 | w = learner.feature_direction
45 | ox = learner.fx
46 | oy = learner.gy if expand_y else None
47 |
48 | return w, mi, ox, oy, learner
49 |
50 |
51 |
52 | def learn_principal_directions_one_shot(y, x, p, epochs=None, expand_y=True):
53 | """
54 | Jointly learn p principal features.
55 |
56 | Parameters
57 | ----------
58 | x : np.array
59 | 2D array of shape :math:`(n, d)` containing original features.
60 | y : np.array
61 | Array of shape :math:`(n)` or :math:`(n, 1)` containing targets.
62 | p : int
63 | The number of principal features to learn.
64 |
65 | Returns
66 | -------
67 | w : np.array
68 | The matrix whose rows are the p principal directions.
69 | """
70 | dx = 1 if len(x.shape) == 1 else x.shape[1]
71 | learner = PFSOneShotLearner(dx, p=p, expand_y=expand_y)
72 | learner.fit(x, y, epochs=epochs)
73 | w = learner.feature_directions
74 | mi = learner.mutual_information
75 |
76 | return w, mi, learner
77 |
78 |
79 |
80 |
81 | class PFS(object):
82 | """
83 | Principal Feature Selection.
84 | """
85 | def fit(self, x, y, p=None, mi_tolerance=0.0001, max_duration=None, epochs=None, seed=None, expand_y=True):
86 | """
87 | Perform Principal Feature Selection using :math:`x` to predict :math:`y`.
88 |
89 | Specifically, we are looking for a :math:`p \\times d` matrix :math:`W` whose :math:`p` rows are learned sequentially such that :math:`z := Wx` is a great feature vector for predicting :math:`y`.
90 |
91 | Each row of :math:`W` has unit norm (:math:`||w_i||=1`), and the corresponding principal feature, namely :math:`w_i^Tx`, is positively correlated with :math:`y` (i.e. :math:`Cov(y, w_i^Tx) > 0`).
92 |
93 | The first row :math:`w_1` is learned so as to maximize the mutual information :math:`I(y; x^Tw_1)`.
94 |
95 | The second row :math:`w_2` is learned so as to maximize the conditional mutual information :math:`I(y; x^Tw_2 | x^Tw_1)`.
96 |
97 | More generally, the :math:`(i+1)`-th row :math:`w_{i+1}` is learned so as to maximize the conditional mutual information :math:`I(y; x^Tw_{i+1} | [x^Tw_1, ..., x^Tw_i])`.
98 |
99 |
100 | Parameters
101 | ----------
102 | x : np.array
103 | 2D array of shape :math:`(n, d)` containing original features.
104 | y : np.array
105 | Array of shape :math:`(n)` or :math:`(n, 1)` containing targets.
106 | p : int | None (default)
107 | The number of features to select. When :code:`None` (the default), we stop when the increase in estimated mutual information is smaller than the mutual information tolerance parameter, or when we have exceeded the maximum duration. A value of :code:`p` that is not :code:`None` triggers one-shot PFS.
108 | mi_tolerance: float
109 | The smallest estimated mutual information required to keep looking for new feature directions.
110 | max_duration : float | None (default)
111 | The maximum amount of time (in seconds) to allocate to PFS.
112 |
113 |
114 | Returns
115 | -------
116 | W : np.array
117 | 2D array whose rows are directions to use to compute principal features: :math:`z = Wx`.
118 | """
119 | if seed is not None:
120 | from kxy.misc.tf import set_seed
121 | set_seed(seed)
122 |
123 | if max_duration:
124 | start_time = time()
125 |
126 | rows = []
127 | d = 1 if len(x.shape) == 1 else x.shape[1]
128 | learners = []
129 | if p is None:
130 | t = y.flatten().copy()
131 | old_mi = 0.0
132 | ox = None
133 | oy = None
134 | for i in range(d):
135 | w, mi, ox, oy, learner = learn_principal_direction(t, x, ox=ox, oy=oy, epochs=epochs, \
136 | expand_y=expand_y)
137 | learners += [copy.copy(learner)]
138 |
139 | if mi-old_mi < mi_tolerance:
140 | logging.info('The mutual information %.4f after %d rounds has not increased by more than %.4f: stopping.' % (
141 | mi, i+1, mi_tolerance))
142 | break
143 | else:
144 | logging.info('The mutual information has increased from %.4f to %.4f after %d rounds.' % (old_mi, mi, i+1))
145 | rows += [w.copy()]
146 |
147 | if max_duration:
148 | if time()-start_time > max_duration:
149 | logging.info('PFS has exceeded the configured maximum duration: exiting.')
150 | break
151 |
152 | old_mi = mi
153 |
154 | if rows == []:
155 | logging.warning('The only principal feature selected is not informative about the target: I(y; w^Tx)=%.4f' % mi)
156 | rows += [w.copy()]
157 |
158 | self.feature_directions = np.array(rows)
159 | self.mutual_information = old_mi
160 | self.learners = learners
161 | else:
162 | # Learn all p principal features jointly.
163 | feature_directions, mi, learner = learn_principal_directions_one_shot(y, x, p, epochs=epochs, \
164 | expand_y=expand_y)
165 | learners += [copy.copy(learner)]
166 | self.feature_directions = feature_directions
167 | self.mutual_information = mi
168 | self.learners = learners
169 |
170 | return self.feature_directions
171 |
172 |
173 | def max_ent_features_x(self, x):
174 | """
175 | """
176 | assert hasattr(self, 'learners'), 'The object should first be fitted.'
177 |
178 | fxs = []
179 | for learner in self.learners:
180 | fxs += [learner.learned_constraints_x(x)]
181 |
182 | if len(fxs) == 1:
183 | return fxs[0]
184 | else:
185 | return np.concatenate(fxs, axis=1)
186 |
187 |
188 |
189 | class PCA(object):
190 | """
191 | Principal Component Analysis.
192 | """
193 | def __init__(self, energy_loss_frac=0.05):
194 | self.energy_loss_frac = energy_loss_frac
195 |
196 |
197 | def fit(self, x, _, max_duration=None, p=None):
198 | """
199 | """
200 | cov_x = np.cov(x.T) # Columns in x should represent variables and rows observations.
201 | u, d, v = np.linalg.svd(cov_x)
202 | cum_energy = np.cumsum(d)
203 | energy = cum_energy[-1]
204 | p = len([_ for _ in cum_energy if _ <= (1.-self.energy_loss_frac)*energy])
205 |
206 | self.feature_directions = u[:, :p].T
207 |
208 | return self.feature_directions
209 |
210 |
211 |
212 |
213 |
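The sequential procedure implemented by :code:`PFS.fit` can be exercised end-to-end on synthetic data; the snippet below, adapted from tests/test_pfs.py, recovers a single known direction :code:`w` (here :code:`y` depends on :code:`x` only through :code:`w^T x`):

    import numpy as np
    from kxy.pfs import PFS
    from kxy.misc.tf import set_default_parameter

    np.random.seed(1)
    d = 100
    w = np.ones(d) / d
    x = np.random.randn(10000, d)
    xTw = np.dot(x, w)
    y = xTw + 2. * xTw ** 2 + 0.5 * xTw ** 3

    set_default_parameter('lr', 0.001)
    selector = PFS()
    feature_directions = selector.fit(x, y, epochs=21, seed=1)

    # The first learned direction should be close to w / ||w||.
    error = np.linalg.norm(w / np.linalg.norm(w) - feature_directions[0, :])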
--------------------------------------------------------------------------------
/kxy/post_learning/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | from .improvability import *
21 | from .model_explanation import *
--------------------------------------------------------------------------------
/kxy/post_learning/model_explanation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Estimation of the top-:math:`k` most valuable variables in a supervised learning problem for every possible :math:`k`, and
5 | the corresponding achievable performances.
6 | """
7 | import logging
8 | import requests
9 | import sys
10 | from time import time, sleep
11 |
12 | import numpy as np
13 | import pandas as pd
14 |
15 | try:
16 | get_ipython().__class__.__name__
17 | from halo import HaloNotebook as Halo
18 | except:
19 | from halo import Halo
20 |
21 | from kxy.api import APIClient, upload_data
22 | from kxy.misc import LongerThanExpectedException
23 |
24 | # Cache old job ids to avoid being charged twice for the same job.
25 | EXPLANATION_JOB_IDS = {}
26 |
27 | def model_explanation(data_df, prediction_column, problem_type, snr='auto', file_name=None):
28 | """
29 | .. _model-explanation:
30 | Analyzes the variables that a model relies on the most in a brute-force fashion.
31 |
32 | The first variable is the variable the model relies on the most. The second variable is the variable that complements the first variable the most in explaining model decisions etc.
33 |
34 | Running performances should be understood as the performance achievable when trying to guess model predictions using only variables whose selection order is smaller than or equal to that of the row.
35 |
36 | :code:`problem_type` must be specified as either :code:`'classification'` or :code:`'regression'`; unlike the pandas accessor methods, this function does not infer it.
37 |
38 |
39 | Parameters
40 | ----------
41 | data_df : pandas.DataFrame
42 | The pandas DataFrame containing the data.
43 | prediction_column : str
44 | The name of the column containing true labels.
45 | problem_type : 'classification' | 'regression'
46 | The type of supervised learning problem; it must be specified and is not inferred by this function.
47 | file_name : None | str
48 | A unique identifier characterizing data_df in the form of a file name. Do not set this unless you know why.
49 |
50 |
51 | Returns
52 | -------
53 | result : pandas.DataFrame
54 | The result is a pandas.DataFrame with columns (where applicable):
55 |
56 | * :code:`'Selection Order'`: The order in which the associated variable was selected, starting at 1 for the most important variable.
57 | * :code:`'Variable'`: The column name corresponding to the input variable.
58 | * :code:`'Running Achievable R-Squared'`: The highest :math:`R^2` that can be achieved by a regression model using all variables selected so far, including this one.
59 | * :code:`'Running Achievable Accuracy'`: The highest classification accuracy that can be achieved by a classification model using all variables selected so far, including this one.
60 | * :code:`'Running Achievable RMSE'`: The lowest Root Mean Square Error that can be achieved by a regression model using all variables selected so far, including this one.
61 |
62 |
63 | .. admonition:: Theoretical Foundation
64 |
65 | Section :ref:`a) Model Explanation`.
66 |
67 | """
68 | assert prediction_column in data_df.columns, 'The label column should be a column of the dataframe.'
69 | assert problem_type.lower() in ['classification', 'regression']
70 | if problem_type.lower() == 'regression':
71 | assert np.can_cast(data_df[prediction_column], float), 'The prediction column should be numeric'
72 |
73 | k = 0
74 | kp = 0
75 | max_k = 100
76 |
77 | file_name = upload_data(data_df, file_name=file_name)
78 | spinner = Halo(text='Waiting for results from the backend.', spinner='dots')
79 | spinner.start()
80 |
81 | if file_name:
82 | job_id = EXPLANATION_JOB_IDS.get((file_name, prediction_column, problem_type), None)
83 | if job_id:
84 | api_response = APIClient.route(
85 | path='/wk/variable-selection', method='POST', \
86 | file_name=file_name, target_column=prediction_column, \
87 | problem_type=problem_type, timestamp=int(time()), job_id=job_id, \
88 | snr=snr)
89 | else:
90 | api_response = APIClient.route(
91 | path='/wk/variable-selection', method='POST', \
92 | file_name=file_name, target_column=prediction_column, \
93 | problem_type=problem_type, timestamp=int(time()), snr=snr)
94 |
95 | initial_time = time()
96 | while api_response.status_code == requests.codes.ok and k < max_k:
97 | if kp%2 != 0:
98 | sleep(2 if kp<5 else 10 if k < max_k-4 else 300)
99 | kp += 1
100 | k = kp//2
101 |
102 | else:
103 | try:
104 | response = api_response.json()
105 | if 'job_id' in response:
106 | job_id = response['job_id']
107 | EXPLANATION_JOB_IDS[(file_name, prediction_column, problem_type)] = job_id
108 | sleep(2 if kp<5 else 10 if k < max_k-4 else 300)
109 | kp += 1
110 | k = kp//2
111 |
112 | # Note: it is important to pass the job_id to avoid being charged twice for the work.
113 | api_response = APIClient.route(
114 | path='/wk/variable-selection', method='POST', \
115 | file_name=file_name, target_column=prediction_column, \
116 | problem_type=problem_type, timestamp=int(time()), job_id=job_id, \
117 | snr=snr)
118 |
119 | try:
120 | response = api_response.json()
121 | if 'eta' in response:
122 | progress_text = '%s%% Completed.' % response['progress_pct'] if 'progress_pct' in response else ''
123 | spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (response['eta'], progress_text)
124 | except:
125 | pass
126 |
127 | if ('job_id' not in response) or ('selection_order' in response):
128 | duration = int(time()-initial_time)
129 | duration = str(duration) + 's' if duration < 60 else str(duration//60) + 'min'
130 |
131 | result = {}
132 |
133 | if 'selection_order' in response:
134 | result['Selection Order'] = response['selection_order']
135 |
136 | if 'variable' in response:
137 | result['Variable'] = response['variable']
138 |
139 | if 'r-squared' in response:
140 | result['Running Achievable R-Squared'] = response['r-squared']
141 |
142 | if 'log-likelihood' in response:
143 | result['Running Achievable Log-Likelihood Per Sample'] = response['log-likelihood']
144 |
145 | if 'rmse' in response and problem_type.lower() == 'regression':
146 | result['Running Achievable RMSE'] = response['rmse']
147 |
148 | if 'accuracy' in response and problem_type.lower() == 'classification':
149 | result['Running Achievable Accuracy'] = response['accuracy']
150 |
151 | result = pd.DataFrame.from_dict(result)
152 |
153 | if 'selection_order' in response:
154 | result.set_index('Selection Order', inplace=True)
155 |
156 | spinner.text = 'Received results from the backend after %s.' % duration
157 | spinner.succeed()
158 | return result
159 |
160 |
161 | except:
162 | logging.exception('\nModel explanation failed. Last HTTP code: %s, Content: %s' % (api_response.status_code, api_response.content))
163 | spinner.text = 'The backend encountered an unexpected error we are looking into. Please try again later.'
164 | spinner.fail()
165 | return None
166 |
167 | if api_response.status_code != requests.codes.ok:
168 | spinner.text = 'The backend is taking longer than expected. Please try again later'
169 | spinner.fail()
170 | try:
171 | response = api_response.json()
172 | if 'message' in response:
173 | logging.error('\n%s' % response['message'])
174 | except:
175 | logging.error('\nModel explanation failed. Last HTTP code: %s, Content: %s' % (api_response.status_code, api_response.content))
176 |
177 | raise LongerThanExpectedException('The backend is taking longer than expected, but rest assured your task is still running. Please try again later to retrieve your results.')
178 |
179 | return None
180 |
181 |
182 |
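A minimal sketch of a direct call to this function (the file name and prediction column are placeholders, and a configured KXY API key is assumed):

    import pandas as pd
    from kxy.post_learning import model_explanation

    data_df = pd.read_csv('data_with_model_predictions.csv')  # hypothetical dataset
    explanation_df = model_explanation(data_df, 'prediction', 'regression')
    print(explanation_df)  # one row per variable, indexed by 'Selection Order'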
--------------------------------------------------------------------------------
/kxy/pre_learning/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC.
5 | Author: Dr Yves-Laurent Kom Samo
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 |
17 | You should have received a copy of the GNU General Public License
18 | along with this program. If not, see <https://www.gnu.org/licenses/>.
19 | """
20 | from .achievable_performance import *
21 | from .variable_selection import *
--------------------------------------------------------------------------------
/kxy/pre_learning/achievable_performance.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Estimation of the highest performance achievable in a supervised learning problem.
5 | E.g. :math:`R^2`, RMSE, classification accuracy, true log-likelihood per observation.
6 | """
7 | import logging
8 | import requests
9 | import sys
10 | from time import time, sleep
11 |
12 | import numpy as np
13 | import pandas as pd
14 |
15 | try:
16 | get_ipython().__class__.__name__
17 | from halo import HaloNotebook as Halo
18 | except:
19 | from halo import Halo
20 |
21 | from kxy.api import APIClient, upload_data
22 | from kxy.misc import LongerThanExpectedException
23 |
24 | # Cache old job ids to avoid being charged twice for the same job.
25 | VALUATION_JOB_IDS = {}
26 |
27 | def data_valuation(data_df, target_column, problem_type, snr='auto', include_mutual_information=False, file_name=None):
28 | """
29 | .. _data-valuation:
30 | Estimate the highest performance metrics achievable when predicting the :code:`target_column` using all other columns.
31 |
32 | :code:`problem_type` must be specified as either :code:`'classification'` or :code:`'regression'`; unlike the pandas accessor methods, this function does not infer it.
33 |
34 |
35 | Parameters
36 | ----------
37 | data_df : pandas.DataFrame
38 | The pandas DataFrame containing the data.
39 | target_column : str
40 | The name of the column containing true labels.
41 | problem_type : 'classification' | 'regression'
42 | The type of supervised learning problem; it must be specified and is not inferred by this function.
43 | include_mutual_information : bool
44 | Whether to include the mutual information between target and explanatory variables in the result.
45 | file_name : None | str
46 | A unique identifier characterizing data_df in the form of a file name. Do not set this unless you know why.
47 |
48 |
49 |
50 | Returns
51 | -------
52 | achievable_performance : pandas.DataFrame
53 | The result is a pandas.DataFrame with columns (where applicable):
54 |
55 | * :code:`'Achievable Accuracy'`: The highest classification accuracy that can be achieved by a model using provided inputs to predict the label.
56 | * :code:`'Achievable R-Squared'`: The highest :math:`R^2` that can be achieved by a model using provided inputs to predict the label.
57 | * :code:`'Achievable RMSE'`: The lowest Root Mean Square Error that can be achieved by a model using provided inputs to predict the label.
58 | * :code:`'Achievable Log-Likelihood Per Sample'`: The highest true log-likelihood per sample that can be achieved by a model using provided inputs to predict the label.
59 |
60 |
61 | .. admonition:: Theoretical Foundation
62 |
63 | Section :ref:`1 - Achievable Performance`.
64 | """
65 | assert target_column in data_df.columns, 'The label column should be a column of the dataframe.'
66 | assert problem_type.lower() in ['classification', 'regression']
67 | if problem_type.lower() == 'regression':
68 | assert np.can_cast(data_df[target_column], float), 'The target column should be numeric'
69 |
70 | k = 0
71 | max_k = 100
72 |
73 | file_name = upload_data(data_df, file_name=file_name)
74 | spinner = Halo(text='Waiting for results from the backend.', spinner='dots')
75 | spinner.start()
76 |
77 | if file_name:
78 | job_id = VALUATION_JOB_IDS.get((file_name, target_column, problem_type, snr), None)
79 |
80 | if job_id:
81 | api_response = APIClient.route(
82 | path='/wk/data-valuation', method='POST',
83 | file_name=file_name, target_column=target_column, \
84 | problem_type=problem_type, \
85 | timestamp=int(time()), job_id=job_id, \
86 | snr=snr)
87 | else:
88 | api_response = APIClient.route(
89 | path='/wk/data-valuation', method='POST', \
90 | file_name=file_name, target_column=target_column, \
91 | problem_type=problem_type, timestamp=int(time()), \
92 | snr=snr)
93 |
94 | initial_time = time()
95 | while api_response.status_code == requests.codes.ok and k < max_k:
96 | try:
97 | response = api_response.json()
98 | if 'eta' in response:
99 | progress_text = '%s%% Completed.' % response['progress_pct'] if 'progress_pct' in response else ''
100 | spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (response['eta'], progress_text)
101 |
102 | if ('job_id' in response) and ('r-squared' not in response):
103 | job_id = response['job_id']
104 | VALUATION_JOB_IDS[(file_name, target_column, problem_type, snr)] = job_id
105 | k += 1
106 | sleep(15.)
107 |
108 | # Note: it is important to pass the job_id to avoid being charged twice for the same work.
109 | api_response = APIClient.route(
110 | path='/wk/data-valuation', method='POST',
111 | file_name=file_name, target_column=target_column, \
112 | problem_type=problem_type, \
113 | timestamp=int(time()), job_id=job_id, \
114 | snr=snr)
115 |
116 | try:
117 | response = api_response.json()
118 | if 'eta' in response:
119 | progress_text = '%s%% Completed.' % response['progress_pct'] if 'progress_pct' in response else ''
120 | spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (response['eta'], progress_text)
121 | except:
122 | pass
123 |
124 | if ('job_id' not in response) or ('r-squared' in response):
125 | duration = int(time()-initial_time)
126 | duration = str(duration) + 's' if duration < 60 else str(duration//60) + 'min'
127 |
128 | result = {}
129 | if 'r-squared' in response:
130 | result['Achievable R-Squared'] = [response['r-squared']]
131 |
132 | if 'log-likelihood' in response:
133 | result['Achievable Log-Likelihood Per Sample'] = [response['log-likelihood']]
134 |
135 | if 'rmse' in response and problem_type.lower() == 'regression':
136 | result['Achievable RMSE'] = [response['rmse']]
137 |
138 | if 'accuracy' in response and problem_type.lower() == 'classification':
139 | result['Achievable Accuracy'] = [response['accuracy']]
140 |
141 | if include_mutual_information and 'mi' in response:
142 | result['Mutual Information'] = [response['mi']]
143 |
144 | result = pd.DataFrame.from_dict(result)
145 |
146 | spinner.text = 'Received results from the backend after %s.' % duration
147 | spinner.succeed()
148 |
149 | return result
150 |
151 | except:
152 | logging.exception('\nData valuation failed. Last HTTP code: %s' % api_response.status_code)
153 | spinner.text = 'The backend encountered an unexpected error we are looking into. Please try again later.'
154 | spinner.fail()
155 | return None
156 |
157 |
158 | if api_response.status_code != requests.codes.ok:
159 | spinner.text = 'The backend is taking longer than expected. Try again later.'
160 | spinner.fail()
161 | try:
162 | response = api_response.json()
163 | if 'message' in response:
164 | logging.error('\n%s' % response['message'])
165 | except:
166 | logging.error('\nData valuation failed. Last HTTP code: %s' % api_response.status_code)
167 |
168 | raise LongerThanExpectedException('The backend is taking longer than expected, but rest assured your task is still running. Please try again later to retrieve your results.')
169 |
170 | return None
171 |
172 |
173 |
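For reference, a minimal sketch of calling this function directly rather than through the pandas accessor (the file name and target column are placeholders, and a configured KXY API key is assumed):

    import pandas as pd
    from kxy.pre_learning import data_valuation

    df = pd.read_csv('training_data.csv')  # hypothetical dataset with a 'target' column
    results = data_valuation(df, 'target', 'regression', include_mutual_information=True)
    # 'Mutual Information' is only present when the backend returns it.
    print(results['Achievable R-Squared'])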
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.18.1
2 | scipy>=1.4.1
3 | pandas>=0.23.0
4 | requests>=2.22.0
5 | pandarallel
6 | halo
7 | ipywidgets
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | # Inside of setup.cfg
2 | [metadata]
3 | description-file = README.md
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Feb 27 10:54:00 2020
4 |
5 | @author: ylkomsamo
6 | """
7 |
8 | import sys
9 | sys.path.append('.')
10 | from setuptools import setup, find_packages
11 |
12 | with open('README.md') as f:
13 | long_description = f.read()
14 |
15 | version = "1.4.11"
16 | setup(name="kxy",
17 | version=version,
18 | zip_safe=False,
19 | license="GPLv3",
20 | author="Dr. Yves-Laurent Kom Samo",
21 | author_email="github@kxy.ai",
22 | url="https://www.kxy.ai",
23 | description = "A Powerful Serverless Pre-Learning and Post-Learning Analysis Toolkit",
24 | long_description=long_description,
25 | long_description_content_type='text/markdown', # This is important!
26 | project_urls={
27 | "Documentation": "https://www.kxy.ai/reference",
28 | "Source Code": "https://github.com/kxytechnologies/kxy-python/"},
29 | download_url = "https://github.com/kxytechnologies/kxy-python/archive/v%s.tar.gz" % version,
30 | keywords = ["Feature Engineering", "Feature Selection", "Data Valuation", "Lean ML", "AutoML", "Pre-Learning", "Post-Learning"],
31 | packages=find_packages(exclude=["tests"]),
32 | install_requires=["numpy>=1.13.1", "scipy>=1.4.1", "pandas>=0.23.0", "requests>=2.22.0", "pandarallel", "halo", "ipywidgets", "scikit-learn"],
33 | classifiers=[
34 | "Environment :: Console",
35 | "Intended Audience :: Developers",
36 | "Intended Audience :: Education",
37 | "Intended Audience :: Science/Research",
38 | "Intended Audience :: Information Technology",
39 | "Natural Language :: English",
40 | "Operating System :: OS Independent",
41 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
42 | "Topic :: Scientific/Engineering :: Information Analysis",
43 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
44 | "Programming Language :: Python :: 3 :: Only",
45 | "Development Status :: 5 - Production/Stable",
46 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
47 | "Topic :: Scientific/Engineering :: Mathematics"
48 | ],
49 | scripts=['bin/kxy']
50 | )
51 |
--------------------------------------------------------------------------------
/tests/test_data_valuation.py:
--------------------------------------------------------------------------------
1 | from kxy_datasets.regressions import Abalone
2 |
3 |
4 | def test_include_mi():
5 | dataset = Abalone()
6 | target_column = dataset.y_column
7 | df = dataset.df
8 | results = df.kxy.data_valuation(target_column, problem_type='regression', \
9 | include_mutual_information=True)
10 | assert 'Mutual Information' in results
--------------------------------------------------------------------------------
/tests/test_features.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/tests/test_features.py
--------------------------------------------------------------------------------
/tests/test_finance.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import kxy
4 |
5 | def test_ia_corr_anon():
6 | x = np.random.randn(10000, 2)
7 | df = pd.DataFrame(x, columns=['market_column', 'asset_column'])
8 | iab_anon = df.kxy.information_adjusted_beta('market_column','asset_column', anonymize=True)
9 | iab = df.kxy.information_adjusted_beta('market_column','asset_column', anonymize=False)
10 | assert np.allclose(iab, iab_anon, atol=1e-03), 'Anonymized and non-anonymized results should match closely (%.4f vs %.4f)' % (iab, iab_anon)
11 |
12 |
13 | def test_ia_corr_nan():
14 | x = np.random.randn(10000, 2)
15 | x[100:200, 0] = np.nan
16 | x[200:300, 1] = np.nan
17 | df = pd.DataFrame(x, columns=['market_column', 'asset_column'])
18 | iab_anon = df.kxy.information_adjusted_beta('market_column','asset_column', anonymize=True)
19 | assert not np.isnan(iab_anon)
20 | iab = df.kxy.information_adjusted_beta('market_column','asset_column', anonymize=False)
21 | assert not np.isnan(iab)
22 | assert np.allclose(iab, iab_anon, atol=1e-03), 'Anonymized and non-anonymized results should match closely (%.4f vs %.4f)' % (iab, iab_anon)
23 |
24 |
--------------------------------------------------------------------------------
/tests/test_flow.py:
--------------------------------------------------------------------------------
1 | # from __future__ import unicode_literals
2 |
3 | if __name__ == '__main__':
4 | # import logging
5 | # logging.basicConfig(level=logging.DEBUG)
6 | import numpy as np
7 | import pandas as pd
8 | import kxy
9 | from kxy.api import upload_data
10 |
11 | df = pd.DataFrame(np.random.randn(20000, 50))
12 | upload_data(df)
13 | df.kxy.describe()
14 |
15 |
16 |
--------------------------------------------------------------------------------
/tests/test_misc.py:
--------------------------------------------------------------------------------
1 | from kxy_datasets.regressions import Abalone
2 | from kxy_datasets.classifications import BankNote, BankMarketing
3 | from kxy.learning import get_xgboost_learner, get_tensorflow_dense_learner, get_pytorch_dense_learner, \
4 | get_lightgbm_learner_sklearn_api, get_lightgbm_learner_learning_api, get_sklearn_learner
5 |
6 |
7 |
8 | def test_boruta():
9 | # Regression
10 | sklearn_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor')
11 | dataset = Abalone()
12 | target_column = dataset.y_column
13 | df = dataset.df
14 |
15 | # Features generation
16 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
17 |
18 | # Model building
19 | results = features_df.kxy.fit(target_column, sklearn_regressor_cls, \
20 | problem_type='regression', feature_selection_method='boruta', boruta_n_evaluations=100)
21 | assert results['Selected Variables'] == ['Shucked weight', 'Shell weight', 'Sex_I', \
22 | 'Shucked weight.ABS(* - Q25(*))', 'Whole weight']
23 |
24 |
25 | def test_rfe():
26 | # Regression
27 | sklearn_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor')
28 | dataset = Abalone()
29 | target_column = dataset.y_column
30 | df = dataset.df
31 |
32 | # Features generation
33 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
34 |
35 | # Model building
36 | results = features_df.kxy.fit(target_column, sklearn_regressor_cls, \
37 | problem_type='regression', feature_selection_method='rfe', rfe_n_features=10)
38 | assert results['Selected Variables'] == ['Shell weight', 'Sex_I', 'Shucked weight.ABS(* - Q25(*))', \
39 | 'Whole weight.ABS(* - Q25(*))', 'Shucked weight.ABS(* - MEDIAN(*))', 'Shucked weight', \
40 | 'Shucked weight.ABS(* - Q75(*))', 'Shucked weight.ABS(* - MEAN(*))', 'Diameter.ABS(* - Q25(*))', \
41 | 'Diameter.ABS(* - Q75(*))']
--------------------------------------------------------------------------------
/tests/test_pca.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import kxy
4 | from kxy.learning import get_sklearn_learner, get_lightgbm_learner_learning_api, get_xgboost_learner
5 | from kxy.pfs import PCAPredictor, PCA
6 | from kxy_datasets.regressions import Abalone
7 | from kxy_datasets.classifications import BankNote, BankMarketing
8 |
9 |
10 | def test_shape():
11 | dataset = Abalone()
12 | target_column = dataset.y_column
13 | df = dataset.df
14 |
15 | # Features generation
16 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
17 | y = features_df[target_column].values
18 | x_columns = [_ for _ in features_df.columns if _ != target_column]
19 | x = features_df[x_columns].values
20 |
21 | # Principal features construction
22 | feature_directions = PCA().fit(x, y)
23 | assert feature_directions.shape[1] == x.shape[1]
24 |
25 | predictor = PCAPredictor()
26 | learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0)
27 | results = predictor.fit(features_df, target_column, learner_func)
28 | feature_directions = results['Feature Directions']
29 | assert feature_directions.shape[1] == x.shape[1]
30 |
31 |
32 | def test_orthonormality():
33 | dataset = Abalone()
34 | target_column = dataset.y_column
35 | df = dataset.df
36 |
37 | # Features generation
38 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
39 | y = features_df[target_column].values
40 | x_columns = [_ for _ in features_df.columns if _ != target_column]
41 | x = features_df[x_columns].values
42 |
43 | # Principal features construction
44 | feature_directions = PCA().fit(x, y)
45 | n_directions = feature_directions.shape[0]
46 | for i in range(n_directions):
47 | assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.)
48 | for j in range(n_directions):
49 | if j != i:
50 | assert np.abs(np.dot(feature_directions[i, :], feature_directions[j, :])) < 1e-7
51 |
52 | predictor = PCAPredictor()
53 | learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0)
54 | results = predictor.fit(features_df, target_column, learner_func)
55 | feature_directions = results['Feature Directions']
56 | n_directions = feature_directions.shape[0]
57 | for i in range(n_directions):
58 | assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.)
59 |
60 |
61 |
62 |
63 |
64 | def test_pca_feature_selection():
65 | # Regression
66 | xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor')
67 | dataset = Abalone()
68 | target_column = dataset.y_column
69 | df = dataset.df
70 |
71 | # Features generation
72 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
73 |
74 | # Model building
75 | results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \
76 | problem_type='regression', feature_selection_method='pfs')
77 | assert results['Feature Directions'].shape[1] == features_df.shape[1]-1
78 | predictor = results['predictor']
79 | predictions = predictor.predict(features_df)
80 | assert len(predictions.columns) == 1
81 | assert target_column in predictions.columns
82 | assert set(features_df.index).difference(set(predictions.index)) == set()
83 | assert set(predictions.index).difference(set(features_df.index)) == set()
84 |
85 |
86 | def test_save_pca():
87 | # Regression
88 | xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor')
89 | dataset = Abalone()
90 | target_column = dataset.y_column
91 | df = dataset.df
92 |
93 | # Features generation
94 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
95 |
96 | # Model building
97 | path = 'Abalone'
98 | results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \
99 | problem_type='regression', feature_selection_method='pca', \
100 | path=path)
101 | loaded_predictor = PCAPredictor().load(path, xgboost_regressor_cls)
102 | feature_directions = loaded_predictor.feature_directions
103 | assert feature_directions.shape[1] == features_df.shape[1]-1
104 | predictions = loaded_predictor.predict(features_df)
105 | assert len(predictions.columns) == 1
106 | assert target_column in predictions.columns
107 | assert set(features_df.index).difference(set(predictions.index)) == set()
108 | assert set(predictions.index).difference(set(features_df.index)) == set()
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
/tests/test_pfs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import kxy
4 | from kxy.learning import get_sklearn_learner, get_lightgbm_learner_learning_api, get_xgboost_learner
5 | from kxy.pfs import PFSPredictor, PFS, PCA
6 | from kxy_datasets.regressions import Abalone
7 | from kxy_datasets.classifications import BankNote, BankMarketing
8 |
9 |
10 | def test_shape():
11 | dataset = Abalone()
12 | target_column = dataset.y_column
13 | df = dataset.df
14 |
15 | # Features generation
16 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
17 | y = features_df[target_column].values
18 | x_columns = [_ for _ in features_df.columns if _ != target_column]
19 | x = features_df[x_columns].values
20 |
21 | # Principal features construction
22 | feature_directions = PFS().fit(x, y)
23 | assert feature_directions.shape[1] == x.shape[1]
24 |
25 | predictor = PFSPredictor()
26 | learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0)
27 | results = predictor.fit(features_df, target_column, learner_func)
28 | feature_directions = results['Feature Directions']
29 | assert feature_directions.shape[1] == x.shape[1]
30 |
31 |
32 | def test_norm():
33 | dataset = Abalone()
34 | target_column = dataset.y_column
35 | df = dataset.df
36 |
37 | # Features generation
38 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
39 | y = features_df[target_column].values
40 | x_columns = [_ for _ in features_df.columns if _ != target_column]
41 | x = features_df[x_columns].values
42 |
43 | # Principal features construction
44 | feature_directions = PFS().fit(x, y)
45 | n_directions = feature_directions.shape[0]
46 | for i in range(n_directions):
47 | assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.)
48 |
49 | predictor = PFSPredictor()
50 | learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0)
51 | results = predictor.fit(features_df, target_column, learner_func)
52 | feature_directions = results['Feature Directions']
53 | n_directions = feature_directions.shape[0]
54 | for i in range(n_directions):
55 | assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.)
56 |
57 |
58 | def test_pfs_feature_selection():
59 | # Regression
60 | xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor')
61 | dataset = Abalone()
62 | target_column = dataset.y_column
63 | df = dataset.df
64 |
65 | # Features generation
66 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
67 |
68 | # Model building
69 | results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \
70 | problem_type='regression', feature_selection_method='pfs')
71 | assert results['Feature Directions'].shape[1] == features_df.shape[1]-1
72 | predictor = results['predictor']
73 | predictions = predictor.predict(features_df)
74 | assert len(predictions.columns) == 1
75 | assert target_column in predictions.columns
76 | assert set(features_df.index).difference(set(predictions.index)) == set()
77 | assert set(predictions.index).difference(set(features_df.index)) == set()
78 |
79 |
80 | def test_save_pfs():
81 | # Regression
82 | xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor')
83 | dataset = Abalone()
84 | target_column = dataset.y_column
85 | df = dataset.df
86 |
87 | # Features generation
88 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column])
89 |
90 | # Model building
91 | path = 'Abalone'
92 | results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \
93 | problem_type='regression', feature_selection_method='pfs', \
94 | path=path)
95 | loaded_predictor = PFSPredictor().load(path, xgboost_regressor_cls)
96 | feature_directions = loaded_predictor.feature_directions
97 | assert feature_directions.shape[1] == features_df.shape[1]-1
98 | predictions = loaded_predictor.predict(features_df)
99 | assert len(predictions.columns) == 1
100 | assert target_column in predictions.columns
101 | assert set(features_df.index).difference(set(predictions.index)) == set()
102 | assert set(predictions.index).difference(set(features_df.index)) == set()
103 |
104 |
105 | def test_pfs_accuracy():
106 | # Generate the data
107 | seed = 1
108 | np.random.seed(seed)
109 | d = 100
110 | w = np.ones(d)/d
111 | x = np.random.randn(10000, d)
112 | xTw = np.dot(x, w)
113 | y = xTw + 2.*xTw**2 + 0.5*xTw**3
114 |
115 | # Run PFS
116 | from kxy.misc.tf import set_default_parameter
117 | set_default_parameter('lr', 0.001)
118 | selector = PFS()
119 | selector.fit(x, y, epochs=21, seed=seed, expand_y=True)
120 |
121 | # Learned principal directions
122 | F = selector.feature_directions
123 |
124 | # Learned principal features
125 | z = np.dot(x, F.T)
126 |
127 | # Accuracy
128 | true_f_1 = w/np.linalg.norm(w)
129 | learned_f_1 = F[0, :]
130 | e = np.linalg.norm(true_f_1-learned_f_1)
131 |
132 | assert e <= 0.10
133 | assert selector.mutual_information > 1.0
134 |
135 |
136 | def test_feature_extraction():
137 | # Generate the data
138 | seed = 1
139 | np.random.seed(seed)
140 | d = 100
141 | w = np.ones(d)/d
142 | x_train = np.random.randn(10000, d)
143 | x_trainTw = np.dot(x_train, w)
144 | y_train = x_trainTw + 2.*x_trainTw**2 + 0.5*x_trainTw**3
145 |
146 | # Run PFS
147 | from kxy.misc.tf import set_default_parameter
148 | set_default_parameter('lr', 0.001)
149 | selector = PFS()
150 | selector.fit(x_train, y_train, epochs=21, seed=seed, expand_y=False)
151 |
152 | # Extract the features
153 | fx_train = selector.max_ent_features_x(x_train)
154 | assert fx_train.shape[0] == x_train.shape[0]
155 |
156 | # Run a linear regression relating learned features to y
157 | from sklearn.linear_model import LinearRegression
158 | from sklearn.metrics import r2_score
159 |
160 | # Training
161 | m = LinearRegression()
162 | m.fit(fx_train, y_train)
163 |
164 | # Testing accuracy
165 | x_test = np.random.randn(10000, d)
166 | x_testTw = np.dot(x_test, w)
167 | y_test = x_testTw + 2.*x_testTw**2 + 0.5*x_testTw**3
168 |
169 | fx_test = selector.max_ent_features_x(x_test)
170 | assert fx_test.shape[0] == x_test.shape[0]
171 |
172 | y_test_predicted = m.predict(fx_test)
173 | testing_r2 = r2_score(y_test, y_test_predicted)
174 |
175 | y_train_predicted = m.predict(fx_train)
176 | training_r2 = r2_score(y_train, y_train_predicted)
177 |
178 | assert training_r2>0.99, 'Learned features should be good for linear regression in-sample'
179 | assert testing_r2>0.99, 'Learned features should be good for linear regression out-of-sample'
180 |
181 |
182 |
--------------------------------------------------------------------------------