├── .gitignore ├── CHANGELOG.md ├── CITATION.cff ├── LICENSE ├── Makefile ├── README.md ├── bin └── kxy ├── docker └── kxy │ └── Dockerfile ├── docs ├── Makefile ├── _static │ ├── kxy.css │ ├── matomo.js │ └── theme_override.css ├── _templates │ └── layout.html ├── conf.py ├── images │ ├── bn_importance.png │ ├── bn_incremental_importance.png │ ├── bn_separability.png │ ├── classification_accuracy_frontier.png │ ├── classification_accuracy_frontier_2.png │ ├── entropy_venn.png │ ├── favicon.png │ ├── gm_separability.png │ ├── gm_separability_mov.gif │ ├── incremental_input_importance.png │ ├── logo.png │ └── logo.svg ├── index.rst ├── latest │ ├── api │ │ └── index.rst │ ├── applications │ │ ├── case_studies │ │ │ ├── conditionally_useful_features.ipynb │ │ │ ├── empirical_validation_classification.ipynb │ │ │ ├── empirical_validation_regression.ipynb │ │ │ ├── features_pruning.ipynb │ │ │ ├── index.rst │ │ │ └── unbalanced_datasets.ipynb │ │ ├── cheat_sheet │ │ │ └── index.rst │ │ ├── illustrations │ │ │ ├── abalone.ipynb │ │ │ ├── adult.ipynb │ │ │ ├── air_quality.ipynb │ │ │ ├── airfoil.ipynb │ │ │ ├── aps.ipynb │ │ │ ├── avila.ipynb │ │ │ ├── bank_marketing.ipynb │ │ │ ├── bank_note.ipynb │ │ │ ├── bike_sharing.ipynb │ │ │ ├── blog_feedback.ipynb │ │ │ ├── card_default.ipynb │ │ │ ├── concrete.ipynb │ │ │ ├── ct_slices.ipynb │ │ │ ├── diabetic.ipynb │ │ │ ├── eeg.ipynb │ │ │ ├── empirical_validation_regression.ipynb │ │ │ ├── energy_efficiency.ipynb │ │ │ ├── facebook_comments.ipynb │ │ │ ├── heart_attack.ipynb │ │ │ ├── house_prices_advanced.ipynb │ │ │ ├── index.rst │ │ │ ├── landsat.ipynb │ │ │ ├── letter_recognition.ipynb │ │ │ ├── magic_gamma.ipynb │ │ │ ├── naval_propulsion.ipynb │ │ │ ├── online_news.ipynb │ │ │ ├── parkinson.ipynb │ │ │ ├── power_plant.ipynb │ │ │ ├── real_estate.ipynb │ │ │ ├── sensorless_drive.ipynb │ │ │ ├── shuttle.ipynb │ │ │ ├── skin_segmentation.ipynb │ │ │ ├── social_media_buzz.ipynb │ │ │ ├── superconductivity.ipynb │ │ │ ├── titanic.ipynb │ │ │ ├── water_quality.ipynb │ │ │ ├── white_wine_quality.ipynb │ │ │ ├── yacht.ipynb │ │ │ └── year_prediction_msd.ipynb │ │ └── index.rst │ ├── data_transfer │ │ └── index.rst │ ├── data_valuation │ │ └── index.rst │ ├── index │ │ └── index.rst │ ├── learning │ │ └── index.rst │ ├── model_explanation │ │ └── index.rst │ ├── model_free_variable_selection │ │ └── index.rst │ ├── model_improvability │ │ └── index.rst │ ├── model_wrapped_feature_selection │ │ └── index.rst │ ├── pandas │ │ └── index.rst │ ├── quickstart │ │ └── getting_started.ipynb │ ├── theoretical_foundation │ │ ├── memoryful │ │ │ └── index.rst │ │ └── memoryless │ │ │ ├── applications.rst │ │ │ ├── estimation.rst │ │ │ ├── index.rst │ │ │ ├── problem_formulation.rst │ │ │ └── quantifying_informativeness.rst │ └── utilities │ │ └── index.rst └── make.bat ├── kxy ├── __init__.py ├── api │ ├── __init__.py │ ├── client.py │ ├── data_transfer.py │ ├── decorators.py │ └── utils.py ├── billing │ ├── __init__.py │ └── billing_details.py ├── examples │ ├── autogluon_compression.ipynb │ ├── feature_selection_benchmark.py │ ├── feature_selection_example.py │ ├── lightgbm_model_compression.ipynb │ ├── numerai_example.py │ ├── random_forest_model_compression.ipynb │ └── xgboost_model_compression.ipynb ├── finance │ ├── __init__.py │ └── corr.py ├── learning │ ├── __init__.py │ ├── base_learners.py │ ├── leanml_predictor.py │ ├── pytorch_early_termination.py │ └── tensorflow_early_termination.py ├── misc │ ├── __init__.py │ ├── boruta.py │ ├── exceptions.py │ 
├── mind.py │ ├── naive.py │ ├── predictors.py │ ├── rfe.py │ └── tf │ │ ├── __init__.py │ │ ├── config.py │ │ ├── generators.py │ │ ├── initializers.py │ │ ├── layers.py │ │ ├── learners.py │ │ ├── losses.py │ │ ├── models.py │ │ └── ops.py ├── pandas_extension │ ├── __init__.py │ ├── accessor.py │ ├── base_accessor.py │ ├── features_accessor.py │ ├── features_utils.py │ ├── finance_accessor.py │ ├── learning_accessor.py │ ├── post_learning_accessor.py │ └── pre_learning_accessor.py ├── pfs │ ├── __init__.py │ ├── pfs_predictor.py │ └── pfs_selector.py ├── post_learning │ ├── __init__.py │ ├── improvability.py │ └── model_explanation.py └── pre_learning │ ├── __init__.py │ ├── achievable_performance.py │ └── variable_selection.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── test_boruta.py ├── test_data_valuation.py ├── test_features.py ├── test_finance.py ├── test_flow.py ├── test_learning.py ├── test_load_save_base_learners.py ├── test_load_save_predictors.py ├── test_misc.py ├── test_pca.py ├── test_pfs.py └── test_rfe.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # local files 132 | local/ 133 | lambdas/ 134 | learners/ 135 | 136 | .DS_Store 137 | UCI*/ 138 | 139 | *.csv 140 | *.pkl 141 | *.sav 142 | *.sav-* 143 | *.json 144 | *.parquet 145 | *.h5 146 | *.png 147 | local_*.py 148 | *do-not-commit* 149 | AutogluonModels* 150 | *-PFSPredictor 151 | *-PCAPredictor 152 | *-LeanMLPredictor 153 | *-NaivePredictor -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | # Change Log 3 | 4 | ## v.1.4.10 Changes 5 | 6 | * Added a function to construct features derived from PFS mutual information estimation that should be expected to be linearly related to the target. 7 | * Fixed a global name conflict in `kxy.learning.base_learners`. 8 | 9 | 10 | ## v.1.4.9 Changes 11 | 12 | * Changed the activation function used by PFS from ReLU to swish/SiLU. 13 | * Left it to the user to set the logging level. 14 | 15 | 16 | ## v.1.4.8 Changes 17 | 18 | * Froze the versions of all Python packages in the Dockerfile. 19 | 20 | 21 | ## v.1.4.7 Changes 22 | 23 | Changes related to optimizing Principal Feature Selection. 24 | 25 | * Made it easy to change PFS' default learning parameters. 26 | * Changed PFS' default learning parameters (the learning rate is now 0.005 and Adam's epsilon 1e-04). 27 | * Added a seed parameter to PFS' fit for reproducibility. 28 | 29 | To globally change the learning rate to 0.003, change Adam's epsilon to 1e-5, and the number of epochs to 25, do 30 | 31 | ```Python 32 | from kxy.misc.tf import set_default_parameter 33 | set_default_parameter('lr', 0.003) 34 | set_default_parameter('epsilon', 1e-5) 35 | set_default_parameter('epochs', 25) 36 | ``` 37 | 38 | To change the number of epochs for a single iteration of PFS, use the `epochs` argument of the `fit` method of your `PFS` object. The `fit` method now also has a `seed` parameter you may use to make the PFS implementation deterministic. 39 | 40 | Example: 41 | ```Python 42 | from kxy.pfs import PFS 43 | selector = PFS() 44 | selector.fit(x, y, epochs=25, seed=123) 45 | ``` 46 | 47 | Alternatively, you may also use the `kxy.misc.tf.set_seed` method to make PFS deterministic. 48 | 49 | 50 | ## v.1.4.6 Changes 51 | 52 | Minor PFS improvements. 53 | 54 | * Added more (robust) mutual information loss functions. 55 | * Exposed the learned total mutual information between principal features and the target as an attribute of PFS. 56 | * Exposed the number of epochs as a parameter of PFS' fit. -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: If you use this software, please cite it using these metadata.
3 | authors: 4 | - family-names: Kom Samo 5 | given-names: Yves-Laurent 6 | orcid: "https://orcid.org/0000-0003-2901-6930" 7 | title: "KXY: A Seamless API to 10x The Productivity of Machine Learning Engineers." 8 | version: 1.4.3 9 | date-released: "2021-10-12" 10 | abstract: KXY is a powerful serverless analysis toolkit that takes trial-and-error out of machine learning projects. 11 | url: "https://github.com/kxytechnologies/kxy-python" 12 | license: GPL-3.0 13 | 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VERSION = 1.4.11 2 | 3 | # Update the s3 bucket of the docs website 4 | deploy_docs: 5 | aws s3 sync docs/_build/html s3://www.kxy.ai/reference/ --acl public-read --metadata-directive REPLACE --cache-control max-age=86400 --profile kxy 6 | 7 | # Invalidate certain cached files in the cloudfront distribution 8 | refresh_web: 9 | aws cloudfront create-invalidation --distribution-id EJZS9SM07YXKX --paths $(PATHS) --profile kxy 10 | 11 | # Cut a PyPI release 12 | pypi_release: 13 | python setup.py sdist bdist_wheel 14 | twine check dist/* 15 | twine upload --skip-existing dist/* 16 | 17 | install: 18 | pip install . 19 | 20 | 21 | docker_release: 22 | docker build -t kxytechnologies/kxy:latest ./docker/kxy/ 23 | docker login --username drylnks && docker push kxytechnologies/kxy:latest 24 | 25 | 26 | docker_release_github: 27 | docker build -t ghcr.io/kxytechnologies/kxy-python:latest ./docker/kxy/ 28 | # echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:latest 29 | docker push ghcr.io/kxytechnologies/kxy-python:latest 30 | docker build -t ghcr.io/kxytechnologies/kxy-python:$(VERSION) ./docker/kxy/ 31 | # echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION) 32 | docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION) 33 | 34 | 35 | one_shot_release: 36 | make clean 37 | make html 38 | make deploy_docs 39 | make refresh_web PATHS=/reference/* 40 | make docker_release 41 | 42 | 43 | update_docs: 44 | make clean 45 | make html 46 | make deploy_docs 47 | make refresh_web PATHS=/reference/* 48 | 49 | 50 | github_release: 51 | gh release create v$(VERSION) -F CHANGELOG.md 52 | 53 | 54 | package_release: 55 | make pypi_release 56 | make github_release 57 | timeout 5 58 | make docker_release_github 59 | make docker_release 60 | 61 | 62 | osr: 63 | make one_shot_release 64 | 65 | 66 | # Route any other make target to Sphinx 67 | # You can set these variables from the command line, and also 68 | # from the environment for the first two. 69 | SPHINXOPTS ?= 70 | SPHINXBUILD ?= sphinx-build 71 | SOURCEDIR = docs 72 | BUILDDIR = docs/_build 73 | 74 | help: 75 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 76 | 77 | .PHONY: help Makefile 78 | 79 | # Catch-all target: route all unknown targets to Sphinx using the new 80 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 81 | %: Makefile 82 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
4 | 5 | ----------------- 6 | 7 | # Boosting The Productivity of Machine Learning Engineers 8 | [![License](https://img.shields.io/badge/license-GPLv3%2B-blue)](https://github.com/kxytechnologies/kxy-python/blob/master/LICENSE) 9 | [![PyPI Latest Release](https://img.shields.io/pypi/v/kxy.svg)](https://www.kxy.ai/) 10 | [![Downloads](https://pepy.tech/badge/kxy)](https://www.kxy.ai/) 11 | 12 | 13 | ## Documentation 14 | https://www.kxy.ai/reference/ 15 | 16 | ## Blog 17 | https://blog.kxy.ai 18 | 19 | 20 | ## Installation 21 | From PyPI: 22 | ```Bash 23 | pip install kxy -U 24 | ``` 25 | From GitHub: 26 | ```Bash 27 | git clone https://github.com/kxytechnologies/kxy-python.git && cd ./kxy-python && pip install . 28 | ``` 29 | ## Authentication 30 | All heavy-duty computations are run on our serverless infrastructure and require an API key. To configure the package with your API key, run 31 | ```Bash 32 | kxy configure 33 | ``` 34 | and follow the instructions. To get your own API key you need an account; you can sign up [here](https://www.kxy.ai/signup/). You'll then be automatically given an API key, which you can find [here](https://www.kxy.ai/portal/profile/identity/). 35 | 36 | 37 | ## Docker 38 | The Docker image [kxytechnologies/kxy](https://hub.docker.com/repository/docker/kxytechnologies/kxy) has been built for your convenience, and comes with Anaconda, auto-sklearn, and the kxy package. 39 | 40 | To start a Jupyter Notebook server from a sandboxed Docker environment, run 41 | ```Bash 42 | docker run -i -t -p 5555:8888 kxytechnologies/kxy:latest /bin/bash -c "kxy configure <YOUR_API_KEY> && /opt/conda/bin/jupyter notebook --notebook-dir=/opt/notebooks --ip='*' --port=8888 --no-browser --allow-root --NotebookApp.token=''" 43 | ``` 44 | where you should replace `<YOUR_API_KEY>` with your API key, and navigate to [http://localhost:5555](http://localhost:5555) in your browser. This Docker environment comes with [all examples available on the documentation website](https://www.kxy.ai/reference/latest/examples/). 45 | 46 | To start a Jupyter Notebook server from an existing directory of notebooks, run 47 | ```Bash 48 | docker run -i -t --mount src=<LOCAL_NOTEBOOK_FOLDER>,target=/opt/notebooks,type=bind -p 5555:8888 kxytechnologies/kxy:latest /bin/bash -c "kxy configure <YOUR_API_KEY> && /opt/conda/bin/jupyter notebook --notebook-dir=/opt/notebooks --ip='*' --port=8888 --no-browser --allow-root --NotebookApp.token=''" 49 | ``` 50 | where you should replace `<LOCAL_NOTEBOOK_FOLDER>` with the path to your local notebook folder, and navigate to [http://localhost:5555](http://localhost:5555) in your browser. 51 | 52 | You can also get the same Docker image from GitHub [here](https://github.com/kxytechnologies/kxy-python/pkgs/container/kxy-python). 53 | 54 | ## Other Programming Languages 55 | We plan to release friendly API clients in more programming languages. 56 | 57 | In the meantime, you can directly issue requests to our [RESTful API](https://www.kxy.ai/reference/latest/api/index.html) using your favorite programming language. 58 | 59 | ## Pricing 60 | All API keys are given a free quota (a few dozen backend tasks) that should be enough to try out the package and see if you love it. Beyond the free quota you will be billed a small fee per task. 61 | 62 | KXY is free for academic use; simply sign up with your university email. 63 | 64 | KXY is also free for Kaggle competitions; sign up and email kaggle@kxy.ai to get a promotional code.
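
## Quick Example
The snippet below is a minimal sketch of a typical workflow; the CSV file and `'target'` column are hypothetical placeholders, and both functions are documented on the cheatsheet page of the documentation website.
```Python
import pandas as pd
import kxy  # Adds the df.kxy accessor to pandas dataframes.

# Hypothetical dataset with a column named 'target' to predict.
df = pd.read_csv('my_data.csv')

# Data valuation: the best performance achievable with these explanatory variables.
print(df.kxy.data_valuation('target', problem_type='regression'))

# Model-free variable selection: which explanatory variables are worth keeping.
print(df.kxy.variable_selection('target', problem_type='regression'))
```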
65 | -------------------------------------------------------------------------------- /bin/kxy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | if os.environ.get('LC_CTYPE', '') == 'UTF-8': 5 | os.environ['LC_CTYPE'] = 'en_US.UTF-8' 6 | 7 | import json 8 | 9 | def main(api_key=None): 10 | home = os.path.expanduser("~") 11 | path = os.path.join(home, '.kxy') 12 | os.makedirs(path, exist_ok=True) 13 | file_name = os.path.join(path, 'config') 14 | 15 | if not os.path.exists(file_name): 16 | with open(file_name, 'w') as f: 17 | json.dump({}, f) 18 | 19 | with open(file_name, 'r') as f: 20 | config = json.load(f) 21 | existing_key = config.get('KXY_API_KEY', '') 22 | 23 | if existing_key != '': 24 | existing_key = '(' + existing_key[:4] + '*' * (len(existing_key)-4) + ') ' 25 | 26 | if api_key is None: 27 | api_key = input('KXY API Key: %s' % existing_key) 28 | if api_key is None or api_key == '': 29 | api_key = config.get('KXY_API_KEY', '') 30 | 31 | config['KXY_API_KEY'] = api_key 32 | 33 | with open(file_name, 'w') as f: 34 | json.dump(config, f) 35 | 36 | return 37 | 38 | 39 | if __name__ == '__main__': 40 | if len(sys.argv) > 1 and sys.argv[1] == 'configure': 41 | api_key = sys.argv[2] if len(sys.argv) > 2 else None 42 | sys.exit(main(api_key=api_key)) -------------------------------------------------------------------------------- /docker/kxy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda3 2 | 3 | RUN apt-get update 4 | RUN apt-get install build-essential -y 5 | RUN apt-get install swig -y 6 | 7 | RUN /opt/conda/bin/conda install gxx_linux-64 gcc_linux-64 8 | RUN /opt/conda/bin/conda install jupyter -y --quiet 9 | RUN pip install --upgrade pip 10 | RUN pip install pyarrow==7.0.0 11 | RUN pip install fastparquet==0.8.0 12 | RUN pip install emcee==3.1.1 scikit-optimize==0.9.0 pyDOE==0.3.8 13 | RUN pip install auto-sklearn==0.14.6 14 | 15 | # Install other ML open source librairies 16 | RUN pip install xgboost==1.5.2 17 | RUN pip install lightgbm==3.3.2 18 | RUN pip install tensorflow==2.8.0 19 | RUN pip install tensorflow_probability==0.16.0 20 | RUN pip install botocore==1.24.27 21 | RUN pip install boto3==1.21.27 22 | RUN pip install tqdm==4.62.3 23 | 24 | # Install kxy 25 | RUN pip install kxy==1.4.10 26 | 27 | # Copy examples into the Notebooks folder 28 | RUN git clone https://github.com/kxytechnologies/kxy-python.git /opt/kxy-python 29 | RUN mkdir /opt/notebooks 30 | RUN cp -R /opt/kxy-python/docs/latest/applications/case_studies/* /opt/notebooks/ 31 | RUN cp -R /opt/kxy-python/docs/latest/applications/illustrations/* /opt/notebooks/ -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | 18 | # Catch-all target: route all unknown targets to Sphinx using the new 19 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
20 | %: Makefile 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/_static/kxy.css: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | .rst-content .admonition-summary .admonition-title { 5 | background: #2c3d5e; 6 | } 7 | 8 | .rst-content .admonition-summary .admonition-title:before { 9 | content: "\f0c6"; 10 | } 11 | 12 | .rst-content .admonition-theoretical-foundation .admonition-title { 13 | background: #2c3d5e; 14 | } 15 | 16 | .rst-content .admonition-theoretical-foundation .admonition-title:before { 17 | content: "\f140"; 18 | } 19 | 20 | 21 | .rst-content .note .admonition-title { 22 | background: #2c3d5e; 23 | } 24 | 25 | .rst-content .note .admonition-title:before { 26 | content: "\f0c6"; 27 | } 28 | 29 | .rst-content .seealso .admonition-title { 30 | background: #343131; 31 | } 32 | 33 | .rst-content .seealso .admonition-title:before { 34 | content: "\f140"; 35 | } 36 | 37 | .rst-content .admonition, .rst-content .admonition-title { 38 | border-radius: 15px; 39 | } 40 | 41 | .rst-content .important .admonition-title:before { 42 | content: "\f071"; 43 | } 44 | 45 | .rst-content .important .admonition-title { 46 | background: #ffa98f; 47 | } 48 | 49 | .rst-content .admonition-important-equation .admonition-title { 50 | background: #1abc9c; 51 | } 52 | 53 | .rst-content .admonition-important-equation { 54 | background: #eeffcc; 55 | } 56 | 57 | .rst-content .important { 58 | background: #ffe4dc; 59 | } 60 | 61 | .underline { 62 | text-decoration: underline; 63 | } 64 | 65 | .wy-menu .caption-text { 66 | color: #02e966; 67 | } 68 | 69 | .rst-content .admonition-properties .admonition-title:before, 70 | .rst-content .admonition-property .admonition-title:before, 71 | .rst-content .admonition-important-equation .admonition-title:before { 72 | content: "\f08d"; 73 | } 74 | 75 | .rst-content .admonition-properties { 76 | counter-reset: properties; 77 | 78 | } 79 | 80 | .rst-content .admonition-properties ol>li { 81 | list-style-type: none; 82 | } 83 | 84 | .rst-content .admonition-properties ol>li:before { 85 | counter-increment: properties; 86 | content: "P" counter(properties) ".\00a0\00a0"; 87 | font-weight: bold; 88 | } 89 | 90 | span.eqno { 91 | float: right; 92 | } 93 | 94 | 95 | .rst-content .footnote-reference, .rst-content .citation-reference { 96 | top: 0; 97 | } 98 | 99 | 100 | .math .eqno a.headerlink { 101 | visibility: hidden; 102 | } 103 | 104 | 105 | mark.kxy-blue{ 106 | color: #2C3960; 107 | background: rgba(2,233,102, 0.3); 108 | border-radius: 2px; 109 | font-weight: normal; 110 | } 111 | 112 | i { 113 | font-style: italic; 114 | } 115 | 116 | 117 | @media only screen 118 | and (max-device-width : 767px) { 119 | .wy-nav-top { 120 | background: #2C3960; 121 | } 122 | } -------------------------------------------------------------------------------- /docs/_static/matomo.js: -------------------------------------------------------------------------------- 1 | 2 | var _paq = window._paq || []; 3 | /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ 4 | _paq.push(['trackPageView']); 5 | _paq.push(['enableLinkTracking']); 6 | (function() { 7 | var u="https://kxyai.matomo.cloud/"; 8 | _paq.push(['setTrackerUrl', u+'matomo.php']); 9 | _paq.push(['setSiteId', '1']); 10 | var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; 11 | 
g.type='text/javascript'; g.async=true; g.defer=true; g.src='//cdn.matomo.cloud/kxyai.matomo.cloud/matomo.js'; s.parentNode.insertBefore(g,s); 12 | })(); 13 | -------------------------------------------------------------------------------- /docs/_static/theme_override.css: -------------------------------------------------------------------------------- 1 | 2 | @media screen and (min-width: 1500px) { 3 | .wy-nav-content { 4 | max-width: calc(100vw - 600px); 5 | } 6 | } 7 | 8 | @media screen and (max-width: 768px) { 9 | .wy-nav-content-wrap .wy-nav-content { 10 | max-width: 100%; 11 | } 12 | } -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% set css_files = css_files + ["_static/kxy.css", "_static/theme_override.css"] %} 3 | {% set script_files = script_files + ["_static/matomo.js"] %} -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'KXY (Lean AutoML, As A Service)' 21 | copyright = '2021, KXY Technologies, Inc' 22 | author = 'Dr. Yves-Laurent Kom Samo' 23 | version = 'latest' 24 | autodoc_inherit_docstrings = False 25 | 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon', \ 33 | 'sphinx.ext.todo', 'sphinx.ext.githubpages', 'sphinxcontrib.bibtex', \ 34 | 'sphinx.ext.mathjax', 'sphinx.ext.autosectionlabel', 'nbsphinx', \ 35 | 'sphinx_copybutton', 'sphinxcontrib.googleanalytics', 'sphinx_sitemap', \ 36 | 'sphinxcontrib.httpdomain'] 37 | 38 | # imgmath_image_format = 'svg' 39 | # imgmath_font_size = 13 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 48 | 49 | 50 | # -- Options for HTML output ------------------------------------------------- 51 | 52 | # The theme to use for HTML and HTML Help pages. See the documentation for 53 | # a list of builtin themes. 54 | # 55 | html_context = { 56 | # Enable the "Edit in GitHub" link within the header of each page.
57 | 'display_github': True, 58 | # Set the following variables to generate the resulting github URL for each page. 59 | # Format Template: https://{{ github_host|default("github.com") }}/{{ github_user }}/{{ github_repo }}/blob/{{ github_version }}{{ conf_py_path }}{{ pagename }}{{ suffix }} 60 | 'github_user': 'kxytechnologies', 61 | 'github_repo': 'kxy-python', 62 | 'github_version': 'master/docs/' 63 | } 64 | 65 | html_theme = 'sphinx_rtd_theme' 66 | html_logo = 'images/logo.png' 67 | html_favicon = 'images/favicon.png' 68 | html_theme_options = {'logo_only': True, 'style_nav_header_background': '#2c3d5e'} 69 | 70 | # Add any paths that contain custom static files (such as style sheets) here, 71 | # relative to this directory. They are copied after the builtin static files, 72 | # so a file named "default.css" will overwrite the builtin "default.css". 73 | html_static_path = ['_static'] 74 | 75 | # Notebook 76 | nbsphinx_execute = 'never' 77 | nbsphinx_allow_errors = True 78 | nbsphinx_input_prompt = 'In [%s]:' 79 | nbsphinx_output_prompt = 'Out[%s]:' 80 | source_suffix = ['.rst', '.md', '.ipynb'] 81 | 82 | # Google Analytics 83 | googleanalytics_id = 'UA-167632834-2' 84 | googleanalytics_enabled = True 85 | 86 | 87 | # Sitemap 88 | html_baseurl = 'https://www.kxy.ai/reference/' 89 | html_title = 'The KXY Platform: Lean AutoML, As A Service' 90 | # html_extra_path = ['robots.txt'] 91 | 92 | -------------------------------------------------------------------------------- /docs/images/bn_importance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/bn_importance.png -------------------------------------------------------------------------------- /docs/images/bn_incremental_importance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/bn_incremental_importance.png -------------------------------------------------------------------------------- /docs/images/bn_separability.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/bn_separability.png -------------------------------------------------------------------------------- /docs/images/classification_accuracy_frontier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/classification_accuracy_frontier.png -------------------------------------------------------------------------------- /docs/images/classification_accuracy_frontier_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/classification_accuracy_frontier_2.png -------------------------------------------------------------------------------- /docs/images/entropy_venn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/entropy_venn.png 
-------------------------------------------------------------------------------- /docs/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/favicon.png -------------------------------------------------------------------------------- /docs/images/gm_separability.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/gm_separability.png -------------------------------------------------------------------------------- /docs/images/gm_separability_mov.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/gm_separability_mov.gif -------------------------------------------------------------------------------- /docs/images/incremental_input_importance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/incremental_input_importance.png -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/docs/images/logo.png -------------------------------------------------------------------------------- /docs/images/logo.svg: -------------------------------------------------------------------------------- (SVG markup stripped during text extraction; see the file in the repository) -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: Documentation of the KXY Lean AutoML platform. 3 | :keywords: AutoML, Lean AutoML, KXY AutoML, Pre-Learning, Post-Learning 4 | :http-equiv=content-language: en 5 | 6 | 7 | 8 | A Powerful Serverless Analysis Toolkit That Takes *Trial And Error* Out of Machine Learning Projects 9 | ==================================================================================================== 10 | .. image:: https://img.shields.io/badge/license-GPLv3%2B-blue 11 | :alt: License 12 | :target: https://www.gnu.org/licenses/gpl-3.0.en.html 13 | .. image:: https://img.shields.io/pypi/v/kxy.svg 14 | :alt: PyPI Latest Release 15 | :target: https://pypi.org/project/kxy/ 16 | .. image:: https://pepy.tech/badge/kxy 17 | :alt: Downloads 18 | :target: https://github.com/kxytechnologies/kxy-python/ 19 | 20 | 21 | ============== 22 | Get An API Key 23 | ============== 24 | To get an API key, simply open an account with us `here <https://www.kxy.ai/signup/>`_. As soon as you have an account, you may retrieve your API key `here <https://www.kxy.ai/portal/profile/identity/>`_. 25 | 26 | 27 | ============================================================ 28 | Boost The Productivity Of Your ML Teams Tenfold With Lean ML 29 | ============================================================ 30 | The :code:`kxy` package utilizes information theory to take *trial and error* out of machine learning projects.
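For illustration, assuming a Pandas dataframe :code:`df` and a target column named :code:`'y'` (both hypothetical placeholders), the analyses described below are one-liners:

.. code-block:: python

    import pandas as pd
    import kxy  # Adds the df.kxy accessor to Pandas dataframes.

    # Project feasibility: the best performance achievable with this dataset.
    df.kxy.data_valuation('y', problem_type='regression')

    # Model-free feature selection: which explanatory variables are worth keeping.
    df.kxy.variable_selection('y', problem_type='regression')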
31 | 32 | ------------------- 33 | Project Feasibility 34 | ------------------- 35 | From the get-go, the **data valuation** analysis of the :code:`kxy` package tells data scientists whether their datasets are sufficiently informative to achieve a desired performance level (e.g. :math:`R^2`, RMSE, maximum log-likelihood, or classification error) in a classification or regression problem and, if so, the best performance that can be achieved using said datasets. *Only spend time and compute resources on a project once you know it can yield the desired business impact*. 36 | 37 | ---------------------------------------- 38 | Automatic (Model-Free) Feature Selection 39 | ---------------------------------------- 40 | The **model-free variable selection** analysis provided by the :code:`kxy` package allows data scientists to train smaller models, faster, cheaper, and to achieve a higher performance than throwing all inputs into a big model or proceeding by trial-and-error. 41 | 42 | 43 | --------------------------------------- 44 | Production Model Improvability Analyses 45 | --------------------------------------- 46 | **Model-Driven Improvability:** Once a model has been trained, the :code:`kxy` *model-driven improvability* analysis quantifies the extent to which the trained model can be improved without resorting to additional features. This allows data scientists to focus their modeling efforts on high-ROI initiatives. *Only throw the might of your ML team and platform at improving the fit of your production model when you know it can be improved. Never again will you spend weeks, if not months, and thousands of dollars in cloud compute, implementing the latest models on specialized hardware to improve your production model, only to find out its fit cannot be improved*. 47 | 48 | **Data-Driven Improvability:** Once the fit of a production model is optimal (i.e. it has successfully extracted all the value in using a given set of features to predict the label), the :code:`kxy` *data-driven improvability* analysis allows data scientists to quickly quantify the performance increase (e.g. :math:`R^2`, RMSE, maximum log-likelihood, or classification error) that a new dataset may bring about. *Only retrain models with additional features when you know they can bring about a meaningful performance boost*. 49 | 50 | 51 | ------------------------------------------------------ 52 | Reducing Time and Resources Spent on Overfitted Models 53 | ------------------------------------------------------ 54 | We provide callbacks for the major Python machine learning libraries that terminate training when the running best performance seems unrealistic (i.e. far exceeds the theoretical-best achievable). Our callbacks save time and compute resources on models that can reliably be determined to overfit once fully trained, well before training ends. This is a cost-effective alternative to cross-validation. 55 | 56 | 57 | 58 | 59 | 60 | .. toctree:: 61 | :hidden: 62 | :caption: QUICKSTART 63 | 64 | latest/quickstart/getting_started 65 | 66 | 67 | .. toctree:: 68 | :hidden: 69 | :caption: ILLUSTRATIONS 70 | 71 | latest/applications/cheat_sheet/index 72 | 73 | latest/applications/illustrations/index 74 | 75 | latest/applications/case_studies/index 76 | 77 | 78 | .. toctree:: 79 | :hidden: 80 | :caption: THEORETICAL FOUNDATION 81 | 82 | latest/theoretical_foundation/memoryless/index 83 | 84 | latest/theoretical_foundation/memoryful/index 85 | 86 | 87 | ..
toctree:: 88 | :hidden: 89 | :caption: PYTHON CODE DOCUMENTATION 90 | 91 | latest/data_valuation/index 92 | 93 | latest/model_free_variable_selection/index 94 | 95 | latest/model_wrapped_feature_selection/index 96 | 97 | latest/learning/index 98 | 99 | latest/model_explanation/index 100 | 101 | latest/model_improvability/index 102 | 103 | 104 | .. toctree:: 105 | :hidden: 106 | :caption: MISCELLANEOUS 107 | 108 | latest/data_transfer/index 109 | 110 | latest/pandas/index 111 | 112 | 113 | .. toctree:: 114 | :hidden: 115 | :caption: OTHER LANGUAGES 116 | 117 | latest/api/index 118 | 119 | 120 | .. toctree:: 121 | :hidden: 122 | :caption: INDEX 123 | 124 | latest/index/index 125 | -------------------------------------------------------------------------------- /docs/latest/applications/case_studies/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: Case studies illustrating how the KXY platform may help customers. 3 | :keywords: KXY Tutorials, KXY Case Studies. 4 | :http-equiv=content-language: en 5 | 6 | 7 | 8 | 9 | ------------ 10 | Case Studies 11 | ------------ 12 | On this page, we illustrate what KXY can do for you. 13 | 14 | 15 | Model Compression 16 | ----------------- 17 | * `LightGBM, XGBoost or Random Forest: Same Performance With 80% Fewer Features `_ 18 | 19 | 20 | Data Valuation 21 | -------------- 22 | 23 | * :ref:`The KXY Data Valuation Function Works (Regression)` 24 | * :ref:`The KXY Data Valuation Function Works (Classification)` 25 | 26 | Model-Free Feature Selection 27 | ---------------------------- 28 | * :ref:`Automatically Pruning Redundant Features With KXY` 29 | * :ref:`Detecting Features That Are Only Useful In Conjunction With Others` 30 | 31 | 32 | Better Fraud & Attack Detection 33 | ------------------------------- 34 | * :ref:`Better Solving Heavily Unbalanced Classification Problems With KXY` 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /docs/latest/applications/cheat_sheet/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: Description of KXY's main functions, and how to access them in Python. 3 | :keywords: KXY Tutorials, KXY Cheatsheet. 4 | :http-equiv=content-language: en 5 | 6 | 7 | 8 | ---------- 9 | Cheatsheet 10 | ---------- 11 | 12 | Imports 13 | ------- 14 | 15 | .. code-block:: python 16 | 17 | import pandas as pd 18 | import kxy 19 | 20 | From now on, :code:`df` refers to a Pandas dataframe object and :code:`y_column` is the column of :code:`df` to be used as target. All columns in :code:`df` but :code:`y_column` are treated as explanatory variables. :code:`problem_type` is a variable taking value :code:`'regression'` for regression problems and :code:`'classification'` for classification problems. 21 | 22 | Data Valuation 23 | -------------- 24 | 25 | .. code-block:: python 26 | 27 | df.kxy.data_valuation(y_column, problem_type=problem_type) 28 | 29 | 30 | By default, your data is transmitted to our backend in clear. To anonymize your data before performing data valuation, simply set :code:`anonymize=True`. 31 | 32 | .. code-block:: python 33 | 34 | df.kxy.data_valuation(y_column, problem_type=problem_type, anonymize=True) # Data valuation using anonymized data. 35 | 36 | 37 | 38 | Automatic (Model-Free) Feature Selection 39 | ---------------------------------------- 40 | 41 | ..
code-block:: python 42 | 43 | df.kxy.variable_selection(y_column, problem_type=problem_type) 44 | 45 | By default, your data is transmitted to our backend in clear. To anonymize your data before performing automatic feature selection, simply set :code:`anonymize=True`. 46 | 47 | .. code-block:: python 48 | 49 | df.kxy.variable_selection(y_column, problem_type=problem_type, anonymize=True) # Variable selection using anonymized data. 50 | 51 | 52 | 53 | Model Compression 54 | ----------------- 55 | Here's how to wrap feature selection around LightGBM in Python. 56 | 57 | .. code-block:: python 58 | 59 | from kxy.learning import get_lightgbm_learner_learning_api 60 | 61 | params = { 62 | 'objective': 'rmse', 63 | 'boosting_type': 'gbdt', 64 | 'num_leaves': 100, 65 | 'n_jobs': -1, 66 | 'learning_rate': 0.1, 67 | 'verbose': -1, 68 | } 69 | learner_func = get_lightgbm_learner_learning_api(params, num_boost_round=10000, \ 70 | early_stopping_rounds=50, verbose_eval=50, feature_selection_method='leanml') 71 | results = df.kxy.fit(y_column, learner_func, problem_type=problem_type) 72 | 73 | # The trained model 74 | predictor = results['predictor'] 75 | 76 | # Feature columns selected 77 | selected_variables = predictor.selected_variables 78 | 79 | # To make predictions out of a dataframe of test data. 80 | predictions = predictor.predict(test_df) 81 | 82 | Parameters of :code:`get_lightgbm_learner_learning_api` should be the same as those of :code:`lightgbm.train`. See the `LightGBM documentation <https://lightgbm.readthedocs.io/>`_. 83 | 84 | 85 | Wrapping feature selection around another model in Python is identical except for :code:`learner_func`. Here's how to create :code:`learner_func` for other models. 86 | 87 | For XGBoost: 88 | 89 | .. code-block:: python 90 | 91 | from kxy.learning import get_xgboost_learner 92 | # Use 'xgboost.XGBClassifier' for classification problems. 93 | xgboost_learner_func = get_xgboost_learner('xgboost.XGBRegressor') 94 | 95 | 96 | Parameters of :code:`get_xgboost_learner` should be those you'd pass to instantiate :code:`xgboost.XGBRegressor` or :code:`xgboost.XGBClassifier`. See the `XGBoost documentation <https://xgboost.readthedocs.io/>`_. 97 | 98 | 99 | For Scikit-Learn models: 100 | 101 | .. code-block:: python 102 | 103 | from kxy.learning import get_sklearn_learner 104 | # Replace 'sklearn.ensemble.RandomForestRegressor' with the import path of the sklearn model you want to use. 105 | rf_learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', \ 106 | min_samples_split=0.01, max_samples=0.5, n_estimators=100) 107 | df.kxy.fit(y_column, rf_learner_func, problem_type=problem_type) 108 | 109 | 110 | Parameters of :code:`get_sklearn_learner` should be those you'd pass to instantiate the scikit-learn model. 111 | 112 | 113 | 114 | Model-Driven Improvability 115 | -------------------------- 116 | For the model-driven improvability analysis, predictions made by the production model should be contained in a column of the :code:`df`. The variable :code:`prediction_column` refers to said column. All columns in :code:`df` but :code:`y_column` and :code:`prediction_column` are considered to be the explanatory variables/features used to train the production model. 117 | 118 | 119 | ..
code-block:: python 120 | 121 | anonymize = False # Set to True to anonymize your data before model-driven improvability 122 | df.kxy.model_driven_improvability(y_column, prediction_column, problem_type=problem_type, anonymize=anonymize) 123 | 124 | 125 | 126 | Data-Driven Improvability 127 | ------------------------- 128 | For the data-driven improvability analysis, the list of columns representing new features/explanatory variables to consider (:code:`new_variables`) should be provided. All columns in :code:`df` that are neither :code:`y_column` nor contained in :code:`new_variables` are assumed to be the explanatory variables/features used to train the production model. 129 | 130 | 131 | .. code-block:: python 132 | 133 | anonymize = False # Set to True to anonymize your data before data-driven improvability 134 | df.kxy.data_driven_improvability(y_column, new_variables, problem_type=problem_type, anonymize=anonymize) 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /docs/latest/applications/illustrations/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: Application of KXY's analyses to popular machine learning datasets 3 | :keywords: KXY Tutorials, KXY Examples. 4 | :http-equiv=content-language: en 5 | 6 | 7 | 8 | ------------------------ 9 | Kaggle & UCI Experiments 10 | ------------------------ 11 | 12 | Below you'll find the results of applying KXY's data-valuation and model-free variable selection analyses to dozens of the most popular Kaggle and UCI regression and classification experiments. 13 | 14 | 15 | Kaggle Experiments 16 | ------------------ 17 | 18 | * :ref:`Heart Attack (Kaggle, Classification, n=303, d=13, 2 classes)` 19 | * :ref:`House Prices Advanced (Kaggle, Regression, n=1460, d=79)` 20 | * :ref:`Titanic (Kaggle, Classification, n=891, d=11, 2 classes)` 21 | * :ref:`Water Quality (Kaggle, Classification, n=3276, d=9, 2 classes)` 22 | 23 | 24 | 25 | UCI Experiments 26 | --------------- 27 | 28 | * :ref:`APS Failure (UCI, Classification, n=76000, d=170, 2 classes)` 29 | * :ref:`Abalone (UCI, Regression, n=4177, d=8)` 30 | * :ref:`Adult (UCI, Classification, n=48843, d=14, 3 classes)` 31 | * :ref:`Air Foil (UCI, Regression, n=1503, d=5)` 32 | * :ref:`Air Quality (UCI, Regression, n=8991, d=14)` 33 | * :ref:`Avila (UCI, Classification, n=20867, d=10, 12 classes)` 34 | * :ref:`Bank Marketing (UCI, Classification, n=41188, d=20, 2 classes)` 35 | * :ref:`Bank Note (UCI, Classification, n=1372, d=4, 2 classes)` 36 | * :ref:`Bike Sharing (UCI, Regression, n=17379, d=18)` 37 | * :ref:`Blog Feedback (UCI, Regression, n=60021, d=280)` 38 | * :ref:`Card Default (UCI, Classification, n=30000, d=23, 2 classes)` 39 | * :ref:`Concrete (UCI, Regression, n=1030, d=8)` 40 | * :ref:`CT Slices (UCI, Regression, n=53500, d=385)` 41 | * :ref:`Diabetic Retinopathy (UCI, Classification, n=1151, d=19, 2 classes)` 42 | * :ref:`EEG Eye State (UCI, Classification, n=14980, d=14, 2 classes)` 43 | * :ref:`Energy Efficiency (UCI, Regression, n=768, d=8)` 44 | * :ref:`Facebook Comments (UCI, Regression, n=209074, d=53)` 45 | * :ref:`Landsat (UCI, Classification, n=6435, d=36, 6 classes)` 46 | * :ref:`Letter Recognition (UCI, Classification, n=20000, d=16, 26 classes)` 47 | * :ref:`Magic Gamma (UCI, Classification, n=19020, d=10, 2 classes)` 48 | * :ref:`Naval Propulsion (UCI, Regression, n=11934, d=16)` 49 | * :ref:`Online News (UCI, Regression, n=39644, d=58)` 50 | *
:ref:`Parkinson (UCI, Regression, n=5875, d=20)` 51 | * :ref:`Power Plant (UCI, Regression, n=9568, d=4)` 52 | * :ref:`Real Estate (UCI, Regression, n=414, d=6)` 53 | * :ref:`Sensorless Drive (UCI, Classification, n=58509, d=48, 11 classes)` 54 | * :ref:`Shuttle (UCI, Classification, n=58000, d=9, 7 classes)` 55 | * :ref:`Skin Segmentation (UCI, Classification, n=245057, d=3, 2 classes)` 56 | * :ref:`Social Media Buzz (UCI, Regression, n=583250, d=77)` 57 | * :ref:`Superconductivity (UCI, Regression, n=21263, d=81)` 58 | * :ref:`White Wine Quality (UCI, Regression, n=4898, d=11)` 59 | * :ref:`Yacht Hydrodynamics (UCI, Regression, n=308, d=6)` 60 | * :ref:`Year Prediction MSD (UCI, Regression, n=515345, d=90)` 61 | -------------------------------------------------------------------------------- /docs/latest/applications/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: Examples and tutorials illustrating how the KXY AutoML platform works, and what can be done with it. 3 | :keywords: KXY Tutorials, KXY Examples. 4 | :http-equiv=content-language: en 5 | 6 | 7 | 8 | ---------- 9 | Cheatsheet 10 | ---------- 11 | 12 | Imports 13 | ------- 14 | 15 | .. code-block:: python 16 | 17 | import pandas as pd 18 | import kxy 19 | 20 | From now on, :code:`df` refers to a Pandas dataframe object and :code:`y_column` is the column of :code:`df` to be used as target. All columns in :code:`df` but :code:`y_column` are treated as explanatory variables. :code:`problem_type` is a variable taking value :code:`'regression'` for regression problems and :code:`'classification'` for classification problems. 21 | 22 | Data Valuation 23 | -------------- 24 | 25 | .. code-block:: python 26 | 27 | df.kxy.data_valuation(y_column, problem_type=problem_type) 28 | 29 | 30 | By default, your data is transmitted to our backend in clear. To anonymize your data before performing data valuation, simply set :code:`anonymize=True`. 31 | 32 | .. code-block:: python 33 | 34 | df.kxy.data_valuation(y_column, problem_type=problem_type, anonymize=True) # Data valuation using anonymized data. 35 | 36 | 37 | 38 | Automatic (Model-Free) Feature Selection 39 | ---------------------------------------- 40 | 41 | .. code-block:: python 42 | 43 | df.kxy.variable_selection(y_column, problem_type=problem_type) 44 | 45 | By default, your data is transmitted to our backend in clear. To anonymize your data before performing automatic feature selection, simply set :code:`anonymize=True`. 46 | 47 | .. code-block:: python 48 | 49 | df.kxy.variable_selection(y_column, problem_type=problem_type, anonymize=True) # Variable selection using anonymized data. 50 | 51 | 52 | 53 | 54 | Model-Driven Improvability 55 | -------------------------- 56 | For the model-driven improvability analysis, predictions made by the production model should be contained in a column of the :code:`df`. The variable :code:`prediction_column` refers to said column. All columns in :code:`df` but :code:`y_column` and :code:`prediction_column` are considered to be the explanatory variables/features used to train the production model. 57 | 58 | 59 | .. 
code-block:: python 60 | 61 | anonymize = False # Set to True to anonymize your data before model-driven improvability 62 | df.kxy.model_driven_improvability(y_column, prediction_column, problem_type=problem_type, anonymize=anonymize) 63 | 64 | 65 | 66 | Data-Driven Improvability 67 | ------------------------- 68 | For the data-driven improvability analysis, the list of columns representing new features/explanatory variables to consider (:code:`new_variables`) should be provided. All columns in :code:`df` that are neither :code:`y_column` nor contained in :code:`new_variables` are assumed to be the explanatory variables/features used to train the production model. 69 | 70 | 71 | .. code-block:: python 72 | 73 | anonymize = False # Set to True to anonymize your data before data-driven improvability 74 | df.kxy.data_driven_improvability(y_column, new_variables, problem_type=problem_type, anonymize=anonymize) 75 | 76 | 77 | 78 | ----------------------- 79 | Examples (Kaggle & UCI) 80 | ----------------------- 81 | 82 | * :ref:`APS Failure (UCI, Classification, n=76000, d=170, 2 classes)` 83 | * :ref:`Abalone (UCI, Regression, n=4177, d=8)` 84 | * :ref:`Adult (UCI, Classification, n=48843, d=14, 3 classes)` 85 | * :ref:`Air Foil (UCI, Regression, n=1503, d=5)` 86 | * :ref:`Air Quality (UCI, Regression, n=8991, d=14)` 87 | * :ref:`Avila (UCI, Classification, n=20867, d=10, 12 classes)` 88 | * :ref:`Bank Marketing (UCI, Classification, n=41188, d=20, 2 classes)` 89 | * :ref:`Bank Note (UCI, Classification, n=1372, d=4, 2 classes)` 90 | * :ref:`Bike Sharing (UCI, Regression, n=17379, d=18)` 91 | * :ref:`Blog Feedback (UCI, Regression, n=60021, d=280)` 92 | * :ref:`CT Slices (UCI, Regression, n=53500, d=385)` 93 | * :ref:`Card Default (UCI, Classification, n=30000, d=23, 2 classes)` 94 | * :ref:`Concrete (UCI, Regression, n=1030, d=8)` 95 | * :ref:`Diabetic Retinopathy (UCI, Classification, n=1151, d=19, 2 classes)` 96 | * :ref:`EEG Eye State (UCI, Classification, n=14980, d=14, 2 classes)` 97 | * :ref:`Energy Efficiency (UCI, Regression, n=768, d=8)` 98 | * :ref:`Facebook Comments (UCI, Regression, n=209074, d=53)` 99 | * :ref:`Heart Attack (Kaggle, Classification, n=303, d=13, 2 classes)` 100 | * :ref:`Heart Disease (Kaggle, Classification, n=303, d=13, 2 classes)` 101 | * :ref:`House Prices Advanced (Kaggle, Regression, n=1460, d=79)` 102 | * :ref:`Landsat (UCI, Classification, n=6435, d=36, 6 classes)` 103 | * :ref:`Letter Recognition (UCI, Classification, n=20000, d=16, 26 classes)` 104 | * :ref:`Magic Gamma (UCI, Classification, n=19020, d=10, 2 classes)` 105 | * :ref:`Naval Propulsion (UCI, Regression, n=11934, d=16)` 106 | * :ref:`Online News (UCI, Regression, n=39644, d=58)` 107 | * :ref:`Parkinson (UCI, Regression, n=5875, d=20)` 108 | * :ref:`Power Plant (UCI, Regression, n=9568, d=4)` 109 | * :ref:`Real Estate (UCI, Regression, n=414, d=6)` 110 | * :ref:`Sensor Less Drive (UCI, Classification, n=58509, d=48, 11 classes)` 111 | * :ref:`Shuttle (UCI, Classification, n=58000, d=9, 7 classes)` 112 | * :ref:`Skin Segmentation (UCI, Classification, n=245057, d=3, 2 classes)` 113 | * :ref:`Social Media Buzz (UCI, Regression, n=583250, d=77)` 114 | * :ref:`Superconductivity (UCI, Regression, n=21263, d=81)` 115 | * :ref:`Titanic (Kaggle, Classification, n=891, d=11, 2 classes)` 116 | * :ref:`Water Quality (Kaggle, Classification, n=3276, d=9, 2 classes)` 117 | * :ref:`White Wine Quality (UCI, Regression, n=4898, d=11)` 118 | * :ref:`Yacht Hydrodynamics (UCI, Regression, n=308, d=6)` 119 | *
:ref:`Year Prediction MSD (UCI, Regression, n=515345, d=90)` 120 | 121 | 122 | 123 | ------------ 124 | Case Studies 125 | ------------ 126 | 127 | * :ref:`Evaluating KXY's Data Valuation Function (Classification)` 128 | * :ref:`Evaluating KXY's Data Valuation Function (Regression)` 129 | * :ref:`Automatically Pruning Redundant Features With KXY` 130 | * :ref:`Detecting Features That Are Only Useful In Conjunction With Others` 131 | * :ref:`Better Solving Heavily Unbalanced Classification Problems With KXY` 132 | 133 | 134 | 135 | Classification 136 | -------------- 137 | 138 | * :ref:`Toy Visual Classification Example` 139 | * :ref:`Classification Problem With Some Useless Variables` 140 | * :ref:`Complex Classification Example` 141 | 142 | 143 | Regression 144 | ---------- 145 | * :ref:`Toy 1D Regression Examples` 146 | * :ref:`Toy Multivariate Regression Examples` 147 | * :ref:`Regression Problem With Some Useless Variables` 148 | * :ref:`Complex Regression Example` 149 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /docs/latest/data_transfer/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: How we use your data. 3 | :keywords: Pandas Dataframe, Lean ML, KXY. 4 | :http-equiv=content-language: en 5 | 6 | ========= 7 | Your Data 8 | ========= 9 | 10 | How We Use Your Data 11 | -------------------- 12 | 13 | .. automodule:: kxy.api.data_transfer 14 | :members: 15 | 16 | 17 | Anonymizing Your Data 18 | --------------------- 19 | Fortunately, our analyses are invariant under various transformations that can completely anonymize your data. 20 | 21 | You may simply run :code:`df_anonymized = df.kxy.anonymize()` on any dataframe :code:`df` to anonymize it, and work with :code:`df_anonymized` instead of :code:`df`. 22 | 23 | Check out the function below for more information on how we anonymize your data. 24 | 25 | .. automethod:: kxy.pandas_extension.base_accessor.BaseAccessor.anonymize 26 | 27 | 28 | -------------------------------------------------------------------------------- /docs/latest/data_valuation/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: How to perform data valuation using the kxy Python package. 3 | :keywords: Data Valuation, Lean ML, AutoML, Lean Machine Learning, KXY. 4 | :http-equiv=content-language: en 5 | 6 | 7 | ============== 8 | Data Valuation 9 | ============== 10 | 11 | .. automodule:: kxy.pre_learning.achievable_performance 12 | :members: 13 | -------------------------------------------------------------------------------- /docs/latest/index/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. meta:: 3 | :http-equiv=content-language: en 4 | 5 | Indices and tables 6 | ================== 7 | 8 | * :ref:`genindex` 9 | * :ref:`modindex` 10 | * :ref:`search` 11 | -------------------------------------------------------------------------------- /docs/latest/learning/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: How to use the kxy Python package to terminate training as soon as overfitting can be reliably determined. 3 | :keywords: Early-Termination, Overfitting Mitigation.
27 | 28 | -------------------------------------------------------------------------------- /docs/latest/data_valuation/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: How to perform data valuation using the kxy Python package. 3 | :keywords: Data Valuation, Lean ML, AutoML, Lean Machine Learning, KXY. 4 | :http-equiv=content-language: en 5 | 6 | 7 | ============== 8 | Data Valuation 9 | ============== 10 | 11 | .. automodule:: kxy.pre_learning.achievable_performance 12 | :members: 13 | -------------------------------------------------------------------------------- /docs/latest/index/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. meta:: 3 | :http-equiv=content-language: en 4 | 5 | Indices and tables 6 | ================== 7 | 8 | * :ref:`genindex` 9 | * :ref:`modindex` 10 | * :ref:`search` 11 | -------------------------------------------------------------------------------- /docs/latest/learning/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: How to use the kxy Python package to terminate training as soon as overfitting can be reliably determined. 3 | :keywords: Early-Termination, Overfitting Mitigation. 4 | :http-equiv=content-language: en 5 | 6 | 7 | ================= 8 | Early Termination 9 | ================= 10 | Callbacks and event handlers used to terminate training as soon as the running loss becomes lower than the smallest loss that is theoretically achievable. 11 | 12 | 13 | Tensorflow v2 14 | ------------- 15 | 16 | .. automodule:: kxy.learning.tensorflow_early_termination 17 | :members: 18 | 19 | 20 | PyTorch 21 | ------- 22 | 23 | .. automodule:: kxy.learning.pytorch_early_termination 24 | :members: -------------------------------------------------------------------------------- /docs/latest/model_explanation/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: How to use the kxy Python package to explain a trained supervised learning model. 3 | :keywords: Model Explanation, Interpretable AI, Post-Learning, KXY. 4 | :http-equiv=content-language: en 5 | 6 | 7 | ================= 8 | Model Explanation 9 | ================= 10 | 11 | .. automodule:: kxy.post_learning.model_explanation 12 | :members: -------------------------------------------------------------------------------- /docs/latest/model_free_variable_selection/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: How to perform model-free variable selection (or feature importance) using the kxy Python package. 3 | :keywords: Model-Free Variable Selection, Model-Free Feature Importance, AutoML, Lean ML, Lean Machine Learning, KXY. 4 | :http-equiv=content-language: en 5 | 6 | 7 | ============================= 8 | Model-Free Variable Selection 9 | ============================= 10 | 11 | .. automodule:: kxy.pre_learning.variable_selection 12 | :members: -------------------------------------------------------------------------------- /docs/latest/model_improvability/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: How to use the kxy Python package to quantify by how much the performance of a trained supervised learning model may be improved in a model-driven fashion (i.e. by simply looking for a better model, without resorting to additional explanatory variables), or in a data-driven fashion (i.e. how much incremental value a specific new set of explanatory variables may bring about). 3 | :keywords: Model-Driven Improvability, Data-Driven Improvability, Post-Learning, KXY. 4 | :http-equiv=content-language: en 5 | 6 | 7 | =================== 8 | Model Improvability 9 | =================== 10 | Estimation of the amount by which the performance of a trained supervised learning model can be increased, either in a model-driven fashion or in a data-driven fashion. 11 | 12 | Model-Driven Improvability 13 | -------------------------- 14 | 15 | .. autofunction:: kxy.post_learning.improvability.model_driven_improvability 16 | 17 | 18 | Data-Driven Improvability 19 | ------------------------- 20 | 21 | .. autofunction:: kxy.post_learning.improvability.data_driven_improvability 22 |
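As a quick orientation before the detailed function documentation above, the sketch below shows how these analyses are typically invoked through the pandas accessor, mirroring the cheat sheet. Column names are hypothetical: :code:`'prediction'` is assumed to hold the production model's predictions, and the listed new variables are candidate features.

.. code-block:: python

    import kxy  # adds the .kxy accessor to dataframes

    # Model-driven: how much better could a model do on the same features?
    df.kxy.model_driven_improvability('y', 'prediction', problem_type='regression')

    # Data-driven: how much lift could these new features bring?
    df.kxy.data_driven_improvability('y', ['new_feature_1', 'new_feature_2'], problem_type='regression')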
-------------------------------------------------------------------------------- /docs/latest/model_wrapped_feature_selection/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: How to seamlessly add feature selection to any predictive model in Python. 3 | :keywords: Feature Selection, LeanML Feature Selection, Boruta Feature Selection, RFE Feature Selection, KXY. 4 | :http-equiv=content-language: en 5 | 6 | 7 | ================= 8 | Model Compression 9 | ================= 10 | 11 | How to seamlessly add feature selection to any predictive model in Python, so as to achieve the same performance with far fewer features. 12 | 13 | 14 | LeanML Feature Selection 15 | ------------------------ 16 | 17 | .. automodule:: kxy.learning.leanml_predictor 18 | :members: 19 | 20 | 21 | Boruta and Recursive Feature Elimination 22 | ---------------------------------------- 23 | 24 | .. automodule:: kxy.misc.boruta 25 | :members: 26 | 27 | .. automodule:: kxy.misc.rfe 28 | :members: 29 | 30 | .. automodule:: kxy.misc.predictors 31 | :members: 32 | 33 | 34 | Principal Feature Selection 35 | --------------------------- 36 | 37 | .. automodule:: kxy.pfs.pfs_selector 38 | :members: 39 | 40 | .. automodule:: kxy.pfs.pfs_predictor 41 | :members: 42 | 43 | 44 | 45 | Utilities Generating Learner Functions 46 | -------------------------------------- 47 | 48 | .. automodule:: kxy.learning.base_learners 49 | :members: 50 | 51 | -------------------------------------------------------------------------------- /docs/latest/pandas/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: Extension of pandas with analyses from the KXY Lean AutoML platform. 3 | :keywords: Pandas Dataframe, AutoML, KXY. 4 | :http-equiv=content-language: en 5 | 6 | ============================= 7 | DataFrame Extension Deep Dive 8 | ============================= 9 | 10 | .. automodule:: kxy.pandas_extension.accessor 11 | :members: 12 | :show-inheritance: 13 | 14 | .. automodule:: kxy.pandas_extension.base_accessor 15 | :members: 16 | 17 | .. automodule:: kxy.pandas_extension.pre_learning_accessor 18 | :members: 19 | :show-inheritance: 20 | 21 | .. automodule:: kxy.pandas_extension.learning_accessor 22 | :members: 23 | :show-inheritance: 24 | 25 | .. automodule:: kxy.pandas_extension.post_learning_accessor 26 | :members: 27 | :show-inheritance: 28 | 29 | .. automodule:: kxy.pandas_extension.finance_accessor 30 | :members: 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /docs/latest/quickstart/getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "Getting Started\n", 9 | "============\n", 10 | "\n", 11 | "Click [here](https://github.com/kxytechnologies/kxy-python/blob/master/docs/latest/quickstart/getting_started.ipynb) to download this page as a Jupyter Notebook.\n", 12 | "\n", 13 | "\n", 14 | "Installation\n", 15 | "-------------\n", 16 | "\n", 17 | "From PyPI:\n", 18 | "\n", 19 | "```bash\n", 20 | "pip install kxy\n", 21 | "```\n", 22 | "\n", 23 | "From GitHub:\n", 24 | "\n", 25 | "```bash\n", 26 | "git clone https://github.com/kxytechnologies/kxy-python.git && cd ./kxy-python && pip install .\n", 27 | "```\n", 28 | "\n", 29 | "The `kxy` package only supports Python 3. Replace ``pip`` with ``pip3`` in the commands above, as needed." 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "Authentication\n", 37 | "-----------------\n", 38 | "All heavy computations are run on the KXY backend and require an API key. The API key is set by running the command \n", 39 | "```bash\n", 40 | "kxy configure\n", 41 | "```\n", 42 | "as a one-off, and following the instructions. 
\n", 43 | "\n", 44 | "You may also set your API key as the `KXY_API_KEY` environment variable, for instance by running \n", 45 | "```bash\n", 46 | "import os\n", 47 | "os.environ['KXY_API_KEY'] = ''\n", 48 | "```\n", 49 | "To get an API key, you need to have an account with us. You can create an account [here](https://www.kxy.ai/signup/). \n", 50 | "\n", 51 | "By default, you will start in the free `Starter` plan, and you will be able to experience the KXY backend from the portal for free, and without providing a payment method, by uploading your data as csv files.\n", 52 | "\n", 53 | "Once you have an account and are ready to use our API, you can find your API key on the KXY portal [here](https://www.kxy.ai/portal/profile/identity/). \n", 54 | "\n", 55 | "API access to our backend is billed on a per-request basis, and requires that you provide a valid payment method in the KXY portal.\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "Docker\n", 63 | "---------\n", 64 | "The public Docker image [kxytechnologies/kxy](https://hub.docker.com/repository/docker/kxytechnologies/kxy) has been built for your convenience, and comes with anaconda, the kxy package, and various popular machine learning packages (e.g. Tensorflow, XGBoost, LightGBM, scikit-learn, and more).\n", 65 | "\n", 66 | "To start a Jupyter Notebook server from a sandboxed Docker, run\n", 67 | "```bash\n", 68 | "docker run -i -t -p 5555:8888 kxytechnologies/kxy /bin/bash -c \"kxy configure && /opt/conda/bin/jupyter notebook --notebook-dir=/opt/notebooks --ip='*' --port=8888 --no-browser --allow-root --NotebookApp.token=''\"\n", 69 | "```\n", 70 | "where you should replace `` with your API key and navigate to [http://localhost:5555](http://localhost:5555) in your browser.\n", 71 | "\n", 72 | "To start a Jupyter Notebook server from an existing directory of notebooks, run\n", 73 | "```bash\n", 74 | "docker run -i -t --mount src=,target=/opt/notebooks,type=bind -p 5555:8888 kxytechnologies/kxy /bin/bash -c \"kxy configure && /opt/conda/bin/jupyter notebook --notebook-dir=/opt/notebooks --ip='*' --port=8888 --no-browser --allow-root --NotebookApp.token=''\"\n", 75 | "```\n", 76 | "where you should replace `` with the path to your local notebook folder and navigate to [http://localhost:5555](http://localhost:5555) in your browser." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Serverless Applications\n", 84 | "----------------------------\n", 85 | "For serverless applications running on AWS, we provide a kxy [AWS Lambda layer](https://docs.aws.amazon.com/lambda/latest/dg/configuration-layers.html) so that you may simply import the `kxy` package from within your AWS Python 3 lambda functions. No additional requirement is needed, other than specifying your API key as the environment variable `KXY_API_KEY`." 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Working with Pandas DataFrames\n", 93 | "----------------------------------------\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "The most convenient way of using the `kxy` package is through pandas DataFrame objects. All our analyses are available as methods of pandas DataFrame objects, under the `kxy` [accessor](https://pandas.pydata.org/pandas-docs/stable/development/extending.html) (i.e. as `df.kxy.`). To access these, all you need is to import the `kxy` package alongside `pandas`." 
113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Check out the [Cheat Sheet](https://www.kxy.ai/reference/latest/applications/cheat_sheet/index.html) section for code snippets, the [Case Studies](https://www.kxy.ai/reference/latest/applications/case_studies/index.html) section for interesting applications, and the [Kaggle & UCI Experiments](https://www.kxy.ai/reference/latest/applications/illustrations/index.html) section for experiments on dozens of UCI and Kaggle datasets." 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Working with the Low Level RESTful API\n", 125 | "------------------------------------------------\n", 126 | "\n", 127 | "We intend to provide user-friendly API clients in other programming languages (e.g. R, Ruby and JavaScript). For now, if you are working in a programming language other than Python, you may directly access our serverless compute infrastructure through its RESTful API. Take a look at our [RESTful API documentation page](https://www.kxy.ai/reference/latest/api/index.html) for more details." 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "kxy", 134 | "language": "python", 135 | "name": "kxy" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.7.10" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 4 152 | } 153 | -------------------------------------------------------------------------------- /docs/latest/theoretical_foundation/memoryful/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: The theoretical foundation of the KXY Lean AutoML platform, for time series. 3 | :http-equiv=content-language: en 4 | 5 | ********************** 6 | Memoryful Observations 7 | ********************** 8 | 9 | The **lean ML** approach we advocated for memoryless problems in section :ref:`Memoryless Observations` holds as much potential when applied to time series forecasting problems. We consider predicting a business outcome :math:`y_t` using past values :math:`y_{t-1}, \dots, y_1` and using present and past values of an exogenous explanatory vector-valued time series :math:`\{x_t\}`. 10 | 11 | The time series approach to modeling captures two key points: (i) like random variables, we are uncertain about the value of the phenomenon we are modeling until it is observed; (ii) but unlike random variables, the phenomenon of interest may exhibit some memory, in that observations drawn at different times may be related. 12 | 13 | 14 | I - From Memoryful to Memoryless 15 | -------------------------------- 16 | In practice, we do not have the luxury of being able to replay time so as to gather multiple samples of a phenomenon corresponding to the same time, which would be the equivalent of having multiple draws from the same random variable in the memoryless setting. We need to learn from a single finite-length path :math:`\{(y_1, x_1), \dots, (y_T, x_T) \}`. 
Consequently, instead of using all past values :math:`(y_{t-1}, x_{t-1}), \dots, (y_1, x_1)` to predict :math:`y_t`, we might have to settle for a shorter time window :math:`(y_{t-1}, x_{t-1}), \dots, (y_{t-q}, x_{t-q})` in the interest of forming low-variance estimates, where the window size :math:`q` can be as large as allowed by our sample size :math:`T`. 17 | 18 | The natural question that arises is: can we simply define :math:`Y_i=y_t` and :math:`X_i=\left(x_t, y_{t-1}, x_{t-1}, \dots, y_{t-q}, x_{t-q}\right)`, and apply all the results developed in section :ref:`Memoryless Observations` to the dataset :math:`(Y_i, X_i)_{i \in [1, T]}`? 19 | 20 | The answer is yes, but we need to be cautious! The main difference with the memoryless setting is that the :math:`\left(Y_i, X_i \right)` are not necessarily i.i.d. However, so long as the time series :math:`\{z_t\} = \{y_t, x_t\}` is assumed to be `stationary ergodic <https://en.wikipedia.org/wiki/Stationary_ergodic_process>`_, all population metrics we previously introduced are well-defined, make as much sense as in the memoryless case, and the associated sample estimates remain consistent. 21 | 22 | More generally, when :math:`\{z_t\}` can be assumed to be trend-stationary and ergodic (i.e. :math:`\{y_t-f(t), x_t-g(t)\}` is stationary ergodic for some deterministic functions :math:`f, g`), we do not need to remove trends explicitly. We may simply add time as an explanatory variable, and apply results from the :ref:`Memoryless Observations` section to :math:`(Y_i, X_i^\prime)_{i \in [1, T]}`, with :math:`X_i^\prime = \left(t, x_t, y_{t-1}, x_{t-1}, \dots, y_{t-q}, x_{t-q}\right)`. 23 | 24 | 25 | In the event (trend-)stationarity is too unrealistic an assumption, the time series may be assumed locally stationary. In other words, we do not assume (trend-adjusted) marginals to be invariant under any translation; rather, we consider that the magnitude of a change of marginals resulting from a translation depends on the norm of the translation vector. The smaller the norm of the translation vector, the smaller the magnitude of the changes to the marginals. 26 | 27 | Here too, results from the memoryless section apply, but with two caveats. First, we may not use as large a sample size :math:`T` as we want. :math:`T` has to be large enough that we may achieve low-variance estimates, yet small enough that the path used for training only contains the prevailing *local dynamics*. Second, all estimates from the memoryless section should only be considered valid in a limited time window following the last training timestamp [*]_, and should be regenerated with new data on a rolling basis. 28 | 29 | 30 | II - Choosing the Window Size 31 | ----------------------------- 32 | It is important to note that :math:`q` is only a function of the length :math:`T` of the path we use for training. It is not necessarily chosen so that all lags are relevant. For a given choice of :math:`q`, we will have :math:`m=T-q` distinct samples and an even smaller `effective sample size <https://en.wikipedia.org/wiki/Effective_sample_size>`_ [*]_. 35 | 36 | Once :math:`q` is chosen, section :ref:`2 - Variable Selection Analysis` can be used to determine which lags are actually insightful and should be included in your predictive model. 37 | 38 | 
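To make the reduction concrete, here is a minimal sketch (assuming a dataframe :code:`z_df` with columns :code:`'y'` and :code:`'x'`, ordered by time, and an illustrative window size) that builds the memoryless dataset :math:`(Y_i, X_i)` out of a single path:

.. code-block:: python

    import pandas as pd

    q = 5  # window size; as large as the path length T allows
    lags = {'x_lag_0': z_df['x']}  # contemporaneous x
    for lag in range(1, q + 1):
        lags['y_lag_%d' % lag] = z_df['y'].shift(lag)
        lags['x_lag_%d' % lag] = z_df['x'].shift(lag)
    X = pd.DataFrame(lags).iloc[q:]  # drop rows with incomplete windows
    Y = z_df['y'].iloc[q:]

Each row of :code:`X` then plays the role of :math:`X_i`, and can be fed to the analyses of the :ref:`Memoryless Observations` section.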
.. rubric:: Footnotes 49 | 50 | .. [*] Of size not exceeding :math:`T`. 51 | .. [*] Given the time series memory. 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /docs/latest/theoretical_foundation/memoryless/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: The theoretical foundation of the KXY Lean AutoML platform. 3 | :keywords: Pre-Learning, Post-Learning, Maximum-Entropy Principle, Input Importance, Feature Importance, KXY, Model Explanation, Dataset Valuation, Input Importance, Feature Importance, Model Suboptimality, Model Optimality. 4 | :http-equiv=content-language: en 5 | *********************** 6 | Memoryless Observations 7 | *********************** 8 | In this section we provide an in-depth discussion of what makes the KxY platform work. We begin with predictive problems where input and output variables do not exhibit temporal structures, or their temporal structures are of negligible importance. For time series problems, refer to our :ref:`Memoryful Observations` section. 9 | 10 | The KxY platform aims at **Democratizing Lean AI**. But what is *lean AI*, you might wonder? 11 | 12 | Our estimate is that *9-in-10* machine learning experiments fail, resulting in a tremendous amount of avoidable waste (e.g. productivity, compute power, carbon footprint, etc.). *Lean AI* is all about developing machine learning techniques to detect experiments in data science projects, or entire data science projects, that are likely to result in dead-ends and, as such, should be avoided. Done right, this can increase the productivity of your data science teams tenfold, while slashing costs. 13 | 14 | *We are pioneers in this space, and our work is published in top-tier machine learning conferences.* 15 | 16 | Real-life predictive modeling needs are primarily of two types. An organization could be starting a predictive modeling project from scratch, and might be interested in predicting a new business outcome using available data as potential explanatory variables. Alternatively, the organization might be looking to improve a predictive model that was previously trained and released to production. 17 | 18 | We refer to problems arising from attempting to determine whether projects of the former kind (resp. latter kind) would result in a dead-end as **pre-learning** problems (resp. **post-learning** problems). 19 | 20 | 21 | 22 | 23 | 24 | .. toctree:: 25 | :hidden: 26 | 27 | problem_formulation 28 | quantifying_informativeness 29 | applications 30 | estimation 31 | -------------------------------------------------------------------------------- /docs/latest/theoretical_foundation/memoryless/problem_formulation.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :description: Definition of pre-learning and post-learning in supervised learning problems 3 | :keywords: Pre-Learning Explained, Post-Learning Explained, Model Audit, Model Explanation 4 | :http-equiv=content-language: en 5 | 6 | .. role:: raw-html(raw) 7 | :format: html 8 | 9 | I - Problem Formulation 10 | ======================= 11 | 12 | .. admonition:: Summary 13 | 14 | We introduce **pre-learning** and **post-learning** problems, and discuss their importance. 15 | 16 | 17 | A supervised learning problem (i.e. 
regression or classification) aims at reliably learning an association between 18 | a vector of inputs :math:`x` and a label :math:`y` that is either categorical or real-valued. The association is learned using a training dataset, with the hope that, given a value of the inputs vector never seen before, the associated label can be predicted with high enough accuracy. 19 | 20 | While the adequacy of the learned association between :math:`x` and :math:`y` depends solely on the model used, the overall accuracy achieved is bound by how informative the inputs are about the label. If :math:`x` and :math:`y` are unrelated, no model, no matter 21 | how fancy or deep, can infer :math:`y` from :math:`x`, and any attempt to do so would be futile and result in a waste of time and money. 22 | 23 | 1 - Pre-Learning 24 | ---------------- 25 | 26 | What Is Pre-Learning? 27 | ^^^^^^^^^^^^^^^^^^^^^ 28 | A good analogy to understand **pre-learning** is that pre-learning is to supervised learning what exploration is to oil production. 29 | 30 | It would never occur to an oil company to build a production well first, and then determine whether the site has oil by trying to extract some from the ground. Setting up an oil production site without exploration would be inefficient and very costly. The `exploration phase <https://en.wikipedia.org/wiki/Hydrocarbon_exploration>`_ ought to come first, and is critical to planning and the overall success of operations. In the exploration phase, inference techniques are used to find sites that are likely to be rich in oil, prior to, and independently from oil extraction, a field known as `exploration geophysics <https://en.wikipedia.org/wiki/Exploration_geophysics>`_. 31 | 32 | In a supervised learning setting, **the site is the data used** to predict the business outcome, **the oil is the business value created** through the improvement of decision making, and **the oil extraction is the training of machine learning models**. Starting to train machine learning models on datasets without any expectation of what performance could be achieved is like setting up an oil extraction site without knowing in advance that the site is rich in oil. 33 | 34 | Selecting and training great predictive models only affects the amount of value *extracted* from the inputs; it does not change the amount of value *there is intrinsically* in those inputs. The same way the amount of oil that can be produced at a site is bound by the amount of oil accessible in the ground, the performance of a predictive model is bound by the intrinsic value that can be found in the inputs :math:`x` about the outcome of interest :math:`y`. 35 | 36 | .. admonition:: Definition 37 | 38 | **Pre-learning** is the study and selection of datasets to use to solve a supervised learning problem, prior to, and independently from any modeling. 39 | 40 | 41 | Why Is Pre-Learning Important? 42 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 43 | 44 | To solve a supervised learning problem, choosing inputs that are collectively insightful about the outcome of interest has as big an impact on overall performance as, if not a bigger one than, the machine learning model used to extract such insights. 45 | 46 | Additionally, by quantifying the performance that can be achieved in a supervised learning problem, prior to and independently from modeling, the **pre-learning** phase empowers data scientists to know what to aim for, and to focus their efforts and resources accordingly. 
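In the :code:`kxy` package, this quantification is exposed as data valuation. As a minimal sketch (dataframe and column names are hypothetical), the performance achievable by the best possible model can be estimated before any model is trained:

.. code-block:: python

    import kxy  # adds the .kxy accessor to dataframes

    # Estimate the highest performance achievable using the columns of df
    # (other than 'y') to predict 'y', prior to any modeling.
    df.kxy.data_valuation('y', problem_type='classification')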
47 | 48 | 49 | 50 | 51 | 2 - Post-Learning 52 | ----------------- 53 | Once a set of informative inputs has been selected and a model has been trained, overall accuracy can be improved by either looking for a better supervised learning model, or looking for additional complementary datasets to use. Determining which action would result in the highest ROI is one of the objects of **post-learning**. 54 | 55 | The fact that the learned model did not yield a satisfactory predictive accuracy does not necessarily mean that a more elaborate model could do better using the same datasets. It is very possible that, although it has an unsatisfactory predictive accuracy, the learned model already factors in everything the input datasets can tell us about our label. In such an event, the only possible course of action would be 56 | to look for additional datasets to use. 57 | 58 | Even then, the fact that a new dataset is sufficiently informative about the label to predict does not necessarily mean that it can be used to improve the performance of our trained model. It is important to choose a dataset that is not only informative about the label to predict, 59 | but informative in a way that is complementary to the datasets used to train the existing model. 60 | 61 | Another object of **post-learning** is *model audit*, which entails understanding the decisions made by a trained machine learning model, and detecting any bias it might encode, to name but a couple of aims. 62 | 63 | 64 | .. admonition:: Definition 65 | 66 | **Post-learning** is the study and audit of a trained supervised learning model, as well as courses of action to take to improve its predictive accuracy. 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /docs/latest/utilities/index.rst: -------------------------------------------------------------------------------- 1 | .. meta:: 2 | :http-equiv=content-language: en 3 | ==== 4 | Misc 5 | ==== 6 | 7 | .. automodule:: kxy.api.client 8 | :members: 9 | 10 | .. automodule:: kxy.api.data_transfer 11 | :members: 12 | 13 | .. automodule:: kxy.api.utils 14 | :members: -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /kxy/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | __author__ = "Dr. 
Yves-Laurent Kom Samo" 5 | __copyright__ = "Copyright (C) 2022 KXY Technologies, Inc." 6 | __license__ = """ 7 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 8 | 9 | This program is free software: you can redistribute it and/or modify 10 | it under the terms of the GNU General Public License as published by 11 | the Free Software Foundation, either version 3 of the License, or 12 | (at your option) any later version. 13 | 14 | This program is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 18 | 19 | You should have received a copy of the GNU Affero General Public License 20 | along with this program. If not, see . 21 | """ 22 | __version__ = "1.4.11" 23 | 24 | from kxy.api import * 25 | from kxy.pre_learning import * 26 | from kxy.post_learning import * 27 | from kxy.finance import * 28 | from kxy.pandas_extension import * 29 | from kxy.billing import * -------------------------------------------------------------------------------- /kxy/api/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | """ 20 | from .utils import * 21 | from .decorators import * 22 | from .client import * 23 | from .data_transfer import * 24 | -------------------------------------------------------------------------------- /kxy/api/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Python client for the KXY RESTful API. 6 | """ 7 | 8 | from functools import lru_cache 9 | import os 10 | import requests 11 | 12 | from .decorators import requires_api_key, get_api_key, log_backend_warnings 13 | from .. import __version__ as client_version 14 | 15 | 16 | class APIClient(object): 17 | """ 18 | Python client for the RESTful KXY API. All API methods require an API key. The API key must be set by running :code:`kxy configure` from the terminal. 19 | """ 20 | @staticmethod 21 | def stage(): 22 | """ 23 | Defines the deployment stage of the RESTful API the client should talk to. 24 | 25 | Returns 26 | ------- 27 | v : str 28 | The API stage to use. 29 | """ 30 | return 'v2' 31 | 32 | @staticmethod 33 | def url(path): 34 | """ 35 | Turns a relative path into a full API endpoint url. 36 | 37 | Parameters 38 | ---------- 39 | path: str 40 | The relative path of the API resource. 41 | 42 | Returns 43 | ------- 44 | u : str 45 | The full URL of the API resource. 
46 | """ 47 | path = path.strip('/') 48 | 49 | return 'https://api.kxy.ai/%s/' % APIClient.stage() + path 50 | 51 | 52 | @staticmethod 53 | @requires_api_key 54 | @log_backend_warnings 55 | def get(path, **params): 56 | """ 57 | .. important:: This method requires a valid API key. 58 | 59 | Issues a GET request to the API resource identified by the input path. 60 | 61 | Parameters 62 | ---------- 63 | path: str 64 | The relative path of the API resource. 65 | params: dict, optional 66 | The query parameters of the GET request. Any keyword argument is 67 | automatically interpreted as a request parameter, its name is used 68 | as the parameter name, and its value as the parameter value. 69 | 70 | Returns 71 | ------- 72 | response: requests.Response 73 | The response of the API. The request HTTP status code can be accessed 74 | through `response.status_code`. To check if the request was succesful, 75 | inspect `response.ok`. When the API returned data, they can be accessed 76 | through `response.json()`. Supported status codes are: 77 | 78 | 200: 79 | The request was successful and the API returned some data accessible through 80 | `response.json()`. 81 | 402: 82 | The request failed because your account does not have a valid payment method. 83 | Check `response.json()['reason']` for more information. 84 | 403: 85 | The request failed because some parameter are either invalid or missing. 86 | Check `response.json()['reason']` for more information. 87 | 404: 88 | The request failed because the API couldn't yet solve the problem of interest. 89 | You should typically try again another time. Check `response.json()['reason']` 90 | for more information. 91 | """ 92 | url = APIClient.url(path) 93 | api_key = get_api_key() 94 | if 'client_version' not in params: 95 | params['client_version'] = client_version 96 | response = requests.get(url, params=params, headers={'x-api-key': api_key, \ 97 | 'content-type': 'application/json'}) 98 | 99 | return response 100 | 101 | 102 | @staticmethod 103 | @requires_api_key 104 | @log_backend_warnings 105 | def post(path, **params): 106 | """ 107 | .. important:: This method requires a valid API key. 108 | 109 | Issues a POST request to the API resource identified by the input path. 110 | 111 | Parameters 112 | ---------- 113 | path: str 114 | The relative path of the API resource. 115 | params: dict, optional 116 | The data to be submitted to the API as part of the POST request, as 117 | a JSON. Any keyword argument is automatically interpreted as a 118 | key of the JSON data that will be submitted to the API, 119 | and its value the associated value in the JSON. 120 | 121 | Returns 122 | ------- 123 | response: requests.Response 124 | The response of the API. The request HTTP status code can be accessed 125 | through `response.status_code`. To check if the request was succesful, 126 | inspect `response.ok`. When the API returned data, they can be accessed 127 | through `response.json()`. 128 | 129 | Supported status codes are: 130 | 131 | 200: 132 | The request was successful and the API returned some data accessible through 133 | `response.json()`. 134 | 402: 135 | The request failed because your account does not have a valid payment method. 136 | Check `response.json()['reason']` for more information. 137 | 403: 138 | The request failed because some parameter are either invalid or missing. 139 | Check `response.json()['reason']` for more information. 140 | 404: 141 | The request failed because the API couldn't yet solve the problem of interest. 
142 | You should typically try again another time. Check `response.json()['reason']` 143 | for more information. 144 | """ 145 | url = APIClient.url(path) 146 | api_key = get_api_key() 147 | if 'client_version' not in params: 148 | params['client_version'] = client_version 149 | response = requests.post(url, json=params, headers={'x-api-key': api_key, \ 150 | 'content-type': 'application/json'}) 151 | 152 | return response 153 | 154 | 155 | @staticmethod 156 | @lru_cache(maxsize=32) 157 | def route(path=None, method=None, **params): 158 | """ 159 | .. important:: This method requires a valid API key. 160 | 161 | Generic method to issue a GET or a POST request to the API resource identified 162 | by the input path. 163 | 164 | Parameters 165 | ---------- 166 | path: str 167 | The relative path of the API resource. 168 | 169 | method: str 170 | The REST method. Should be either `'GET'` or `'POST'`. 171 | 172 | params: dict, optional 173 | The data to be submitted to the API as a JSON for POST requests, or 174 | query parameters in the case of GET requests. 175 | 176 | Returns 177 | ------- 178 | response: requests.Response 179 | The response of the API. The request HTTP status code can be accessed 180 | through `response.status_code`. To check if the request was successful, 181 | inspect `response.ok`. When the API returned data, they can be accessed 182 | through `response.json()`. 183 | 184 | Supported status codes are: 185 | 186 | 200: 187 | The request was successful and the API returned some data accessible through 188 | `response.json()`. 189 | 402: 190 | The request failed because your account does not have a valid payment method. 191 | Check `response.json()['reason']` for more information. 192 | 193 | 403: 194 | The request failed because some parameters are either invalid or missing. 195 | Check `response.json()['reason']` for more information. 196 | 197 | 404: 198 | The request failed because the API couldn't yet solve the problem of interest. 199 | You should typically try again another time. Check `response.json()['reason']` 200 | for more information. 201 | 202 | Notes 203 | ----- 204 | None is returned, and no request is issued, if path is None or if method is neither 'GET' nor 'POST'. 205 | """ 206 | if path is None or method is None or \ 207 | method.upper() not in ('GET', 'POST'): 208 | return None 209 | 210 | if method.upper() == 'GET': 211 | return APIClient.get(path, **params) 212 | 213 | if method.upper() == 'POST': 214 | return APIClient.post(path, **params) 215 | 
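# Usage sketch (illustrative only, not part of the public API surface): the
# higher-level helpers in this package call the client as follows; the route
# below is the one used in kxy.billing.billing_details.
#
#   from time import time
#   response = APIClient.route(path='/wk/billing/upcoming-invoice', method='POST', timestamp=int(time()))
#   if response is not None and response.status_code == requests.codes.ok:
#       data = response.json()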
216 | 217 | 218 | -------------------------------------------------------------------------------- /kxy/api/data_transfer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | To run our analyses, the KXY backend needs your data. The methods below are the only methods involved in sharing your data with us. The :code:`kxy` package only uploads your data `if` and `when` needed. 6 | """ 7 | import hashlib 8 | import logging 9 | from time import time 10 | import requests 11 | 12 | import pandas as pd 13 | import numpy as np 14 | try: 15 | get_ipython().__class__.__name__ 16 | from halo import HaloNotebook as Halo 17 | except: 18 | from halo import Halo 19 | 20 | from .client import APIClient 21 | 22 | 23 | UPLOADED_FILES = {} 24 | 25 | def generate_upload_url(file_name): 26 | """ 27 | Requests a pre-signed URL to upload a dataset. 28 | 29 | Parameters 30 | ---------- 31 | file_name: str 32 | A string that uniquely identifies the content of the file. 33 | 34 | Returns 35 | ------- 36 | d : dict or None 37 | The dictionary containing the pre-signed URL (an empty dictionary if the file was previously uploaded), or None if the request failed. 38 | """ 39 | api_response = APIClient.route( 40 | path='/wk/generate-signed-upload-url', method='POST',\ 41 | file_name=file_name, timestamp=int(time())) 42 | 43 | if api_response.status_code == requests.codes.ok: 44 | api_response = api_response.json() 45 | if 'presigned_url' in api_response: 46 | presigned_url = api_response['presigned_url'] 47 | return presigned_url 48 | 49 | elif api_response.get('file_already_exists', False): 50 | logging.debug('This file was previously uploaded.') 51 | return {} 52 | 53 | else: 54 | return None 55 | 56 | else: 57 | api_response = api_response.json() 58 | if 'message' in api_response: 59 | logging.warning("\n%s" % api_response['message']) 60 | return None 61 | 62 | 63 | def upload_data(df, file_name=None): 64 | """ 65 | Uploads a dataframe to kxy servers. 66 | 67 | Parameters 68 | ---------- 69 | df: pd.DataFrame 70 | The dataframe to upload. 71 | 72 | Returns 73 | ------- 74 | f : str or None 75 | The name of the uploaded file if the upload was successful; None otherwise. 76 | """ 77 | if file_name is None: 78 | logging.debug('') 79 | logging.debug('Hashing the data to form the file name') 80 | content = pd.util.hash_pandas_object(df).to_string() 81 | data_identifier = hashlib.sha256(content.encode()).hexdigest() 82 | columns = str(sorted([col for col in df.columns])) 83 | columns_identifier = hashlib.sha256(columns.encode()).hexdigest() 84 | identifier = hashlib.sha256((data_identifier+columns_identifier).encode()).hexdigest() 85 | memory_usage = df.memory_usage(index=False).sum()/(1024.0*1024.0*1024.0) 86 | file_name = identifier + '.parquet.gzip' if memory_usage > 1.5 else identifier + '.parquet' if memory_usage > 0.5 else identifier + '.csv' 87 | logging.debug('Done hashing the data') 88 | else: 89 | identifier = file_name.split('.')[0] 90 | 91 | if UPLOADED_FILES.get(identifier, False): 92 | logging.debug('The file with identifier %s was previously uploaded' % identifier) 93 | return file_name 94 | 95 | logging.debug('Requesting a signed upload URL') 96 | presigned_url = generate_upload_url(file_name) 97 | 98 | if presigned_url is None: 99 | logging.warning('Failed to retrieve the signed upload URL') 100 | return None 101 | else: 102 | logging.debug('Signed upload URL retrieved') 103 | 104 | if presigned_url == {}: 105 | logging.debug('This file was previously uploaded') 106 | UPLOADED_FILES[identifier] = True 107 | return file_name 108 | 109 | 110 | logging.debug('Preparing data for upload') 111 | spinner = Halo(text='Preparing data upload', spinner='dots') 112 | spinner.start() 113 | if file_name.endswith('.parquet.gzip'): 114 | # Parquet requires column names to be strings. 115 | df.columns = df.columns.astype(str) 116 | _bytes = df.to_parquet(index=False, compression='gzip') 117 | elif file_name.endswith('.parquet'): 118 | # Parquet requires column names to be strings. 
119 | df.columns = df.columns.astype(str) 120 | _bytes = df.to_parquet(index=False) 121 | else: 122 | _bytes = df.to_csv(index=False) 123 | spinner.succeed() 124 | 125 | files = {'file': _bytes} 126 | url = presigned_url['url'] 127 | data = presigned_url['fields'] 128 | logging.debug('Done preparing the data to upload') 129 | logging.debug('Uploading the data') 130 | spinner.start('Uploading data') 131 | upload_response = requests.post(url, data=data, files=files) 132 | spinner.succeed() 133 | 134 | if upload_response.status_code in [requests.codes.ok, requests.codes.created, requests.codes.accepted, requests.codes.no_content]: 135 | logging.debug('Data successfully uploaded') 136 | UPLOADED_FILES[identifier] = True 137 | return file_name 138 | else: 139 | logging.warning('Failed to upload the file. Received status code %s.' % (upload_response.status_code)) 140 | 141 | return None 142 | 143 | 144 | -------------------------------------------------------------------------------- /kxy/api/decorators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | ================== 6 | kxy.api.decorators 7 | ================== 8 | """ 9 | 10 | from functools import wraps 11 | import json 12 | import logging 13 | import os 14 | import requests 15 | 16 | TRIAL_API_KEY = 'SZiRisvhzC7KBgROZG5dE1VQIlE8Jk4DbQ1YZdZ0' 17 | 18 | def get_api_key(): 19 | """ 20 | Retrieves the stored API key, or None if none was provided. 21 | """ 22 | home = os.path.expanduser("~") 23 | path = os.path.join(home, '.kxy') 24 | file_name = os.path.join(path, 'config') 25 | try: 26 | with open(file_name, 'r') as f: 27 | config = json.load(f) 28 | existing_key = config.get('KXY_API_KEY', TRIAL_API_KEY) 29 | return existing_key 30 | except: 31 | return os.environ.get('KXY_API_KEY', TRIAL_API_KEY) 32 | 33 | return None 34 | 35 | 36 | 37 | def has_api_key(): 38 | """ 39 | Returns whether or not an API key was provided as a result of running :code:`kxy configure`. 40 | """ 41 | return get_api_key() is not None 42 | 43 | 44 | 45 | def requires_api_key(method): 46 | """ 47 | Decorator used to make function and method calls fail 48 | when they require an API key and the user did not provide one 49 | by running :code:`kxy configure`. The decorated function or method 50 | is otherwise not affected. 51 | 52 | Raises 53 | ------ 54 | AssertionError 55 | If an API key was not previously recorded. 56 | """ 57 | @wraps(method) 58 | def wrapper(*args, **kw): 59 | assert has_api_key(), "An API key should be provided. Please run 'kxy configure'" 60 | return method(*args, **kw) 61 | 62 | return wrapper 63 | 64 | 65 | 66 | def log_backend_warnings(method): 67 | """ 68 | Decorator ensuring that requests hitting the backend log any warnings returned by the backend. 
69 | """ 70 | @wraps(method) 71 | def wrapper(*args, **kw): 72 | response = method(*args, **kw) 73 | try: 74 | if response.status_code == requests.codes.ok: 75 | response_json = response.json() 76 | if 'warning' in response_json: 77 | logging.warning('%s' % response_json['warning']) 78 | except: 79 | pass 80 | return response 81 | 82 | return wrapper 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /kxy/api/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | -------------------------------------------------------------------------------- /kxy/billing/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see <https://www.gnu.org/licenses/>. 19 | """ 20 | from .billing_details import * 21 | -------------------------------------------------------------------------------- /kxy/billing/billing_details.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Everything billing. 5 | """ 6 | import logging 7 | import requests 8 | from time import time 9 | 10 | from kxy.api import APIClient 11 | 12 | 13 | def get_upcoming_invoice(): 14 | """ 15 | Retrieves all items that will show up in your next invoice. 16 | 17 | Returns 18 | ------- 19 | d : dict 20 | The dictionary containing all items that will appear in your next invoice. 21 | E.g. :code:`{'Type of charge': {'total_usd': ..., 'quantity': ..., 'description': ..., 'billing_period_start_timestamp': ..., 'billing_period_end_timestamp': ...}, ... }` 22 | """ 23 | api_response = APIClient.route( 24 | path='/wk/billing/upcoming-invoice', method='POST',\ 25 | timestamp=int(time())) 26 | try: 27 | if api_response.status_code == requests.codes.ok: 28 | api_response = api_response.json() 29 | return api_response 30 | else: 31 | return {} 32 | except: 33 | logging.exception('Failed to retrieve your upcoming invoice.') 34 | return {} 35 | -------------------------------------------------------------------------------- /kxy/examples/feature_selection_example.py: -------------------------------------------------------------------------------- 1 | # 0. As a one-off, run 'pip install kxy', then 'kxy configure' 2 | # This import is necessary to get all df.kxy.* methods 3 | import kxy 4 | 5 | # 1. Load your data 6 | # pip install kxy_datasets 7 | from kxy_datasets.classifications import BankMarketing 8 | dataset = BankMarketing() 9 | target_column = dataset.y_column 10 | df = dataset.df 11 | 12 | # 2. 
Generate candidate features 13 | features_df = df.kxy.generate_features(entity=None, max_lag=None,\ 14 | entity_name='*', exclude=[target_column]) 15 | features_df = features_df.drop('y_yes', axis=1) 16 | target_column = 'y_no' 17 | 18 | # 3. Training/Testing split 19 | # pip install scikit-learn 20 | from sklearn.model_selection import train_test_split 21 | train_features_df, test_features_df = train_test_split(features_df, \ 22 | test_size=0.2, random_state=0) 23 | test_labels_df = test_features_df.loc[:, [target_column]] 24 | test_features_df = test_features_df.drop(target_column, axis=1) 25 | 26 | # 4. Create a LightGBM learner function. 27 | 28 | # A learner function is a function that expects up to two optional 29 | # variables: n_vars and path. When called it returns an instance of 30 | # 'predictive model' expecting n_vars features. The path parameter, 31 | # when provided, allows the learner function to load a saved model 32 | # from disk. 33 | 34 | # A 'predictive model' here is any class with a fit(self, x, y) method 35 | # and predict(self, x) method. To use the path argument of the learner 36 | # function, the class should also define a save(self, path) method to 37 | # save a model to disk, and a load(cls, path) class method to load a 38 | # saved model from disk. 39 | 40 | # See kxy.learning.base_learners for helper functions that allow you 41 | # create learner functions that return instances of popular predictive 42 | # models (e.g. lightgbm, xgboost, sklearn, tensorflow, pytorch models 43 | # etc.). 44 | 45 | from kxy.learning import get_lightgbm_learner_learning_api 46 | params = { 47 | 'objective': 'binary', 48 | 'metric': ['auc', 'binary_logloss'], 49 | } 50 | lightgbm_learner_func = get_lightgbm_learner_learning_api(params, \ 51 | num_boost_round=10000, early_stopping_rounds=50, verbose_eval=50, \ 52 | split_random_seed=0) 53 | 54 | # 5. Fit a LightGBM classifier wrapped around LeanML feature selection 55 | results = train_features_df.kxy.fit(target_column, \ 56 | lightgbm_learner_func, problem_type='classification', \ 57 | feature_selection_method='leanml') 58 | predictor = results['predictor'] 59 | 60 | # 6. Make predictions from a dataframe of test features 61 | test_predictions_df = predictor.predict(test_features_df) 62 | 63 | # 7. Compute out-of-sample accuracy and AUC 64 | from sklearn.metrics import accuracy_score, roc_auc_score 65 | accuracy = accuracy_score( 66 | test_labels_df[target_column].values, \ 67 | test_predictions_df[target_column].values, \ 68 | ) 69 | auc = roc_auc_score( \ 70 | test_labels_df[target_column].values, \ 71 | test_predictions_df[target_column].values, \ 72 | multi_class='ovr' 73 | ) 74 | 75 | print('LeanML -- Testing Accuracy: %.2f, AUC: %.2f' % (accuracy, auc)) 76 | selected_features = predictor.selected_variables 77 | print('LeanML -- Selected Variables:') 78 | import pprint as pp 79 | pp.pprint(selected_features) 80 | 81 | # 8. (Optional) Save the trained model. 82 | path = './lightgbm_uci_bank_marketing.sav' 83 | predictor.save(path) 84 | 85 | # 9. (Optional) Load the saved model. 
86 | from kxy.learning.leanml_predictor import LeanMLPredictor 87 | loaded_predictor = LeanMLPredictor.load(path, lightgbm_learner_func) 88 | 89 | 90 | 91 | # 10.a Fit a LightGBM classifier wrapped around RFE feature selection 92 | n_leanml_features = len(selected_features) 93 | rfe_results = train_features_df.kxy.fit(target_column, \ 94 | lightgbm_learner_func, problem_type='classification', \ 95 | feature_selection_method='rfe', rfe_n_features=n_leanml_features) 96 | rfe_predictor = rfe_results['predictor'] 97 | 98 | # 10.b Fit a LightGBM classifier wrapped around Boruta feature 99 | # selection. 100 | boruta_results = train_features_df.kxy.fit(target_column, \ 101 | lightgbm_learner_func, problem_type='classification', \ 102 | feature_selection_method='boruta', boruta_n_evaluations=20, \ 103 | boruta_pval=0.95) 104 | boruta_predictor = boruta_results['predictor'] 105 | 106 | # 10.c Fit a LightGBM classifier without any feature 107 | # selection. 108 | none_results = train_features_df.kxy.fit(target_column, \ 109 | lightgbm_learner_func, problem_type='classification', \ 110 | feature_selection_method=None) 111 | none_predictor = none_results['predictor'] 112 | 113 | # 11. Make predictions from a dataframe of test features 114 | rfe_test_predictions_df = rfe_predictor.predict(test_features_df) 115 | boruta_test_predictions_df = boruta_predictor.predict(test_features_df) 116 | none_test_predictions_df = none_predictor.predict(test_features_df) 117 | 118 | # 12. Compute out-of-sample accuracy and AUC 119 | rfe_accuracy = accuracy_score( 120 | test_labels_df[target_column].values, \ 121 | rfe_test_predictions_df[target_column].values, \ 122 | ) 123 | rfe_auc = roc_auc_score( \ 124 | test_labels_df[target_column].values, \ 125 | rfe_test_predictions_df[target_column].values, \ 126 | multi_class='ovr' 127 | ) 128 | 129 | boruta_accuracy = accuracy_score( 130 | test_labels_df[target_column].values, \ 131 | boruta_test_predictions_df[target_column].values, \ 132 | ) 133 | boruta_auc = roc_auc_score( \ 134 | test_labels_df[target_column].values, \ 135 | boruta_test_predictions_df[target_column].values, \ 136 | multi_class='ovr' 137 | ) 138 | 139 | none_accuracy = accuracy_score( 140 | test_labels_df[target_column].values, \ 141 | none_test_predictions_df[target_column].values, \ 142 | ) 143 | none_auc = roc_auc_score( \ 144 | test_labels_df[target_column].values, \ 145 | none_test_predictions_df[target_column].values, \ 146 | multi_class='ovr' 147 | ) 148 | 149 | print('RFE -- Accuracy: %.2f, AUC: %.2f' % (rfe_accuracy, rfe_auc)) 150 | rfe_selected_features = rfe_predictor.selected_variables 151 | print('RFE -- Selected Variables:') 152 | pp.pprint(rfe_selected_features) 153 | print() 154 | 155 | print('Boruta -- Accuracy: %.2f, AUC: %.2f' % (boruta_accuracy, \ 156 | boruta_auc)) 157 | boruta_selected_features = boruta_predictor.selected_variables 158 | print('Boruta -- Selected Variables:') 159 | pp.pprint(boruta_selected_features) 160 | print() 161 | 162 | print('No Feature Selection -- Accuracy: %.2f, AUC: %.2f' % (none_accuracy, \ 163 | none_auc)) 164 | all_features = none_predictor.selected_variables 165 | print('No Feature Selection -- Selected Variables:') 166 | pp.pprint(all_features) 167 | 168 | 169 | -------------------------------------------------------------------------------- /kxy/examples/numerai_example.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import r2_score 2 | import kxy 3 | import pandas as pd 4 | 
from kxy.learning import get_lightgbm_learner_sklearn_api 5 | 6 | ######## 7 | # Data # 8 | ######## 9 | ## Uncomment to download Numerai data 10 | # from numerapi import NumerAPI 11 | # napi = NumerAPI() 12 | # current_round = napi.get_current_round(tournament=8) 13 | # napi.download_dataset("numerai_training_data_int8.parquet", "numerai_training_data_int8.parquet") 14 | 15 | df = pd.read_parquet('numerai_training_data_int8.parquet') 16 | target_column, problem_type = 'target', 'regression' 17 | feature_columns = [_ for _ in df.columns if _.startswith('feature_')] 18 | columns = feature_columns + [target_column] 19 | df = df[columns] 20 | 21 | 22 | #################### 23 | # Train/Test Split # 24 | #################### 25 | random_seed = 2 26 | test_df = df.sample(frac=0.7, random_state=random_seed) 27 | train_df = df.drop(test_df.index) 28 | train_features = train_df[feature_columns] 29 | train_labels = train_df[[target_column]] 30 | test_features = test_df[feature_columns] 31 | test_labels = test_df[[target_column]] 32 | 33 | x_train = train_features.values 34 | x_test = test_features.values 35 | y_train = train_labels.values 36 | y_test = test_labels.values 37 | 38 | 39 | # Run PFS 40 | from kxy.misc.tf import set_default_parameter 41 | from kxy.pfs import PFS 42 | set_default_parameter('lr', 0.001) 43 | selector = PFS() 44 | selector.fit(x_train, y_train, epochs=10, seed=random_seed, expand_y=False) 45 | 46 | # Extract the features 47 | fx_train = selector.max_ent_features_x(x_train) 48 | 49 | # Run a linear regression relating learned features to y 50 | from sklearn.linear_model import LinearRegression 51 | from sklearn.metrics import r2_score 52 | 53 | # PFS 54 | # Training 55 | m = LinearRegression() 56 | m.fit(fx_train, y_train) 57 | 58 | # Testing accuracy 59 | fx_test = selector.max_ent_features_x(x_test) 60 | 61 | # Note: r2_score expects the ground truth first, then the predictions. 62 | y_test_predicted = m.predict(fx_test) 63 | testing_r2 = r2_score(y_test, y_test_predicted) 64 | 65 | y_train_predicted = m.predict(fx_train) 66 | training_r2 = r2_score(y_train, y_train_predicted) 67 | 68 | print('R^2 -- PFS -- Training: %.4f, Testing: %.4f' % (training_r2, testing_r2)) 69 | 70 | 71 | # No PFS 72 | m = LinearRegression() 73 | m.fit(x_train, y_train) 74 | 75 | y_test_predicted_n = m.predict(x_test) 76 | y_train_predicted_n = m.predict(x_train) 77 | 78 | testing_r2_n = r2_score(y_test, y_test_predicted_n) 79 | training_r2_n = r2_score(y_train, y_train_predicted_n) 80 | 81 | print('R^2 -- No PFS -- Training: %.4f, Testing: %.4f' % (training_r2_n, testing_r2_n)) 82 | 83 | 84 | 85 | 86 | 87 | # ########################## 88 | # # With Feature Selection # 89 | # ########################## 90 | # # LightGBM model factory 91 | # lightgbm_regressor_learner_cls = get_lightgbm_learner_sklearn_api('lightgbm.LGBMRegressor', \ 92 | # n_jobs=-1, colsample_bytree=0.1, learning_rate=0.01, n_estimators=2000, max_depth=5) 93 | 94 | # # Lean boosting fit 95 | # results = train_df.kxy.fit(target_column, lightgbm_regressor_learner_cls, \ 96 | # problem_type=problem_type, feature_selection_method='pfs', pfs_p=100, \ 97 | # data_identifier='numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed) 98 | 99 | # predictor = results['predictor'] 100 | # p = predictor.feature_directions.shape[0] 101 | # print('Number of features: %d' % p) 102 | 103 | # # selected_features = predictor.selected_variables 104 | # # print('Selected Variables') 105 | # # print(selected_features) 106 | 107 | # # Training/Testing Predictions 108 | # train_predictions = 
predictor.predict(train_features) 108 | # test_predictions = predictor.predict(test_features) 109 | 110 | # # Training/Testing Performance 111 | # train_r2 = r2_score(train_labels, train_predictions) 112 | # test_r2 = r2_score(test_labels, test_predictions) 113 | 114 | # print('Compressed LightGBM: Training R^2: %.4f, Testing R^2: %.4f' % (train_r2, test_r2)) 115 | 116 | 117 | # ################################# 118 | # # Fit Without Feature Selection # 119 | # ################################# 120 | # results = train_df.kxy.fit(target_column, lightgbm_regressor_learner_cls, \ 121 | # problem_type=problem_type, feature_selection_method=None) 122 | # naive_predictor = results['predictor'] 123 | 124 | # # Training/Testing Predictions 125 | # naive_train_predictions = naive_predictor.predict(train_features) 126 | # naive_test_predictions = naive_predictor.predict(test_features) 127 | 128 | # # Training/Testing Performance 129 | # naive_train_r2 = r2_score(train_labels, naive_train_predictions) 130 | # naive_test_r2 = r2_score(test_labels, naive_test_predictions) 131 | 132 | # print('Naive LightGBM: Training R^2: %.4f, Testing R^2: %.4f' % (naive_train_r2, naive_test_r2)) 133 | 134 | 135 | -------------------------------------------------------------------------------- /kxy/finance/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see <https://www.gnu.org/licenses/>. 19 | """ 20 | from .corr import * -------------------------------------------------------------------------------- /kxy/finance/corr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | import requests 5 | import sys 6 | from time import time, sleep 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | try: 12 | get_ipython().__class__.__name__ 13 | from halo import HaloNotebook as Halo 14 | except: 15 | from halo import Halo 16 | 17 | from kxy.api import APIClient, upload_data 18 | 19 | # Cache old job ids to avoid being charged twice for the same job. 20 | IACORR_JOB_IDS = {} 21 | 22 | def information_adjusted_correlation(data_df, market_column, asset_column): 23 | """ 24 | Estimate the information-adjusted correlation between an asset return :math:`r` and the market return :math:`r_m`: :math:`\\text{IA-Corr}\\left(r, r_m \\right) := \\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right) \\left[1 - e^{-2I(r, r_m)} \\right]`, where :math:`\\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right)` is the sign of the Pearson correlation coefficient. 25 | 26 | Unlike Pearson's correlation coefficient, which is 0 if and only if asset return and market return are **decorrelated** (i.e. 
they exhibit no linear relation), information-adjusted correlation is 0 if and only if market and asset returns are **statistically independent** (i.e. they exhibit no relation, linear or nonlinear). 27 | 28 | 29 | Parameters 30 | ---------- 31 | data_df : pandas.DataFrame 32 | The pandas DataFrame containing the data. 33 | market_column : str 34 | The name of the column containing market returns. 35 | asset_column : str 36 | The name of the column containing asset returns. 37 | 38 | 39 | Returns 40 | ------- 41 | result : float 42 | The information-adjusted correlation. 43 | 44 | """ 45 | assert market_column in data_df.columns, 'The market column should be a column of the dataframe.' 46 | assert asset_column in data_df.columns, 'The asset column should be a column of the dataframe.' 47 | assert np.can_cast(data_df[market_column], float), 'The market return column should be numeric' 48 | assert np.can_cast(data_df[asset_column], float), 'The asset return column should be numeric' 49 | 50 | k = 0 51 | kp = 0 52 | max_k = 100 53 | spinner = Halo(text='Waiting for results from the backend.', spinner='dots') 54 | spinner.start() 55 | 56 | df = data_df[[market_column, asset_column]] 57 | file_name = upload_data(df) 58 | if file_name: 59 | job_id = IACORR_JOB_IDS.get(file_name, None) 60 | 61 | if job_id: 62 | api_response = APIClient.route( 63 | path='/wk/ia-corr', method='POST', 64 | file_name=file_name, market_column=market_column, \ 65 | asset_column=asset_column, \ 66 | timestamp=int(time()), job_id=job_id) 67 | else: 68 | api_response = APIClient.route( 69 | path='/wk/ia-corr', method='POST', \ 70 | file_name=file_name, market_column=market_column, \ 71 | asset_column=asset_column, \ 72 | timestamp=int(time())) 73 | 74 | initial_time = time() 75 | while api_response.status_code == requests.codes.ok and k < max_k: 76 | if kp%2 != 0: 77 | sleep(2 if kp<5 else 5 if k < max_k-4 else 300) 78 | kp += 4 79 | k = kp//2 80 | else: 81 | try: 82 | response = api_response.json() 83 | if 'job_id' in response: 84 | job_id = response['job_id'] 85 | IACORR_JOB_IDS[file_name] = job_id 86 | sleep(2 if kp<5 else 5 if k < max_k-4 else 300) 87 | kp += 4 88 | k = kp//2 89 | 90 | # Note: it is important to pass the job_id to avoid being charged twice for the same work. 91 | api_response = APIClient.route( 92 | path='/wk/ia-corr', method='POST', 93 | file_name=file_name, market_column=market_column, \ 94 | asset_column=asset_column, \ 95 | timestamp=int(time()), job_id=job_id) 96 | 97 | try: 98 | response = api_response.json() 99 | if 'eta' in response: 100 | progress_text = '%s%% Completed.' % response['progress_pct'] if 'progress_pct' in response else '' 101 | spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (response['eta'], progress_text) 102 | except: 103 | pass 104 | 105 | if 'job_id' not in response: 106 | duration = int(time()-initial_time) 107 | duration = str(duration) + 's' if duration < 60 else str(duration//60) + 'min' 108 | spinner.text = 'Received results from the backend in %s' % duration 109 | spinner.succeed() 110 | 111 | if 'ia-corr' in response: 112 | return response['ia-corr'] 113 | else: 114 | return np.nan 115 | 116 | except: 117 | spinner.text = 'The backend encountered an unexpected error we are looking into. Please try again later.' 118 | spinner.fail() 119 | logging.exception('\nInformation-adjusted correlation failed. 
Last HTTP code: %s' % api_response.status_code) 120 | return None 121 | 122 | 123 | if api_response.status_code != requests.codes.ok: 124 | spinner.text = 'The backend is taking longer than expected. Please try again later.' 125 | spinner.fail() 126 | try: 127 | response = api_response.json() 128 | if 'message' in response: 129 | logging.error('\n%s' % response['message']) 130 | except: 131 | logging.error('\nInformation-adjusted correlation failed. Last HTTP code: %s' % api_response.status_code) 132 | 133 | return None 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /kxy/learning/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see <https://www.gnu.org/licenses/>. 19 | """ 20 | from .base_learners import * 21 | from .leanml_predictor import * -------------------------------------------------------------------------------- /kxy/learning/pytorch_early_termination.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | PyTorch code to terminate training of a deep learning regressor or classifier when the running loss is much 5 | lower than a threshold, typically the theoretical-best. 6 | """ 7 | import logging 8 | import numpy as np 9 | 10 | class TerminateIfOverfittedPT(object): 11 | ''' 12 | PyTorch event handler that terminates training when the running loss is smaller than the theoretical best, which is a strong indication that the model will end up overfitting. 13 | 14 | Parameters 15 | ---------- 16 | theoretical_best : float 17 | The theoretical-smallest loss achievable without overfitting, obtained using :code:`df.kxy.data_valuation`. 18 | loss_key : str 19 | Which loss to base early-termination on. Example values are: :code:`'loss'`, :code:`'classification_error'`, and any other registered loss metrics. 20 | 21 | 22 | .. 
seealso:: 23 | 24 | :ref:`kxy.pre_learning.achievable_performance.data_valuation ` 25 | 26 | ''' 27 | def __init__(self, theoretical_best, loss_key): 28 | self.theoretical_best = theoretical_best 29 | self.loss_key = loss_key 30 | 31 | 32 | def __call__(self, engine): 33 | ''' ''' 34 | logs = engine.state.metrics or {} 35 | if 'accuracy' in logs: 36 | logs['classification_error'] = 1.-logs['accuracy'] 37 | 38 | loss = logs.get(self.loss_key, -np.inf) 39 | if loss < self.theoretical_best: 40 | logging.warning('Loss %s (%.4f) is much smaller than the theoretical best %.4f' % (self.loss_key, loss, self.theoretical_best)) 41 | engine.terminate() 42 | 43 | 44 | -------------------------------------------------------------------------------- /kxy/learning/tensorflow_early_termination.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Tensorflow v2 code to terminate training of a deep learning regressor or classifier when the running loss is much 5 | lower than a threshold, typically the theoretical-best. 6 | """ 7 | import logging 8 | import numpy as np 9 | from tensorflow.keras.callbacks import Callback 10 | 11 | 12 | class TerminateIfOverfittedTF(Callback): 13 | ''' 14 | Tensorflow callback that terminates training at the end of a batch when the running loss is smaller than the theoretical best, which is a strong indication that the model will end up overfitting. 15 | 16 | Parameters 17 | ---------- 18 | theoretical_best : float 19 | The theoretical-smallest loss achievable without overfitting, obtained using :code:`df.kxy.data_valuation`. 20 | loss_key : str 21 | Which loss to base early-termination on. Example values are: :code:`'loss'`, :code:`'classification_error'`, and any other registered loss metrics. 22 | 23 | 24 | 25 | .. seealso:: 26 | 27 | :ref:`kxy.pre_learning.achievable_performance.data_valuation `. 28 | 29 | 30 | ''' 31 | def __init__(self, theoretical_best, loss_key): 32 | super(TerminateIfOverfittedTF, self).__init__() 33 | self._supports_tf_logs = True 34 | self.theoretical_best = theoretical_best 35 | self.loss_key = loss_key 36 | 37 | def on_batch_end(self, batch, logs=None): 38 | ''' ''' 39 | logs = logs or {} 40 | if 'accuracy' in logs: 41 | logs['classification_error'] = 1.-logs['accuracy'] 42 | 43 | loss = logs.get(self.loss_key, -np.inf) 44 | if loss < self.theoretical_best: 45 | logging.warning('Loss %s (%.4f) is much smaller than the theoretical best %.4f' % (self.loss_key, loss, self.theoretical_best)) 46 | self.model.stop_training = True 47 | -------------------------------------------------------------------------------- /kxy/misc/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see <https://www.gnu.org/licenses/>. 19 | """ 20 | from .boruta import * 21 | from .rfe import * 22 | from .predictors import * 23 | from .exceptions import * -------------------------------------------------------------------------------- /kxy/misc/exceptions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | class LongerThanExpectedException(Exception): 4 | pass -------------------------------------------------------------------------------- /kxy/misc/mind.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | TensorFlow Implementation of MIND ([1]) under Spearman rank correlation constraints. 5 | 6 | [1] Kom Samo, Y. (2021). Inductive Mutual Information Estimation: A Convex Maximum-Entropy Copula Approach. Proceedings of The 24th International Conference on Artificial Intelligence and Statistics, in Proceedings of Machine Learning Research 130:2242-2250. Available from https://proceedings.mlr.press/v130/kom-samo21a.html. 7 | """ 8 | import numpy as np 9 | 10 | from kxy.misc.tf import CopulaLearner 11 | 12 | def copula_entropy(z, subsets=[]): 13 | ''' 14 | Estimate the entropy of the copula distribution of a d-dimensional random vector using MIND ([1]) with Spearman rank correlation constraints. 15 | 16 | 17 | Parameters 18 | ---------- 19 | z : np.array 20 | Array whose rows are samples from the d-dimensional random vector and whose columns are its coordinates. 21 | 22 | 23 | Returns 24 | ------- 25 | ent : float 26 | The estimated copula entropy. 27 | ''' 28 | if len(z.shape)==1 or z.shape[1]==1: 29 | return 0.0 30 | 31 | d = z.shape[1] 32 | cl = CopulaLearner(d, subsets=subsets) 33 | cl.fit(z) 34 | ent = min(cl.copula_entropy, 0.0) 35 | 36 | return ent 37 | 38 | 39 | 40 | def mutual_information(y, x): 41 | ''' 42 | Estimate the mutual information between two random vectors using MIND ([1]) with Spearman rank correlation constraints. 43 | 44 | 45 | Parameters 46 | ---------- 47 | y : np.array 48 | Array whose rows are samples from the first random vector and whose columns are its coordinates. 49 | x : np.array 50 | Array whose rows are samples from the second random vector and whose columns are its coordinates. 51 | 52 | 53 | Returns 54 | ------- 55 | mi : float 56 | The estimated mutual information. 
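Example
-------
A minimal usage sketch; the sample size and noise level below are arbitrary illustrative choices:

.. code-block:: python

    import numpy as np
    from kxy.misc.mind import mutual_information

    y = np.random.randn(1000, 2)          # samples from a 2-dimensional random vector
    x = y + 0.1*np.random.randn(1000, 2)  # a noisy transformation of y
    mi = mutual_information(y, x)         # estimated mutual information (in nats)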
57 | ''' 58 | y = y[:, None] if len(y.shape)==1 else y 59 | x = x[:, None] if len(x.shape)==1 else x 60 | z = np.concatenate([y, x], axis=1) 61 | huy = copula_entropy(y) 62 | hux = copula_entropy(x) 63 | huz = copula_entropy(z) 64 | mi = max(huy+hux-huz, 0.0) 65 | 66 | return mi 67 | 68 | 69 | def run_d_dimensional_gaussian_experiment(d, rho, n=1000): 70 | ''' 71 | Estimate the mutual information between two d-dimensional Gaussian vectors whose coordinate pairs have correlation rho, and return it alongside the theoretical value. ''' 72 | # Cholesky decomposition of corr = np.array([[1., rho], [rho, 1.]]) 73 | L = np.array([[1., 0.], [rho, np.sqrt(1.-rho*rho)]]) 74 | y = np.empty((n, d)) 75 | x = np.empty((n, d)) 76 | for i in range(d): 77 | u = np.random.randn(n, 2) 78 | z = np.dot(L, u.T).T 79 | y[:, i] = z[:, 0].copy() 80 | x[:, i] = z[:, 1].copy() 81 | 82 | estimated_mi = mutual_information(y, x) 83 | true_mi = -d*0.5*np.log(1.-rho*rho) 84 | 85 | return estimated_mi, true_mi 86 | 87 | 88 | 89 | if __name__ == '__main__': 90 | rho = 0.95 91 | d = 20 92 | estimated_mi, true_mi = run_d_dimensional_gaussian_experiment(d, rho) 93 | print('%dd Gaussian Mutual Information: Estimated %.4f, True (theoretical) %.4f' % (\ 94 | d, estimated_mi, true_mi)) 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /kxy/misc/naive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | from time import time 6 | import numpy as np 7 | 8 | class NaiveLearner(object): 9 | """ 10 | Learner that trains a model on all features, without any feature selection. 11 | 12 | 13 | """ 14 | def __init__(self, learner_func, path=None): 15 | """ 16 | Constructor. 17 | 18 | Parameters 19 | ---------- 20 | learner_func : func | callable 21 | Function or callable that expects one optional argument :code:`n_vars` and returns an instance of a supervised learner (regressor or classifier) following the scikit-learn convention, and expecting :code:`n_vars` features. 22 | 23 | Specifically, the learner should have a :code:`fit(x_train, y_train)` method. The learner should also have a :code:`feature_importances_` property or attribute, which is an array or a list containing feature importances once the model has been trained. 24 | 25 | There should be as many importance scores in :code:`feature_importances_` as columns in :code:`x_train`. 26 | 27 | """ 28 | self.selected_variables = [] 29 | self.learner_func = learner_func 30 | self.path = path 31 | 32 | 33 | def fit(self, x_df, y_df): 34 | """ 35 | Fit the model without feature selection. 36 | 37 | Parameters 38 | ---------- 39 | x_df : pd.DataFrame 40 | A dataframe containing all features. 41 | y_df : pd.DataFrame 42 | A dataframe containing the target. 43 | 44 | Attributes 45 | ---------- 46 | selected_variables : list 47 | The list of features. 48 | 49 | Returns 50 | ------- 51 | m : sklearn-like model (an instance returned by :code:`learner_func`) 52 | An instance returned by :code:`learner_func` trained with all features. 
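Example
-------
A minimal sketch of a compatible :code:`learner_func`; scikit-learn's random forest is just an illustrative choice satisfying the contract above:

.. code-block:: python

    from sklearn.ensemble import RandomForestRegressor

    def learner_func(n_vars=None, path=None):
        # Returns a scikit-learn-style learner with a fit method and a
        # feature_importances_ attribute, as required by the constructor.
        return RandomForestRegressor(n_estimators=100)

    model = NaiveLearner(learner_func).fit(x_df, y_df)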
53 | 54 | """ 55 | columns = [_ for _ in x_df.columns] 56 | y = y_df.values 57 | x = x_df[columns].values 58 | n_vars = len(columns) 59 | m = self.learner_func(n_vars=n_vars, path=self.path) 60 | m.fit(x, y) 61 | self.selected_variables = columns 62 | 63 | return m 64 | 65 | 66 | -------------------------------------------------------------------------------- /kxy/misc/rfe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import logging 5 | from time import time 6 | import numpy as np 7 | 8 | try: 9 | from tqdm import tqdm 10 | except: 11 | logging.warning('RFE requires tqdm, which does not seem installed.') 12 | 13 | 14 | class RFE(object): 15 | """ 16 | Implementation of the Recursive Feature Elimination (RFE) feature selection algorithm. 17 | 18 | Reference: 19 | """ 20 | def __init__(self, learner_func, path=None): 21 | """ 22 | Constructor. 23 | 24 | Parameters 25 | ---------- 26 | learner_func : func | callable 27 | Function or callable that expects one optional argument :code:`n_vars` and returns an instance of a supervised learner (regressor or classifier) following the scikit-learn convention, and expecting :code:`n_vars` features. 28 | 29 | Specifically, the learner should have a :code:`fit(x_train, y_train)` method. The learner should also have a :code:`feature_importances_` property or attribute, which is an array or a list containing feature importances once the model has been trained. 30 | 31 | There should be as many importance scores in :code:`feature_importances_` as columns in :code:`x_train`. 32 | 33 | """ 34 | self.selected_variables = [] 35 | self.learner_func = learner_func 36 | self.path = path 37 | 38 | 39 | def fit(self, x_df, y_df, n_vars, max_duration=None): 40 | """ 41 | Performs a run of the Recursive Feature Elimination (RFE) feature selection algorithm. 42 | 43 | Starting with all features, we recursively train a learner, calculate all feature importance scores, remove the least important feature, and repeat until we are left with :code:`n_vars` features. 44 | 45 | Parameters 46 | ---------- 47 | x_df : pd.DataFrame 48 | A dataframe containing all features. 49 | y_df : pd.DataFrame 50 | A dataframe containing the target. 51 | n_vars : int 52 | The number of features to keep. 53 | max_duration : float | None (default) 54 | If not None, then feature elimination will stop after this many seconds. 55 | 56 | Attributes 57 | ---------- 58 | selected_variables : list 59 | The list of the :code:`n_vars` features we kept. 60 | 61 | 62 | Returns 63 | ------- 64 | m : sklearn-like model (an instance returned by :code:`learner_func`) 65 | An instance returned by :code:`learner_func` trained with the :code:`n_vars` features we kept. 66 | 67 | """ 68 | columns = [_ for _ in x_df.columns] 69 | y = y_df.values 70 | 71 | # Fit the model 72 | x = x_df[columns].values 73 | current_n_vars = len(columns) 74 | start_time = time() 75 | m = self.learner_func(n_vars=current_n_vars) 76 | m.fit(x, y) 77 | importances = [_ for _ in m.feature_importances_] 78 | 79 | n_rounds = max(current_n_vars-n_vars, 0) 80 | for _ in tqdm(range(n_rounds)): 81 | duration = time()-start_time 82 | if max_duration and duration > max_duration: 83 | logging.warning('We have exceeded the configured maximum duration %.2fs: exiting...' 
% max_duration) 84 | break 85 | 86 | # Remove the least important variable 87 | importances = [_ for _ in m.feature_importances_] 88 | least_important_ix = np.argmin(np.abs(importances)) 89 | importances.pop(least_important_ix) 90 | least_important_feature = columns[least_important_ix] 91 | logging.info('Deleting feature %s' % least_important_feature) 92 | columns.remove(least_important_feature) 93 | current_n_vars = len(columns) 94 | 95 | # Re-fit the model 96 | x = x_df[columns].values 97 | m = self.learner_func(n_vars=current_n_vars, path=self.path) 98 | m.fit(x, y) 99 | 100 | self.selected_variables = [col for _, col in sorted(zip(importances, columns), reverse=True)] 101 | 102 | return m 103 | 104 | 105 | -------------------------------------------------------------------------------- /kxy/misc/tf/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see <https://www.gnu.org/licenses/>. 19 | """ 20 | try: 21 | from pkg_resources import parse_version 22 | import tensorflow as tf 23 | assert parse_version(tf.__version__) >= parse_version('2.4.1') 24 | except: 25 | import logging 26 | logging.warning('You need tensorflow version 2.4.1 or higher to estimate mutual information or copula entropy locally.') 27 | 28 | from .generators import * 29 | from .ops import * 30 | from .config import * 31 | from .initializers import * 32 | from .layers import * 33 | from .losses import * 34 | from .models import * 35 | from .learners import * -------------------------------------------------------------------------------- /kxy/misc/tf/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Global default training configs 5 | """ 6 | # LEARNING PARAMETERS 7 | LR = 0.005 8 | EPOCHS = 20 9 | 10 | # ADAM PARAMETERS 11 | BETA_1 = 0.9 12 | BETA_2 = 0.999 13 | EPSILON = 1e-04 14 | AMSGRAD = False 15 | BATCH_SIZE = 500 16 | 17 | 18 | def set_default_parameter(name, value): 19 | ''' 20 | Utility function to change parameters above at runtime. 21 | ''' 22 | import logging 23 | globals()[name.upper()] = value 24 | return 25 | 26 | def get_default_parameter(name): 27 | return eval(name.upper()) -------------------------------------------------------------------------------- /kxy/misc/tf/generators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Custom Tensorflow generators. 
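Each generator below yields batches of shape (batch_size, d, 2): channel 0 holds samples from the data distribution, and channel 1 holds samples from a reference distribution (independent uniforms for copula learning, label-shuffled samples for PFS).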
5 | """ 6 | import numpy as np 7 | import tensorflow as tf 8 | tf.keras.backend.set_floatx('float64') 9 | tf.config.threading.set_inter_op_parallelism_threads(2) 10 | tf.config.threading.set_intra_op_parallelism_threads(8) 11 | tf.config.set_soft_device_placement(True) 12 | from tensorflow.keras.utils import Sequence 13 | 14 | LOCAL_SEED = None 15 | 16 | def set_generators_seed(seed): 17 | globals()['LOCAL_SEED'] = seed 18 | 19 | 20 | rankdata = lambda x: 1.+np.argsort(np.argsort(x, axis=0), axis=0) 21 | class CopulaBatchGenerator(Sequence): 22 | ''' 23 | Random batch generator of maximum-entropy copula learning. 24 | ''' 25 | def __init__(self, z, batch_size=1000, steps_per_epoch=100): 26 | self.batch_size = batch_size 27 | self.d = z.shape[1] 28 | self.n = z.shape[0] 29 | self.z = z 30 | self.steps_per_epoch = steps_per_epoch 31 | self.emp_u = rankdata(self.z)/(self.n + 1.) 32 | self.emp_u[np.isnan(self.z)] = 0.5 33 | self.rnd_gen = np.random.default_rng(LOCAL_SEED) 34 | 35 | if self.n < 200*self.d: 36 | dn = 200*self.d - self.n 37 | selected_rows = self.rnd_gen.choice(self.n, dn, replace=True) 38 | emp_u = self.emp_u[selected_rows, :].copy() 39 | scale = 1./(100.*self.n) 40 | emp_u += (scale*self.rnd_gen.uniform(size=emp_u.shape) - 0.5*scale) 41 | self.emp_u = np.concatenate([self.emp_u, emp_u], axis=0) 42 | self.n = self.emp_u.shape[0] 43 | 44 | self.batch_selector = self.rnd_gen.choice(self.n, self.batch_size*self.steps_per_epoch, replace=True) 45 | self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size)) 46 | 47 | 48 | def getitem_ndarray(self, idx): 49 | ''' ''' 50 | i = idx % self.steps_per_epoch 51 | selected_rows = self.batch_selector[i] 52 | emp_u_ = self.emp_u[selected_rows, :] 53 | z_p = emp_u_.copy() 54 | z_q = self.rnd_gen.uniform(size=emp_u_.shape) 55 | 56 | z = np.empty((self.batch_size, self.d, 2)) 57 | z[:, :, 0] = z_p 58 | z[:, :, 1] = z_q 59 | batch_x = z 60 | batch_y = np.ones((self.batch_size, 2)) # Not used 61 | return batch_x, batch_y 62 | 63 | 64 | def __getitem__(self, idx): 65 | ''' ''' 66 | batch_x, batch_y = self.getitem_ndarray(idx) 67 | return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y) 68 | 69 | 70 | def __len__(self): 71 | return self.steps_per_epoch 72 | 73 | 74 | 75 | class PFSBatchGenerator(Sequence): 76 | ''' 77 | Random batch generator. 
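Used for Principal Feature Selection: each batch pairs genuine samples (x, y) in channel 0 with samples whose y (and oy, when provided) coordinates have been shuffled across rows in channel 1, so that downstream losses can contrast the joint distribution with the product of its marginals.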
78 | ''' 79 | def __init__(self, x, y, ox=None, oy=None, batch_size=1000, steps_per_epoch=100, n_shuffle=5): 80 | self.rnd_gen = np.random.default_rng(LOCAL_SEED) 81 | assert x.shape[0] == y.shape[0] 82 | self.batch_size = batch_size 83 | self.n_shuffle = n_shuffle 84 | self.n = x.shape[0] 85 | 86 | x = x if len(x.shape) > 1 else x[:, None] 87 | y = y if len(y.shape) > 1 else y[:, None] 88 | ox = ox if ox is None or len(ox.shape) > 1 else ox[:, None] 89 | oy = oy if oy is None or len(oy.shape) > 1 else oy[:, None] 90 | 91 | self.x = x 92 | self.y = y 93 | self.ox = ox 94 | self.oy = oy 95 | self.z = np.concatenate([self.x, self.y, self.ox, self.oy], axis=1) if (not self.ox is None and not self.oy is None) else \ 96 | np.concatenate([self.x, self.y, self.ox], axis=1) if (not self.ox is None) else \ 97 | np.concatenate([self.x, self.y], axis=1) 98 | self.d = self.z.shape[1] 99 | 100 | self.steps_per_epoch = steps_per_epoch 101 | replace = False if self.n > self.batch_size*self.steps_per_epoch else True 102 | self.batch_selector = self.rnd_gen.choice(self.n, self.batch_size*self.steps_per_epoch, replace=replace) 103 | self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size)) 104 | 105 | 106 | def getitem_ndarray(self, idx): 107 | ''' ''' 108 | i = idx % self.steps_per_epoch 109 | selected_rows = self.batch_selector[i] 110 | x_ = self.x[selected_rows, :] 111 | y_ = self.y[selected_rows, :] 112 | z_ = self.z[selected_rows, :] 113 | if not self.ox is None: 114 | ox_ = self.ox[selected_rows, :] 115 | if not self.oy is None: 116 | oy_ = self.oy[selected_rows, :] 117 | 118 | z_p = None 119 | z_q = None 120 | for _ in range(self.n_shuffle): 121 | z_p = z_.copy() if z_p is None else np.concatenate([z_p, z_.copy()], axis=0) 122 | y_q = y_.copy() 123 | randomize = np.arange(y_q.shape[0]) 124 | self.rnd_gen.shuffle(randomize) 125 | y_q = y_q[randomize] 126 | if not self.oy is None: 127 | oy_q = oy_.copy() 128 | oy_q = oy_q[randomize] 129 | z_q_ = np.concatenate([x_, y_q.copy(), ox_, oy_q], axis=1) if (not self.ox is None and not self.oy is None) else \ 130 | np.concatenate([x_, y_q.copy(), ox_], axis=1) if not self.ox is None else \ 131 | np.concatenate([x_, y_q.copy()], axis=1) 132 | z_q = z_q_.copy() if z_q is None else np.concatenate([z_q, z_q_.copy()], axis=0) 133 | 134 | z = np.empty((self.batch_size*self.n_shuffle, self.d, 2)) 135 | z[:, :, 0] = z_p 136 | z[:, :, 1] = z_q 137 | batch_x = z 138 | batch_y = np.ones((self.batch_size*self.n_shuffle, 2)) # Not used 139 | return batch_x, batch_y 140 | 141 | 142 | def __getitem__(self, idx): 143 | ''' ''' 144 | batch_x, batch_y = self.getitem_ndarray(idx) 145 | return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y) 146 | 147 | def __len__(self): 148 | return self.steps_per_epoch 149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /kxy/misc/tf/initializers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Custom Tensorflow initializers. 5 | """ 6 | import logging 7 | 8 | from tensorflow.keras.initializers import GlorotUniform 9 | 10 | LOCAL_SEED = None 11 | INITIALIZER_COUNT = 0 12 | 13 | def frozen_glorot_uniform(): 14 | ''' 15 | Deterministic GlorotUniform initializer. 
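When a seed has been set via :code:`set_initializers_seed`, successive calls return initializers seeded with :code:`seed`, :code:`seed+1`, :code:`seed+2`, and so on, so that every layer gets distinct but reproducible initial weights.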
16 | ''' 17 | if LOCAL_SEED is not None: 18 | initializer = GlorotUniform(LOCAL_SEED+INITIALIZER_COUNT) 19 | globals()['INITIALIZER_COUNT'] = INITIALIZER_COUNT + 1 20 | return initializer 21 | else: 22 | return GlorotUniform() 23 | 24 | def set_initializers_seed(seed): 25 | globals()['LOCAL_SEED'] = seed -------------------------------------------------------------------------------- /kxy/misc/tf/layers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Custom tensorflow layers. 5 | """ 6 | import tensorflow as tf 7 | tf.keras.backend.set_floatx('float64') 8 | tf.config.threading.set_inter_op_parallelism_threads(2) 9 | tf.config.threading.set_intra_op_parallelism_threads(8) 10 | tf.config.set_soft_device_placement(True) 11 | from tensorflow.keras.layers import Layer 12 | 13 | 14 | class InitializableDense(Layer): 15 | ''' 16 | Dense layer whose weights (and optional bias) can be initialized with user-provided constant values. ''' 17 | def __init__(self, units, initial_w=None, initial_b=None, bias=False): 18 | ''' 19 | initial_w should be None or a 2D numpy array. 20 | initial_b should be None or a 1D numpy array. 21 | ''' 22 | super(InitializableDense, self).__init__() 23 | self.units = units 24 | self.with_bias = bias 25 | self.w_initializer = 'zeros' if initial_w is None else tf.constant_initializer(initial_w) 26 | 27 | if self.with_bias: 28 | self.b_initializer = 'zeros' if initial_b is None else tf.constant_initializer(initial_b) 29 | 30 | 31 | def build(self, input_shape): 32 | ''' ''' 33 | self.w = self.add_weight(shape=(input_shape[-1], self.units), \ 34 | initializer=self.w_initializer, trainable=True, name='quad_w') 35 | 36 | if self.with_bias: 37 | self.b = self.add_weight(shape=(self.units,), \ 38 | initializer=self.b_initializer, trainable=True, name='quad_b') 39 | 40 | 41 | def call(self, inputs): 42 | ''' ''' 43 | return tf.matmul(inputs, self.w)+self.b if self.with_bias else tf.matmul(inputs, self.w) 44 | -------------------------------------------------------------------------------- /kxy/misc/tf/losses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Custom Tensorflow losses. 5 | """ 6 | from multiprocessing import Pool, cpu_count 7 | import numpy as np 8 | 9 | import tensorflow as tf 10 | tf.keras.backend.set_floatx('float64') 11 | tf.config.threading.set_inter_op_parallelism_threads(2) 12 | tf.config.threading.set_intra_op_parallelism_threads(8) 13 | tf.config.set_soft_device_placement(True) 14 | from tensorflow.python.ops import math_ops 15 | from tensorflow.keras.losses import Loss 16 | 17 | from .ops import rectified_exp, d_rectified_exp 18 | 19 | 20 | class MINDLoss(Loss): 21 | ''' 22 | MIND loss function: :math:`-E_P(T(x, y)^T\theta) + \log E_Q(e^{T(x, y)^T\theta})`. 23 | ''' 24 | def call(self, y_true, y_pred): 25 | ''' ''' 26 | p_samples = y_pred[:, 0] 27 | q_samples = y_pred[:, 1] 28 | mi = -tf.reduce_mean(p_samples) + math_ops.log(tf.reduce_mean(math_ops.exp(q_samples))) 29 | return mi 30 | 31 | 32 | class ApproximateMINDLoss(Loss): 33 | ''' 34 | MIND loss function with a gentler version of the exponential: :math:`-E_P(T(x, y)^T\theta) + \log E_Q(r_exp(T(x, y)^T\theta))`. :math:`r_exp(t) = exp(t)` if :math:`t<0` and :math:`r_exp(t) = 1+t+(1/2)t^2+(1/6)t^3` otherwise. 
35 | ''' 36 | def call(self, y_true, y_pred): 37 | ''' ''' 38 | p_samples = y_pred[:, 0] 39 | q_samples = y_pred[:, 1] 40 | mi = -tf.reduce_mean(p_samples) + math_ops.log(tf.reduce_mean(rectified_exp(q_samples))) 41 | return mi 42 | 43 | 44 | class RectifiedMINDLoss(Loss): 45 | ''' 46 | Rectified-MIND loss function: :math:`-E_P(\log dr_exp(T(x, y)^T\theta)) + \log E_Q(dr_exp(T(x, y)^T\theta))`. :math:`dr_exp(t) = exp(t)` if :math:`t<0` and :math:`dr_exp(t) = 1+t+(1/2)t^2` otherwise. 47 | ''' 48 | def call(self, y_true, y_pred): 49 | ''' ''' 50 | p_samples = y_pred[:, 0] 51 | q_samples = y_pred[:, 1] 52 | mi = -tf.reduce_mean(math_ops.log(d_rectified_exp(p_samples))) + math_ops.log(tf.reduce_mean(d_rectified_exp(q_samples))) 53 | return mi 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /kxy/misc/tf/ops.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Custom math operations. 5 | """ 6 | from multiprocessing import Pool, cpu_count 7 | import numpy as np 8 | 9 | import tensorflow as tf 10 | tf.keras.backend.set_floatx('float64') 11 | tf.config.threading.set_inter_op_parallelism_threads(2) 12 | tf.config.threading.set_intra_op_parallelism_threads(8) 13 | tf.config.set_soft_device_placement(True) 14 | from tensorflow.python.ops import math_ops 15 | 16 | def rectified_exp(t): 17 | ''' 18 | :math:`r_exp(t) = exp(t)` if :math:`t<0` and :math:`r_exp(t) = 1+t+(1/2)t^2+(1/6)t^3` otherwise. 19 | ''' 20 | exp = math_ops.exp(t) 21 | approx_exp = 1.+t+(1./2.)*tf.math.pow(t, 2.)+(1./6.)*tf.math.pow(t, 3.) 22 | condition = tf.greater(t, 0.0) 23 | r_exp = tf.where(condition, x=approx_exp, y=exp) 24 | return r_exp 25 | 26 | 27 | def d_rectified_exp(t): 28 | ''' 29 | :math:`dr_exp(t) = exp(t)` if :math:`t<0` and :math:`dr_exp(t) = 1+t+(1/2)t^2` otherwise. 30 | ''' 31 | dexp = math_ops.exp(t) 32 | approx_dexp = 1.+t+(1./2.)*tf.math.pow(t, 2.) 33 | condition = tf.greater(t, 0.0) 34 | dr_exp = tf.where(condition, x=approx_dexp, y=dexp) 35 | return dr_exp -------------------------------------------------------------------------------- /kxy/pandas_extension/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see <https://www.gnu.org/licenses/>. 
19 | """ 20 | from .finance_accessor import * 21 | from .features_accessor import * 22 | from .learning_accessor import * 23 | from .post_learning_accessor import * 24 | from .pre_learning_accessor import * 25 | from .accessor import * -------------------------------------------------------------------------------- /kxy/pandas_extension/accessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | We define a custom :code:`kxy` `pandas accessor <https://pandas.pydata.org/pandas-docs/stable/development/extending.html>`_ below, 6 | namely the class :code:`Accessor`, that extends the pandas DataFrame class with all our analyses, thereby allowing data scientists to tap into 7 | the power of the :code:`kxy` toolkit within the comfort of their favorite data structure. 8 | 9 | All methods defined in the :code:`Accessor` class are accessible from any DataFrame instance as :code:`df.kxy.<method_name>`, so long as the :code:`kxy` python 10 | package is imported alongside :code:`pandas`. 11 | """ 12 | 13 | 14 | import pandas as pd 15 | 16 | from .features_accessor import FeaturesAccessor 17 | from .finance_accessor import FinanceAccessor 18 | from .learning_accessor import LearningAccessor 19 | from .post_learning_accessor import PostLearningAccessor 20 | from .pre_learning_accessor import PreLearningAccessor 21 | 22 | 23 | @pd.api.extensions.register_dataframe_accessor("kxy") 24 | class Accessor(PreLearningAccessor, LearningAccessor, PostLearningAccessor, FinanceAccessor, FeaturesAccessor): 25 | """ 26 | Extension of the pandas.DataFrame class with the full capabilities of the :code:`kxy` platform. 27 | """ 28 | pass -------------------------------------------------------------------------------- /kxy/pandas_extension/base_accessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | import hashlib 5 | import numpy as np 6 | from scipy.stats import norm 7 | import pandas as pd 8 | 9 | try: 10 | get_ipython().__class__.__name__ 11 | from halo import HaloNotebook as Halo 12 | except: 13 | from halo import Halo 14 | 15 | 16 | class BaseAccessor(object): 17 | """ 18 | Base class inherited by our custom accessors. 19 | """ 20 | def __init__(self, pandas_obj): 21 | self._obj = pandas_obj 22 | 23 | 24 | def check_problem_type(self, problem_type, target_column): 25 | if problem_type == 'regression': 26 | try: 27 | y = self._obj[target_column].astype(float) 28 | except: 29 | raise ValueError('You specified regression as problem_type but the target column is not numeric') 30 | 31 | 32 | def is_discrete(self, column): 33 | """ 34 | Determine whether the input column contains discrete (i.e. as opposed to continuous) observations. 35 | """ 36 | if self.is_categorical(column): 37 | return True 38 | 39 | n = self._obj.shape[0] 40 | values, counts = np.unique(self._obj[column].values, return_counts=True) 41 | unique_n = len(values) 42 | 43 | if unique_n < 0.05*n: 44 | return True 45 | 46 | counts = np.array(list(sorted(counts))) 47 | if np.sum(counts[-10:]) > 0.8*n: 48 | return True 49 | 50 | return False 51 | 52 | 53 | def is_categorical(self, column): 54 | """ 55 | Determine whether the input column contains categorical (i.e. non-ordinal) observations. 
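A column is treated as categorical when its dtype is not numeric and its values cannot be cast to floats.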
56 | """ 57 | if self._obj[column].dtype in [float, int, np.float32, np.float64, np.int32, np.int64]: 58 | return False 59 | 60 | try: 61 | casted = self._obj[column].values.astype(float) 62 | return False 63 | except: 64 | return True 65 | 66 | 67 | @property 68 | def is_too_large(self): 69 | return self._obj.memory_usage(index=False).sum()/(1024.0*1024.0*1024.0) > 1.5 70 | 71 | 72 | def describe(self,): 73 | for col in sorted(self._obj.columns): 74 | print(' ') 75 | print('---------' + '-'.join(['' for c in str(col)])) 76 | print('Column: %s' % col) 77 | print('---------' + '-'.join(['' for c in str(col)])) 78 | if self._obj.kxy.is_categorical(col): 79 | print('Type: Categorical') 80 | labels, counts = np.unique(self._obj[col].values.astype(str), return_counts=True) 81 | labels_with_counts = [(labels[i], 100.*counts[i]/self._obj.shape[0]) \ 82 | for i in range(len(labels))] 83 | labels_with_counts = sorted(labels_with_counts, key=lambda x: -x[1]) 84 | tot = 0.0 85 | for label, freq in labels_with_counts: 86 | print('Frequency: %s%%, Label: %s' % (('%.2f' % freq).rjust(5, ' '), label)) 87 | tot += freq 88 | if tot > 90. and tot < 100.: 89 | print('Other Labels: %.2f%%' % (100.-tot)) 90 | break 91 | else: 92 | if self._obj[col].isna().min() == True: 93 | raise ValueError('Column %s only contains NaN' % col) 94 | 95 | m = self._obj[col].min(skipna=True) 96 | M = self._obj[col].max(skipna=True) 97 | mn = self._obj[col].mean(skipna=True) 98 | q50 = self._obj[col].median(skipna=True) 99 | q25 = self._obj[col].quantile(0.25) 100 | q75 = self._obj[col].quantile(0.75) 101 | 102 | print('Type: Continuous') 103 | print('Max: %s' % ('%.1f' % M if M < 10. else '{:,}'.format(int(M)))) 104 | print('p75: %s' % ('%.1f' % q75 if q75 < 10. else '{:,}'.format(int(q75)))) 105 | print('Mean: %s' % ('%.1f' % mn if mn < 10. else '{:,}'.format(int(mn)))) 106 | print('Median: %s' % ('%.1f' % q50 if q50 < 10. else '{:,}'.format(int(q50)))) 107 | print('p25: %s' % ('%.1f' % q25 if q25 < 10. else '{:,}'.format(int(q25)))) 108 | print('Min: %s' % ('%.1f' % m if m < 10. else '{:,}'.format(int(m)))) 109 | 110 | 111 | def anonymize(self, columns_to_exclude=[]): 112 | """ 113 | Anonymize the dataframe in a manner that leaves all pre-learning and post-learning analyses (including data valuation, variable selection, model-driven improvability, data-driven improvability and model explanation) invariant. 114 | 115 | Any transformation on continuous variables that preserves ranks will not change our pre-learning and post-learning analyses. The same holds for any 1-to-1 transformation on categorical variables. 116 | 117 | This implementation replaces ordinal values (i.e. any column that can be cast as a float) with their within-column Gaussian score. For each non-ordinal column, we form the set of all possible values, we assign a unique integer index to each value in the set, and we systematically replace said value appearing in the dataframe by the hexadecimal code of its associated integer index. 118 | 119 | For regression problems, accurate estimation of RMSE related metrics require the target column (and the prediction column for post-learning analyses) not to be anonymized. 120 | 121 | 122 | Parameters 123 | ---------- 124 | columns_to_exclude: list (optional) 125 | List of columns not to anonymize (e.g. target and prediction columns for regression problems). 
126 | 127 | 128 | Returns 129 | ------- 130 | result : pandas.DataFrame 131 | The anonymized dataframe, with the same shape, index and columns as the original. 132 | """ 133 | spinner = Halo(text='Preparing data upload', spinner='dots') 134 | spinner.start() 135 | df = self._obj.copy() 136 | for col in df.columns: 137 | if col in columns_to_exclude: 138 | continue 139 | 140 | if df.kxy.is_categorical(col) or df[col].dtype.name == 'category': 141 | # Note: By using 'category' as dtype you are implicitly telling us that the 'natural' 142 | # order of values does not matter. 143 | unique_values = list(sorted(set(list(df[col].values)))) 144 | mapping = {unique_values[i]: "0x{:03x}".format(i) for i in range(len(unique_values))} 145 | df[col] = df[col].apply(lambda x: mapping.get(x)) 146 | else: 147 | # Note: Any monotonic transformation applied to any continuous column would work. 148 | # The gaussian scoring below makes no assumption on marginals whatsoever. 149 | x = df[col].values.astype(float) 150 | x = x - np.nanmean(x) 151 | s = np.nanstd(x) 152 | if s > 0.0: 153 | x = x/s 154 | x = norm.cdf(x) 155 | df[col] = np.around(x.copy(), 3) 156 | spinner.succeed() 157 | 158 | return df 159 | 160 | 161 | 162 | def __hash__(self): 163 | return int(hashlib.sha256(self._obj.to_string().encode()).hexdigest(), 16) 164 | 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /kxy/pandas_extension/features_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | from scipy.stats import kurtosis, skew 5 | 6 | def nanskew(a, axis=0, bias=True): 7 | ''' ''' 8 | return skew(a, axis=axis, bias=bias, nan_policy='omit') 9 | 10 | def nankurtosis(a, axis=0, fisher=True, bias=True): 11 | ''' ''' 12 | return kurtosis(a, axis=axis, bias=bias, nan_policy='omit') 13 | 14 | def nanmin(a, axis=None, out=None): 15 | ''' ''' 16 | try: 17 | return np.nanmin(a, axis=axis, out=out) 18 | except: 19 | return np.nan 20 | 21 | def nanmax(a, axis=None, out=None): 22 | ''' ''' 23 | try: 24 | return np.nanmax(a, axis=axis, out=out) 25 | except: 26 | return np.nan 27 | 28 | def nanmaxmmin(a, axis=None, out=None): 29 | ''' ''' 30 | return nanmax(a, axis=axis, out=out)-nanmin(a, axis=axis, out=out) 31 | 32 | def nanmean(a, axis=None, out=None): 33 | ''' ''' 34 | try: 35 | return np.nanmean(a, axis=axis, out=out) 36 | except: 37 | return np.nan 38 | 39 | def nansum(a, axis=None, out=None): 40 | ''' ''' 41 | try: 42 | return np.nansum(a, axis=axis, out=out) 43 | except: 44 | return np.nan 45 | 46 | 47 | def nanstd(a, axis=None, dtype=None, out=None): 48 | ''' ''' 49 | try: 50 | return np.nanstd(a, axis=axis, out=out) 51 | except: 52 | return np.nan 53 | 54 | def nanmedian(a, axis=None, out=None, overwrite_input=False): 55 | ''' ''' 56 | try: 57 | return np.nanmedian(a, axis=axis, out=out, overwrite_input=overwrite_input) 58 | except: 59 | return np.nan 60 | 61 | def q25(x): 62 | ''' ''' 63 | return x.quantile(0.25) 64 | 65 | def q75(x): 66 | ''' ''' 67 | return x.quantile(0.75) 68 | 69 | def nanskewabs(a, axis=0, bias=True): 70 | ''' ''' 71 | return skew(np.abs(a), axis=axis, bias=bias, nan_policy='omit') 72 | 73 | def nankurtosisabs(a, axis=0, fisher=True, bias=True): 74 | ''' ''' 75 | return kurtosis(np.abs(a), axis=axis, bias=bias, nan_policy='omit') 76 | 77 | def nanminabs(a, axis=None, out=None): 78 | ''' ''' 79 | try: 80 | return np.nanmin(np.abs(a), axis=axis, out=out) 81 | except: 82 | 
return np.nan 83 | 84 | def nanmaxabs(a, axis=None, out=None): 85 | ''' ''' 86 | try: 87 | return np.nanmax(np.abs(a), axis=axis, out=out) 88 | except: 89 | return np.nan 90 | 91 | def nanmaxmminabs(a, axis=None, out=None): 92 | ''' ''' 93 | return nanmax(np.abs(a), axis=axis, out=out)-nanmin(np.abs(a), axis=axis, out=out) 94 | 95 | def nanmeanabs(a, axis=None, out=None): 96 | ''' ''' 97 | try: 98 | return np.nanmean(np.abs(a), axis=axis, out=out) 99 | except: 100 | return np.nan 101 | 102 | def nansumabs(a, axis=None, out=None): 103 | ''' ''' 104 | try: 105 | return np.nansum(np.abs(a), axis=axis, out=out) 106 | except: 107 | return np.nan 108 | 109 | def nanstdabs(a, axis=None, dtype=None, out=None): 110 | ''' ''' 111 | try: 112 | return np.nanstd(np.abs(a), axis=axis, out=out) 113 | except: 114 | return np.nan 115 | 116 | def nanmedianabs(a, axis=None, out=None, overwrite_input=False): 117 | ''' ''' 118 | try: 119 | return np.nanmedian(np.abs(a), axis=axis, out=out, overwrite_input=overwrite_input) 120 | except: 121 | return np.nan 122 | 123 | def q25abs(x): 124 | ''' ''' 125 | return np.abs(x).quantile(0.25) 126 | 127 | def q75abs(x): 128 | ''' ''' 129 | return np.abs(x).quantile(0.75) 130 | 131 | def n_unique(x): 132 | ''' ''' 133 | vc = x.value_counts(normalize=True, sort=True, ascending=False) 134 | return len(vc.index) 135 | 136 | def mode(x): 137 | ''' ''' 138 | vc = x.value_counts(normalize=True, sort=True, ascending=False) 139 | return vc.index[0] if len(vc.index) > 0 else np.nan 140 | 141 | def modefreq(x): 142 | ''' ''' 143 | vc = x.value_counts(normalize=True, sort=True, ascending=False) 144 | return vc.values[0] if len(vc.index) > 0 else np.nan 145 | 146 | def lastmode(x): 147 | ''' ''' 148 | vc = x.value_counts(normalize=True, sort=True, ascending=False) 149 | return vc.index[-1] if len(vc.index) > 0 else np.nan 150 | 151 | def lastmodefreq(x): 152 | ''' ''' 153 | vc = x.value_counts(normalize=True, sort=True, ascending=False) 154 | return vc.values[-1] if len(vc.index) > 0 else np.nan 155 | 156 | def nextmode(x): 157 | ''' ''' 158 | vc = x.value_counts(normalize=True, sort=True, ascending=False) 159 | return vc.index[1] if len(vc.index) > 1 else vc.index[0] if len(vc.index) > 0 else np.nan 160 | 161 | def nextmodefreq(x): 162 | ''' ''' 163 | vc = x.value_counts(normalize=True, sort=True, ascending=False) 164 | return vc.values[1] if len(vc.values) > 1 else vc.values[0] if len(vc.index) > 0 else np.nan 165 | 166 | def rmspe_score(y_true, y_pred): 167 | ''' ''' 168 | return np.sqrt(np.nanmean(np.square((y_true.flatten() - y_pred.flatten()) / y_true.flatten()))) 169 | 170 | def neg_rmspe_score(y_true, y_pred): 171 | ''' ''' 172 | return -rmspe_score(y_true, y_pred) 173 | 174 | 175 | def neg_mae_score(y_true, y_pred): 176 | ''' ''' 177 | return -np.nanmean(np.abs(y_true.flatten()-y_pred.flatten())) 178 | 179 | 180 | def neg_rmse_score(y_true, y_pred): 181 | ''' ''' 182 | return -np.sqrt(np.nanmean((y_true.flatten()-y_pred.flatten())**2)) 183 | 184 | 185 | -------------------------------------------------------------------------------- /kxy/pandas_extension/finance_accessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from kxy.finance import information_adjusted_correlation as ia_corr 7 | 8 | from .base_accessor import BaseAccessor 9 | 10 | @pd.api.extensions.register_dataframe_accessor("kxy_finance") 11 | class 
FinanceAccessor(BaseAccessor): 12 | """ 13 | Extension of the pandas.DataFrame class with various finance-specific analytics. 14 | 15 | This class defines the :code:`kxy_finance` `pandas accessor <https://pandas.pydata.org/pandas-docs/stable/development/extending.html>`_. 16 | 17 | All its methods defined are accessible from any DataFrame instance as :code:`df.kxy_finance.<method_name>`, so long as the :code:`kxy` python package is imported alongside :code:`pandas`. 18 | """ 19 | def information_adjusted_beta(self, market_column, asset_column, anonymize=False): 20 | """ 21 | Estimate the information-adjusted beta of an asset return :math:`r` relative to the market return :math:`r_m`: :math:`\\text{IA-}\\beta := \\text{IA-Corr}\\left(r, r_m \\right) \\sqrt{\\frac{\\text{Var}(r)}{\\text{Var}(r_m)}}`, 22 | where :math:`\\text{IA-Corr}\\left(r, r_m \\right) := \\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right) \\left[1 - e^{-2I(r, r_m)} \\right]` denotes the information-adjusted correlation coefficient, with :math:`\\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right)` the sign of the Pearson correlation coefficient. 23 | 24 | Unlike the traditional beta coefficient, namely :math:`\\beta := \\text{Corr}\\left(r, r_m \\right) \\sqrt{\\frac{\\text{Var}(r)}{\\text{Var}(r_m)}}`, that only captures linear relations between market and asset returns, and that is 0 if and only if the two are **decorrelated**, :math:`\\text{IA-}\\beta` captures any relationship between asset return and market return, linear or nonlinear, and is 0 if and only if the two variables are **statistically independent**. 25 | 26 | Parameters 27 | ---------- 28 | market_column : str 29 | The name of the column containing market returns. 30 | asset_column : str 31 | The name of the column containing asset returns. 32 | anonymize : bool 33 | When set to true, your explanatory variables will never be shared with KXY (at no performance cost). 34 | 35 | 36 | Returns 37 | ------- 38 | result : float 39 | The information-adjusted beta coefficient. 40 | 41 | """ 42 | assert market_column in self._obj.columns, 'The market column should be a column' 43 | assert asset_column in self._obj.columns, 'The asset column should be a column' 44 | 45 | m_std = np.nanstd(self._obj[market_column].values) 46 | a_std = np.nanstd(self._obj[asset_column].values) 47 | 48 | return self.information_adjusted_correlation(market_column, asset_column, anonymize=anonymize)*a_std/m_std 49 | 50 | 51 | 52 | def information_adjusted_correlation(self, market_column, asset_column, anonymize=False): 53 | """ 54 | Estimate the information-adjusted correlation between an asset return :math:`r` and the market return :math:`r_m`: :math:`\\text{IA-Corr}\\left(r, r_m \\right) := \\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right) \\left[1 - e^{-2I(r, r_m)} \\right]`, where :math:`\\text{sgn}\\left(\\text{Corr}\\left(r, r_m \\right) \\right)` is the sign of the Pearson correlation coefficient. 55 | 56 | Unlike Pearson's correlation coefficient, which is 0 if and only if asset return and market return are **decorrelated** (i.e. they exhibit no linear relation), information-adjusted correlation is 0 if and only if market and asset returns are **statistically independent** (i.e. they exhibit no relation, linear or nonlinear). 57 | 58 | 59 | Parameters 60 | ---------- 61 | market_column : str 62 | The name of the column containing market returns. 63 | asset_column : str 64 | The name of the column containing asset returns. 
65 | anonymize : bool 66 | When set to true, your explanatory variables will never be shared with KXY (at no performance cost). 67 | 68 | 69 | Returns 70 | ------- 71 | result : float 72 | The information-adjusted correlation. 73 | 74 | """ 75 | assert market_column in self._obj.columns, 'The market column should be a column' 76 | assert asset_column in self._obj.columns, 'The asset column should be a column' 77 | 78 | _obj = self.anonymize(columns_to_exclude=[]) if anonymize else self._obj 79 | 80 | return ia_corr(_obj, market_column, asset_column) 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /kxy/pandas_extension/pre_learning_accessor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import pandas as pd 4 | 5 | from kxy.pre_learning import data_valuation as dv 6 | from kxy.pre_learning import variable_selection as vs 7 | 8 | from .base_accessor import BaseAccessor 9 | 10 | @pd.api.extensions.register_dataframe_accessor("kxy_pre_learning") 11 | class PreLearningAccessor(BaseAccessor): 12 | """ 13 | Extension of the pandas.DataFrame class with various analytics for **pre-learning** in supervised learning problems. 14 | 15 | This class defines the :code:`kxy_pre_learning` `pandas accessor <https://pandas.pydata.org/pandas-docs/stable/development/extending.html>`_. 16 | 17 | All its methods defined are accessible from any DataFrame instance as :code:`df.kxy_pre_learning.<method_name>`, so long as the :code:`kxy` python package is imported alongside :code:`pandas`. 18 | """ 19 | def data_valuation(self, target_column, problem_type=None, anonymize=None, snr='auto', include_mutual_information=False, file_name=None): 20 | """ 21 | Estimate the highest performance metrics achievable when predicting the :code:`target_column` using all other columns. 22 | 23 | When :code:`problem_type=None`, the nature of the supervised learning problem (i.e. regression or classification) is inferred from whether or not :code:`target_column` is categorical. 24 | 25 | 26 | Parameters 27 | ---------- 28 | target_column : str 29 | The name of the column containing true labels. 30 | problem_type : None | 'classification' | 'regression' 31 | The type of supervised learning problem. When None, it is inferred from the column type and the number of distinct values. 32 | anonymize : None | bool 33 | When set to true, your explanatory variables will never be shared with KXY (at no performance cost). When set to None (the default), your data will be anonymized when it is too big. 34 | include_mutual_information : bool 35 | Whether to include the mutual information between target and explanatory variables in the result. 36 | 37 | 38 | Returns 39 | ------- 40 | achievable_performance : pandas.DataFrame 41 | The result is a pandas.DataFrame with columns (where applicable): 42 | 43 | * :code:`'Achievable Accuracy'`: The highest classification accuracy that can be achieved by a model using provided inputs to predict the label. 44 | * :code:`'Achievable R^2'`: The highest :math:`R^2` that can be achieved by a model using provided inputs to predict the label. 45 | * :code:`'Achievable RMSE'`: The lowest Root Mean Square Error that can be achieved by a model using provided inputs to predict the label. 46 | * :code:`'Achievable Log-Likelihood Per Sample'`: The highest true log-likelihood per sample that can be achieved by a model using provided inputs to predict the label. 47 | 48 | 49 | 50 | .. 
admonition:: Theoretical Foundation 51 | 52 | Section :ref:`1 - Achievable Performance`. 53 | 54 | 55 | .. seealso:: 56 | 57 | :ref:`kxy.pre_learning.achievable_performance.data_valuation <data-valuation>` 58 | 59 | """ 60 | assert target_column in self._obj.columns, 'The target_column should be a column' 61 | if problem_type is None: 62 | problem_type = 'classification' if self.is_discrete(target_column) else 'regression' 63 | self.check_problem_type(problem_type, target_column) 64 | 65 | _obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj 66 | 67 | return dv(_obj, target_column, problem_type, snr=snr, include_mutual_information=include_mutual_information, \ 68 | file_name=file_name) 69 | 70 | 71 | def variable_selection(self, target_column, problem_type=None, anonymize=None, snr='auto', file_name=None): 72 | """ 73 | Runs the model-free variable selection analysis. 74 | 75 | When :code:`problem_type=None`, the nature of the supervised learning problem (i.e. regression or classification) is inferred from whether or not :code:`target_column` is categorical. 76 | 77 | 78 | Parameters 79 | ---------- 80 | target_column : str 81 | The name of the column containing true labels. 82 | problem_type : None | 'classification' | 'regression' 83 | The type of supervised learning problem. When None, it is inferred from the column type and the number of distinct values. 84 | anonymize : None | bool 85 | When set to true, your explanatory variables will never be shared with KXY (at no performance cost). When set to None (the default), your data will be anonymized when it is too big. 86 | 87 | Returns 88 | ------- 89 | result : pandas.DataFrame 90 | The result is a pandas.DataFrame with columns (where applicable): 91 | 92 | * :code:`'Selection Order'`: The order in which the associated variable was selected, starting at 1 for the most important variable. 93 | * :code:`'Variable'`: The column name corresponding to the input variable. 94 | * :code:`'Running Achievable R-Squared'`: The highest :math:`R^2` that can be achieved by a model using all variables selected so far, including this one. 95 | * :code:`'Running Achievable Accuracy'`: The highest classification accuracy that can be achieved by a classification model using all variables selected so far, including this one. 96 | * :code:`'Running Achievable RMSE'`: The lowest Root Mean Square Error that can be achieved by a regression model using all variables selected so far, including this one. 97 | 98 | 99 | .. admonition:: Theoretical Foundation 100 | 101 | Section :ref:`2 - Variable Selection Analysis`. 102 | 103 | ..
seealso:: 104 | 105 | :ref:`kxy.pre_learning.variable_selection.variable_selection ` 106 | """ 107 | assert target_column in self._obj.columns, 'The target_column should be a column' 108 | if problem_type is None: 109 | problem_type = 'classification' if self.is_discrete(target_column) else 'regression' 110 | self.check_problem_type(problem_type, target_column) 111 | 112 | _obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj 113 | 114 | return vs(_obj, target_column, problem_type, snr=snr, file_name=file_name) 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /kxy/pfs/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | """ 20 | try: 21 | from .pfs_selector import * 22 | from .pfs_predictor import * 23 | except: 24 | import logging 25 | logging.warn('Importing the PFS submodule failed: Principal Feature Selector might not be available.') -------------------------------------------------------------------------------- /kxy/pfs/pfs_predictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pickle as pkl 7 | 8 | from .pfs_selector import PFS, PCA 9 | 10 | 11 | class PFSPredictor(object): 12 | """ 13 | Principal Feature Selection Predictor. 14 | """ 15 | def _predict(self, obj): 16 | assert hasattr(self, 'models'), 'The model should be first fitted' 17 | assert hasattr(self, 'feature_directions'), 'The model should first be fitted' 18 | assert hasattr(self, 'x_columns'), 'The model should first be fitted' 19 | assert self.feature_directions.shape[0] > 0, 'There should be at least one feature selected' 20 | 21 | z = np.dot(obj[self.x_columns].values, self.feature_directions.T) 22 | y = self.models[0].predict(z) 23 | predictions = pd.DataFrame(index=obj.index) 24 | predictions[self.target_column] = y 25 | 26 | return predictions 27 | 28 | 29 | def predict(self, obj, memory_bound=False): 30 | """ 31 | Make predictions using the fitted model. 32 | 33 | 34 | Parameters 35 | ---------- 36 | obj : pandas.DataFrame 37 | A dataframe containing test explanatory variables/features about which we want to make predictions. 38 | memory_bound : bool (Default False) 39 | Whether we should try to save memory. 40 | 41 | 42 | Returns 43 | ------- 44 | result : pandas.DataFrame 45 | A dataframe with the same index as :code:`obj`, and with one column whose name is the :code:`target_column` used for training. 
46 | """ 47 | if memory_bound: 48 | n = obj.shape[0] 49 | max_n = 1000000 50 | res = pd.DataFrame(index=obj.index) 51 | res[self.target_column] = np.nan 52 | i = 0 53 | while i < n: 54 | res.iloc[i:i+max_n] = self._predict(obj.iloc[i:i+max_n]) 55 | i += max_n 56 | return res 57 | 58 | else: 59 | return self._predict(obj) 60 | 61 | 62 | def save(self, path): 63 | """ 64 | Cache the predictor to disk. 65 | """ 66 | meta_path = path + '-meta-' + self.__class__.__name__ 67 | meta = {'target_column': self.target_column, 'feature_directions': self.feature_directions, 'x_columns': self.x_columns} 68 | with open(meta_path, 'wb') as f: 69 | pkl.dump(meta, f) 70 | self.models[0].save(path + '-' + self.__class__.__name__) 71 | 72 | 73 | @classmethod 74 | def load(cls, path, learner_func): 75 | """ 76 | Load the predictor from disk. 77 | """ 78 | meta_path = path + '-meta-' + cls.__name__ 79 | with open(meta_path, 'rb') as f: 80 | meta = pkl.load(f) 81 | target_column = meta['target_column'] 82 | feature_directions = meta['feature_directions'] 83 | x_columns = meta['x_columns'] 84 | 85 | n_vars = feature_directions.shape[0] 86 | model = learner_func(n_vars=n_vars, path=path + '-' + cls.__name__, safe=False) 87 | 88 | predictor = cls() 89 | predictor.models = [model] 90 | predictor.feature_directions = feature_directions 91 | predictor.target_column = target_column 92 | predictor.x_columns = x_columns 93 | 94 | return predictor 95 | 96 | 97 | def get_feature_selector(self): 98 | """ 99 | """ 100 | return PFS() 101 | 102 | @property 103 | def p(self): 104 | return self.feature_directions.shape[0] 105 | 106 | 107 | def fit(self, obj, target_column, learner_func, max_duration=None, path=None, p=None): 108 | """ 109 | Fits a supervised learner enriched with feature selection using the Principal Feature Selection (PFS) algorithm. 110 | 111 | 112 | Parameters 113 | ---------- 114 | obj : pandas.DataFrame 115 | A dataframe containing training explanatory variables/features as well as the target. 116 | target_column : str 117 | The name of the column in :code:`obj` containing targets. 118 | learner_func : func | callable 119 | Function or callable that expects one optional argument :code:`n_vars` and returns an instance of a superviser learner (regressor or classifier) following the scikit-learn convention, and expecting :code:`n_vars` features. Specifically, the learner should have a :code:`fit(x_train, y_train)` method. The learner should also have a :code:`feature_importances_` property or attribute, which is an array or a list containing feature importances once the model has been trained. There should be as many importance scores in :code:`feature_importances_` as columns in :code:`fit(x_train, y_train)`. 120 | max_duration : float | None (default) 121 | If not None, then feature elimination will stop after this many seconds. 122 | p : int | None (default) 123 | The number of principal features to learn when using one-shot PFS. 124 | 125 | 126 | Attributes 127 | ---------- 128 | feature_directions : np.array 129 | The matrix whose rows are the directions in which to project the original features to get principal features. 130 | target_column : str 131 | The name of the column used as target. 132 | models : list 133 | An array whose first entry is the fitted model. 134 | x_columns : list 135 | The list of columns used for PFS sorted alphabetically. 136 | 137 | 138 | Returns 139 | ------- 140 | results : dict 141 | A dictionary containing, among other things, feature directions. 
142 | 143 | """ 144 | if path: 145 | try: 146 | predictor = PFSPredictor.load(path, learner_func) 147 | self.models = predictor.models 148 | self.feature_directions = predictor.feature_directions 149 | self.target_column = predictor.target_column 150 | self.x_columns = predictor.x_columns 151 | return {'Feature Directions': self.feature_directions} 152 | except: 153 | pass 154 | self.target_column = target_column 155 | self.x_columns = sorted([_ for _ in obj.columns if _ != target_column]) 156 | 157 | x = obj[self.x_columns].values 158 | y = obj[[target_column]].values 159 | 160 | # Construct principal features 161 | principal_feature_selector = self.get_feature_selector() 162 | self.feature_directions = principal_feature_selector.fit(x, y, max_duration=max_duration, p=p) 163 | z = np.dot(x, self.feature_directions.T) # Principal features 164 | 165 | # Train the learner 166 | n_vars = self.feature_directions.shape[0] 167 | m = learner_func(n_vars=n_vars) 168 | m.fit(z, y) 169 | self.models = [m] 170 | if path: 171 | self.save(path) 172 | 173 | results = {'Feature Directions': self.feature_directions} 174 | return results 175 | 176 | 177 | class PCAPredictor(PFSPredictor): 178 | """ 179 | Principal Component Analysis Predictor. 180 | """ 181 | def __init__(self, energy_loss_frac=0.05): 182 | self.energy_loss_frac = energy_loss_frac 183 | 184 | def get_feature_selector(self): 185 | return PCA(energy_loss_frac=self.energy_loss_frac) 186 | 187 | -------------------------------------------------------------------------------- /kxy/pfs/pfs_selector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import copy 4 | from time import time 5 | import logging 6 | import numpy as np 7 | 8 | import tensorflow as tf 9 | from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN 10 | from tensorflow.keras.optimizers import Adam 11 | 12 | from kxy.misc.tf import PFSLearner, PFSOneShotLearner 13 | 14 | 15 | 16 | 17 | def learn_principal_direction(y, x, ox=None, oy=None, epochs=None, expand_y=True): 18 | """ 19 | Learn the i-th principal feature when using :math:`x` to predict :math:`y`. 20 | 21 | Parameters 22 | ---------- 23 | x : np.array 24 | 2D array of shape :math:`(n, d)` containing original features. 25 | y : np.array 26 | Array of shape :math:`(n)` or :math:`(n, 1)` containing targets. 27 | 28 | Returns 29 | ------- 30 | w : np.array 31 | The first principal direction. 32 | mi: float 33 | The mutual information :math:`I(y; w_i^Tx, \\dots, w_1^Tx)`. 34 | """ 35 | dx = 1 if len(x.shape) == 1 else x.shape[1] 36 | dy = 1 if len(y.shape) == 1 else y.shape[1] 37 | dox = 0 if ox is None else 1 if len(ox.shape) == 1 else ox.shape[1] 38 | doy = 0 if oy is None else 1 if len(oy.shape) == 1 else oy.shape[1] 39 | 40 | learner = PFSLearner(dx, dy=dy, dox=dox, doy=doy, expand_y=expand_y) 41 | learner.fit(x, y, ox=ox, oy=oy, epochs=epochs) 42 | 43 | mi = learner.mutual_information 44 | w = learner.feature_direction 45 | ox = learner.fx 46 | oy = learner.gy if expand_y else None 47 | 48 | return w, mi, ox, oy, learner 49 | 50 | 51 | 52 | def learn_principal_directions_one_shot(y, x, p, epochs=None, expand_y=True): 53 | """ 54 | Jointly learn p principal features. 55 | 56 | Parameters 57 | ---------- 58 | x : np.array 59 | 2D array of shape :math:`(n, d)` containing original features. 60 | y : np.array 61 | Array of shape :math:`(n)` or :math:`(n, 1)` containing targets. 
62 | p : int 63 | The number of principal features to learn. 64 | 65 | Returns 66 | ------- 67 | w : np.array 68 | The matrix whose rows are the p principal directions. 69 | """ 70 | dx = 1 if len(x.shape) == 1 else x.shape[1] 71 | learner = PFSOneShotLearner(dx, p=p, expand_y=expand_y) 72 | learner.fit(x, y, epochs=epochs) 73 | w = learner.feature_directions 74 | mi = learner.mutual_information 75 | 76 | return w, mi, learner 77 | 78 | 79 | 80 | 81 | class PFS(object): 82 | """ 83 | Principal Feature Selection. 84 | """ 85 | def fit(self, x, y, p=None, mi_tolerance=0.0001, max_duration=None, epochs=None, seed=None, expand_y=True): 86 | """ 87 | Perform Principal Feature Selection using :math:`x` to predict :math:`y`. 88 | 89 | Specifically, we are looking for a :math:`p \\times d` matrix :math:`W` whose :math:`p` rows are learned sequentially such that :math:`z := Wx` is a good feature vector for predicting :math:`y`. 90 | 91 | Each row of :math:`W` has unit norm (:math:`||w_i||=1`), and the corresponding principal feature, namely :math:`w_i^Tx`, is positively correlated with :math:`y` (i.e. :math:`Cov(y, w_i^Tx) > 0`). 92 | 93 | The first row :math:`w_1` is learned so as to maximize the mutual information :math:`I(y; x^Tw_1)`. 94 | 95 | The second row :math:`w_2` is learned so as to maximize the conditional mutual information :math:`I(y; x^Tw_2 | x^Tw_1)`. 96 | 97 | More generally, the :math:`(i+1)`-th row :math:`w_{i+1}` is learned so as to maximize the conditional mutual information :math:`I(y; x^Tw_{i+1} | [x^Tw_1, ..., x^Tw_i])`. 98 | 99 | 100 | Parameters 101 | ---------- 102 | x : np.array 103 | 2D array of shape :math:`(n, d)` containing original features. 104 | y : np.array 105 | Array of shape :math:`(n)` or :math:`(n, 1)` containing targets. 106 | p : int | None (default) 107 | The number of features to select. When :code:`None` (the default) we stop when the estimated mutual information increment is smaller than the mutual information tolerance parameter, or when we have exceeded the maximum duration. A value of :code:`p` that is not :code:`None` triggers one-shot PFS. 108 | mi_tolerance: float 109 | The smallest estimated mutual information increment required to keep looking for new feature directions. 110 | max_duration : float | None (default) 111 | The maximum amount of time (in seconds) to allocate to PFS. 112 | 113 | 114 | Returns 115 | ------- 116 | W : np.array 117 | 2D array whose rows are directions to use to compute principal features: :math:`z = Wx`. 118 | """ 119 | if seed is not None: 120 | from kxy.misc.tf import set_seed 121 | set_seed(seed) 122 | 123 | if max_duration: 124 | start_time = time() 125 | 126 | rows = [] 127 | d = 1 if len(x.shape) == 1 else x.shape[1] 128 | learners = [] 129 | if p is None: 130 | t = y.flatten().copy() 131 | old_mi = 0.0 132 | ox = None 133 | oy = None 134 | for i in range(d): 135 | w, mi, ox, oy, learner = learn_principal_direction(t, x, ox=ox, oy=oy, epochs=epochs, \ 136 | expand_y=expand_y) 137 | learners += [copy.copy(learner)] 138 | 139 | if mi-old_mi < mi_tolerance: 140 | logging.info('The mutual information %.4f after %d rounds has not increased by more than %.4f: stopping.' % ( 141 | mi, i+1, mi_tolerance)) 142 | break 143 | else: 144 | logging.info('The mutual information has increased from %.4f to %.4f after %d rounds.'
% (old_mi, mi, i+1)) 145 | rows += [w.copy()] 146 | 147 | if max_duration: 148 | if time()-start_time > max_duration: 149 | logging.info('PFS has exceeded the configured maximum duration: exiting.') 150 | break 151 | 152 | old_mi = mi 153 | 154 | if rows == []: 155 | logging.warning('The only principal feature selected is not informative about the target: I(y; w^Tx)=%.4f' % mi) 156 | rows += [w.copy()] 157 | 158 | self.feature_directions = np.array(rows) 159 | self.mutual_information = old_mi 160 | self.learners = learners 161 | else: 162 | # Learn all p principal features jointly. 163 | feature_directions, mi, learner = learn_principal_directions_one_shot(y, x, p, epochs=epochs, \ 164 | expand_y=expand_y) 165 | learners += [copy.copy(learner)] 166 | self.feature_directions = feature_directions 167 | self.mutual_information = mi 168 | self.learners = learners 169 | 170 | return self.feature_directions 171 | 172 | 173 | def max_ent_features_x(self, x): 174 | """ 175 | """ 176 | assert hasattr(self, 'learners'), 'The object should first be fitted.' 177 | 178 | fxs = [] 179 | for learner in self.learners: 180 | fxs += [learner.learned_constraints_x(x)] 181 | 182 | if len(fxs) == 1: 183 | return fxs[0] 184 | else: 185 | return np.concatenate(fxs, axis=1) 186 | 187 | 188 | 189 | class PCA(object): 190 | """ 191 | Principal Component Analysis. 192 | """ 193 | def __init__(self, energy_loss_frac=0.05): 194 | self.energy_loss_frac = energy_loss_frac 195 | 196 | 197 | def fit(self, x, _, max_duration=None, p=None): 198 | """ 199 | """ 200 | cov_x = np.cov(x.T) # Columns in x should represent variables and rows observations. 201 | u, d, v = np.linalg.svd(cov_x) 202 | cum_energy = np.cumsum(d) 203 | energy = cum_energy[-1] 204 | p = len([_ for _ in cum_energy if _ <= (1.-self.energy_loss_frac)*energy]) 205 | 206 | self.feature_directions = u[:, :p].T 207 | 208 | return self.feature_directions 209 | 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /kxy/post_learning/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | """ 20 | from .improvability import * 21 | from .model_explanation import * -------------------------------------------------------------------------------- /kxy/post_learning/model_explanation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Estimation of the top-:math:`k` most valuable variables in a supervised learning problem for every possible :math:`k`, and 5 | the corresponding achievable performances. 
6 | """ 7 | import logging 8 | import requests 9 | import sys 10 | from time import time, sleep 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | try: 16 | get_ipython().__class__.__name__ 17 | from halo import HaloNotebook as Halo 18 | except: 19 | from halo import Halo 20 | 21 | from kxy.api import APIClient, upload_data 22 | from kxy.misc import LongerThanExpectedException 23 | 24 | # Cache old job ids to avoid being charged twice for the same job. 25 | EXPLANATION_JOB_IDS = {} 26 | 27 | def model_explanation(data_df, prediction_column, problem_type, snr='auto', file_name=None): 28 | """ 29 | .. _model-explanation: 30 | Analyzes the variables that a model relies on the most in a brute-force fashion. 31 | 32 | The first variable is the variable the model relies on the most. The second variable is the variable that complements the first variable the most in explaining model decisions etc. 33 | 34 | Running performances should be understood as the performance achievable when trying to guess model predictions using variables with selection order smaller or equal to that of the row. 35 | 36 | When :code:`problem_type=None`, the nature of the supervised learning problem (i.e. regression or classification) is inferred from whether or not :code:`prediction_column` is categorical. 37 | 38 | 39 | Parameters 40 | ---------- 41 | data_df : pandas.DataFrame 42 | The pandas DataFrame containing the data. 43 | prediction_column : str 44 | The name of the column containing true labels. 45 | problem_type : None | 'classification' | 'regression' 46 | The type of supervised learning problem. When None, it is inferred from the column type and the number of distinct values. 47 | file_name : None | str 48 | A unique identifier characterizing data_df in the form of a file name. Do not set this unless you know why. 49 | 50 | 51 | Returns 52 | ------- 53 | result : pandas.DataFrame 54 | The result is a pandas.Dataframe with columns (where applicable): 55 | 56 | * :code:`'Selection Order'`: The order in which the associated variable was selected, starting at 1 for the most important variable. 57 | * :code:`'Variable'`: The column name corresponding to the input variable. 58 | * :code:`'Running Achievable R-Squared'`: The highest :math:`R^2` that can be achieved by a classification model using all variables selected so far, including this one. 59 | * :code:`'Running Achievable Accuracy'`: The highest classification accuracy that can be achieved by a classification model using all variables selected so far, including this one. 60 | * :code:`'Running Achievable RMSE'`: The highest classification accuracy that can be achieved by a classification model using all variables selected so far, including this one. 61 | 62 | 63 | .. admonition:: Theoretical Foundation 64 | 65 | Section :ref:`a) Model Explanation`. 66 | 67 | """ 68 | assert prediction_column in data_df.columns, 'The label column should be a column of the dataframe.' 
69 | assert problem_type.lower() in ['classification', 'regression'] 70 | if problem_type.lower() == 'regression': 71 | assert np.can_cast(data_df[prediction_column], float), 'The prediction column should be numeric' 72 | 73 | k = 0 74 | kp = 0 75 | max_k = 100 76 | 77 | file_name = upload_data(data_df, file_name=file_name) 78 | spinner = Halo(text='Waiting for results from the backend.', spinner='dots') 79 | spinner.start() 80 | 81 | if file_name: 82 | job_id = EXPLANATION_JOB_IDS.get((file_name, prediction_column, problem_type), None) 83 | if job_id: 84 | api_response = APIClient.route( 85 | path='/wk/variable-selection', method='POST', \ 86 | file_name=file_name, target_column=prediction_column, \ 87 | problem_type=problem_type, timestamp=int(time()), job_id=job_id, \ 88 | snr=snr) 89 | else: 90 | api_response = APIClient.route( 91 | path='/wk/variable-selection', method='POST', \ 92 | file_name=file_name, target_column=prediction_column, \ 93 | problem_type=problem_type, timestamp=int(time()), snr=snr) 94 | 95 | initial_time = time() 96 | while api_response.status_code == requests.codes.ok and k < max_k: 97 | if kp%2 != 0: 98 | sleep(2 if kp<5 else 10 if k < max_k-4 else 300) 99 | kp += 1 100 | k = kp//2 101 | 102 | else: 103 | try: 104 | response = api_response.json() 105 | if 'job_id' in response: 106 | job_id = response['job_id'] 107 | EXPLANATION_JOB_IDS[(file_name, prediction_column, problem_type)] = job_id 108 | sleep(2 if kp<5 else 10 if k < max_k-4 else 300) 109 | kp += 1 110 | k = kp//2 111 | 112 | # Note: it is important to pass the job_id to avoid being charged twice for the work. 113 | api_response = APIClient.route( 114 | path='/wk/variable-selection', method='POST', \ 115 | file_name=file_name, target_column=prediction_column, \ 116 | problem_type=problem_type, timestamp=int(time()), job_id=job_id, \ 117 | snr=snr) 118 | 119 | try: 120 | response = api_response.json() 121 | if 'eta' in response: 122 | progress_text = '%s%% Completed.' % response['progress_pct'] if 'progress_pct' in response else '' 123 | spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (response['eta'], progress_text) 124 | except: 125 | pass 126 | 127 | if ('job_id' not in response) or ('selection_order' in response): 128 | duration = int(time()-initial_time) 129 | duration = str(duration) + 's' if duration < 60 else str(duration//60) + 'min' 130 | 131 | result = {} 132 | 133 | if 'selection_order' in response: 134 | result['Selection Order'] = response['selection_order'] 135 | 136 | if 'variable' in response: 137 | result['Variable'] = response['variable'] 138 | 139 | if 'r-squared' in response: 140 | result['Running Achievable R-Squared'] = response['r-squared'] 141 | 142 | if 'log-likelihood' in response: 143 | result['Running Achievable Log-Likelihood Per Sample'] = response['log-likelihood'] 144 | 145 | if 'rmse' in response and problem_type.lower() == 'regression': 146 | result['Running Achievable RMSE'] = response['rmse'] 147 | 148 | if 'accuracy' in response and problem_type.lower() == 'classification': 149 | result['Running Achievable Accuracy'] = response['accuracy'] 150 | 151 | result = pd.DataFrame.from_dict(result) 152 | 153 | if 'selection_order' in response: 154 | result.set_index('Selection Order', inplace=True) 155 | 156 | spinner.text = 'Received results from the backend after %s.' % duration 157 | spinner.succeed() 158 | return result 159 | 160 | 161 | except: 162 | logging.exception('\nModel explanation failed. 
Last HTTP code: %s, Content: %s' % (api_response.status_code, api_response.content)) 163 | spinner.text = 'The backend encountered an unexpected error we are looking into. Please try again later.' 164 | spinner.fail() 165 | return None 166 | 167 | if api_response.status_code != requests.codes.ok: 168 | spinner.text = 'The backend is taking longer than expected. Please try again later.' 169 | spinner.fail() 170 | try: 171 | response = api_response.json() 172 | if 'message' in response: 173 | logging.error('\n%s' % response['message']) 174 | except: 175 | logging.error('\nModel explanation failed. Last HTTP code: %s, Content: %s' % (api_response.status_code, api_response.content)) 176 | 177 | raise LongerThanExpectedException('The backend is taking longer than expected, but rest assured your task is still running. Please try again later to retrieve your results.') 178 | 179 | return None 180 | 181 | 182 | -------------------------------------------------------------------------------- /kxy/pre_learning/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Copyright (C) 2022 KXY TECHNOLOGIES, INC. 5 | Author: Dr Yves-Laurent Kom Samo 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU Affero General Public License 18 | along with this program. If not, see . 19 | """ 20 | from .achievable_performance import * 21 | from .variable_selection import * -------------------------------------------------------------------------------- /kxy/pre_learning/achievable_performance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Estimation of the highest performance achievable in a supervised learning problem. 5 | E.g. :math:`R^2`, RMSE, classification accuracy, true log-likelihood per observation. 6 | """ 7 | import logging 8 | import requests 9 | import sys 10 | from time import time, sleep 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | try: 16 | get_ipython().__class__.__name__ 17 | from halo import HaloNotebook as Halo 18 | except: 19 | from halo import Halo 20 | 21 | from kxy.api import APIClient, upload_data 22 | from kxy.misc import LongerThanExpectedException 23 | 24 | # Cache old job ids to avoid being charged twice for the same job. 25 | VALUATION_JOB_IDS = {} 26 | 27 | def data_valuation(data_df, target_column, problem_type, snr='auto', include_mutual_information=False, file_name=None): 28 | """ 29 | .. _data-valuation: 30 | Estimate the highest performance metrics achievable when predicting the :code:`target_column` using all other columns. 31 | 32 | When :code:`problem_type=None`, the nature of the supervised learning problem (i.e. regression or classification) is inferred from whether or not :code:`target_column` is categorical. 33 | 34 | 35 | Parameters 36 | ---------- 37 | data_df : pandas.DataFrame 38 | The pandas DataFrame containing the data.
39 | target_column : str 40 | The name of the column containing true labels. 41 | problem_type : None | 'classification' | 'regression' 42 | The type of supervised learning problem. When None, it is inferred from the column type and the number of distinct values. 43 | include_mutual_information : bool 44 | Whether to include the mutual information between target and explanatory variables in the result. 45 | file_name : None | str 46 | A unique identifier characterizing data_df in the form of a file name. Do not set this unless you know why. 47 | 48 | 49 | 50 | Returns 51 | ------- 52 | achievable_performance : pandas.Dataframe 53 | The result is a pandas.Dataframe with columns (where applicable): 54 | 55 | * :code:`'Achievable Accuracy'`: The highest classification accuracy that can be achieved by a model using provided inputs to predict the label. 56 | * :code:`'Achievable R-Squared'`: The highest :math:`R^2` that can be achieved by a model using provided inputs to predict the label. 57 | * :code:`'Achievable RMSE'`: The lowest Root Mean Square Error that can be achieved by a model using provided inputs to predict the label. 58 | * :code:`'Achievable Log-Likelihood Per Sample'`: The highest true log-likelihood per sample that can be achieved by a model using provided inputs to predict the label. 59 | 60 | 61 | .. admonition:: Theoretical Foundation 62 | 63 | Section :ref:`1 - Achievable Performance`. 64 | """ 65 | assert target_column in data_df.columns, 'The label column should be a column of the dataframe.' 66 | assert problem_type.lower() in ['classification', 'regression'] 67 | if problem_type.lower() == 'regression': 68 | assert np.can_cast(data_df[target_column], float), 'The target column should be numeric' 69 | 70 | k = 0 71 | max_k = 100 72 | 73 | file_name = upload_data(data_df, file_name=file_name) 74 | spinner = Halo(text='Waiting for results from the backend.', spinner='dots') 75 | spinner.start() 76 | 77 | if file_name: 78 | job_id = VALUATION_JOB_IDS.get((file_name, target_column, problem_type, snr), None) 79 | 80 | if job_id: 81 | api_response = APIClient.route( 82 | path='/wk/data-valuation', method='POST', 83 | file_name=file_name, target_column=target_column, \ 84 | problem_type=problem_type, \ 85 | timestamp=int(time()), job_id=job_id, \ 86 | snr=snr) 87 | else: 88 | api_response = APIClient.route( 89 | path='/wk/data-valuation', method='POST', \ 90 | file_name=file_name, target_column=target_column, \ 91 | problem_type=problem_type, timestamp=int(time()), \ 92 | snr=snr) 93 | 94 | initial_time = time() 95 | while api_response.status_code == requests.codes.ok and k < max_k: 96 | try: 97 | response = api_response.json() 98 | if 'eta' in response: 99 | progress_text = '%s%% Completed.' % response['progress_pct'] if 'progress_pct' in response else '' 100 | spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (response['eta'], progress_text) 101 | 102 | if ('job_id' in response) and ('r-squared' not in response): 103 | job_id = response['job_id'] 104 | VALUATION_JOB_IDS[(file_name, target_column, problem_type, snr)] = job_id 105 | k += 1 106 | sleep(15.) 107 | 108 | # Note: it is important to pass the job_id to avoid being charged twice for the same work. 
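# (This idempotency pattern mirrors model_explanation above: the first response
# returns a job_id, which is cached -- here in VALUATION_JOB_IDS -- and replayed
# on every subsequent poll, so the backend attaches the request to the job that
# is already running instead of starting, and billing, a new one.)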
109 | api_response = APIClient.route( 110 | path='/wk/data-valuation', method='POST', 111 | file_name=file_name, target_column=target_column, \ 112 | problem_type=problem_type, \ 113 | timestamp=int(time()), job_id=job_id, \ 114 | snr=snr) 115 | 116 | try: 117 | response = api_response.json() 118 | if 'eta' in response: 119 | progress_text = '%s%% Completed.' % response['progress_pct'] if 'progress_pct' in response else '' 120 | spinner.text = 'Waiting for results from the backend. ETA: %s. %s' % (response['eta'], progress_text) 121 | except: 122 | pass 123 | 124 | if ('job_id' not in response) or ('r-squared' in response): 125 | duration = int(time()-initial_time) 126 | duration = str(duration) + 's' if duration < 60 else str(duration//60) + 'min' 127 | 128 | result = {} 129 | if 'r-squared' in response: 130 | result['Achievable R-Squared'] = [response['r-squared']] 131 | 132 | if 'log-likelihood' in response: 133 | result['Achievable Log-Likelihood Per Sample'] = [response['log-likelihood']] 134 | 135 | if 'rmse' in response and problem_type.lower() == 'regression': 136 | result['Achievable RMSE'] = [response['rmse']] 137 | 138 | if 'accuracy' in response and problem_type.lower() == 'classification': 139 | result['Achievable Accuracy'] = [response['accuracy']] 140 | 141 | if include_mutual_information and 'mi' in response: 142 | result['Mutual Information'] = [response['mi']] 143 | 144 | result = pd.DataFrame.from_dict(result) 145 | 146 | spinner.text = 'Received results from the backend after %s.' % duration 147 | spinner.succeed() 148 | 149 | return result 150 | 151 | except: 152 | logging.exception('\nData valuation failed. Last HTTP code: %s' % api_response.status_code) 153 | spinner.text = 'The backend encountered an unexpected error we are looking into. Please try again later.' 154 | spinner.fail() 155 | return None 156 | 157 | 158 | if api_response.status_code != requests.codes.ok: 159 | spinner.text = 'The backend is taking longer than expected. Try again later.' 160 | spinner.fail() 161 | try: 162 | response = api_response.json() 163 | if 'message' in response: 164 | logging.error('\n%s' % response['message']) 165 | except: 166 | logging.error('\nData valuation failed. Last HTTP code: %s' % api_response.status_code) 167 | 168 | raise LongerThanExpectedException('The backend is taking longer than expected, but rest assured your task is still running.
Please try again later to retrieve your results.') 169 | 170 | return None 171 | 172 | 173 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.18.1 2 | scipy>=1.4.1 3 | pandas>=0.23.0 4 | requests>=2.22.0 5 | pandarallel 6 | halo 7 | ipywidgets -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Feb 27 10:54:00 2020 4 | 5 | @author: ylkomsamo 6 | """ 7 | 8 | import sys 9 | sys.path.append('.') 10 | from setuptools import setup, find_packages 11 | 12 | with open('README.md') as f: 13 | long_description = f.read() 14 | 15 | version = "1.4.11" 16 | setup(name="kxy", 17 | version=version, 18 | zip_safe=False, 19 | license="GPLv3", 20 | author="Dr. Yves-Laurent Kom Samo", 21 | author_email="github@kxy.ai", 22 | url="https://www.kxy.ai", 23 | description = "A Powerful Serverless Pre-Learning and Post-Learning Analysis Toolkit", 24 | long_description=long_description, 25 | long_description_content_type='text/markdown', # This is important! 26 | project_urls={ 27 | "Documentation": "https://www.kxy.ai/reference", 28 | "Source Code": "https://github.com/kxytechnologies/kxy-python/"}, 29 | download_url = "https://github.com/kxytechnologies/kxy-python/archive/v%s.tar.gz" % version, 30 | keywords = ["Feature Engineering", "Feature Selection", "Data Valuation", "Lean ML", "AutoML", "Pre-Learning", "Post-Learning"], 31 | packages=find_packages(exclude=["tests"]), 32 | install_requires=["numpy>=1.13.1", "scipy>=1.4.1", "pandas>=0.23.0", "requests>=2.22.0", "pandarallel", "halo", "ipywidgets", "scikit-learn"], 33 | classifiers=[ 34 | "Environment :: Console", 35 | "Intended Audience :: Developers", 36 | "Intended Audience :: Education", 37 | "Intended Audience :: Science/Research", 38 | "Intended Audience :: Information Technology", 39 | "Natural Language :: English", 40 | "Operating System :: OS Independent", 41 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 42 | "Topic :: Scientific/Engineering :: Information Analysis", 43 | "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", 44 | "Programming Language :: Python :: 3 :: Only", 45 | "Development Status :: 5 - Production/Stable", 46 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 47 | "Topic :: Scientific/Engineering :: Mathematics" 48 | ], 49 | scripts=['bin/kxy'] 50 | ) 51 | -------------------------------------------------------------------------------- /tests/test_data_valuation.py: -------------------------------------------------------------------------------- 1 | from kxy_datasets.regressions import Abalone 2 | 3 | 4 | def test_include_mi(): 5 | dataset = Abalone() 6 | target_column = dataset.y_column 7 | df = dataset.df 8 | results = df.kxy.data_valuation(target_column, problem_type='regression', \ 9 | include_mutual_information=True) 10 | assert 'Mutual Information' in results -------------------------------------------------------------------------------- /tests/test_features.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kxytechnologies/kxy-python/7d7a7d88e6f66280e51ff3f344144c979f60299d/tests/test_features.py -------------------------------------------------------------------------------- /tests/test_finance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import kxy 4 | 5 | def test_ia_corr_anon(): 6 | x = np.random.randn(10000, 2) 7 | df = pd.DataFrame(x, columns=['market_column', 'asset_column']) 8 | iab_anon = df.kxy.information_adjusted_beta('market_column','asset_column', anonymize=True) 9 | iab = df.kxy.information_adjusted_beta('market_column','asset_column', anonymize=False) 10 | assert np.allclose(iab, iab_anon, atol=1e-03), 'Anonymized and non-anonymized results should be identical (%.4f vs %.4f)' % (iab, iab_anon) 11 | 12 | 13 | def test_ia_corr_nan(): 14 | x = np.random.randn(10000, 2) 15 | x[100:200, 0] = np.nan 16 | x[200:300, 1] = np.nan 17 | df = pd.DataFrame(x, columns=['market_column', 'asset_column']) 18 | iab_anon = df.kxy.information_adjusted_beta('market_column','asset_column', anonymize=True) 19 | assert not np.isnan(iab_anon) 20 | iab = df.kxy.information_adjusted_beta('market_column','asset_column', anonymize=False) 21 | assert not np.isnan(iab) 22 | assert np.allclose(iab, iab_anon, atol=1e-03), 'Anonymized and non-anonymized results should be identical (%.4f vs %.4f)' % (iab, iab_anon) 23 | 24 | -------------------------------------------------------------------------------- /tests/test_flow.py: -------------------------------------------------------------------------------- 1 | # from __future__ import unicode_literals 2 | 3 | if __name__ == '__main__': 4 | # import logging 5 | # logging.basicConfig(level=logging.DEBUG) 6 | import numpy as np 7 | import pandas as pd 8 | import kxy 9 | from kxy.api import upload_data 10 | 11 | df = pd.DataFrame(np.random.randn(20000, 50)) 12 | upload_data(df) 13 | df.kxy.describe() 14 | 15 | 16 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | from kxy_datasets.regressions import Abalone 2 | from kxy_datasets.classifications import BankNote, BankMarketing 3 | from kxy.learning import get_xgboost_learner, get_tensorflow_dense_learner, get_pytorch_dense_learner, \ 4 | get_lightgbm_learner_sklearn_api, get_lightgbm_learner_learning_api, get_sklearn_learner 5 | 6 | 7 | 8 | def test_boruta(): 9 | # Regression 10 | sklearn_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') 11 | dataset = Abalone() 12 | target_column = dataset.y_column 13 | df = dataset.df 14 | 15 | # Features generation 16 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 17 | 18 | # Model building 19 | results = features_df.kxy.fit(target_column, sklearn_regressor_cls, \ 20 | problem_type='regression', feature_selection_method='boruta', boruta_n_evaluations=100) 21 | assert results['Selected Variables'] == ['Shucked weight', 'Shell weight', 'Sex_I', \ 22 | 'Shucked weight.ABS(* - Q25(*))', 'Whole weight'] 23 | 24 | 25 | def test_rfe(): 26 | # Regression 27 | sklearn_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') 28 | dataset = Abalone() 29 | target_column = dataset.y_column 30 | df = dataset.df 31 | 32 | # Features generation 33 | features_df = 
df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 34 | 35 | # Model building 36 | results = features_df.kxy.fit(target_column, sklearn_regressor_cls, \ 37 | problem_type='regression', feature_selection_method='rfe', rfe_n_features=10) 38 | assert results['Selected Variables'] == ['Shell weight', 'Sex_I', 'Shucked weight.ABS(* - Q25(*))', \ 39 | 'Whole weight.ABS(* - Q25(*))', 'Shucked weight.ABS(* - MEDIAN(*))', 'Shucked weight', \ 40 | 'Shucked weight.ABS(* - Q75(*))', 'Shucked weight.ABS(* - MEAN(*))', 'Diameter.ABS(* - Q25(*))', \ 41 | 'Diameter.ABS(* - Q75(*))'] -------------------------------------------------------------------------------- /tests/test_pca.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import kxy 4 | from kxy.learning import get_sklearn_learner, get_lightgbm_learner_learning_api, get_xgboost_learner 5 | from kxy.pfs import PCAPredictor, PCA 6 | from kxy_datasets.regressions import Abalone 7 | from kxy_datasets.classifications import BankNote, BankMarketing 8 | 9 | 10 | def test_shape(): 11 | dataset = Abalone() 12 | target_column = dataset.y_column 13 | df = dataset.df 14 | 15 | # Features generation 16 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 17 | y = features_df[target_column].values 18 | x_columns = [_ for _ in features_df.columns if _ != target_column] 19 | x = features_df[x_columns].values 20 | 21 | # Principal features construction 22 | feature_directions = PCA().fit(x, y) 23 | assert feature_directions.shape[1] == x.shape[1] 24 | 25 | predictor = PCAPredictor() 26 | learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0) 27 | results = predictor.fit(features_df, target_column, learner_func) 28 | feature_directions = results['Feature Directions'] 29 | assert feature_directions.shape[1] == x.shape[1] 30 | 31 | 32 | def test_orthonormality(): 33 | dataset = Abalone() 34 | target_column = dataset.y_column 35 | df = dataset.df 36 | 37 | # Features generation 38 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 39 | y = features_df[target_column].values 40 | x_columns = [_ for _ in features_df.columns if _ != target_column] 41 | x = features_df[x_columns].values 42 | 43 | # Principal features construction 44 | feature_directions = PCA().fit(x, y) 45 | n_directions = feature_directions.shape[0] 46 | for i in range(n_directions): 47 | assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.) 48 | for j in range(n_directions): 49 | if j != i: 50 | assert np.abs(np.dot(feature_directions[i, :], feature_directions[j, :])) < 1e-7 51 | 52 | predictor = PCAPredictor() 53 | learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0) 54 | results = predictor.fit(features_df, target_column, learner_func) 55 | feature_directions = results['Feature Directions'] 56 | n_directions = feature_directions.shape[0] 57 | for i in range(n_directions): 58 | assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.) 
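# Why the orthonormality assertions above hold: PCA.fit diagonalizes the
# symmetric covariance matrix with np.linalg.svd(cov_x), and the `u` returned
# by the SVD of a symmetric positive semi-definite matrix has orthonormal
# columns (they are eigenvectors of cov_x). The rows of the returned
# `u[:, :p].T` therefore satisfy w_i . w_j = 1 if i == j and 0 otherwise,
# up to floating-point error.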
59 | 60 | 61 | 62 | 63 | 64 | def test_pca_feature_selection(): 65 | # Regression 66 | xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') 67 | dataset = Abalone() 68 | target_column = dataset.y_column 69 | df = dataset.df 70 | 71 | # Features generation 72 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 73 | 74 | # Model building 75 | results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \ 76 | problem_type='regression', feature_selection_method='pfs') 77 | assert results['Feature Directions'].shape[1] == features_df.shape[1]-1 78 | predictor = results['predictor'] 79 | predictions = predictor.predict(features_df) 80 | assert len(predictions.columns) == 1 81 | assert target_column in predictions.columns 82 | assert set(features_df.index).difference(set(predictions.index)) == set() 83 | assert set(predictions.index).difference(set(features_df.index)) == set() 84 | 85 | 86 | def test_save_pca(): 87 | # Regression 88 | xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') 89 | dataset = Abalone() 90 | target_column = dataset.y_column 91 | df = dataset.df 92 | 93 | # Features generation 94 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 95 | 96 | # Model building 97 | path = 'Abalone' 98 | results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \ 99 | problem_type='regression', feature_selection_method='pca', \ 100 | path=path) 101 | loaded_predictor = PCAPredictor().load(path, xgboost_regressor_cls) 102 | feature_directions = loaded_predictor.feature_directions 103 | assert feature_directions.shape[1] == features_df.shape[1]-1 104 | predictions = loaded_predictor.predict(features_df) 105 | assert len(predictions.columns) == 1 106 | assert target_column in predictions.columns 107 | assert set(features_df.index).difference(set(predictions.index)) == set() 108 | assert set(predictions.index).difference(set(features_df.index)) == set() 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /tests/test_pfs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import kxy 4 | from kxy.learning import get_sklearn_learner, get_lightgbm_learner_learning_api, get_xgboost_learner 5 | from kxy.pfs import PFSPredictor, PFS, PCA 6 | from kxy_datasets.regressions import Abalone 7 | from kxy_datasets.classifications import BankNote, BankMarketing 8 | 9 | 10 | def test_shape(): 11 | dataset = Abalone() 12 | target_column = dataset.y_column 13 | df = dataset.df 14 | 15 | # Features generation 16 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 17 | y = features_df[target_column].values 18 | x_columns = [_ for _ in features_df.columns if _ != target_column] 19 | x = features_df[x_columns].values 20 | 21 | # Principal features construction 22 | feature_directions = PFS().fit(x, y) 23 | assert feature_directions.shape[1] == x.shape[1] 24 | 25 | predictor = PFSPredictor() 26 | learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0) 27 | results = predictor.fit(features_df, target_column, learner_func) 28 | feature_directions = results['Feature Directions'] 29 | assert feature_directions.shape[1] == x.shape[1] 30 | 31 | 32 | def test_norm(): 33 | dataset = Abalone() 34 | target_column = dataset.y_column 35 | df = dataset.df 36 | 37 
| # Features generation 38 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 39 | y = features_df[target_column].values 40 | x_columns = [_ for _ in features_df.columns if _ != target_column] 41 | x = features_df[x_columns].values 42 | 43 | # Principal features construction 44 | feature_directions = PFS().fit(x, y) 45 | n_directions = feature_directions.shape[0] 46 | for i in range(n_directions): 47 | assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.) 48 | 49 | predictor = PFSPredictor() 50 | learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0) 51 | results = predictor.fit(features_df, target_column, learner_func) 52 | feature_directions = results['Feature Directions'] 53 | n_directions = feature_directions.shape[0] 54 | for i in range(n_directions): 55 | assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.) 56 | 57 | 58 | def test_pfs_feature_selection(): 59 | # Regression 60 | xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') 61 | dataset = Abalone() 62 | target_column = dataset.y_column 63 | df = dataset.df 64 | 65 | # Features generation 66 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 67 | 68 | # Model building 69 | results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \ 70 | problem_type='regression', feature_selection_method='pfs') 71 | assert results['Feature Directions'].shape[1] == features_df.shape[1]-1 72 | predictor = results['predictor'] 73 | predictions = predictor.predict(features_df) 74 | assert len(predictions.columns) == 1 75 | assert target_column in predictions.columns 76 | assert set(features_df.index).difference(set(predictions.index)) == set() 77 | assert set(predictions.index).difference(set(features_df.index)) == set() 78 | 79 | 80 | def test_save_pfs(): 81 | # Regression 82 | xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') 83 | dataset = Abalone() 84 | target_column = dataset.y_column 85 | df = dataset.df 86 | 87 | # Features generation 88 | features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) 89 | 90 | # Model building 91 | path = 'Abalone' 92 | results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \ 93 | problem_type='regression', feature_selection_method='pfs', \ 94 | path=path) 95 | loaded_predictor = PFSPredictor().load(path, xgboost_regressor_cls) 96 | feature_directions = loaded_predictor.feature_directions 97 | assert feature_directions.shape[1] == features_df.shape[1]-1 98 | predictions = loaded_predictor.predict(features_df) 99 | assert len(predictions.columns) == 1 100 | assert target_column in predictions.columns 101 | assert set(features_df.index).difference(set(predictions.index)) == set() 102 | assert set(predictions.index).difference(set(features_df.index)) == set() 103 | 104 | 105 | def test_pfs_accuracy(): 106 | # Generate the data 107 | seed = 1 108 | np.random.seed(seed) 109 | d = 100 110 | w = np.ones(d)/d 111 | x = np.random.randn(10000, d) 112 | xTw = np.dot(x, w) 113 | y = xTw + 2.*xTw**2 + 0.5*xTw**3 114 | 115 | # Run PFS 116 | from kxy.misc.tf import set_default_parameter 117 | set_default_parameter('lr', 0.001) 118 | selector = PFS() 119 | selector.fit(x, y, epochs=21, seed=seed, expand_y=True) 120 | 121 | # Learned principal directions 122 | F = selector.feature_directions 123 | 124 | # Learned 
principal features 125 | z = np.dot(x, F.T) 126 | 127 | # Accuracy 128 | true_f_1 = w/np.linalg.norm(w) 129 | learned_f_1 = F[0, :] 130 | e = np.linalg.norm(true_f_1-learned_f_1) 131 | 132 | assert e <= 0.10 133 | assert selector.mutual_information > 1.0 134 | 135 | 136 | def test_feature_extraction(): 137 | # Generate the data 138 | seed = 1 139 | np.random.seed(seed) 140 | d = 100 141 | w = np.ones(d)/d 142 | x_train = np.random.randn(10000, d) 143 | x_trainTw = np.dot(x_train, w) 144 | y_train = x_trainTw + 2.*x_trainTw**2 + 0.5*x_trainTw**3 145 | 146 | # Run PFS 147 | from kxy.misc.tf import set_default_parameter 148 | set_default_parameter('lr', 0.001) 149 | selector = PFS() 150 | selector.fit(x_train, y_train, epochs=21, seed=seed, expand_y=False) 151 | 152 | # Extract the features 153 | fx_train = selector.max_ent_features_x(x_train) 154 | assert fx_train.shape[0] == x_train.shape[0] 155 | 156 | # Run a linear regression relating learned features to y 157 | from sklearn.linear_model import LinearRegression 158 | from sklearn.metrics import r2_score 159 | 160 | # Training 161 | m = LinearRegression() 162 | m.fit(fx_train, y_train) 163 | 164 | # Testing accuracy 165 | x_test = np.random.randn(10000, d) 166 | x_testTw = np.dot(x_test, w) 167 | y_test = x_testTw + 2.*x_testTw**2 + 0.5*x_testTw**3 168 | 169 | fx_test = selector.max_ent_features_x(x_test) 170 | assert fx_test.shape[0] == x_test.shape[0] 171 | 172 | y_test_predicted = m.predict(fx_test) 173 | testing_r2 = r2_score(y_test_predicted, y_test) 174 | 175 | y_train_predicted = m.predict(fx_train) 176 | training_r2 = r2_score(y_train_predicted, y_train) 177 | 178 | assert training_r2>0.99, 'Learned features should be good for linear regression in-sample' 179 | assert testing_r2>0.99, 'Learned features should be good for linear regression out-of-sample' 180 | 181 | 182 | --------------------------------------------------------------------------------
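# Footnote on the synthetic design shared by test_pfs_accuracy and
# test_feature_extraction above: y depends on x only through the single index
# s = x^T w, so for isotropic Gaussian x any direction orthogonal to w is
# independent of y, and I(y; x^T v) is maximized at v = w/||w||. A minimal
# reproduction of that construction (illustrative, smaller sample size):
#
#     import numpy as np
#     d = 100
#     w = np.ones(d)/d
#     x = np.random.randn(1000, d)
#     s = np.dot(x, w)
#     y = s + 2.*s**2 + 0.5*s**3
#
# This identifiability is what lets the tests bound the L2 error between
# w/||w|| and the first learned feature direction.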