├── .dockerignore ├── .gitignore ├── .pylintrc ├── .travis.yml ├── AUTHORS.rst ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile.pybase ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── RELEASE_NOTES.md ├── docs ├── Makefile ├── TODO ├── _build │ ├── doctrees │ │ ├── api │ │ │ ├── cross_sectional_imputation.doctree │ │ │ ├── dataset.doctree │ │ │ ├── deletion.doctree │ │ │ ├── index.doctree │ │ │ ├── time_series_imputation.doctree │ │ │ └── util.doctree │ │ ├── contributing │ │ │ ├── current_goals.doctree │ │ │ ├── index.doctree │ │ │ └── philosophy.doctree │ │ ├── environment.pickle │ │ ├── index.doctree │ │ ├── references │ │ │ └── index.doctree │ │ └── user_guide │ │ │ ├── diagnostics.doctree │ │ │ ├── getting_started.doctree │ │ │ ├── overview.doctree │ │ │ ├── rules_of_thumb.doctree │ │ │ └── tutorial.doctree │ └── html │ │ ├── .buildinfo │ │ ├── .nojekyll │ │ ├── _modules │ │ ├── impyute │ │ │ ├── dataset │ │ │ │ └── base.html │ │ │ ├── deletion │ │ │ │ └── complete_case.html │ │ │ ├── imputation │ │ │ │ ├── cs │ │ │ │ │ ├── central_tendency.html │ │ │ │ │ ├── em.html │ │ │ │ │ ├── fast_knn.html │ │ │ │ │ ├── mice.html │ │ │ │ │ └── random.html │ │ │ │ └── ts │ │ │ │ │ ├── locf.html │ │ │ │ │ └── moving_window.html │ │ │ └── util │ │ │ │ ├── checks.html │ │ │ │ ├── compare.html │ │ │ │ ├── count_missing.html │ │ │ │ ├── describe.html │ │ │ │ ├── errors.html │ │ │ │ ├── find_null.html │ │ │ │ └── preprocess.html │ │ └── index.html │ │ ├── _sources │ │ ├── api │ │ │ ├── cross_sectional_imputation.rst.txt │ │ │ ├── dataset.rst.txt │ │ │ ├── deletion.rst.txt │ │ │ ├── index.rst.txt │ │ │ ├── time_series_imputation.rst.txt │ │ │ └── util.rst.txt │ │ ├── contributing │ │ │ ├── current_goals.rst.txt │ │ │ ├── index.rst.txt │ │ │ └── philosophy.rst.txt │ │ ├── index.rst.txt │ │ ├── references │ │ │ └── index.rst.txt │ │ └── user_guide │ │ │ ├── diagnostics.rst.txt │ │ │ ├── getting_started.rst.txt │ │ │ ├── overview.rst.txt │ │ │ ├── rules_of_thumb.rst.txt │ │ │ └── tutorial.rst.txt │ │ ├── _static │ │ ├── ajax-loader.gif │ │ ├── alabaster.css │ │ ├── basic.css │ │ ├── comment-bright.png │ │ ├── comment-close.png │ │ ├── comment.png │ │ ├── custom.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── down-pressed.png │ │ ├── down.png │ │ ├── file.png │ │ ├── jquery-3.2.1.js │ │ ├── jquery.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── underscore-1.3.1.js │ │ ├── underscore.js │ │ ├── up-pressed.png │ │ ├── up.png │ │ └── websupport.js │ │ ├── api │ │ ├── cross_sectional_imputation.html │ │ ├── dataset.html │ │ ├── deletion.html │ │ ├── index.html │ │ ├── time_series_imputation.html │ │ └── util.html │ │ ├── contributing │ │ ├── current_goals.html │ │ ├── index.html │ │ └── philosophy.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── objects.inv │ │ ├── py-modindex.html │ │ ├── references │ │ └── index.html │ │ ├── search.html │ │ ├── searchindex.js │ │ └── user_guide │ │ ├── diagnostics.html │ │ ├── getting_started.html │ │ ├── overview.html │ │ ├── rules_of_thumb.html │ │ └── tutorial.html ├── api │ ├── cross_sectional_imputation.rst │ ├── dataset.rst │ ├── deletion.rst │ ├── index.rst │ ├── time_series_imputation.rst │ └── util.rst ├── conf.py ├── contributing │ ├── current_goals.rst │ ├── index.rst │ └── philosophy.rst ├── index.rst ├── package-lock.json ├── references │ └── index.rst └── user_guide │ ├── diagnostics.rst │ ├── getting_started.rst │ ├── overview.rst │ ├── rules_of_thumb.rst │ └── tutorial.rst ├── impyute ├── __init__.py ├── contrib │ ├── __init__.py │ ├── compare.py │ ├── count_missing.py │ └── describe.py ├── dataset │ ├── __init__.py │ ├── base.py │ └── corrupt.py ├── deletion │ ├── __init__.py │ └── complete_case.py ├── imputation │ ├── __init__.py │ ├── cs │ │ ├── __init__.py │ │ ├── buck_iterative.py │ │ ├── central_tendency.py │ │ ├── em.py │ │ ├── fast_knn.py │ │ └── random.py │ └── ts │ │ ├── __init__.py │ │ ├── locf.py │ │ └── moving_window.py └── ops │ ├── __init__.py │ ├── error.py │ ├── inverse_distance_weighting.py │ ├── matrix.py │ ├── testing.py │ ├── util.py │ └── wrapper.py ├── pytest.ini ├── requirements ├── .travis.yaml ├── common.txt └── dev.txt ├── setup.cfg ├── setup.py └── test ├── __init__.py ├── conftest.py ├── contrib ├── __init__.py └── test_compare.py ├── dataset ├── __init__.py ├── test_mnist.py └── test_randc.py ├── deletion ├── __init__.py └── test_complete_case.py ├── imputation ├── __init__.py ├── cs │ ├── __init__.py │ ├── test_buck_iterative.py │ ├── test_central_tendency.py │ ├── test_em.py │ ├── test_fast_knn.py │ └── test_random.py └── ts │ ├── __init__.py │ ├── test_locf.py │ └── test_moving_window.py └── ops ├── __init__.py ├── test_matrix.py ├── test_util.py └── test_wrapper.py /.dockerignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | */*.pyc 3 | */*/*.pyc 4 | */*/*/*.pyc 5 | */*/*/*/*.pyc 6 | */*/*/*/*/*.pyc 7 | */*/*/*/*/*/*.pyc 8 | __pycache__ 9 | */__pycache__ 10 | */*/__pycache__ 11 | */*/*/__pycache__ 12 | */*/*/*/__pycache__ 13 | */*/*/*/*/__pycache__ 14 | */*/*/*/*/*/__pycache__ 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | NOTES 2 | docs/node_modules/ 3 | results.txt 4 | .pytest_cache/ 5 | *.pyc 6 | __pycache__/ 7 | **/.DS_Store 8 | *.swp 9 | .cache/ 10 | .eggs/ 11 | build/ 12 | dist/ 13 | impyute.egg-info/ 14 | *.swo 15 | syntastic 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | language: python 4 | 5 | services: 6 | - docker 7 | 8 | before_install: 9 | - docker pull eltonlaw/pybase 10 | 11 | script: 12 | make test 13 | 14 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Elton Law 2 | Pavan Teja Dokku 3 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## CHANGELOG 2 | 3 | ### v0.0.7 4 | 5 | - `fast_knn`: Add parameters that can be passed to `scipy.spatial.KDTree` (`leafsize`) and `scipy.spatial.KDTree.query` (`eps`, `p`, `distance_upper_bound`). Add example usage for `fast_knn`. [(PR#38)](https://github.com/eltonlaw/impyute/pull/38) 6 | - Support for Pandas DataFrame objects [(#PR36)](https://github.com/eltonlaw/impyute/pull/36) 7 | - Support for python3.7 [(PR#34)](https://github.com/eltonlaw/impyute/pull/34) 8 | - New time series imputation - Moving window imputation: `impyute.moving_window` [(PR#28)](https://github.com/eltonlaw/impyute/pull/28) 9 | - Renamed some files/functions [(PR#23)](https://github.com/eltonlaw/impyute/pull/23) 10 | * `random_uniform` -> `randu` 11 | * `random_normal` -> `randn` 12 | * `impyute.deletions` -> `impyute.deletion` 13 | * `impyute.datasets` -> `impyute.dataset` 14 | * `impyute.imputations` -> `impyute.imputation` 15 | * `impyute.utils` -> `impyute.util` 16 | - All imputations used to run on a pointer to the original array, changing the original. Changed behaviour run on a copy with the option of running on the original (`inplace=True`). Implementation of this is still buggy, because `inplace=True` only works if what's getting passed in truly is a pointer. [(PR#22)](https://github.com/eltonlaw/impyute/pull/22) 17 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. 4 | 5 | We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 6 | 7 | Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct. 8 | 9 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team. 10 | 11 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers. 12 | 13 | This Code of Conduct is adapted from the Contributor Covenant, version 1.0.0, available from http://contributor-covenant.org/version/1/0/0/ 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Hey thanks for deciding to contribute! 4 | 5 | The following is a set of guidelines for contributing to the imputations library, impyute, which is hosted [here](https://github.com/eltonlaw/impyute) 6 | 7 | 1. Check for open [issues](https://github.com/eltonlaw/impyute/issues) or create a new one to discuss new features or bugs. 8 | 2. Fork the repo on [github](https://github.com/eltonlaw/impyute) and make the changes. Make sure you follow the guidelines below. 9 | 3. Write a unit test to show that the bug was fixed/feature works 10 | 4. Submit your pull request and reference the issue (and add yourself to [AUTHORS](https://github.com/eltonlaw/impyute/blob/master/AUTHORS.rst)!) 11 | 12 | ### Development 13 | 14 | To run unit tests, just run `pytest` from root. To test all other supported Python versions you will need [Docker](https://docs.docker.com/install/). The unit testing framework used is the built-in one, [`unittest`](https://docs.python.org/3.6/library/unittest.html). Put unit tests in the `test` directory in root. The testing environment works like this: 1) Build a docker image with multiple python versions 2) Run the container with pytest for each python version. 15 | 16 | $ make test 17 | 18 | Using [Sphinx's autodoc](http://www.sphinx-doc.org/en/stable/ext/autodoc.html) module, docstrings are used as the documentation. Make sure that all docstrings are formatted according to the [NumPy/SciPy Docstring Standard](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt#docstring-standard) 19 | 20 | Use [.pylintrc](https://github.com/eltonlaw/impyute/blob/master/.pylintrc) to lint files in accordance (mostly) with [PEP8](https://www.python.org/dev/peps/pep-0008/). You will first need [pylint installed](https://www.pylint.org/#install). I recommend [integrating it with your editor](https://docs.pylint.org/en/1.6.0/ide-integration.html) or you can call it from bash with: 21 | 22 | $ pylint --rcfile=.pylintrc impyute/ 23 | $ pylint --rcfile=.pylintrc test/ 24 | 25 | Fix all warnings raised, if you feel that the warning isn't justified/serves no purpose, then feel free to [disable the specific message](http://pylint.pycqa.org/en/latest/user_guide/message-control.html) for whatever blocks are causing it. 26 | 27 | ### Suggesting Enhancements 28 | 29 | This project was created to cover everything required in the step of your pipeline where you move from data with missing values to data without missing values. Simple enough right? Any enhancements that brings value with that in mind are welcome. 30 | 31 | ### Code of Conduct 32 | 33 | This project adheres to the Contributor Covenant [code of conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. 34 | 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM eltonlaw/pybase 2 | 3 | COPY ./requirements /impyute/requirements 4 | RUN pip2.7 install -r /impyute/requirements/dev.txt && \ 5 | pip3.5 install -r /impyute/requirements/dev.txt && \ 6 | pip3.6 install -r /impyute/requirements/dev.txt && \ 7 | pip3.7 install -r /impyute/requirements/dev.txt 8 | 9 | COPY ./setup.py ./setup.cfg ./README.rst /impyute/ 10 | COPY ./docs /impyute/docs 11 | COPY ./test/ /impyute/test 12 | COPY ./impyute /impyute/impyute 13 | WORKDIR /impyute 14 | RUN pip2.7 install -e . && \ 15 | pip3.5 install -e . && \ 16 | pip3.6 install -e . && \ 17 | pip3.7 install -e . 18 | 19 | CMD ["python3.6", "-m", "pytest"] 20 | -------------------------------------------------------------------------------- /Dockerfile.pybase: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && \ 4 | apt-get -y install software-properties-common \ 5 | python-software-properties && \ 6 | add-apt-repository -y ppa:deadsnakes/ppa && apt-get update && \ 7 | apt-get autoclean 8 | 9 | RUN apt-get -y install \ 10 | python2.7 python2.7-dev \ 11 | python3.5 python3.5-dev \ 12 | python3.6 python3.6-dev \ 13 | python3.7 python3.7-dev && \ 14 | apt-get autoclean 15 | 16 | RUN apt-get install wget && \ 17 | wget https://bootstrap.pypa.io/get-pip.py && \ 18 | python2.7 get-pip.py && \ 19 | python3.5 get-pip.py && \ 20 | python3.6 get-pip.py && \ 21 | python3.7 get-pip.py && \ 22 | rm get-pip.py && \ 23 | apt-get autoclean 24 | 25 | RUN python2.7 -m pip install --upgrade pip && \ 26 | python3.5 -m pip install --upgrade pip && \ 27 | python3.6 -m pip install --upgrade pip && \ 28 | python3.7 -m pip install --upgrade pip 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Impyute Developers 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the license file 2 | include LICENSE.txt 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | DOCKER_ID_USER=eltonlaw 2 | 3 | .PHONY: all build test upload install docs 4 | 5 | all: test-local 6 | 7 | install-local: 8 | cd $(IMPYUTE_ROOT) && python setup.py develop 9 | 10 | uninstall-local: 11 | cd $(IMPYUTE_ROOT) && python setup.py develop --uninstall 12 | 13 | test-local: 14 | cd $(IMPYUTE_ROOT) && pytest 15 | 16 | clean: 17 | find . -type f -name '*.pyc' -delete 18 | 19 | rebuild-pybase: 20 | docker rmi -f $(DOCKER_ID_USER)/pybase 21 | docker build -f Dockerfile.pybase -t $(DOCKER_ID_USER)/pybase . 22 | 23 | pull-pybase: 24 | docker pull $(DOCKER_ID_USER)/pybase 25 | 26 | build: 27 | if [[ "$(docker images -q eltonlaw/pybase 2> /dev/null)" != "" ]]; then \ 28 | $(MAKE) pull-pybase; \ 29 | fi 30 | docker build -t impyute . 31 | 32 | test: build 33 | docker run impyute python2.7 -m pytest 34 | docker run impyute python3.5 -m pytest 35 | docker run impyute python3.6 -m pytest 36 | docker run impyute python3.7 -m pytest 37 | 38 | # Need to pip install `wheel` and `twine` 39 | upload: build test docs 40 | python3 setup.py bdist_wheel --universal 41 | python3 setup.py sdist 42 | twine upload dist/* 43 | 44 | install: 45 | python3 setup.py install 46 | 47 | docs: 48 | cd docs && $(MAKE) html 49 | 50 | # Remember to call `docker login` first 51 | push-pybase: 52 | docker build -t $(DOCKER_ID_USER)/pybase -f Dockerfile.pybase . 53 | docker push $(DOCKER_ID_USER)/pybase 54 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://travis-ci.org/eltonlaw/impyute.svg?branch=master 2 | :target: https://travis-ci.org/eltonlaw/impyute 3 | 4 | .. image:: https://img.shields.io/pypi/v/impyute.svg 5 | :target: https://pypi.python.org/pypi/impyute 6 | 7 | Impyute 8 | ======== 9 | 10 | Impyute is a library of missing data imputation algorithms. This library was designed to be super lightweight, here's a sneak peak at what impyute can do. 11 | 12 | .. code-block:: python 13 | 14 | >>> n = 5 15 | >>> arr = np.random.uniform(high=6, size=(n, n)) 16 | >>> for _ in range(3): 17 | >>> arr[np.random.randint(n), np.random.randint(n)] = np.nan 18 | >>> print(arr) 19 | array([[0.25288643, 1.8149261 , 4.79943748, 0.54464834, np.nan], 20 | [4.44798362, 0.93518716, 3.24430922, 2.50915032, 5.75956805], 21 | [0.79802036, np.nan, 0.51729349, 5.06533123, 3.70669172], 22 | [1.30848217, 2.08386584, 2.29894541, np.nan, 3.38661392], 23 | [2.70989501, 3.13116687, 0.25851597, 4.24064355, 1.99607231]]) 24 | >>> import impyute as impy 25 | >>> print(impy.mean(arr)) 26 | array([[0.25288643, 1.8149261 , 4.79943748, 0.54464834, 3.7122365], 27 | [4.44798362, 0.93518716, 3.24430922, 2.50915032, 5.75956805], 28 | [0.79802036, 1.99128649, 0.51729349, 5.06533123, 3.70669172], 29 | [1.30848217, 2.08386584, 2.29894541, 3.08994336, 3.38661392], 30 | [2.70989501, 3.13116687, 0.25851597, 4.24064355, 1.99607231]]) 31 | 32 | Feature Support 33 | --------------- 34 | 35 | * Imputation of Cross Sectional Data 36 | * K-Nearest Neighbours 37 | * Multivariate Imputation by Chained Equations 38 | * Expectation Maximization 39 | * Mean Imputation 40 | * Mode Imputation 41 | * Median Imputation 42 | * Random Imputation 43 | * Imputation of Time Series Data 44 | * Last Observation Carried Forward 45 | * Moving Window 46 | * Autoregressive Integrated Moving Average (WIP) 47 | * Diagnostic Tools 48 | * Loggers 49 | * Distribution of Null Values 50 | * Comparison of imputations 51 | * Little's MCAR Test (WIP) 52 | 53 | Versions 54 | -------- 55 | 56 | Currently tested on 2.7, 3.4, 3.5, 3.6 and 3.7 57 | 58 | Installation 59 | ------------ 60 | 61 | To install impyute, run the following: 62 | 63 | .. code-block:: bash 64 | 65 | $ pip install impyute 66 | 67 | Or to get the most current version: 68 | 69 | .. code-block:: bash 70 | 71 | $ git clone https://github.com/eltonlaw/impyute 72 | $ cd impyute 73 | $ python setup.py install 74 | 75 | Documentation 76 | ------------- 77 | 78 | Documentation is available here: http://impyute.readthedocs.io/ 79 | 80 | 81 | How to Contribute 82 | ----------------- 83 | 84 | Check out CONTRIBUTING_ 85 | 86 | .. _CONTRIBUTING: https://github.com/eltonlaw/impyute/blob/master/CONTRIBUTING.md 87 | 88 | -------------------------------------------------------------------------------- /RELEASE_NOTES.md: -------------------------------------------------------------------------------- 1 | ## 0.0.9 2 | 3 | - Fix `fast_knn` incorrect weighting bug. Replaced distance weighting with inverse distance weighting and ability to swap in custom function (arg: 1D list of distances, ret: 1D list of weight percentages). New namespace created `impyute.util.inverse_distance_weighting` for functions that can be modified with custom args using `functool.partial` (check test for more details). 4 | - pybase dockerfile bug fixes 5 | - New `contrib` folder created and some of the utilities from `util` moved there: 6 | * `impyute.util.compare -> `impyute.contrib.compare` 7 | * `impyute.util.count_missing` -> `impyute.contrib.count_missing` 8 | * `impyute.util.describe` -> `impyute.contrib.describe` 9 | - Util namespace breaking changes 10 | * impyute.util.find_null->impyute.ops.matrix.nan_indices 11 | * impyute.util.preprocess->impyute.ops.wrapper.wrappers 12 | * impyute.util.checks->impyute.ops.wrapper.checks 13 | * impyute.util.BadInputError -> impyute.ops.errors.BadInputError 14 | * impyute.util.BadOutputError -> impyute.ops.errors.BadOutputError 15 | -------------------------------------------------------------------------------- /docs/TODO: -------------------------------------------------------------------------------- 1 | Implement 2 | - ARMA 3 | - ARIMA 4 | - Multiple Imputation 5 | - EM with Kalman Filter 6 | 7 | Feature Upgrades 8 | - `compare`: Allow customization of used ML algorithms 9 | 10 | Major Updates 11 | - Imputation of n-dimensional data 12 | - Imputation on specific formats (text, image, audio) 13 | -------------------------------------------------------------------------------- /docs/_build/doctrees/api/cross_sectional_imputation.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/api/cross_sectional_imputation.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/api/dataset.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/api/dataset.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/api/deletion.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/api/deletion.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/api/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/api/index.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/api/time_series_imputation.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/api/time_series_imputation.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/api/util.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/api/util.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/contributing/current_goals.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/contributing/current_goals.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/contributing/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/contributing/index.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/contributing/philosophy.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/contributing/philosophy.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/_build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/references/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/references/index.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/user_guide/diagnostics.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/user_guide/diagnostics.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/user_guide/getting_started.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/user_guide/getting_started.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/user_guide/overview.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/user_guide/overview.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/user_guide/rules_of_thumb.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/user_guide/rules_of_thumb.doctree -------------------------------------------------------------------------------- /docs/_build/doctrees/user_guide/tutorial.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/doctrees/user_guide/tutorial.doctree -------------------------------------------------------------------------------- /docs/_build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 6247af844a39c4867bda6eb8fa482a9b 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /docs/_build/html/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/.nojekyll -------------------------------------------------------------------------------- /docs/_build/html/_modules/impyute/deletion/complete_case.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | impyute.deletion.complete_case — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 32 | 33 |
34 | 35 |

Source code for impyute.deletion.complete_case

 36 | """ impyute.deletion.complete_case """
 37 | import numpy as np
 38 | from impyute.util import checks
 39 | from impyute.util import preprocess
 40 | 
 41 | 
[docs]@preprocess 42 | @checks 43 | def complete_case(data): 44 | """ Return only data rows with all columns 45 | 46 | Parameters 47 | ---------- 48 | data: numpy.ndarray 49 | Data to impute. 50 | 51 | Returns 52 | ------- 53 | numpy.ndarray 54 | Imputed data. 55 | 56 | """ 57 | return data[~np.isnan(data).any(axis=1)]
58 |
59 | 60 |
61 | 62 |
63 |
64 | 127 |
128 |
129 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /docs/_build/html/_modules/impyute/util/errors.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | impyute.util.errors — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 32 | 33 |
34 | 35 |

Source code for impyute.util.errors

 36 | """ impyute.util.errors """
 37 | 
[docs]class BadInputError(Exception): 38 | def __init__(self, value): 39 | self.value = value 40 | 41 | def __str__(self): 42 | return self.value
43 |
44 | 45 |
46 | 47 |
48 |
49 | 112 |
113 |
114 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /docs/_build/html/_modules/impyute/util/find_null.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | impyute.util.find_null — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 |
30 |
31 | 32 | 33 |
34 | 35 |

Source code for impyute.util.find_null

 36 | """ impyute.util.find_null """
 37 | import numpy as np
 38 | 
 39 | 
 40 | 
[docs]def find_null(data): 41 | """ Finds the indices of all missing values. 42 | 43 | Parameters 44 | ---------- 45 | data: numpy.ndarray 46 | Data to impute. 47 | 48 | Returns 49 | ------- 50 | List of tuples 51 | Indices of all missing values in tuple format; (i, j) 52 | 53 | """ 54 | null_xy = np.argwhere(np.isnan(data)) 55 | return null_xy
56 |
57 | 58 |
59 | 60 |
61 |
62 | 125 |
126 |
127 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /docs/_build/html/_modules/index.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Overview: module code — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
29 | 58 | 119 |
120 |
121 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/api/cross_sectional_imputation.rst.txt: -------------------------------------------------------------------------------- 1 | ============================ 2 | Cross Sectional Imputation 3 | ============================ 4 | 5 | .. automodule:: impyute.imputation.cs 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/api/dataset.rst.txt: -------------------------------------------------------------------------------- 1 | ========= 2 | Dataset 3 | ========= 4 | 5 | .. autofunction:: impyute.dataset.mnist 6 | 7 | .. autofunction:: impyute.dataset.randn 8 | 9 | .. autofunction:: impyute.dataset.randu 10 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/api/deletion.rst.txt: -------------------------------------------------------------------------------- 1 | ========== 2 | Deletion 3 | ========== 4 | 5 | .. automodule:: impyute.deletion 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/api/index.rst.txt: -------------------------------------------------------------------------------- 1 | ================================ 2 | API Reference 3 | ================================ 4 | 5 | Documentation is auto-generated from docstrings. 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | Dataset Generation 11 | Deletions 12 | Utils 13 | Cross Sectional Imputation 14 | Time Series Imputation 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/api/time_series_imputation.rst.txt: -------------------------------------------------------------------------------- 1 | ======================== 2 | Time Series Imputation 3 | ======================== 4 | 5 | .. automodule:: impyute.imputation.ts 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/api/util.rst.txt: -------------------------------------------------------------------------------- 1 | ========= 2 | Utility 3 | ========= 4 | 5 | .. automodule:: impyute.util 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/contributing/current_goals.rst.txt: -------------------------------------------------------------------------------- 1 | ================= 2 | Looking Forward 3 | ================= 4 | 5 | ** (Not ordered by importance) ** 6 | 7 | Implementations: 8 | - ARMA 9 | - ARIMA 10 | - Multiple Imputation 11 | - EM with Kalman Filter 12 | 13 | Datasets: 14 | - Load more real world datasets 15 | - Generate MCAR, MAR and MNAR data 16 | 17 | Feature Upgrades: 18 | - `compare`: Allow customization of used ML algorithms 19 | 20 | Major Updates: 21 | - Imputation of n-dimensional data 22 | - Imputation on specific formats (text, image, audio) 23 | 24 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/contributing/index.rst.txt: -------------------------------------------------------------------------------- 1 | ============== 2 | Contributing 3 | ============== 4 | 5 | See `CONTRIBUTING `_ -------------------------------------------------------------------------------- /docs/_build/html/_sources/contributing/philosophy.rst.txt: -------------------------------------------------------------------------------- 1 | ============== 2 | Philosophy 3 | ============== -------------------------------------------------------------------------------- /docs/_build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | Impyute 2 | ======== 3 | 4 | .. image:: https://travis-ci.org/eltonlaw/impyute.svg?branch=master 5 | :target: https://travis-ci.org/eltonlaw/impyute 6 | 7 | .. image:: https://img.shields.io/pypi/v/impyute.svg 8 | :target: https://pypi.python.org/pypi/impyute 9 | 10 | 11 | Impyute is a library of missing data imputation algorithms written in Python 3. This library was designed to be super lightweight, here's a sneak peak at what impyute can do. 12 | 13 | .. code-block:: python 14 | 15 | >>> from impyute.datasets import random_uniform 16 | >>> raw_data = random_uniform(shape=(5, 5), missingness="mcar", th=0.2) 17 | >>> print(raw_data) 18 | [[ 1. 0. 4. 0. 1.] 19 | [ 1. nan 6. 4. nan] 20 | [ 5. 0. nan 1. 3.] 21 | [ 2. 1. 5. 4. 6.] 22 | [ 2. 1. 0. 0. 6.]] 23 | >>> from impyute.imputations.cs import mean_imputation 24 | >>> complete_data = random_imputation(raw_data) 25 | >>> print(complete_data) 26 | [[ 1. 0. 4. 0. 1. ] 27 | [ 1. 0.5 6. 4. 4. ] 28 | [ 5. 0. 3.75 1. 3. ] 29 | [ 2. 1. 5. 4. 6. ] 30 | [ 2. 1. 0. 0. 6. ]] 31 | 32 | Feature Support 33 | --------------- 34 | 35 | * Imputation of Cross Sectional Data 36 | * K-Nearest Neighbours 37 | * Multivariate Imputation by Chained Equations 38 | * Expectation Maximization 39 | * Mean Imputation 40 | * Mode Imputation 41 | * Median Imputation 42 | * Random Imputation 43 | * Imputation of Time Series Data 44 | * Last Observation Carried Forward 45 | * Moving Window 46 | * Autoregressive Integrated Moving Average (WIP) 47 | * Diagnostic Tools 48 | * Loggers 49 | * Distribution of Null Values 50 | * Comparison of imputations 51 | * Little's MCAR Test (WIP) 52 | 53 | Versions 54 | -------- 55 | 56 | Currently tested on 2.7, 3.4, 3.5, 3.6 and 3.7 57 | 58 | Installation 59 | ------------ 60 | 61 | To install impyute, run the following: 62 | 63 | .. code-block:: bash 64 | 65 | $ pip3 install impyute 66 | 67 | Or to get the most latest build: 68 | 69 | .. code-block:: bash 70 | 71 | $ git clone https://github.com/eltonlaw/impyute 72 | $ cd impyute 73 | $ python setup.py install 74 | 75 | Documentation 76 | ------------- 77 | 78 | Documentation is available here: http://impyute.readthedocs.io/ 79 | 80 | 81 | How to Contribute 82 | ----------------- 83 | 84 | Check out CONTRIBUTING_ 85 | 86 | .. _CONTRIBUTING: https://github.com/eltonlaw/impyute/blob/master/CONTRIBUTING.md 87 | 88 | 89 | User Guide 90 | =========== 91 | 92 | .. toctree:: 93 | 94 | Overview 95 | Getting Started 96 | Tutorial 97 | Diagnostics 98 | Rules of Thumb 99 | 100 | 101 | API 102 | === 103 | 104 | .. toctree:: 105 | :maxdepth: 2 106 | 107 | API 108 | GitHub Repo 109 | 110 | Contributing 111 | ============ 112 | 113 | .. toctree:: 114 | 115 | Contributing Guidelines 116 | Philosophy 117 | Current Goals 118 | 119 | References 120 | ========== 121 | .. toctree:: 122 | 123 | Papers Master List 124 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/references/index.rst.txt: -------------------------------------------------------------------------------- 1 | =========== 2 | Citations 3 | =========== 4 | 5 | Schmitt P, Mandel J, Guedj M (2015) A Comparison of Six Methods for Missing Data Imputation. J Biom Biostat 6:224. doi: 10.4172/2155-6180.1000224 6 | 7 | Gelman A, Hill J (2006) Data Analysis Using Regression and Multilevel/Hierarchical Models. 8 | 9 | Azur MJ, Stuart EA, Frangakis C, Leaf PJ. Multiple Imputation by Chained Equations: 10 | What is it and how does it work? International journal of methods in psychiatric 11 | research. 2011;20(1):40-49. doi:10.1002/mpr.329. 12 | 13 | Roderick J. A. Little. (1988). A Test of Missing Completely at Random for Multivariate Data with Missing Values. Journal of the American Statistical Association, 83(404), 1198-1202. doi:10.2307/2290157 14 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/user_guide/diagnostics.rst.txt: -------------------------------------------------------------------------------- 1 | =========== 2 | Diagnostics 3 | =========== 4 | 5 | Little's MCAR Test [1]_ 6 | ======================= 7 | 8 | Take the mean of the data with missing values and take the mean of the data without missing values. If they're the same/simlar, then it's more likely that your data is MCAR. 9 | 10 | .. [1] Roderick J. A. Little. (1988). A Test of Missing Completely at Random for Multivariate Data with Missing Values. Journal of the American Statistical Association, 83(404), 1198-1202. doi:10.2307/2290157 11 | 12 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/user_guide/getting_started.rst.txt: -------------------------------------------------------------------------------- 1 | ================= 2 | Getting Started 3 | ================= 4 | 5 | Installation 6 | ============ 7 | 8 | Install via pip:: 9 | 10 | $ pip3 install impyute 11 | 12 | From source: 13 | 14 | .. code-block:: bash 15 | 16 | $ git clone https://github.com/eltonlaw/impyute 17 | $ cd impyute 18 | $ python setup.py install 19 | 20 | 21 | Dependencies 22 | ------------ 23 | 24 | - NumPy 25 | - SciPy 26 | - scikit-learn 27 | 28 | Versions 29 | -------- 30 | 31 | Currently, this package works with 2.7, 3.4, 3.5, 3.6 and 3.7 32 | 33 | 34 | Troubleshooting 35 | =============== 36 | Not working? Open an issue here: https://github.com/eltonlaw/impyute/issues 37 | -------------------------------------------------------------------------------- /docs/_build/html/_sources/user_guide/overview.rst.txt: -------------------------------------------------------------------------------- 1 | ========== 2 | Overview 3 | ========== 4 | 5 | About 6 | ===== 7 | 8 | impyute is a general purpose, imputations library written in Python. In statistics, imputation is the method of estimating missing values in a data set. There are a lot of different types of imputation, the result of the various types of datasets. On datasets with high percentages of missing values, some methods work better than others and vice versa. Datasets can be cross sectional or time series, linear or non linear, continuous or categorical or boolean. As you can imagine, there are a lot of different specifications that need to be kept in mind. 9 | 10 | Functionality 11 | ============= 12 | 13 | impyute was built for convenience, an all in one stop so that users can impute their dataset with minimal knowledge and get on with their day. With that in mind, the following tools are provided for the user: 14 | 15 | - Imputations (Fill in missing values) 16 | - Deletions (Only use complete data) 17 | - Diagnostics to identify the skew and distribution of missing values 18 | - Comparison function to experiment with how different machine learning algorithms are affected by different imputation algorithms. 19 | - Dataset generation to experiment with different types of missingness and different types of data. 20 | 21 | Formatting your Data 22 | ==================== 23 | 24 | Prior to running, checks are run to ensure the given data is in an acceptable format. Please ensure that your data satisfies the following criterion: 25 | 26 | - `numpy.ndarray `_ with type `numpy.float `_ 27 | - Columns are along the x-axis and individual datapoints are along the y-axis. 28 | - 2D Matrix (3D is also allowed in certain cases, but requires special treatment) 29 | - Missing values can be found with `numpy.isnan `_ -------------------------------------------------------------------------------- /docs/_build/html/_sources/user_guide/rules_of_thumb.rst.txt: -------------------------------------------------------------------------------- 1 | ======================================== 2 | Rules of Thumb 3 | ======================================== 4 | 5 | TBA -------------------------------------------------------------------------------- /docs/_build/html/_sources/user_guide/tutorial.rst.txt: -------------------------------------------------------------------------------- 1 | ========== 2 | Tutorial 3 | ========== 4 | 5 | For the Standard User 6 | --------------------- 7 | 8 | Identify what type of data you have (cross-sectional or time-series) then read about the strengths and weaknesses of each type of approach and pick something suitable. I've compiled a small (and incomplete) list of :doc:`Rules of Thumb ` that you can use to aid your decision making. After you've picked your imputation algorithm, 9 | 10 | For the Researcher 11 | ------------------ -------------------------------------------------------------------------------- /docs/_build/html/_static/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/ajax-loader.gif -------------------------------------------------------------------------------- /docs/_build/html/_static/comment-bright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/comment-bright.png -------------------------------------------------------------------------------- /docs/_build/html/_static/comment-close.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/comment-close.png -------------------------------------------------------------------------------- /docs/_build/html/_static/comment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/comment.png -------------------------------------------------------------------------------- /docs/_build/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally left blank. */ 2 | -------------------------------------------------------------------------------- /docs/_build/html/_static/down-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/down-pressed.png -------------------------------------------------------------------------------- /docs/_build/html/_static/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/down.png -------------------------------------------------------------------------------- /docs/_build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/file.png -------------------------------------------------------------------------------- /docs/_build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/minus.png -------------------------------------------------------------------------------- /docs/_build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/plus.png -------------------------------------------------------------------------------- /docs/_build/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #eeffcc; } 3 | .highlight .c { color: #408090; font-style: italic } /* Comment */ 4 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 5 | .highlight .k { color: #007020; font-weight: bold } /* Keyword */ 6 | .highlight .o { color: #666666 } /* Operator */ 7 | .highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */ 8 | .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ 9 | .highlight .cp { color: #007020 } /* Comment.Preproc */ 10 | .highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */ 11 | .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ 12 | .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ 13 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 14 | .highlight .ge { font-style: italic } /* Generic.Emph */ 15 | .highlight .gr { color: #FF0000 } /* Generic.Error */ 16 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 17 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 18 | .highlight .go { color: #333333 } /* Generic.Output */ 19 | .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ 20 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 21 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 22 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 23 | .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ 24 | .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ 25 | .highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ 26 | .highlight .kp { color: #007020 } /* Keyword.Pseudo */ 27 | .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ 28 | .highlight .kt { color: #902000 } /* Keyword.Type */ 29 | .highlight .m { color: #208050 } /* Literal.Number */ 30 | .highlight .s { color: #4070a0 } /* Literal.String */ 31 | .highlight .na { color: #4070a0 } /* Name.Attribute */ 32 | .highlight .nb { color: #007020 } /* Name.Builtin */ 33 | .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ 34 | .highlight .no { color: #60add5 } /* Name.Constant */ 35 | .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ 36 | .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ 37 | .highlight .ne { color: #007020 } /* Name.Exception */ 38 | .highlight .nf { color: #06287e } /* Name.Function */ 39 | .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ 40 | .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ 41 | .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ 42 | .highlight .nv { color: #bb60d5 } /* Name.Variable */ 43 | .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ 44 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 45 | .highlight .mb { color: #208050 } /* Literal.Number.Bin */ 46 | .highlight .mf { color: #208050 } /* Literal.Number.Float */ 47 | .highlight .mh { color: #208050 } /* Literal.Number.Hex */ 48 | .highlight .mi { color: #208050 } /* Literal.Number.Integer */ 49 | .highlight .mo { color: #208050 } /* Literal.Number.Oct */ 50 | .highlight .sa { color: #4070a0 } /* Literal.String.Affix */ 51 | .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ 52 | .highlight .sc { color: #4070a0 } /* Literal.String.Char */ 53 | .highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */ 54 | .highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ 55 | .highlight .s2 { color: #4070a0 } /* Literal.String.Double */ 56 | .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ 57 | .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ 58 | .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ 59 | .highlight .sx { color: #c65d09 } /* Literal.String.Other */ 60 | .highlight .sr { color: #235388 } /* Literal.String.Regex */ 61 | .highlight .s1 { color: #4070a0 } /* Literal.String.Single */ 62 | .highlight .ss { color: #517918 } /* Literal.String.Symbol */ 63 | .highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ 64 | .highlight .fm { color: #06287e } /* Name.Function.Magic */ 65 | .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ 66 | .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ 67 | .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ 68 | .highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */ 69 | .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/_build/html/_static/up-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/up-pressed.png -------------------------------------------------------------------------------- /docs/_build/html/_static/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/_static/up.png -------------------------------------------------------------------------------- /docs/_build/html/api/deletion.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Deletion — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |

Deletion

39 |

Missing data approaches that delete values.

40 |
41 |
42 | impyute.deletion.complete_case(data)[source]
43 |

Return only data rows with all columns

44 | 45 | 46 | 47 | 48 | 54 | 55 | 61 | 62 | 63 |
Parameters:
49 |
data: numpy.ndarray
50 |

Data to impute.

51 |
52 |
53 |
Returns:
56 |
numpy.ndarray
57 |

Imputed data.

58 |
59 |
60 |
64 |
65 | 66 |
67 | 68 | 69 |
70 | 71 |
72 |
73 | 145 |
146 |
147 | 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /docs/_build/html/api/index.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | API Reference — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |

API Reference

39 |

Documentation is auto-generated from docstrings.

40 | 49 |
50 | 51 | 52 |
53 | 54 |
55 |
56 | 126 |
127 |
128 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /docs/_build/html/contributing/current_goals.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Looking Forward — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |

Looking Forward

39 |

** (Not ordered by importance) **

40 |

Implementations: 41 | - ARMA 42 | - ARIMA 43 | - Multiple Imputation 44 | - EM with Kalman Filter

45 |

Datasets: 46 | - Load more real world datasets 47 | - Generate MCAR, MAR and MNAR data

48 |

Feature Upgrades: 49 | - compare: Allow customization of used ML algorithms

50 |

Major Updates: 51 | - Imputation of n-dimensional data 52 | - Imputation on specific formats (text, image, audio)

53 |
54 | 55 | 56 |
57 | 58 |
59 |
60 | 123 |
124 |
125 | 133 | 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /docs/_build/html/contributing/index.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Contributing — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |

Contributing

39 |

See CONTRIBUTING

40 |
41 | 42 | 43 |
44 | 45 |
46 |
47 | 110 |
111 |
112 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /docs/_build/html/contributing/philosophy.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Philosophy — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |

Philosophy

39 |
40 | 41 | 42 |
43 | 44 |
45 |
46 | 109 |
110 |
111 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /docs/_build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/docs/_build/html/objects.inv -------------------------------------------------------------------------------- /docs/_build/html/py-modindex.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Python Module Index — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
32 |
33 |
34 | 35 | 36 |
37 | 38 | 39 |

Python Module Index

40 | 41 |
42 | i 43 |
44 | 45 | 46 | 47 | 49 | 50 | 52 | 55 | 56 | 57 | 60 | 61 | 62 | 65 | 66 | 67 | 70 | 71 | 72 | 75 |
 
48 | i
53 | impyute 54 |
    58 | impyute.deletion 59 |
    63 | impyute.imputation.cs 64 |
    68 | impyute.imputation.ts 69 |
    73 | impyute.util 74 |
76 | 77 | 78 |
79 | 80 |
81 |
82 | 143 |
144 |
145 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /docs/_build/html/references/index.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Citations — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
30 |
31 |
32 | 33 | 34 |
35 | 36 |
37 |

Citations

38 |

Schmitt P, Mandel J, Guedj M (2015) A Comparison of Six Methods for Missing Data Imputation. J Biom Biostat 6:224. doi: 10.4172/2155-6180.1000224

39 |

Gelman A, Hill J (2006) Data Analysis Using Regression and Multilevel/Hierarchical Models.

40 |

Azur MJ, Stuart EA, Frangakis C, Leaf PJ. Multiple Imputation by Chained Equations: 41 | What is it and how does it work? International journal of methods in psychiatric 42 | research. 2011;20(1):40-49. doi:10.1002/mpr.329.

43 |

Roderick J. A. Little. (1988). A Test of Missing Completely at Random for Multivariate Data with Missing Values. Journal of the American Statistical Association, 83(404), 1198-1202. doi:10.2307/2290157

44 |
45 | 46 | 47 |
48 | 49 |
50 |
51 | 113 |
114 |
115 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /docs/_build/html/search.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Search — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 |
38 |
39 |
40 | 41 | 42 |
43 | 44 |

Search

45 |
46 | 47 |

48 | Please activate JavaScript to enable the search 49 | functionality. 50 |

51 |
52 |

53 | From here you can search these documents. Enter your search 54 | words into the box below and click "search". Note that the search 55 | function will automatically search for all of the words. Pages 56 | containing fewer words won't appear in the result list. 57 |

58 |
59 | 60 | 61 | 62 |
63 | 64 |
65 | 66 |
67 | 68 |
69 | 70 |
71 |
72 | 121 |
122 |
123 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /docs/_build/html/user_guide/diagnostics.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Diagnostics — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |

Diagnostics

39 |
40 |

Little’s MCAR Test [1]

41 |

Take the mean of the data with missing values and take the mean of the data without missing values. If they’re the same/simlar, then it’s more likely that your data is MCAR.

42 | 43 | 44 | 45 | 46 | 47 |
[1]Roderick J. A. Little. (1988). A Test of Missing Completely at Random for Multivariate Data with Missing Values. Journal of the American Statistical Association, 83(404), 1198-1202. doi:10.2307/2290157
48 |
49 |
50 | 51 | 52 |
53 | 54 |
55 |
56 | 122 |
123 |
124 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /docs/_build/html/user_guide/getting_started.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Getting Started — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |

Getting Started

39 |
40 |

Installation

41 |

Install via pip:

42 |
$ pip3 install impyute
 43 | 
44 |
45 |

From source:

46 |
$ git clone https://github.com/eltonlaw/impyute
 47 | $ cd impyute
 48 | $ python setup.py install
 49 | 
50 |
51 |
52 |

Dependencies

53 |
    54 |
  • NumPy
  • 55 |
  • SciPy
  • 56 |
  • scikit-learn
  • 57 |
58 |
59 |
60 |

Versions

61 |

Currently, this package works with 2.7, 3.4, 3.5, 3.6 and 3.7

62 |
63 |
64 |
65 |

Troubleshooting

66 |

Not working? Open an issue here: https://github.com/eltonlaw/impyute/issues

67 |
68 |
69 | 70 | 71 |
72 | 73 |
74 |
75 | 146 |
147 |
148 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /docs/_build/html/user_guide/rules_of_thumb.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Rules of Thumb — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |

Rules of Thumb

39 |

TBA

40 |
41 | 42 | 43 |
44 | 45 |
46 |
47 | 110 |
111 |
112 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /docs/_build/html/user_guide/tutorial.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | Tutorial — impyute 0.0.7 documentation 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
31 |
32 |
33 | 34 | 35 |
36 | 37 |
38 |

Tutorial

39 |
40 |

For the Standard User

41 |

Identify what type of data you have (cross-sectional or time-series) then read about the strengths and weaknesses of each type of approach and pick something suitable. I’ve compiled a small (and incomplete) list of Rules of Thumb that you can use to aid your decision making. After you’ve picked your imputation algorithm,

42 |
43 |
44 |

For the Researcher

45 |
46 |
47 | 48 | 49 |
50 | 51 |
52 |
53 | 120 |
121 |
122 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /docs/api/cross_sectional_imputation.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | Cross Sectional Imputation 3 | ============================ 4 | 5 | .. automodule:: impyute.imputation.cs 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/api/dataset.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Dataset 3 | ========= 4 | 5 | .. autofunction:: impyute.dataset.mnist 6 | 7 | .. autofunction:: impyute.dataset.randn 8 | 9 | .. autofunction:: impyute.dataset.randu 10 | -------------------------------------------------------------------------------- /docs/api/deletion.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Deletion 3 | ========== 4 | 5 | .. automodule:: impyute.deletion 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | API Reference 3 | ================================ 4 | 5 | Documentation is auto-generated from docstrings. 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | Dataset Generation 11 | Deletions 12 | Utils 13 | Cross Sectional Imputation 14 | Time Series Imputation 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/api/time_series_imputation.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Time Series Imputation 3 | ======================== 4 | 5 | .. automodule:: impyute.imputation.ts 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/api/util.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Utility 3 | ========= 4 | 5 | .. automodule:: impyute.util 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/contributing/current_goals.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Looking Forward 3 | ================= 4 | 5 | ** (Not ordered by importance) ** 6 | 7 | Implementations: 8 | - ARMA 9 | - ARIMA 10 | - Multiple Imputation 11 | - EM with Kalman Filter 12 | 13 | Datasets: 14 | - Load more real world datasets 15 | - Generate MCAR, MAR and MNAR data 16 | 17 | Feature Upgrades: 18 | - `compare`: Allow customization of used ML algorithms 19 | 20 | Major Updates: 21 | - Imputation of n-dimensional data 22 | - Imputation on specific formats (text, image, audio) 23 | 24 | -------------------------------------------------------------------------------- /docs/contributing/index.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Contributing 3 | ============== 4 | 5 | See `CONTRIBUTING `_ -------------------------------------------------------------------------------- /docs/contributing/philosophy.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Philosophy 3 | ============== -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Impyute 2 | ======== 3 | 4 | .. image:: https://travis-ci.org/eltonlaw/impyute.svg?branch=master 5 | :target: https://travis-ci.org/eltonlaw/impyute 6 | 7 | .. image:: https://img.shields.io/pypi/v/impyute.svg 8 | :target: https://pypi.python.org/pypi/impyute 9 | 10 | 11 | Impyute is a library of missing data imputation algorithms written in Python 3. This library was designed to be super lightweight, here's a sneak peak at what impyute can do. 12 | 13 | .. code-block:: python 14 | 15 | >>> n = 5 16 | >>> arr = np.random.uniform(high=6, size=(n, n)) 17 | >>> for _ in range(3): 18 | >>> arr[np.random.randint(n), np.random.randint(n)] = np.nan 19 | >>> print(arr) 20 | array([[0.25288643, 1.8149261 , 4.79943748, 0.54464834, np.nan], 21 | [4.44798362, 0.93518716, 3.24430922, 2.50915032, 5.75956805], 22 | [0.79802036, np.nan, 0.51729349, 5.06533123, 3.70669172], 23 | [1.30848217, 2.08386584, 2.29894541, np.nan, 3.38661392], 24 | [2.70989501, 3.13116687, 0.25851597, 4.24064355, 1.99607231]]) 25 | >>> import impyute as impy 26 | >>> print(impy.mean(arr)) 27 | array([[0.25288643, 1.8149261 , 4.79943748, 0.54464834, 3.7122365], 28 | [4.44798362, 0.93518716, 3.24430922, 2.50915032, 5.75956805], 29 | [0.79802036, 1.99128649, 0.51729349, 5.06533123, 3.70669172], 30 | [1.30848217, 2.08386584, 2.29894541, 3.08994336, 3.38661392], 31 | [2.70989501, 3.13116687, 0.25851597, 4.24064355, 1.99607231]]) 32 | 33 | Feature Support 34 | --------------- 35 | 36 | * Imputation of Cross Sectional Data 37 | * K-Nearest Neighbours 38 | * Multivariate Imputation by Chained Equations 39 | * Expectation Maximization 40 | * Mean Imputation 41 | * Mode Imputation 42 | * Median Imputation 43 | * Random Imputation 44 | * Imputation of Time Series Data 45 | * Last Observation Carried Forward 46 | * Moving Window 47 | * Autoregressive Integrated Moving Average (WIP) 48 | * Diagnostic Tools 49 | * Loggers 50 | * Distribution of Null Values 51 | * Comparison of imputations 52 | * Little's MCAR Test (WIP) 53 | 54 | Versions 55 | -------- 56 | 57 | Currently tested on 2.7, 3.4, 3.5, 3.6 and 3.7 58 | 59 | Installation 60 | ------------ 61 | 62 | To install impyute, run the following: 63 | 64 | .. code-block:: bash 65 | 66 | $ pip3 install impyute 67 | 68 | Or to get the most latest build: 69 | 70 | .. code-block:: bash 71 | 72 | $ git clone https://github.com/eltonlaw/impyute 73 | $ cd impyute 74 | $ python setup.py install 75 | 76 | Documentation 77 | ------------- 78 | 79 | Documentation is available here: http://impyute.readthedocs.io/ 80 | 81 | 82 | How to Contribute 83 | ----------------- 84 | 85 | Check out CONTRIBUTING_ 86 | 87 | .. _CONTRIBUTING: https://github.com/eltonlaw/impyute/blob/master/CONTRIBUTING.md 88 | 89 | 90 | User Guide 91 | =========== 92 | 93 | .. toctree:: 94 | 95 | Overview 96 | Getting Started 97 | Tutorial 98 | Diagnostics 99 | Rules of Thumb 100 | 101 | 102 | API 103 | === 104 | 105 | .. toctree:: 106 | :maxdepth: 2 107 | 108 | API 109 | GitHub Repo 110 | 111 | Contributing 112 | ============ 113 | 114 | .. toctree:: 115 | 116 | Contributing Guidelines 117 | Philosophy 118 | Current Goals 119 | 120 | References 121 | ========== 122 | .. toctree:: 123 | 124 | Papers Master List 125 | -------------------------------------------------------------------------------- /docs/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "lockfileVersion": 1 3 | } 4 | -------------------------------------------------------------------------------- /docs/references/index.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Citations 3 | =========== 4 | 5 | Schmitt P, Mandel J, Guedj M (2015) A Comparison of Six Methods for Missing Data Imputation. J Biom Biostat 6:224. doi: 10.4172/2155-6180.1000224 6 | 7 | Gelman A, Hill J (2006) Data Analysis Using Regression and Multilevel/Hierarchical Models. 8 | 9 | Azur MJ, Stuart EA, Frangakis C, Leaf PJ. Multiple Imputation by Chained Equations: 10 | What is it and how does it work? International journal of methods in psychiatric 11 | research. 2011;20(1):40-49. doi:10.1002/mpr.329. 12 | 13 | Roderick J. A. Little. (1988). A Test of Missing Completely at Random for Multivariate Data with Missing Values. Journal of the American Statistical Association, 83(404), 1198-1202. doi:10.2307/2290157 14 | -------------------------------------------------------------------------------- /docs/user_guide/diagnostics.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Diagnostics 3 | =========== 4 | 5 | Little's MCAR Test [1]_ 6 | ======================= 7 | 8 | Take the mean of the data with missing values and take the mean of the data without missing values. If they're the same/simlar, then it's more likely that your data is MCAR. 9 | 10 | .. [1] Roderick J. A. Little. (1988). A Test of Missing Completely at Random for Multivariate Data with Missing Values. Journal of the American Statistical Association, 83(404), 1198-1202. doi:10.2307/2290157 11 | 12 | -------------------------------------------------------------------------------- /docs/user_guide/getting_started.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Getting Started 3 | ================= 4 | 5 | Installation 6 | ============ 7 | 8 | Install via pip:: 9 | 10 | $ pip3 install impyute 11 | 12 | From source: 13 | 14 | .. code-block:: bash 15 | 16 | $ git clone https://github.com/eltonlaw/impyute 17 | $ cd impyute 18 | $ python setup.py install 19 | 20 | 21 | Dependencies 22 | ------------ 23 | 24 | - NumPy 25 | - SciPy 26 | - scikit-learn 27 | 28 | Versions 29 | -------- 30 | 31 | Currently, this package works with 2.7, 3.4, 3.5, 3.6 and 3.7 32 | 33 | 34 | Troubleshooting 35 | =============== 36 | Not working? Open an issue here: https://github.com/eltonlaw/impyute/issues 37 | -------------------------------------------------------------------------------- /docs/user_guide/overview.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Overview 3 | ========== 4 | 5 | About 6 | ===== 7 | 8 | impyute is a general purpose, imputations library written in Python. In statistics, imputation is the method of estimating missing values in a data set. There are a lot of different types of imputation, the result of the various types of datasets. On datasets with high percentages of missing values, some methods work better than others and vice versa. Datasets can be cross sectional or time series, linear or non linear, continuous or categorical or boolean. As you can imagine, there are a lot of different specifications that need to be kept in mind. 9 | 10 | Functionality 11 | ============= 12 | 13 | impyute was built for convenience, an all in one stop so that users can impute their dataset with minimal knowledge and get on with their day. With that in mind, the following tools are provided for the user: 14 | 15 | - Imputations (Fill in missing values) 16 | - Deletions (Only use complete data) 17 | - Diagnostics to identify the skew and distribution of missing values 18 | - Comparison function to experiment with how different machine learning algorithms are affected by different imputation algorithms. 19 | - Dataset generation to experiment with different types of missingness and different types of data. 20 | 21 | Formatting your Data 22 | ==================== 23 | 24 | Prior to running, checks are run to ensure the given data is in an acceptable format. Please ensure that your data satisfies the following criterion: 25 | 26 | - `numpy.ndarray `_ with type `numpy.float `_ 27 | - Columns are along the x-axis and individual datapoints are along the y-axis. 28 | - 2D Matrix (3D is also allowed in certain cases, but requires special treatment) 29 | - Missing values can be found with `numpy.isnan `_ -------------------------------------------------------------------------------- /docs/user_guide/rules_of_thumb.rst: -------------------------------------------------------------------------------- 1 | ======================================== 2 | Rules of Thumb 3 | ======================================== 4 | 5 | TBA -------------------------------------------------------------------------------- /docs/user_guide/tutorial.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Tutorial 3 | ========== 4 | 5 | For the Standard User 6 | --------------------- 7 | 8 | Identify what type of data you have (cross-sectional or time-series) then read about the strengths and weaknesses of each type of approach and pick something suitable. I've compiled a small (and incomplete) list of :doc:`Rules of Thumb ` that you can use to aid your decision making. After you've picked your imputation algorithm, 9 | 10 | For the Researcher 11 | ------------------ -------------------------------------------------------------------------------- /impyute/__init__.py: -------------------------------------------------------------------------------- 1 | """ impyute: Data imputations library to preprocess datasets with missing dat 2 | 3 | impyute.imputations.cs: Imputations on cross sectional data 4 | impyute.imputations.ts: Imputations on time series data 5 | impyute.deletion: Deletion type missing data handling 6 | impyute.contrib: Volatile and experimental code 7 | """ 8 | # pylint: disable=wrong-import-position 9 | 10 | __title__ = 'impyute' 11 | __version__ = '0.0.8' 12 | __build__ = 0x021300 13 | __author__ = 'Elton Law' 14 | __license__ = 'MIT' 15 | __copyright__ = 'Copyright 2019 Elton law' 16 | 17 | 18 | ### Top Level Modules 19 | 20 | from impyute import dataset 21 | from impyute import deletion 22 | from impyute import ops 23 | from impyute import contrib 24 | 25 | __all__ = ["contrib", "dataset", "deletion", "ops"] 26 | 27 | ### Cross Sectional Imputations 28 | 29 | from impyute.imputation.cs import mean 30 | from impyute.imputation.cs import median 31 | from impyute.imputation.cs import mode 32 | from impyute.imputation.cs import em 33 | from impyute.imputation.cs import fast_knn 34 | from impyute.imputation.cs import buck_iterative 35 | from impyute.imputation.cs import random 36 | 37 | __all__.extend([ 38 | "mean", 39 | "median", 40 | "mode", 41 | "em", 42 | "fast_knn", 43 | "buck_iterative", 44 | "random" 45 | ]) 46 | 47 | ### Time Series Imputations 48 | 49 | from impyute.imputation.ts import locf 50 | from impyute.imputation.ts import moving_window 51 | 52 | __all__.extend([ 53 | "locf", 54 | "moving_window" 55 | ]) 56 | 57 | ### Deletions 58 | from impyute.deletion import complete_case 59 | 60 | __all__.extend([ 61 | "complete_case" 62 | ]) 63 | -------------------------------------------------------------------------------- /impyute/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | """ Volatile code. Expect stuff in this to change. """ 2 | 3 | from .describe import describe 4 | from .count_missing import count_missing 5 | from .compare import compare 6 | 7 | __all__ = ["describe", "count_missing", "compare"] 8 | -------------------------------------------------------------------------------- /impyute/contrib/compare.py: -------------------------------------------------------------------------------- 1 | """impyute.contrib.compare.py""" 2 | import importlib 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score 5 | # pylint: disable=too-many-locals, dangerous-default-value 6 | 7 | def compare(imputed, classifiers=["sklearn.svm.SVC"], log_path=None): 8 | """ 9 | Given an imputed dataset with labels and a list of supervised machine 10 | learning model, find accuracy score of all model/imputation pairs. 11 | 12 | Parameters 13 | ---------- 14 | imputed: [(str, np.ndarray), (str, np.ndarray)...] 15 | List of tuples containing (imputation_name, imputed_data) where 16 | `imputation_name` is a string and `imputed_data` is a tuple where 17 | `imputed_data`[0] is the data, X and `imputed_data`[1] is the label, y 18 | classifiers: [str, str...str] (optional) 19 | Provide a list of classifiers to run imputed data sets on. Right now, 20 | it ONLY works with sklearn, the format should be like so: 21 | `sklearn.SUBMODULE.FUNCTION`. More generally its 22 | 'MODULE.SUBMODULE.FUNCTION'. If providing a custom classifier, make 23 | sure to add the file location to sys.path first and the classifier 24 | should also be structured like sklearn (with a `fit` and `predict` 25 | method). 26 | log_path: str (optional) 27 | To write results to a file, provide a relative path 28 | 29 | Returns 30 | ------- 31 | results.txt 32 | Classification results on imputed data 33 | 34 | """ 35 | clfs = [] 36 | for clf_name in classifiers: 37 | mod_name, smod_name, fn_name = clf_name.split(".") 38 | try: 39 | mod = importlib.import_module("{}.{}".format(mod_name, smod_name)) 40 | fn = getattr(mod, fn_name) 41 | clfs.append([fn_name, fn]) 42 | except ModuleNotFoundError: 43 | print("Cannot import '{}' from '{}.{}'".format(fn_name, 44 | mod_name, 45 | smod_name)) 46 | 47 | results = {imputation_name: [] for imputation_name, _ in imputed} 48 | 49 | for imputation_name, data in imputed: 50 | X, y = data 51 | X_train, X_test, y_train, y_test = train_test_split(X, y, 52 | test_size=0.33, 53 | random_state=0) 54 | print("Imputation {} =========".format(imputation_name)) 55 | for clf_name, clf in clfs: 56 | clf = clf() 57 | clf.fit(X_train, y_train) 58 | y_pred = clf.predict(X_test) 59 | accuracy = accuracy_score(y_test, y_pred) 60 | results[imputation_name].append((clf_name, accuracy)) 61 | print("...{}".format(clf_name)) 62 | 63 | # If not None, write to path 64 | if log_path: 65 | with open(log_path, 'w') as f: 66 | f.write(str(results)) 67 | print("Results saved to {}".format(log_path)) 68 | 69 | return results 70 | -------------------------------------------------------------------------------- /impyute/contrib/count_missing.py: -------------------------------------------------------------------------------- 1 | """ impyute.contrib.count_missing.py """ 2 | import numpy as np 3 | from impyute.ops import matrix 4 | 5 | def count_missing(data): 6 | """ Calculate the total percentage of missing values and also the 7 | percentage in each column. 8 | 9 | Parameters 10 | ---------- 11 | data: np.array 12 | Data to impute. 13 | 14 | Returns 15 | ------- 16 | dict 17 | Percentage of missing values in total and in each column. 18 | 19 | """ 20 | size = len(data.flatten()) 21 | nan_xy = matrix.nan_indices(data) 22 | np.unique(nan_xy) 23 | counter = {y: 0. for y in np.unique(nan_xy.T[1])} 24 | change_in_percentage = 1./size 25 | for _, y in nan_xy: 26 | counter[y] += change_in_percentage 27 | total_missing = len(nan_xy)/size 28 | counter["total"] = total_missing 29 | 30 | return counter 31 | -------------------------------------------------------------------------------- /impyute/contrib/describe.py: -------------------------------------------------------------------------------- 1 | """ impyute.contrib.describe """ 2 | from impyute.ops import matrix 3 | 4 | def describe(data): # verbose=True): 5 | """ Print input/output multiple times 6 | 7 | Eventually will be used instead of matrix.nan_indices everywhere 8 | 9 | Parameters 10 | ---------- 11 | data: numpy.nd.array 12 | The data you want to get a description from 13 | verbose: boolean(optional) 14 | Decides whether the description is short or long form 15 | 16 | Returns 17 | ------- 18 | dict 19 | missingness: list 20 | Confidence interval of data being MCAR, MAR or MNAR - in that order 21 | nan_xy: list of tuples 22 | Indices of all null points 23 | nan_n: list 24 | Total number of null values for each column 25 | pmissing_n: float 26 | Percentage of missing values in dataset 27 | nan_rows: list 28 | Indices of all rows that are completely null 29 | nan_cols: list 30 | Indices of all columns that are completely null 31 | mean_rows: list 32 | Mean value of each row 33 | mean_cols: list 34 | Mean value of each column 35 | std_dev: list 36 | std dev for each row/column 37 | min_max: list 38 | Finds the minimum and maximum for each row 39 | 40 | """ 41 | # missingness = [0.33, 0.33, 0.33] # find_missingness(data) 42 | nan_xy = matrix.nan_indices(data) 43 | nan_n = len(nan_xy) 44 | pmissing_n = float(nan_n/len(data.flatten)) 45 | # pmissing_rows = "" 46 | # pmissing_cols = "" 47 | # nan_rows = "" 48 | # nan_cols = "" 49 | # mean_rows = "" 50 | # mean_cols = "" 51 | # std_dev = "" 52 | # "missingness": missingness, 53 | description = {"nan_xy": nan_xy, 54 | "nan_n": nan_n, 55 | "pmissing_n": pmissing_n} 56 | # "pmissing_rows": pmissing_rows, 57 | # "pmissing_cols": pmissing_cols, 58 | # "nan_rows": nan_rows, 59 | # "nan_cols": nan_cols, 60 | # "mean_rows": mean_rows, 61 | # "mean_cols": mean_cols, 62 | # "std_dev": std_dev} 63 | return description 64 | -------------------------------------------------------------------------------- /impyute/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | """ Real-world/mock datasets and missingness corruptors to experiment with. """ 2 | from .base import randu 3 | from .base import randn 4 | from .base import mnist 5 | 6 | __all__ = ["randu", "randn", "mnist"] 7 | -------------------------------------------------------------------------------- /impyute/dataset/base.py: -------------------------------------------------------------------------------- 1 | """ Shared functions to load/generate data """ 2 | import itertools 3 | import math 4 | import random 5 | import string 6 | import numpy as np 7 | from impyute.dataset.corrupt import Corruptor 8 | from impyute.ops import error 9 | 10 | def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"): 11 | """ Return randomly generated dataset of numbers with uniformly 12 | distributed values between bound[0] and bound[1] 13 | 14 | Parameters 15 | ---------- 16 | bound:tuple (start,stop) 17 | Determines the range of values in the matrix. Index 0 for start 18 | value and index 1 for stop value. Start is inclusive, stop is 19 | exclusive. 20 | shape:tuple(optional) 21 | Size of the randomly generated data 22 | missingness: ('mcar', 'mar', 'mnar') 23 | Type of missingness you want in your dataset 24 | thr: float between [0,1] 25 | Percentage of missing data in generated data 26 | dtype: ('int','float') 27 | Type of data 28 | 29 | Returns 30 | ------- 31 | numpy.ndarray 32 | """ 33 | if dtype == "int": 34 | data = np.random.randint(bound[0], bound[1], size=shape).astype(float) 35 | elif dtype == "float": 36 | data = np.random.uniform(bound[0], bound[1], size=shape) 37 | corruptor = Corruptor(data, thr=thr) 38 | raw_data = getattr(corruptor, missingness)() 39 | return raw_data 40 | 41 | 42 | def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"): 43 | """ Return randomly generated dataset of numbers with normally 44 | distributed values with given and sigma. 45 | 46 | Parameters 47 | ---------- 48 | theta: tuple (mu, sigma) 49 | Determines the range of values in the matrix 50 | shape:tuple(optional) 51 | Size of the randomly generated data 52 | missingness: ('mcar', 'mar', 'mnar') 53 | Type of missingness you want in your dataset 54 | thr: float between [0,1] 55 | Percentage of missing data in generated data 56 | dtype: ('int','float') 57 | Type of data 58 | 59 | Returns 60 | ------- 61 | numpy.ndarray 62 | """ 63 | mean, sigma = theta 64 | data = np.random.normal(mean, sigma, size=shape) 65 | if dtype == "int": 66 | data = np.round(data) 67 | elif dtype == "float": 68 | pass 69 | corruptor = Corruptor(data, thr=thr) 70 | raw_data = getattr(corruptor, missingness)() 71 | return raw_data 72 | 73 | def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2): 74 | """ Return randomly generated dataset with uniformly distributed categorical data (alphabetic character) 75 | 76 | Parameters 77 | ---------- 78 | nlevels: int 79 | Specify the number of different categories in the dataset 80 | shape: tuple(optional) 81 | Size of the randomly generated data 82 | missingness: string in ('mcar', 'mar', 'mnar') 83 | Type of missingness you want in your dataset 84 | thr: float between [0,1] 85 | Percentage of missing data in generated data 86 | 87 | Returns 88 | ------- 89 | numpy.ndarray 90 | """ 91 | if shape[0]*shape[1] < nlevels: 92 | raise error.BadInputError("nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape") 93 | 94 | length = len(string.ascii_lowercase) 95 | n_fold = int(math.floor(math.log(nlevels, length))) 96 | cat_pool = list(string.ascii_lowercase) 97 | 98 | # when nlevel > 26, the alphabetical character is used up, need to generate extra strings as categorical data 99 | if n_fold > 0: 100 | for i in range(2, n_fold+2): 101 | pool_candidate = list(itertools.product(string.ascii_lowercase, repeat=i)) 102 | cat_pool.extend([''.join(w) for w in pool_candidate]) 103 | if len(cat_pool) > nlevels: 104 | break 105 | 106 | cat = random.sample(cat_pool, nlevels) 107 | data = np.random.choice(cat, shape, replace=True) 108 | 109 | # make sure the data frame has nlevel different categories 110 | while len(np.unique(data)) != nlevels: 111 | data = np.random.choice(cat, shape, replace=True) 112 | 113 | corruptor = Corruptor(data, thr=thr, dtype=np.str) 114 | raw_data = getattr(corruptor, missingness)() 115 | return raw_data 116 | 117 | 118 | 119 | def mnist(missingness="mcar", thr=0.2): 120 | """ Loads corrupted MNIST 121 | 122 | Parameters 123 | ---------- 124 | missingness: ('mcar', 'mar', 'mnar') 125 | Type of missigness you want in your dataset 126 | th: float between [0,1] 127 | Percentage of missing data in generated data 128 | 129 | Returns 130 | ------- 131 | numpy.ndarray 132 | """ 133 | from sklearn.datasets import fetch_mldata 134 | dataset = fetch_mldata('MNIST original') 135 | corruptor = Corruptor(dataset.data, thr=thr) 136 | data = getattr(corruptor, missingness)() 137 | return {"X": data, "Y": dataset.target} 138 | -------------------------------------------------------------------------------- /impyute/dataset/corrupt.py: -------------------------------------------------------------------------------- 1 | """ impyute.dataset.corrupt """ 2 | import numpy as np 3 | 4 | 5 | class Corruptor: 6 | """ Adds missing values to a complete dataset. 7 | 8 | Attributes 9 | ---------- 10 | data: np.ndarray 11 | Matrix of values with no NaN's that you want to add NaN's to. 12 | thr: float (optional) 13 | The percentage of null values you want in your dataset, a number 14 | between 0 and 1. 15 | 16 | Methods 17 | ------- 18 | mcar() 19 | Overwrite values with MCAR placed NaN's. 20 | mar() 21 | Overwrite values with MAR placed NaN's. 22 | mnar() 23 | Overwrite values with MNAR placed NaN's. 24 | 25 | """ 26 | def __init__(self, data, thr=0.2, dtype=np.float): 27 | self.dtype = data.dtype 28 | self.shape = np.shape(data) 29 | self.data = data.astype(dtype) 30 | self.thr = thr 31 | 32 | def mcar(self): 33 | """ Overwrites values with MCAR placed NaN's """ 34 | data_1d = self.data.flatten() 35 | n_total = len(data_1d) 36 | nan_x = np.random.choice(range(n_total), 37 | size=int(self.thr*n_total), 38 | replace=False) 39 | for x_i in nan_x: 40 | data_1d[x_i] = np.nan 41 | output = data_1d.reshape(self.shape) 42 | return output 43 | 44 | def mar(self): 45 | """ Overwrites values with MAR placed NaN's """ 46 | pass 47 | 48 | def mnar(self): 49 | """ Overwrites values with MNAR placed NaN's """ 50 | pass 51 | 52 | def complete(self): 53 | """ Do nothing to the data """ 54 | output = self.data 55 | return output 56 | -------------------------------------------------------------------------------- /impyute/deletion/__init__.py: -------------------------------------------------------------------------------- 1 | """ Missing data approaches that delete values. """ 2 | 3 | from .complete_case import complete_case 4 | 5 | __all__ = ["complete_case"] 6 | -------------------------------------------------------------------------------- /impyute/deletion/complete_case.py: -------------------------------------------------------------------------------- 1 | """ impyute.deletion.complete_case """ 2 | import numpy as np 3 | from impyute.ops import wrapper 4 | 5 | @wrapper.wrappers 6 | @wrapper.checks 7 | def complete_case(data): 8 | """ Return only data rows with all columns 9 | 10 | Parameters 11 | ---------- 12 | data: numpy.ndarray 13 | Data to impute. 14 | 15 | Returns 16 | ------- 17 | numpy.ndarray 18 | Imputed data. 19 | 20 | """ 21 | return data[~np.isnan(data).any(axis=1)] 22 | -------------------------------------------------------------------------------- /impyute/imputation/__init__.py: -------------------------------------------------------------------------------- 1 | """ Imputations for cross-sectional and time-series data. """ 2 | 3 | __all__ = ["cs", "ts"] 4 | -------------------------------------------------------------------------------- /impyute/imputation/cs/__init__.py: -------------------------------------------------------------------------------- 1 | """ Imputations for cross-sectional data. """ 2 | 3 | from .random import random 4 | from .central_tendency import mean 5 | from .central_tendency import mode 6 | from .central_tendency import median 7 | from .buck_iterative import buck_iterative 8 | from .em import em 9 | from .fast_knn import fast_knn 10 | 11 | __all__ = ["random", "mean", "mode", "median", "buck_iterative", "em", "fast_knn"] 12 | -------------------------------------------------------------------------------- /impyute/imputation/cs/buck_iterative.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | from impyute.ops import matrix 4 | from impyute.ops import wrapper 5 | # pylint: disable=too-many-locals 6 | 7 | @wrapper.wrappers 8 | @wrapper.checks 9 | def buck_iterative(data): 10 | """ Iterative variant of buck's method 11 | 12 | - Variable to regress on is chosen at random. 13 | - EM type infinite regression loop stops after change in prediction from 14 | previous prediction < 10% for all columns with missing values 15 | 16 | A Method of Estimation of Missing Values in Multivariate Data Suitable for 17 | use with an Electronic Computer S. F. Buck Journal of the Royal Statistical 18 | Society. Series B (Methodological) Vol. 22, No. 2 (1960), pp. 302-306 19 | 20 | Parameters 21 | ---------- 22 | data: numpy.ndarray 23 | Data to impute. 24 | 25 | Returns 26 | ------- 27 | numpy.ndarray 28 | Imputed data. 29 | 30 | """ 31 | nan_xy = matrix.nan_indices(data) 32 | 33 | # Add a column of zeros to the index values 34 | nan_xyz = np.append(nan_xy, np.zeros((np.shape(nan_xy)[0], 1)), axis=1) 35 | 36 | nan_xyz = [[int(x), int(y), v] for x, y, v in nan_xyz] 37 | temp = [] 38 | cols_missing = {y for _, y, _ in nan_xyz} 39 | 40 | # Step 1: Simple Imputation, these are just placeholders 41 | for x_i, y_i, value in nan_xyz: 42 | # Column containing nan value without the nan value 43 | col = data[:, [y_i]][~np.isnan(data[:, [y_i]])] 44 | 45 | new_value = np.mean(col) 46 | data[x_i][y_i] = new_value 47 | temp.append([x_i, y_i, new_value]) 48 | nan_xyz = temp 49 | 50 | # Step 5: Repeat step 2 - 4 until convergence (the 100 is arbitrary) 51 | 52 | converged = [False] * len(nan_xyz) 53 | while not all(converged): 54 | # Step 2: Placeholders are set back to missing for one variable/column 55 | dependent_col = int(np.random.choice(list(cols_missing))) 56 | missing_xs = [int(x) for x, y, value in nan_xyz if y == dependent_col] 57 | 58 | # Step 3: Perform linear regression using the other variables 59 | x_train, y_train = [], [] 60 | for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs): 61 | x_train.append(np.delete(data[x_i], dependent_col)) 62 | y_train.append(data[x_i][dependent_col]) 63 | model = LinearRegression() 64 | model.fit(x_train, y_train) 65 | 66 | # Step 4: Missing values for the missing variable/column are replaced 67 | # with predictions from our new linear regression model 68 | # For null indices with the dependent column that was randomly chosen 69 | for i, z in enumerate(nan_xyz): 70 | x_i = z[0] 71 | y_i = z[1] 72 | value = data[x_i, y_i] 73 | if y_i == dependent_col: 74 | # Row 'x' without the nan value 75 | new_value = model.predict([np.delete(data[x_i], dependent_col)]) 76 | data[x_i][y_i] = new_value.reshape(1, -1) 77 | if value == 0.0: 78 | delta = (new_value-value)/0.01 79 | else: 80 | delta = (new_value-value)/value 81 | converged[i] = abs(delta) < 0.1 82 | return data 83 | -------------------------------------------------------------------------------- /impyute/imputation/cs/central_tendency.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from impyute.ops import matrix 3 | from impyute.ops import wrapper 4 | 5 | @wrapper.wrappers 6 | @wrapper.checks 7 | def mean(data): 8 | """ Substitute missing values with the mean of that column. 9 | 10 | Parameters 11 | ---------- 12 | data: numpy.ndarray 13 | Data to impute. 14 | 15 | Returns 16 | ------- 17 | numpy.ndarray 18 | Imputed data. 19 | 20 | """ 21 | nan_xy = matrix.nan_indices(data) 22 | for x_i, y_i in nan_xy: 23 | row_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])] 24 | new_value = np.mean(row_wo_nan) 25 | data[x_i][y_i] = new_value 26 | return data 27 | 28 | @wrapper.wrappers 29 | @wrapper.checks 30 | def median(data): 31 | """ Substitute missing values with the median of that column(middle). 32 | 33 | Parameters 34 | ---------- 35 | data: numpy.ndarray 36 | Data to impute. 37 | 38 | Returns 39 | ------- 40 | numpy.ndarray 41 | Imputed data. 42 | 43 | """ 44 | nan_xy = matrix.nan_indices(data) 45 | cols_missing = set(nan_xy.T[1]) 46 | medians = {} 47 | for y_i in cols_missing: 48 | cols_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])] 49 | median_y = np.median(cols_wo_nan) 50 | medians[str(y_i)] = median_y 51 | for x_i, y_i in nan_xy: 52 | data[x_i][y_i] = medians[str(y_i)] 53 | return data 54 | 55 | @wrapper.wrappers 56 | @wrapper.checks 57 | def mode(data): 58 | """ Substitute missing values with the mode of that column(most frequent). 59 | 60 | In the case that there is a tie (there are multiple, most frequent values) 61 | for a column randomly pick one of them. 62 | 63 | Parameters 64 | ---------- 65 | data: numpy.ndarray 66 | Data to impute. 67 | 68 | Returns 69 | ------- 70 | numpy.ndarray 71 | Imputed data. 72 | 73 | """ 74 | nan_xy = matrix.nan_indices(data) 75 | modes = [] 76 | for y_i in range(np.shape(data)[1]): 77 | unique_counts = np.unique(data[:, [y_i]], return_counts=True) 78 | max_count = np.max(unique_counts[1]) 79 | mode_y = [unique for unique, count in np.transpose(unique_counts) 80 | if count == max_count and not np.isnan(unique)] 81 | modes.append(mode_y) # Appends index of column and column modes 82 | for x_i, y_i in nan_xy: 83 | data[x_i][y_i] = np.random.choice(modes[y_i]) 84 | return data 85 | -------------------------------------------------------------------------------- /impyute/imputation/cs/em.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from impyute.ops import matrix 3 | from impyute.ops import wrapper 4 | 5 | @wrapper.wrappers 6 | @wrapper.checks 7 | def em(data, eps=0.1): 8 | """ Imputes given data using expectation maximization. 9 | 10 | E-step: Calculates the expected complete data log likelihood ratio. 11 | M-step: Finds the parameters that maximize the log likelihood of the 12 | complete data. 13 | 14 | Parameters 15 | ---------- 16 | data: numpy.nd.array 17 | Data to impute. 18 | eps: float 19 | The amount of minimum change between iterations to break, if relative change < eps, converge. 20 | relative change = abs(current - previous) / previous 21 | inplace: boolean 22 | If True, operate on the numpy array reference 23 | 24 | Returns 25 | ------- 26 | numpy.nd.array 27 | Imputed data. 28 | 29 | """ 30 | nan_xy = matrix.nan_indices(data) 31 | for x_i, y_i in nan_xy: 32 | col = data[:, int(y_i)] 33 | mu = col[~np.isnan(col)].mean() 34 | std = col[~np.isnan(col)].std() 35 | col[x_i] = np.random.normal(loc=mu, scale=std) 36 | previous, i = 1, 1 37 | while True: 38 | i += 1 39 | # Expectation 40 | mu = col[~np.isnan(col)].mean() 41 | std = col[~np.isnan(col)].std() 42 | # Maximization 43 | col[x_i] = np.random.normal(loc=mu, scale=std) 44 | # Break out of loop if likelihood doesn't change at least 10% 45 | # and has run at least 5 times 46 | delta = np.abs(col[x_i]-previous)/previous 47 | if i > 5 and delta < eps: 48 | data[x_i][y_i] = col[x_i] 49 | break 50 | data[x_i][y_i] = col[x_i] 51 | previous = col[x_i] 52 | return data 53 | -------------------------------------------------------------------------------- /impyute/imputation/cs/fast_knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial import KDTree 3 | from impyute.ops import matrix 4 | from impyute.ops import wrapper 5 | from impyute.ops import inverse_distance_weighting as idw 6 | 7 | from . import mean 8 | # pylint: disable=too-many-arguments 9 | 10 | @wrapper.wrappers 11 | @wrapper.checks 12 | def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10, 13 | idw_fn=idw.shepards, init_impute_fn=mean): 14 | """ Impute using a variant of the nearest neighbours approach 15 | 16 | Basic idea: Impute array with a passed in initial impute fn (mean impute) 17 | and then use the resulting complete array to construct a KDTree. Use this 18 | KDTree to compute nearest neighbours. After finding `k` nearest 19 | neighbours, take the weighted average of them. Basically, find the nearest 20 | row in terms of distance 21 | 22 | This approach is much, much faster than the other implementation (fit+transform 23 | for each subset) which is almost prohibitively expensive. 24 | 25 | Parameters 26 | ---------- 27 | data: numpy.ndarray 28 | 2D matrix to impute. 29 | 30 | k: int, optional 31 | Parameter used for method querying the KDTree class object. Number of 32 | neighbours used in the KNN query. Refer to the docs for 33 | [`scipy.spatial.KDTree.query`] 34 | (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). 35 | 36 | eps: nonnegative float, optional 37 | Parameter used for method querying the KDTree class object. From the 38 | SciPy docs: "Return approximate nearest neighbors; the kth returned 39 | value is guaranteed to be no further than (1+eps) times the distance to 40 | the real kth nearest neighbor". Refer to the docs for 41 | [`scipy.spatial.KDTree.query`] 42 | (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). 43 | 44 | p : float, 1<=p<=infinity, optional 45 | Parameter used for method querying the KDTree class object. Straight from the 46 | SciPy docs: "Which Minkowski p-norm to use. 1 is the 47 | sum-of-absolute-values Manhattan distance 2 is the usual Euclidean 48 | distance infinity is the maximum-coordinate-difference distance". Refer to 49 | the docs for 50 | [`scipy.spatial.KDTree.query`] 51 | (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). 52 | 53 | distance_upper_bound : nonnegative float, optional 54 | Parameter used for method querying the KDTree class object. Straight 55 | from the SciPy docs: "Return only neighbors within this distance. This 56 | is used to prune tree searches, so if you are doing a series of 57 | nearest-neighbor queries, it may help to supply the distance to the 58 | nearest neighbor of the most recent point." Refer to the docs for 59 | [`scipy.spatial.KDTree.query`] 60 | (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html). 61 | 62 | leafsize: int, optional 63 | Parameter used for construction of the `KDTree` class object. Straight from 64 | the SciPy docs: "The number of points at which the algorithm switches 65 | over to brute-force. Has to be positive". Refer to the docs for 66 | [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html) 67 | for more information. 68 | 69 | idw_fn: fn, optional 70 | Function that takes one argument, a list of distances, and returns weighted percentages. You can define a custom 71 | one or bootstrap from functions defined in `impy.util.inverse_distance_weighting` which can be using 72 | functools.partial, for example: `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)` 73 | 74 | init_impute_fn: fn, optional 75 | 76 | Returns 77 | ------- 78 | numpy.ndarray 79 | Imputed data. 80 | 81 | Examples 82 | -------- 83 | 84 | >>> data = np.arange(25).reshape((5, 5)).astype(np.float) 85 | >>> data[0][2] = np.nan 86 | >>> data 87 | array([[ 0., 1., nan, 3., 4.], 88 | [ 5., 6., 7., 8., 9.], 89 | [10., 11., 12., 13., 14.], 90 | [15., 16., 17., 18., 19.], 91 | [20., 21., 22., 23., 24.]]) 92 | >> fast_knn(data, k=1) # Weighted average (by distance) of nearest 1 neighbour 93 | array([[ 0., 1., 7., 3., 4.], 94 | [ 5., 6., 7., 8., 9.], 95 | [10., 11., 12., 13., 14.], 96 | [15., 16., 17., 18., 19.], 97 | [20., 21., 22., 23., 24.]]) 98 | >> fast_knn(data, k=2) # Weighted average of nearest 2 neighbours 99 | array([[ 0. , 1. , 10.08608891, 3. , 4. ], 100 | [ 5. , 6. , 7. , 8. , 9. ], 101 | [10. , 11. , 12. , 13. , 14. ], 102 | [15. , 16. , 17. , 18. , 19. ], 103 | [20. , 21. , 22. , 23. , 24. ]]) 104 | >> fast_knn(data, k=3) 105 | array([[ 0. , 1. , 13.40249283, 3. , 4. ], 106 | [ 5. , 6. , 7. , 8. , 9. ], 107 | [10. , 11. , 12. , 13. , 14. ], 108 | [15. , 16. , 17. , 18. , 19. ], 109 | [20. , 21. , 22. , 23. , 24. ]]) 110 | >> fast_knn(data, k=5) # There are at most only 4 neighbours. Raises error 111 | ... 112 | IndexError: index 5 is out of bounds for axis 0 with size 5 113 | 114 | """ 115 | nan_xy = matrix.nan_indices(data) 116 | data_c = init_impute_fn(data) 117 | kdtree = KDTree(data_c, leafsize=leafsize) 118 | 119 | for x_i, y_i in nan_xy: 120 | distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps, 121 | p=p, distance_upper_bound=distance_upper_bound) 122 | # Will always return itself in the first index. Delete it. 123 | distances, indices = distances[1:], indices[1:] 124 | # Add small constant to distances to avoid division by 0 125 | distances += 1e-3 126 | weights = idw_fn(distances) 127 | # Assign missing value the weighted average of `k` nearest neighbours 128 | data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices]) 129 | return data 130 | -------------------------------------------------------------------------------- /impyute/imputation/cs/random.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from impyute.ops import matrix 3 | from impyute.ops import wrapper 4 | 5 | @wrapper.wrappers 6 | @wrapper.checks 7 | def random(data): 8 | """ Fill missing values in with a randomly selected value from the same 9 | column. 10 | 11 | Parameters 12 | ---------- 13 | data: numpy.ndarray 14 | Data to impute. 15 | 16 | Returns 17 | ------- 18 | numpy.ndarray 19 | Imputed data. 20 | 21 | """ 22 | nan_xy = matrix.nan_indices(data) 23 | for x, y in nan_xy: 24 | uniques = np.unique(data[:, y]) 25 | uniques = uniques[~np.isnan(uniques)] 26 | data[x][y] = np.random.choice(uniques) 27 | return data 28 | -------------------------------------------------------------------------------- /impyute/imputation/ts/__init__.py: -------------------------------------------------------------------------------- 1 | """ Imputations for time-series data. """ 2 | 3 | from .locf import locf 4 | from .moving_window import moving_window 5 | 6 | __all__ = ["locf", "moving_window"] 7 | -------------------------------------------------------------------------------- /impyute/imputation/ts/locf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from impyute.ops import matrix 3 | from impyute.ops import wrapper 4 | from impyute.ops import error 5 | 6 | @wrapper.wrappers 7 | @wrapper.checks 8 | def locf(data, axis=0): 9 | """ Last Observation Carried Forward 10 | 11 | For each set of missing indices, use the value of one row before(same 12 | column). In the case that the missing value is the first row, look one 13 | row ahead instead. If this next row is also NaN, look to the next row. 14 | Repeat until you find a row in this column that's not NaN. All the rows 15 | before will be filled with this value. 16 | 17 | Parameters 18 | ---------- 19 | data: numpy.ndarray 20 | Data to impute. 21 | axis: boolean (optional) 22 | 0 if time series is in row format (Ex. data[0][:] is 1st data point). 23 | 1 if time series is in col format (Ex. data[:][0] is 1st data point). 24 | 25 | Returns 26 | ------- 27 | numpy.ndarray 28 | Imputed data. 29 | 30 | """ 31 | if axis == 0: 32 | data = np.transpose(data) 33 | elif axis == 1: 34 | pass 35 | else: 36 | raise error.BadInputError("Error: Axis value is invalid, please use either 0 (row format) or 1 (column format)") 37 | 38 | nan_xy = matrix.nan_indices(data) 39 | for x_i, y_i in nan_xy: 40 | # Simplest scenario, look one row back 41 | if x_i-1 > -1: 42 | data[x_i][y_i] = data[x_i-1][y_i] 43 | # Look n rows forward 44 | else: 45 | x_residuals = np.shape(data)[0]-x_i-1 # n datapoints left 46 | val_found = False 47 | for i in range(1, x_residuals): 48 | if not np.isnan(data[x_i+i][y_i]): 49 | val_found = True 50 | break 51 | if val_found: 52 | # pylint: disable=undefined-loop-variable 53 | for x_nan in range(i): 54 | data[x_i+x_nan][y_i] = data[x_i+i][y_i] 55 | else: 56 | raise Exception("Error: Entire Column is NaN") 57 | return data 58 | -------------------------------------------------------------------------------- /impyute/imputation/ts/moving_window.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from impyute.ops import matrix 3 | from impyute.ops import wrapper 4 | # pylint: disable=invalid-name, too-many-arguments, too-many-locals, too-many-branches, broad-except, len-as-condition 5 | 6 | @wrapper.wrappers 7 | @wrapper.checks 8 | def moving_window(data, nindex=None, wsize=5, errors="coerce", func=np.mean, 9 | inplace=False): 10 | """ Interpolate the missing values based on nearby values. 11 | 12 | For example, with an array like this: 13 | 14 | array([[-1.24940, -1.38673, -0.03214945, 0.08255145, -0.007415], 15 | [ 2.14662, 0.32758 , -0.82601414, 1.78124027, 0.873998], 16 | [-0.41400, -0.977629, nan, -1.39255344, 1.680435], 17 | [ 0.40975, 1.067599, 0.29152388, -1.70160145, -0.565226], 18 | [-0.54592, -1.126187, 2.04004377, 0.16664863, -0.010677]]) 19 | 20 | Using a `k` or window size of 3. The one missing value would be set 21 | to -1.18509122. The window operates on the horizontal axis. 22 | 23 | Usage 24 | ----- 25 | 26 | The parameters default the function to a moving mean. You may want to change 27 | the default window size: 28 | 29 | moving_window(data, wsize=10) 30 | 31 | To only look at past data (null value is at the rightmost index in the window): 32 | 33 | moving_window(data, nindex=-1) 34 | 35 | To use a custom function: 36 | 37 | moving_window(data, func=np.median) 38 | 39 | You can also do something like take 1.5x the max of previous values in the window: 40 | 41 | moving_window(data, func=lambda arr: max(arr) * 1.50, nindex=-1) 42 | 43 | Parameters 44 | ---------- 45 | data: numpy.ndarray 46 | 2D matrix to impute. 47 | nindex: int 48 | Null index. Index of the null value inside the moving average window. 49 | Use cases: Say you wanted to make value skewed toward the left or right 50 | side. 0 would only take the average of values from the right and -1 51 | would only take the average of values from the left 52 | wsize: int 53 | Window size. Size of the moving average window/area of values being used 54 | for each local imputation. This number includes the missing value. 55 | errors: {"raise", "coerce", "ignore"} 56 | Errors will occur with the indexing of the windows - for example if there 57 | is a nan at data[x][0] and `nindex` is set to -1 or there is a nan at 58 | data[x][-1] and `nindex` is set to 0. `"raise"` will raise an error, 59 | `"coerce"` will try again using an nindex set to the middle and `"ignore"` 60 | will just leave it as a nan. 61 | inplace: {True, False} 62 | Whether to return a copy or run on the passed-in array 63 | 64 | Returns 65 | ------- 66 | numpy.ndarray 67 | Imputed data. 68 | 69 | """ 70 | if errors == "ignore": 71 | raise Exception("`errors` value `ignore` not implemented yet. Sorry!") 72 | 73 | if not inplace: 74 | data = data.copy() 75 | 76 | if nindex is None: # If using equal window side lengths 77 | assert wsize % 2 == 1, "The parameter `wsize` should not be even "\ 78 | "if the value `nindex` is not set since it defaults to the midpoint "\ 79 | "and an even `wsize` makes the midpoint ambiguous" 80 | wside_left = wsize // 2 81 | wside_right = wsize // 2 82 | else: # If using custom window side lengths 83 | assert nindex < wsize, "The null index must be smaller than the window size" 84 | if nindex == -1: 85 | wside_left = wsize - 1 86 | wside_right = 0 87 | else: 88 | wside_left = nindex 89 | wside_right = wsize - nindex - 1 90 | 91 | while True: 92 | nan_xy = matrix.nan_indices(data) 93 | n_nan_prev = len(nan_xy) 94 | for x_i, y_i in nan_xy: 95 | left_i = max(0, y_i-wside_left) 96 | right_i = min(len(data), y_i+wside_right+1) 97 | window = data[x_i, left_i: right_i] 98 | window_not_null = window[~np.isnan(window)] 99 | 100 | if len(window_not_null) > 0: 101 | try: 102 | data[x_i][y_i] = func(window_not_null) 103 | continue 104 | except Exception as e: 105 | if errors == "raise": 106 | raise e 107 | 108 | if errors == "coerce": 109 | # If either the window has a length of 0 or the aggregate function fails somehow, 110 | # do a fallback of just trying the best we can by using it as the middle and trying 111 | # to recalculate. Use temporary wside_left/wside_right, for only the calculation of 112 | # this specific problamatic value 113 | wside_left_tmp = wsize // 2 114 | wside_right_tmp = wside_left_tmp 115 | 116 | left_i_tmp = max(0, y_i-wside_left_tmp) 117 | right_i_tmp = min(len(data), y_i+wside_right_tmp+1) 118 | 119 | window = data[x_i, left_i_tmp:right_i_tmp] 120 | window_not_null = window[~np.isnan(window)] 121 | try: 122 | data[x_i][y_i] = func(window_not_null) 123 | except Exception as e: 124 | print("Exception:", e) 125 | if n_nan_prev == len(matrix.nan_indices(data)): 126 | break 127 | 128 | return data 129 | -------------------------------------------------------------------------------- /impyute/ops/__init__.py: -------------------------------------------------------------------------------- 1 | """ Unorganized set of utility functions """ 2 | 3 | from . import error 4 | from . import inverse_distance_weighting 5 | from . import matrix 6 | from . import util 7 | from . import wrapper 8 | 9 | __all__ = [ 10 | "error", "inverse_distance_weighting", "matrix", 11 | "util", "wrapper" 12 | ] 13 | -------------------------------------------------------------------------------- /impyute/ops/error.py: -------------------------------------------------------------------------------- 1 | """ Impyute specific error messages """ 2 | 3 | class BadInputError(Exception): 4 | "Error thrown when input args don't match spec" 5 | pass 6 | 7 | class BadOutputError(Exception): 8 | "Error thrown when outputs don't match spec" 9 | pass 10 | -------------------------------------------------------------------------------- /impyute/ops/inverse_distance_weighting.py: -------------------------------------------------------------------------------- 1 | """ Assign weights to distances in a way such that farther values are weighed less """ 2 | import numpy as np 3 | 4 | def shepards(distances, power=2): 5 | """ Basic inverse distance weighting function 6 | 7 | Parameters 8 | ---------- 9 | distances: list/numpy.ndarray 10 | 1D list of numbers (ex. distance results from call to KDTree.query) 11 | 12 | power: int 13 | Default of 2 used since the referenced paper stated an exponent of 2 "gives seemingly 14 | satisfactory results" 15 | 16 | Returns 17 | ------- 18 | numpy.ndarray 19 | 1D list of numbers that sum to 1, represents weights of provided distances, in order. 20 | 21 | References 22 | ---------- 23 | 24 | Shepard, Donald (1968). "A two-dimensional interpolation function for irregularly-spaced data". 25 | Proceedings of the 1968 ACM National Conference. pp. 517-524. doi:10.1145/800186.810616 26 | """ 27 | return to_percentage(1/np.power(distances, power)) 28 | 29 | def to_percentage(vec): 30 | """ Converts list of real numbers into a list of percentages """ 31 | return vec/np.sum(vec) 32 | -------------------------------------------------------------------------------- /impyute/ops/matrix.py: -------------------------------------------------------------------------------- 1 | """ Common operations on matrices 2 | 3 | *Look into whether it's worth writing these in raw c* 4 | """ 5 | import numpy as np 6 | 7 | def nan_indices(data): 8 | """ Finds the indices of all missing values. 9 | 10 | Parameters 11 | ---------- 12 | data: numpy.ndarray 13 | 14 | Returns 15 | ------- 16 | List of tuples 17 | Indices of all missing values in tuple format; (i, j) 18 | """ 19 | return np.argwhere(np.isnan(data)) 20 | 21 | def map_nd(fn, arr): 22 | """ Map fn that takes a value over entire n-dim array 23 | 24 | Parameters 25 | ---------- 26 | arr: numpy.ndarray 27 | 28 | Returns 29 | ------- 30 | numpy.ndarray 31 | 32 | """ 33 | return np.vectorize(fn)(arr) 34 | 35 | def every_nd(fn, arr): 36 | """ Returns bool, true if fn is true for all elements of arr 37 | 38 | Parameters 39 | ---------- 40 | arr: numpy.ndarray 41 | 42 | Returns 43 | ------- 44 | bool 45 | 46 | """ 47 | return all(map(fn, arr.flatten())) 48 | -------------------------------------------------------------------------------- /impyute/ops/testing.py: -------------------------------------------------------------------------------- 1 | """ Utilities used for unit tests """ 2 | import numpy as np 3 | 4 | 5 | def return_na_check(data): 6 | """Helper function for tests to check if the data returned is a 7 | numpy array and that the imputed data has no NaN's. 8 | 9 | Parameters 10 | ---------- 11 | data: numpy.ndarray 12 | Data to impute. 13 | 14 | Returns 15 | ------- 16 | None 17 | 18 | """ 19 | assert isinstance(data, np.ndarray) 20 | assert not np.isnan(data).any() 21 | -------------------------------------------------------------------------------- /impyute/ops/util.py: -------------------------------------------------------------------------------- 1 | """ Random utility functions """ 2 | from functools import wraps 3 | 4 | # Things that get exposed from * import 5 | __all__ = [ 6 | "constantly", "complement", "identity", "thread", 7 | "execute_fn_with_args_and_or_kwargs" 8 | ] 9 | 10 | def thread(arg, *fns): 11 | if len(fns) > 0: 12 | return thread(fns[0](arg), *fns[1:]) 13 | else: 14 | return arg 15 | 16 | def identity(x): 17 | return x 18 | 19 | def constantly(x): 20 | """ Returns a function that takes any args and returns x """ 21 | def func(*args, **kwargs): 22 | return x 23 | return func 24 | 25 | def complement(fn): 26 | """ Return fn that outputs the opposite truth values of the 27 | input function 28 | """ 29 | @wraps(fn) 30 | def wrapper(*args, **kwargs): 31 | return not fn(*args, **kwargs) 32 | return wrapper 33 | 34 | def execute_fn_with_args_and_or_kwargs(fn, args, kwargs): 35 | """ If args + kwargs aren't accepted only args are passed in""" 36 | try: 37 | return fn(*args, **kwargs) 38 | except TypeError: 39 | return fn(*args) 40 | -------------------------------------------------------------------------------- /impyute/ops/wrapper.py: -------------------------------------------------------------------------------- 1 | """ Decorator functions to wrap around entry and exit 2 | 3 | ... to easily apply to a function, functions that check/process inputs 4 | and outputs 5 | """ 6 | from functools import wraps 7 | import numpy as np 8 | 9 | from . import error 10 | from . import matrix 11 | from . import util as u 12 | 13 | ## Hacky way to handle python2 not having `ModuleNotFoundError` 14 | # pylint: disable=redefined-builtin, missing-docstring 15 | try: 16 | raise ModuleNotFoundError 17 | except NameError: 18 | class ModuleNotFoundError(Exception): 19 | pass 20 | except ModuleNotFoundError: 21 | pass 22 | # pylint: enable=redefined-builtin, missing-docstring 23 | 24 | def get_pandas_df(): 25 | """ Gets pandas DataFrame if we can import it """ 26 | try: 27 | import pandas as pd 28 | df = pd.DataFrame 29 | except (ModuleNotFoundError, ImportError): 30 | df = None 31 | return df 32 | 33 | def handle_df(fn): 34 | """ Decorator to handle pandas Dataframe object as input 35 | 36 | If the first arg is a pandas dataframe, convert it to a numpy array 37 | otherwise don't do anything. Cast back to a pandas Dataframe after 38 | the imputation function has run 39 | """ 40 | @wraps(fn) 41 | def wrapper(*args, **kwargs): 42 | postprocess_fn = None 43 | ## convert tuple to list so args can be modified 44 | args = list(args) 45 | ## Either make a copy or use a pointer to the original 46 | if kwargs.get('inplace'): 47 | args[0] = args[0] 48 | else: 49 | args[0] = args[0].copy() 50 | 51 | ## If input data is a dataframe then cast the input to an np.array 52 | ## and set an indicator flag before continuing 53 | pd_DataFrame = get_pandas_df() 54 | if pd_DataFrame and isinstance(args[0], pd_DataFrame): 55 | postprocess_fn = pd_DataFrame 56 | args[0] = args[0].values 57 | 58 | ## function invokation 59 | results = u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs) 60 | 61 | ## cast the output back to a DataFrame. 62 | if postprocess_fn is not None: 63 | results = postprocess_fn(results) 64 | 65 | return results 66 | return wrapper 67 | 68 | def add_inplace_option(fn): 69 | """ Decorator for inplace option 70 | 71 | Functions wrapped by this can have an `inplace` kwarg to use either a copy of 72 | data or reference """ 73 | @wraps(fn) 74 | def wrapper(*args, **kwargs): 75 | """ Run input checks""" 76 | ## convert tuple to list so args can be modified 77 | args = list(args) 78 | ## Either make a copy or use a pointer to the original 79 | if kwargs.get('inplace'): 80 | args[0] = args[0] 81 | else: 82 | args[0] = args[0].copy() 83 | 84 | ## function invokation 85 | return u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs) 86 | return wrapper 87 | 88 | def conform_output(fn): 89 | """ Decorator to handle impossible values 90 | 91 | Adds two optional kwargs, `coerce_fn` and `valid_fn`. 92 | 93 | `valid_fn` function stub 94 | 95 | def my_coerce_fn(some_literal) -> boolean 96 | 97 | `coerce_fn` function stub 98 | 99 | def my_coerce_fn(arr, x_i, y_i) -> some_literal 100 | 101 | Valid function is something run on each element of the, this is 102 | the function that we use to indicate whether the value is valid 103 | or not 104 | 105 | Coerce function has three arguments, the original matrix and 106 | the two indices of the invalid value x_i and y_i. This function 107 | will be run on all invalid values. 108 | """ 109 | @wraps(fn) 110 | def wrapper(*args, **kwargs): 111 | def raise_error(arr, x_i, y_i): 112 | raise error.BadOutputError("{} does not conform".format(arr[x_i, y_i])) 113 | ## convert tuple to list so args can be modified 114 | args = list(args) 115 | # function that checks if the value is valid 116 | valid_fn = kwargs.get("valid_fn", u.constantly(True)) 117 | # function that modifies the invalid value to something valid 118 | coerce_fn = kwargs.get("coerce_fn", raise_error) 119 | 120 | ## function invokation 121 | results = u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs) 122 | 123 | # check each value to see if it's valid 124 | bool_arr = matrix.map_nd(u.complement(valid_fn), results) 125 | # get indices of invalid values 126 | invalid_indices = np.argwhere(bool_arr) 127 | # run the coerce fn on each invalid indice 128 | for x_i, y_i in invalid_indices: 129 | results[x_i, y_i] = coerce_fn(results, x_i, y_i) 130 | 131 | return results 132 | return wrapper 133 | 134 | def wrappers(fn): 135 | """ Helper decorator, all wrapper functions applied to modify input (matrix 136 | with missing values) and output (matrix with imputed values) 137 | 138 | NOTE: `handle_df` has to be last as it needs to be in the outer loop (first 139 | entry point) since every other function assumes you're getting an np.array 140 | as input 141 | """ 142 | return u.thread( 143 | fn, # function that's getting wrapped 144 | add_inplace_option, # allow choosing reference/copy 145 | conform_output, # allow enforcing of some spec on returned outputs 146 | handle_df, # if df type, cast to np.array on in and df on out 147 | ) 148 | 149 | def _shape_2d(data): 150 | """ True if array is 2D""" 151 | return len(np.shape(data)) == 2 152 | 153 | def _shape_3d(data): 154 | """ True if array is 3D""" 155 | return len(np.shape(data)) == 3 156 | 157 | def _is_ndarray(data): 158 | """ True if the array is an instance of numpy's ndarray""" 159 | return isinstance(data, np.ndarray) 160 | 161 | def _dtype_float(data): 162 | """ True if the values in the array are floating point""" 163 | return data.dtype == np.float 164 | 165 | def _nan_exists(data): 166 | """ True if there is at least one np.nan in the array""" 167 | nan_xy = matrix.nan_indices(data) 168 | return len(nan_xy) > 0 169 | 170 | def checks(fn): 171 | """ Throw exception if error runs""" 172 | @wraps(fn) 173 | def wrapper(*args, **kwargs): 174 | data = args[0] 175 | if len(np.shape(data)) != 2: 176 | raise error.BadInputError("No support for arrays that aren't 2D yet.") 177 | elif not _shape_2d(data): 178 | raise error.BadInputError("Not a 2D array.") 179 | elif not _is_ndarray(data): 180 | raise error.BadInputError("Not a np.ndarray.") 181 | elif not _dtype_float(data): 182 | raise error.BadInputError("Data is not float.") 183 | elif not _nan_exists(data): 184 | raise error.BadInputError("No NaN's in given data") 185 | return u.execute_fn_with_args_and_or_kwargs(fn, args, kwargs) 186 | return wrapper 187 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::RuntimeWarning -------------------------------------------------------------------------------- /requirements/.travis.yaml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | language: python 4 | 5 | services: 6 | - docker 7 | 8 | before_install: 9 | - docker pull eltonlaw/pybase 10 | 11 | script: 12 | make test 13 | 14 | -------------------------------------------------------------------------------- /requirements/common.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | scikit-learn 4 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r common.txt 2 | sphinx 3 | pytest 4 | numpydoc 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | [pylint] 5 | exclude = docs/* 6 | 7 | [metadata] 8 | description-file = README.rst 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ setup.py """ 2 | import os 3 | from setuptools import setup, find_packages 4 | 5 | def get_description(): 6 | """ Makes README file into a string""" 7 | with open("README.rst") as file: 8 | long_description = file.read() 9 | return long_description 10 | 11 | def parse_requirements(filename): 12 | """ load requirements from a pip requirements file """ 13 | lineiter = (line.strip() for line in open(filename)) 14 | return [line for line in lineiter if line and not line.startswith("#")] 15 | 16 | def get_version(): 17 | """ Gets version from impyute.__init__.py 18 | 19 | Runs `impyute.__init__` and loads defined variables into scope 20 | """ 21 | with open(os.path.join('impyute', '__init__.py')) as version_file: 22 | # pylint: disable=exec-used, undefined-variable 23 | exec(version_file.read(), globals()) 24 | return __version__ 25 | 26 | 27 | setup( 28 | name='impyute', 29 | author='Elton Law', 30 | author_email='eltonlaw296@gmail.com', 31 | version=get_version(), 32 | url="http://impyute.readthedocs.io/en/latest/", 33 | download_url='https://github.com/eltonlaw/impyute', 34 | description='Cross-sectional and time-series data imputation algorithms', 35 | long_description=get_description(), 36 | packages=find_packages(exclude=['docs']), 37 | install_requires=parse_requirements("requirements/common.txt"), 38 | keywords='imputation', 39 | classifiers=['Development Status :: 3 - Alpha', 40 | 'Intended Audience :: Science/Research', 41 | 'Intended Audience :: Developers', 42 | 'Programming Language :: Python', 43 | 'Topic :: Software Development', 44 | 'Topic :: Scientific/Engineering'], 45 | extras_require={ 46 | 'dev': ['pylint', 'sphinx'], 47 | 'test': [], 48 | }, 49 | license='GPL-3.0' 50 | ) 51 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/test/__init__.py -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import pytest 5 | import numpy as np 6 | 7 | 8 | @pytest.fixture(scope='function') 9 | def test_data(): 10 | def prepare_data(shape=(3, 3), pos1=0, pos2=0): 11 | data = np.reshape(np.arange(np.product(shape)), shape).astype("float") 12 | data[pos1, pos2] = np.nan 13 | return data 14 | return prepare_data 15 | 16 | 17 | @pytest.fixture(scope='session') 18 | def buck_test_data(): 19 | data = np.asarray([[1, 2, 3, 4, 5, 6, 7, 8], 20 | [1, 4, 6, 8, 10, 12, 14, 16], 21 | [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4], 22 | [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4], 23 | [3, 6, 9, 12, 15, 18, 21, 24], 24 | [4, 8, 9, 16, 20, 24, 28, 32]]) 25 | data[0, 0] = np.nan 26 | return data 27 | 28 | 29 | @pytest.fixture(scope='session') 30 | def knn_test_data(): 31 | n = 100 32 | data = np.random.normal(size=n * n).reshape((n, n)) 33 | for _ in range(int(n * 0.3 * n)): 34 | data[np.random.randint(n), np.random.randint(n)] = np.nan 35 | return data 36 | 37 | 38 | @pytest.fixture(scope='function') 39 | def mw_data(): 40 | return np.arange(0, 25).reshape(5, 5).astype(float) 41 | 42 | 43 | @pytest.fixture(scope='session') 44 | def results_path(tmpdir_factory): 45 | temp = tmpdir_factory.mktemp('logs') 46 | p = os.path.realpath(str(temp)) 47 | log_path = os.path.join(p, 'results.txt') 48 | yield log_path 49 | if temp.exists(): 50 | shutil.rmtree(str(temp)) -------------------------------------------------------------------------------- /test/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/test/contrib/__init__.py -------------------------------------------------------------------------------- /test/contrib/test_compare.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import pytest 3 | import numpy as np 4 | import impyute as impy 5 | 6 | SHAPE = (5, 5) 7 | 8 | 9 | @pytest.mark.filterwarnings('ignore::FutureWarning') 10 | def test_output_file_exists(test_data, results_path): 11 | data = test_data(SHAPE) 12 | labels = np.array([1, 0, 1, 1, 0]) 13 | imputed_mode = [] 14 | imputed_mode.append(["mode", (impy.mode(np.copy(data)), labels)]) 15 | imputed_mode.append(["mean", (impy.mean(np.copy(data)), labels)]) 16 | 17 | impy.contrib.compare(imputed_mode, log_path=results_path) 18 | with open(results_path, 'r') as fin: 19 | expected = {'mode': [('SVC', 0.0)], 'mean': [('SVC', 0.0)]} 20 | assert ast.literal_eval(next(fin)) == expected 21 | -------------------------------------------------------------------------------- /test/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/test/dataset/__init__.py -------------------------------------------------------------------------------- /test/dataset/test_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from impyute.dataset import mnist 4 | from impyute.ops import matrix 5 | 6 | pytest.skip("takes ~30 sec each test", allow_module_level=True) 7 | data = mnist()["X"] 8 | 9 | def test_return_type(): 10 | """ Check return type, should return an np.ndarray""" 11 | assert isinstance(data, np.ndarray) 12 | 13 | def test_missing_values_present(): 14 | """ Check that the dataset is corrupted (missing values present)""" 15 | assert matrix.nan_indices(data).size != 0 16 | -------------------------------------------------------------------------------- /test/dataset/test_randc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from impyute.dataset.base import randc 4 | from impyute.ops import error 5 | 6 | def test_raise_error_nlevel_exceed_shape(): 7 | with pytest.raises(error.BadInputError) as e: 8 | randc(shape=(2, 2)) 9 | expected = "nlevel exceeds the size of desired dataset. Please decrease the nlevel or increase the shape" 10 | assert str(e.value) == expected 11 | 12 | @pytest.mark.parametrize("nlevels, shape", [(5, (5,5)), (9, (3,4)), (100, (20, 20))]) 13 | def test_nlevel_categories(nlevels, shape): 14 | """ideally the returned matrix should have nlevel+1 different categories, 15 | +1 because the Corrupt class introduce np.nan however, if the missing value 16 | introduced by Corrupt class happens to replace a group of categories, the 17 | unique category number would be < nlevel + 1 18 | """ 19 | dataframe = randc(nlevels, shape) 20 | assert len(np.unique(dataframe)) <= nlevels + 1 21 | 22 | 23 | @pytest.mark.parametrize("nlevels, shape", [(5, (5,5)), (9, (3, 4)), (100, (20, 20))]) 24 | def test_dataframe_shape(nlevels, shape): 25 | """test if the returned data frame has desired shape""" 26 | dataframe = randc(nlevels, shape) 27 | assert dataframe.shape == shape 28 | -------------------------------------------------------------------------------- /test/deletion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/test/deletion/__init__.py -------------------------------------------------------------------------------- /test/deletion/test_complete_case.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from impyute.deletion import complete_case 3 | from impyute.ops.testing import return_na_check 4 | 5 | SHAPE = (5, 5) 6 | 7 | 8 | def test_complete_case_(test_data): 9 | data = test_data(SHAPE) 10 | imputed = complete_case(data) 11 | return_na_check(imputed) 12 | 13 | 14 | def test_impute_missing_values(test_data): 15 | data = test_data(SHAPE) 16 | imputed = complete_case(data) 17 | assert np.shape(imputed) == (4, 5) 18 | 19 | 20 | def test_imputed_values(test_data): 21 | data = test_data(SHAPE) 22 | imputed = complete_case(data) 23 | expected = np.arange(5, 25, dtype=float).reshape(4, 5) 24 | assert np.equal(imputed, expected).all() 25 | -------------------------------------------------------------------------------- /test/imputation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/test/imputation/__init__.py -------------------------------------------------------------------------------- /test/imputation/cs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/test/imputation/cs/__init__.py -------------------------------------------------------------------------------- /test/imputation/cs/test_buck_iterative.py: -------------------------------------------------------------------------------- 1 | """test_buck_iterative.py""" 2 | import impyute as impy 3 | from impyute.ops.testing import return_na_check 4 | 5 | 6 | def test_buck_iter(buck_test_data): 7 | imputed = impy.buck_iterative(buck_test_data) 8 | return_na_check(imputed) 9 | -------------------------------------------------------------------------------- /test/imputation/cs/test_central_tendency.py: -------------------------------------------------------------------------------- 1 | """test_averagings.py""" 2 | import impyute as impy 3 | from impyute.ops.testing import return_na_check 4 | 5 | SHAPE = (5, 5) 6 | 7 | 8 | def test_mean(test_data): 9 | data = test_data(SHAPE) 10 | imputed = impy.mean(data) 11 | return_na_check(imputed) 12 | 13 | 14 | def test_mode(test_data): 15 | data = test_data(SHAPE) 16 | imputed = impy.mode(data) 17 | return_na_check(imputed) 18 | 19 | 20 | def test_median(test_data): 21 | data = test_data(SHAPE) 22 | imputed = impy.median(data) 23 | return_na_check(imputed) 24 | -------------------------------------------------------------------------------- /test/imputation/cs/test_em.py: -------------------------------------------------------------------------------- 1 | """test_em.py""" 2 | import impyute as impy 3 | from impyute.ops.testing import return_na_check 4 | 5 | SHAPE = (5, 5) 6 | 7 | 8 | def test_em_(test_data): 9 | data = test_data(SHAPE) 10 | imputed = impy.em(data) 11 | return_na_check(imputed) 12 | -------------------------------------------------------------------------------- /test/imputation/cs/test_fast_knn.py: -------------------------------------------------------------------------------- 1 | """test_fast_knn.py""" 2 | import functools 3 | import numpy as np 4 | import impyute as impy 5 | from impyute.ops.testing import return_na_check 6 | # pylint:disable=invalid-name 7 | 8 | SHAPE = (5, 5) 9 | 10 | 11 | def test_return_type(knn_test_data): 12 | imputed = impy.fast_knn(knn_test_data) 13 | return_na_check(imputed) 14 | 15 | 16 | def test_impute_value(test_data): 17 | """fast_knn using standard idw""" 18 | data = test_data(SHAPE, 0, 2) 19 | imputed = impy.fast_knn(data, k=2) 20 | assert np.isclose(imputed[0, 2], 8.38888888888889) 21 | 22 | 23 | def test_impute_value_custom_idw(test_data): 24 | """fast_knn using custom idw""" 25 | data = test_data(SHAPE, 0, 2) 26 | idw_fn = functools.partial(impy.ops.inverse_distance_weighting.shepards, power=1) 27 | imputed = impy.fast_knn(data, k=2, idw_fn=idw_fn) 28 | assert np.isclose(imputed[0, 2], 8.913911092686593) 29 | -------------------------------------------------------------------------------- /test/imputation/cs/test_random.py: -------------------------------------------------------------------------------- 1 | """test_random_imputation.py""" 2 | import impyute as impy 3 | from impyute.ops.testing import return_na_check 4 | 5 | SHAPE = (3, 3) 6 | 7 | 8 | def test_random_(test_data): 9 | data = test_data(SHAPE) 10 | imputed = impy.random(data) 11 | return_na_check(imputed) 12 | -------------------------------------------------------------------------------- /test/imputation/ts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/test/imputation/ts/__init__.py -------------------------------------------------------------------------------- /test/imputation/ts/test_locf.py: -------------------------------------------------------------------------------- 1 | """test_locf.py""" 2 | import numpy as np 3 | import impyute as impy 4 | from impyute.ops.testing import return_na_check 5 | from impyute.ops import error 6 | 7 | SHAPE = (5, 5) 8 | 9 | 10 | def test_locf_(test_data): 11 | data = test_data(SHAPE) 12 | imputed = impy.locf(data) 13 | return_na_check(imputed) 14 | 15 | 16 | def test_na_at_i_start(test_data): 17 | data = test_data(SHAPE) 18 | actual = impy.locf(data, axis=1) 19 | data[0, 0] = data[1, 0] 20 | assert np.array_equal(actual, data) 21 | 22 | 23 | def test_na_at_i(test_data): 24 | data = test_data(SHAPE, 3, 3) 25 | actual = impy.locf(data, axis=1) 26 | data[3, 3] = data[2, 3] 27 | assert np.array_equal(actual, data) 28 | 29 | 30 | def test_na_at_i_end(test_data): 31 | data = test_data(SHAPE) 32 | last_i = np.shape(data)[0] - 1 33 | data = test_data(SHAPE, last_i, 3) 34 | actual = impy.locf(data, axis=1) 35 | data[last_i, 3] = data[last_i - 1, 3] 36 | assert np.array_equal(actual, data) 37 | 38 | 39 | def test_out_of_bounds(test_data): 40 | """Check out of bounds error, should throw BadInputError for any axis outside [0,1]""" 41 | data = test_data(SHAPE) 42 | with np.testing.assert_raises(error.BadInputError): 43 | impy.locf(data, axis=3) 44 | -------------------------------------------------------------------------------- /test/imputation/ts/test_moving_window.py: -------------------------------------------------------------------------------- 1 | """ test/imputation/ts/test_moving_window.py """ 2 | import pytest 3 | import numpy as np 4 | import impyute as impy 5 | from impyute.ops.testing import return_na_check 6 | #pylint:disable=missing-docstring, redefined-outer-name 7 | 8 | 9 | @pytest.mark.parametrize( 10 | 'pos1,pos2,expected', 11 | [ 12 | (2, 0, 11.5), 13 | (2, 2, 12), 14 | (2, -1, 12.5)] 15 | ) 16 | def test_defaults_impute(pos1, pos2, expected, mw_data): 17 | mw_data[pos1, pos2] = np.nan 18 | imputed = impy.moving_window(mw_data) 19 | return_na_check(imputed) 20 | assert imputed[pos1, pos2] == expected 21 | 22 | 23 | @pytest.mark.parametrize( 24 | 'pos1,pos2,expected', 25 | [ 26 | (2, 0, 24), 27 | (2, 2, 28), 28 | (2, -1, 26)] 29 | ) 30 | def test_custom_fn_impute(pos1, pos2, expected, mw_data): 31 | mw_data[pos1, pos2] = np.nan 32 | imputed = impy.moving_window(mw_data, func=lambda l: max(l) * 2) 33 | return_na_check(imputed) 34 | assert imputed[pos1, pos2] == expected 35 | 36 | 37 | @pytest.mark.parametrize( 38 | 'pos1,pos2,expected', 39 | [ 40 | (2, 0, 12.5), 41 | (2, -1, 12.5)] 42 | ) 43 | def test_custom_nindex_impute_0(pos1, pos2, expected, mw_data): 44 | mw_data[pos1, pos2] = np.nan 45 | imputed = impy.moving_window(mw_data, nindex=0) 46 | return_na_check(imputed) 47 | assert imputed[pos1, pos2] == expected 48 | 49 | 50 | @pytest.mark.parametrize( 51 | 'pos1,pos2,expected', 52 | [ 53 | (2, 0, 11.5), 54 | (2, -1, 11.5)] 55 | ) 56 | def test_custom_nindex_impute_1(pos1, pos2, expected, mw_data): 57 | mw_data[pos1, pos2] = np.nan 58 | imputed = impy.moving_window(mw_data, nindex=-1) 59 | return_na_check(imputed) 60 | assert imputed[pos1, pos2] == expected 61 | -------------------------------------------------------------------------------- /test/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eltonlaw/impyute/b76a6b4bd3da36515d5f1fa87f35d0c3f4209c83/test/ops/__init__.py -------------------------------------------------------------------------------- /test/ops/test_matrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from impyute.ops import matrix 3 | 4 | def _is_gt_5(x): 5 | return x > 5 6 | 7 | def test_map_nd_2d(): 8 | arr = np.arange(10).reshape([5, 2]) 9 | expected = np.array([ 10 | [False, False], 11 | [False, False], 12 | [False, False], 13 | [True, True], 14 | [True, True], 15 | ]) 16 | actual = matrix.map_nd(_is_gt_5, arr) 17 | assert matrix.every_nd(bool, expected == actual) 18 | -------------------------------------------------------------------------------- /test/ops/test_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from impyute.ops import matrix 3 | from impyute.ops import util 4 | 5 | def _add_one(x): 6 | """ """ 7 | return x + 1 8 | 9 | def _square(x): 10 | return x * x 11 | 12 | def test_thread(): 13 | assert 10 == util.thread(3, _square, _add_one) 14 | assert 100 == util.thread(3, _square, _add_one, _square) #4 15 | assert 82 == util.thread(3, _square, _square, _add_one) #4 16 | assert 10 == util.thread(3, lambda x: x*x, lambda x: x+1) 17 | assert 100 == util.thread(3, lambda x: x*x, lambda x: x+1, lambda x: x*x) 18 | assert 82 == util.thread(3, lambda x: x*x, lambda x: x*x, lambda x: x+1) 19 | 20 | def test_identity(): 21 | arr = np.array([[1., 2., 3.]]) 22 | actual = arr 23 | expected = util.identity(arr) 24 | assert matrix.every_nd(bool, expected == actual) 25 | -------------------------------------------------------------------------------- /test/ops/test_wrapper.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from impyute.imputation.cs import mean 4 | from impyute.ops import error 5 | from impyute.ops import wrapper 6 | 7 | # pylint:disable=redefined-builtin 8 | try: 9 | raise ModuleNotFoundError 10 | except NameError: 11 | class ModuleNotFoundError(Exception): 12 | "placeholder required for python2.7" 13 | pass 14 | except ModuleNotFoundError: 15 | pass 16 | 17 | @wrapper.wrappers 18 | def wrappers_mul(arr): 19 | """Some function that performs an inplace operation on the input. Accepts kwargs""" 20 | arr *= 25 21 | return arr 22 | 23 | def test_wrappers_inplace_false(): 24 | """Input should be unchanged if inplace set to false""" 25 | A = np.ones((5, 5)) 26 | A_copy = A.copy() 27 | wrappers_mul(A, inplace=False) 28 | assert A[0, 0] == A_copy[0, 0] 29 | 30 | def test_wrappers_inplace_true(): 31 | """Input may be changed if inplace set to true and operation is inplace""" 32 | A = np.ones((5, 5)) 33 | A_copy = A.copy() 34 | wrappers_mul(A, inplace=True) 35 | assert A[0, 0] != A_copy[0, 0] 36 | 37 | def test_wrappers_pandas_input(): 38 | """ Input: DataFrame, Output: DataFrame """ 39 | # Skip this test if you don't have pandas 40 | pytest.importorskip('pandas') 41 | import pandas as pd 42 | # Create a DataFrame with a NaN 43 | A = np.arange(25).reshape((5, 5)).astype(np.float) 44 | A[0, 0] = np.nan 45 | A = pd.DataFrame(A) 46 | # Assert that the output is a DataFrame 47 | assert isinstance(mean(A), pd.DataFrame) 48 | 49 | @wrapper.checks 50 | def some_fn(data): 51 | """Dummy fn that has form of np.array -> np.array""" 52 | return data 53 | 54 | def test_correct_input(): 55 | """ Test that an array that should satisfy all checks, no BadInputError should be raised""" 56 | # Integer np.ndarray (check: `_is_ndarray`, `_shape_2d`, `_nan_exists`) 57 | arr = np.array([[np.nan, 2], [3, 4]]) 58 | # Cast integer array to float (check: `_dtype_float`) 59 | arr.dtype = np.float 60 | try: 61 | some_fn(arr) 62 | except error.BadInputError: 63 | assert False 64 | 65 | def test_1d(): 66 | """ Check 1d array, BadInputError raised""" 67 | arr = np.array([np.nan, 2]) 68 | with pytest.raises(error.BadInputError) as excinfo: 69 | some_fn(arr) 70 | assert str(excinfo.value) == "No support for arrays that aren't 2D yet." 71 | 72 | def test_not_nparray(): 73 | """ If not an np.array, BadInputError raised""" 74 | with pytest.raises(error.BadInputError) as excinfo: 75 | some_fn([[np.nan, 2.], [3, 4]]) 76 | assert str(excinfo.value) == "Not a np.ndarray." 77 | 78 | def test_nan_exists(): 79 | """ If no NaN, BadInputError raised""" 80 | with pytest.raises(error.BadInputError) as excinfo: 81 | some_fn(np.array([[1.]])) 82 | assert str(excinfo.value) == "No NaN's in given data" 83 | 84 | @wrapper.conform_output 85 | def conform_output_dummy(x): 86 | return x 87 | 88 | def is_between_0_1(x): 89 | return 0 <= x <= 1 90 | 91 | def coerce_between_0_1(arr, x_i, y_i): 92 | val = arr[x_i, y_i] 93 | if val < 0: 94 | return 0 95 | elif val > 1: 96 | return 1 97 | else: 98 | return x 99 | 100 | def test_conform_output_not_used(): 101 | """ If neither args passed, don't do anything""" 102 | assert "some input" == conform_output_dummy("some input") 103 | 104 | def test_conform_output_valid_coerce(): 105 | """ Check value valid and coerce invalid values""" 106 | arr = np.array([[1.1, 0.5], [0.2, -1]]) 107 | actual = conform_output_dummy( 108 | arr, 109 | valid_fn=is_between_0_1, 110 | coerce_fn=coerce_between_0_1, 111 | ) 112 | expected = np.array([[1.0, 0.5], [0.2, 0.0]]) 113 | assert np.array_equal(expected, actual) 114 | 115 | def test_conform_output_coerce(): 116 | """ Coerce function doesn't run if no valid_fn passed """ 117 | arr = np.array([[1.1, 0.5], [0.2, -1]]) 118 | actual = conform_output_dummy( 119 | arr, 120 | coerce_fn=coerce_between_0_1, 121 | ) 122 | expected = np.array([[1.1, 0.5], [0.2, -1]]) 123 | assert np.array_equal(expected, actual) 124 | 125 | def test_conform_output_valid(): 126 | """ No coerce_fn with valid_fn will raise BadOutputeError if invalid values 127 | encountered. First invalid value is 1.1 128 | """ 129 | arr = np.array([[1.1, 0.5], [0.2, -1]]) 130 | with pytest.raises(error.BadOutputError) as excinfo: 131 | conform_output_dummy(arr, valid_fn=is_between_0_1) 132 | assert str(excinfo.value) == "1.1 does not conform" 133 | --------------------------------------------------------------------------------