├── .circleci └── config.yml ├── .coveragerc ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── appveyor.yml ├── azure-pipelines.yml ├── doc ├── CategoricalColumnTransformer_intro.ipynb ├── Makefile ├── _static │ ├── css │ │ └── project-template.css │ └── js │ │ └── copybutton.js ├── _templates │ ├── class.rst │ ├── function.rst │ └── numpydoc_docstring.py ├── api.rst ├── categorical_column_transformer_example.ipynb ├── conf.py ├── document_vectorization.ipynb ├── index.rst ├── information_weight_transform.ipynb ├── make.bat ├── quick_start.rst ├── requirements.txt ├── sequence_taxonomy.ipynb ├── token_cooccurrence_vectorizer_multi_labelled_cyber_example.ipynb ├── user_guide.rst ├── vectorizers_logo_no_text.png └── vectorizers_logo_text.png ├── environment.yml ├── examples ├── README.txt └── SignatureVectorizer_Examples_1.ipynb ├── requirements.txt ├── setup.cfg ├── setup.py └── vectorizers ├── __init__.py ├── _vectorizers.py ├── _version.py ├── _window_kernels.py ├── base_cooccurrence_vectorizer.py ├── coo_utils.py ├── distances.py ├── distribution_vectorizer.py ├── edge_list_vectorizer.py ├── kde_vectorizer.py ├── linear_optimal_transport.py ├── mixed_gram_vectorizer.py ├── multi_token_cooccurence_vectorizer.py ├── ngram_token_cooccurence_vectorizer.py ├── ngram_vectorizer.py ├── preprocessing.py ├── signature_vectorizer.py ├── skip_gram_vectorizer.py ├── tests ├── __init__.py ├── test_bpe.py ├── test_common.py ├── test_distances.py ├── test_edge_list_vectorizer.py ├── test_signature_vectorizer.py ├── test_template.py └── test_transformers.py ├── timed_token_cooccurrence_vectorizer.py ├── token_cooccurrence_vectorizer.py ├── transformers ├── __init__.py ├── categorical_columns.py ├── count_feature_compression.py ├── info_weight.py ├── row_desnoise.py └── sliding_windows.py ├── tree_token_cooccurrence.py └── utils.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | jobs: 4 | python3: 5 | docker: 6 | - image: circleci/python:3.6.1 7 | steps: 8 | - checkout 9 | - run: 10 | command: | 11 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 12 | chmod +x miniconda.sh && ./miniconda.sh -b -p ~/miniconda 13 | export PATH="~/miniconda/bin:$PATH" 14 | conda update --yes --quiet conda 15 | conda create -n testenv --yes --quiet python=3 16 | source activate testenv 17 | conda install --yes pip numpy scipy scikit-learn pandas numba matplotlib sphinx sphinx_rtd_theme numpydoc pillow dask pandoc 18 | pip install pynndescent 19 | pip install sphinx-gallery 20 | pip install nbsphinx 21 | pip install . 22 | cd doc 23 | make html 24 | - store_artifacts: 25 | path: doc/_build/html 26 | destination: doc 27 | - store_artifacts: 28 | path: ~/log.txt 29 | - persist_to_workspace: 30 | root: doc/_build/html 31 | paths: . 
32 | - attach_workspace: 33 | at: doc/_build/html 34 | - run: ls -ltrh doc/_build/html 35 | filters: 36 | branches: 37 | ignore: gh-pages 38 | 39 | workflows: 40 | version: 2 41 | build-doc-and-deploy: 42 | jobs: 43 | - python3 44 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # Configuration for coverage.py 2 | 3 | [run] 4 | branch = True 5 | source = vectorizers 6 | include = */vectorizers/* 7 | omit = 8 | */setup.py 9 | 10 | [report] 11 | exclude_lines = 12 | pragma: no cover 13 | def __repr__ 14 | if self.debug: 15 | if settings.DEBUG 16 | raise AssertionError 17 | raise NotImplementedError 18 | if 0: 19 | if __name__ == .__main__.: 20 | if self.verbose: 21 | show_missing = True -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # scikit-learn specific 10 | doc/_build/ 11 | doc/auto_examples/ 12 | doc/modules/generated/ 13 | doc/datasets/generated/ 14 | 15 | # Distribution / packaging 16 | 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *,cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | 62 | # Sphinx documentation 63 | doc/_build/ 64 | doc/generated/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter artifacts 70 | .ipynb_checkpoints 71 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: doc/conf.py 17 | 18 | # Optional but recommended, declare the Python requirements required 19 | # to build your documentation 20 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 21 | python: 22 | install: 23 | - requirements: doc/requirements.txt 24 | - method: pip 25 | path: . 
26 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | sudo: false 3 | 4 | language: python 5 | 6 | cache: 7 | directories: 8 | - $HOME/.cache/pip 9 | 10 | matrix: 11 | include: 12 | - env: PYTHON_VERSION="3.7" NUMPY_VERSION="1.16.6" SCIPY_VERSION="1.4.1" 13 | SKLEARN_VERSION="0.20.3" 14 | - env: PYTHON_VERSION="3.8" NUMPY_VERSION="*" SCIPY_VERSION="*" 15 | SKLEARN_VERSION="*" 16 | - env: PYTHON_VERSION="3.8" NUMPY_VERSION="*" SCIPY_VERSION="*" 17 | SKLEARN_VERSION="*" COVERAGE="true" 18 | 19 | install: 20 | # install miniconda 21 | - deactivate 22 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 23 | - MINICONDA_PATH=/home/travis/miniconda 24 | - chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH 25 | - export PATH=$MINICONDA_PATH/bin:$PATH 26 | - conda update --yes conda 27 | # create the testing environment 28 | - conda create -n testenv --yes python=$PYTHON_VERSION pip 29 | - source activate testenv 30 | - | 31 | if [ $SKLEARN_VERSION = "nightly" ]; then 32 | conda install --yes numpy==$NUMPY_VERSION scipy==$SCIPY_VERSION cython nose pytest pytest-cov dask 33 | # install nightly wheels 34 | pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn 35 | else 36 | conda install --yes numpy==$NUMPY_VERSION scipy==$SCIPY_VERSION scikit-learn==$SKLEARN_VERSION cython nose pytest pytest-cov dask 37 | fi 38 | - conda install --yes pandas numba 39 | - pip install pynndescent 40 | - pip install codecov 41 | - pip install coverage 42 | - pip install coveralls 43 | - pip install . 44 | 45 | script: 46 | - | 47 | if [ "$COVERAGE" = "true" ]; then 48 | # disable numba for coverage run 49 | export NUMBA_DISABLE_JIT=1 50 | coverage run -m pytest -v --pyargs vectorizers -k tests 51 | else 52 | pytest -v --pyargs vectorizers 53 | fi 54 | after_success: 55 | - | 56 | if [ "$COVERAGE" = "true" ]; then 57 | codecov 58 | coveralls 59 | fi -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, John Healy, Leland McInnes, Colin Weir and Vectorizers contributors 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of project-template nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | .. image:: doc/vectorizers_logo_text.png 4 | :width: 600 5 | :alt: Vectorizers Logo 6 | 7 | |Travis|_ |AppVeyor|_ |Codecov|_ |CircleCI|_ |ReadTheDocs|_ 8 | 9 | .. |Travis| image:: https://travis-ci.com/TutteInstitute/vectorizers.svg?branch=master 10 | .. _Travis: https://travis-ci.com/TutteInstitute/vectorizers 11 | 12 | .. |AppVeyor| image:: https://ci.appveyor.com/api/projects/status/sjawsgwo7g4k3jon?svg=true 13 | .. _AppVeyor: https://ci.appveyor.com/project/lmcinnes/vectorizers 14 | 15 | .. |Codecov| image:: https://codecov.io/gh/TutteInstitute/vectorizers/branch/master/graph/badge.svg 16 | .. _Codecov: https://codecov.io/gh/TutteInstitute/vectorizers 17 | 18 | 19 | .. |CircleCI| image:: https://circleci.com/gh/TutteInstitute/vectorizers.svg?style=shield&circle-token=:circle-token 20 | .. _CircleCI: https://circleci.com/gh/scikit-learn-contrib/project-template/tree/master 21 | 22 | .. |ReadTheDocs| image:: https://readthedocs.org/projects/vectorizers/badge/?version=latest 23 | .. _ReadTheDocs: https://vectorizers.readthedocs.io/en/latest/?badge=latest 24 | 25 | =========== 26 | Vectorizers 27 | =========== 28 | 29 | There are a large number of machine learning tools for effectively exploring and working 30 | with data that is given as vectors (ideally with a defined notion of distance as well). 31 | There is also a large volume of data that does not come neatly packaged as vectors. It 32 | could be text data, variable length sequence data (either numeric or categorical), 33 | dataframes of mixed data types, sets of point clouds, or more. Usually, one way or another, 34 | such data can be wrangled into vectors in a way that preserves some relevant properties 35 | of the original data. This library seeks to provide a suite of a wide variety of 36 | general purpose techniques for such wrangling, making it easier and faster for users 37 | to get various kinds of unstructured sequence data into vector formats for exploration and 38 | machine learning. 39 | 40 | -------------------- 41 | Why use Vectorizers? 42 | -------------------- 43 | 44 | Data wrangling can be tedious, error-prone, and fragile when trying to integrate it into 45 | production pipelines. The vectorizers library aims to provide a set of easy to use 46 | tools for turning various kinds of unstructured sequence data into vectors. By following the 47 | scikit-learn transformer API we ensure that any of the vectorizer classes can be 48 | trivially integrated into existing sklearn workflows or pipelines. 
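For instance, a vectorizer can sit at the front of an ordinary pipeline. The following is a
minimal sketch rather than an excerpt from the library's own documentation: ``token_sequences``
is a hypothetical iterable of token lists, and ``TruncatedSVD`` is simply a stand-in for
whatever downstream scikit-learn estimator you would normally use.

.. code:: python3

    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import make_pipeline
    from vectorizers import NgramVectorizer

    # token_sequences: an iterable of token lists, e.g. [["a", "b", "a"], ["b", "c"]]
    pipeline = make_pipeline(
        NgramVectorizer(ngram_size=2),   # token sequences -> ngram count vectors
        TruncatedSVD(n_components=50),   # any downstream estimator could go here
    )
    # low_dim_vectors = pipeline.fit_transform(token_sequences)
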
By keeping the
49 | vectorization approaches as general as possible (as opposed to specialising on very
50 | specific data types), we aim to ensure that a very broad range of data can be handled
51 | efficiently. Finally, we aim to provide robust techniques with sound mathematical foundations,
52 | favouring them over potentially more powerful but black-box approaches, for greater transparency
53 | in data processing and transformation.
54 | 
55 | ----------------------
56 | How to use Vectorizers
57 | ----------------------
58 | 
59 | Quick start examples to be added soon ...
60 | 
61 | For further examples on using this library for text we recommend checking out the documentation
62 | written up in the EasyData reproducible data science framework by some of our colleagues over at:
63 | https://github.com/hackalog/vectorizers_playground
64 | 
65 | ----------
66 | Installing
67 | ----------
68 | 
69 | Vectorizers is designed to be easy to install, being a pure Python module with
70 | relatively light requirements:
71 | 
72 | * numpy
73 | * scipy
74 | * scikit-learn >= 0.22
75 | * numba >= 0.51
76 | 
77 | To install the package from PyPI:
78 | 
79 | .. code:: bash
80 | 
81 |     pip install vectorizers
82 | 
83 | To install the package from source:
84 | 
85 | .. code:: bash
86 | 
87 |     pip install https://github.com/TutteInstitute/vectorizers/archive/master.zip
88 | 
89 | ----------------
90 | Help and Support
91 | ----------------
92 | 
93 | This project is still young. The `documentation `_ is still growing. In the meantime, please
94 | `open an issue `_
95 | and we will try to provide any help and guidance that we can. Please also check
96 | the docstrings on the code, which provide some descriptions of the parameters.
97 | 
98 | ------------
99 | Contributing
100 | ------------
101 | 
102 | Contributions are more than welcome! There are lots of opportunities
103 | for potential projects, so please get in touch if you would like to
104 | help out. Everything from code to notebooks to
105 | examples and documentation is *equally valuable*, so please don't feel
106 | you can't contribute. We would greatly appreciate the contribution of
107 | tutorial notebooks applying vectorizer tools to diverse or interesting
108 | datasets. If you find vectorizers useful for your data, please consider
109 | contributing an example showing how it can apply to the kind of data
110 | you work with!
111 | 
112 | 
113 | To contribute, please `fork the project `_, make your changes, and
114 | submit a pull request. We will do our best to work through any issues with
115 | you and get your code merged into the main branch.
116 | 
117 | -------
118 | License
119 | -------
120 | 
121 | The vectorizers package is 3-clause BSD licensed.
122 | 123 | 124 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | build: false 2 | 3 | environment: 4 | matrix: 5 | - PYTHON: "C:\\Miniconda3-x64" 6 | PYTHON_VERSION: "3.7.x" 7 | PYTHON_ARCH: "32" 8 | NUMPY_VERSION: "1.16.6" 9 | SCIPY_VERSION: "1.4.1" 10 | SKLEARN_VERSION: "0.22.1" 11 | COVERAGE: 0 12 | 13 | - PYTHON: "C:\\Miniconda3-x64" 14 | PYTHON_VERSION: "3.8.x" 15 | PYTHON_ARCH: "64" 16 | NUMPY_VERSION: "*" 17 | SCIPY_VERSION: "*" 18 | SKLEARN_VERSION: "*" 19 | COVERAGE: 0 20 | 21 | - PYTHON: "C:\\Miniconda3-x64" 22 | PYTHON_VERSION: "3.8.x" 23 | PYTHON_ARCH: "64" 24 | NUMPY_VERSION: "*" 25 | SCIPY_VERSION: "*" 26 | SKLEARN_VERSION: "*" 27 | COVERAGE: 1 28 | 29 | install: 30 | # Prepend miniconda installed Python to the PATH of this build 31 | # Add Library/bin directory to fix issue 32 | # https://github.com/conda/conda/issues/1753 33 | - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PYTHON%\\Library\\bin;%PATH%" 34 | # install the dependencies 35 | - "conda install --yes pip numpy==%NUMPY_VERSION% scipy==%SCIPY_VERSION% scikit-learn==%SKLEARN_VERSION% nose pytest pytest-cov" 36 | - conda install --yes numba pandas dask pomegranate 37 | - pip install pynndescent 38 | - pip install iisignature 39 | - pip install codecov 40 | - pip install . 41 | 42 | test_script: 43 | - mkdir for_test 44 | - cd for_test 45 | - IF %COVERAGE%==1 set NUMBA_DISABLE_JIT=1 46 | - pytest -v --cov=vectorizers --pyargs vectorizers 47 | 48 | after_test: 49 | - cp .coverage %APPVEYOR_BUILD_FOLDER% 50 | - cd %APPVEYOR_BUILD_FOLDER% 51 | - IF %COVERAGE%==1 codecov 52 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Trigger a build when there is a push to the main branch or a tag starts with release- 2 | trigger: 3 | branches: 4 | include: 5 | - main 6 | - master 7 | tags: 8 | include: 9 | - release-* 10 | 11 | # Trigger a build when there is a pull request to the main branch 12 | # Ignore PRs that are just updating the docs 13 | pr: 14 | branches: 15 | include: 16 | - main 17 | - master 18 | exclude: 19 | - doc/* 20 | - README.rst 21 | 22 | parameters: 23 | - name: includeReleaseCandidates 24 | displayName: "Allow pre-release dependencies" 25 | type: boolean 26 | default: false 27 | 28 | variables: 29 | triggeredByPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')] 30 | 31 | stages: 32 | - stage: RunAllTests 33 | displayName: Run test suite 34 | jobs: 35 | - job: run_platform_tests 36 | strategy: 37 | matrix: 38 | mac_py39: 39 | imageName: 'macOS-latest' 40 | python.version: '3.9' 41 | linux_py39: 42 | imageName: 'ubuntu-latest' 43 | python.version: '3.9' 44 | windows_py39: 45 | imageName: 'windows-latest' 46 | python.version: '3.9' 47 | mac_py310: 48 | imageName: 'macOS-latest' 49 | python.version: '3.10' 50 | linux_py310: 51 | imageName: 'ubuntu-latest' 52 | python.version: '3.10' 53 | windows_py310: 54 | imageName: 'windows-latest' 55 | python.version: '3.10' 56 | mac_py311: 57 | imageName: 'macOS-latest' 58 | python.version: '3.11' 59 | linux_py311: 60 | imageName: 'ubuntu-latest' 61 | python.version: '3.11' 62 | windows_py311: 63 | imageName: 'windows-latest' 64 | python.version: '3.11' 65 | mac_py312: 66 | imageName: 'macOS-latest' 67 | python.version: '3.12' 68 | linux_py312: 69 | imageName: 'ubuntu-latest' 70 | python.version: '3.12' 
71 | windows_py312: 72 | imageName: 'windows-latest' 73 | python.version: '3.12' 74 | pool: 75 | vmImage: $(imageName) 76 | 77 | steps: 78 | - task: UsePythonVersion@0 79 | inputs: 80 | versionSpec: '$(python.version)' 81 | displayName: 'Use Python $(python.version)' 82 | 83 | - script: | 84 | python -m pip install --upgrade pip 85 | displayName: 'Upgrade pip' 86 | 87 | - script: | 88 | pip install -r requirements.txt 89 | displayName: 'Install dependencies' 90 | condition: ${{ eq(parameters.includeReleaseCandidates, false) }} 91 | 92 | - script: | 93 | pip install --pre -r requirements.txt 94 | displayName: 'Install dependencies (allow pre-releases)' 95 | condition: ${{ eq(parameters.includeReleaseCandidates, true) }} 96 | 97 | - script: | 98 | pip install -e . 99 | pip install pytest pytest-azurepipelines 100 | pip install pytest-cov 101 | pip install coveralls 102 | displayName: 'Install package' 103 | 104 | - script: | 105 | pytest vectorizers/tests --show-capture=no -v --disable-warnings --junitxml=junit/test-results.xml --cov=vectorizers/ --cov-report=xml --cov-report=html 106 | displayName: 'Run tests' 107 | 108 | - bash: | 109 | coveralls 110 | displayName: 'Publish to coveralls' 111 | condition: and(succeeded(), eq(variables.triggeredByPullRequest, false)) # Don't run this for PRs because they can't access pipeline secrets 112 | env: 113 | COVERALLS_REPO_TOKEN: $(COVERALLS_TOKEN) 114 | 115 | - task: PublishTestResults@2 116 | inputs: 117 | testResultsFiles: '$(System.DefaultWorkingDirectory)/**/coverage.xml' 118 | testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)' 119 | condition: succeededOrFailed() 120 | 121 | - stage: BuildPublishArtifact 122 | dependsOn: RunAllTests 123 | condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/tags/release-'), eq(variables.triggeredByPullRequest, false)) 124 | jobs: 125 | - job: BuildArtifacts 126 | displayName: Build source dists and wheels 127 | pool: 128 | vmImage: 'ubuntu-latest' 129 | steps: 130 | - task: UsePythonVersion@0 131 | inputs: 132 | versionSpec: '3.10' 133 | displayName: 'Use Python 3.10' 134 | 135 | - script: | 136 | python -m pip install --upgrade pip 137 | pip install wheel 138 | pip install -r requirements.txt 139 | displayName: 'Install dependencies' 140 | 141 | - script: | 142 | pip install -e . 
143 | displayName: 'Install package locally' 144 | 145 | - script: | 146 | python setup.py sdist bdist_wheel 147 | displayName: 'Build package' 148 | 149 | - bash: | 150 | export PACKAGE_VERSION="$(python setup.py --version)" 151 | echo "Package Version: ${PACKAGE_VERSION}" 152 | echo "##vso[task.setvariable variable=packageVersionFormatted;]release-${PACKAGE_VERSION}" 153 | displayName: 'Get package version' 154 | 155 | - script: | 156 | echo "Version in git tag $(Build.SourceBranchName) does not match version derived from setup.py $(packageVersionFormatted)" 157 | exit 1 158 | displayName: Raise error if version doesnt match tag 159 | condition: and(succeeded(), ne(variables['Build.SourceBranchName'], variables['packageVersionFormatted'])) 160 | 161 | - task: DownloadSecureFile@1 162 | name: PYPIRC_CONFIG 163 | displayName: 'Download pypirc' 164 | inputs: 165 | secureFile: 'pypirc' 166 | 167 | - script: | 168 | pip install twine 169 | twine upload --repository pypi --config-file $(PYPIRC_CONFIG.secureFilePath) dist/* 170 | displayName: 'Upload to PyPI' 171 | condition: and(succeeded(), eq(variables['Build.SourceBranchName'], variables['packageVersionFormatted'])) 172 | 173 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | -rm -rf $(BUILDDIR)/* 51 | -rm -rf auto_examples/ 52 | -rm -rf generated/* 53 | -rm -rf modules/generated/* 54 | 55 | html: 56 | # These two lines make the build a bit more lengthy, and the 57 | # the embedding of images more robust 58 | rm -rf $(BUILDDIR)/html/_images 59 | #rm -rf _build/doctrees/ 60 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 63 | 64 | dirhtml: 65 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 66 | @echo 67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 68 | 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | pickle: 75 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 76 | @echo 77 | @echo "Build finished; now you can process the pickle files." 78 | 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | htmlhelp: 85 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 86 | @echo 87 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 88 | ".hhp project file in $(BUILDDIR)/htmlhelp." 89 | 90 | qthelp: 91 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 92 | @echo 93 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 94 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 95 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/project-template.qhcp" 96 | @echo "To view the help file:" 97 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/project-template.qhc" 98 | 99 | devhelp: 100 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 101 | @echo 102 | @echo "Build finished." 
103 | @echo "To view the help file:" 104 | @echo "# mkdir -p $$HOME/.local/share/devhelp/project-template" 105 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/project-template" 106 | @echo "# devhelp" 107 | 108 | epub: 109 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 110 | @echo 111 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 112 | 113 | latex: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo 116 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 117 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 118 | "(use \`make latexpdf' here to do that automatically)." 119 | 120 | latexpdf: 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | @echo "Running LaTeX files through pdflatex..." 123 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 124 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 125 | 126 | latexpdfja: 127 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 128 | @echo "Running LaTeX files through platex and dvipdfmx..." 129 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 130 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 131 | 132 | text: 133 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 134 | @echo 135 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 136 | 137 | man: 138 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 139 | @echo 140 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 141 | 142 | texinfo: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo 145 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 146 | @echo "Run \`make' in that directory to run these through makeinfo" \ 147 | "(use \`make info' here to do that automatically)." 148 | 149 | info: 150 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 151 | @echo "Running Texinfo files through makeinfo..." 152 | make -C $(BUILDDIR)/texinfo info 153 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 154 | 155 | gettext: 156 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 157 | @echo 158 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 159 | 160 | changes: 161 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 162 | @echo 163 | @echo "The overview file is in $(BUILDDIR)/changes." 164 | 165 | linkcheck: 166 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 167 | @echo 168 | @echo "Link check complete; look for any errors in the above output " \ 169 | "or in $(BUILDDIR)/linkcheck/output.txt." 170 | 171 | doctest: 172 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 173 | @echo "Testing of doctests in the sources finished, look at the " \ 174 | "results in $(BUILDDIR)/doctest/output.txt." 175 | 176 | xml: 177 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 178 | @echo 179 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 180 | 181 | pseudoxml: 182 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 183 | @echo 184 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
185 | -------------------------------------------------------------------------------- /doc/_static/css/project-template.css: -------------------------------------------------------------------------------- 1 | @import url("theme.css"); 2 | 3 | .highlight a { 4 | text-decoration: underline; 5 | } 6 | 7 | .deprecated p { 8 | padding: 10px 7px 10px 10px; 9 | color: #b94a48; 10 | background-color: #F3E5E5; 11 | border: 1px solid #eed3d7; 12 | } 13 | 14 | .deprecated p span.versionmodified { 15 | font-weight: bold; 16 | } 17 | -------------------------------------------------------------------------------- /doc/_static/js/copybutton.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | /* Add a [>>>] button on the top-right corner of code samples to hide 3 | * the >>> and ... prompts and the output and thus make the code 4 | * copyable. */ 5 | var div = $('.highlight-python .highlight,' + 6 | '.highlight-python3 .highlight,' + 7 | '.highlight-pycon .highlight,' + 8 | '.highlight-default .highlight') 9 | var pre = div.find('pre'); 10 | 11 | // get the styles from the current theme 12 | pre.parent().parent().css('position', 'relative'); 13 | var hide_text = 'Hide the prompts and output'; 14 | var show_text = 'Show the prompts and output'; 15 | var border_width = pre.css('border-top-width'); 16 | var border_style = pre.css('border-top-style'); 17 | var border_color = pre.css('border-top-color'); 18 | var button_styles = { 19 | 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', 20 | 'border-color': border_color, 'border-style': border_style, 21 | 'border-width': border_width, 'color': border_color, 'text-size': '75%', 22 | 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '0.2em', 23 | 'border-radius': '0 3px 0 0' 24 | } 25 | 26 | // create and add the button to all the code blocks that contain >>> 27 | div.each(function(index) { 28 | var jthis = $(this); 29 | if (jthis.find('.gp').length > 0) { 30 | var button = $('>>>'); 31 | button.css(button_styles) 32 | button.attr('title', hide_text); 33 | button.data('hidden', 'false'); 34 | jthis.prepend(button); 35 | } 36 | // tracebacks (.gt) contain bare text elements that need to be 37 | // wrapped in a span to work with .nextUntil() (see later) 38 | jthis.find('pre:has(.gt)').contents().filter(function() { 39 | return ((this.nodeType == 3) && (this.data.trim().length > 0)); 40 | }).wrap(''); 41 | }); 42 | 43 | // define the behavior of the button when it's clicked 44 | $('.copybutton').click(function(e){ 45 | e.preventDefault(); 46 | var button = $(this); 47 | if (button.data('hidden') === 'false') { 48 | // hide the code output 49 | button.parent().find('.go, .gp, .gt').hide(); 50 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); 51 | button.css('text-decoration', 'line-through'); 52 | button.attr('title', show_text); 53 | button.data('hidden', 'true'); 54 | } else { 55 | // show the code output 56 | button.parent().find('.go, .gp, .gt').show(); 57 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); 58 | button.css('text-decoration', 'none'); 59 | button.attr('title', hide_text); 60 | button.data('hidden', 'false'); 61 | } 62 | }); 63 | }); 64 | -------------------------------------------------------------------------------- /doc/_templates/class.rst: -------------------------------------------------------------------------------- 1 | :mod:`{{module}}`.{{objname}} 2 | {{ 
underline }}============== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | 8 | {% block methods %} 9 | .. automethod:: __init__ 10 | {% endblock %} 11 | 12 | .. include:: {{module}}.{{objname}}.examples 13 | 14 | .. raw:: html 15 | 16 |
17 | -------------------------------------------------------------------------------- /doc/_templates/function.rst: -------------------------------------------------------------------------------- 1 | :mod:`{{module}}`.{{objname}} 2 | {{ underline }}==================== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autofunction:: {{ objname }} 7 | 8 | .. include:: {{module}}.{{objname}}.examples 9 | 10 | .. raw:: html 11 | 12 |
13 | -------------------------------------------------------------------------------- /doc/_templates/numpydoc_docstring.py: -------------------------------------------------------------------------------- 1 | {{index}} 2 | {{summary}} 3 | {{extended_summary}} 4 | {{parameters}} 5 | {{returns}} 6 | {{yields}} 7 | {{other_parameters}} 8 | {{attributes}} 9 | {{raises}} 10 | {{warns}} 11 | {{warnings}} 12 | {{see_also}} 13 | {{notes}} 14 | {{references}} 15 | {{examples}} 16 | {{methods}} 17 | -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | ############### 2 | Vectorizers API 3 | ############### 4 | 5 | Ngram and Skipgram Vectorizer 6 | =============== 7 | 8 | .. autosummary:: 9 | :toctree: generated/ 10 | :template: class.rst 11 | 12 | NgramVectorizer 13 | SkipgramVectorizer 14 | LZCompressionVectorizer 15 | BytePairEncodingVectorizer 16 | 17 | TokenCooccurrenceVectorizers 18 | =========================== 19 | 20 | .. autosummary:: 21 | :toctree: generated/ 22 | :template: class.rst 23 | 24 | TokenCooccurrenceVectorizer 25 | MultiSetCooccurrenceVectorizer 26 | TimedTokenCooccurrenceVectorizer 27 | LabelledTreeCooccurrenceVectorizer 28 | 29 | Wasserstein style Vectorizers 30 | ============================= 31 | 32 | .. autosummary:: 33 | :toctree: generated/ 34 | :template: class.rst 35 | 36 | WassersteinVectorizer 37 | SinkhornVectorizer 38 | ApproximateWassersteinVectorizer 39 | 40 | Utility Vectorizers and Transformers 41 | ==================================== 42 | 43 | .. autosummary:: 44 | :toctree: generated/ 45 | :template: class.rst 46 | 47 | EdgeListVectorizer 48 | CategoricalColumnTransformer 49 | InformationWeightTransformer 50 | RowDenoisingTransformer 51 | CountFeatureCompressionTransformer 52 | 53 | Time Series Vectorizers and Transformers 54 | ======================================== 55 | 56 | .. autosummary:: 57 | :toctree: generated/ 58 | :template: class.rst 59 | 60 | HistogramVectorizer 61 | KDEVectorizer 62 | SlidingWindowTransformer 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # project-template documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jan 18 14:44:12 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | import sphinx_gallery 19 | import sphinx_rtd_theme 20 | 21 | # If extensions (or modules to document with autodoc) are in another directory, 22 | # add these directories to sys.path here. If the directory is relative to the 23 | # documentation root, use os.path.abspath to make it absolute, like shown here. 24 | #sys.path.insert(0, os.path.abspath('.')) 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | #needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. 
They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.autosummary', 37 | 'sphinx.ext.doctest', 38 | 'sphinx.ext.intersphinx', 39 | 'sphinx.ext.viewcode', 40 | 'numpydoc', 41 | 'nbsphinx', 42 | 'sphinx_gallery.gen_gallery', 43 | ] 44 | 45 | # this is needed for some reason... 46 | # see https://github.com/numpy/numpydoc/issues/69 47 | numpydoc_show_class_members = False 48 | 49 | # pngmath / imgmath compatibility layer for different sphinx versions 50 | import sphinx 51 | from distutils.version import LooseVersion 52 | if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): 53 | extensions.append('sphinx.ext.pngmath') 54 | else: 55 | extensions.append('sphinx.ext.imgmath') 56 | 57 | autodoc_default_flags = ['members', 'inherited-members'] 58 | 59 | # Add any paths that contain templates here, relative to this directory. 60 | templates_path = ['_templates'] 61 | 62 | # generate autosummary even if no references 63 | autosummary_generate = True 64 | 65 | # The suffix of source filenames. 66 | source_suffix = '.rst' 67 | 68 | # The encoding of source files. 69 | #source_encoding = 'utf-8-sig' 70 | 71 | # Generate the plots for the gallery 72 | plot_gallery = True 73 | 74 | # The master toctree document. 75 | master_doc = 'index' 76 | 77 | # General information about the project. 78 | project = u'vectorizers' 79 | copyright = u'2022, Benoit Hamelin, John Healy, Leland McInnes, Colin Weir' 80 | 81 | # The version info for the project you're documenting, acts as replacement for 82 | # |version| and |release|, also used in various other places throughout the 83 | # built documents. 84 | # 85 | # The short X.Y version. 86 | from vectorizers import __version__ 87 | version = __version__ 88 | # The full version, including alpha/beta/rc tags. 89 | release = __version__ 90 | 91 | # The language for content autogenerated by Sphinx. Refer to documentation 92 | # for a list of supported languages. 93 | #language = None 94 | 95 | # There are two options for replacing |today|: either, you set today to some 96 | # non-false value, then it is used: 97 | #today = '' 98 | # Else, today_fmt is used as the format for a strftime call. 99 | #today_fmt = '%B %d, %Y' 100 | 101 | # List of patterns, relative to source directory, that match files and 102 | # directories to ignore when looking for source files. 103 | exclude_patterns = ['_build', '_templates'] 104 | 105 | # The reST default role (used for this markup: `text`) to use for all 106 | # documents. 107 | #default_role = None 108 | 109 | # If true, '()' will be appended to :func: etc. cross-reference text. 110 | #add_function_parentheses = True 111 | 112 | # If true, the current module name will be prepended to all description 113 | # unit titles (such as .. function::). 114 | #add_module_names = True 115 | 116 | # If true, sectionauthor and moduleauthor directives will be shown in the 117 | # output. They are ignored by default. 118 | #show_authors = False 119 | 120 | # The name of the Pygments (syntax highlighting) style to use. 121 | pygments_style = 'sphinx' 122 | 123 | # Custom style 124 | html_style = 'css/project-template.css' 125 | 126 | # A list of ignored prefixes for module index sorting. 127 | #modindex_common_prefix = [] 128 | 129 | # If true, keep warnings as "system message" paragraphs in the built documents. 
130 | #keep_warnings = False 131 | 132 | 133 | # -- Options for HTML output ---------------------------------------------- 134 | 135 | # The theme to use for HTML and HTML Help pages. See the documentation for 136 | # a list of builtin themes. 137 | html_theme = 'sphinx_rtd_theme' 138 | 139 | # Theme options are theme-specific and customize the look and feel of a theme 140 | # further. For a list of options available for each theme, see the 141 | # documentation. 142 | #html_theme_options = {} 143 | 144 | # Add any paths that contain custom themes here, relative to this directory. 145 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 146 | 147 | # The name for this set of Sphinx documents. If None, it defaults to 148 | # " v documentation". 149 | #html_title = None 150 | 151 | # A shorter title for the navigation bar. Default is the same as html_title. 152 | #html_short_title = None 153 | 154 | # The name of an image file (relative to this directory) to place at the top 155 | # of the sidebar. 156 | html_theme_options = {"navigation_depth": 3, "logo_only": True} 157 | 158 | html_logo = "vectorizers_logo_no_text.png" 159 | 160 | 161 | # The name of an image file (within the static path) to use as favicon of the 162 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 163 | # pixels large. 164 | #html_favicon = None 165 | 166 | # Add any paths that contain custom static files (such as style sheets) here, 167 | # relative to this directory. They are copied after the builtin static files, 168 | # so a file named "default.css" will overwrite the builtin "default.css". 169 | html_static_path = ['_static'] 170 | 171 | # Add any extra paths that contain custom files (such as robots.txt or 172 | # .htaccess) here, relative to this directory. These files are copied 173 | # directly to the root of the documentation. 174 | #html_extra_path = [] 175 | 176 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 177 | # using the given strftime format. 178 | #html_last_updated_fmt = '%b %d, %Y' 179 | 180 | # If true, SmartyPants will be used to convert quotes and dashes to 181 | # typographically correct entities. 182 | #html_use_smartypants = True 183 | 184 | # Custom sidebar templates, maps document names to template names. 185 | #html_sidebars = {} 186 | 187 | # Additional templates that should be rendered to pages, maps page names to 188 | # template names. 189 | #html_additional_pages = {} 190 | 191 | # If false, no module index is generated. 192 | #html_domain_indices = True 193 | 194 | # If false, no index is generated. 195 | #html_use_index = True 196 | 197 | # If true, the index is split into individual pages for each letter. 198 | #html_split_index = False 199 | 200 | # If true, links to the reST sources are added to the pages. 201 | #html_show_sourcelink = True 202 | 203 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 204 | #html_show_sphinx = True 205 | 206 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 207 | #html_show_copyright = True 208 | 209 | # If true, an OpenSearch description file will be output, and all pages will 210 | # contain a tag referring to it. The value of this option must be the 211 | # base URL from which the finished HTML is served. 212 | #html_use_opensearch = '' 213 | 214 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 215 | #html_file_suffix = None 216 | 217 | # Output file base name for HTML help builder. 
218 | htmlhelp_basename = 'project-templatedoc' 219 | 220 | 221 | # -- Options for LaTeX output --------------------------------------------- 222 | 223 | latex_elements = { 224 | # The paper size ('letterpaper' or 'a4paper'). 225 | #'papersize': 'letterpaper', 226 | 227 | # The font size ('10pt', '11pt' or '12pt'). 228 | #'pointsize': '10pt', 229 | 230 | # Additional stuff for the LaTeX preamble. 231 | #'preamble': '', 232 | } 233 | 234 | # Grouping the document tree into LaTeX files. List of tuples 235 | # (source start file, target name, title, 236 | # author, documentclass [howto, manual, or own class]). 237 | latex_documents = [ 238 | ('index', 'project-template.tex', u'project-template Documentation', 239 | u'Vighnesh Birodkar', 'manual'), 240 | ] 241 | 242 | # The name of an image file (relative to this directory) to place at the top of 243 | # the title page. 244 | #latex_logo = None 245 | 246 | # For "manual" documents, if this is true, then toplevel headings are parts, 247 | # not chapters. 248 | #latex_use_parts = False 249 | 250 | # If true, show page references after internal links. 251 | #latex_show_pagerefs = False 252 | 253 | # If true, show URL addresses after external links. 254 | #latex_show_urls = False 255 | 256 | # Documents to append as an appendix to all manuals. 257 | #latex_appendices = [] 258 | 259 | # If false, no module index is generated. 260 | #latex_domain_indices = True 261 | 262 | 263 | # -- Options for manual page output --------------------------------------- 264 | 265 | # One entry per manual page. List of tuples 266 | # (source start file, name, description, authors, manual section). 267 | man_pages = [ 268 | ('index', 'project-template', u'project-template Documentation', 269 | [u'Vighnesh Birodkar'], 1) 270 | ] 271 | 272 | # If true, show URL addresses after external links. 273 | #man_show_urls = False 274 | 275 | 276 | # -- Options for Texinfo output ------------------------------------------- 277 | 278 | # Grouping the document tree into Texinfo files. List of tuples 279 | # (source start file, target name, title, author, 280 | # dir menu entry, description, category) 281 | texinfo_documents = [ 282 | ('index', 'project-template', u'project-template Documentation', 283 | u'Vighnesh Birodkar', 'project-template', 'One line description of project.', 284 | 'Miscellaneous'), 285 | ] 286 | 287 | # Documents to append as an appendix to all manuals. 288 | #texinfo_appendices = [] 289 | 290 | # If false, no module index is generated. 291 | #texinfo_domain_indices = True 292 | 293 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 294 | #texinfo_show_urls = 'footnote' 295 | 296 | # If true, do not generate a @detailmenu in the "Top" node's menu. 297 | #texinfo_no_detailmenu = False 298 | 299 | 300 | # Example configuration for intersphinx: refer to the Python standard library. 
301 | # intersphinx configuration 302 | intersphinx_mapping = { 303 | 'python': ('https://docs.python.org/{.major}'.format( 304 | sys.version_info), None), 305 | 'numpy': ('https://docs.scipy.org/doc/numpy/', None), 306 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), 307 | 'matplotlib': ('https://matplotlib.org/', None), 308 | 'sklearn': ('http://scikit-learn.org/stable', None) 309 | } 310 | 311 | # sphinx-gallery configuration 312 | sphinx_gallery_conf = { 313 | 'doc_module': 'vectorizers', 314 | 'backreferences_dir': os.path.join('generated'), 315 | 'reference_url': { 316 | 'vectorizers': None} 317 | } 318 | 319 | def setup(app): 320 | # a copy button to copy snippet of code from the documentation 321 | # app.add_javascript('js/copybutton.js') 322 | pass -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. project-template documentation master file, created by 2 | sphinx-quickstart on Mon Jan 18 14:44:12 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. image:: vectorizers_logo_text.png 7 | :width: 600 8 | :alt: Vectorizers Logo 9 | 10 | ===================================================== 11 | Vectorizers: Transform unstructured data into vectors 12 | ===================================================== 13 | 14 | There are a large number of machine learning tools for effectively exploring and working 15 | with data that is given as vectors (ideally with a defined notion of distance as well). 16 | There is also a large volume of data that does not come neatly packaged as vectors. It 17 | could be text data, variable length sequence data (either numeric or categorical), 18 | dataframes of mixed data types, sets of point clouds, or more. Usually, one way or another, 19 | such data can be wrangled into vectors in a way that preserves some relevant properties 20 | of the original data. This library seeks to provide a suite of a wide variety of 21 | general purpose techniques for such wrangling, making it easier and faster for users 22 | to get various kinds of unstructured sequence data into vector formats for exploration and 23 | machine learning. 24 | 25 | .. toctree:: 26 | :maxdepth: 2 27 | :caption: Taxonomy of sequences 28 | 29 | sequence_taxonomy 30 | 31 | .. toctree:: 32 | :maxdepth: 2 33 | :caption: Quick Start 34 | 35 | quick_start 36 | 37 | .. toctree:: 38 | :maxdepth: 2 39 | :caption: Getting Started Tutorials 40 | 41 | document_vectorization 42 | CategoricalColumnTransformer_intro 43 | 44 | .. toctree:: 45 | :maxdepth: 2 46 | :caption: Example Use Cases 47 | 48 | token_cooccurrence_vectorizer_multi_labelled_cyber_example 49 | categorical_column_transformer_example 50 | 51 | .. toctree:: 52 | :maxdepth: 2 53 | :caption: Understanding the Tools 54 | 55 | information_weight_transform 56 | 57 | .. toctree:: 58 | :maxdepth: 2 59 | :caption: API Reference: 60 | 61 | api 62 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\project-template.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\project-template.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /doc/quick_start.rst: -------------------------------------------------------------------------------- 1 | ############################ 2 | Quick Start with Vectorizers 3 | ############################ 4 | 5 | Vectorizers provides a number of tools for working with various kinds of 6 | unstructured data with a focus on sequence data. The library is built to be 7 | compatible with scikit-learn_ and can be used in scikit-learn pipelines. 8 | 9 | ---------- 10 | Installing 11 | ---------- 12 | 13 | Vectorizers can be installed via pip (coming soon) and via conda-forge (coming later). 14 | 15 | (Coming soon) 16 | .. code:: bash 17 | 18 | pip install vectorizers 19 | 20 | (Currently available) 21 | .. code:: bash 22 | 23 | pip install git+https://github.com/TutteInstitute/vectorizers.git 24 | 25 | To manually install this package: 26 | 27 | .. code:: bash 28 | 29 | wget https://github.com/TutteInstitute/vectorizers/archive/master.zip 30 | unzip master.zip 31 | rm master.zip 32 | cd vectorizers-master 33 | python setup.py install 34 | 35 | ----------- 36 | Basic Usage 37 | ----------- 38 | 39 | The vectorizers package provides a number of tools for vectorizing different kinds of 40 | input data. All of them are available as classes that follow scikit-learn's basic API 41 | for transformers, converting input data into vectors in one form or another. For example, 42 | to convert sequences of categorical data into ngram vector representations one might use: 43 | 44 | .. code:: python3 45 | 46 | import vectorizers 47 | 48 | ngrammer = vectorizers.NgramVectorizer(ngram_size=2) 49 | ngram_vectors = ngrammer.fit_transform(input_sequences) 50 | 51 | These classes can easily be fit into sklearn pipelines, passing vector 52 | representations on to other scikit-learn (or scikit-learn compatible) classes. See 53 | the `Vectorizers API`_ documentation for more details on the available classes. 54 | 55 | Vectorizers also provides a number of utility transformers in the ``vectorizers.transformers`` 56 | namespace. These provide convenience transformations of data -- either transforms on vectorized 57 | data, including feature weighting tools, or transformations of structured and unstructured data 58 | into sequences more amenable to other vectorizers classes.
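As a concrete sketch of the pipeline usage described above (purely illustrative: ``input_sequences`` is a placeholder for your own data and ``TruncatedSVD`` is just an arbitrary choice of downstream scikit-learn estimator):

.. code:: python3

    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import make_pipeline

    import vectorizers

    # NgramVectorizer turns each variable length sequence into a fixed width
    # (sparse) count vector; TruncatedSVD then reduces those vectors to a
    # small dense representation.
    pipe = make_pipeline(
        vectorizers.NgramVectorizer(ngram_size=2),
        TruncatedSVD(n_components=10),
    )
    low_dimensional_vectors = pipe.fit_transform(input_sequences)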
59 | 60 | .. _scikit-learn: https://scikit-learn.org/stable/ 61 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | scikit-learn 4 | numba 5 | pandas 6 | dask 7 | pynndescent>=0.5 8 | pomegranate 9 | pygments>=2.4.1 10 | jupyterlab_pygments>=0.1.1 11 | ipykernel 12 | nbsphinx 13 | numpydoc 14 | sphinx-rtd-theme 15 | sphinx-gallery 16 | -------------------------------------------------------------------------------- /doc/sequence_taxonomy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Variable Length Sequences\n", 8 | "\n", 9 | "Variable length sequence data can present a significant challenge\n", 10 | "to machine learning algorithms and data science analysis.\n", 11 | "\n", 12 | "Part of this problem is driven by the wide varieties of variable length\n", 13 | "sequence data that are encountered in the wild. To that end we present\n", 14 | "a taxonomy of the kinds of variable length sequences that we typically\n", 15 | "encounter and our suggestions for how to think about them.\n", 16 | "\n", 17 | "We generally find it useful when describing variable length sequence data\n", 18 | "to describe what it is a sequence of. The basic types that we commonly\n", 19 | "encounter are: categorical values, scalar values and vector values. Certainly, scalar data could be thought of as simple one dimensional vector data, but given the different techniques that can, and often are, applied to such data we feel that treating it as a separate data type is warranted.\n", 20 | "\n", 21 | "Next we describe it as either ordered or unordered sequences. Yes, an unordered sequence is an odd turn of phrase, but we find it to be a useful simplifying\n", 22 | "notion. An unordered sequence is often referred to as a bag in data science\n", 23 | "literature. For example, a `bag of words` is the phrase used to describe an\n", 24 | "unordered collection of word tokens. We would describe such a collection as an\n", 25 | "unordered categorical sequence.\n", 26 | "\n", 27 | "Lastly, given an ordered sequence we require one extra piece of information:\n", 28 | "is the ordering regular or irregular. Regular sequences are often\n", 29 | "described as heartbeat data and generally assume equal spacing between all our\n", 30 | "values. Irregular sequences are often referred to as event data and each\n", 31 | "value is associated with a particular position allowing variable spacing amongst\n", 32 | "our values.\n", 33 | "\n", 34 | "Variable length sequence data comes in a vast variety of forms. Different forms of variable length sequence data are amenable to different techniques. To deal with this variety of data we propose this simple taxonomy of variable length sequence data and provide links and suggestions for techniques conducive to each type. 
\n", 35 | "\n", 36 | "* Type of values: categorical, scalar, vector\n", 37 | "* Order of values: Ordered or Unordered\n", 38 | "* Regularity of values: Regular or Irregular" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "### Sequence Types\n", 46 | "#### Categorical\n", 47 | "| regularity | order | type | sequence | | example |\n", 48 | "| :- | :- | :- | :- | :-: | :-: | \n", 49 | "| | unordered | categorical | sequence | -> | bag of words |\n", 50 | "| regular | ordered | categorical | sequence | -> | text document |\n", 51 | "| irregular | ordered | categorical | sequence | ->| time stamped labelled events |\n", 52 | "\n", 53 | "\n", 54 | "#### Scalar\n", 55 | "| regularity | order | type | sequence | | example |\n", 56 | "| :- | :- | :- | :- | :-: | :-: | \n", 57 | "| | unordered | Scalar | sequence | -> | random variable |\n", 58 | "| regular | ordered | Scalar | sequence | -> | heartbeat time-series |\n", 59 | "| irregular | ordered | Scalar | sequence | ->| time stamped values or event sequence |\n", 60 | "\n", 61 | "#### Vector\n", 62 | "| regularity | order | type | sequence | | example |\n", 63 | "| :- | :- | :- | :- | :-: | :-: | \n", 64 | "| | unordered | Vector | sequence | -> | point cloud |\n", 65 | "| regular | ordered | Vector | sequence | -> | spatial-trajectory data |\n", 66 | "| irregular | ordered | Vector | sequence | ->| time stamped locations |\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Vectorizer Functions \n", 74 | "\n", 75 | "This library adheres the sklearn transformer paradigm. With most functions having a `fit`, `fit_transform` and `transform` functions. As such they can be easily arranged in sklearn pipelines to ensure that all of your data transformation steps are encapsulated cleanly.\n", 76 | "\n", 77 | "For the most part our `vectorizers` take in a sequence of variable length sequences and learn a fixed width representation of these sequences. Another way of thinking of this is transforming a jagged array of vectors into a fixed width array of vectors. Fixed width representations are significantly more conducive to traditional machine learning algorithms.\n", 78 | "\n", 79 | "`Transformers` on the other hand are more generic utility functions that massage data in various useful ways. 
\n", 80 | "\n", 81 | "Due to the variety of vectorization techniques in this library a user might find it easier to determine the type of variable length sequences they are dealing with and use the following index to find the relevant functions.\n", 82 | "\n", 83 | "#### Categorical\n", 84 | "| regularity | order | type | sequence | | example | functions |\n", 85 | "| :- | :- | :- | :- | :-: | :- | :- | \n", 86 | "| | unordered | categorical | sequence | -> | bag of words | [NgramVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.NgramVectorizer.html#vectorizers.NgramVectorizer), [EdgeListVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.EdgeListVectorizer.html) | \n", 87 | "| regular | ordered | categorical | sequence | -> | text document | NgramVectorizer, [LZCompressionVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.LZCompressionVectorizer.html), [BPEVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.BytePairEncodingVectorizer.html) | \n", 88 | "| irregular | ordered | categorical | sequence | ->| time stamped labelled events | [HistogramVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.HistogramVectorizer.html) |\n", 89 | "\n", 90 | "All of these vectorizers take data in the form of a sequence of variable length sequences of categorical values (such as strings). All of these methods presume that a user has already decomposed their data into something of this form. \n", 91 | "\n", 92 | "The most common sources of variable length categorical data are text documents or data frames with categorical columns. In both cases some pre-processing will be necessary to convert such data into sequences of variable length sequences. \n", 93 | "\n", 94 | "In the case of text documents this often involves tokenization and lemmatization steps. An example of applying such transformations on text data before vectorization can be found in [document vectorizer](https://vectorizers.readthedocs.io/en/latest/document_vectorization.html).\n", 95 | "\n", 96 | "Good tokenization and lemmatization libraries include: [HuggingFace](https://huggingface.co/docs/transformers/main_classes/tokenizer), [SentencePiece](https://github.com/google/sentencepiece), [spaCy](https://spacy.io/api/tokenizer), and [nltk](https://www.nltk.org/api/nltk.tokenize.html).\n", 97 | "\n", 98 | "In the case of a data frame with multiple categorical columns one might make use of our libraries CategoricalColumnTransformer for transforming a data frame with one or more columns into a variable length sequence of categorical sequences. This is typically done by specifying one categorical column to represent ones objects and another set of categorical columns to be used to describe said objects.\n", 99 | "For an examples of how one might use this see an [introduction to CategoricalColumnTransformer](https://vectorizers.readthedocs.io/en/latest/CategoricalColumnTransformer_intro.html) or the more complicated [CategoricalColumnTransformer vignette](https://vectorizers.readthedocs.io/en/latest/categorical_column_transformer_example.html). 
\n", 100 | "\n", 101 | "#### Scalar\n", 102 | "| regularity | order | type | sequence | | example | functions |\n", 103 | "| :- | :- | :- | :- | :-: | :- | :- | \n", 104 | "| | unordered | Scalar | sequence | -> | random variable | [HistogramVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.HistogramVectorizer.html), DistributionVectorizer |\n", 105 | "| regular | ordered | Scalar | sequence | -> | heartbeat time-series | SlidingWindowTransformer |\n", 106 | "| irregular | ordered | Scalar | sequence | ->| time stamped values or event sequence | [KDEVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.KDEVectorizer.html#vectorizers.KDEVectorizer) |\n", 107 | "\n", 108 | "One should note that regular ordered scalar sequences references a Transformer function instead of a Vectorizer. That is because our current recommendation for dealing with such sequences is to use the SlidingWindowTransformer to encode the sequence information into an unordered scalar sequence and then apply the appropriate techniques.\n", 109 | "\n", 110 | "#### Vector\n", 111 | "| regularity | order | type | sequence | | example | functions |\n", 112 | "| :- | :- | :- | :- | :-: | :- | :- | \n", 113 | "| | unordered | Vector | sequence | -> | point cloud | [WassersteinVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.WassersteinVectorizer.html#vectorizers.WassersteinVectorizer), DistributionVectorizer |\n", 114 | "| regular | ordered | Vector | sequence | -> | spatial-trajectory data | SlidingWindowTransformer |\n", 115 | "| irregular | ordered | Vector | sequence | ->| time stamped locations | `we accept pull requests` |\n", 116 | "\n", 117 | "One should note that regular ordered vector sequences references a Transformer function instead of a Vectorizer. That is because our current recommendation for dealing with such sequences is to use the SlidingWindowTransformer to encode the sequence information into an unordered vector sequence and then apply the appropriate techniques.\n", 118 | "\n", 119 | "WassersteinVectorizer should be considered the gold standard for vectorizing point clouds of data. It makes use linear optimal transport to linearize and thus provide a reasonably scalable vectorization of a point cloud so that Euclidean or Cosine distance on this space will be a reasonable approximation of Wasserstein distance between the point cloud distrubitons. SinkhornVectorizer can handle much larger distributions of data and is generally more efficient but this efficiency may come with some loss of quality. Lastly, we include an ApproximateWassersteinVectorizer which is a heuristic linear algebra based solution which poorly approximates our WassersteinVectorizer but is very, very fast. " 120 | ] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 3 (ipykernel)", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.10.10" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 4 144 | } 145 | -------------------------------------------------------------------------------- /doc/user_guide.rst: -------------------------------------------------------------------------------- 1 | .. 
title:: User guide : contents 2 | 3 | .. _user_guide: 4 | 5 | ================================================== 6 | User guide: create your own scikit-learn estimator 7 | ================================================== 8 | 9 | Estimator 10 | --------- 11 | 12 | The central piece of any transformer, regressor, or classifier is 13 | :class:`sklearn.base.BaseEstimator`. All estimators in scikit-learn are derived 14 | from this class. In more detail, this base class enables setting and getting 15 | parameters of the estimator. It can be imported as:: 16 | 17 | >>> from sklearn.base import BaseEstimator 18 | 19 | Once imported, you can create a class which inherits from this base class:: 20 | 21 | >>> class MyOwnEstimator(BaseEstimator): 22 | ... pass 23 | 24 | Transformer 25 | ----------- 26 | 27 | Transformers are scikit-learn estimators which implement a ``transform`` method. 28 | The use case is the following: 29 | 30 | * at ``fit``, some parameters can be learned from ``X`` and ``y``; 31 | * at ``transform``, ``X`` will be transformed, using the parameters learned 32 | during ``fit``. 33 | 34 | .. _mixin: https://en.wikipedia.org/wiki/Mixin 35 | 36 | In addition, scikit-learn provides a 37 | mixin_, i.e. :class:`sklearn.base.TransformerMixin`, which 38 | implements the combination of ``fit`` and ``transform`` called ``fit_transform``. 39 | 40 | One can import the mixin class as:: 41 | 42 | >>> from sklearn.base import TransformerMixin 43 | 44 | Therefore, when creating a transformer, you need to create a class which 45 | inherits from both :class:`sklearn.base.BaseEstimator` and 46 | :class:`sklearn.base.TransformerMixin`. The scikit-learn API requires ``fit`` to 47 | **return ``self``**. The reason is that it allows ``fit`` and ``transform`` to be 48 | chained, as is done by the :class:`sklearn.base.TransformerMixin`. The 49 | ``fit`` method is expected to have ``X`` and ``y`` as inputs. Note that 50 | ``transform`` takes only ``X`` as input and is expected to return the 51 | transformed version of ``X``:: 52 | 53 | >>> class MyOwnTransformer(BaseEstimator, TransformerMixin): 54 | ... def fit(self, X, y=None): 55 | ... return self 56 | ... def transform(self, X): 57 | ... return X 58 | 59 | We build a basic example to show that our :class:`MyOwnTransformer` is working 60 | within a scikit-learn ``pipeline``:: 61 | 62 | >>> from sklearn.datasets import load_iris 63 | >>> from sklearn.pipeline import make_pipeline 64 | >>> from sklearn.linear_model import LogisticRegression 65 | >>> X, y = load_iris(return_X_y=True) 66 | >>> pipe = make_pipeline(MyOwnTransformer(), 67 | ... LogisticRegression(random_state=10, 68 | ... solver='lbfgs', 69 | ... multi_class='auto')) 70 | >>> pipe.fit(X, y) # doctest: +ELLIPSIS 71 | Pipeline(...) 72 | >>> pipe.predict(X) # doctest: +ELLIPSIS 73 | array([...]) 74 | 75 | Predictor 76 | --------- 77 | 78 | Regressor 79 | ~~~~~~~~~ 80 | 81 | Similarly, regressors are scikit-learn estimators which implement a ``predict`` 82 | method. The use case is the following: 83 | 84 | * at ``fit``, some parameters can be learned from ``X`` and ``y``; 85 | * at ``predict``, predictions will be computed from ``X`` using the parameters 86 | learned during ``fit``. 87 | 88 | In addition, scikit-learn provides a mixin_, i.e. 89 | :class:`sklearn.base.RegressorMixin`, which implements the ``score`` method 90 | which computes the :math:`R^2` score of the predictions. 
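For reference, that score is :math:`R^2 = 1 - \sum_i (y_i - \hat{y}_i)^2 / \sum_i (y_i - \bar{y})^2`; a model that always predicts the mean of ``y`` scores 0, and a perfect model scores 1.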
91 | 92 | One can import the mixin as:: 93 | 94 | >>> from sklearn.base import RegressorMixin 95 | 96 | Therefore, we create a regressor, :class:`MyOwnRegressor`, which inherits from 97 | both :class:`sklearn.base.BaseEstimator` and 98 | :class:`sklearn.base.RegressorMixin`. The method ``fit`` gets ``X`` and ``y`` 99 | as input and should return ``self``. It should implement the ``predict`` 100 | function which should output the predictions of your regressor:: 101 | 102 | >>> import numpy as np 103 | >>> class MyOwnRegressor(BaseEstimator, RegressorMixin): 104 | ... def fit(self, X, y): 105 | ... return self 106 | ... def predict(self, X): 107 | ... return np.mean(X, axis=1) 108 | 109 | We illustrate that this regressor is working within a scikit-learn pipeline:: 110 | 111 | >>> from sklearn.datasets import load_diabetes 112 | >>> X, y = load_diabetes(return_X_y=True) 113 | >>> pipe = make_pipeline(MyOwnTransformer(), MyOwnRegressor()) 114 | >>> pipe.fit(X, y) # doctest: +ELLIPSIS 115 | Pipeline(...) 116 | >>> pipe.predict(X) # doctest: +ELLIPSIS 117 | array([...]) 118 | 119 | Since we inherit from the :class:`sklearn.base.RegressorMixin`, we can call 120 | the ``score`` method which will return the :math:`R^2` score:: 121 | 122 | >>> pipe.score(X, y) # doctest: +ELLIPSIS 123 | -3.9... 124 | 125 | Classifier 126 | ~~~~~~~~~~ 127 | 128 | Similarly to regressors, classifiers implement ``predict``. In addition, they 129 | output the probabilities of the prediction using the ``predict_proba`` method: 130 | 131 | * at ``fit``, some parameters can be learned from ``X`` and ``y``; 132 | * at ``predict``, predictions will be computed from ``X`` using the parameters 133 | learned during ``fit``. The output corresponds to the predicted class for each sample; 134 | * ``predict_proba`` will give a 2D matrix where each column corresponds to the 135 | class and each entry will be the probability of the associated class. 136 | 137 | In addition, scikit-learn provides a mixin, i.e. 138 | :class:`sklearn.base.ClassifierMixin`, which implements the ``score`` method 139 | which computes the accuracy score of the predictions. 140 | 141 | One can import this mixin as:: 142 | 143 | >>> from sklearn.base import ClassifierMixin 144 | 145 | Therefore, we create a classifier, :class:`MyOwnClassifier`, which inherits 146 | from both :class:`sklearn.base.BaseEstimator` and 147 | :class:`sklearn.base.ClassifierMixin`. The method ``fit`` gets ``X`` and ``y`` 148 | as input and should return ``self``. It should implement the ``predict`` 149 | function which should output the class inferred by the classifier. 150 | ``predict_proba`` will output some probabilities instead:: 151 | 152 | >>> class MyOwnClassifier(BaseEstimator, ClassifierMixin): 153 | ... def fit(self, X, y): 154 | ... self.classes_ = np.unique(y) 155 | ... return self 156 | ... def predict(self, X): 157 | ... return np.random.randint(0, self.classes_.size, 158 | ... size=X.shape[0]) 159 | ... def predict_proba(self, X): 160 | ... pred = np.random.rand(X.shape[0], self.classes_.size) 161 | ... return pred / np.sum(pred, axis=1)[:, np.newaxis] 162 | 163 | We illustrate that this classifier is working within a scikit-learn pipeline:: 164 | 165 | >>> X, y = load_iris(return_X_y=True) 166 | >>> pipe = make_pipeline(MyOwnTransformer(), MyOwnClassifier()) 167 | >>> pipe.fit(X, y) # doctest: +ELLIPSIS 168 | Pipeline(...) 
169 | 170 | Then, you can call ``predict`` and ``predict_proba``:: 171 | 172 | >>> pipe.predict(X) # doctest: +ELLIPSIS 173 | array([...]) 174 | >>> pipe.predict_proba(X) # doctest: +ELLIPSIS 175 | array([...]) 176 | 177 | Since our classifier inherits from :class:`sklearn.base.ClassifierMixin`, we 178 | can compute the accuracy by calling the ``score`` method:: 179 | 180 | >>> pipe.score(X, y) # doctest: +ELLIPSIS 181 | 0... 182 | -------------------------------------------------------------------------------- /doc/vectorizers_logo_no_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TutteInstitute/vectorizers/6e60b98e6c91821fac892675004eda2931380c13/doc/vectorizers_logo_no_text.png -------------------------------------------------------------------------------- /doc/vectorizers_logo_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TutteInstitute/vectorizers/6e60b98e6c91821fac892675004eda2931380c13/doc/vectorizers_logo_text.png -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: project-template 2 | dependencies: 3 | - numpy 4 | - scipy 5 | - scikit-learn 6 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | .. _general_examples: 2 | 3 | General examples 4 | ================ 5 | 6 | Introductory examples. 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | scikit-learn 4 | numba 5 | pandas 6 | dask 7 | pynndescent>=0.5 8 | pomegranate 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | 4 | [aliases] 5 | test = pytest 6 | 7 | [tool:pytest] 8 | addopts = --doctest-modules 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """A template for scikit-learn compatible packages.""" 3 | 4 | import codecs 5 | import os 6 | 7 | from setuptools import find_packages, setup 8 | 9 | # get __version__ from _version.py 10 | ver_file = os.path.join('vectorizers', '_version.py') 11 | with open(ver_file) as f: 12 | exec(f.read()) 13 | 14 | DISTNAME = 'vectorizers' 15 | DESCRIPTION = 'A suite of vectorizers for various data types.' 
16 | with codecs.open('README.rst', encoding='utf-8-sig') as f: 17 | LONG_DESCRIPTION = f.read() 18 | MAINTAINER = 'John Healy, Leland McInnes, Colin Weir' 19 | MAINTAINER_EMAIL = 'leland.mcinnes@gmail.com' 20 | URL = 'https://github.com/TutteInstitute/vectorizers' 21 | LICENSE = 'new BSD' 22 | DOWNLOAD_URL = 'https://github.com/TutteInstitute/vectorizers' 23 | VERSION = __version__ 24 | INSTALL_REQUIRES = ['numpy', 'pandas', 'scipy', 'scikit-learn', 'numba', 'pynndescent', 'dask'] 25 | CLASSIFIERS = ['Intended Audience :: Science/Research', 26 | 'Intended Audience :: Developers', 27 | 'License :: OSI Approved', 28 | 'Programming Language :: Python', 29 | 'Topic :: Software Development', 30 | 'Topic :: Scientific/Engineering', 31 | 'Operating System :: Microsoft :: Windows', 32 | 'Operating System :: POSIX', 33 | 'Operating System :: Unix', 34 | 'Operating System :: MacOS', 35 | 'Programming Language :: Python :: 3.9', 36 | 'Programming Language :: Python :: 3.10', 37 | 'Programming Language :: Python :: 3.11', 38 | 'Programming Language :: Python :: 3.12'] 39 | EXTRAS_REQUIRE = { 40 | 'tests': [ 41 | 'pytest', 42 | 'pytest-cov'], 43 | 'docs': [ 44 | 'sphinx', 45 | 'sphinx-gallery', 46 | 'nbsphinx', 47 | 'sphinx_rtd_theme', 48 | 'numpydoc', 49 | 'matplotlib' 50 | ] 51 | } 52 | 53 | setup(name=DISTNAME, 54 | maintainer=MAINTAINER, 55 | maintainer_email=MAINTAINER_EMAIL, 56 | description=DESCRIPTION, 57 | license=LICENSE, 58 | url=URL, 59 | version=VERSION, 60 | download_url=DOWNLOAD_URL, 61 | long_description=LONG_DESCRIPTION, 62 | zip_safe=False, # the package can run out of an .egg file 63 | classifiers=CLASSIFIERS, 64 | packages=find_packages(), 65 | install_requires=INSTALL_REQUIRES, 66 | extras_require=EXTRAS_REQUIRE) 67 | -------------------------------------------------------------------------------- /vectorizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .token_cooccurrence_vectorizer import TokenCooccurrenceVectorizer 2 | from .timed_token_cooccurrence_vectorizer import TimedTokenCooccurrenceVectorizer 3 | from .ngram_token_cooccurence_vectorizer import NgramCooccurrenceVectorizer 4 | from .multi_token_cooccurence_vectorizer import MultiSetCooccurrenceVectorizer 5 | from ._vectorizers import DistributionVectorizer 6 | from ._vectorizers import HistogramVectorizer 7 | from .skip_gram_vectorizer import SkipgramVectorizer 8 | from .ngram_vectorizer import NgramVectorizer 9 | from .kde_vectorizer import KDEVectorizer 10 | from .tree_token_cooccurrence import LabelledTreeCooccurrenceVectorizer 11 | from .edge_list_vectorizer import EdgeListVectorizer 12 | from .linear_optimal_transport import ( 13 | WassersteinVectorizer, 14 | SinkhornVectorizer, 15 | ApproximateWassersteinVectorizer, 16 | ) 17 | from .mixed_gram_vectorizer import LZCompressionVectorizer, BytePairEncodingVectorizer 18 | 19 | from .signature_vectorizer import SignatureVectorizer 20 | 21 | from .utils import cast_tokens_to_strings 22 | 23 | from ._version import __version__ 24 | 25 | __all__ = [ 26 | "TokenCooccurrenceVectorizer", 27 | "TimedTokenCooccurrenceVectorizer", 28 | "NgramCooccurrenceVectorizer", 29 | "MultiSetCooccurrenceVectorizer", 30 | "DistributionVectorizer", 31 | "HistogramVectorizer", 32 | "SkipgramVectorizer", 33 | "NgramVectorizer", 34 | "KDEVectorizer", 35 | "LabelledTreeCooccurrenceVectorizer", 36 | "WassersteinVectorizer", 37 | "SinkhornVectorizer", 38 | "ApproximateWassersteinVectorizer", 39 | "EdgeListVectorizer", 40 | 
"SignatureVectorizer", 41 | "__version__", 42 | ] 43 | -------------------------------------------------------------------------------- /vectorizers/_vectorizers.py: -------------------------------------------------------------------------------- 1 | from warnings import warn 2 | 3 | import numpy as np 4 | import numba 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | import pandas as pd 7 | from sklearn.utils.validation import ( 8 | check_array, 9 | check_is_fitted, 10 | check_random_state, 11 | ) 12 | from sklearn.mixture import GaussianMixture 13 | from sklearn.preprocessing import normalize 14 | from .utils import flatten, vectorize_diagram, pairwise_gaussian_ground_distance 15 | import vectorizers.distances as distances 16 | 17 | 18 | class DistributionVectorizer(BaseEstimator, TransformerMixin): 19 | def __init__( 20 | self, 21 | n_components=20, 22 | random_state=None, 23 | ): 24 | self.n_components = n_components 25 | self.random_state = random_state 26 | 27 | def _validate_params(self): 28 | if ( 29 | not np.issubdtype(type(self.n_components), np.integer) 30 | or self.n_components < 2 31 | ): 32 | raise ValueError( 33 | "n_components must be and integer greater than or equal " "to 2." 34 | ) 35 | 36 | def _validate_data(self, X): 37 | try: 38 | assert np.isscalar(X[0][0][0]) 39 | except: 40 | raise ValueError("Input must be a collection of collections of points") 41 | 42 | try: 43 | dims = [np.array(x).shape[1] for x in X] 44 | except: 45 | raise ValueError( 46 | "Elements of each point collection must be of the same dimension." 47 | ) 48 | 49 | if not hasattr(self, "data_dimension_"): 50 | self.data_dimension_ = np.mean(dims) 51 | 52 | if not ( 53 | np.max(dims) == self.data_dimension_ or np.min(dims) == self.data_dimension_ 54 | ): 55 | raise ValueError("Each point collection must be of equal dimension.") 56 | 57 | def fit(self, X, y=None, **fit_params): 58 | random_state = check_random_state(self.random_state) 59 | self._validate_params() 60 | self._validate_data(X) 61 | 62 | combined_data = np.vstack(X) 63 | combined_data = check_array(combined_data) 64 | 65 | self.mixture_model_ = GaussianMixture( 66 | n_components=self.n_components, random_state=random_state 67 | ) 68 | self.mixture_model_.fit(combined_data) 69 | self.ground_distance_ = pairwise_gaussian_ground_distance( 70 | self.mixture_model_.means_, 71 | self.mixture_model_.covariances_, 72 | ) 73 | self.metric_ = distances.hellinger 74 | 75 | def transform(self, X): 76 | check_is_fitted(self, ["mixture_model_", "ground_distance_"]) 77 | self._validate_data(X) 78 | result = np.vstack( 79 | [vectorize_diagram(diagram, self.mixture_model_) for diagram in X] 80 | ) 81 | return result 82 | 83 | def fit_transform(self, X, y=None, **fit_params): 84 | self.fit(X, y, **fit_params) 85 | return np.vstack( 86 | [vectorize_diagram(diagram, self.mixture_model_) for diagram in X] 87 | ) 88 | 89 | 90 | def find_bin_boundaries(flat, n_bins): 91 | """ 92 | Only uniform distribution is currently implemented. 93 | TODO: Implement Normal 94 | :param flat: an iterable. 
95 | :param n_bins: 96 | :return: 97 | """ 98 | flat.sort() 99 | flat_csum = np.cumsum(flat) 100 | bin_range = flat_csum[-1] / n_bins 101 | bin_indices = [0] 102 | for i in range(1, len(flat_csum)): 103 | if (flat_csum[i] >= bin_range * len(bin_indices)) & ( 104 | flat[i] > flat[bin_indices[-1]] 105 | ): 106 | bin_indices.append(i) 107 | bin_values = np.array(flat, dtype=float)[bin_indices] 108 | 109 | if bin_values.shape[0] < n_bins: 110 | warn( 111 | f"Could not generate n_bins={n_bins} bins as there are not enough " 112 | f"distinct values. Please check your data." 113 | ) 114 | 115 | return bin_values 116 | 117 | 118 | def expand_boundaries(my_interval_index, absolute_range): 119 | """ 120 | Expands the outer bind on a pandas IntervalIndex to encompass the range specified by the 2-tuple absolute_range. 121 | 122 | Parameters 123 | ---------- 124 | my_interval_index: pandas IntervalIndex object (right closed) 125 | absolute_range: 2-tuple. 126 | (min_value, max_value) 127 | 128 | Returns 129 | ------- 130 | index: a pandas IntervalIndex 131 | A pandas IntervalIndex with the boundaries potentially expanded to encompas the absolute range. 132 | """ 133 | """ 134 | expands the outer bind on a pandas IntervalIndex to encompass the range specified by the 2-tuple absolute_range 135 | :param my_interval_index: 136 | :param absolute_range: 2tuple 137 | :return: a pandas IntervalIndex 138 | """ 139 | interval_list = my_interval_index.to_list() 140 | # Check if the left boundary needs expanding 141 | if interval_list[0].left > absolute_range[0]: 142 | interval_list[0] = pd.Interval( 143 | left=absolute_range[0], right=interval_list[0].right 144 | ) 145 | # Check if the right boundary needs expanding 146 | last = len(interval_list) - 1 147 | if interval_list[last].right < absolute_range[1]: 148 | interval_list[last] = pd.Interval( 149 | left=interval_list[last].left, right=absolute_range[1] 150 | ) 151 | return pd.IntervalIndex(interval_list) 152 | 153 | 154 | def add_outier_bins(my_interval_index, absolute_range): 155 | """ 156 | Appends extra bins to either side our our interval index if appropriate. 157 | That only occurs if the absolute_range is wider than the observed range in your training data. 158 | :param my_interval_index: 159 | :param absolute_range: 160 | :return: 161 | """ 162 | interval_list = my_interval_index.to_list() 163 | # Check if the left boundary needs expanding 164 | if interval_list[0].left > absolute_range[0]: 165 | left_outlier = pd.Interval(left=absolute_range[0], right=interval_list[0].left) 166 | interval_list.insert(0, left_outlier) 167 | 168 | last = len(interval_list) - 1 169 | if interval_list[last].right < absolute_range[1]: 170 | right_outlier = pd.Interval( 171 | left=interval_list[last].right, right=absolute_range[1] 172 | ) 173 | interval_list.append(right_outlier) 174 | return pd.IntervalIndex(interval_list) 175 | 176 | 177 | class HistogramVectorizer(BaseEstimator, TransformerMixin): 178 | """Convert a time series of binary events into a histogram of 179 | event occurrences over a time frame. If the data has explicit time stamps 180 | it can be aggregated over hour of day, day of week, day of month, day of year 181 | , week of year or month of year. 182 | 183 | Parameters 184 | ---------- 185 | n_components: int or array-like, shape (n_features,) (default=5) 186 | The number of bins to produce. Raises ValueError if n_bins < 2. 
187 | 188 | strategy: {‘uniform’, ‘quantile’, 'gmm'}, (default=’uniform’) 189 | The method to use for bin selection in the histogram. In general the 190 | quantile option, which will select variable width bins based on the 191 | distribution of the training data, is suggested, but uniformly spaced 192 | identically sized bins, or soft bins learned from a Gaussian mixture model 193 | are also available. 194 | 195 | ground_distance: {'euclidean'} 196 | The distance to induce between bins. 197 | 198 | absolute_range: (minimum_value_possible, maximum_value_possible) (default=(-np.inf, np.inf)) 199 | By default values outside of training data range are included in the extremal bins. 200 | You can specify these values if you know something about your values (e.g. (0, np.inf) ) 201 | 202 | append_outlier_bins: bool (default=False) 203 | Whether to add extra bins to catch values outside of your training 204 | data where appropriate? These bins will increase the total number of 205 | components (to ``n_components + 2`` and will be the first bin (for 206 | outlying small data) and the last bin (for outlying large data). 207 | """ 208 | 209 | # TODO: time stamps, generic groupby 210 | def __init__( 211 | self, 212 | n_components=20, 213 | strategy="uniform", 214 | ground_distance="euclidean", 215 | absolute_range=(-np.inf, np.inf), 216 | append_outlier_bins=False, 217 | ): 218 | self.n_components = n_components 219 | self.strategy = strategy 220 | self.ground_distance = ground_distance # Not currently making use of this. 221 | self.absolute_range = absolute_range 222 | self.append_outlier_bins = append_outlier_bins 223 | 224 | def _validate_params(self): 225 | pass 226 | 227 | def fit(self, X, y=None, **fit_params): 228 | """ 229 | Learns the histogram bins. 230 | Still need to check switch. 231 | :param X: 232 | :return: 233 | """ 234 | flat = flatten(X) 235 | flat = list( 236 | filter( 237 | lambda n: n > self.absolute_range[0] and n < self.absolute_range[1], 238 | flat, 239 | ) 240 | ) 241 | if self.strategy == "uniform": 242 | self.bin_intervals_ = pd.interval_range( 243 | start=np.min(flat), end=np.max(flat), periods=self.n_components 244 | ) 245 | if self.strategy == "quantile": 246 | self.bin_intervals_ = pd.IntervalIndex.from_breaks( 247 | find_bin_boundaries(flat, self.n_components) 248 | ) 249 | if self.append_outlier_bins == True: 250 | self.bin_intervals_ = add_outier_bins( 251 | self.bin_intervals_, self.absolute_range 252 | ) 253 | else: 254 | self.bin_intervals_ = expand_boundaries( 255 | self.bin_intervals_, self.absolute_range 256 | ) 257 | self.metric_ = distances.hellinger 258 | return self 259 | 260 | def _vector_transform(self, vector): 261 | """ 262 | Applies the transform to a single row of the data. 263 | """ 264 | return pd.cut(vector, self.bin_intervals_).value_counts() 265 | 266 | def transform(self, X): 267 | """ 268 | Apply binning to a full data set returning an nparray. 269 | """ 270 | check_is_fitted(self, ["bin_intervals_"]) 271 | result = np.ndarray((len(X), len(self.bin_intervals_))) 272 | for i, seq in enumerate(X): 273 | result[i, :] = self._vector_transform(seq).values 274 | return result 275 | 276 | 277 | def temporal_cyclic_transform(datetime_series, periodicity=None): 278 | """ 279 | TODO: VERY UNFINISHED 280 | Replaces all time resolutions above the resolution specified with a fixed value. 281 | This creates a cycle within a datetime series. 
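For example, with ``periodicity='day'`` the timestamps 2021-05-04 13:00 and 1999-01-01 13:00 both collapse to 1970-01-01 13:00, so only the position within the day is retained.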
282 | Parameters 283 | ---------- 284 | datetime_series: a pandas series of datetime objects 285 | periodicity: string ['year', 'month' , 'week', 'day', 'hour'] 286 | What time period to create cycles. 287 | 288 | Returns 289 | ------- 290 | cyclic_series: pandas series of datetime objects 291 | 292 | """ 293 | collapse_times = {} 294 | if periodicity in ["year", "month", "day", "hour"]: 295 | collapse_times["year"] = 1970 296 | if periodicity in ["month", "day", "hour"]: 297 | collapse_times["month"] = 1 298 | if periodicity in ["day", "hour"]: 299 | collapse_times["day"] = 1 300 | if periodicity in ["hour"]: 301 | collapse_times["hour"] = 0 302 | cyclic_series = datetime_series.apply(lambda x: x.replace(**collapse_times)) 303 | elif periodicity == "week": 304 | raise NotImplementedError("we have not implemented week cycles yet") 305 | else: 306 | raise ValueError( 307 | f"Sorry resolution={periodicity} is not a valid option. " 308 | + f"Please select from ['year', 'month', 'week', 'day', 'hour']" 309 | ) 310 | return cyclic_series 311 | 312 | 313 | class CyclicHistogramVectorizer(BaseEstimator, TransformerMixin): 314 | """""" 315 | 316 | def __init__( 317 | self, 318 | periodicity="week", 319 | resolution="day", 320 | ): 321 | self.periodicity = periodicity 322 | self.resolution = resolution 323 | 324 | def _validate_params(self): 325 | pass 326 | 327 | def fit(self, X, y=None, **fit_params): 328 | cyclic_data = temporal_cyclic_transform( 329 | pd.to_datetime(X), periodicity=self.periodicity 330 | ) 331 | resampled = ( 332 | pd.Series(index=cyclic_data, data=1).resample(self.resolution).count() 333 | ) 334 | self.temporal_bins_ = resampled.index 335 | return self 336 | 337 | 338 | class ProductDistributionVectorizer(BaseEstimator, TransformerMixin): 339 | pass 340 | -------------------------------------------------------------------------------- /vectorizers/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.2" 2 | -------------------------------------------------------------------------------- /vectorizers/_window_kernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | EPSILON = 1e-8 5 | 6 | # The window function 7 | 8 | 9 | @numba.njit(nogil=True) 10 | def window_at_index(token_sequence, window_size, ind, reverse=False): 11 | if reverse: 12 | return np.flipud(token_sequence[max(ind - window_size, 0) : ind]) 13 | return token_sequence[ind + 1 : min(ind + window_size + 1, len(token_sequence))] 14 | 15 | 16 | # Window width functions 17 | 18 | 19 | @numba.njit(nogil=True) 20 | def variable_window_radii( 21 | window_size, 22 | token_frequency, 23 | mask_index=None, 24 | power=0.75, 25 | ): 26 | radii = np.power(token_frequency, power - 1) 27 | radii /= np.sum(radii * token_frequency) 28 | radii = np.append(radii, min(radii)) 29 | if mask_index is not None: 30 | radii[mask_index] = 0.0 31 | result = radii * window_size 32 | result[(result > 0) * (result < 1)] = 1.0 33 | np.round(result, 0, result) 34 | return result.astype(np.int64) 35 | 36 | 37 | @numba.njit(nogil=True) 38 | def fixed_window_radii(window_size, token_frequency, mask_index=None): 39 | radii = np.repeat(window_size, len(token_frequency) + 1) 40 | if mask_index is not None: 41 | radii[mask_index] = 0.0 42 | return radii 43 | 44 | 45 | # Kernel functions 46 | 47 | 48 | @numba.njit(nogil=True) 49 | def flat_kernel(window, mask_index=None, normalize=False, offset=0): 50 | result = 
np.ones(len(window), dtype=np.float64) 51 | if mask_index is not None: 52 | result[window == mask_index] = 0.0 53 | result[0 : min(offset, len(result))] = 0 54 | if normalize: 55 | temp = result.sum() 56 | if temp > 0: 57 | result /= temp 58 | return result 59 | 60 | 61 | @numba.njit(nogil=True) 62 | def harmonic_kernel(window, mask_index=None, normalize=False, offset=0): 63 | result = 1.0 / np.arange(1, len(window) + 1) 64 | if mask_index is not None: 65 | result[window == mask_index] = 0.0 66 | result[0 : min(offset, len(result))] = 0 67 | if normalize: 68 | temp = result.sum() 69 | if temp > 0: 70 | result /= temp 71 | return result 72 | 73 | 74 | @numba.njit(nogil=True) 75 | def geometric_kernel( 76 | window, 77 | mask_index=None, 78 | normalize=False, 79 | offset=0, 80 | power=0.9, 81 | ): 82 | result = power ** np.arange(1, len(window) + 1) 83 | 84 | if mask_index is not None: 85 | result[window == mask_index] = 0.0 86 | result[0 : min(offset, len(result))] = 0 87 | if normalize: 88 | temp = result.sum() 89 | if temp > 0: 90 | result /= temp 91 | return result 92 | 93 | 94 | @numba.njit(nogil=True) 95 | def multi_flat_kernel( 96 | window, 97 | target_ind, 98 | mask_index=None, 99 | normalize=False, 100 | offset=0, 101 | ): 102 | result_len = 0 103 | for mset in window: 104 | result_len += mset.shape[0] 105 | 106 | ker = np.ones(len(window)) 107 | kernel_result = np.zeros(result_len).astype(np.float64) 108 | 109 | ind = 0 110 | for i, mset in enumerate(window[offset:]): 111 | kernel_result[ind : ind + len(mset)] = np.repeat(ker[i], len(mset)) 112 | if mask_index is not None: 113 | for w_i, token in enumerate(mset): 114 | if token == mask_index: 115 | kernel_result[ind + w_i] = 0 116 | ind += len(mset) 117 | kernel_result[target_ind] = 0 118 | 119 | if normalize: 120 | temp = kernel_result.sum() 121 | if temp > 0: 122 | kernel_result /= temp 123 | 124 | return kernel_result 125 | 126 | 127 | @numba.njit(nogil=True) 128 | def multi_geometric_kernel( 129 | window, 130 | target_ind, 131 | mask_index=None, 132 | normalize=False, 133 | offset=0, 134 | power=0.9, 135 | ): 136 | result_len = 0 137 | for mset in window: 138 | result_len += mset.shape[0] 139 | 140 | ker = power ** np.arange(len(window)) 141 | 142 | kernel_result = np.zeros(result_len).astype(np.float64) 143 | ind = 0 144 | for i, mset in enumerate(window[offset:]): 145 | kernel_result[ind : ind + len(mset)] = np.repeat(ker[i], len(mset)) 146 | if mask_index is not None: 147 | for w_i, token in enumerate(mset): 148 | if token == mask_index: 149 | kernel_result[ind + w_i] = 0 150 | ind += len(mset) 151 | kernel_result[target_ind] = 0 152 | 153 | if normalize: 154 | temp = kernel_result.sum() 155 | if temp > 0: 156 | kernel_result /= temp 157 | 158 | return kernel_result 159 | 160 | 161 | @numba.njit(nogil=True) 162 | def update_kernel( 163 | window, 164 | kernel, 165 | mask_index, 166 | normalize, 167 | ): 168 | result = kernel[: len(window)].astype(np.float64) 169 | if mask_index is not None: 170 | result[window == mask_index] = 0 171 | if normalize: 172 | temp = result.sum() 173 | if temp > 0: 174 | result /= temp 175 | return result 176 | 177 | 178 | @numba.njit(nogil=True) 179 | def timed_geometric_kernel( 180 | window, 181 | time_deltas, 182 | delta, 183 | mask_index, 184 | normalize, 185 | offset, 186 | power=0.9, 187 | ): 188 | result = power ** (time_deltas / delta) 189 | if mask_index is not None: 190 | result[window == mask_index] = 0 191 | result[0 : min(offset, len(result))] = 0 192 | if normalize: 193 | temp = 
result.sum() 194 | if temp > 0: 195 | result /= temp 196 | return result 197 | 198 | 199 | @numba.njit(nogil=True) 200 | def timed_flat_kernel( 201 | window, 202 | time_deltas, 203 | delta, 204 | mask_index, 205 | normalize, 206 | offset, 207 | ): 208 | result = np.ones(len(time_deltas), dtype=np.float64) 209 | if mask_index is not None: 210 | result[window == mask_index] = 0 211 | result[0 : min(offset, len(result))] = 0 212 | if normalize: 213 | temp = result.sum() 214 | if temp > 0: 215 | result /= temp 216 | return result 217 | 218 | 219 | # Parameter lists 220 | 221 | _WINDOW_FUNCTIONS = { 222 | "variable": variable_window_radii, 223 | "fixed": fixed_window_radii, 224 | } 225 | 226 | _KERNEL_FUNCTIONS = { 227 | "flat": flat_kernel, 228 | "harmonic": harmonic_kernel, 229 | "geometric": geometric_kernel, 230 | } 231 | 232 | _TIMED_KERNEL_FUNCTIONS = { 233 | "flat": timed_flat_kernel, 234 | "geometric": timed_geometric_kernel, 235 | } 236 | 237 | _MULTI_KERNEL_FUNCTIONS = { 238 | "flat": multi_flat_kernel, 239 | "geometric": multi_geometric_kernel, 240 | } 241 | 242 | #################################################### 243 | # Sliding window multivariate time series kernels 244 | #################################################### 245 | 246 | 247 | def averaging_kernel(n_cols, *kernel_params): 248 | return np.full(n_cols, 1.0 / n_cols) 249 | 250 | 251 | def difference_kernel(n_cols, start, step, stride, *kernel_params): 252 | n_differences = int(np.ceil((n_cols - start - step) // stride)) 253 | result = np.zeros((n_differences, n_cols)) 254 | for i in range(n_differences): 255 | result[i, start + i * stride] = -1 256 | result[i, start + i * stride + step] = 1 257 | 258 | return result 259 | 260 | 261 | def positon_velocity_kernel(n_cols, position_index, step, stride, *kernel_params): 262 | n_differences_before = int(np.ceil((position_index - step) // stride)) 263 | n_differences_after = int(np.ceil((n_cols - position_index - step) // stride)) 264 | n_differences = n_differences_before + n_differences_after 265 | result = np.zeros((n_differences + 1, n_cols)) 266 | result[0, position_index] = 1 267 | for i in range(n_differences_before): 268 | result[i + 1, position_index - i * stride] = 1 269 | result[i + 1, position_index - i * stride - step] = -1 270 | for i in range(n_differences_after): 271 | result[i + n_differences_before + 1, position_index + i * stride] = -1 272 | result[i + n_differences_before + 1, position_index + i * stride + step] = 1 273 | 274 | return result 275 | 276 | 277 | def weight_kernel(n_cols, weights, *kernel_params): 278 | if weights.shape[0] != n_cols: 279 | raise ValueError( 280 | f"Cannot construct a weight kernel of size {n_cols} " 281 | f"with weights of shape {weights.shape[0]}" 282 | ) 283 | 284 | return np.diag(weights) 285 | 286 | 287 | def gaussian_weight_kernel(n_cols, sigma, *kernel_params): 288 | width = n_cols / 2 289 | xs = np.linspace(-width, width, n_cols) 290 | weights = 1.0 / (sigma * 2 * np.pi) * np.exp(-((xs / sigma) ** 2) / 2.0) 291 | return np.diag(weights) 292 | 293 | 294 | _SLIDING_WINDOW_KERNELS = { 295 | "average": averaging_kernel, 296 | "differences": difference_kernel, 297 | "position_velocity": positon_velocity_kernel, 298 | "weight": weight_kernel, 299 | "gaussian_weight": gaussian_weight_kernel, 300 | } 301 | 302 | # Copied from the SciPy implementation 303 | @numba.njit() 304 | def binom(n, k): 305 | n = int(n) 306 | k = int(k) 307 | 308 | if k > n or n < 0 or k < 0: 309 | return 0 310 | 311 | m = n + 1 312 | nterms = min(k, n 
- k) 313 | 314 | numerator = 1 315 | denominator = 1 316 | for j in range(1, nterms + 1): 317 | numerator *= m - j 318 | denominator *= j 319 | 320 | return numerator // denominator 321 | 322 | 323 | # A couple of changepoint based kernels that can be useful. The goal 324 | # is to detect changepoints in sequences of count of time interval 325 | # data (where the intervals are between events). 326 | # 327 | # We can model count data with Poisson's and interval data as inter-arrival 328 | # times (which can can convert to count-like data by taking reciprocals. 329 | # 330 | # Essentially we start with a baseline prior given by a gamma distribution, 331 | # and then update the prior with the data in the window up to, but not 332 | # including, the last element. The return value is then the predictive 333 | # posterior (a negative binomial) of observing the final element of 334 | # the window. 335 | 336 | 337 | def count_changepoint_kernel(alpha=1.0, beta=1): 338 | @numba.njit() 339 | def _kernel(window): 340 | model_window = window[:-1] 341 | observation = window[-1] 342 | alpha_prime = alpha + model_window.sum() 343 | beta_prime = beta + len(model_window) 344 | nb_r = alpha_prime 345 | nb_p = 1.0 / (1.0 + beta_prime) 346 | 347 | prob = ( 348 | binom(observation + nb_r - 1, observation) 349 | * (1 - nb_p) ** nb_r 350 | * nb_p ** observation 351 | ) 352 | 353 | return np.array([-np.log(prob)]) 354 | 355 | return _kernel 356 | 357 | 358 | def inter_arrival_changepoint_kernel(alpha=1.0, beta=1): 359 | @numba.njit() 360 | def _kernel(window): 361 | model_window = 1.0 / (window[:-1] + EPSILON) 362 | observation = 1.0 / (window[-1] + EPSILON) 363 | alpha_prime = alpha + model_window.sum() 364 | beta_prime = beta + len(model_window) 365 | nb_r = alpha_prime 366 | nb_p = 1.0 / (1.0 + beta_prime) 367 | 368 | prob = ( 369 | binom(observation + nb_r - 1, observation) 370 | * (1 - nb_p) ** nb_r 371 | * nb_p ** observation 372 | ) 373 | 374 | return np.array([-np.log(prob)]) 375 | 376 | return _kernel 377 | 378 | 379 | _SLIDING_WINDOW_FUNCTION_KERNELS = { 380 | "count_changepoint": count_changepoint_kernel, 381 | "timespan_changepoint": inter_arrival_changepoint_kernel, 382 | } 383 | -------------------------------------------------------------------------------- /vectorizers/coo_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | from collections import namedtuple 4 | 5 | CooArray = namedtuple("CooArray", ["row", "col", "val", "key", "ind", "min", "depth"]) 6 | 7 | COO_QUICKSORT_LIMIT = 1 << 16 8 | COO_MEM_MULTIPLIER = 1.5 9 | 10 | 11 | @numba.njit(nogil=True) 12 | def set_array_size(token_sequences, window_array): 13 | tot_len = np.zeros(window_array.shape[0]).astype(np.float64) 14 | window_array = window_array.astype(np.float64) 15 | for seq in token_sequences: 16 | counts = np.bincount(seq, minlength=window_array.shape[1]).astype(np.float64) 17 | tot_len += np.dot( 18 | window_array, counts 19 | ).T # NOTE: numba only does dot products with floats 20 | return tot_len.astype(np.int64) 21 | 22 | 23 | @numba.njit(nogil=True) 24 | def merge_sum_duplicates(coo): 25 | new_depth = True 26 | for i in range(coo.depth[0]): 27 | if coo.min[i] <= 0: 28 | coo.min[:i] = -coo.ind[0] 29 | coo.min[i] = coo.ind[0] 30 | new_depth = False 31 | break 32 | else: 33 | array_len = coo.ind[0] - np.abs(coo.min[i + 1]) + 1 34 | result_row = np.zeros(array_len) 35 | result_col = np.zeros(array_len) 36 | result_val = np.zeros(array_len) 37 | 
result_key = np.zeros(array_len) 38 | ptr1 = np.abs(coo.min[i + 1]) 39 | ptr2 = coo.min[i] 40 | result_ptr = 0 41 | result_key[0] = -1 42 | 43 | while ptr1 < coo.min[i] and ptr2 < coo.ind[0]: 44 | if coo.key[ptr1] <= coo.key[ptr2]: 45 | this_ptr = ptr1 46 | ptr1 += 1 47 | else: 48 | this_ptr = ptr2 49 | ptr2 += 1 50 | 51 | if coo.key[this_ptr] == result_key[result_ptr]: 52 | result_val[result_ptr] += coo.val[this_ptr] 53 | else: 54 | result_ptr += 1 55 | result_val[result_ptr] = coo.val[this_ptr] 56 | result_row[result_ptr] = coo.row[this_ptr] 57 | result_col[result_ptr] = coo.col[this_ptr] 58 | result_key[result_ptr] = coo.key[this_ptr] 59 | 60 | if ptr1 >= coo.min[i]: 61 | while ptr2 < coo.ind[0]: 62 | this_ptr = ptr2 63 | ptr2 += 1 64 | 65 | if coo.key[this_ptr] == result_key[result_ptr]: 66 | result_val[result_ptr] += coo.val[this_ptr] 67 | else: 68 | result_ptr += 1 69 | result_val[result_ptr] = coo.val[this_ptr] 70 | result_row[result_ptr] = coo.row[this_ptr] 71 | result_col[result_ptr] = coo.col[this_ptr] 72 | result_key[result_ptr] = coo.key[this_ptr] 73 | else: 74 | while ptr1 < coo.min[i]: 75 | this_ptr = ptr1 76 | ptr1 += 1 77 | 78 | if coo.key[this_ptr] == result_key[result_ptr]: 79 | result_val[result_ptr] += coo.val[this_ptr] 80 | else: 81 | result_ptr += 1 82 | result_val[result_ptr] = coo.val[this_ptr] 83 | result_row[result_ptr] = coo.row[this_ptr] 84 | result_col[result_ptr] = coo.col[this_ptr] 85 | result_key[result_ptr] = coo.key[this_ptr] 86 | 87 | coo.row[np.abs(coo.min[i + 1]) : coo.ind[0]] = result_row[1:] 88 | coo.col[np.abs(coo.min[i + 1]) : coo.ind[0]] = result_col[1:] 89 | coo.val[np.abs(coo.min[i + 1]) : coo.ind[0]] = result_val[1:] 90 | coo.key[np.abs(coo.min[i + 1]) : coo.ind[0]] = result_key[1:] 91 | coo.ind[0] = np.abs(coo.min[i + 1]) + result_ptr 92 | 93 | if new_depth: 94 | coo.min[: coo.depth[0]] = -coo.ind[0] 95 | coo.min[coo.depth[0]] = coo.ind[0] 96 | coo.depth[0] += 1 97 | 98 | 99 | @numba.njit(nogil=True) 100 | def merge_all_sum_duplicates(coo): 101 | new_min = np.zeros(coo.depth[0]) 102 | ptr = 0 103 | for i in range(coo.depth[0]): 104 | if coo.min[i] > 0: 105 | new_min[ptr] = coo.min[i] 106 | ptr += 1 107 | coo.min[: coo.depth[0]] = new_min 108 | merge_sum_duplicates(coo) 109 | 110 | 111 | @numba.njit(nogil=True) 112 | def coo_sum_duplicates(coo): 113 | upper_lim = coo.ind[0] 114 | lower_lim = np.abs(coo.min[0]) 115 | 116 | perm = np.argsort(coo.key[lower_lim:upper_lim]) 117 | 118 | coo.row[lower_lim:upper_lim] = coo.row[lower_lim:upper_lim][perm] 119 | coo.col[lower_lim:upper_lim] = coo.col[lower_lim:upper_lim][perm] 120 | coo.val[lower_lim:upper_lim] = coo.val[lower_lim:upper_lim][perm] 121 | coo.key[lower_lim:upper_lim] = coo.key[lower_lim:upper_lim][perm] 122 | 123 | sum_ind = lower_lim 124 | this_row = coo.row[lower_lim] 125 | this_col = coo.col[lower_lim] 126 | this_val = np.float32(0) 127 | this_key = coo.key[lower_lim] 128 | 129 | for i in range(lower_lim, upper_lim): 130 | if coo.key[i] == this_key: 131 | this_val += coo.val[i] 132 | else: 133 | coo.row[sum_ind] = this_row 134 | coo.col[sum_ind] = this_col 135 | coo.val[sum_ind] = this_val 136 | coo.key[sum_ind] = this_key 137 | this_row = coo.row[i] 138 | this_col = coo.col[i] 139 | this_val = coo.val[i] 140 | this_key = coo.key[i] 141 | sum_ind += 1 142 | 143 | if this_key != coo.key[upper_lim]: 144 | coo.row[sum_ind] = this_row 145 | coo.col[sum_ind] = this_col 146 | coo.val[sum_ind] = this_val 147 | coo.key[sum_ind] = this_key 148 | sum_ind += 1 149 | 150 | coo.ind[0] = sum_ind 151 | 
merge_sum_duplicates(coo) 152 | 153 | 154 | @numba.njit(nogil=True) 155 | def coo_increase_mem(coo): 156 | 157 | temp = coo.row 158 | new_size = np.int32(max(np.round(COO_MEM_MULTIPLIER * temp.shape[0]), COO_QUICKSORT_LIMIT+1)) 159 | new_row = np.zeros(new_size, dtype=np.int32) 160 | new_row[: temp.shape[0]] = temp 161 | 162 | temp = coo.col 163 | new_size = np.int32(max(np.round(COO_MEM_MULTIPLIER * temp.shape[0]), COO_QUICKSORT_LIMIT+1)) 164 | new_col = np.zeros(new_size, dtype=np.int32) 165 | new_col[: temp.shape[0]] = temp 166 | 167 | temp = coo.val 168 | new_size = np.int32(max(np.round(COO_MEM_MULTIPLIER * temp.shape[0]), COO_QUICKSORT_LIMIT+1)) 169 | new_val = np.zeros(new_size, dtype=np.float32) 170 | new_val[: temp.shape[0]] = temp 171 | 172 | temp = coo.key 173 | new_size = np.int32(max(np.round(COO_MEM_MULTIPLIER * temp.shape[0]), COO_QUICKSORT_LIMIT+1)) 174 | new_key = np.zeros(new_size, dtype=np.int64) 175 | new_key[: temp.shape[0]] = temp 176 | 177 | temp = coo.min 178 | new_size = np.int32(np.round(COO_MEM_MULTIPLIER * (temp.shape[0]+2))) 179 | new_min = np.zeros(new_size, dtype=np.int64) 180 | new_min[: temp.shape[0]] = temp 181 | 182 | coo = CooArray( 183 | new_row, 184 | new_col, 185 | new_val, 186 | new_key, 187 | coo.ind, 188 | new_min, 189 | coo.depth, 190 | ) 191 | 192 | return coo 193 | 194 | 195 | @numba.njit(nogil=True) 196 | def coo_append(coo, tup): 197 | coo.row[coo.ind[0]] = tup[0] 198 | coo.col[coo.ind[0]] = tup[1] 199 | coo.val[coo.ind[0]] = tup[2] 200 | coo.key[coo.ind[0]] = tup[3] 201 | coo.ind[0] += 1 202 | 203 | if (coo.ind[0] - np.abs(coo.min[0])) >= COO_QUICKSORT_LIMIT: 204 | coo_sum_duplicates(coo) 205 | if (coo.key.shape[0] - np.abs(coo.min[0])) <= COO_QUICKSORT_LIMIT: 206 | merge_all_sum_duplicates(coo) 207 | if coo.ind[0] >= 0.95 * coo.key.shape[0]: 208 | coo = coo_increase_mem(coo) 209 | 210 | if coo.ind[0] == coo.key.shape[0] - 1: 211 | coo_sum_duplicates(coo) 212 | if (coo.key.shape[0] - np.abs(coo.min[0])) <= COO_QUICKSORT_LIMIT: 213 | merge_all_sum_duplicates(coo) 214 | if coo.ind[0] >= 0.95 * coo.key.shape[0]: 215 | coo = coo_increase_mem(coo) 216 | 217 | return coo 218 | 219 | 220 | @numba.njit(nogil=True) 221 | def sum_coo_entries(seq): 222 | seq.sort() 223 | this_coord = (seq[0][0], seq[0][1]) 224 | this_sum = 0 225 | reduced_data = [] 226 | for entry in seq: 227 | if (entry[0], entry[1]) == this_coord: 228 | this_sum += entry[2] 229 | else: 230 | reduced_data.append((this_coord[0], this_coord[1], this_sum)) 231 | this_sum = entry[2] 232 | this_coord = (entry[0], entry[1]) 233 | 234 | reduced_data.append((this_coord[0], this_coord[1], this_sum)) 235 | 236 | return reduced_data 237 | 238 | @numba.njit(nogil=True) 239 | def em_update_matrix( 240 | posterior_data, 241 | prior_indices, 242 | prior_indptr, 243 | prior_data, 244 | n_unique_tokens, 245 | target_gram_ind, 246 | windows, 247 | kernels, 248 | ): 249 | """ 250 | Updated the csr matrix from one round of EM on the given (hstack of) n 251 | cooccurrence matrices provided in csr format. 
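In outline: for each window, each context token's kernel weight is multiplied by the matching entry of the prior matrix, the resulting weights are normalised over the whole window (the E step), and the normalised weights are then accumulated into ``posterior_data`` (the partial M step).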
252 | 253 | Parameters 254 | ---------- 255 | posterior_data: numpy.array 256 | The csr data of the hstacked cooccurrence matrix to be updated 257 | 258 | prior_indices: numpy.array 259 | The csr indices of the hstacked cooccurrence matrix 260 | 261 | prior_indptr: numpy.array 262 | The csr indptr of the hstacked cooccurrence matrix 263 | 264 | prior_data: numpy.array 265 | The csr data of the hstacked cooccurrence matrix 266 | 267 | n_unique_tokens: int 268 | The number of unique tokens 269 | 270 | target_gram_ind: int 271 | The index of the target ngram to update 272 | 273 | windows: List of List of int 274 | The indices of the tokens in the windows 275 | 276 | kernels: List of List of floats 277 | The kernel values of the entries in the windows. 278 | 279 | Returns 280 | ------- 281 | posterior_data: numpy.array 282 | The data of the updated csr matrix after an update of EM. 283 | """ 284 | total_win_length = np.sum(np.array([len(w) for w in windows])) 285 | window_posterior = np.zeros(total_win_length) 286 | context_ind = np.zeros(total_win_length, dtype=np.int64) 287 | win_offset = np.append( 288 | np.zeros(1, dtype=np.int64), 289 | np.cumsum(np.array([len(w) for w in windows])), 290 | )[:-1] 291 | 292 | col_ind = prior_indices[ 293 | prior_indptr[target_gram_ind] : prior_indptr[target_gram_ind + 1] 294 | ] 295 | 296 | for w, window in enumerate(windows): 297 | for i, context in enumerate(window): 298 | if kernels[w][i] > 0: 299 | context_ind[i + win_offset[w]] = np.searchsorted( 300 | col_ind, context + w * n_unique_tokens 301 | ) 302 | # assert(col_ind[context_ind[i + win_offset[w]]] == context+w * n_unique_tokens) 303 | if ( 304 | col_ind[context_ind[i + win_offset[w]]] 305 | == context + w * n_unique_tokens 306 | ): 307 | window_posterior[i + win_offset[w]] = ( 308 | kernels[w][i] 309 | * prior_data[ 310 | prior_indptr[target_gram_ind] 311 | + context_ind[i + win_offset[w]] 312 | ] 313 | ) 314 | else: 315 | window_posterior[i + win_offset[w]] = 0 316 | 317 | temp = window_posterior.sum() 318 | if temp > 0: 319 | window_posterior /= temp 320 | 321 | # Partial M_step - Update the posteriors 322 | for w, window in enumerate(windows): 323 | for i, context in enumerate(window): 324 | val = window_posterior[i + win_offset[w]] 325 | if val > 0: 326 | posterior_data[ 327 | prior_indptr[target_gram_ind] + context_ind[i + win_offset[w]] 328 | ] += val 329 | 330 | return posterior_data 331 | -------------------------------------------------------------------------------- /vectorizers/distances.py: -------------------------------------------------------------------------------- 1 | import numba 2 | import numpy as np 3 | 4 | EPS = 1e-11 5 | 6 | 7 | @numba.njit() 8 | def hellinger(x, y): 9 | result = 0.0 10 | l1_norm_x = 0.0 11 | l1_norm_y = 0.0 12 | dim = x.shape[0] 13 | 14 | for i in range(dim): 15 | result += np.sqrt(x[i] * y[i]) 16 | l1_norm_x += x[i] 17 | l1_norm_y += y[i] 18 | 19 | if l1_norm_x == 0 and l1_norm_y == 0: 20 | return 0.0 21 | elif l1_norm_x == 0 or l1_norm_y == 0: 22 | return 1.0 23 | else: 24 | return np.sqrt(1 - result / np.sqrt(l1_norm_x * l1_norm_y)) 25 | 26 | 27 | @numba.njit() 28 | def kantorovich1d(x, y, p=1): 29 | 30 | # Normalize and do a cumulative sum trick 31 | 32 | x_sum = 0.0 33 | y_sum = 0.0 34 | for i in range(x.shape[0]): 35 | x_sum += x[i] 36 | y_sum += y[i] 37 | 38 | x_cdf = x / x_sum 39 | y_cdf = y / y_sum 40 | 41 | for i in range(1, x_cdf.shape[0]): 42 | x_cdf[i] += x_cdf[i - 1] 43 | y_cdf[i] += y_cdf[i - 1] 44 | 45 | # Now we just want minkowski 
distance on the CDFs 46 | result = 0.0 47 | if p > 2: 48 | for i in range(x_cdf.shape[0]): 49 | result += np.abs(x_cdf[i] - y_cdf[i]) ** p 50 | 51 | return result ** (1.0 / p) 52 | 53 | elif p == 2: 54 | for i in range(x_cdf.shape[0]): 55 | val = x_cdf[i] - y_cdf[i] 56 | result += val * val 57 | 58 | return np.sqrt(result) 59 | 60 | elif p == 1: 61 | for i in range(x_cdf.shape[0]): 62 | result += np.abs(x_cdf[i] - y_cdf[i]) 63 | 64 | return result 65 | 66 | else: 67 | raise ValueError("Invalid p supplied to Kantorvich distance") 68 | 69 | 70 | @numba.njit() 71 | def circular_kantorovich(x, y, p=1): 72 | 73 | x_sum = 0.0 74 | y_sum = 0.0 75 | for i in range(x.shape[0]): 76 | x_sum += x[i] 77 | y_sum += y[i] 78 | 79 | x_cdf = x / x_sum 80 | y_cdf = y / y_sum 81 | 82 | for i in range(1, x_cdf.shape[0]): 83 | x_cdf[i] += x_cdf[i - 1] 84 | y_cdf[i] += y_cdf[i - 1] 85 | 86 | mu = np.median((x_cdf - y_cdf) ** p) 87 | 88 | # Now we just want minkowski distance on the CDFs shifted by mu 89 | result = 0.0 90 | if p > 2: 91 | for i in range(x_cdf.shape[0]): 92 | result += np.abs(x_cdf[i] - y_cdf[i] - mu) ** p 93 | 94 | return result ** (1.0 / p) 95 | 96 | elif p == 2: 97 | for i in range(x_cdf.shape[0]): 98 | val = x_cdf[i] - y_cdf[i] - mu 99 | result += val * val 100 | 101 | return np.sqrt(result) 102 | 103 | elif p == 1: 104 | for i in range(x_cdf.shape[0]): 105 | result += np.abs(x_cdf[i] - y_cdf[i] - mu) 106 | 107 | return result 108 | 109 | else: 110 | raise ValueError("Invalid p supplied to Kantorvich distance") 111 | 112 | 113 | @numba.njit() 114 | def total_variation(x, y): 115 | x_sum = 0.0 116 | y_sum = 0.0 117 | result = 0.0 118 | 119 | for i in range(x.shape[0]): 120 | x_sum += x[i] 121 | y_sum += y[i] 122 | 123 | x_pdf = x / x_sum 124 | y_pdf = y / y_sum 125 | 126 | for i in range(x.shape[0]): 127 | result += 0.5 * np.abs(x_pdf[i] - y_pdf[i]) 128 | 129 | return result 130 | 131 | 132 | @numba.njit() 133 | def jensen_shannon_divergence(x, y): 134 | result = 0.0 135 | l1_norm_x = 0.0 136 | l1_norm_y = 0.0 137 | dim = x.shape[0] 138 | 139 | for i in range(dim): 140 | l1_norm_x += x[i] 141 | l1_norm_y += y[i] 142 | 143 | l1_norm_x += EPS * dim 144 | l1_norm_y += EPS * dim 145 | 146 | pdf_x = (x + EPS) / l1_norm_x 147 | pdf_y = (y + EPS) / l1_norm_y 148 | m = 0.5 * (pdf_x + pdf_y) 149 | 150 | for i in range(dim): 151 | result += 0.5 * ( 152 | pdf_x[i] * np.log(pdf_x[i] / m[i]) + pdf_y[i] * np.log(pdf_y[i] / m[i]) 153 | ) 154 | return result 155 | 156 | 157 | @numba.njit() 158 | def symmetric_kl_divergence(x, y): 159 | result = 0.0 160 | l1_norm_x = 0.0 161 | l1_norm_y = 0.0 162 | dim = x.shape[0] 163 | 164 | for i in range(dim): 165 | l1_norm_x += x[i] 166 | l1_norm_y += y[i] 167 | 168 | l1_norm_x += EPS * dim 169 | l1_norm_y += EPS * dim 170 | 171 | pdf_x = (x + EPS) / l1_norm_x 172 | pdf_y = (y + EPS) / l1_norm_y 173 | 174 | for i in range(dim): 175 | result += pdf_x[i] * np.log(pdf_x[i] / pdf_y[i]) + pdf_y[i] * np.log( 176 | pdf_y[i] / pdf_x[i] 177 | ) 178 | 179 | return result 180 | 181 | 182 | # 183 | # --- Sparse support functions 184 | # 185 | 186 | 187 | # Just reproduce a simpler version of numpy unique (not numba supported yet) 188 | @numba.njit() 189 | def arr_unique(arr): 190 | aux = np.sort(arr) 191 | flag = np.concatenate((np.ones(1, dtype=np.bool_), aux[1:] != aux[:-1])) 192 | return aux[flag] 193 | 194 | 195 | # Just reproduce a simpler version of numpy union1d (not numba supported yet) 196 | @numba.njit() 197 | def arr_union(ar1, ar2): 198 | if ar1.shape[0] == 0: 199 | return 
ar2 200 | elif ar2.shape[0] == 0: 201 | return ar1 202 | else: 203 | return arr_unique(np.concatenate((ar1, ar2))) 204 | 205 | 206 | @numba.njit() 207 | def arr_intersect(ar1, ar2): 208 | aux = np.concatenate((ar1, ar2)) 209 | aux.sort() 210 | return aux[:-1][aux[1:] == aux[:-1]] 211 | 212 | 213 | @numba.njit() 214 | def sparse_sum(ind1, data1, ind2, data2): 215 | result_ind = arr_union(ind1, ind2) 216 | result_data = np.zeros(result_ind.shape[0], dtype=np.float32) 217 | 218 | i1 = 0 219 | i2 = 0 220 | nnz = 0 221 | 222 | # pass through both index lists 223 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]: 224 | j1 = ind1[i1] 225 | j2 = ind2[i2] 226 | 227 | if j1 == j2: 228 | val = data1[i1] + data2[i2] 229 | if val != 0: 230 | result_ind[nnz] = j1 231 | result_data[nnz] = val 232 | nnz += 1 233 | i1 += 1 234 | i2 += 1 235 | elif j1 < j2: 236 | val = data1[i1] 237 | if val != 0: 238 | result_ind[nnz] = j1 239 | result_data[nnz] = val 240 | nnz += 1 241 | i1 += 1 242 | else: 243 | val = data2[i2] 244 | if val != 0: 245 | result_ind[nnz] = j2 246 | result_data[nnz] = val 247 | nnz += 1 248 | i2 += 1 249 | 250 | # pass over the tails 251 | while i1 < ind1.shape[0]: 252 | val = data1[i1] 253 | if val != 0: 254 | result_ind[nnz] = i1 255 | result_data[nnz] = val 256 | nnz += 1 257 | i1 += 1 258 | 259 | while i2 < ind2.shape[0]: 260 | val = data2[i2] 261 | if val != 0: 262 | result_ind[nnz] = i2 263 | result_data[nnz] = val 264 | nnz += 1 265 | i2 += 1 266 | 267 | # truncate to the correct length in case there were zeros created 268 | result_ind = result_ind[:nnz] 269 | result_data = result_data[:nnz] 270 | 271 | return result_ind, result_data 272 | 273 | 274 | @numba.njit() 275 | def sparse_diff(ind1, data1, ind2, data2): 276 | return sparse_sum(ind1, data1, ind2, -data2) 277 | 278 | 279 | @numba.njit() 280 | def sparse_mul(ind1, data1, ind2, data2): 281 | result_ind = arr_intersect(ind1, ind2) 282 | result_data = np.zeros(result_ind.shape[0], dtype=np.float32) 283 | 284 | i1 = 0 285 | i2 = 0 286 | nnz = 0 287 | 288 | # pass through both index lists 289 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]: 290 | j1 = ind1[i1] 291 | j2 = ind2[i2] 292 | 293 | if j1 == j2: 294 | val = data1[i1] * data2[i2] 295 | if val != 0: 296 | result_ind[nnz] = j1 297 | result_data[nnz] = val 298 | nnz += 1 299 | i1 += 1 300 | i2 += 1 301 | elif j1 < j2: 302 | i1 += 1 303 | else: 304 | i2 += 1 305 | 306 | # truncate to the correct length in case there were zeros created 307 | result_ind = result_ind[:nnz] 308 | result_data = result_data[:nnz] 309 | 310 | return result_ind, result_data 311 | 312 | 313 | # Return dense vectors supported on the union of the non-zero valued indices 314 | @numba.njit() 315 | def dense_union(ind1, data1, ind2, data2): 316 | result_ind = arr_union(ind1, ind2) 317 | result_data1 = np.zeros(result_ind.shape[0], dtype=np.float32) 318 | result_data2 = np.zeros(result_ind.shape[0], dtype=np.float32) 319 | 320 | i1 = 0 321 | i2 = 0 322 | nnz = 0 323 | 324 | # pass through both index lists 325 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]: 326 | j1 = ind1[i1] 327 | j2 = ind2[i2] 328 | 329 | if j1 == j2: 330 | val = data1[i1] + data2[i2] 331 | if val != 0: 332 | result_data1[nnz] = data1[i1] 333 | result_data2[nnz] = data2[i2] 334 | nnz += 1 335 | i1 += 1 336 | i2 += 1 337 | elif j1 < j2: 338 | val = data1[i1] 339 | if val != 0: 340 | result_data1[nnz] = data1[i1] 341 | nnz += 1 342 | i1 += 1 343 | else: 344 | val = data2[i2] 345 | if val != 0: 346 | result_data2[nnz] = data2[i2] 347 | nnz 
+= 1 348 | i2 += 1 349 | 350 | # pass over the tails 351 | while i1 < ind1.shape[0]: 352 | val = data1[i1] 353 | if val != 0: 354 | result_data1[nnz] = data1[i1] 355 | nnz += 1 356 | i1 += 1 357 | 358 | while i2 < ind2.shape[0]: 359 | val = data2[i2] 360 | if val != 0: 361 | result_data2[nnz] = data2[i2] 362 | nnz += 1 363 | i2 += 1 364 | 365 | # truncate to the correct length in case there were zeros 366 | result_data1 = result_data1[:nnz] 367 | result_data2 = result_data2[:nnz] 368 | 369 | return result_data1, result_data2 370 | 371 | 372 | # 373 | # --- Sparse distance functions 374 | # 375 | 376 | 377 | @numba.njit() 378 | def sparse_hellinger(ind1, data1, ind2, data2): 379 | aux_inds, aux_data = sparse_mul(ind1, data1, ind2, data2) 380 | result = 0.0 381 | norm1 = np.sum(data1) 382 | norm2 = np.sum(data2) 383 | sqrt_norm_prod = np.sqrt(norm1 * norm2) 384 | 385 | for i in range(aux_data.shape[0]): 386 | result += np.sqrt(aux_data[i]) 387 | 388 | if norm1 == 0.0 and norm2 == 0.0: 389 | return 0.0 390 | elif norm1 == 0.0 or norm2 == 0.0: 391 | return 1.0 392 | elif result > sqrt_norm_prod: 393 | return 0.0 394 | else: 395 | return np.sqrt(1.0 - (result / sqrt_norm_prod)) 396 | 397 | 398 | @numba.njit() 399 | def sparse_total_variation(ind1, data1, ind2, data2): 400 | norm1 = np.sum(data1) 401 | norm2 = np.sum(data2) 402 | aux_inds, aux_data = sparse_diff(ind1, data1 / norm1, ind2, data2 / norm2) 403 | result = 0.0 404 | for i in range(aux_data.shape[0]): 405 | result += 0.5 * np.abs(aux_data[i]) 406 | return result 407 | 408 | 409 | # Because of the EPS values and the need to normalize after adding them (and then average those for jensen_shannon) 410 | # it seems like we might as well just take the dense union (dense vectors supported on the union of indices) 411 | # and call the dense distance functions 412 | 413 | 414 | @numba.njit() 415 | def sparse_jensen_shannon_divergence(ind1, data1, ind2, data2): 416 | dense_data1, dense_data2 = dense_union(ind1, data1, ind2, data2) 417 | return jensen_shannon_divergence(dense_data1, dense_data2) 418 | 419 | 420 | @numba.njit() 421 | def sparse_symmetric_kl_divergence(ind1, data1, ind2, data2): 422 | dense_data1, dense_data2 = dense_union(ind1, data1, ind2, data2) 423 | return symmetric_kl_divergence(dense_data1, dense_data2) 424 | -------------------------------------------------------------------------------- /vectorizers/distribution_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pomegranate as pm 4 | 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.utils.validation import ( 7 | check_array, 8 | check_is_fitted, 9 | check_random_state, 10 | ) 11 | from pandas.api.types import is_datetime64_any_dtype as is_datetime 12 | 13 | def distribution_type_from_series(series): # pragma: no cover 14 | if series.dtype in (np.int, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64): 15 | if series.min() >= 0: 16 | if series.max() == 1: 17 | return pm.BernoulliDistribution 18 | else: 19 | return pm.PoissonDistribution 20 | elif series.unique().shape[0] <= 50: 21 | return pm.DiscreteDistribution 22 | else: 23 | return pm.NormalDistribution 24 | 25 | elif series.dtype in (np.float, np.float16, np.float32, np.float64): 26 | if series.min() >= 0: 27 | if series.max() <= 1: 28 | return pm.BetaDistribution 29 | else: 30 | return pm.GammaDistribution 31 | else: 32 | return pm.NormalDistribution 33 | 34 
| elif series.dtype in pd.CategoricalDtype: 35 | return pm.DiscreteDistribution 36 | 37 | else: 38 | raise ValueError(f"Failed to handle series {series}") 39 | 40 | 41 | def preprocess_dataframe(df, time_granularity="1s"): # pragma: no cover 42 | for feature in df: 43 | if feature.dtype == object: 44 | df[feature] = pd.Categorical(df[feature]) 45 | elif is_datetime(df[feature]): 46 | df[feature] = ((df.feature - df[feature].min()) / pd.Timedelta(time_granularity)) 47 | 48 | return 49 | 50 | class DataframeDistributionVectorizer(BaseEstimator, TransformerMixin): # pragma: no cover 51 | 52 | def __init__(self, n_components=100): 53 | self.n_components = n_components 54 | 55 | def fit(self, X, y=None, **fit_params): 56 | if type(X) == pd.DataFrame: 57 | X = preprocess_dataframe(X.copy()) 58 | column_models = [ 59 | distribution_type_from_series(X[feature]) 60 | for feature in X 61 | ] 62 | elif type(X) == np.ndarray: 63 | column_models = [ 64 | distribution_type_from_series(X[:, i]) 65 | for i in range(X.shape[1]) 66 | ] 67 | else: 68 | raise ValueError(f"Input type {type(X)} is not currently supported") 69 | 70 | self.mixture_model_ = pm.GeneralMixtureModel.from_samples( 71 | column_models, n_components=self.n_components, X=X 72 | ) 73 | 74 | def transform(self, X): 75 | check_is_fitted(self, ["mixture_model_"]) 76 | 77 | if type(X) == pd.DataFrame: 78 | X = preprocess_dataframe(X.copy()) 79 | 80 | return self.mixture_model_.predict_proba(X) 81 | -------------------------------------------------------------------------------- /vectorizers/edge_list_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils.validation import check_is_fitted 6 | import scipy.sparse 7 | 8 | 9 | def read_edge_data(X): 10 | """ 11 | Read in data of various forms and converts them into an np.array of [row_labels, column_labels, values] 12 | 13 | Returns 14 | ------- 15 | N x 3 np.array of [row_labels, column_labels, values] 16 | """ 17 | try: 18 | edge_list = np.array(X, dtype=object) 19 | except: 20 | raise ValueError("Couldn't convert for your data format into an numpy array.") 21 | if (edge_list.shape[1] != 3) & (edge_list.shape[0] == 3): 22 | edge_list = edge_list.T 23 | if edge_list.shape[1] != 3: 24 | raise ValueError( 25 | f"Incorrect format of data passed in. " 26 | f"We expected some format of Nx3 data and received {edge_list.shape[0]} by {edge_list.shape[1]} data" 27 | ) 28 | 29 | # TODO: Test if edge_list[:,2] is numeric. We currently just convert it into a float. I'd rather preserve the type. 30 | return edge_list 31 | 32 | 33 | class EdgeListVectorizer(BaseEstimator, TransformerMixin): 34 | """ 35 | Takes a weighted edge list of the form row_labels, column_labels, value 36 | and represents each row_name as a sparse matrix containing the values 37 | associated with each column_name. 38 | 39 | This might also be thought of as a PivotTableVectorizer or a CrossTabVectorizer. 40 | 41 | Parameters 42 | ---------- 43 | column_label_dictionary: dictionary or None (optional, default=None) 44 | A fixed dictionary mapping tokens to indices, or None if the dictionary 45 | should be learned from the training data. 
If specified this will limit 46 | the tokens to those present in the dictionary. 47 | row_label_dictionary: dictionary or None (optional, default=None) 48 | A fixed dictionary mapping row labels to indices, or None if the dictionary 49 | should be learned from the training data. If specified this will limit 50 | the rows to those whose labels are present in the dictionary. 51 | joint_space: bool (optional, default=False) 52 | Whether the first two columns of your edge list are over the same token space. If so, 53 | a single unified token dictionary is built over both columns. 54 | pre_indexed: bool (optional, default=False) 55 | Not yet implemented. I'm not sure that this feature is going to be used enough to prioritize implementing it. 56 | Please reach out if this would be useful to you. 57 | If True, the row and column entries are already row and column indices rather than row and column labels. 58 | 59 | """ 60 | 61 | def __init__( 62 | self, 63 | column_label_dictionary=None, 64 | row_label_dictionary=None, 65 | joint_space=False, 66 | ): 67 | self.column_label_dictionary = column_label_dictionary 68 | self.row_label_dictionary = row_label_dictionary 69 | self.joint_space = joint_space 70 | 71 | def fit(self, X, y=None, **fit_params): 72 | # Convert data from whatever format it came in into an Nx3 np.array 73 | self.edge_list_ = read_edge_data(X) 74 | 75 | if self.joint_space: 76 | if self.column_label_dictionary is None: 77 | if self.row_label_dictionary is None: 78 | self.row_label_dictionary_ = { 79 | token: index 80 | for index, token in enumerate( 81 | np.unique( 82 | np.append(self.edge_list_[:, 0], self.edge_list_[:, 1]) 83 | ) 84 | ) 85 | } 86 | self.column_label_dictionary_ = self.row_label_dictionary_ 87 | elif self.row_label_dictionary is None: 88 | self.column_label_dictionary_ = self.column_label_dictionary 89 | self.row_label_dictionary_ = self.column_label_dictionary 90 | elif self.column_label_dictionary is None: 91 | self.column_label_dictionary_ = self.row_label_dictionary 92 | self.row_label_dictionary_ = self.row_label_dictionary 93 | else: 94 | raise ValueError( 95 | "Joint_space=True: Please specify at most a single label dictionary (either one works)."
96 | ) 97 | else: # Not in a joint space 98 | if self.row_label_dictionary is None: 99 | self.row_label_dictionary_ = { 100 | token: index 101 | for index, token in enumerate(np.unique(self.edge_list_[:, 0])) 102 | } 103 | else: 104 | self.row_label_dictionary_ = self.row_label_dictionary 105 | if self.column_label_dictionary is None: 106 | self.column_label_dictionary_ = { 107 | token: index 108 | for index, token in enumerate(np.unique(self.edge_list_[:, 1])) 109 | } 110 | else: 111 | self.column_label_dictionary_ = self.column_label_dictionary 112 | # Build reverse indexes 113 | self.row_index_dictionary_ = { 114 | y: x for (x, y) in self.row_label_dictionary_.items() 115 | } 116 | self.column_index_dictionary_ = { 117 | y: x for (x, y) in self.column_label_dictionary_.items() 118 | } 119 | max_row = np.max(list(self.row_index_dictionary_.keys())) + 1 120 | max_col = np.max(list(self.column_index_dictionary_.keys())) + 1 121 | 122 | # Get row and column indices for only the edges who have both labels in our dictionary index 123 | # Don't bother checking if rows are valid if you just constructed the row_label_dictionary from the data 124 | if self.row_label_dictionary is None: 125 | valid_rows = np.repeat(True, self.edge_list_.shape[0]) 126 | else: 127 | valid_rows = np.isin( 128 | self.edge_list_[:, 0], list(self.row_label_dictionary_.keys()) 129 | ) 130 | 131 | # Don't bother checking if rows are valid if you just constructed the col_label_dictionary from the data 132 | if self.column_label_dictionary is None: 133 | valid_cols = np.repeat(True, self.edge_list_.shape[0]) 134 | else: 135 | valid_cols = np.isin( 136 | self.edge_list_[:, 1], list(self.column_label_dictionary_.keys()) 137 | ) 138 | valid_edges = valid_rows & valid_cols 139 | row_indices = [ 140 | self.row_label_dictionary_[x] for x in self.edge_list_[valid_edges, 0] 141 | ] 142 | col_indices = [ 143 | self.column_label_dictionary_[x] for x in self.edge_list_[valid_edges, 1] 144 | ] 145 | # Must specify the shape to ensure that tailing zero rows/cols aren't suppressed. 
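        # For example (hypothetical dictionaries): with row_label_dictionary={"a": 0, "d": 4} but only
        # "a" present in the edges, scipy would otherwise infer the row count from the largest index it
        # actually sees and silently drop the empty trailing row; shape=(max_row, max_col) keeps it.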
146 | self._train_matrix = scipy.sparse.coo_matrix( 147 | (self.edge_list_[valid_edges, 2].astype(float), (row_indices, col_indices)), 148 | shape=(max_row, max_col), 149 | ).tocsr() 150 | self._train_matrix.sum_duplicates() 151 | 152 | return self 153 | 154 | def fit_transform(self, X, y=None, **fit_params): 155 | self.fit(X, y, **fit_params) 156 | return self._train_matrix 157 | 158 | def transform(self, X): 159 | check_is_fitted( 160 | self, 161 | [ 162 | "column_label_dictionary_", 163 | "row_label_dictionary_", 164 | ], 165 | ) 166 | 167 | edge_list = read_edge_data(X) 168 | 169 | # Get row and column indices for only the edges who have both labels in our dictionary index 170 | valid_rows = np.isin(edge_list[:, 0], list(self.row_label_dictionary_.keys())) 171 | valid_cols = np.isin( 172 | edge_list[:, 1], list(self.column_label_dictionary_.keys()) 173 | ) 174 | valid_edges = valid_rows & valid_cols 175 | row_indices = [self.row_label_dictionary_[x] for x in edge_list[valid_edges, 0]] 176 | col_indices = [ 177 | self.column_label_dictionary_[x] for x in edge_list[valid_edges, 1] 178 | ] 179 | 180 | matrix = scipy.sparse.coo_matrix( 181 | (edge_list[valid_edges, 2].astype(float), (row_indices, col_indices)) 182 | ).tocsr() 183 | matrix.sum_duplicates() 184 | return matrix 185 | -------------------------------------------------------------------------------- /vectorizers/kde_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numba 2 | import numpy as np 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils.validation import check_is_fitted 6 | from warnings import warn 7 | from sklearn.neighbors import KernelDensity 8 | from .utils import flatten 9 | 10 | 11 | @numba.njit(nogil=True) 12 | def min_non_zero_difference(data): 13 | """Find the minimum non-zero sequential difference in a single dimensional 14 | array of values. This is useful for determining the minimal reasonable kernel 15 | bandwidth for a 1-dimensional KDE over a dataset. 16 | 17 | Parameters 18 | ---------- 19 | data: array 20 | One dimensional array of values 21 | 22 | Returns 23 | ------- 24 | min_difference: float 25 | The minimal difference between sequential values. 26 | """ 27 | sorted_data = np.sort(data) 28 | differences = sorted_data[1:] - sorted_data[:-1] 29 | return np.min(differences[differences > 0]) 30 | 31 | 32 | def jackknife_bandwidths(data, bandwidths, kernel="gaussian"): 33 | """Perform jack-knife sampling over different bandwidths for KDEs for each 34 | time-series in the dataset. 35 | 36 | Parameters 37 | ---------- 38 | data: list of arrays 39 | A list of (variable length) arrays of values. The values should represent 40 | "times" of "events". 41 | 42 | bandwidths: array 43 | The possible bandwidths to try 44 | 45 | kernel: string (optional, default="gaussian") 46 | The kernel to use for the KDE. Should be accepted by sklearn's KernelDensity 47 | class. 48 | 49 | Returns 50 | ------- 51 | result: array of shape (n_bandwidths,) 52 | The total likelihood of unobserved data over all jackknife samplings and all 53 | time series in the dataset for each bandwidth. 
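    A rough usage sketch (hypothetical event times, and a much coarser bandwidth
    grid than the one KDEVectorizer builds internally):

        import numpy as np
        event_times = [np.array([0.1, 0.4, 0.45, 0.9]), np.array([0.2, 0.25, 0.8])]
        bandwidths = np.array([0.05, 0.1, 0.2])
        scores = jackknife_bandwidths(event_times, bandwidths)
        best_bandwidth = bandwidths[np.argmax(scores)]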
54 | """ 55 | result = np.zeros(bandwidths.shape[0]) 56 | for j in range(bandwidths.shape[0]): 57 | kde = KernelDensity(bandwidth=bandwidths[j], kernel=kernel) 58 | for i in range(len(data)): 59 | likelihood = 0.0 60 | for k in range(len(data[i])): 61 | if k < len(data[i]) - 1: 62 | jackknife_sample = np.hstack([data[i][:k], data[i][k + 1 :]]) 63 | else: 64 | jackknife_sample = data[i][:k] 65 | kde.fit(jackknife_sample[:, None]) 66 | likelihood += np.exp(kde.score(np.array([[data[i][k]]]))) 67 | 68 | result[j] += likelihood 69 | 70 | return result 71 | 72 | 73 | class KDEVectorizer(BaseEstimator, TransformerMixin): 74 | def __init__( 75 | self, 76 | bandwidth=None, 77 | n_components=50, 78 | kernel="gaussian", 79 | evaluation_grid_strategy="uniform", 80 | ): 81 | self.n_components = n_components 82 | self.evaluation_grid_strategy = evaluation_grid_strategy 83 | self.bandwidth = bandwidth 84 | self.kernel = kernel 85 | 86 | def fit(self, X, y=None, **fit_params): 87 | 88 | combined_data = np.array(flatten(X)) 89 | 90 | if self.bandwidth is None: 91 | # Estimate the bandwidth by looking at training data 92 | # We do a jack-knife across each time series and 93 | # find the bandwidth choice that works best over all 94 | # time series 95 | min, max = np.min(combined_data), np.max(combined_data) 96 | avg_n_events = np.mean([len(x) for x in X]) 97 | max_bandwidth = (max - min) / avg_n_events 98 | min_bandwidth = min_non_zero_difference(combined_data) 99 | bandwidths = 10.0 ** np.linspace( 100 | np.log10(min_bandwidth), np.log10(max_bandwidth), 50 101 | ) 102 | jackknifed_total_likelihoods = jackknife_bandwidths(X, bandwidths) 103 | self.bandwidth_ = bandwidths[np.argmax(jackknifed_total_likelihoods)] 104 | else: 105 | self.bandwidth_ = self.bandwidth 106 | 107 | if self.evaluation_grid_strategy == "uniform": 108 | min, max = np.min(combined_data), np.max(combined_data) 109 | self.evaluation_grid_ = np.linspace(min, max, self.n_components) 110 | elif self.evaluation_grid_strategy == "density": 111 | uniform_quantile_grid = np.linspace(0, 1.0, self.n_components) 112 | self.evaluation_grid_ = np.quantile(combined_data, uniform_quantile_grid) 113 | else: 114 | raise ValueError( 115 | "Unrecognized evaluation_grid_strategy; should be one " 116 | 'of: "uniform" or "density"' 117 | ) 118 | 119 | return self 120 | 121 | def transform(self, X): 122 | check_is_fitted(self, ["bandwidth_", "evaluation_grid_"]) 123 | 124 | result = np.empty((len(X), self.n_components), dtype=np.float64) 125 | 126 | for i, sample in enumerate(X): 127 | kde = KernelDensity(bandwidth=self.bandwidth_, kernel=self.kernel) 128 | kde.fit(sample[:, None]) 129 | log_probability = kde.score_samples(self.evaluation_grid_[:, None]) 130 | result[i] = np.exp(log_probability) 131 | 132 | return result 133 | 134 | def fit_transform(self, X, y=None, **fit_params): 135 | self.fit(X, y, **fit_params) 136 | return self.transform(X) 137 | -------------------------------------------------------------------------------- /vectorizers/signature_vectorizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.utils.validation import check_is_fitted 5 | 6 | #import iisignature 7 | 8 | NUMPY_SHAPE_ERROR_MSG = """ 9 | Error: SignatureVectorizer expects numpy arrays to be of shape (num_samples x path_len x path_dim). 
10 | """ 11 | LIST_SHAPE_ERROR_MSG = """ 12 | Error: Expecting list entries to be numpy arrays of shape (path_len x path_dim). 13 | """ 14 | 15 | 16 | class SignatureVectorizer(BaseEstimator, TransformerMixin): 17 | """Transforms a list or array of paths into their path signatures. 18 | 19 | Uses the iisignature library (https://pypi.org/project/iisignature/) 20 | * pip install iisignature 21 | 22 | For more details on the path signature technique, please refer to: 23 | * Rough paths, Signatures and the modelling of functions on streams. Lyons, T. (2014) 24 | https://arxiv.org/pdf/1405.4537.pdf 25 | * A Primer on the Signature Method in Machine Learning. Cheyrev, I. (2016) 26 | https://arxiv.org/pdf/1603.03788.pdf 27 | 28 | Parameters 29 | ---------- 30 | truncation_level: int (default = 2) 31 | The level at which we truncate the infinite signature. 32 | 33 | log: bool (default=False) 34 | If True returns the log-signature (a compressed version of the path signature. 35 | Otherwise returns the path signature. 36 | 37 | basepoint: bool (default=False) 38 | If True, prepends each path with the zero vector. The default path signature is blind 39 | to translational shifts in the paths; use this flag if you care about path translations. 40 | """ 41 | 42 | def __init__( 43 | self, truncation_level: int = 2, log: bool = False, basepoint: bool = False 44 | ): 45 | try: 46 | global iisignature 47 | import iisignature as ii 48 | iisignature = ii 49 | except ImportError as err: 50 | from textwrap import dedent 51 | err.msg += dedent( 52 | """ 53 | 54 | A small bug with the install script of the iisignature makes it 55 | impossible to install into an environment where its Numpy dependency 56 | has not yet been installed. Thus, the Vectorizers library does not 57 | make it an explicit dependency. However, you may install this package 58 | yourself into this environment now, by running the command 59 | 60 | pip install iisignature 61 | 62 | The problem has been reported to the maintainers of iisignature, and 63 | this inconvenience will disappear in future releases. 64 | """ 65 | ) 66 | raise 67 | 68 | assert ( 69 | type(truncation_level) is int 70 | ), "Error: expecting int type for truncation_level." 71 | assert type(log) is bool, "Error: expecting bool type for log." 72 | assert type(basepoint) is bool, "Error: expecting bool type for basepoint" 73 | 74 | self.truncation_level = truncation_level 75 | self.log = log 76 | self.basepoint = basepoint 77 | 78 | def fit(self, X, y=None, **fit_params): 79 | """ 80 | Parameters 81 | ---------- 82 | X: np.array of shape (n_samples, path_len, path_dim) or list of np.arrays of shape (?, path_dim) 83 | The path data on which we fit the vectorizer. 84 | If paths are all the same length, then we can pass them to fit as a numpy array (n_samples, path_len, path_dim). 85 | If paths are varting length, then we can pass a list of length n_samples, where each entry is a numpy array 86 | with shape (path_len_i, path_dim). The path_dim should be consistent across the list, but the path length 87 | can vary/ 88 | """ 89 | if type(X) is np.ndarray: 90 | assert len(X.shape) == 3, NUMPY_SHAPE_ERROR_MSG 91 | # We have an array N x p x d of paths 92 | # all paths have the same length -> batch vectorize 93 | self.in_dim_ = X.shape[2] 94 | else: 95 | assert type(X) is list, "Error: Expecting numpy array or list of paths." 96 | assert ( 97 | type(X[0]) is np.ndarray 98 | ), "Error: Expecting list entries to be numpy arrays." 
99 | assert ( 100 | type(X[0]) is np.ndarray and len(X[0].shape) == 2 101 | ), LIST_SHAPE_ERROR_MSG 102 | # Accepts a list of paths with differing lengths 103 | self.in_dim_ = X[0].shape[1] 104 | 105 | if self.log: 106 | self.s_ = iisignature.prepare(self.in_dim_, self.truncation_level) 107 | self.out_dim_ = iisignature.logsiglength( 108 | self.in_dim_, self.truncation_level 109 | ) 110 | else: 111 | self.s_ = None 112 | self.out_dim_ = iisignature.siglength(self.in_dim_, self.truncation_level) 113 | 114 | def transform(self, X): 115 | """ 116 | Parameters 117 | ---------- 118 | X: np.array of shape (n_samples, path_len, path_dim) or list of np.arrays of shape (?, path_dim) 119 | The path data on which we fit the vectorizer. 120 | If paths are all the same length, then we can pass them to fit as a numpy array (n_samples, path_len, path_dim). 121 | If paths are varting length, then we can pass a list of length n_samples, where each entry is a numpy array 122 | with shape (path_len_i, path_dim). The path_dim should be consistent across the list, but the path length 123 | can vary. 124 | 125 | Returns 126 | ------- 127 | sigs: np.array of shape (n_samples, self.out_dim_) 128 | The array of signatures corresponding to the paths given in X, truncated at the truncation level specified 129 | at initialisation. 130 | 131 | """ 132 | check_is_fitted( 133 | self, 134 | [ 135 | "in_dim_", 136 | "out_dim_", 137 | "s_", 138 | ], 139 | ) 140 | 141 | if type(X) is np.ndarray: 142 | assert len(X.shape) == 3, NUMPY_SHAPE_ERROR_MSG 143 | # We have an array N x p x d of paths 144 | # all paths have the same length -> batch vectorize 145 | assert ( 146 | X.shape[2] == self.in_dim_ 147 | ), "Error: Expecting path_dim to be %d, got path_dim %d." % ( 148 | self.in_dim_, 149 | X.shape[2], 150 | ) 151 | if self.basepoint: 152 | basepoint = np.zeros((X.shape[0], 1, X.shape[2])) 153 | X = np.concatenate([basepoint, np.array(X)], axis=1) 154 | 155 | if self.log: 156 | v = iisignature.logsig(X, self.s_) 157 | else: 158 | v = iisignature.sig(X, self.truncation_level) 159 | else: 160 | # Accepts a list of paths with differing lengths 161 | assert type(X) is list, "Error: Expecting numpy array or list of paths." 162 | assert ( 163 | type(X[0]) is np.ndarray 164 | ), "Error: Expecting list entries to be numpy arrays." 165 | assert len(X[0].shape) == 2, LIST_SHAPE_ERROR_MSG 166 | assert ( 167 | X[0].shape[1] == self.in_dim_ 168 | ), "Error: Expecting path_dim to be %d, got path_dim %d." % ( 169 | self.in_dim_, 170 | X[0].shape[1], 171 | ) 172 | N = len(X) 173 | if self.basepoint: 174 | basepoint = np.zeros(shape=(1, self.in_dim_)) 175 | X = [np.concatenate([basepoint, x], axis=0) for x in X] 176 | 177 | if self.log: 178 | sig_vectorizer = lambda path: iisignature.logsig(path, self.s_) 179 | else: 180 | sig_vectorizer = lambda path: iisignature.sig( 181 | path, self.truncation_level 182 | ) 183 | 184 | v = np.empty(shape=(N, self.out_dim_)) 185 | 186 | for i, path in enumerate(X): 187 | assert ( 188 | path.shape[-1] == self.in_dim_ 189 | ), "Error: Not all paths share the same dimension." 
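                # Store this path's truncated (log-)signature as row i of the output array.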
190 | v[i] = sig_vectorizer(path) 191 | 192 | return v 193 | 194 | def fit_transform(self, X, y=None, **fit_params): 195 | self.fit(X, y, **fit_params) 196 | return self.transform(X) 197 | -------------------------------------------------------------------------------- /vectorizers/tests/__init__.py: -------------------------------------------------------------------------------- 1 | raw_string_data = [ 2 | "asdfj;afoosdaflksapokwerfoobarpokwersdfsadfsadfnbkajyfoopokwer", 3 | "pokfoo;ohnASDbarfoobarpoksdf sgn;asregtjpoksdfpokpokwer", 4 | "werqweoijsdcasdfpoktrfoobarpokqwernasdfasdpokpokpok", 5 | "pokwerpokwqerpokwersadfpokqwepokwerpokpok", 6 | "foobarfoofooasdfsdfgasdffoobarbazcabfoobarbarbazfoobaz", 7 | "pokfoopokbarpokwerpokbazgfniusnvbgasgbabgsadfjnkr[pko", 8 | ] 9 | -------------------------------------------------------------------------------- /vectorizers/tests/test_bpe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest # noqa 3 | 4 | from vectorizers import BytePairEncodingVectorizer 5 | from vectorizers import NgramVectorizer 6 | from vectorizers.mixed_gram_vectorizer import to_unicode 7 | 8 | from . import raw_string_data 9 | 10 | 11 | def test_bpe_vectorizer_basic(): 12 | bpe = BytePairEncodingVectorizer() 13 | result1 = bpe.fit_transform(raw_string_data) 14 | result2 = bpe.transform(raw_string_data) 15 | assert np.allclose(result1.toarray(), result2.toarray()) 16 | 17 | 18 | def test_bpe_tokens_ngram_matches(): 19 | bpe1 = BytePairEncodingVectorizer(return_type="matrix") 20 | bpe2 = BytePairEncodingVectorizer(return_type="tokens") 21 | 22 | result1 = bpe1.fit_transform(raw_string_data) 23 | token_dictionary = { 24 | to_unicode(code, bpe1.tokens_, bpe1.max_char_code_): n 25 | for code, n in bpe1.column_label_dictionary_.items() 26 | } 27 | 28 | tokens = bpe2.fit_transform(raw_string_data) 29 | result2 = NgramVectorizer(token_dictionary=token_dictionary).fit_transform(tokens) 30 | 31 | assert np.allclose(result1.toarray(), result2.toarray()) 32 | 33 | 34 | def test_bpe_bad_params(): 35 | with pytest.raises(ValueError): 36 | bpe = BytePairEncodingVectorizer(max_vocab_size=-1) 37 | bpe.fit(raw_string_data) 38 | 39 | with pytest.raises(ValueError): 40 | bpe = BytePairEncodingVectorizer(min_token_occurrence=-1) 41 | bpe.fit(raw_string_data) 42 | 43 | with pytest.raises(ValueError): 44 | bpe = BytePairEncodingVectorizer(return_type=-1) 45 | bpe.fit(raw_string_data) 46 | 47 | with pytest.raises(ValueError): 48 | bpe = BytePairEncodingVectorizer(return_type="nonsense") 49 | bpe.fit(raw_string_data) 50 | 51 | 52 | def test_bpe_trash_token(): 53 | bpe = BytePairEncodingVectorizer(return_type="sequences").fit(raw_string_data) 54 | tokenized_no_trash = bpe.transform(raw_string_data) 55 | assert len(tokenized_no_trash) == len(raw_string_data) 56 | assert not any(0 in tokens for tokens in tokenized_no_trash) 57 | tokenized_with_trash = bpe.transform(["asdf{qwer"]) 58 | assert len(tokenized_with_trash) == 1 59 | assert 0 in tokenized_with_trash[0] 60 | 61 | 62 | def test_bpe_set_max_char_code(): 63 | MCC = 65535 64 | bpe = BytePairEncodingVectorizer( 65 | max_char_code=MCC, 66 | return_type="sequences" 67 | ).fit(raw_string_data) 68 | tokens = bpe.transform(raw_string_data) 69 | largest_char = max(max(ord(c) for c in s) for s in raw_string_data) 70 | assert largest_char < 126 71 | assert all( 72 | all( 73 | token <= largest_char or token > MCC 74 | for token in seq 75 | ) 76 | for seq in tokens 77 | ) 78 | 
tokens_strange = bpe.transform([chr(126) + chr(2000) + chr(60000)]) 79 | assert 1 == len(tokens_strange) 80 | assert np.all([126, 2000, 60000] == tokens_strange[0]) 81 | 82 | 83 | def test_bpe_set_max_char_code_too_low(): 84 | bpe = BytePairEncodingVectorizer(max_char_code=50).fit(raw_string_data) 85 | assert max(max(ord(c) for c in s) for s in raw_string_data) == bpe.max_char_code_ 86 | 87 | 88 | @pytest.mark.parametrize( 89 | "name,max_expected", 90 | [ 91 | ("ascii", 127), 92 | ("common", 2047), 93 | ("bmp", 65535), 94 | ("unicode", 1_114_111), 95 | ] 96 | ) 97 | def test_bpe_max_char_code_limits(name, max_expected): 98 | assert max_expected == BytePairEncodingVectorizer( 99 | max_char_code=name 100 | ).fit(raw_string_data).max_char_code_ 101 | 102 | 103 | def test_bpe_max_char_code_limit_wrong(): 104 | with pytest.raises(ValueError): 105 | BytePairEncodingVectorizer(max_char_code="utf8").fit(raw_string_data) 106 | 107 | 108 | def test_bpe_contract_pair_single_token_training(): 109 | seqs_tokens = BytePairEncodingVectorizer(return_type="tokens").fit_transform([ 110 | "asdfqwerty", 111 | "asdf", 112 | "qwzxasdfcv" 113 | ]) 114 | assert [ 115 | ["asdf", "qw", "e", "r", "t", "y"], 116 | ["asdf"], 117 | ["qw", "z", "x", "asdf", "c", "v"], 118 | ] == seqs_tokens 119 | 120 | 121 | def test_bpe_contract_pair_single_token_inference(): 122 | bpe = BytePairEncodingVectorizer(return_type="tokens").fit([ 123 | "asdfqwerty", 124 | "asdfg", 125 | "qwzxasdfcv", 126 | ]) 127 | assert [["asdf"]] == bpe.transform(["asdf"]) 128 | -------------------------------------------------------------------------------- /vectorizers/tests/test_distances.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | import scipy.sparse 5 | from sklearn.preprocessing import normalize 6 | 7 | from vectorizers.distances import hellinger, sparse_hellinger 8 | from vectorizers.distances import total_variation, sparse_total_variation 9 | from vectorizers.distances import ( 10 | jensen_shannon_divergence, 11 | sparse_jensen_shannon_divergence, 12 | ) 13 | 14 | 15 | def test_hellinger(): 16 | assert hellinger(np.array([0.0, 0.0]), np.array([0.0, 0.0])) == 0.0 17 | assert hellinger(np.array([0.0, 0.0]), np.array([1.0, 0.0])) == 1.0 18 | assert hellinger(np.array([0.5, 0.5]), np.array([0.5, 0.5])) == 0.0 19 | assert hellinger(np.array([0.5, 0.5]), np.array([1.0, 0.0])) == 0.5411961001461969 20 | assert hellinger(np.array([0.1, 0.9]), np.array([1.0, 0.0])) == 0.8269052146305295 21 | 22 | 23 | def test_sparse_hellinger(): 24 | assert np.isclose( 25 | sparse_hellinger( 26 | np.array([7, 12]), 27 | np.array([0.0, 0.0]), 28 | np.array([8, 13]), 29 | np.array([0.0, 0.0]), 30 | ), 31 | 0.0, 32 | ) 33 | assert np.isclose( 34 | sparse_hellinger( 35 | np.array([7, 12]), 36 | np.array([0.0, 0.0]), 37 | np.array([8, 13]), 38 | np.array([1.0, 0.0]), 39 | ), 40 | 1.0, 41 | ) 42 | assert np.isclose( 43 | sparse_hellinger( 44 | np.array([7, 12]), 45 | np.array([0.5, 0.5]), 46 | np.array([7, 12]), 47 | np.array([0.5, 0.5]), 48 | ), 49 | 0.0, 50 | ) 51 | assert np.isclose( 52 | sparse_hellinger( 53 | np.array([7, 12]), 54 | np.array([0.5, 0.5]), 55 | np.array([7, 12]), 56 | np.array([1.0, 0.0]), 57 | ), 58 | 0.5411961001461969, 59 | ) 60 | assert np.isclose( 61 | sparse_hellinger( 62 | np.array([7, 12]), 63 | np.array([0.1, 0.9]), 64 | np.array([7, 12]), 65 | np.array([1.0, 0.0]), 66 | ), 67 | 0.8269052146305295, 68 | ) 69 | 70 | 71 | # Test using inequalities with Hellinger 
distance from Wikipedia 72 | # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance 73 | def test_total_variation(): 74 | test_data = np.random.random(size=(10, 50)) 75 | test_data = normalize(test_data, norm="l1") 76 | for i in range(test_data.shape[0]): 77 | for j in range(i + 1, test_data.shape[0]): 78 | hd = hellinger(test_data[i], test_data[j]) 79 | tvd = total_variation(test_data[i], test_data[j]) 80 | assert hd ** 2 <= tvd 81 | assert tvd <= np.sqrt(2) * hd 82 | 83 | 84 | # Test using inequalities with Hellinger distance from Wikipedia 85 | # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance 86 | def test_sparse_total_variation(): 87 | test_data = np.random.random(size=(10, 100)) 88 | # sparsify 89 | test_data[test_data <= 0.5] = 0.0 90 | test_data = scipy.sparse.csr_matrix(test_data) 91 | test_data = normalize(test_data, norm="l1") 92 | 93 | for i in range(test_data.shape[0]): 94 | for j in range(i + 1, test_data.shape[0]): 95 | hd = sparse_hellinger( 96 | test_data[i].indices, 97 | test_data[i].data, 98 | test_data[j].indices, 99 | test_data[j].data, 100 | ) 101 | tvd = sparse_total_variation( 102 | test_data[i].indices, 103 | test_data[i].data, 104 | test_data[j].indices, 105 | test_data[j].data, 106 | ) 107 | assert hd ** 2 <= tvd 108 | assert tvd <= np.sqrt(2) * hd 109 | 110 | 111 | def test_jensen_shannon(): 112 | test_data = np.random.random(size=(10, 50)) 113 | test_data = normalize(test_data, norm="l1") 114 | for i in range(test_data.shape[0]): 115 | for j in range(i + 1, test_data.shape[0]): 116 | m = (test_data[i] + test_data[j]) / 2.0 117 | p = test_data[i] 118 | q = test_data[j] 119 | d = ( 120 | -np.sum(m * np.log(m)) 121 | + (np.sum(p * np.log(p)) + np.sum(q * np.log(q))) / 2.0 122 | ) 123 | assert np.isclose(d, jensen_shannon_divergence(p, q)) 124 | 125 | 126 | def test_sparse_jensen_shannon(): 127 | test_data = np.random.random(size=(10, 100)) 128 | # sparsify 129 | test_data[test_data <= 0.5] = 0.0 130 | sparse_test_data = scipy.sparse.csr_matrix(test_data) 131 | sparse_test_data = normalize(sparse_test_data, norm="l1") 132 | test_data = normalize(test_data, norm="l1") 133 | 134 | for i in range(test_data.shape[0]): 135 | for j in range(i + 1, test_data.shape[0]): 136 | m = (test_data[i] + test_data[j]) / 2.0 137 | p = test_data[i] 138 | q = test_data[j] 139 | d = ( 140 | -np.sum(m[m > 0] * np.log(m[m > 0])) 141 | + ( 142 | np.sum(p[p > 0] * np.log(p[p > 0])) 143 | + np.sum(q[q > 0] * np.log(q[q > 0])) 144 | ) 145 | / 2.0 146 | ) 147 | assert np.isclose( 148 | d, 149 | sparse_jensen_shannon_divergence( 150 | sparse_test_data[i].indices, 151 | sparse_test_data[i].data, 152 | sparse_test_data[j].indices, 153 | sparse_test_data[j].data, 154 | ), 155 | ) 156 | -------------------------------------------------------------------------------- /vectorizers/tests/test_edge_list_vectorizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vectorizers import EdgeListVectorizer 4 | 5 | # from vectorizers.edge_list_vectorizer import read_edge_data 6 | import numpy as np 7 | import pandas as pd 8 | 9 | rows = np.array(["a", "b", "c", "d", "d"]) 10 | cols = np.array(["b", "c", "d", "b", "c"]) 11 | vals = np.array([1, 2, 3, 4, 8]) 12 | test_data = (rows, cols, vals) 13 | list_of_edges = [ 14 | ["a", "b", 1], 15 | ["b", "c", 2], 16 | ["c", "d", 3], 17 | ["d", "b", 4], 18 | ["d", "c", 8], 19 | ] 20 | df_of_edges = pd.DataFrame({"r": rows, 
"c": cols, "v": vals}) 21 | 22 | 23 | # Tuple or list of columns, data frame, list of edges 24 | @pytest.mark.parametrize( 25 | "data", [(rows, cols, vals), [rows, cols, vals], list_of_edges, df_of_edges] 26 | ) 27 | def test_edgelist_input(data): 28 | model = EdgeListVectorizer().fit(data) 29 | result = model.transform(data) 30 | result1 = EdgeListVectorizer().fit_transform(data) 31 | assert np.allclose(result.toarray(), result1.toarray()) 32 | assert result.shape == (4, 3) 33 | assert np.allclose(result.toarray()[:, 1], np.array([0, 2, 0, 8])) 34 | 35 | 36 | def test_edgelist_specified_rows(): 37 | row_dict = {"a": 0, "d": 1} 38 | result = EdgeListVectorizer(row_label_dictionary=row_dict).fit_transform(test_data) 39 | assert result.shape == (2, 3) 40 | assert np.allclose(result.toarray()[1, :], np.array([4, 8, 0])) 41 | 42 | 43 | def test_edgelist_specified_columns(): 44 | column_dict = {"b": 0, "c": 1} 45 | result = EdgeListVectorizer(column_label_dictionary=column_dict).fit_transform( 46 | test_data 47 | ) 48 | assert result.shape == (4, 2) 49 | assert np.allclose(result.toarray()[:, 1], np.array([0, 2, 0, 8])) 50 | 51 | 52 | def test_edgelist_specified_rows_missing_index(): 53 | row_dict = {"a": 2, "d": 4} 54 | result = EdgeListVectorizer(row_label_dictionary=row_dict).fit_transform(test_data) 55 | assert result.shape == (5, 3) 56 | assert np.allclose(result.toarray()[:, 0], np.array([0, 0, 1, 0, 4])) 57 | 58 | 59 | def test_edgelist_specified_column_missing_index(): 60 | column_dict = {"b": 2, "c": 4} 61 | result = EdgeListVectorizer(column_label_dictionary=column_dict).fit_transform( 62 | test_data 63 | ) 64 | assert result.shape == (4, 5) 65 | assert np.allclose(result.toarray()[:, 4], np.array([0, 2, 0, 8])) 66 | 67 | 68 | # TODO: Write a unit test for joint_space=True 69 | -------------------------------------------------------------------------------- /vectorizers/tests/test_signature_vectorizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vectorizers import SignatureVectorizer 4 | 5 | import numpy as np 6 | iisignature = pytest.importorskip("iisignature") 7 | import re 8 | 9 | NUMPY_SHAPE_ERROR_MSG = """ 10 | Error: SignatureVectorizer expects numpy arrays to be of shape (num_samples x path_len x path_dim). 11 | """ 12 | LIST_SHAPE_ERROR_MSG = """ 13 | Error: Expecting list entries to be numpy arrays of shape (path_len x path_dim). 
14 | """ 15 | 16 | # Check numpy and list vectorizers return the same output as iisignature 17 | @pytest.mark.parametrize("truncation_level", [2, 3, 5]) 18 | @pytest.mark.parametrize("log", [True, False]) 19 | @pytest.mark.parametrize("basepoint", [True, False]) 20 | def test_numpy_vs_list_vs_iisig(truncation_level, log, basepoint, seed=1): 21 | 22 | n_paths = 100 23 | path_len = 50 24 | path_dim = 5 25 | 26 | np.random.seed(seed) 27 | test_paths_list = [ 28 | np.random.normal(size=(path_len, path_dim)) for i in range(n_paths) 29 | ] 30 | test_paths_numpy = np.array(test_paths_list) 31 | 32 | sigs_numpy = SignatureVectorizer( 33 | truncation_level=truncation_level, log=log, basepoint=basepoint 34 | ).fit_transform(test_paths_numpy) 35 | sigs_list = SignatureVectorizer( 36 | truncation_level=truncation_level, log=log, basepoint=basepoint 37 | ).fit_transform(test_paths_list) 38 | 39 | if basepoint: 40 | concat_shape = (test_paths_numpy.shape[0], 1, test_paths_numpy.shape[2]) 41 | X = np.concatenate([np.zeros(shape=concat_shape), test_paths_numpy], axis=1) 42 | else: 43 | X = test_paths_numpy 44 | 45 | if log: 46 | s = iisignature.prepare(X.shape[-1], truncation_level) 47 | sigs_iisig = iisignature.logsig(X, s) 48 | else: 49 | sigs_iisig = iisignature.sig(X, truncation_level) 50 | assert np.all(np.isclose(sigs_numpy, sigs_list)) 51 | assert np.all(np.isclose(sigs_list, sigs_iisig)) 52 | 53 | 54 | # Check bad initialisation returns appropriate error messages 55 | def test_bad_init_params(): 56 | with pytest.raises( 57 | AssertionError, match="Error: expecting int type for truncation_level." 58 | ): 59 | vectorizer = SignatureVectorizer(truncation_level="three") 60 | 61 | with pytest.raises(AssertionError, match="Error: expecting bool type for log."): 62 | vectorizer = SignatureVectorizer(log=1) 63 | 64 | with pytest.raises( 65 | AssertionError, match="Error: expecting bool type for basepoint" 66 | ): 67 | vectorizer = SignatureVectorizer(basepoint=np.zeros(10)) 68 | 69 | 70 | # Check bad fit returns appropriate error messages 71 | def test_bad_fit_params(): 72 | 73 | vectorizer = SignatureVectorizer() 74 | 75 | with pytest.raises(AssertionError, match=re.escape(NUMPY_SHAPE_ERROR_MSG)): 76 | vectorizer.fit(np.random.random(size=(2, 10, 3, 5))) 77 | 78 | with pytest.raises(AssertionError, match=re.escape(NUMPY_SHAPE_ERROR_MSG)): 79 | vectorizer.fit(np.random.random(size=(2, 10))) 80 | 81 | with pytest.raises( 82 | AssertionError, match="Error: Expecting numpy array or list of paths." 83 | ): 84 | vectorizer.fit("Not a list or numpy array") 85 | 86 | with pytest.raises( 87 | AssertionError, match="Error: Expecting list entries to be numpy arrays." 88 | ): 89 | vectorizer.fit(["List", "of", "nonsense"]) 90 | 91 | with pytest.raises(AssertionError, match=re.escape(LIST_SHAPE_ERROR_MSG)): 92 | vectorizer.fit([np.random.random(size=(3, 10, 5))]) 93 | 94 | 95 | # Check bad transform returns appropriate error messages 96 | def test_bad_transform_parameters(): 97 | 98 | vectorizer = SignatureVectorizer() 99 | vectorizer.fit(np.random.random(size=(20, 50, 3))) 100 | with pytest.raises(AssertionError, match=re.escape(NUMPY_SHAPE_ERROR_MSG)): 101 | vectorizer.transform(np.random.random(size=(2, 10, 3, 5))) 102 | 103 | with pytest.raises(AssertionError, match=re.escape(NUMPY_SHAPE_ERROR_MSG)): 104 | vectorizer.transform(np.random.random(size=(2, 10))) 105 | 106 | with pytest.raises( 107 | AssertionError, match="Error: Expecting numpy array or list of paths." 
108 | ): 109 | vectorizer.transform("Not a list or numpy array") 110 | 111 | with pytest.raises( 112 | AssertionError, match="Error: Expecting list entries to be numpy arrays." 113 | ): 114 | vectorizer.transform(["List", "of", "nonsense"]) 115 | 116 | with pytest.raises(AssertionError, match=re.escape(LIST_SHAPE_ERROR_MSG)): 117 | vectorizer.transform([np.random.random(size=(3, 10, 5))]) 118 | 119 | # Mismatch from fit shape 120 | with pytest.raises(AssertionError, match="Error: Expecting path_dim to be"): 121 | vectorizer.transform([np.random.random(size=(50, 5))]) 122 | with pytest.raises(AssertionError, match="Error: Expecting path_dim to be "): 123 | vectorizer.transform(np.random.random(size=(30, 50, 5))) 124 | with pytest.raises( 125 | AssertionError, match="Error: Not all paths share the same dimension." 126 | ): 127 | vectorizer.transform( 128 | [ 129 | np.random.random(size=(10, 3)), 130 | np.random.random(size=(10, 3)), 131 | np.random.random(size=(10, 5)), 132 | np.random.random(size=(10, 3)), 133 | ] 134 | ) 135 | -------------------------------------------------------------------------------- /vectorizers/tests/test_template.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | 4 | # from sklearn.datasets import load_iris 5 | # from sklearn.utils.testing import assert_array_equal 6 | # from sklearn.utils.testing import assert_allclose 7 | # 8 | # from vectorizers import TemplateEstimator 9 | # from vectorizers import TemplateTransformer 10 | # from vectorizers import TemplateClassifier 11 | # 12 | # 13 | # @pytest.fixture 14 | # def data(): 15 | # return load_iris(return_X_y=True) 16 | # 17 | # def test_template_estimator(data): 18 | # est = TemplateEstimator() 19 | # assert est.demo_param == 'demo_param' 20 | # 21 | # est.fit(*data) 22 | # assert hasattr(est, 'is_fitted_') 23 | # 24 | # X = data[0] 25 | # y_pred = est.predict(X) 26 | # assert_array_equal(y_pred, np.ones(X.shape[0], dtype=np.int64)) 27 | # 28 | # 29 | # def test_template_transformer_error(data): 30 | # X, y = data 31 | # trans = TemplateTransformer() 32 | # trans.fit(X) 33 | # with pytest.raises(ValueError, match="Shape of input is different"): 34 | # X_diff_size = np.ones((10, X.shape[1] + 1)) 35 | # trans.transform(X_diff_size) 36 | # 37 | # 38 | # def test_template_transformer(data): 39 | # X, y = data 40 | # trans = TemplateTransformer() 41 | # assert trans.demo_param == 'demo' 42 | # 43 | # trans.fit(X) 44 | # assert trans.n_features_ == X.shape[1] 45 | # 46 | # X_trans = trans.transform(X) 47 | # assert_allclose(X_trans, np.sqrt(X)) 48 | # 49 | # X_trans = trans.fit_transform(X) 50 | # assert_allclose(X_trans, np.sqrt(X)) 51 | # 52 | # 53 | # def test_template_classifier(data): 54 | # X, y = data 55 | # clf = TemplateClassifier() 56 | # assert clf.demo_param == 'demo' 57 | # 58 | # clf.fit(X, y) 59 | # assert hasattr(clf, 'classes_') 60 | # assert hasattr(clf, 'X_') 61 | # assert hasattr(clf, 'y_') 62 | # 63 | # y_pred = clf.predict(X) 64 | # assert y_pred.shape == (X.shape[0],) 65 | -------------------------------------------------------------------------------- /vectorizers/tests/test_transformers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vectorizers.transformers import ( 4 | RowDenoisingTransformer, 5 | InformationWeightTransformer, 6 | CategoricalColumnTransformer, 7 | CountFeatureCompressionTransformer, 8 | SlidingWindowTransformer, 9 | 
SequentialDifferenceTransformer, 10 | sliding_window_generator, 11 | ) 12 | import numpy as np 13 | import scipy.sparse 14 | import pandas as pd 15 | import numba 16 | 17 | test_matrix = scipy.sparse.csr_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 18 | test_matrix_zero_row = scipy.sparse.csr_matrix([[1, 2, 3], [4, 5, 6], [0, 0, 0]]) 19 | test_matrix_zero_row.eliminate_zeros() 20 | test_matrix_zero_column = scipy.sparse.csr_matrix([[1, 2, 0], [4, 5, 0], [7, 8, 0]]) 21 | test_matrix_zero_column.eliminate_zeros() 22 | 23 | test_df = pd.DataFrame( 24 | { 25 | "id": ["one", "two", "one", "two"], 26 | "A": ["foo", "bar", "pok", "bar"], 27 | "B": ["x", "k", "c", "d"], 28 | } 29 | ) 30 | 31 | test_time_series = [ 32 | np.random.random(size=23), 33 | np.random.random(size=56), 34 | np.random.random(size=71), 35 | np.random.random(size=64), 36 | np.random.random(size=35), 37 | np.random.random(size=44), 38 | ] 39 | 40 | changepoint_position = np.random.randint(11, 100) # changepoint position must be at least window_width in 41 | changepoint_sequence = np.random.poisson(0.75, size=100) 42 | changepoint_sequence[changepoint_position] = 10 43 | 44 | 45 | @pytest.mark.parametrize("include_column_name", [True, False]) 46 | @pytest.mark.parametrize("unique_values", [True, False]) 47 | def test_CategoricalColumnTransformer(include_column_name, unique_values): 48 | result = CategoricalColumnTransformer( 49 | object_column_name="id", 50 | descriptor_column_name="A", 51 | include_column_name=include_column_name, 52 | unique_values=unique_values, 53 | ).fit_transform(test_df) 54 | 55 | if include_column_name: 56 | if unique_values: 57 | expected_result = pd.Series( 58 | [["A:foo", "A:pok"], ["A:bar"]], index=["one", "two"] 59 | ) 60 | else: 61 | expected_result = pd.Series( 62 | [["A:foo", "A:pok"], ["A:bar", "A:bar"]], index=["one", "two"] 63 | ) 64 | else: 65 | if unique_values: 66 | expected_result = pd.Series([["foo", "pok"], ["bar"]], index=["one", "two"]) 67 | else: 68 | expected_result = pd.Series( 69 | [["foo", "pok"], ["bar", "bar"]], index=["one", "two"] 70 | ) 71 | assert (result == expected_result).all() 72 | 73 | 74 | @pytest.mark.parametrize("include_column_name", [True, False]) 75 | @pytest.mark.parametrize("unique_values", [True, False]) 76 | def test_CategoricalColumnTransformer_multi_column(include_column_name, unique_values): 77 | result = CategoricalColumnTransformer( 78 | object_column_name="id", 79 | descriptor_column_name=["A", "B"], 80 | include_column_name=include_column_name, 81 | unique_values=unique_values, 82 | ).fit_transform(test_df) 83 | 84 | if include_column_name: 85 | if unique_values: 86 | expected_result = pd.Series( 87 | [["A:foo", "A:pok", "B:x", "B:c"], ["A:bar", "B:k", "B:d"]], 88 | index=["one", "two"], 89 | ) 90 | else: 91 | expected_result = pd.Series( 92 | [["A:foo", "A:pok", "B:x", "B:c"], ["A:bar", "A:bar", "B:k", "B:d"]], 93 | index=["one", "two"], 94 | ) 95 | else: 96 | if unique_values: 97 | expected_result = pd.Series( 98 | [["foo", "pok", "x", "c"], ["bar", "k", "d"]], index=["one", "two"] 99 | ) 100 | else: 101 | expected_result = pd.Series( 102 | [["foo", "pok", "x", "c"], ["bar", "bar", "k", "d"]], 103 | index=["one", "two"], 104 | ) 105 | assert (result == expected_result).all() 106 | 107 | 108 | def test_CategoricalColumnTransformer_bad_param(): 109 | with pytest.raises(ValueError): 110 | CategoricalColumnTransformer( 111 | object_column_name="id", 112 | descriptor_column_name=["A", "BAD"], 113 | ).fit_transform(test_df) 114 | 115 | 116 | 
@pytest.mark.parametrize("em_precision", [1e-3, 1e-4]) 117 | @pytest.mark.parametrize("em_background_prior", [0.1, 10.0]) 118 | @pytest.mark.parametrize("em_threshold", [1e-4, 1e-5]) 119 | @pytest.mark.parametrize("em_prior_strength", [1.0, 10.0]) 120 | @pytest.mark.parametrize("normalize", [True, False]) 121 | def test_re_transformer( 122 | em_precision, 123 | em_background_prior, 124 | em_threshold, 125 | em_prior_strength, 126 | normalize, 127 | ): 128 | RET = RowDenoisingTransformer( 129 | em_precision=em_precision, 130 | em_background_prior=em_background_prior, 131 | em_threshold=em_threshold, 132 | em_prior_strength=em_prior_strength, 133 | normalize=normalize, 134 | ) 135 | result = RET.fit_transform(test_matrix) 136 | transform = RET.transform(test_matrix) 137 | assert np.allclose(result.toarray(), transform.toarray()) 138 | 139 | 140 | @pytest.mark.parametrize("em_precision", [1e-3, 1e-4]) 141 | @pytest.mark.parametrize("em_background_prior", [0.1, 10.0]) 142 | @pytest.mark.parametrize("em_threshold", [1e-4, 1e-5]) 143 | @pytest.mark.parametrize("em_prior_strength", [1.0, 10.0]) 144 | @pytest.mark.parametrize("normalize", [True, False]) 145 | def test_re_transformer_zero_column( 146 | em_precision, 147 | em_background_prior, 148 | em_threshold, 149 | em_prior_strength, 150 | normalize, 151 | ): 152 | RET = RowDenoisingTransformer( 153 | em_precision=em_precision, 154 | em_background_prior=em_background_prior, 155 | em_threshold=em_threshold, 156 | em_prior_strength=em_prior_strength, 157 | normalize=normalize, 158 | ) 159 | result = RET.fit_transform(test_matrix_zero_column) 160 | transform = RET.transform(test_matrix_zero_column) 161 | assert np.allclose(result.toarray(), transform.toarray()) 162 | 163 | 164 | @pytest.mark.parametrize("em_precision", [1e-3, 1e-4]) 165 | @pytest.mark.parametrize("em_background_prior", [0.1, 10.0]) 166 | @pytest.mark.parametrize("em_threshold", [1e-4, 1e-5]) 167 | @pytest.mark.parametrize("em_prior_strength", [1.0, 10.0]) 168 | @pytest.mark.parametrize("normalize", [True, False]) 169 | def test_re_transformer_zero_row( 170 | em_precision, 171 | em_background_prior, 172 | em_threshold, 173 | em_prior_strength, 174 | normalize, 175 | ): 176 | RET = RowDenoisingTransformer( 177 | em_precision=em_precision, 178 | em_background_prior=em_background_prior, 179 | em_threshold=em_threshold, 180 | em_prior_strength=em_prior_strength, 181 | normalize=normalize, 182 | ) 183 | result = RET.fit_transform(test_matrix_zero_row) 184 | transform = RET.transform(test_matrix_zero_row) 185 | assert np.allclose(result.toarray(), transform.toarray()) 186 | 187 | 188 | @pytest.mark.parametrize("prior_strength", [0.1, 1.0]) 189 | @pytest.mark.parametrize("approx_prior", [True, False]) 190 | def test_iw_transformer(prior_strength, approx_prior): 191 | IWT = InformationWeightTransformer( 192 | prior_strength=prior_strength, 193 | approx_prior=approx_prior, 194 | ) 195 | result = IWT.fit_transform(test_matrix) 196 | transform = IWT.transform(test_matrix) 197 | assert np.allclose(result.toarray(), transform.toarray()) 198 | 199 | 200 | @pytest.mark.parametrize("prior_strength", [0.1, 1.0]) 201 | @pytest.mark.parametrize("approx_prior", [True, False]) 202 | def test_iw_transformer_supervised(prior_strength, approx_prior): 203 | IWT = InformationWeightTransformer( 204 | prior_strength=prior_strength, 205 | approx_prior=approx_prior, 206 | ) 207 | result = IWT.fit_transform(test_matrix, np.array([0, 1, 1])) 208 | transform = IWT.transform(test_matrix) 209 | assert 
np.allclose(result.toarray(), transform.toarray()) 210 | 211 | 212 | @pytest.mark.parametrize("prior_strength", [0.1, 1.0]) 213 | @pytest.mark.parametrize("approx_prior", [True, False]) 214 | def test_iw_transformer_zero_column(prior_strength, approx_prior): 215 | IWT = InformationWeightTransformer( 216 | prior_strength=prior_strength, 217 | approx_prior=approx_prior, 218 | ) 219 | result = IWT.fit_transform(test_matrix_zero_column) 220 | transform = IWT.transform(test_matrix_zero_column) 221 | assert np.allclose(result.toarray(), transform.toarray()) 222 | 223 | 224 | @pytest.mark.parametrize("prior_strength", [0.1, 1.0]) 225 | @pytest.mark.parametrize("approx_prior", [True, False]) 226 | def test_iw_transformer_zero_row(prior_strength, approx_prior): 227 | IWT = InformationWeightTransformer( 228 | prior_strength=prior_strength, 229 | approx_prior=approx_prior, 230 | ) 231 | result = IWT.fit_transform(test_matrix_zero_row) 232 | transform = IWT.transform(test_matrix_zero_row) 233 | assert np.allclose(result.toarray(), transform.toarray()) 234 | 235 | 236 | @pytest.mark.parametrize("algorithm", ["randomized", "arpack"]) 237 | def test_count_feature_compression_basic(algorithm): 238 | cfc = CountFeatureCompressionTransformer(n_components=2, algorithm=algorithm) 239 | result = cfc.fit_transform(test_matrix) 240 | transform = cfc.transform(test_matrix) 241 | assert np.allclose(result, transform) 242 | 243 | 244 | @pytest.mark.parametrize("algorithm", ["randomized", "arpack"]) 245 | def test_count_feature_compression_fit_transform_is_fit_and_transform(algorithm): 246 | make_cfc = lambda: CountFeatureCompressionTransformer(n_components=2, algorithm=algorithm) 247 | cfc_fit = make_cfc().fit(test_matrix) 248 | assert np.allclose(cfc_fit.transform(test_matrix), make_cfc().fit_transform(test_matrix)) 249 | 250 | 251 | def test_count_feature_compression_warns(): 252 | cfc = CountFeatureCompressionTransformer(n_components=5) 253 | with pytest.warns(UserWarning): 254 | result = cfc.fit_transform(test_matrix) 255 | 256 | 257 | def test_count_feature_compression_bad_input(): 258 | cfc = CountFeatureCompressionTransformer(n_components=2) 259 | with pytest.raises(ValueError): 260 | result = cfc.fit_transform(-test_matrix) 261 | 262 | with pytest.raises(ValueError): 263 | result = cfc.fit_transform(-test_matrix.toarray()) 264 | 265 | cfc = CountFeatureCompressionTransformer(n_components=2, algorithm="bad_value") 266 | with pytest.raises(ValueError): 267 | result = cfc.fit_transform(test_matrix) 268 | 269 | 270 | @pytest.mark.parametrize("pad_width", [0, 1]) 271 | @pytest.mark.parametrize( 272 | "kernel", 273 | [ 274 | "average", 275 | ("differences", 0, 1, 1), 276 | ("position_velocity", 2, 1, 1), 277 | ("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])), 278 | ("gaussian_weight", 2), 279 | np.random.random((5, 5)), 280 | numba.njit(lambda x: x.cumsum()), 281 | ], 282 | ) 283 | @pytest.mark.parametrize("sample", [None, (0, 1), np.arange(5), [4, 1, 3, 2, 0]]) 284 | def test_sliding_window_transformer_basic(pad_width, kernel, sample): 285 | swt = SlidingWindowTransformer( 286 | window_width=5, pad_width=pad_width, kernels=[kernel], window_sample=sample 287 | ) 288 | result = swt.fit_transform(test_time_series) 289 | transform = swt.transform(test_time_series) 290 | for i, point_cloud in enumerate(result): 291 | for j, point in enumerate(point_cloud): 292 | assert np.allclose(point, transform[i][j]) 293 | 294 | 295 | @pytest.mark.parametrize("pad_width", [0, 1]) 296 | @pytest.mark.parametrize( 297 | "kernel", 
298 | [ 299 | "average", 300 | ("differences", 0, 1, 1), 301 | ("position_velocity", 2, 1, 1), 302 | ("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])), 303 | ("gaussian_weight", 2), 304 | np.random.random((5, 5)), 305 | numba.njit(lambda x: x.cumsum(), cache=True), 306 | ], 307 | ) 308 | @pytest.mark.parametrize("sample", [None, np.arange(5), [4, 1, 3, 2, 0]]) 309 | def test_sliding_window_generator_matches_transformer(pad_width, kernel, sample): 310 | swt = SlidingWindowTransformer( 311 | window_width=5, pad_width=pad_width, kernels=[kernel], window_sample=sample 312 | ) 313 | transformer_result = swt.fit_transform(test_time_series) 314 | test_window = ( 315 | None 316 | if not callable(kernel) 317 | else np.asarray(test_time_series[0])[: swt.window_width][swt.window_sample_] 318 | ) 319 | generator_result = list( 320 | sliding_window_generator( 321 | test_time_series, 322 | test_time_series[0].shape, 323 | window_width=5, 324 | pad_width=pad_width, 325 | kernels=[kernel], 326 | window_sample=sample, 327 | test_window=test_window, 328 | ) 329 | ) 330 | for i, point_cloud in enumerate(transformer_result): 331 | for j, point in enumerate(point_cloud): 332 | assert np.allclose(point, generator_result[i][j]) 333 | 334 | @pytest.mark.parametrize("window_width", [5, 10]) 335 | def test_sliding_window_count_changepoint(window_width): 336 | swt = SlidingWindowTransformer( 337 | window_width=window_width, kernels=[("count_changepoint", 1.0, 2.0)], 338 | ) 339 | changepoint_scores = swt.fit_transform([changepoint_sequence])[0].flatten() 340 | assert np.argmax(changepoint_scores) + window_width - 1 == changepoint_position 341 | 342 | @pytest.mark.parametrize("pad_width", [0, 1]) 343 | @pytest.mark.parametrize( 344 | "kernel", 345 | [ 346 | "average", 347 | ("differences", 0, 1, 1), 348 | ("position_velocity", 2, 1, 1), 349 | ("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])), 350 | np.random.random((5, 5)), 351 | numba.njit(lambda x: x.cumsum()), 352 | ], 353 | ) 354 | @pytest.mark.parametrize("sample", [None, np.arange(5), [4, 1, 3, 2, 0]]) 355 | def test_sliding_window_transformer_basic_w_lists(pad_width, kernel, sample): 356 | swt = SlidingWindowTransformer( 357 | window_width=5, pad_width=pad_width, kernels=[kernel], window_sample=sample 358 | ) 359 | result = swt.fit_transform([list(x) for x in test_time_series]) 360 | transform = swt.transform([list(x) for x in test_time_series]) 361 | for i, point_cloud in enumerate(result): 362 | for j, point in enumerate(point_cloud): 363 | assert np.allclose(point, transform[i][j]) 364 | 365 | 366 | def test_sliding_window_transformer_w_sampling(): 367 | swt = SlidingWindowTransformer(window_sample="random", window_sample_size=5) 368 | result = swt.fit_transform(test_time_series) 369 | transform = swt.transform(test_time_series) 370 | for i, point_cloud in enumerate(result): 371 | for j, point in enumerate(point_cloud): 372 | assert np.allclose(point, transform[i][j]) 373 | 374 | 375 | def test_sliding_window_transformer_bad_params(): 376 | swt = SlidingWindowTransformer(window_sample="foo") 377 | with pytest.raises(ValueError): 378 | result = swt.fit_transform(test_time_series) 379 | 380 | swt = SlidingWindowTransformer(window_sample=("foo", "bar")) 381 | with pytest.raises(ValueError): 382 | result = swt.fit_transform(test_time_series) 383 | 384 | swt = SlidingWindowTransformer(window_sample=1.105) 385 | with pytest.raises(ValueError): 386 | result = swt.fit_transform(test_time_series) 387 | 388 | swt = SlidingWindowTransformer(window_width=-1) 389 | with 
pytest.raises(ValueError): 390 | result = swt.fit_transform(test_time_series) 391 | 392 | swt = SlidingWindowTransformer(kernels=["not a kernel"]) 393 | with pytest.raises(ValueError): 394 | result = swt.fit_transform(test_time_series) 395 | 396 | swt = SlidingWindowTransformer(kernels=-1) 397 | with pytest.raises(ValueError): 398 | result = swt.fit_transform(test_time_series) 399 | 400 | swt = SlidingWindowTransformer(kernels=np.array([[1, 2, 3], [1, 2, 3]])) 401 | with pytest.raises(ValueError): 402 | result = swt.fit_transform(test_time_series) 403 | 404 | def test_seq_diff_transformer_basic(): 405 | sdt = SequentialDifferenceTransformer() 406 | diffs = sdt.fit_transform(test_time_series) 407 | transform_diffs = sdt.transform(test_time_series) 408 | for i, seq_diffs in enumerate(diffs): 409 | assert np.allclose(np.array(seq_diffs), np.array(transform_diffs[i])) 410 | assert np.allclose(test_time_series[i][:-1] + np.ravel(seq_diffs), test_time_series[i][1:]) -------------------------------------------------------------------------------- /vectorizers/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from vectorizers.transformers.categorical_columns import CategoricalColumnTransformer 2 | from vectorizers.transformers.info_weight import ( 3 | InformationWeightTransformer, 4 | information_weight, 5 | ) 6 | from vectorizers.transformers.row_desnoise import RowDenoisingTransformer 7 | from vectorizers.transformers.sliding_windows import ( 8 | SlidingWindowTransformer, 9 | SequentialDifferenceTransformer, 10 | sliding_window_generator, 11 | ) 12 | from vectorizers.transformers.count_feature_compression import ( 13 | CountFeatureCompressionTransformer, 14 | ) 15 | 16 | __all__ = [ 17 | "CategoricalColumnTransformer", 18 | "InformationWeightTransformer", 19 | "RowDenoisingTransformer", 20 | "SlidingWindowTransformer", 21 | "SequentialDifferenceTransformer", 22 | "CountFeatureCompressionTransformer", 23 | "information_weight", 24 | "sliding_window_generator", 25 | ] 26 | -------------------------------------------------------------------------------- /vectorizers/transformers/categorical_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | 4 | from warnings import warn 5 | 6 | 7 | class CategoricalColumnTransformer(BaseEstimator, TransformerMixin): 8 | """ 9 | This transformer is useful for describing an object as a bag of the categorical values that 10 | have been used to represent it within a pandas DataFrame. 11 | 12 | It takes a categorical column name to group by, object_column_name, and one 13 | or more categorical columns to be used to describe these objects, 14 | descriptor_column_name. Then it returns a Series with an index being the 15 | unique entries of your object_column_name and the values being a list of 16 | the appropriate categorical values from your descriptor_column_name. 17 | 18 | It can be thought of as a PivotTableTransformer if you'd like. 19 | 20 | Parameters 21 | ---------- 22 | object_column_name: string 23 | The column name from the DataFrame where our object values can be found. 24 | This will be the thing we are grouping by. 25 | 26 | descriptor_column_name: string or list 27 | The name or names of the categorical column(s) whose values will be used for describing our 28 | objects. If you are using multiple names it's recommended that you set include_column_name=True.
29 | 30 | include_column_name: bool (default = False) 31 | Should the column name be appended at the beginning of each value? 32 | This is useful if you intend to combine values from multiple categorical columns 33 | after the fact. 34 | 35 | unique_values: bool (default = False) 36 | Should we restrict to the unique values in a column before building our list representation? 37 | 38 | """ 39 | 40 | def __init__( 41 | self, 42 | object_column_name, 43 | descriptor_column_name, 44 | include_column_name=False, 45 | unique_values=False, 46 | ): 47 | self.object_column_name = object_column_name 48 | self.descriptor_column_name = descriptor_column_name 49 | # Get everything on consistent footing so we don't have to handle multiple cases. 50 | if type(self.descriptor_column_name) == str: 51 | self.descriptor_column_name_ = [self.descriptor_column_name] 52 | else: 53 | self.descriptor_column_name_ = self.descriptor_column_name 54 | self.include_column_name = include_column_name 55 | self.unique_values = unique_values 56 | 57 | if ( 58 | (self.include_column_name is False) 59 | and (type(self.descriptor_column_name) == list) 60 | and (len(self.descriptor_column_name) > 1) 61 | ): 62 | warn( 63 | "It is recommended that if you are aggregating " 64 | "multiple columns you set include_column_name=True" 65 | ) 66 | 67 | def fit_transform(self, X, y=None, **fit_params): 68 | """ 69 | This transformer is useful for describing an object as a bag of the categorical values that 70 | have been used to represent it within a pandas DataFrame. 71 | 72 | It takes a categorical column name to group by, object_column_name, and one or more 73 | categorical columns to be used to describe these objects, descriptor_column_name. 74 | Then it returns a Series with an index being the unique entries of your object_column_name 75 | and the values being a list of the appropriate categorical values from your descriptor_column_name. 76 | 77 | Parameters 78 | ---------- 79 | X: pd.DataFrame 80 | a pandas dataframe with columns whose names match those specified in the object_column_name and 81 | descriptor_column_name of the constructor. 82 | 83 | Returns 84 | ------- 85 | pandas Series 86 | Series with an index being the unique entries of your object_column_name 87 | and the values being a list of the appropriate categorical values from your descriptor_column_name. 88 | """ 89 | # Check that the dataframe has the appropriate columns 90 | required_columns = set([self.object_column_name] + self.descriptor_column_name_) 91 | if not required_columns.issubset(X.columns): 92 | raise ValueError( 93 | f"Sorry, the required column(s) {set(required_columns).difference(set(X.columns))} are not " 94 | f"present in your data frame. \n" 95 | f"Please either specify a new instance or apply to a different data frame. " 96 | ) 97 | 98 | # Compute a single groupby ahead of time to save on compute 99 | grouped_frame = X.groupby(self.object_column_name) 100 | aggregated_columns = [] 101 | for column in self.descriptor_column_name_: 102 | if self.include_column_name: 103 | if self.unique_values: 104 | aggregated_columns.append( 105 | grouped_frame[column].agg( 106 | lambda x: [ 107 | column + ":" + value 108 | for value in x.unique() 109 | if pd.notna(value) 110 | ] 111 | ) 112 | ) 113 | else: 114 | aggregated_columns.append( 115 | grouped_frame[column].agg( 116 | lambda x: [ 117 | column + ":" + value for value in x if pd.notna(value) 118 | ] 119 | ) 120 | ) 121 | else: 122 | if self.unique_values: 123 | aggregated_columns.append( 124 | grouped_frame[column].agg( 125 | lambda x: [value for value in x.unique() if pd.notna(value)] 126 | ) 127 | ) 128 | else: 129 | aggregated_columns.append( 130 | grouped_frame[column].agg( 131 | lambda x: [value for value in x if pd.notna(value)] 132 | ) 133 | ) 134 | reduced = pd.concat(aggregated_columns, axis="columns").sum(axis=1) 135 | return reduced 136 | 137 | def fit(self, X, y=None, **fit_params): 138 | self.fit_transform(X, y, **fit_params) 139 | return self 140 | -------------------------------------------------------------------------------- /vectorizers/transformers/count_feature_compression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | from sklearn.utils.validation import ( 4 | check_is_fitted, 5 | check_random_state, 6 | ) 7 | from sklearn.preprocessing import normalize 8 | import scipy.sparse 9 | from sklearn.utils.extmath import randomized_svd, svd_flip 10 | from scipy.sparse.linalg import svds 11 | 12 | from warnings import warn 13 | 14 | 15 | class CountFeatureCompressionTransformer(BaseEstimator, TransformerMixin): 16 | """Large sparse high dimensional matrices of count based, or strictly 17 | non-negative features are common. This transformer provides a simple 18 | but often very effective dimension reduction approach to provide a 19 | dense representation of the data that is amenable to cosine based distance 20 | measures. 21 | 22 | Parameters 23 | ---------- 24 | n_components: int (optional, default=128) 25 | The number of dimensions to use for the dense reduced representation. 26 | 27 | n_iter: int (optional, default=7) 28 | If using the ``"randomized"`` algorithm for SVD then use this number of 29 | iterations to estimate the SVD. 30 | 31 | algorithm: string (optional, default="randomized") 32 | The algorithm to use internally for the SVD step. Should be one of 33 | * "arpack" 34 | * "randomized" 35 | 36 | random_state: int, np.random.RandomState or None (optional, default=None) 37 | If using the ``"randomized"`` algorithm for SVD then use this as the 38 | random state (or random seed). 39 | """ 40 | 41 | def __init__( 42 | self, 43 | n_components=128, 44 | n_iter=7, 45 | algorithm="randomized", 46 | random_state=None, 47 | rescaling_power=0.5, 48 | ): 49 | self.n_components = n_components 50 | self.n_iter = n_iter 51 | self.algorithm = algorithm 52 | self.random_state = random_state 53 | self.rescaling_power = rescaling_power 54 | 55 | def fit_transform(self, X, y=None, **fit_params): 56 | """ 57 | Given a dataset of count based features (i.e. 
non-negative) 58 | perform feature compression / dimension reduction to provide 59 | a dataset with ``self.n_components`` dimensions suitable for 60 | measuring distances using cosine distance. 61 | 62 | Parameters 63 | ---------- 64 | X: ndarray or sparse matrix of shape (n_samples, n_features) 65 | The input data to be transformed. 66 | 67 | Returns 68 | ------- 69 | result: ndarray of shape (n_samples, n_components) 70 | The dimension reduced representation of the input. 71 | """ 72 | # Handle too large an n_components value somewhat gracefully 73 | if self.n_components >= X.shape[1]: 74 | warn( 75 | f"Warning: n_components is {self.n_components} but input has only {X.shape[1]} features! " 76 | f"No compression will be performed." 77 | ) 78 | self.components_ = np.eye(X.shape[1]) 79 | self.component_scaling_ = np.ones(X.shape[1]) 80 | return X 81 | 82 | if scipy.sparse.isspmatrix(X): 83 | if np.any(X.data < 0.0): 84 | raise ValueError("All entries in input must be non-negative!") 85 | else: 86 | if np.any(X < 0.0): 87 | raise ValueError("All entries in input must be non-negative!") 88 | 89 | normed_data = normalize(X) 90 | rescaled_data = scipy.sparse.csr_matrix(normed_data) 91 | rescaled_data.data = np.power(normed_data.data, self.rescaling_power) 92 | if self.algorithm == "arpack": 93 | u, s, v = svds(rescaled_data, k=self.n_components) 94 | elif self.algorithm == "randomized": 95 | random_state = check_random_state(self.random_state) 96 | u, s, v = randomized_svd( 97 | rescaled_data, 98 | n_components=self.n_components, 99 | n_iter=self.n_iter, 100 | random_state=random_state, 101 | ) 102 | else: 103 | raise ValueError("algorithm should be one of 'arpack' or 'randomized'") 104 | 105 | u, v = svd_flip(u, v) 106 | self.component_scaling_ = np.sqrt(s) 107 | self.components_ = v 108 | self.metric_ = "cosine" 109 | 110 | result = u * self.component_scaling_ 111 | 112 | return result 113 | 114 | def fit(self, X, y=None, **fit_params): 115 | """ 116 | Given a dataset of count based features (i.e. non-negative) 117 | learn a feature compression / dimension reduction to provide 118 | a dataset with ``self.n_components`` dimensions suitable for 119 | measuring distances using cosine distance. 120 | 121 | Parameters 122 | ---------- 123 | X: ndarray or sparse matrix of shape (n_samples, n_features) 124 | The input data to be transformed. 125 | """ 126 | self.fit_transform(X, y, **fit_params) 127 | return self 128 | 129 | def transform(self, X, y=None): 130 | """ 131 | Given a dataset of count based features (i.e. non-negative) 132 | perform the learned feature compression / dimension reduction. 133 | 134 | Parameters 135 | ---------- 136 | X: ndarray or sparse matrix of shape (n_samples, n_features) 137 | The input data to be transformed. 138 | 139 | Returns 140 | ------- 141 | result: ndarray of shape (n_samples, n_components) 142 | The dimension reduced representation of the input. 
143 | """ 144 | check_is_fitted( 145 | self, 146 | ["components_", "component_scaling_"], 147 | ) 148 | normed_data = normalize(X) 149 | rescaled_data = scipy.sparse.csr_matrix(normed_data) 150 | rescaled_data.data = np.power(normed_data.data, self.rescaling_power) 151 | 152 | result = (rescaled_data @ self.components_.T) / self.component_scaling_ 153 | 154 | return result 155 | -------------------------------------------------------------------------------- /vectorizers/transformers/info_weight.py: -------------------------------------------------------------------------------- 1 | import numba 2 | import numpy as np 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | import scipy.sparse 5 | 6 | MOCK_TARGET = np.ones(1, dtype=np.int64) 7 | 8 | 9 | @numba.njit(nogil=True) 10 | def column_kl_divergence_exact_prior( 11 | count_indices, 12 | count_data, 13 | baseline_probabilities, 14 | prior_strength=0.1, 15 | target=MOCK_TARGET, 16 | ): 17 | observed_norm = count_data.sum() + prior_strength 18 | observed_zero_constant = (prior_strength / observed_norm) * np.log( 19 | prior_strength / observed_norm 20 | ) 21 | result = 0.0 22 | count_indices_set = set(count_indices) 23 | for i in range(baseline_probabilities.shape[0]): 24 | if i in count_indices_set: 25 | idx = np.searchsorted(count_indices, i) 26 | observed_probability = ( 27 | count_data[idx] + prior_strength * baseline_probabilities[i] 28 | ) / observed_norm 29 | if observed_probability > 0.0: 30 | result += observed_probability * np.log( 31 | observed_probability / baseline_probabilities[i] 32 | ) 33 | else: 34 | result += baseline_probabilities[i] * observed_zero_constant 35 | 36 | return result 37 | 38 | 39 | @numba.njit(nogil=True) 40 | def column_kl_divergence_approx_prior( 41 | count_indices, 42 | count_data, 43 | baseline_probabilities, 44 | prior_strength=0.1, 45 | target=MOCK_TARGET, 46 | ): 47 | observed_norm = count_data.sum() + prior_strength 48 | observed_zero_constant = (prior_strength / observed_norm) * np.log( 49 | prior_strength / observed_norm 50 | ) 51 | result = 0.0 52 | zero_count_component_estimate = ( 53 | np.mean(baseline_probabilities) 54 | * observed_zero_constant 55 | * (baseline_probabilities.shape[0] - count_indices.shape[0]) 56 | ) 57 | result += zero_count_component_estimate 58 | for i in range(count_indices.shape[0]): 59 | idx = count_indices[i] 60 | observed_probability = ( 61 | count_data[i] + prior_strength * baseline_probabilities[idx] 62 | ) / observed_norm 63 | if observed_probability > 0.0 and baseline_probabilities[idx] > 0: 64 | result += observed_probability * np.log( 65 | observed_probability / baseline_probabilities[idx] 66 | ) 67 | 68 | return result 69 | 70 | 71 | @numba.njit(nogil=True) 72 | def supervised_column_kl( 73 | count_indices, 74 | count_data, 75 | baseline_probabilities, 76 | prior_strength=0.1, 77 | target=MOCK_TARGET, 78 | ): 79 | observed = np.zeros_like(baseline_probabilities) 80 | for i in range(count_indices.shape[0]): 81 | idx = count_indices[i] 82 | label = target[idx] 83 | observed[label] += count_data[i] 84 | 85 | observed += prior_strength * baseline_probabilities 86 | observed /= observed.sum() 87 | 88 | return np.sum(observed * np.log(observed / baseline_probabilities)) 89 | 90 | 91 | @numba.njit(nogil=True, parallel=True) 92 | def column_weights( 93 | indptr, 94 | indices, 95 | data, 96 | baseline_probabilities, 97 | column_kl_divergence_func, 98 | prior_strength=0.1, 99 | target=MOCK_TARGET, 100 | ): 101 | n_cols = indptr.shape[0] - 1 102 | weights = 
np.ones(n_cols) 103 | for i in numba.prange(n_cols): 104 | weights[i] = column_kl_divergence_func( 105 | indices[indptr[i] : indptr[i + 1]], 106 | data[indptr[i] : indptr[i + 1]], 107 | baseline_probabilities, 108 | prior_strength=prior_strength, 109 | target=target, 110 | ) 111 | return weights 112 | 113 | 114 | def information_weight(data, prior_strength=0.1, approximate_prior=False, target=None): 115 | """Compute information based weights for columns. The information weight 116 | is estimated as the amount of information gained by moving from a baseline 117 | model to a model derived from the observed counts. In practice this can be 118 | computed as the KL-divergence between distributions. For the baseline model 119 | we assume data will be distributed according to the row sums -- i.e. 120 | proportional to the frequency of the row. For the observed counts we use 121 | a background prior of pseudo counts equal to ``prior_strength`` times the 122 | baseline prior distribution. The Bayesian prior can either be computed 123 | exactly (the default) at some computational expense, or estimated for a much 124 | faster computation, often suitable for large or very sparse datasets. 125 | 126 | Parameters 127 | ---------- 128 | data: scipy sparse matrix (n_samples, n_features) 129 | A matrix of count data where rows represent observations and 130 | columns represent features. Column weightings will be learned 131 | from this data. 132 | 133 | prior_strength: float (optional, default=0.1) 134 | How strongly to weight the prior when doing a Bayesian update to 135 | derive a model based on observed counts of a column. 136 | 137 | approximate_prior: bool (optional, default=False) 138 | Whether to approximate weights based on the Bayesian prior or perform 139 | exact computations. Approximations are much faster especially for very 140 | large or very sparse datasets. 141 | 142 | target: ndarray or None (optional, default=None) 143 | If supervised target labels are available, these can be used to define distributions 144 | over the target classes rather than over rows, allowing weights to be 145 | supervised and target based. If None then unsupervised weighting is used. 146 | 147 | Returns 148 | ------- 149 | weights: ndarray of shape (n_features,) 150 | The learned weights to be applied to columns based on the amount 151 | of information provided by the column. 152 | """ 153 | if approximate_prior: 154 | column_kl_divergence_func = column_kl_divergence_approx_prior 155 | else: 156 | column_kl_divergence_func = column_kl_divergence_exact_prior 157 | 158 | baseline_counts = np.squeeze(np.array(data.sum(axis=1))) 159 | if target is None: 160 | baseline_probabilities = baseline_counts / baseline_counts.sum() 161 | else: 162 | baseline_probabilities = np.zeros(target.max() + 1) 163 | for i in range(baseline_probabilities.shape[0]): 164 | baseline_probabilities[i] = baseline_counts[target == i].sum() 165 | baseline_probabilities /= baseline_probabilities.sum() 166 | column_kl_divergence_func = supervised_column_kl 167 | 168 | csc_data = data.tocsc() 169 | csc_data.sort_indices() 170 | 171 | weights = column_weights( 172 | csc_data.indptr, 173 | csc_data.indices, 174 | csc_data.data, 175 | baseline_probabilities, 176 | column_kl_divergence_func, 177 | prior_strength=prior_strength, 178 | target=target, 179 | ) 180 | return weights 181 | 182 | 183 | class InformationWeightTransformer(BaseEstimator, TransformerMixin): 184 | """A data transformer that re-weights columns of count data. 
Column weights 185 | are computed as information based weights for columns. The information weight 186 | is estimated as the amount of information gained by moving from a baseline 187 | model to a model derived from the observed counts. In practice this can be 188 | computed as the KL-divergence between distributions. For the baseline model 189 | we assume data will be distributed according to the row sums -- i.e. 190 | proportional to the frequency of the row. For the observed counts we use 191 | a background prior of pseudo counts equal to ``prior_strength`` times the 192 | baseline prior distribution. The Bayesian prior can either be computed 193 | exactly, at some computational expense, or estimated for a much 194 | faster computation, often suitable for large or very sparse datasets. 195 | 196 | Parameters 197 | ---------- 198 | prior_strength: float (optional, default=1e-4) 199 | How strongly to weight the prior when doing a Bayesian update to 200 | derive a model based on observed counts of a column. 201 | 202 | approx_prior: bool (optional, default=True) 203 | Whether to approximate weights based on the Bayesian prior or perform 204 | exact computations. Approximations are much faster especially for very 205 | large or very sparse datasets. 206 | 207 | Attributes 208 | ---------- 209 | 210 | information_weights_: ndarray of shape (n_features,) 211 | The learned weights to be applied to columns based on the amount 212 | of information provided by the column. 213 | """ 214 | 215 | def __init__( 216 | self, 217 | prior_strength=1e-4, 218 | approx_prior=True, 219 | weight_power=2.0, 220 | supervision_weight=0.95, 221 | ): 222 | self.prior_strength = prior_strength 223 | self.approx_prior = approx_prior 224 | self.weight_power = weight_power 225 | self.supervision_weight = supervision_weight 226 | 227 | def fit(self, X, y=None, **fit_kwds): 228 | """Learn the appropriate column weighting as information weights 229 | from the observed count data ``X``. 230 | 231 | Parameters 232 | ---------- 233 | X: ndarray or scipy sparse matrix of shape (n_samples, n_features) 234 | The count data to be trained on. Note that, as count data, all 235 | entries should be positive or zero. 236 | 237 | Returns 238 | ------- 239 | self: 240 | The trained model. 
241 | """ 242 | if not scipy.sparse.isspmatrix(X): 243 | X = scipy.sparse.csc_matrix(X) 244 | 245 | self.information_weights_ = information_weight( 246 | X, self.prior_strength, self.approx_prior 247 | ) 248 | 249 | if y is not None: 250 | unsupervised_power = (1.0 - self.supervision_weight) * self.weight_power 251 | supervised_power = self.supervision_weight * self.weight_power 252 | 253 | self.information_weights_ /= np.mean(self.information_weights_) 254 | self.information_weights_ = np.maximum(self.information_weights_, 0.0) 255 | self.information_weights_ = np.power( 256 | self.information_weights_, unsupervised_power 257 | ) 258 | 259 | target_classes = np.unique(y) 260 | target_dict = dict( 261 | np.vstack((target_classes, np.arange(target_classes.shape[0]))).T 262 | ) 263 | target = np.array( 264 | [np.int64(target_dict[label]) for label in y], dtype=np.int64 265 | ) 266 | self.supervised_weights_ = information_weight( 267 | X, self.prior_strength, self.approx_prior, target=target 268 | ) 269 | self.supervised_weights_ /= np.mean(self.supervised_weights_) 270 | self.supervised_weights_ = np.maximum(self.supervised_weights_, 0.0) 271 | self.supervised_weights_ = np.power( 272 | self.supervised_weights_, supervised_power 273 | ) 274 | 275 | self.information_weights_ = ( 276 | self.information_weights_ * self.supervised_weights_ 277 | ) 278 | else: 279 | self.information_weights_ /= np.mean(self.information_weights_) 280 | self.information_weights_ = np.maximum(self.information_weights_, 0.0) 281 | self.information_weights_ = np.power( 282 | self.information_weights_, self.weight_power 283 | ) 284 | 285 | return self 286 | 287 | def transform(self, X): 288 | """Reweight data ``X`` based on learned information weights of columns. 289 | 290 | Parameters 291 | ---------- 292 | X: ndarray or scipy sparse matrix of shape (n_samples, n_features) 293 | The count data to be transformed. Note that, as count data, all 294 | entries should be positive or zero. 295 | 296 | Returns 297 | ------- 298 | result: ndarray or scipy sparse matrix of shape (n_samples, n_features) 299 | The reweighted data. 
300 | """ 301 | result = X @ scipy.sparse.diags(self.information_weights_) 302 | return result 303 | -------------------------------------------------------------------------------- /vectorizers/transformers/row_desnoise.py: -------------------------------------------------------------------------------- 1 | import numba 2 | import numpy as np 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.utils.validation import check_is_fitted 5 | from sklearn.preprocessing import normalize 6 | import scipy.sparse 7 | 8 | from warnings import warn 9 | 10 | 11 | @numba.njit() 12 | def numba_multinomial_em_sparse( 13 | indptr, 14 | inds, 15 | data, 16 | background, 17 | precision=1e-7, 18 | low_thresh=1e-5, 19 | bg_prior=5.0, 20 | prior_strength=0.3, 21 | ): 22 | result = np.zeros(data.shape[0], dtype=np.float32) 23 | mix_weights = np.zeros(indptr.shape[0] - 1, dtype=np.float32) 24 | 25 | prior = np.array([1.0, bg_prior]) * prior_strength 26 | mp = 1.0 + 1.0 * np.sum(prior) 27 | 28 | for i in range(indptr.shape[0] - 1): 29 | indices = inds[indptr[i] : indptr[i + 1]] 30 | row_data = data[indptr[i] : indptr[i + 1]] 31 | 32 | row_background = np.zeros_like(row_data) 33 | for idx in range(indices.shape[0]): 34 | j = indices[idx] 35 | row_background[idx] = background[j] 36 | 37 | row_background = row_background / row_background.sum() 38 | 39 | mix_param = 0.5 40 | current_dist = mix_param * row_data + (1.0 - mix_param) * row_background 41 | 42 | last_mix_param = mix_param 43 | change_magnitude = 1.0 44 | 45 | while ( 46 | change_magnitude > precision 47 | and mix_param > precision 48 | and mix_param < 1.0 - precision 49 | ): 50 | 51 | posterior_dist = current_dist * mix_param 52 | posterior_dist /= current_dist * mix_param + row_background * ( 53 | 1.0 - mix_param 54 | ) 55 | 56 | current_dist = posterior_dist * row_data 57 | mix_param = (current_dist.sum() + prior[0]) / mp 58 | current_dist = current_dist / current_dist.sum() 59 | 60 | change_magnitude = np.abs(mix_param - last_mix_param) 61 | last_mix_param = mix_param 62 | 63 | # zero out any small values 64 | norm = 0.0 65 | for n in range(current_dist.shape[0]): 66 | if current_dist[n] < low_thresh: 67 | current_dist[n] = 0.0 68 | else: 69 | norm += current_dist[n] 70 | current_dist /= norm 71 | 72 | result[indptr[i] : indptr[i + 1]] = current_dist 73 | mix_weights[i] = mix_param 74 | 75 | return result, mix_weights 76 | 77 | 78 | def multinomial_em_sparse( 79 | matrix, 80 | background, 81 | precision=1e-7, 82 | low_thresh=1e-5, 83 | bg_prior=5.0, 84 | prior_strength=0.3, 85 | ): 86 | if scipy.sparse.isspmatrix_csr(matrix): 87 | result = matrix.copy().astype(np.float32) 88 | else: 89 | result = matrix.tocsr().astype(np.float32) 90 | new_data, mix_weights = numba_multinomial_em_sparse( 91 | result.indptr, 92 | result.indices, 93 | result.data, 94 | background, 95 | precision, 96 | low_thresh, 97 | bg_prior, 98 | prior_strength, 99 | ) 100 | result.data = new_data 101 | 102 | return result, mix_weights 103 | 104 | 105 | class RowDenoisingTransformer(BaseEstimator, TransformerMixin): 106 | """ 107 | 108 | Parameters 109 | ---------- 110 | normalize = False 111 | Return the modified count matrix (default) or the L_1 normalization of each row. 
112 | 113 | optional EM params: 114 | * em_precision = 1e-7, (halt EM when the mix_param changes less than this) 115 | * em_threshold = 1e-8, (set to zero any values below this) 116 | * em_background_prior = 1.0, (a non-negative number) 117 | * em_prior_strength = 0.5 (a non-negative number) 118 | """ 119 | 120 | def __init__( 121 | self, 122 | em_precision=1.0e-7, 123 | em_background_prior=1.0, 124 | em_threshold=1.0e-8, 125 | em_prior_strength=0.5, 126 | normalize=False, 127 | ): 128 | self.em_threshold = em_threshold 129 | self.em_background_prior = em_background_prior 130 | self.em_precision = em_precision 131 | self.em_prior_strength = em_prior_strength 132 | self.normalize = normalize 133 | 134 | def fit(self, X, y=None, **fit_params): 135 | """ 136 | 137 | Parameters 138 | ---------- 139 | X: sparse matrix of shape (n_docs, n_words) 140 | The data matrix used to find the low-rank effects 141 | 142 | y: Ignored 143 | 144 | fit_params: 145 | optional model params 146 | 147 | Returns 148 | ------- 149 | self 150 | 151 | """ 152 | if scipy.sparse.issparse(X): 153 | X.eliminate_zeros() 154 | if X.nnz == 0: 155 | warn("Cannot fit an empty matrix") 156 | return self 157 | self.background_model_ = np.squeeze( 158 | np.array(X.sum(axis=0), dtype=np.float32) 159 | ) 160 | else: 161 | self.background_model_ = X.sum(axis=0) 162 | 163 | self.background_model_ /= self.background_model_.sum() 164 | 165 | return self 166 | 167 | def transform(self, X, y=None): 168 | """ 169 | 170 | X: sparse matrix of shape (n_docs, n_words) 171 | The data matrix from which the low-rank effects are to be removed 172 | 173 | y: Ignored 174 | 175 | fit_params: 176 | optional model params 177 | 178 | Returns 179 | ------- 180 | X: scipy.sparse csr_matrix 181 | The matrix X with the low-rank effects removed. 182 | 183 | """ 184 | 185 | check_is_fitted(self, ["background_model_"]) 186 | 187 | row_sums = np.array(X.sum(axis=1)).T[0] 188 | 189 | result, weights = multinomial_em_sparse( 190 | normalize(X, norm="l1"), 191 | self.background_model_, 192 | low_thresh=self.em_threshold, 193 | bg_prior=self.em_background_prior, 194 | precision=self.em_precision, 195 | prior_strength=self.em_prior_strength, 196 | ) 197 | self.mix_weights_ = weights 198 | if not self.normalize: 199 | result = scipy.sparse.diags(row_sums * weights) * result 200 | 201 | result.eliminate_zeros() 202 | 203 | return result 204 | 205 | def fit_transform(self, X, y=None, **fit_params): 206 | """ 207 | 208 | Parameters 209 | ---------- 210 | X: sparse matrix of shape (n_docs, n_words) 211 | The data matrix that is used to deduce the low-rank effects and then has them removed 212 | 213 | y: Ignored 214 | 215 | fit_params: 216 | optional model params 217 | 218 | Returns 219 | ------- 220 | X: scipy.sparse csr_matrix 221 | The matrix X with the low-rank effects removed. 222 | 223 | """ 224 | self.fit(X, **fit_params) 225 | if X.nnz == 0: 226 | return X 227 | return self.transform(X) 228 | --------------------------------------------------------------------------------
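# ---------------------------------------------------------------------------
# Editorial appendix (not part of the repository source): a minimal sketch of
# how the transformers defined above might be chained on a small count matrix.
# The data and parameter values here are purely illustrative.
# ---------------------------------------------------------------------------
import numpy as np
import scipy.sparse

from vectorizers.transformers import (
    CountFeatureCompressionTransformer,
    InformationWeightTransformer,
    RowDenoisingTransformer,
)

# A random sparse "document-term" style count matrix.
counts = scipy.sparse.csr_matrix(
    np.random.poisson(0.5, size=(100, 50)).astype(np.float64)
)

# Strip out the shared background row distribution with the multinomial EM step.
denoised = RowDenoisingTransformer(normalize=True).fit_transform(counts)

# Re-weight columns by how informative they are relative to the baseline model.
weighted = InformationWeightTransformer().fit_transform(denoised)

# Compress to a small dense representation suitable for cosine distances.
compressed = CountFeatureCompressionTransformer(n_components=10).fit_transform(weighted)
print(compressed.shape)  # (100, 10)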