├── .circleci
│   └── config.yml
├── .coveragerc
├── .gitignore
├── .readthedocs.yml
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.rst
├── appveyor.yml
├── azure-pipelines.yml
├── doc
│   ├── CategoricalColumnTransformer_intro.ipynb
│   ├── Makefile
│   ├── _static
│   │   ├── css
│   │   │   └── project-template.css
│   │   └── js
│   │       └── copybutton.js
│   ├── _templates
│   │   ├── class.rst
│   │   ├── function.rst
│   │   └── numpydoc_docstring.py
│   ├── api.rst
│   ├── categorical_column_transformer_example.ipynb
│   ├── conf.py
│   ├── document_vectorization.ipynb
│   ├── index.rst
│   ├── information_weight_transform.ipynb
│   ├── make.bat
│   ├── quick_start.rst
│   ├── requirements.txt
│   ├── sequence_taxonomy.ipynb
│   ├── token_cooccurrence_vectorizer_multi_labelled_cyber_example.ipynb
│   ├── user_guide.rst
│   ├── vectorizers_logo_no_text.png
│   └── vectorizers_logo_text.png
├── environment.yml
├── examples
│   ├── README.txt
│   └── SignatureVectorizer_Examples_1.ipynb
├── requirements.txt
├── setup.cfg
├── setup.py
└── vectorizers
    ├── __init__.py
    ├── _vectorizers.py
    ├── _version.py
    ├── _window_kernels.py
    ├── base_cooccurrence_vectorizer.py
    ├── coo_utils.py
    ├── distances.py
    ├── distribution_vectorizer.py
    ├── edge_list_vectorizer.py
    ├── kde_vectorizer.py
    ├── linear_optimal_transport.py
    ├── mixed_gram_vectorizer.py
    ├── multi_token_cooccurence_vectorizer.py
    ├── ngram_token_cooccurence_vectorizer.py
    ├── ngram_vectorizer.py
    ├── preprocessing.py
    ├── signature_vectorizer.py
    ├── skip_gram_vectorizer.py
    ├── tests
    │   ├── __init__.py
    │   ├── test_bpe.py
    │   ├── test_common.py
    │   ├── test_distances.py
    │   ├── test_edge_list_vectorizer.py
    │   ├── test_signature_vectorizer.py
    │   ├── test_template.py
    │   └── test_transformers.py
    ├── timed_token_cooccurrence_vectorizer.py
    ├── token_cooccurrence_vectorizer.py
    ├── transformers
    │   ├── __init__.py
    │   ├── categorical_columns.py
    │   ├── count_feature_compression.py
    │   ├── info_weight.py
    │   ├── row_desnoise.py
    │   └── sliding_windows.py
    ├── tree_token_cooccurrence.py
    └── utils.py
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | jobs:
4 | python3:
5 | docker:
6 | - image: circleci/python:3.6.1
7 | steps:
8 | - checkout
9 | - run:
10 | command: |
11 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
12 | chmod +x miniconda.sh && ./miniconda.sh -b -p ~/miniconda
13 | export PATH="~/miniconda/bin:$PATH"
14 | conda update --yes --quiet conda
15 | conda create -n testenv --yes --quiet python=3
16 | source activate testenv
17 | conda install --yes pip numpy scipy scikit-learn pandas numba matplotlib sphinx sphinx_rtd_theme numpydoc pillow dask pandoc
18 | pip install pynndescent
19 | pip install sphinx-gallery
20 | pip install nbsphinx
21 | pip install .
22 | cd doc
23 | make html
24 | - store_artifacts:
25 | path: doc/_build/html
26 | destination: doc
27 | - store_artifacts:
28 | path: ~/log.txt
29 | - persist_to_workspace:
30 | root: doc/_build/html
31 | paths: .
32 | - attach_workspace:
33 | at: doc/_build/html
34 | - run: ls -ltrh doc/_build/html
35 | filters:
36 | branches:
37 | ignore: gh-pages
38 |
39 | workflows:
40 | version: 2
41 | build-doc-and-deploy:
42 | jobs:
43 | - python3
44 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | # Configuration for coverage.py
2 |
3 | [run]
4 | branch = True
5 | source = vectorizers
6 | include = */vectorizers/*
7 | omit =
8 | */setup.py
9 |
10 | [report]
11 | exclude_lines =
12 | pragma: no cover
13 | def __repr__
14 | if self.debug:
15 | if settings.DEBUG
16 | raise AssertionError
17 | raise NotImplementedError
18 | if 0:
19 | if __name__ == .__main__.:
20 | if self.verbose:
21 | show_missing = True
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # scikit-learn specific
10 | doc/_build/
11 | doc/auto_examples/
12 | doc/modules/generated/
13 | doc/datasets/generated/
14 |
15 | # Distribution / packaging
16 |
17 | .Python
18 | env/
19 | build/
20 | develop-eggs/
21 | dist/
22 | downloads/
23 | eggs/
24 | .eggs/
25 | lib/
26 | lib64/
27 | parts/
28 | sdist/
29 | var/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 |
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 |
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 |
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *,cover
53 | .hypothesis/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 |
62 | # Sphinx documentation
63 | doc/_build/
64 | doc/generated/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # Jupyter artifacts
70 | .ipynb_checkpoints
71 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the OS, Python version and other tools you might need
9 | build:
10 | os: ubuntu-22.04
11 | tools:
12 | python: "3.11"
13 |
14 | # Build documentation in the "docs/" directory with Sphinx
15 | sphinx:
16 | configuration: doc/conf.py
17 |
18 | # Optional but recommended, declare the Python requirements required
19 | # to build your documentation
20 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
21 | python:
22 | install:
23 | - requirements: doc/requirements.txt
24 | - method: pip
25 | path: .
26 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: trusty
2 | sudo: false
3 |
4 | language: python
5 |
6 | cache:
7 | directories:
8 | - $HOME/.cache/pip
9 |
10 | matrix:
11 | include:
12 | - env: PYTHON_VERSION="3.7" NUMPY_VERSION="1.16.6" SCIPY_VERSION="1.4.1"
13 | SKLEARN_VERSION="0.20.3"
14 | - env: PYTHON_VERSION="3.8" NUMPY_VERSION="*" SCIPY_VERSION="*"
15 | SKLEARN_VERSION="*"
16 | - env: PYTHON_VERSION="3.8" NUMPY_VERSION="*" SCIPY_VERSION="*"
17 | SKLEARN_VERSION="*" COVERAGE="true"
18 |
19 | install:
20 | # install miniconda
21 | - deactivate
22 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
23 | - MINICONDA_PATH=/home/travis/miniconda
24 | - chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH
25 | - export PATH=$MINICONDA_PATH/bin:$PATH
26 | - conda update --yes conda
27 | # create the testing environment
28 | - conda create -n testenv --yes python=$PYTHON_VERSION pip
29 | - source activate testenv
30 | - |
31 | if [ $SKLEARN_VERSION = "nightly" ]; then
32 | conda install --yes numpy==$NUMPY_VERSION scipy==$SCIPY_VERSION cython nose pytest pytest-cov dask
33 | # install nightly wheels
34 | pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn
35 | else
36 | conda install --yes numpy==$NUMPY_VERSION scipy==$SCIPY_VERSION scikit-learn==$SKLEARN_VERSION cython nose pytest pytest-cov dask
37 | fi
38 | - conda install --yes pandas numba
39 | - pip install pynndescent
40 | - pip install codecov
41 | - pip install coverage
42 | - pip install coveralls
43 | - pip install .
44 |
45 | script:
46 | - |
47 | if [ "$COVERAGE" = "true" ]; then
48 | # disable numba for coverage run
49 | export NUMBA_DISABLE_JIT=1
50 | coverage run -m pytest -v --pyargs vectorizers -k tests
51 | else
52 | pytest -v --pyargs vectorizers
53 | fi
54 | after_success:
55 | - |
56 | if [ "$COVERAGE" = "true" ]; then
57 | codecov
58 | coveralls
59 | fi
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2020, John Healy, Leland McInnes, Colin Weir and Vectorizers contributors
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | * Neither the name of project-template nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | .. image:: doc/vectorizers_logo_text.png
4 | :width: 600
5 | :alt: Vectorizers Logo
6 |
7 | |Travis|_ |AppVeyor|_ |Codecov|_ |CircleCI|_ |ReadTheDocs|_
8 |
9 | .. |Travis| image:: https://travis-ci.com/TutteInstitute/vectorizers.svg?branch=master
10 | .. _Travis: https://travis-ci.com/TutteInstitute/vectorizers
11 |
12 | .. |AppVeyor| image:: https://ci.appveyor.com/api/projects/status/sjawsgwo7g4k3jon?svg=true
13 | .. _AppVeyor: https://ci.appveyor.com/project/lmcinnes/vectorizers
14 |
15 | .. |Codecov| image:: https://codecov.io/gh/TutteInstitute/vectorizers/branch/master/graph/badge.svg
16 | .. _Codecov: https://codecov.io/gh/TutteInstitute/vectorizers
17 |
18 |
19 | .. |CircleCI| image:: https://circleci.com/gh/TutteInstitute/vectorizers.svg?style=shield&circle-token=:circle-token
20 | .. _CircleCI: https://circleci.com/gh/TutteInstitute/vectorizers
21 |
22 | .. |ReadTheDocs| image:: https://readthedocs.org/projects/vectorizers/badge/?version=latest
23 | .. _ReadTheDocs: https://vectorizers.readthedocs.io/en/latest/?badge=latest
24 |
25 | ===========
26 | Vectorizers
27 | ===========
28 |
29 | There are a large number of machine learning tools for effectively exploring and working
30 | with data that is given as vectors (ideally with a defined notion of distance as well).
31 | There is also a large volume of data that does not come neatly packaged as vectors. It
32 | could be text data, variable length sequence data (either numeric or categorical),
33 | dataframes of mixed data types, sets of point clouds, or more. Usually, one way or another,
34 | such data can be wrangled into vectors in a way that preserves some relevant properties
35 | of the original data. This library seeks to provide a wide variety of
36 | general purpose techniques for such wrangling, making it easier and faster for users
37 | to get various kinds of unstructured sequence data into vector formats for exploration and
38 | machine learning.
39 |
40 | --------------------
41 | Why use Vectorizers?
42 | --------------------
43 |
44 | Data wrangling can be tedious, error-prone, and fragile to integrate into
45 | production pipelines. The vectorizers library aims to provide a set of easy-to-use
46 | tools for turning various kinds of unstructured sequence data into vectors. By following the
47 | scikit-learn transformer API we ensure that any of the vectorizer classes can be
48 | trivially integrated into existing sklearn workflows or pipelines. By keeping the
49 | vectorization approaches as general as possible (as opposed to specialising in very
50 | specific data types), we aim to ensure that a very broad range of data can be handled
51 | efficiently. Finally, we favour robust techniques with sound mathematical foundations
52 | over potentially more powerful but black-box approaches, for greater transparency
53 | in data processing and transformation.
54 |
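As a concrete illustration of the scikit-learn compatibility claim above, here is a minimal
sketch of dropping a vectorizer into a standard scikit-learn ``Pipeline``. The data and
parameter choices are purely illustrative, and the sketch assumes the ``NgramVectorizer``
class listed in the API reference; check the class docstrings for exact signatures.

.. code:: python

    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import Pipeline

    from vectorizers import NgramVectorizer  # other transformers in this library follow the same API

    # Toy corpus: each document is a variable length sequence of (hashable) tokens.
    token_sequences = [
        ["cat", "sat", "on", "the", "mat"],
        ["dog", "sat", "on", "the", "log"],
        ["bird", "sat", "on", "the", "branch"],
    ]

    # Hypothetical pipeline: ngram counts followed by a low-rank projection.
    pipeline = Pipeline([
        ("ngrams", NgramVectorizer()),
        ("svd", TruncatedSVD(n_components=2)),
    ])

    low_dim_vectors = pipeline.fit_transform(token_sequences)
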
55 | ----------------------
56 | How to use Vectorizers
57 | ----------------------
58 |
59 | Quick start examples to be added soon ...
60 |
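Until those are written, the sketch below shows the basic pattern shared by all of the
vectorizer classes: construct the transformer, then ``fit_transform`` your sequences. The
``ngram_size`` parameter is assumed from the ``NgramVectorizer`` docstring and is shown
only for illustration; consult the API reference for the exact options.

.. code:: python

    from vectorizers import NgramVectorizer

    # Toy data: variable length sequences of categorical tokens.
    sequences = [
        ["a", "b", "a", "c"],
        ["b", "c", "c"],
    ]

    vectorizer = NgramVectorizer(ngram_size=2)

    # One (sparse) count vector per input sequence.
    bigram_counts = vectorizer.fit_transform(sequences)

    # New sequences are transformed using the learned ngram vocabulary.
    new_counts = vectorizer.transform([["a", "b", "a"]])
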
61 | For further examples on using this library for text we recommend checking out the documentation
62 | written up in the EasyData reproducible data science framework by some of our colleagues over at:
63 | https://github.com/hackalog/vectorizers_playground
64 |
65 | ----------
66 | Installing
67 | ----------
68 |
69 | Vectorizers is designed to be easy to install, being a pure Python module with
70 | relatively light requirements:
71 |
72 | * numpy
73 | * scipy
74 | * scikit-learn >= 0.22
75 | * numba >= 0.51
76 |
77 | To install the package from PyPI:
78 |
79 | .. code:: bash
80 |
81 | pip install vectorizers
82 |
83 | To install the package from source:
84 |
85 | .. code:: bash
86 |
87 | pip install https://github.com/TutteInstitute/vectorizers/archive/master.zip
88 |
89 | ----------------
90 | Help and Support
91 | ----------------
92 |
93 | This project is still young. The `documentation <https://vectorizers.readthedocs.io/>`_ is still growing. In the meantime, please
94 | `open an issue <https://github.com/TutteInstitute/vectorizers/issues>`_
95 | and we will try to provide any help and guidance that we can. Please also check
96 | the docstrings on the code, which provide some descriptions of the parameters.
97 |
98 | ------------
99 | Contributing
100 | ------------
101 |
102 | Contributions are more than welcome! There are lots of opportunities
103 | for potential projects, so please get in touch if you would like to
104 | help out. Everything from code to notebooks to
105 | examples and documentation is *equally valuable*, so please don't feel
106 | you can't contribute. We would greatly appreciate the contribution of
107 | tutorial notebooks applying vectorizer tools to diverse or interesting
108 | datasets. If you find vectorizers useful for your data please consider
109 | contributing an example showing how it can apply to the kind of data
110 | you work with!
111 |
112 |
113 | To contribute, please `fork the project <https://github.com/TutteInstitute/vectorizers>`_, make your changes, and
114 | submit a pull request. We will do our best to work through any issues with
115 | you and get your code merged into the main branch.
116 |
117 | -------
118 | License
119 | -------
120 |
121 | The vectorizers package is 3-clause BSD licensed.
122 |
123 |
124 |
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | build: false
2 |
3 | environment:
4 | matrix:
5 | - PYTHON: "C:\\Miniconda3-x64"
6 | PYTHON_VERSION: "3.7.x"
7 | PYTHON_ARCH: "32"
8 | NUMPY_VERSION: "1.16.6"
9 | SCIPY_VERSION: "1.4.1"
10 | SKLEARN_VERSION: "0.22.1"
11 | COVERAGE: 0
12 |
13 | - PYTHON: "C:\\Miniconda3-x64"
14 | PYTHON_VERSION: "3.8.x"
15 | PYTHON_ARCH: "64"
16 | NUMPY_VERSION: "*"
17 | SCIPY_VERSION: "*"
18 | SKLEARN_VERSION: "*"
19 | COVERAGE: 0
20 |
21 | - PYTHON: "C:\\Miniconda3-x64"
22 | PYTHON_VERSION: "3.8.x"
23 | PYTHON_ARCH: "64"
24 | NUMPY_VERSION: "*"
25 | SCIPY_VERSION: "*"
26 | SKLEARN_VERSION: "*"
27 | COVERAGE: 1
28 |
29 | install:
30 | # Prepend miniconda installed Python to the PATH of this build
31 | # Add Library/bin directory to fix issue
32 | # https://github.com/conda/conda/issues/1753
33 | - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PYTHON%\\Library\\bin;%PATH%"
34 | # install the dependencies
35 | - "conda install --yes pip numpy==%NUMPY_VERSION% scipy==%SCIPY_VERSION% scikit-learn==%SKLEARN_VERSION% nose pytest pytest-cov"
36 | - conda install --yes numba pandas dask pomegranate
37 | - pip install pynndescent
38 | - pip install iisignature
39 | - pip install codecov
40 | - pip install .
41 |
42 | test_script:
43 | - mkdir for_test
44 | - cd for_test
45 | - IF %COVERAGE%==1 set NUMBA_DISABLE_JIT=1
46 | - pytest -v --cov=vectorizers --pyargs vectorizers
47 |
48 | after_test:
49 | - cp .coverage %APPVEYOR_BUILD_FOLDER%
50 | - cd %APPVEYOR_BUILD_FOLDER%
51 | - IF %COVERAGE%==1 codecov
52 |
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | # Trigger a build when there is a push to the main branch or a tag starts with release-
2 | trigger:
3 | branches:
4 | include:
5 | - main
6 | - master
7 | tags:
8 | include:
9 | - release-*
10 |
11 | # Trigger a build when there is a pull request to the main branch
12 | # Ignore PRs that are just updating the docs
13 | pr:
14 | branches:
15 | include:
16 | - main
17 | - master
18 | exclude:
19 | - doc/*
20 | - README.rst
21 |
22 | parameters:
23 | - name: includeReleaseCandidates
24 | displayName: "Allow pre-release dependencies"
25 | type: boolean
26 | default: false
27 |
28 | variables:
29 | triggeredByPullRequest: $[eq(variables['Build.Reason'], 'PullRequest')]
30 |
31 | stages:
32 | - stage: RunAllTests
33 | displayName: Run test suite
34 | jobs:
35 | - job: run_platform_tests
36 | strategy:
37 | matrix:
38 | mac_py39:
39 | imageName: 'macOS-latest'
40 | python.version: '3.9'
41 | linux_py39:
42 | imageName: 'ubuntu-latest'
43 | python.version: '3.9'
44 | windows_py39:
45 | imageName: 'windows-latest'
46 | python.version: '3.9'
47 | mac_py310:
48 | imageName: 'macOS-latest'
49 | python.version: '3.10'
50 | linux_py310:
51 | imageName: 'ubuntu-latest'
52 | python.version: '3.10'
53 | windows_py310:
54 | imageName: 'windows-latest'
55 | python.version: '3.10'
56 | mac_py311:
57 | imageName: 'macOS-latest'
58 | python.version: '3.11'
59 | linux_py311:
60 | imageName: 'ubuntu-latest'
61 | python.version: '3.11'
62 | windows_py311:
63 | imageName: 'windows-latest'
64 | python.version: '3.11'
65 | mac_py312:
66 | imageName: 'macOS-latest'
67 | python.version: '3.12'
68 | linux_py312:
69 | imageName: 'ubuntu-latest'
70 | python.version: '3.12'
71 | windows_py312:
72 | imageName: 'windows-latest'
73 | python.version: '3.12'
74 | pool:
75 | vmImage: $(imageName)
76 |
77 | steps:
78 | - task: UsePythonVersion@0
79 | inputs:
80 | versionSpec: '$(python.version)'
81 | displayName: 'Use Python $(python.version)'
82 |
83 | - script: |
84 | python -m pip install --upgrade pip
85 | displayName: 'Upgrade pip'
86 |
87 | - script: |
88 | pip install -r requirements.txt
89 | displayName: 'Install dependencies'
90 | condition: ${{ eq(parameters.includeReleaseCandidates, false) }}
91 |
92 | - script: |
93 | pip install --pre -r requirements.txt
94 | displayName: 'Install dependencies (allow pre-releases)'
95 | condition: ${{ eq(parameters.includeReleaseCandidates, true) }}
96 |
97 | - script: |
98 | pip install -e .
99 | pip install pytest pytest-azurepipelines
100 | pip install pytest-cov
101 | pip install coveralls
102 | displayName: 'Install package'
103 |
104 | - script: |
105 | pytest vectorizers/tests --show-capture=no -v --disable-warnings --junitxml=junit/test-results.xml --cov=vectorizers/ --cov-report=xml --cov-report=html
106 | displayName: 'Run tests'
107 |
108 | - bash: |
109 | coveralls
110 | displayName: 'Publish to coveralls'
111 | condition: and(succeeded(), eq(variables.triggeredByPullRequest, false)) # Don't run this for PRs because they can't access pipeline secrets
112 | env:
113 | COVERALLS_REPO_TOKEN: $(COVERALLS_TOKEN)
114 |
115 | - task: PublishTestResults@2
116 | inputs:
117 | testResultsFiles: '$(System.DefaultWorkingDirectory)/**/coverage.xml'
118 | testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
119 | condition: succeededOrFailed()
120 |
121 | - stage: BuildPublishArtifact
122 | dependsOn: RunAllTests
123 | condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/tags/release-'), eq(variables.triggeredByPullRequest, false))
124 | jobs:
125 | - job: BuildArtifacts
126 | displayName: Build source dists and wheels
127 | pool:
128 | vmImage: 'ubuntu-latest'
129 | steps:
130 | - task: UsePythonVersion@0
131 | inputs:
132 | versionSpec: '3.10'
133 | displayName: 'Use Python 3.10'
134 |
135 | - script: |
136 | python -m pip install --upgrade pip
137 | pip install wheel
138 | pip install -r requirements.txt
139 | displayName: 'Install dependencies'
140 |
141 | - script: |
142 | pip install -e .
143 | displayName: 'Install package locally'
144 |
145 | - script: |
146 | python setup.py sdist bdist_wheel
147 | displayName: 'Build package'
148 |
149 | - bash: |
150 | export PACKAGE_VERSION="$(python setup.py --version)"
151 | echo "Package Version: ${PACKAGE_VERSION}"
152 | echo "##vso[task.setvariable variable=packageVersionFormatted;]release-${PACKAGE_VERSION}"
153 | displayName: 'Get package version'
154 |
155 | - script: |
156 | echo "Version in git tag $(Build.SourceBranchName) does not match version derived from setup.py $(packageVersionFormatted)"
157 | exit 1
158 | displayName: Raise error if version doesn't match tag
159 | condition: and(succeeded(), ne(variables['Build.SourceBranchName'], variables['packageVersionFormatted']))
160 |
161 | - task: DownloadSecureFile@1
162 | name: PYPIRC_CONFIG
163 | displayName: 'Download pypirc'
164 | inputs:
165 | secureFile: 'pypirc'
166 |
167 | - script: |
168 | pip install twine
169 | twine upload --repository pypi --config-file $(PYPIRC_CONFIG.secureFilePath) dist/*
170 | displayName: 'Upload to PyPI'
171 | condition: and(succeeded(), eq(variables['Build.SourceBranchName'], variables['packageVersionFormatted']))
172 |
173 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 |
22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
23 |
24 | help:
25 | @echo "Please use \`make ' where is one of"
26 | @echo " html to make standalone HTML files"
27 | @echo " dirhtml to make HTML files named index.html in directories"
28 | @echo " singlehtml to make a single large HTML file"
29 | @echo " pickle to make pickle files"
30 | @echo " json to make JSON files"
31 | @echo " htmlhelp to make HTML files and a HTML help project"
32 | @echo " qthelp to make HTML files and a qthelp project"
33 | @echo " devhelp to make HTML files and a Devhelp project"
34 | @echo " epub to make an epub"
35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
36 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
38 | @echo " text to make text files"
39 | @echo " man to make manual pages"
40 | @echo " texinfo to make Texinfo files"
41 | @echo " info to make Texinfo files and run them through makeinfo"
42 | @echo " gettext to make PO message catalogs"
43 | @echo " changes to make an overview of all changed/added/deprecated items"
44 | @echo " xml to make Docutils-native XML files"
45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
46 | @echo " linkcheck to check all external links for integrity"
47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
48 |
49 | clean:
50 | -rm -rf $(BUILDDIR)/*
51 | -rm -rf auto_examples/
52 | -rm -rf generated/*
53 | -rm -rf modules/generated/*
54 |
55 | html:
56 | # These two lines make the build a bit more lengthy, and
57 | # the embedding of images more robust
58 | rm -rf $(BUILDDIR)/html/_images
59 | #rm -rf _build/doctrees/
60 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
61 | @echo
62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
63 |
64 | dirhtml:
65 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
66 | @echo
67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
68 |
69 | singlehtml:
70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
71 | @echo
72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
73 |
74 | pickle:
75 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
76 | @echo
77 | @echo "Build finished; now you can process the pickle files."
78 |
79 | json:
80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
81 | @echo
82 | @echo "Build finished; now you can process the JSON files."
83 |
84 | htmlhelp:
85 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
86 | @echo
87 | @echo "Build finished; now you can run HTML Help Workshop with the" \
88 | ".hhp project file in $(BUILDDIR)/htmlhelp."
89 |
90 | qthelp:
91 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
92 | @echo
93 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
94 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
95 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/project-template.qhcp"
96 | @echo "To view the help file:"
97 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/project-template.qhc"
98 |
99 | devhelp:
100 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
101 | @echo
102 | @echo "Build finished."
103 | @echo "To view the help file:"
104 | @echo "# mkdir -p $$HOME/.local/share/devhelp/project-template"
105 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/project-template"
106 | @echo "# devhelp"
107 |
108 | epub:
109 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
110 | @echo
111 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
112 |
113 | latex:
114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 | @echo
116 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
117 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
118 | "(use \`make latexpdf' here to do that automatically)."
119 |
120 | latexpdf:
121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
122 | @echo "Running LaTeX files through pdflatex..."
123 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
124 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
125 |
126 | latexpdfja:
127 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
128 | @echo "Running LaTeX files through platex and dvipdfmx..."
129 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
130 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
131 |
132 | text:
133 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
134 | @echo
135 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
136 |
137 | man:
138 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
139 | @echo
140 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
141 |
142 | texinfo:
143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
144 | @echo
145 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
146 | @echo "Run \`make' in that directory to run these through makeinfo" \
147 | "(use \`make info' here to do that automatically)."
148 |
149 | info:
150 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
151 | @echo "Running Texinfo files through makeinfo..."
152 | make -C $(BUILDDIR)/texinfo info
153 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
154 |
155 | gettext:
156 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
157 | @echo
158 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
159 |
160 | changes:
161 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
162 | @echo
163 | @echo "The overview file is in $(BUILDDIR)/changes."
164 |
165 | linkcheck:
166 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
167 | @echo
168 | @echo "Link check complete; look for any errors in the above output " \
169 | "or in $(BUILDDIR)/linkcheck/output.txt."
170 |
171 | doctest:
172 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
173 | @echo "Testing of doctests in the sources finished, look at the " \
174 | "results in $(BUILDDIR)/doctest/output.txt."
175 |
176 | xml:
177 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
178 | @echo
179 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
180 |
181 | pseudoxml:
182 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
183 | @echo
184 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
185 |
--------------------------------------------------------------------------------
/doc/_static/css/project-template.css:
--------------------------------------------------------------------------------
1 | @import url("theme.css");
2 |
3 | .highlight a {
4 | text-decoration: underline;
5 | }
6 |
7 | .deprecated p {
8 | padding: 10px 7px 10px 10px;
9 | color: #b94a48;
10 | background-color: #F3E5E5;
11 | border: 1px solid #eed3d7;
12 | }
13 |
14 | .deprecated p span.versionmodified {
15 | font-weight: bold;
16 | }
17 |
--------------------------------------------------------------------------------
/doc/_static/js/copybutton.js:
--------------------------------------------------------------------------------
1 | $(document).ready(function() {
2 | /* Add a [>>>] button on the top-right corner of code samples to hide
3 | * the >>> and ... prompts and the output and thus make the code
4 | * copyable. */
5 | var div = $('.highlight-python .highlight,' +
6 | '.highlight-python3 .highlight,' +
7 | '.highlight-pycon .highlight,' +
8 | '.highlight-default .highlight')
9 | var pre = div.find('pre');
10 |
11 | // get the styles from the current theme
12 | pre.parent().parent().css('position', 'relative');
13 | var hide_text = 'Hide the prompts and output';
14 | var show_text = 'Show the prompts and output';
15 | var border_width = pre.css('border-top-width');
16 | var border_style = pre.css('border-top-style');
17 | var border_color = pre.css('border-top-color');
18 | var button_styles = {
19 | 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0',
20 | 'border-color': border_color, 'border-style': border_style,
21 | 'border-width': border_width, 'color': border_color, 'text-size': '75%',
22 | 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '0.2em',
23 | 'border-radius': '0 3px 0 0'
24 | }
25 |
26 | // create and add the button to all the code blocks that contain >>>
27 | div.each(function(index) {
28 | var jthis = $(this);
29 | if (jthis.find('.gp').length > 0) {
30 | var button = $('<span class="copybutton">&gt;&gt;&gt;</span>');
31 | button.css(button_styles)
32 | button.attr('title', hide_text);
33 | button.data('hidden', 'false');
34 | jthis.prepend(button);
35 | }
36 | // tracebacks (.gt) contain bare text elements that need to be
37 | // wrapped in a span to work with .nextUntil() (see later)
38 | jthis.find('pre:has(.gt)').contents().filter(function() {
39 | return ((this.nodeType == 3) && (this.data.trim().length > 0));
40 | }).wrap('<span>');
41 | });
42 |
43 | // define the behavior of the button when it's clicked
44 | $('.copybutton').click(function(e){
45 | e.preventDefault();
46 | var button = $(this);
47 | if (button.data('hidden') === 'false') {
48 | // hide the code output
49 | button.parent().find('.go, .gp, .gt').hide();
50 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden');
51 | button.css('text-decoration', 'line-through');
52 | button.attr('title', show_text);
53 | button.data('hidden', 'true');
54 | } else {
55 | // show the code output
56 | button.parent().find('.go, .gp, .gt').show();
57 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible');
58 | button.css('text-decoration', 'none');
59 | button.attr('title', hide_text);
60 | button.data('hidden', 'false');
61 | }
62 | });
63 | });
64 |
--------------------------------------------------------------------------------
/doc/_templates/class.rst:
--------------------------------------------------------------------------------
1 | :mod:`{{module}}`.{{objname}}
2 | {{ underline }}==============
3 |
4 | .. currentmodule:: {{ module }}
5 |
6 | .. autoclass:: {{ objname }}
7 |
8 | {% block methods %}
9 | .. automethod:: __init__
10 | {% endblock %}
11 |
12 | .. include:: {{module}}.{{objname}}.examples
13 |
14 | .. raw:: html
15 |
16 | <div class="clearer"></div>
17 |
--------------------------------------------------------------------------------
/doc/_templates/function.rst:
--------------------------------------------------------------------------------
1 | :mod:`{{module}}`.{{objname}}
2 | {{ underline }}====================
3 |
4 | .. currentmodule:: {{ module }}
5 |
6 | .. autofunction:: {{ objname }}
7 |
8 | .. include:: {{module}}.{{objname}}.examples
9 |
10 | .. raw:: html
11 |
12 | <div class="clearer"></div>
13 |
--------------------------------------------------------------------------------
/doc/_templates/numpydoc_docstring.py:
--------------------------------------------------------------------------------
1 | {{index}}
2 | {{summary}}
3 | {{extended_summary}}
4 | {{parameters}}
5 | {{returns}}
6 | {{yields}}
7 | {{other_parameters}}
8 | {{attributes}}
9 | {{raises}}
10 | {{warns}}
11 | {{warnings}}
12 | {{see_also}}
13 | {{notes}}
14 | {{references}}
15 | {{examples}}
16 | {{methods}}
17 |
--------------------------------------------------------------------------------
/doc/api.rst:
--------------------------------------------------------------------------------
1 | ###############
2 | Vectorizers API
3 | ###############
4 |
5 | Ngram and Skipgram Vectorizer
6 | =============================
7 |
8 | .. autosummary::
9 | :toctree: generated/
10 | :template: class.rst
11 |
12 | NgramVectorizer
13 | SkipgramVectorizer
14 | LZCompressionVectorizer
15 | BytePairEncodingVectorizer
16 |
17 | TokenCooccurrenceVectorizers
18 | ============================
19 |
20 | .. autosummary::
21 | :toctree: generated/
22 | :template: class.rst
23 |
24 | TokenCooccurrenceVectorizer
25 | MultiSetCooccurrenceVectorizer
26 | TimedTokenCooccurrenceVectorizer
27 | LabelledTreeCooccurrenceVectorizer
28 |
29 | Wasserstein style Vectorizers
30 | =============================
31 |
32 | .. autosummary::
33 | :toctree: generated/
34 | :template: class.rst
35 |
36 | WassersteinVectorizer
37 | SinkhornVectorizer
38 | ApproximateWassersteinVectorizer
39 |
40 | Utility Vectorizers and Transformers
41 | ====================================
42 |
43 | .. autosummary::
44 | :toctree: generated/
45 | :template: class.rst
46 |
47 | EdgeListVectorizer
48 | CategoricalColumnTransformer
49 | InformationWeightTransformer
50 | RowDenoisingTransformer
51 | CountFeatureCompressionTransformer
52 |
53 | Time Series Vectorizers and Transformers
54 | ========================================
55 |
56 | .. autosummary::
57 | :toctree: generated/
58 | :template: class.rst
59 |
60 | HistogramVectorizer
61 | KDEVectorizer
62 | SlidingWindowTransformer
63 |
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # project-template documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Jan 18 14:44:12 2016.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys
16 | import os
17 |
18 | import sphinx_gallery
19 | import sphinx_rtd_theme
20 |
21 | # If extensions (or modules to document with autodoc) are in another directory,
22 | # add these directories to sys.path here. If the directory is relative to the
23 | # documentation root, use os.path.abspath to make it absolute, like shown here.
24 | #sys.path.insert(0, os.path.abspath('.'))
25 |
26 | # -- General configuration ------------------------------------------------
27 |
28 | # If your documentation needs a minimal Sphinx version, state it here.
29 | #needs_sphinx = '1.0'
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = [
35 | 'sphinx.ext.autodoc',
36 | 'sphinx.ext.autosummary',
37 | 'sphinx.ext.doctest',
38 | 'sphinx.ext.intersphinx',
39 | 'sphinx.ext.viewcode',
40 | 'numpydoc',
41 | 'nbsphinx',
42 | 'sphinx_gallery.gen_gallery',
43 | ]
44 |
45 | # avoid numpydoc listing class members twice when combined with autosummary
46 | # see https://github.com/numpy/numpydoc/issues/69
47 | numpydoc_show_class_members = False
48 |
49 | # pngmath / imgmath compatibility layer for different sphinx versions
50 | import sphinx
51 | from distutils.version import LooseVersion
52 | if LooseVersion(sphinx.__version__) < LooseVersion('1.4'):
53 | extensions.append('sphinx.ext.pngmath')
54 | else:
55 | extensions.append('sphinx.ext.imgmath')
56 |
57 | autodoc_default_flags = ['members', 'inherited-members']
58 |
59 | # Add any paths that contain templates here, relative to this directory.
60 | templates_path = ['_templates']
61 |
62 | # generate autosummary even if no references
63 | autosummary_generate = True
64 |
65 | # The suffix of source filenames.
66 | source_suffix = '.rst'
67 |
68 | # The encoding of source files.
69 | #source_encoding = 'utf-8-sig'
70 |
71 | # Generate the plots for the gallery
72 | plot_gallery = True
73 |
74 | # The master toctree document.
75 | master_doc = 'index'
76 |
77 | # General information about the project.
78 | project = u'vectorizers'
79 | copyright = u'2022, Benoit Hamelin, John Healy, Leland McInnes, Colin Weir'
80 |
81 | # The version info for the project you're documenting, acts as replacement for
82 | # |version| and |release|, also used in various other places throughout the
83 | # built documents.
84 | #
85 | # The short X.Y version.
86 | from vectorizers import __version__
87 | version = __version__
88 | # The full version, including alpha/beta/rc tags.
89 | release = __version__
90 |
91 | # The language for content autogenerated by Sphinx. Refer to documentation
92 | # for a list of supported languages.
93 | #language = None
94 |
95 | # There are two options for replacing |today|: either, you set today to some
96 | # non-false value, then it is used:
97 | #today = ''
98 | # Else, today_fmt is used as the format for a strftime call.
99 | #today_fmt = '%B %d, %Y'
100 |
101 | # List of patterns, relative to source directory, that match files and
102 | # directories to ignore when looking for source files.
103 | exclude_patterns = ['_build', '_templates']
104 |
105 | # The reST default role (used for this markup: `text`) to use for all
106 | # documents.
107 | #default_role = None
108 |
109 | # If true, '()' will be appended to :func: etc. cross-reference text.
110 | #add_function_parentheses = True
111 |
112 | # If true, the current module name will be prepended to all description
113 | # unit titles (such as .. function::).
114 | #add_module_names = True
115 |
116 | # If true, sectionauthor and moduleauthor directives will be shown in the
117 | # output. They are ignored by default.
118 | #show_authors = False
119 |
120 | # The name of the Pygments (syntax highlighting) style to use.
121 | pygments_style = 'sphinx'
122 |
123 | # Custom style
124 | html_style = 'css/project-template.css'
125 |
126 | # A list of ignored prefixes for module index sorting.
127 | #modindex_common_prefix = []
128 |
129 | # If true, keep warnings as "system message" paragraphs in the built documents.
130 | #keep_warnings = False
131 |
132 |
133 | # -- Options for HTML output ----------------------------------------------
134 |
135 | # The theme to use for HTML and HTML Help pages. See the documentation for
136 | # a list of builtin themes.
137 | html_theme = 'sphinx_rtd_theme'
138 |
139 | # Theme options are theme-specific and customize the look and feel of a theme
140 | # further. For a list of options available for each theme, see the
141 | # documentation.
142 | #html_theme_options = {}
143 |
144 | # Add any paths that contain custom themes here, relative to this directory.
145 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
146 |
147 | # The name for this set of Sphinx documents. If None, it defaults to
148 | # " v documentation".
149 | #html_title = None
150 |
151 | # A shorter title for the navigation bar. Default is the same as html_title.
152 | #html_short_title = None
153 |
154 | # The name of an image file (relative to this directory) to place at the top
155 | # of the sidebar.
156 | html_theme_options = {"navigation_depth": 3, "logo_only": True}
157 |
158 | html_logo = "vectorizers_logo_no_text.png"
159 |
160 |
161 | # The name of an image file (within the static path) to use as favicon of the
162 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
163 | # pixels large.
164 | #html_favicon = None
165 |
166 | # Add any paths that contain custom static files (such as style sheets) here,
167 | # relative to this directory. They are copied after the builtin static files,
168 | # so a file named "default.css" will overwrite the builtin "default.css".
169 | html_static_path = ['_static']
170 |
171 | # Add any extra paths that contain custom files (such as robots.txt or
172 | # .htaccess) here, relative to this directory. These files are copied
173 | # directly to the root of the documentation.
174 | #html_extra_path = []
175 |
176 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
177 | # using the given strftime format.
178 | #html_last_updated_fmt = '%b %d, %Y'
179 |
180 | # If true, SmartyPants will be used to convert quotes and dashes to
181 | # typographically correct entities.
182 | #html_use_smartypants = True
183 |
184 | # Custom sidebar templates, maps document names to template names.
185 | #html_sidebars = {}
186 |
187 | # Additional templates that should be rendered to pages, maps page names to
188 | # template names.
189 | #html_additional_pages = {}
190 |
191 | # If false, no module index is generated.
192 | #html_domain_indices = True
193 |
194 | # If false, no index is generated.
195 | #html_use_index = True
196 |
197 | # If true, the index is split into individual pages for each letter.
198 | #html_split_index = False
199 |
200 | # If true, links to the reST sources are added to the pages.
201 | #html_show_sourcelink = True
202 |
203 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
204 | #html_show_sphinx = True
205 |
206 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
207 | #html_show_copyright = True
208 |
209 | # If true, an OpenSearch description file will be output, and all pages will
210 | # contain a <link> tag referring to it. The value of this option must be the
211 | # base URL from which the finished HTML is served.
212 | #html_use_opensearch = ''
213 |
214 | # This is the file name suffix for HTML files (e.g. ".xhtml").
215 | #html_file_suffix = None
216 |
217 | # Output file base name for HTML help builder.
218 | htmlhelp_basename = 'project-templatedoc'
219 |
220 |
221 | # -- Options for LaTeX output ---------------------------------------------
222 |
223 | latex_elements = {
224 | # The paper size ('letterpaper' or 'a4paper').
225 | #'papersize': 'letterpaper',
226 |
227 | # The font size ('10pt', '11pt' or '12pt').
228 | #'pointsize': '10pt',
229 |
230 | # Additional stuff for the LaTeX preamble.
231 | #'preamble': '',
232 | }
233 |
234 | # Grouping the document tree into LaTeX files. List of tuples
235 | # (source start file, target name, title,
236 | # author, documentclass [howto, manual, or own class]).
237 | latex_documents = [
238 | ('index', 'project-template.tex', u'project-template Documentation',
239 | u'Vighnesh Birodkar', 'manual'),
240 | ]
241 |
242 | # The name of an image file (relative to this directory) to place at the top of
243 | # the title page.
244 | #latex_logo = None
245 |
246 | # For "manual" documents, if this is true, then toplevel headings are parts,
247 | # not chapters.
248 | #latex_use_parts = False
249 |
250 | # If true, show page references after internal links.
251 | #latex_show_pagerefs = False
252 |
253 | # If true, show URL addresses after external links.
254 | #latex_show_urls = False
255 |
256 | # Documents to append as an appendix to all manuals.
257 | #latex_appendices = []
258 |
259 | # If false, no module index is generated.
260 | #latex_domain_indices = True
261 |
262 |
263 | # -- Options for manual page output ---------------------------------------
264 |
265 | # One entry per manual page. List of tuples
266 | # (source start file, name, description, authors, manual section).
267 | man_pages = [
268 | ('index', 'project-template', u'project-template Documentation',
269 | [u'Vighnesh Birodkar'], 1)
270 | ]
271 |
272 | # If true, show URL addresses after external links.
273 | #man_show_urls = False
274 |
275 |
276 | # -- Options for Texinfo output -------------------------------------------
277 |
278 | # Grouping the document tree into Texinfo files. List of tuples
279 | # (source start file, target name, title, author,
280 | # dir menu entry, description, category)
281 | texinfo_documents = [
282 | ('index', 'project-template', u'project-template Documentation',
283 | u'Vighnesh Birodkar', 'project-template', 'One line description of project.',
284 | 'Miscellaneous'),
285 | ]
286 |
287 | # Documents to append as an appendix to all manuals.
288 | #texinfo_appendices = []
289 |
290 | # If false, no module index is generated.
291 | #texinfo_domain_indices = True
292 |
293 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
294 | #texinfo_show_urls = 'footnote'
295 |
296 | # If true, do not generate a @detailmenu in the "Top" node's menu.
297 | #texinfo_no_detailmenu = False
298 |
299 |
300 | # Example configuration for intersphinx: refer to the Python standard library.
301 | # intersphinx configuration
302 | intersphinx_mapping = {
303 | 'python': ('https://docs.python.org/{.major}'.format(
304 | sys.version_info), None),
305 | 'numpy': ('https://docs.scipy.org/doc/numpy/', None),
306 | 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None),
307 | 'matplotlib': ('https://matplotlib.org/', None),
308 | 'sklearn': ('http://scikit-learn.org/stable', None)
309 | }
310 |
311 | # sphinx-gallery configuration
312 | sphinx_gallery_conf = {
313 | 'doc_module': 'vectorizers',
314 | 'backreferences_dir': os.path.join('generated'),
315 | 'reference_url': {
316 | 'vectorizers': None}
317 | }
318 |
319 | def setup(app):
320 | # a copy button to copy snippet of code from the documentation
321 | # app.add_javascript('js/copybutton.js')
322 | pass
--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
1 | .. project-template documentation master file, created by
2 | sphinx-quickstart on Mon Jan 18 14:44:12 2016.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | .. image:: vectorizers_logo_text.png
7 | :width: 600
8 | :alt: Vectorizers Logo
9 |
10 | =====================================================
11 | Vectorizers: Transform unstructured data into vectors
12 | =====================================================
13 |
14 | There are a large number of machine learning tools for effectively exploring and working
15 | with data that is given as vectors (ideally with a defined notion of distance as well).
16 | There is also a large volume of data that does not come neatly packaged as vectors. It
17 | could be text data, variable length sequence data (either numeric or categorical),
18 | dataframes of mixed data types, sets of point clouds, or more. Usually, one way or another,
19 | such data can be wrangled into vectors in a way that preserves some relevant properties
20 | of the original data. This library seeks to provide a wide variety of
21 | general purpose techniques for such wrangling, making it easier and faster for users
22 | to get various kinds of unstructured sequence data into vector formats for exploration and
23 | machine learning.
24 |
25 | .. toctree::
26 | :maxdepth: 2
27 | :caption: Taxonomy of sequences
28 |
29 | sequence_taxonomy
30 |
31 | .. toctree::
32 | :maxdepth: 2
33 | :caption: Quick Start
34 |
35 | quick_start
36 |
37 | .. toctree::
38 | :maxdepth: 2
39 | :caption: Getting Started Tutorials
40 |
41 | document_vectorization
42 | CategoricalColumnTransformer_intro
43 |
44 | .. toctree::
45 | :maxdepth: 2
46 | :caption: Example Use Cases
47 |
48 | token_cooccurrence_vectorizer_multi_labelled_cyber_example
49 | categorical_column_transformer_example
50 |
51 | .. toctree::
52 | :maxdepth: 2
53 | :caption: Understanding the Tools
54 |
55 | information_weight_transform
56 |
57 | .. toctree::
58 | :maxdepth: 2
59 | :caption: API Reference
60 |
61 | api
62 |
--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
18 | if "%1" == "help" (
19 | :help
20 | echo.Please use `make ^<target^>` where ^<target^> is one of
21 | echo. html to make standalone HTML files
22 | echo. dirhtml to make HTML files named index.html in directories
23 | echo. singlehtml to make a single large HTML file
24 | echo. pickle to make pickle files
25 | echo. json to make JSON files
26 | echo. htmlhelp to make HTML files and a HTML help project
27 | echo. qthelp to make HTML files and a qthelp project
28 | echo. devhelp to make HTML files and a Devhelp project
29 | echo. epub to make an epub
30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | echo. text to make text files
32 | echo. man to make manual pages
33 | echo. texinfo to make Texinfo files
34 | echo. gettext to make PO message catalogs
35 | echo. changes to make an overview over all changed/added/deprecated items
36 | echo. xml to make Docutils-native XML files
37 | echo. pseudoxml to make pseudoxml-XML files for display purposes
38 | echo. linkcheck to check all external links for integrity
39 | echo. doctest to run all doctests embedded in the documentation if enabled
40 | goto end
41 | )
42 |
43 | if "%1" == "clean" (
44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
45 | del /q /s %BUILDDIR%\*
46 | goto end
47 | )
48 |
49 |
50 | %SPHINXBUILD% 2> nul
51 | if errorlevel 9009 (
52 | echo.
53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
54 | echo.installed, then set the SPHINXBUILD environment variable to point
55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
56 | echo.may add the Sphinx directory to PATH.
57 | echo.
58 | echo.If you don't have Sphinx installed, grab it from
59 | echo.http://sphinx-doc.org/
60 | exit /b 1
61 | )
62 |
63 | if "%1" == "html" (
64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
65 | if errorlevel 1 exit /b 1
66 | echo.
67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
68 | goto end
69 | )
70 |
71 | if "%1" == "dirhtml" (
72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
73 | if errorlevel 1 exit /b 1
74 | echo.
75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
76 | goto end
77 | )
78 |
79 | if "%1" == "singlehtml" (
80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
81 | if errorlevel 1 exit /b 1
82 | echo.
83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
84 | goto end
85 | )
86 |
87 | if "%1" == "pickle" (
88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
89 | if errorlevel 1 exit /b 1
90 | echo.
91 | echo.Build finished; now you can process the pickle files.
92 | goto end
93 | )
94 |
95 | if "%1" == "json" (
96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
97 | if errorlevel 1 exit /b 1
98 | echo.
99 | echo.Build finished; now you can process the JSON files.
100 | goto end
101 | )
102 |
103 | if "%1" == "htmlhelp" (
104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
105 | if errorlevel 1 exit /b 1
106 | echo.
107 | echo.Build finished; now you can run HTML Help Workshop with the ^
108 | .hhp project file in %BUILDDIR%/htmlhelp.
109 | goto end
110 | )
111 |
112 | if "%1" == "qthelp" (
113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
114 | if errorlevel 1 exit /b 1
115 | echo.
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^
117 | .qhcp project file in %BUILDDIR%/qthelp, like this:
118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\project-template.qhcp
119 | echo.To view the help file:
120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\project-template.qhc
121 | goto end
122 | )
123 |
124 | if "%1" == "devhelp" (
125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
126 | if errorlevel 1 exit /b 1
127 | echo.
128 | echo.Build finished.
129 | goto end
130 | )
131 |
132 | if "%1" == "epub" (
133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
134 | if errorlevel 1 exit /b 1
135 | echo.
136 | echo.Build finished. The epub file is in %BUILDDIR%/epub.
137 | goto end
138 | )
139 |
140 | if "%1" == "latex" (
141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
142 | if errorlevel 1 exit /b 1
143 | echo.
144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
145 | goto end
146 | )
147 |
148 | if "%1" == "latexpdf" (
149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
150 | cd %BUILDDIR%/latex
151 | make all-pdf
152 | cd %BUILDDIR%/..
153 | echo.
154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
155 | goto end
156 | )
157 |
158 | if "%1" == "latexpdfja" (
159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
160 | cd %BUILDDIR%/latex
161 | make all-pdf-ja
162 | cd %BUILDDIR%/..
163 | echo.
164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
165 | goto end
166 | )
167 |
168 | if "%1" == "text" (
169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
170 | if errorlevel 1 exit /b 1
171 | echo.
172 | echo.Build finished. The text files are in %BUILDDIR%/text.
173 | goto end
174 | )
175 |
176 | if "%1" == "man" (
177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
178 | if errorlevel 1 exit /b 1
179 | echo.
180 | echo.Build finished. The manual pages are in %BUILDDIR%/man.
181 | goto end
182 | )
183 |
184 | if "%1" == "texinfo" (
185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
186 | if errorlevel 1 exit /b 1
187 | echo.
188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
189 | goto end
190 | )
191 |
192 | if "%1" == "gettext" (
193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
194 | if errorlevel 1 exit /b 1
195 | echo.
196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
197 | goto end
198 | )
199 |
200 | if "%1" == "changes" (
201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
202 | if errorlevel 1 exit /b 1
203 | echo.
204 | echo.The overview file is in %BUILDDIR%/changes.
205 | goto end
206 | )
207 |
208 | if "%1" == "linkcheck" (
209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
210 | if errorlevel 1 exit /b 1
211 | echo.
212 | echo.Link check complete; look for any errors in the above output ^
213 | or in %BUILDDIR%/linkcheck/output.txt.
214 | goto end
215 | )
216 |
217 | if "%1" == "doctest" (
218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
219 | if errorlevel 1 exit /b 1
220 | echo.
221 | echo.Testing of doctests in the sources finished, look at the ^
222 | results in %BUILDDIR%/doctest/output.txt.
223 | goto end
224 | )
225 |
226 | if "%1" == "xml" (
227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
228 | if errorlevel 1 exit /b 1
229 | echo.
230 | echo.Build finished. The XML files are in %BUILDDIR%/xml.
231 | goto end
232 | )
233 |
234 | if "%1" == "pseudoxml" (
235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
236 | if errorlevel 1 exit /b 1
237 | echo.
238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
239 | goto end
240 | )
241 |
242 | :end
243 |
--------------------------------------------------------------------------------
/doc/quick_start.rst:
--------------------------------------------------------------------------------
1 | ############################
2 | Quick Start with Vectorizers
3 | ############################
4 |
5 | Vectorizers provides a number of tools for working with various kinds of
6 | unstructured data with a focus on sequence data. The library is built to be
7 | compatible with scikit-learn_ and can be used in scikit-learn pipelines.
8 |
9 | ----------
10 | Installing
11 | ----------
12 |
13 | Vectorizers can be installed via pip (coming soon) and via conda-forge (coming later):
14 |
15 | .. code:: bash
16 |
17 |     pip install vectorizers
18 |
19 | Until then, the latest version can be installed directly from GitHub:
20 |
21 | .. code:: bash
22 |
23 |     pip install git+https://github.com/TutteInstitute/vectorizers.git
24 |
25 | To manually install this package:
26 |
27 | .. code:: bash
28 |
29 | wget https://github.com/TutteInstitute/vectorizers/archive/master.zip
30 | unzip master.zip
31 | rm master.zip
32 | cd vectorizers-master
33 | python setup.py install
34 |
35 | -----------
36 | Basic Usage
37 | -----------
38 |
39 | The vectorizers package provides a number of tools for vectorizing different kinds of
40 | input data. All of them are available as classes that follow scikit-learn's basic API
41 | for transformers, converting input data into vectors in one form or another. For example,
42 | to convert sequences of categorical data into ngram vector representations one might use:
43 |
44 | .. code:: python3
45 |
46 | import vectorizers
47 |
48 | ngrammer = vectorizers.NgramVectorizer(ngram_size=2)
49 |     ngram_vectors = ngrammer.fit_transform(input_sequences)
50 |
51 | These classes can easily be fit into sklearn pipelines, passing vector
52 | representations on to other scikit-learn (or scikit-learn compatible) classes. See
53 | the `Vectorizers API`_ documentation for more details on the available classes.
54 |
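For example, a minimal sketch of chaining a vectorizer with a downstream
scikit-learn estimator (the ``TruncatedSVD`` step and the ``input_sequences``
placeholder are purely illustrative) might look like:

.. code:: python3

    import vectorizers
    from sklearn.pipeline import make_pipeline
    from sklearn.decomposition import TruncatedSVD

    # vectorize 2-grams of the input sequences, then reduce dimension
    pipeline = make_pipeline(
        vectorizers.NgramVectorizer(ngram_size=2),
        TruncatedSVD(n_components=10),
    )
    low_dim_vectors = pipeline.fit_transform(input_sequences)
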
55 | Vectorizers also provides a number of utility transformers in the ``vectorizers.transformers``
56 | namespace. These provide convenient transformations of data -- either transforms of vectorized
57 | data, including feature weighting tools, or transformations of structured and unstructured data
58 | into sequences more amenable to the other vectorizer classes.
59 |
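As an illustrative sketch, one such transformer is the information weight
transformer (the class name ``InformationWeightTransformer`` and its default
constructor are assumptions here; see the information weight notebook and the
API documentation for the exact signature). It re-weights the count matrix
produced by a vectorizer:

.. code:: python3

    from vectorizers.transformers import InformationWeightTransformer

    # ngram_vectors is the count matrix produced by the NgramVectorizer above
    weighted_vectors = InformationWeightTransformer().fit_transform(ngram_vectors)
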
60 | .. _scikit-learn: https://scikit-learn.org/stable/
61 | .. _Vectorizers API: https://vectorizers.readthedocs.io/en/latest/api.html
--------------------------------------------------------------------------------
/doc/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | scikit-learn
4 | numba
5 | pandas
6 | dask
7 | pynndescent>=0.5
8 | pomegranate
9 | pygments>=2.4.1
10 | jupyterlab_pygments>=0.1.1
11 | ipykernel
12 | nbsphinx
13 | numpydoc
14 | sphinx-rtd-theme
15 | sphinx-gallery
16 |
--------------------------------------------------------------------------------
/doc/sequence_taxonomy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Variable Length Sequences\n",
8 | "\n",
9 | "Variable length sequence data can present a significant challenge\n",
10 | "to machine learning algorithms and data science analysis.\n",
11 | "\n",
12 | "Part of this problem is driven by the wide varieties of variable length\n",
13 | "sequence data that are encountered in the wild. To that end we present\n",
14 | "a taxonomy of the kinds of variable length sequences that we typically\n",
15 | "encounter and our suggestions for how to think about them.\n",
16 | "\n",
17 | "We generally find it useful when describing variable length sequence data\n",
18 | "to describe what it is a sequence of. The basic types that we commonly\n",
19 | "encounter are: categorical values, scalar values and vector values. Certainly scalar data could be thought of a simple one dimensional vector data but given the different techniques that can, and often are, applied to such data we feel that treating it as a seperate data type is warranted.\n",
20 | "\n",
21 | "Next we describe it as either ordered or unordered sequences. Yes, an unordered sequence is an odd turn of phrase but we be find it to be a useful simplifying\n",
22 | "notion. An unordered sequence is often referred to as a bag in data science\n",
23 | "literature. For a example a `bag of words` is the phrase used to describe an\n",
24 | "unordered collection of word tokens. We would describe such a collection as an\n",
25 | "unordered categorical sequence.\n",
26 | "\n",
27 | "Lasty, given an ordered sequence we require one extra piece of information:\n",
28 | "is the ordered regular or irregular. Regular sequences are often\n",
29 | "described as heartbeat data and generally assume equal spacing between all our\n",
30 | "values. Irregular sequences are often referred to as event data and each\n",
31 | "value is associated with a particular position allowing variable spacing amongst\n",
32 | "our values.\n",
33 | "\n",
34 | "Variable length sequence data comes in a vast variety of forms. Different forms of variable length sequence data are amenable to different techniques. To deal with this variety of data we propose this simple taxonomy of variable length sequence data and provide links and suggestions for techniques conducive to each type. \n",
35 | "\n",
36 | "* Type of values: categorical, scalar, vector\n",
37 | "* Order of values: Ordered or Unordered\n",
38 | "* Regularity of values: Regular or Irregular"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "### Sequence Types\n",
46 | "#### Categorical\n",
47 | "| regularity | order | type | sequence | | example |\n",
48 | "| :- | :- | :- | :- | :-: | :-: | \n",
49 | "| | unordered | categorical | sequence | -> | bag of words |\n",
50 | "| regular | ordered | categorical | sequence | -> | text document |\n",
51 | "| irregular | ordered | categorical | sequence | ->| time stamped labelled events |\n",
52 | "\n",
53 | "\n",
54 | "#### Scalar\n",
55 | "| regularity | order | type | sequence | | example |\n",
56 | "| :- | :- | :- | :- | :-: | :-: | \n",
57 | "| | unordered | Scalar | sequence | -> | random variable |\n",
58 | "| regular | ordered | Scalar | sequence | -> | heartbeat time-series |\n",
59 | "| irregular | ordered | Scalar | sequence | ->| time stamped values or event sequence |\n",
60 | "\n",
61 | "#### Vector\n",
62 | "| regularity | order | type | sequence | | example |\n",
63 | "| :- | :- | :- | :- | :-: | :-: | \n",
64 | "| | unordered | Vector | sequence | -> | point cloud |\n",
65 | "| regular | ordered | Vector | sequence | -> | spatial-trajectory data |\n",
66 | "| irregular | ordered | Vector | sequence | ->| time stamped locations |\n"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Vectorizer Functions \n",
74 | "\n",
75 | "This library adheres the sklearn transformer paradigm. With most functions having a `fit`, `fit_transform` and `transform` functions. As such they can be easily arranged in sklearn pipelines to ensure that all of your data transformation steps are encapsulated cleanly.\n",
76 | "\n",
77 | "For the most part our `vectorizers` take in a sequence of variable length sequences and learn a fixed width representation of these sequences. Another way of thinking of this is transforming a jagged array of vectors into a fixed width array of vectors. Fixed width representations are significantly more conducive to traditional machine learning algorithms.\n",
78 | "\n",
79 | "`Transformers` on the other hand are more generic utility functions that massage data in various useful ways. \n",
80 | "\n",
81 | "Due to the variety of vectorization techniques in this library a user might find it easier to determine the type of variable length sequences they are dealing with and use the following index to find the relevant functions.\n",
82 | "\n",
83 | "#### Categorical\n",
84 | "| regularity | order | type | sequence | | example | functions |\n",
85 | "| :- | :- | :- | :- | :-: | :- | :- | \n",
86 | "| | unordered | categorical | sequence | -> | bag of words | [NgramVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.NgramVectorizer.html#vectorizers.NgramVectorizer), [EdgeListVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.EdgeListVectorizer.html) | \n",
87 | "| regular | ordered | categorical | sequence | -> | text document | NgramVectorizer, [LZCompressionVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.LZCompressionVectorizer.html), [BPEVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.BytePairEncodingVectorizer.html) | \n",
88 | "| irregular | ordered | categorical | sequence | ->| time stamped labelled events | [HistogramVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.HistogramVectorizer.html) |\n",
89 | "\n",
90 | "All of these vectorizers take data in the form of a sequence of variable length sequences of categorical values (such as strings). All of these methods presume that a user has already decomposed their data into something of this form. \n",
91 | "\n",
92 | "The most common sources of variable length categorical data are text documents or data frames with categorical columns. In both cases some pre-processing will be necessary to convert such data into sequences of variable length sequences. \n",
93 | "\n",
94 | "In the case of text documents this often involves tokenization and lemmatization steps. An example of applying such transformations on text data before vectorization can be found in [document vectorizer](https://vectorizers.readthedocs.io/en/latest/document_vectorization.html).\n",
95 | "\n",
96 | "Good tokenization and lemmatization libraries include: [HuggingFace](https://huggingface.co/docs/transformers/main_classes/tokenizer), [SentencePiece](https://github.com/google/sentencepiece), [spaCy](https://spacy.io/api/tokenizer), and [nltk](https://www.nltk.org/api/nltk.tokenize.html).\n",
97 | "\n",
98 | "In the case of a data frame with multiple categorical columns one might make use of our libraries CategoricalColumnTransformer for transforming a data frame with one or more columns into a variable length sequence of categorical sequences. This is typically done by specifying one categorical column to represent ones objects and another set of categorical columns to be used to describe said objects.\n",
99 | "For an examples of how one might use this see an [introduction to CategoricalColumnTransformer](https://vectorizers.readthedocs.io/en/latest/CategoricalColumnTransformer_intro.html) or the more complicated [CategoricalColumnTransformer vignette](https://vectorizers.readthedocs.io/en/latest/categorical_column_transformer_example.html). \n",
100 | "\n",
101 | "#### Scalar\n",
102 | "| regularity | order | type | sequence | | example | functions |\n",
103 | "| :- | :- | :- | :- | :-: | :- | :- | \n",
104 | "| | unordered | Scalar | sequence | -> | random variable | [HistogramVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.HistogramVectorizer.html), DistributionVectorizer |\n",
105 | "| regular | ordered | Scalar | sequence | -> | heartbeat time-series | SlidingWindowTransformer |\n",
106 | "| irregular | ordered | Scalar | sequence | ->| time stamped values or event sequence | [KDEVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.KDEVectorizer.html#vectorizers.KDEVectorizer) |\n",
107 | "\n",
108 | "One should note that regular ordered scalar sequences references a Transformer function instead of a Vectorizer. That is because our current recommendation for dealing with such sequences is to use the SlidingWindowTransformer to encode the sequence information into an unordered scalar sequence and then apply the appropriate techniques.\n",
109 | "\n",
110 | "#### Vector\n",
111 | "| regularity | order | type | sequence | | example | functions |\n",
112 | "| :- | :- | :- | :- | :-: | :- | :- | \n",
113 | "| | unordered | Vector | sequence | -> | point cloud | [WassersteinVectorizer](https://vectorizers.readthedocs.io/en/latest/generated/vectorizers.WassersteinVectorizer.html#vectorizers.WassersteinVectorizer), DistributionVectorizer |\n",
114 | "| regular | ordered | Vector | sequence | -> | spatial-trajectory data | SlidingWindowTransformer |\n",
115 | "| irregular | ordered | Vector | sequence | ->| time stamped locations | `we accept pull requests` |\n",
116 | "\n",
117 | "One should note that regular ordered vector sequences references a Transformer function instead of a Vectorizer. That is because our current recommendation for dealing with such sequences is to use the SlidingWindowTransformer to encode the sequence information into an unordered vector sequence and then apply the appropriate techniques.\n",
118 | "\n",
119 | "WassersteinVectorizer should be considered the gold standard for vectorizing point clouds of data. It makes use linear optimal transport to linearize and thus provide a reasonably scalable vectorization of a point cloud so that Euclidean or Cosine distance on this space will be a reasonable approximation of Wasserstein distance between the point cloud distrubitons. SinkhornVectorizer can handle much larger distributions of data and is generally more efficient but this efficiency may come with some loss of quality. Lastly, we include an ApproximateWassersteinVectorizer which is a heuristic linear algebra based solution which poorly approximates our WassersteinVectorizer but is very, very fast. "
120 | ]
121 | }
122 | ],
123 | "metadata": {
124 | "kernelspec": {
125 | "display_name": "Python 3 (ipykernel)",
126 | "language": "python",
127 | "name": "python3"
128 | },
129 | "language_info": {
130 | "codemirror_mode": {
131 | "name": "ipython",
132 | "version": 3
133 | },
134 | "file_extension": ".py",
135 | "mimetype": "text/x-python",
136 | "name": "python",
137 | "nbconvert_exporter": "python",
138 | "pygments_lexer": "ipython3",
139 | "version": "3.10.10"
140 | }
141 | },
142 | "nbformat": 4,
143 | "nbformat_minor": 4
144 | }
145 |
--------------------------------------------------------------------------------
/doc/user_guide.rst:
--------------------------------------------------------------------------------
1 | .. title:: User guide : contents
2 |
3 | .. _user_guide:
4 |
5 | ==================================================
6 | User guide: create your own scikit-learn estimator
7 | ==================================================
8 |
9 | Estimator
10 | ---------
11 |
12 | The central piece of any transformer, regressor, and classifier is
13 | :class:`sklearn.base.BaseEstimator`. All estimators in scikit-learn are derived
14 | from this class. In more detail, this base class enables you to set and get
15 | the parameters of the estimator. It can be imported as::
16 |
17 | >>> from sklearn.base import BaseEstimator
18 |
19 | Once imported, you can create a class which inherits from this base class::
20 |
21 | >>> class MyOwnEstimator(BaseEstimator):
22 | ... pass
23 |
24 | Transformer
25 | -----------
26 |
27 | Transformers are scikit-learn estimators which implement a ``transform`` method.
28 | The use case is the following:
29 |
30 | * at ``fit``, some parameters can be learned from ``X`` and ``y``;
31 | * at ``transform``, `X` will be transformed, using the parameters learned
32 | during ``fit``.
33 |
34 | .. _mixin: https://en.wikipedia.org/wiki/Mixin
35 |
36 | In addition, scikit-learn provides a
37 | mixin_, i.e. :class:`sklearn.base.TransformerMixin`, which
38 | implements the combination of ``fit`` and ``transform`` called ``fit_transform``.
39 |
40 | One can import the mixin class as::
41 |
42 | >>> from sklearn.base import TransformerMixin
43 |
44 | Therefore, when creating a transformer, you need to create a class which
45 | inherits from both :class:`sklearn.base.BaseEstimator` and
46 | :class:`sklearn.base.TransformerMixin`. The scikit-learn API imposed ``fit`` to
47 | **return ``self``**. The reason is that it allows to pipeline ``fit`` and
48 | ``transform`` imposed by the :class:`sklearn.base.TransformerMixin`. The
49 | ``fit`` method is expected to have ``X`` and ``y`` as inputs. Note that
50 | ``transform`` takes only ``X`` as input and is expected to return the
51 | transformed version of ``X``::
52 |
53 | >>> class MyOwnTransformer(BaseEstimator, TransformerMixin):
54 | ... def fit(self, X, y=None):
55 | ... return self
56 | ... def transform(self, X):
57 | ... return X
58 |
59 | We build a basic example to show that our :class:`MyOwnTransformer` is working
60 | within a scikit-learn ``pipeline``::
61 |
62 | >>> from sklearn.datasets import load_iris
63 | >>> from sklearn.pipeline import make_pipeline
64 | >>> from sklearn.linear_model import LogisticRegression
65 | >>> X, y = load_iris(return_X_y=True)
66 | >>> pipe = make_pipeline(MyOwnTransformer(),
67 | ... LogisticRegression(random_state=10,
68 | ... solver='lbfgs',
69 | ... multi_class='auto'))
70 | >>> pipe.fit(X, y) # doctest: +ELLIPSIS
71 | Pipeline(...)
72 | >>> pipe.predict(X) # doctest: +ELLIPSIS
73 | array([...])
74 |
75 | Predictor
76 | ---------
77 |
78 | Regressor
79 | ~~~~~~~~~
80 |
81 | Similarly, regressors are scikit-learn estimators which implement a ``predict``
82 | method. The use case is the following:
83 |
84 | * at ``fit``, some parameters can be learned from ``X`` and ``y``;
85 | * at ``predict``, predictions will be computed from ``X`` using the parameters
86 | learned during ``fit``.
87 |
88 | In addition, scikit-learn provides a mixin_, i.e.
89 | :class:`sklearn.base.RegressorMixin`, which implements the ``score`` method
90 | which computes the :math:`R^2` score of the predictions.
91 |
92 | One can import the mixin as::
93 |
94 | >>> from sklearn.base import RegressorMixin
95 |
96 | Therefore, we create a regressor, :class:`MyOwnRegressor` which inherits from
97 | both :class:`sklearn.base.BaseEstimator` and
98 | :class:`sklearn.base.RegressorMixin`. The method ``fit`` gets ``X`` and ``y``
99 | as input and should return ``self``. It should implement the ``predict``
100 | function which should output the predictions of your regressor::
101 |
102 | >>> import numpy as np
103 | >>> class MyOwnRegressor(BaseEstimator, RegressorMixin):
104 | ... def fit(self, X, y):
105 | ... return self
106 | ... def predict(self, X):
107 | ... return np.mean(X, axis=1)
108 |
109 | We illustrate that this regressor is working within a scikit-learn pipeline::
110 |
111 | >>> from sklearn.datasets import load_diabetes
112 | >>> X, y = load_diabetes(return_X_y=True)
113 | >>> pipe = make_pipeline(MyOwnTransformer(), MyOwnRegressor())
114 | >>> pipe.fit(X, y) # doctest: +ELLIPSIS
115 | Pipeline(...)
116 | >>> pipe.predict(X) # doctest: +ELLIPSIS
117 | array([...])
118 |
119 | Since we inherit from the :class:`sklearn.base.RegressorMixin`, we can call
120 | the ``score`` method which will return the :math:`R^2` score::
121 |
122 | >>> pipe.score(X, y) # doctest: +ELLIPSIS
123 | -3.9...
124 |
125 | Classifier
126 | ~~~~~~~~~~
127 |
128 | Similarly to regressors, classifiers implement ``predict``. In addition, they
129 | output the probabilities of the prediction using the ``predict_proba`` method:
130 |
131 | * at ``fit``, some parameters can be learned from ``X`` and ``y``;
132 | * at ``predict``, predictions will be computed from ``X`` using the parameters
133 | learned during ``fit``. The output corresponds to the predicted class for each sample;
134 | * ``predict_proba`` will give a 2D matrix where each column corresponds to a
135 | class and each entry will be the probability of the associated class.
136 |
137 | In addition, scikit-learn provides a mixin, i.e.
138 | :class:`sklearn.base.ClassifierMixin`, which implements the ``score`` method
139 | which computes the accuracy score of the predictions.
140 |
141 | One can import this mixin as::
142 |
143 | >>> from sklearn.base import ClassifierMixin
144 |
145 | Therefore, we create a classifier, :class:`MyOwnClassifier` which inherits
146 | from both :class:`sklearn.base.BaseEstimator` and
147 | :class:`sklearn.base.ClassifierMixin`. The method ``fit`` gets ``X`` and ``y``
148 | as input and should return ``self``. It should implement the ``predict``
149 | function which should output the class inferred by the classifier.
150 | ``predict_proba`` will output some probabilities instead::
151 |
152 | >>> class MyOwnClassifier(BaseEstimator, ClassifierMixin):
153 | ... def fit(self, X, y):
154 | ... self.classes_ = np.unique(y)
155 | ... return self
156 | ... def predict(self, X):
157 | ... return np.random.randint(0, self.classes_.size,
158 | ... size=X.shape[0])
159 | ... def predict_proba(self, X):
160 | ... pred = np.random.rand(X.shape[0], self.classes_.size)
161 | ... return pred / np.sum(pred, axis=1)[:, np.newaxis]
162 |
163 | We illustrate that this classifier is working within a scikit-learn pipeline::
164 |
165 | >>> X, y = load_iris(return_X_y=True)
166 | >>> pipe = make_pipeline(MyOwnTransformer(), MyOwnClassifier())
167 | >>> pipe.fit(X, y) # doctest: +ELLIPSIS
168 | Pipeline(...)
169 |
170 | Then, you can call ``predict`` and ``predict_proba``::
171 |
172 | >>> pipe.predict(X) # doctest: +ELLIPSIS
173 | array([...])
174 | >>> pipe.predict_proba(X) # doctest: +ELLIPSIS
175 | array([...])
176 |
177 | Since our classifier inherits from :class:`sklearn.base.ClassifierMixin`, we
178 | can compute the accuracy by calling the ``score`` method::
179 |
180 | >>> pipe.score(X, y) # doctest: +ELLIPSIS
181 | 0...
182 |
--------------------------------------------------------------------------------
/doc/vectorizers_logo_no_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TutteInstitute/vectorizers/6e60b98e6c91821fac892675004eda2931380c13/doc/vectorizers_logo_no_text.png
--------------------------------------------------------------------------------
/doc/vectorizers_logo_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TutteInstitute/vectorizers/6e60b98e6c91821fac892675004eda2931380c13/doc/vectorizers_logo_text.png
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: project-template
2 | dependencies:
3 | - numpy
4 | - scipy
5 | - scikit-learn
6 |
--------------------------------------------------------------------------------
/examples/README.txt:
--------------------------------------------------------------------------------
1 | .. _general_examples:
2 |
3 | General examples
4 | ================
5 |
6 | Introductory examples.
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | scikit-learn
4 | numba
5 | pandas
6 | dask
7 | pynndescent>=0.5
8 | pomegranate
9 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
3 |
4 | [aliases]
5 | test = pytest
6 |
7 | [tool:pytest]
8 | addopts = --doctest-modules
9 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | """A template for scikit-learn compatible packages."""
3 |
4 | import codecs
5 | import os
6 |
7 | from setuptools import find_packages, setup
8 |
9 | # get __version__ from _version.py
10 | ver_file = os.path.join('vectorizers', '_version.py')
11 | with open(ver_file) as f:
12 | exec(f.read())
13 |
14 | DISTNAME = 'vectorizers'
15 | DESCRIPTION = 'A suite of vectorizers for various data types.'
16 | with codecs.open('README.rst', encoding='utf-8-sig') as f:
17 | LONG_DESCRIPTION = f.read()
18 | MAINTAINER = 'John Healy, Leland McInnes, Colin Weir'
19 | MAINTAINER_EMAIL = 'leland.mcinnes@gmail.com'
20 | URL = 'https://github.com/TutteInstitute/vectorizers'
21 | LICENSE = 'new BSD'
22 | DOWNLOAD_URL = 'https://github.com/TutteInstitute/vectorizers'
23 | VERSION = __version__
24 | INSTALL_REQUIRES = ['numpy', 'pandas', 'scipy', 'scikit-learn', 'numba', 'pynndescent', 'dask']
25 | CLASSIFIERS = ['Intended Audience :: Science/Research',
26 | 'Intended Audience :: Developers',
27 | 'License :: OSI Approved',
28 | 'Programming Language :: Python',
29 | 'Topic :: Software Development',
30 | 'Topic :: Scientific/Engineering',
31 | 'Operating System :: Microsoft :: Windows',
32 | 'Operating System :: POSIX',
33 | 'Operating System :: Unix',
34 | 'Operating System :: MacOS',
35 | 'Programming Language :: Python :: 3.9',
36 | 'Programming Language :: Python :: 3.10',
37 | 'Programming Language :: Python :: 3.11',
38 | 'Programming Language :: Python :: 3.12']
39 | EXTRAS_REQUIRE = {
40 | 'tests': [
41 | 'pytest',
42 | 'pytest-cov'],
43 | 'docs': [
44 | 'sphinx',
45 | 'sphinx-gallery',
46 | 'nbsphinx',
47 | 'sphinx_rtd_theme',
48 | 'numpydoc',
49 | 'matplotlib'
50 | ]
51 | }
52 |
53 | setup(name=DISTNAME,
54 | maintainer=MAINTAINER,
55 | maintainer_email=MAINTAINER_EMAIL,
56 | description=DESCRIPTION,
57 | license=LICENSE,
58 | url=URL,
59 | version=VERSION,
60 | download_url=DOWNLOAD_URL,
61 | long_description=LONG_DESCRIPTION,
62 | zip_safe=False, # the package can run out of an .egg file
63 | classifiers=CLASSIFIERS,
64 | packages=find_packages(),
65 | install_requires=INSTALL_REQUIRES,
66 | extras_require=EXTRAS_REQUIRE)
67 |
--------------------------------------------------------------------------------
/vectorizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .token_cooccurrence_vectorizer import TokenCooccurrenceVectorizer
2 | from .timed_token_cooccurrence_vectorizer import TimedTokenCooccurrenceVectorizer
3 | from .ngram_token_cooccurence_vectorizer import NgramCooccurrenceVectorizer
4 | from .multi_token_cooccurence_vectorizer import MultiSetCooccurrenceVectorizer
5 | from ._vectorizers import DistributionVectorizer
6 | from ._vectorizers import HistogramVectorizer
7 | from .skip_gram_vectorizer import SkipgramVectorizer
8 | from .ngram_vectorizer import NgramVectorizer
9 | from .kde_vectorizer import KDEVectorizer
10 | from .tree_token_cooccurrence import LabelledTreeCooccurrenceVectorizer
11 | from .edge_list_vectorizer import EdgeListVectorizer
12 | from .linear_optimal_transport import (
13 | WassersteinVectorizer,
14 | SinkhornVectorizer,
15 | ApproximateWassersteinVectorizer,
16 | )
17 | from .mixed_gram_vectorizer import LZCompressionVectorizer, BytePairEncodingVectorizer
18 |
19 | from .signature_vectorizer import SignatureVectorizer
20 |
21 | from .utils import cast_tokens_to_strings
22 |
23 | from ._version import __version__
24 |
25 | __all__ = [
26 | "TokenCooccurrenceVectorizer",
27 | "TimedTokenCooccurrenceVectorizer",
28 | "NgramCooccurrenceVectorizer",
29 | "MultiSetCooccurrenceVectorizer",
30 | "DistributionVectorizer",
31 | "HistogramVectorizer",
32 | "SkipgramVectorizer",
33 | "NgramVectorizer",
34 | "KDEVectorizer",
35 | "LabelledTreeCooccurrenceVectorizer",
36 | "WassersteinVectorizer",
37 | "SinkhornVectorizer",
38 | "ApproximateWassersteinVectorizer",
39 | "EdgeListVectorizer",
40 | "SignatureVectorizer",
41 | "__version__",
42 | ]
43 |
--------------------------------------------------------------------------------
/vectorizers/_vectorizers.py:
--------------------------------------------------------------------------------
1 | from warnings import warn
2 |
3 | import numpy as np
4 | import numba
5 | from sklearn.base import BaseEstimator, TransformerMixin
6 | import pandas as pd
7 | from sklearn.utils.validation import (
8 | check_array,
9 | check_is_fitted,
10 | check_random_state,
11 | )
12 | from sklearn.mixture import GaussianMixture
13 | from sklearn.preprocessing import normalize
14 | from .utils import flatten, vectorize_diagram, pairwise_gaussian_ground_distance
15 | import vectorizers.distances as distances
16 |
17 |
18 | class DistributionVectorizer(BaseEstimator, TransformerMixin):
19 | def __init__(
20 | self,
21 | n_components=20,
22 | random_state=None,
23 | ):
24 | self.n_components = n_components
25 | self.random_state = random_state
26 |
27 | def _validate_params(self):
28 | if (
29 | not np.issubdtype(type(self.n_components), np.integer)
30 | or self.n_components < 2
31 | ):
32 | raise ValueError(
33 | "n_components must be and integer greater than or equal " "to 2."
34 | )
35 |
36 | def _validate_data(self, X):
37 | try:
38 | assert np.isscalar(X[0][0][0])
39 | except:
40 | raise ValueError("Input must be a collection of collections of points")
41 |
42 | try:
43 | dims = [np.array(x).shape[1] for x in X]
44 | except:
45 | raise ValueError(
46 | "Elements of each point collection must be of the same dimension."
47 | )
48 |
49 | if not hasattr(self, "data_dimension_"):
50 | self.data_dimension_ = np.mean(dims)
51 |
52 | if not (
53 | np.max(dims) == self.data_dimension_ or np.min(dims) == self.data_dimension_
54 | ):
55 | raise ValueError("Each point collection must be of equal dimension.")
56 |
57 | def fit(self, X, y=None, **fit_params):
58 | random_state = check_random_state(self.random_state)
59 | self._validate_params()
60 | self._validate_data(X)
61 |
62 | combined_data = np.vstack(X)
63 | combined_data = check_array(combined_data)
64 |
65 | self.mixture_model_ = GaussianMixture(
66 | n_components=self.n_components, random_state=random_state
67 | )
68 | self.mixture_model_.fit(combined_data)
69 | self.ground_distance_ = pairwise_gaussian_ground_distance(
70 | self.mixture_model_.means_,
71 | self.mixture_model_.covariances_,
72 | )
73 |         self.metric_ = distances.hellinger
74 |         return self
74 |
75 | def transform(self, X):
76 | check_is_fitted(self, ["mixture_model_", "ground_distance_"])
77 | self._validate_data(X)
78 | result = np.vstack(
79 | [vectorize_diagram(diagram, self.mixture_model_) for diagram in X]
80 | )
81 | return result
82 |
83 | def fit_transform(self, X, y=None, **fit_params):
84 | self.fit(X, y, **fit_params)
85 | return np.vstack(
86 | [vectorize_diagram(diagram, self.mixture_model_) for diagram in X]
87 | )
88 |
89 |
90 | def find_bin_boundaries(flat, n_bins):
91 | """
92 | Only uniform distribution is currently implemented.
93 | TODO: Implement Normal
94 | :param flat: an iterable.
95 | :param n_bins:
96 | :return:
97 | """
98 | flat.sort()
99 | flat_csum = np.cumsum(flat)
100 | bin_range = flat_csum[-1] / n_bins
101 | bin_indices = [0]
102 | for i in range(1, len(flat_csum)):
103 | if (flat_csum[i] >= bin_range * len(bin_indices)) & (
104 | flat[i] > flat[bin_indices[-1]]
105 | ):
106 | bin_indices.append(i)
107 | bin_values = np.array(flat, dtype=float)[bin_indices]
108 |
109 | if bin_values.shape[0] < n_bins:
110 | warn(
111 | f"Could not generate n_bins={n_bins} bins as there are not enough "
112 | f"distinct values. Please check your data."
113 | )
114 |
115 | return bin_values
116 |
117 |
118 | def expand_boundaries(my_interval_index, absolute_range):
119 | """
120 |     Expands the outer bounds of a pandas IntervalIndex to encompass the range specified by the 2-tuple absolute_range.
121 |
122 | Parameters
123 | ----------
124 | my_interval_index: pandas IntervalIndex object (right closed)
125 | absolute_range: 2-tuple.
126 | (min_value, max_value)
127 |
128 | Returns
129 | -------
130 | index: a pandas IntervalIndex
131 |         A pandas IntervalIndex with the boundaries potentially expanded to encompass the absolute range.
132 | """
133 | """
134 |     expands the outer bounds of a pandas IntervalIndex to encompass the range specified by the 2-tuple absolute_range
135 | :param my_interval_index:
136 | :param absolute_range: 2tuple
137 | :return: a pandas IntervalIndex
138 | """
139 | interval_list = my_interval_index.to_list()
140 | # Check if the left boundary needs expanding
141 | if interval_list[0].left > absolute_range[0]:
142 | interval_list[0] = pd.Interval(
143 | left=absolute_range[0], right=interval_list[0].right
144 | )
145 | # Check if the right boundary needs expanding
146 | last = len(interval_list) - 1
147 | if interval_list[last].right < absolute_range[1]:
148 | interval_list[last] = pd.Interval(
149 | left=interval_list[last].left, right=absolute_range[1]
150 | )
151 | return pd.IntervalIndex(interval_list)
152 |
153 |
154 | def add_outier_bins(my_interval_index, absolute_range):
155 | """
156 |     Appends extra bins to either side of our interval index if appropriate.
157 | That only occurs if the absolute_range is wider than the observed range in your training data.
158 | :param my_interval_index:
159 | :param absolute_range:
160 | :return:
161 | """
162 | interval_list = my_interval_index.to_list()
163 | # Check if the left boundary needs expanding
164 | if interval_list[0].left > absolute_range[0]:
165 | left_outlier = pd.Interval(left=absolute_range[0], right=interval_list[0].left)
166 | interval_list.insert(0, left_outlier)
167 |
168 | last = len(interval_list) - 1
169 | if interval_list[last].right < absolute_range[1]:
170 | right_outlier = pd.Interval(
171 | left=interval_list[last].right, right=absolute_range[1]
172 | )
173 | interval_list.append(right_outlier)
174 | return pd.IntervalIndex(interval_list)
175 |
176 |
177 | class HistogramVectorizer(BaseEstimator, TransformerMixin):
178 | """Convert a time series of binary events into a histogram of
179 | event occurrences over a time frame. If the data has explicit time stamps
180 |     it can be aggregated over hour of day, day of week, day of month, day of year,
181 |     week of year or month of year.
182 |
183 | Parameters
184 | ----------
185 |     n_components: int (default=20)
186 |         The number of bins to produce. Raises ValueError if n_components < 2.
187 |
188 | strategy: {‘uniform’, ‘quantile’, 'gmm'}, (default=’uniform’)
189 | The method to use for bin selection in the histogram. In general the
190 | quantile option, which will select variable width bins based on the
191 | distribution of the training data, is suggested, but uniformly spaced
192 | identically sized bins, or soft bins learned from a Gaussian mixture model
193 | are also available.
194 |
195 | ground_distance: {'euclidean'}
196 | The distance to induce between bins.
197 |
198 | absolute_range: (minimum_value_possible, maximum_value_possible) (default=(-np.inf, np.inf))
199 | By default values outside of training data range are included in the extremal bins.
200 | You can specify these values if you know something about your values (e.g. (0, np.inf) )
201 |
202 | append_outlier_bins: bool (default=False)
203 |         Whether to add extra bins to catch values outside of your training
204 |         data where appropriate. These bins will increase the total number of
205 |         components (to ``n_components + 2``) and will be the first bin (for
206 |         outlying small data) and the last bin (for outlying large data).
207 | """
208 |
209 | # TODO: time stamps, generic groupby
210 | def __init__(
211 | self,
212 | n_components=20,
213 | strategy="uniform",
214 | ground_distance="euclidean",
215 | absolute_range=(-np.inf, np.inf),
216 | append_outlier_bins=False,
217 | ):
218 | self.n_components = n_components
219 | self.strategy = strategy
220 | self.ground_distance = ground_distance # Not currently making use of this.
221 | self.absolute_range = absolute_range
222 | self.append_outlier_bins = append_outlier_bins
223 |
224 | def _validate_params(self):
225 | pass
226 |
227 | def fit(self, X, y=None, **fit_params):
228 | """
229 | Learns the histogram bins.
230 | Still need to check switch.
231 | :param X:
232 | :return:
233 | """
234 | flat = flatten(X)
235 | flat = list(
236 | filter(
237 | lambda n: n > self.absolute_range[0] and n < self.absolute_range[1],
238 | flat,
239 | )
240 | )
241 | if self.strategy == "uniform":
242 | self.bin_intervals_ = pd.interval_range(
243 | start=np.min(flat), end=np.max(flat), periods=self.n_components
244 | )
245 | if self.strategy == "quantile":
246 | self.bin_intervals_ = pd.IntervalIndex.from_breaks(
247 | find_bin_boundaries(flat, self.n_components)
248 | )
249 |         if self.append_outlier_bins:
250 | self.bin_intervals_ = add_outier_bins(
251 | self.bin_intervals_, self.absolute_range
252 | )
253 | else:
254 | self.bin_intervals_ = expand_boundaries(
255 | self.bin_intervals_, self.absolute_range
256 | )
257 | self.metric_ = distances.hellinger
258 | return self
259 |
260 | def _vector_transform(self, vector):
261 | """
262 | Applies the transform to a single row of the data.
263 | """
264 | return pd.cut(vector, self.bin_intervals_).value_counts()
265 |
266 | def transform(self, X):
267 | """
268 | Apply binning to a full data set returning an nparray.
269 | """
270 | check_is_fitted(self, ["bin_intervals_"])
271 | result = np.ndarray((len(X), len(self.bin_intervals_)))
272 | for i, seq in enumerate(X):
273 | result[i, :] = self._vector_transform(seq).values
274 | return result
275 |
276 |
277 | def temporal_cyclic_transform(datetime_series, periodicity=None):
278 | """
279 | TODO: VERY UNFINISHED
280 | Replaces all time resolutions above the resolution specified with a fixed value.
281 | This creates a cycle within a datetime series.
282 | Parameters
283 | ----------
284 | datetime_series: a pandas series of datetime objects
285 | periodicity: string ['year', 'month' , 'week', 'day', 'hour']
286 | What time period to create cycles.
287 |
288 | Returns
289 | -------
290 | cyclic_series: pandas series of datetime objects
291 |
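    For example, with ``periodicity="day"`` the year, month and day are all
    collapsed, so 2021-05-04 13:22 maps to 1970-01-01 13:22, giving a daily
    cycle over time of day.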
292 | """
293 | collapse_times = {}
294 | if periodicity in ["year", "month", "day", "hour"]:
295 | collapse_times["year"] = 1970
296 | if periodicity in ["month", "day", "hour"]:
297 | collapse_times["month"] = 1
298 | if periodicity in ["day", "hour"]:
299 | collapse_times["day"] = 1
300 | if periodicity in ["hour"]:
301 | collapse_times["hour"] = 0
302 | cyclic_series = datetime_series.apply(lambda x: x.replace(**collapse_times))
303 | elif periodicity == "week":
304 | raise NotImplementedError("we have not implemented week cycles yet")
305 | else:
306 | raise ValueError(
307 | f"Sorry resolution={periodicity} is not a valid option. "
308 | + f"Please select from ['year', 'month', 'week', 'day', 'hour']"
309 | )
310 | return cyclic_series
311 |
312 |
313 | class CyclicHistogramVectorizer(BaseEstimator, TransformerMixin):
314 | """"""
315 |
316 | def __init__(
317 | self,
318 | periodicity="week",
319 | resolution="day",
320 | ):
321 | self.periodicity = periodicity
322 | self.resolution = resolution
323 |
324 | def _validate_params(self):
325 | pass
326 |
327 | def fit(self, X, y=None, **fit_params):
328 | cyclic_data = temporal_cyclic_transform(
329 | pd.to_datetime(X), periodicity=self.periodicity
330 | )
331 | resampled = (
332 | pd.Series(index=cyclic_data, data=1).resample(self.resolution).count()
333 | )
334 | self.temporal_bins_ = resampled.index
335 | return self
336 |
337 |
338 | class ProductDistributionVectorizer(BaseEstimator, TransformerMixin):
339 | pass
340 |
--------------------------------------------------------------------------------
/vectorizers/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.2.2"
2 |
--------------------------------------------------------------------------------
/vectorizers/_window_kernels.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numba
3 |
4 | EPSILON = 1e-8
5 |
6 | # The window function
7 |
8 |
9 | @numba.njit(nogil=True)
10 | def window_at_index(token_sequence, window_size, ind, reverse=False):
11 | if reverse:
12 | return np.flipud(token_sequence[max(ind - window_size, 0) : ind])
13 | return token_sequence[ind + 1 : min(ind + window_size + 1, len(token_sequence))]
14 |
15 |
16 | # Window width functions
17 |
18 |
19 | @numba.njit(nogil=True)
20 | def variable_window_radii(
21 | window_size,
22 | token_frequency,
23 | mask_index=None,
24 | power=0.75,
25 | ):
26 | radii = np.power(token_frequency, power - 1)
27 | radii /= np.sum(radii * token_frequency)
28 | radii = np.append(radii, min(radii))
29 | if mask_index is not None:
30 | radii[mask_index] = 0.0
31 | result = radii * window_size
32 | result[(result > 0) * (result < 1)] = 1.0
33 | np.round(result, 0, result)
34 | return result.astype(np.int64)
35 |
36 |
37 | @numba.njit(nogil=True)
38 | def fixed_window_radii(window_size, token_frequency, mask_index=None):
39 | radii = np.repeat(window_size, len(token_frequency) + 1)
40 | if mask_index is not None:
41 | radii[mask_index] = 0.0
42 | return radii
43 |
44 |
45 | # Kernel functions
46 |
47 |
48 | @numba.njit(nogil=True)
49 | def flat_kernel(window, mask_index=None, normalize=False, offset=0):
50 | result = np.ones(len(window), dtype=np.float64)
51 | if mask_index is not None:
52 | result[window == mask_index] = 0.0
53 | result[0 : min(offset, len(result))] = 0
54 | if normalize:
55 | temp = result.sum()
56 | if temp > 0:
57 | result /= temp
58 | return result
59 |
60 |
61 | @numba.njit(nogil=True)
62 | def harmonic_kernel(window, mask_index=None, normalize=False, offset=0):
63 | result = 1.0 / np.arange(1, len(window) + 1)
64 | if mask_index is not None:
65 | result[window == mask_index] = 0.0
66 | result[0 : min(offset, len(result))] = 0
67 | if normalize:
68 | temp = result.sum()
69 | if temp > 0:
70 | result /= temp
71 | return result
72 |
73 |
74 | @numba.njit(nogil=True)
75 | def geometric_kernel(
76 | window,
77 | mask_index=None,
78 | normalize=False,
79 | offset=0,
80 | power=0.9,
81 | ):
82 | result = power ** np.arange(1, len(window) + 1)
83 |
84 | if mask_index is not None:
85 | result[window == mask_index] = 0.0
86 | result[0 : min(offset, len(result))] = 0
87 | if normalize:
88 | temp = result.sum()
89 | if temp > 0:
90 | result /= temp
91 | return result
92 |
93 |
94 | @numba.njit(nogil=True)
95 | def multi_flat_kernel(
96 | window,
97 | target_ind,
98 | mask_index=None,
99 | normalize=False,
100 | offset=0,
101 | ):
102 | result_len = 0
103 | for mset in window:
104 | result_len += mset.shape[0]
105 |
106 | ker = np.ones(len(window))
107 | kernel_result = np.zeros(result_len).astype(np.float64)
108 |
109 | ind = 0
110 | for i, mset in enumerate(window[offset:]):
111 | kernel_result[ind : ind + len(mset)] = np.repeat(ker[i], len(mset))
112 | if mask_index is not None:
113 | for w_i, token in enumerate(mset):
114 | if token == mask_index:
115 | kernel_result[ind + w_i] = 0
116 | ind += len(mset)
117 | kernel_result[target_ind] = 0
118 |
119 | if normalize:
120 | temp = kernel_result.sum()
121 | if temp > 0:
122 | kernel_result /= temp
123 |
124 | return kernel_result
125 |
126 |
127 | @numba.njit(nogil=True)
128 | def multi_geometric_kernel(
129 | window,
130 | target_ind,
131 | mask_index=None,
132 | normalize=False,
133 | offset=0,
134 | power=0.9,
135 | ):
136 | result_len = 0
137 | for mset in window:
138 | result_len += mset.shape[0]
139 |
140 | ker = power ** np.arange(len(window))
141 |
142 | kernel_result = np.zeros(result_len).astype(np.float64)
143 | ind = 0
144 | for i, mset in enumerate(window[offset:]):
145 | kernel_result[ind : ind + len(mset)] = np.repeat(ker[i], len(mset))
146 | if mask_index is not None:
147 | for w_i, token in enumerate(mset):
148 | if token == mask_index:
149 | kernel_result[ind + w_i] = 0
150 | ind += len(mset)
151 | kernel_result[target_ind] = 0
152 |
153 | if normalize:
154 | temp = kernel_result.sum()
155 | if temp > 0:
156 | kernel_result /= temp
157 |
158 | return kernel_result
159 |
160 |
161 | @numba.njit(nogil=True)
162 | def update_kernel(
163 | window,
164 | kernel,
165 | mask_index,
166 | normalize,
167 | ):
168 | result = kernel[: len(window)].astype(np.float64)
169 | if mask_index is not None:
170 | result[window == mask_index] = 0
171 | if normalize:
172 | temp = result.sum()
173 | if temp > 0:
174 | result /= temp
175 | return result
176 |
177 |
178 | @numba.njit(nogil=True)
179 | def timed_geometric_kernel(
180 | window,
181 | time_deltas,
182 | delta,
183 | mask_index,
184 | normalize,
185 | offset,
186 | power=0.9,
187 | ):
188 | result = power ** (time_deltas / delta)
189 | if mask_index is not None:
190 | result[window == mask_index] = 0
191 | result[0 : min(offset, len(result))] = 0
192 | if normalize:
193 | temp = result.sum()
194 | if temp > 0:
195 | result /= temp
196 | return result
197 |
198 |
199 | @numba.njit(nogil=True)
200 | def timed_flat_kernel(
201 | window,
202 | time_deltas,
203 | delta,
204 | mask_index,
205 | normalize,
206 | offset,
207 | ):
208 | result = np.ones(len(time_deltas), dtype=np.float64)
209 | if mask_index is not None:
210 | result[window == mask_index] = 0
211 | result[0 : min(offset, len(result))] = 0
212 | if normalize:
213 | temp = result.sum()
214 | if temp > 0:
215 | result /= temp
216 | return result
217 |
218 |
219 | # Parameter lists
220 |
221 | _WINDOW_FUNCTIONS = {
222 | "variable": variable_window_radii,
223 | "fixed": fixed_window_radii,
224 | }
225 |
226 | _KERNEL_FUNCTIONS = {
227 | "flat": flat_kernel,
228 | "harmonic": harmonic_kernel,
229 | "geometric": geometric_kernel,
230 | }
231 |
232 | _TIMED_KERNEL_FUNCTIONS = {
233 | "flat": timed_flat_kernel,
234 | "geometric": timed_geometric_kernel,
235 | }
236 |
237 | _MULTI_KERNEL_FUNCTIONS = {
238 | "flat": multi_flat_kernel,
239 | "geometric": multi_geometric_kernel,
240 | }
241 |
242 | ####################################################
243 | # Sliding window multivariate time series kernels
244 | ####################################################
245 |
246 |
247 | def averaging_kernel(n_cols, *kernel_params):
248 | return np.full(n_cols, 1.0 / n_cols)
249 |
250 |
251 | def difference_kernel(n_cols, start, step, stride, *kernel_params):
252 | n_differences = int(np.ceil((n_cols - start - step) // stride))
253 | result = np.zeros((n_differences, n_cols))
254 | for i in range(n_differences):
255 | result[i, start + i * stride] = -1
256 | result[i, start + i * stride + step] = 1
257 |
258 | return result
259 |
260 |
261 | def positon_velocity_kernel(n_cols, position_index, step, stride, *kernel_params):
262 | n_differences_before = int(np.ceil((position_index - step) // stride))
263 | n_differences_after = int(np.ceil((n_cols - position_index - step) // stride))
264 | n_differences = n_differences_before + n_differences_after
265 | result = np.zeros((n_differences + 1, n_cols))
266 | result[0, position_index] = 1
267 | for i in range(n_differences_before):
268 | result[i + 1, position_index - i * stride] = 1
269 | result[i + 1, position_index - i * stride - step] = -1
270 | for i in range(n_differences_after):
271 | result[i + n_differences_before + 1, position_index + i * stride] = -1
272 | result[i + n_differences_before + 1, position_index + i * stride + step] = 1
273 |
274 | return result
275 |
276 |
277 | def weight_kernel(n_cols, weights, *kernel_params):
278 | if weights.shape[0] != n_cols:
279 | raise ValueError(
280 | f"Cannot construct a weight kernel of size {n_cols} "
281 | f"with weights of shape {weights.shape[0]}"
282 | )
283 |
284 | return np.diag(weights)
285 |
286 |
287 | def gaussian_weight_kernel(n_cols, sigma, *kernel_params):
288 | width = n_cols / 2
289 | xs = np.linspace(-width, width, n_cols)
290 | weights = 1.0 / (sigma * 2 * np.pi) * np.exp(-((xs / sigma) ** 2) / 2.0)
291 | return np.diag(weights)
292 |
293 |
294 | _SLIDING_WINDOW_KERNELS = {
295 | "average": averaging_kernel,
296 | "differences": difference_kernel,
297 | "position_velocity": positon_velocity_kernel,
298 | "weight": weight_kernel,
299 | "gaussian_weight": gaussian_weight_kernel,
300 | }
301 |
302 | # Copied from the SciPy implementation
303 | @numba.njit()
304 | def binom(n, k):
305 | n = int(n)
306 | k = int(k)
307 |
308 | if k > n or n < 0 or k < 0:
309 | return 0
310 |
311 | m = n + 1
312 | nterms = min(k, n - k)
313 |
314 | numerator = 1
315 | denominator = 1
316 | for j in range(1, nterms + 1):
317 | numerator *= m - j
318 | denominator *= j
319 |
320 | return numerator // denominator
321 |
322 |
323 | # A couple of changepoint based kernels that can be useful. The goal
324 | # is to detect changepoints in sequences of count of time interval
325 | # data (where the intervals are between events).
326 | #
327 | # We can model count data with Poisson's and interval data as inter-arrival
328 | # times (which can can convert to count-like data by taking reciprocals.
329 | #
330 | # Essentially we start with a baseline prior given by a gamma distribution,
331 | # and then update the prior with the data in the window up to, but not
332 | # including, the last element. The return value is then the predictive
333 | # posterior (a negative binomial) of observing the final element of
334 | # the window.
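#
# Concretely (matching the implementation below): with a Gamma(alpha, beta)
# prior and window counts c_1, ..., c_n preceding the final observation c,
#
#     alpha' = alpha + sum(c_i),    beta' = beta + n
#
# and the predictive distribution for c is negative binomial with
# r = alpha' and p = 1 / (1 + beta'); each kernel returns -log P(c).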
335 |
336 |
337 | def count_changepoint_kernel(alpha=1.0, beta=1):
338 | @numba.njit()
339 | def _kernel(window):
340 | model_window = window[:-1]
341 | observation = window[-1]
342 | alpha_prime = alpha + model_window.sum()
343 | beta_prime = beta + len(model_window)
344 | nb_r = alpha_prime
345 | nb_p = 1.0 / (1.0 + beta_prime)
346 |
347 | prob = (
348 | binom(observation + nb_r - 1, observation)
349 | * (1 - nb_p) ** nb_r
350 | * nb_p ** observation
351 | )
352 |
353 | return np.array([-np.log(prob)])
354 |
355 | return _kernel
356 |
357 |
358 | def inter_arrival_changepoint_kernel(alpha=1.0, beta=1):
359 | @numba.njit()
360 | def _kernel(window):
361 | model_window = 1.0 / (window[:-1] + EPSILON)
362 | observation = 1.0 / (window[-1] + EPSILON)
363 | alpha_prime = alpha + model_window.sum()
364 | beta_prime = beta + len(model_window)
365 | nb_r = alpha_prime
366 | nb_p = 1.0 / (1.0 + beta_prime)
367 |
368 | prob = (
369 | binom(observation + nb_r - 1, observation)
370 | * (1 - nb_p) ** nb_r
371 | * nb_p ** observation
372 | )
373 |
374 | return np.array([-np.log(prob)])
375 |
376 | return _kernel
377 |
378 |
379 | _SLIDING_WINDOW_FUNCTION_KERNELS = {
380 | "count_changepoint": count_changepoint_kernel,
381 | "timespan_changepoint": inter_arrival_changepoint_kernel,
382 | }
383 |
--------------------------------------------------------------------------------
/vectorizers/coo_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numba
3 | from collections import namedtuple
4 |
5 | CooArray = namedtuple("CooArray", ["row", "col", "val", "key", "ind", "min", "depth"])
6 |
7 | COO_QUICKSORT_LIMIT = 1 << 16
8 | COO_MEM_MULTIPLIER = 1.5
9 |
10 |
11 | @numba.njit(nogil=True)
12 | def set_array_size(token_sequences, window_array):
13 | tot_len = np.zeros(window_array.shape[0]).astype(np.float64)
14 | window_array = window_array.astype(np.float64)
15 | for seq in token_sequences:
16 | counts = np.bincount(seq, minlength=window_array.shape[1]).astype(np.float64)
17 | tot_len += np.dot(
18 | window_array, counts
19 | ).T # NOTE: numba only does dot products with floats
20 | return tot_len.astype(np.int64)
21 |
22 |
23 | @numba.njit(nogil=True)
24 | def merge_sum_duplicates(coo):
25 | new_depth = True
26 | for i in range(coo.depth[0]):
27 | if coo.min[i] <= 0:
28 | coo.min[:i] = -coo.ind[0]
29 | coo.min[i] = coo.ind[0]
30 | new_depth = False
31 | break
32 | else:
33 | array_len = coo.ind[0] - np.abs(coo.min[i + 1]) + 1
34 | result_row = np.zeros(array_len)
35 | result_col = np.zeros(array_len)
36 | result_val = np.zeros(array_len)
37 | result_key = np.zeros(array_len)
38 | ptr1 = np.abs(coo.min[i + 1])
39 | ptr2 = coo.min[i]
40 | result_ptr = 0
41 | result_key[0] = -1
42 |
43 | while ptr1 < coo.min[i] and ptr2 < coo.ind[0]:
44 | if coo.key[ptr1] <= coo.key[ptr2]:
45 | this_ptr = ptr1
46 | ptr1 += 1
47 | else:
48 | this_ptr = ptr2
49 | ptr2 += 1
50 |
51 | if coo.key[this_ptr] == result_key[result_ptr]:
52 | result_val[result_ptr] += coo.val[this_ptr]
53 | else:
54 | result_ptr += 1
55 | result_val[result_ptr] = coo.val[this_ptr]
56 | result_row[result_ptr] = coo.row[this_ptr]
57 | result_col[result_ptr] = coo.col[this_ptr]
58 | result_key[result_ptr] = coo.key[this_ptr]
59 |
60 | if ptr1 >= coo.min[i]:
61 | while ptr2 < coo.ind[0]:
62 | this_ptr = ptr2
63 | ptr2 += 1
64 |
65 | if coo.key[this_ptr] == result_key[result_ptr]:
66 | result_val[result_ptr] += coo.val[this_ptr]
67 | else:
68 | result_ptr += 1
69 | result_val[result_ptr] = coo.val[this_ptr]
70 | result_row[result_ptr] = coo.row[this_ptr]
71 | result_col[result_ptr] = coo.col[this_ptr]
72 | result_key[result_ptr] = coo.key[this_ptr]
73 | else:
74 | while ptr1 < coo.min[i]:
75 | this_ptr = ptr1
76 | ptr1 += 1
77 |
78 | if coo.key[this_ptr] == result_key[result_ptr]:
79 | result_val[result_ptr] += coo.val[this_ptr]
80 | else:
81 | result_ptr += 1
82 | result_val[result_ptr] = coo.val[this_ptr]
83 | result_row[result_ptr] = coo.row[this_ptr]
84 | result_col[result_ptr] = coo.col[this_ptr]
85 | result_key[result_ptr] = coo.key[this_ptr]
86 |
87 | coo.row[np.abs(coo.min[i + 1]) : coo.ind[0]] = result_row[1:]
88 | coo.col[np.abs(coo.min[i + 1]) : coo.ind[0]] = result_col[1:]
89 | coo.val[np.abs(coo.min[i + 1]) : coo.ind[0]] = result_val[1:]
90 | coo.key[np.abs(coo.min[i + 1]) : coo.ind[0]] = result_key[1:]
91 | coo.ind[0] = np.abs(coo.min[i + 1]) + result_ptr
92 |
93 | if new_depth:
94 | coo.min[: coo.depth[0]] = -coo.ind[0]
95 | coo.min[coo.depth[0]] = coo.ind[0]
96 | coo.depth[0] += 1
97 |
98 |
99 | @numba.njit(nogil=True)
100 | def merge_all_sum_duplicates(coo):
101 | new_min = np.zeros(coo.depth[0])
102 | ptr = 0
103 | for i in range(coo.depth[0]):
104 | if coo.min[i] > 0:
105 | new_min[ptr] = coo.min[i]
106 | ptr += 1
107 | coo.min[: coo.depth[0]] = new_min
108 | merge_sum_duplicates(coo)
109 |
110 |
111 | @numba.njit(nogil=True)
112 | def coo_sum_duplicates(coo):
113 | upper_lim = coo.ind[0]
114 | lower_lim = np.abs(coo.min[0])
115 |
116 | perm = np.argsort(coo.key[lower_lim:upper_lim])
117 |
118 | coo.row[lower_lim:upper_lim] = coo.row[lower_lim:upper_lim][perm]
119 | coo.col[lower_lim:upper_lim] = coo.col[lower_lim:upper_lim][perm]
120 | coo.val[lower_lim:upper_lim] = coo.val[lower_lim:upper_lim][perm]
121 | coo.key[lower_lim:upper_lim] = coo.key[lower_lim:upper_lim][perm]
122 |
123 | sum_ind = lower_lim
124 | this_row = coo.row[lower_lim]
125 | this_col = coo.col[lower_lim]
126 | this_val = np.float32(0)
127 | this_key = coo.key[lower_lim]
128 |
129 | for i in range(lower_lim, upper_lim):
130 | if coo.key[i] == this_key:
131 | this_val += coo.val[i]
132 | else:
133 | coo.row[sum_ind] = this_row
134 | coo.col[sum_ind] = this_col
135 | coo.val[sum_ind] = this_val
136 | coo.key[sum_ind] = this_key
137 | this_row = coo.row[i]
138 | this_col = coo.col[i]
139 | this_val = coo.val[i]
140 | this_key = coo.key[i]
141 | sum_ind += 1
142 |
143 | if this_key != coo.key[upper_lim]:
144 | coo.row[sum_ind] = this_row
145 | coo.col[sum_ind] = this_col
146 | coo.val[sum_ind] = this_val
147 | coo.key[sum_ind] = this_key
148 | sum_ind += 1
149 |
150 | coo.ind[0] = sum_ind
151 | merge_sum_duplicates(coo)
152 |
153 |
154 | @numba.njit(nogil=True)
155 | def coo_increase_mem(coo):
156 |
157 | temp = coo.row
158 | new_size = np.int32(max(np.round(COO_MEM_MULTIPLIER * temp.shape[0]), COO_QUICKSORT_LIMIT+1))
159 | new_row = np.zeros(new_size, dtype=np.int32)
160 | new_row[: temp.shape[0]] = temp
161 |
162 | temp = coo.col
163 | new_size = np.int32(max(np.round(COO_MEM_MULTIPLIER * temp.shape[0]), COO_QUICKSORT_LIMIT+1))
164 | new_col = np.zeros(new_size, dtype=np.int32)
165 | new_col[: temp.shape[0]] = temp
166 |
167 | temp = coo.val
168 | new_size = np.int32(max(np.round(COO_MEM_MULTIPLIER * temp.shape[0]), COO_QUICKSORT_LIMIT+1))
169 | new_val = np.zeros(new_size, dtype=np.float32)
170 | new_val[: temp.shape[0]] = temp
171 |
172 | temp = coo.key
173 | new_size = np.int32(max(np.round(COO_MEM_MULTIPLIER * temp.shape[0]), COO_QUICKSORT_LIMIT+1))
174 | new_key = np.zeros(new_size, dtype=np.int64)
175 | new_key[: temp.shape[0]] = temp
176 |
177 | temp = coo.min
178 | new_size = np.int32(np.round(COO_MEM_MULTIPLIER * (temp.shape[0]+2)))
179 | new_min = np.zeros(new_size, dtype=np.int64)
180 | new_min[: temp.shape[0]] = temp
181 |
182 | coo = CooArray(
183 | new_row,
184 | new_col,
185 | new_val,
186 | new_key,
187 | coo.ind,
188 | new_min,
189 | coo.depth,
190 | )
191 |
192 | return coo
193 |
194 |
195 | @numba.njit(nogil=True)
196 | def coo_append(coo, tup):
197 | coo.row[coo.ind[0]] = tup[0]
198 | coo.col[coo.ind[0]] = tup[1]
199 | coo.val[coo.ind[0]] = tup[2]
200 | coo.key[coo.ind[0]] = tup[3]
201 | coo.ind[0] += 1
202 |
203 | if (coo.ind[0] - np.abs(coo.min[0])) >= COO_QUICKSORT_LIMIT:
204 | coo_sum_duplicates(coo)
205 | if (coo.key.shape[0] - np.abs(coo.min[0])) <= COO_QUICKSORT_LIMIT:
206 | merge_all_sum_duplicates(coo)
207 | if coo.ind[0] >= 0.95 * coo.key.shape[0]:
208 | coo = coo_increase_mem(coo)
209 |
210 | if coo.ind[0] == coo.key.shape[0] - 1:
211 | coo_sum_duplicates(coo)
212 | if (coo.key.shape[0] - np.abs(coo.min[0])) <= COO_QUICKSORT_LIMIT:
213 | merge_all_sum_duplicates(coo)
214 | if coo.ind[0] >= 0.95 * coo.key.shape[0]:
215 | coo = coo_increase_mem(coo)
216 |
217 | return coo
218 |
219 |
220 | @numba.njit(nogil=True)
221 | def sum_coo_entries(seq):
222 | seq.sort()
223 | this_coord = (seq[0][0], seq[0][1])
224 | this_sum = 0
225 | reduced_data = []
226 | for entry in seq:
227 | if (entry[0], entry[1]) == this_coord:
228 | this_sum += entry[2]
229 | else:
230 | reduced_data.append((this_coord[0], this_coord[1], this_sum))
231 | this_sum = entry[2]
232 | this_coord = (entry[0], entry[1])
233 |
234 | reduced_data.append((this_coord[0], this_coord[1], this_sum))
235 |
236 | return reduced_data
237 |
238 | @numba.njit(nogil=True)
239 | def em_update_matrix(
240 | posterior_data,
241 | prior_indices,
242 | prior_indptr,
243 | prior_data,
244 | n_unique_tokens,
245 | target_gram_ind,
246 | windows,
247 | kernels,
248 | ):
249 | """
250 |     Update the csr matrix with one round of EM on the given (hstack of) n
251 | cooccurrence matrices provided in csr format.
252 |
253 | Parameters
254 | ----------
255 | posterior_data: numpy.array
256 | The csr data of the hstacked cooccurrence matrix to be updated
257 |
258 | prior_indices: numpy.array
259 | The csr indices of the hstacked cooccurrence matrix
260 |
261 | prior_indptr: numpy.array
262 | The csr indptr of the hstacked cooccurrence matrix
263 |
264 | prior_data: numpy.array
265 | The csr data of the hstacked cooccurrence matrix
266 |
267 | n_unique_tokens: int
268 | The number of unique tokens
269 |
270 | target_gram_ind: int
271 | The index of the target ngram to update
272 |
273 | windows: List of List of int
274 | The indices of the tokens in the windows
275 |
276 | kernels: List of List of floats
277 | The kernel values of the entries in the windows.
278 |
279 | Returns
280 | -------
281 | posterior_data: numpy.array
282 | The data of the updated csr matrix after an update of EM.
283 | """
284 | total_win_length = np.sum(np.array([len(w) for w in windows]))
285 | window_posterior = np.zeros(total_win_length)
286 | context_ind = np.zeros(total_win_length, dtype=np.int64)
287 | win_offset = np.append(
288 | np.zeros(1, dtype=np.int64),
289 | np.cumsum(np.array([len(w) for w in windows])),
290 | )[:-1]
291 |
292 | col_ind = prior_indices[
293 | prior_indptr[target_gram_ind] : prior_indptr[target_gram_ind + 1]
294 | ]
295 |
296 | for w, window in enumerate(windows):
297 | for i, context in enumerate(window):
298 | if kernels[w][i] > 0:
299 | context_ind[i + win_offset[w]] = np.searchsorted(
300 | col_ind, context + w * n_unique_tokens
301 | )
302 | # assert(col_ind[context_ind[i + win_offset[w]]] == context+w * n_unique_tokens)
303 | if (
304 | col_ind[context_ind[i + win_offset[w]]]
305 | == context + w * n_unique_tokens
306 | ):
307 | window_posterior[i + win_offset[w]] = (
308 | kernels[w][i]
309 | * prior_data[
310 | prior_indptr[target_gram_ind]
311 | + context_ind[i + win_offset[w]]
312 | ]
313 | )
314 | else:
315 | window_posterior[i + win_offset[w]] = 0
316 |
317 | temp = window_posterior.sum()
318 | if temp > 0:
319 | window_posterior /= temp
320 |
321 | # Partial M_step - Update the posteriors
322 | for w, window in enumerate(windows):
323 | for i, context in enumerate(window):
324 | val = window_posterior[i + win_offset[w]]
325 | if val > 0:
326 | posterior_data[
327 | prior_indptr[target_gram_ind] + context_ind[i + win_offset[w]]
328 | ] += val
329 |
330 | return posterior_data
331 |
--------------------------------------------------------------------------------
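
The CooArray routines above stream (row, col, val, key) entries into fixed-size buffers, periodically sorting a chunk by key and merging runs with duplicate keys so the buffers stay compact. The net effect is the same as building a COO matrix and summing duplicate coordinates; a minimal, self-contained illustration of that end result using scipy (not a call into this module) is:

    import numpy as np
    import scipy.sparse

    # Duplicate (row, col) coordinates, as produced by streaming cooccurrence counts.
    rows = np.array([0, 0, 1, 0], dtype=np.int32)
    cols = np.array([2, 2, 1, 2], dtype=np.int32)
    vals = np.array([1.0, 2.0, 1.0, 3.0], dtype=np.float32)

    m = scipy.sparse.coo_matrix((vals, (rows, cols)), shape=(2, 3))
    m.sum_duplicates()  # the three (0, 2) entries collapse into a single entry of 6.0
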
/vectorizers/distances.py:
--------------------------------------------------------------------------------
1 | import numba
2 | import numpy as np
3 |
4 | EPS = 1e-11
5 |
6 |
7 | @numba.njit()
8 | def hellinger(x, y):
9 | result = 0.0
10 | l1_norm_x = 0.0
11 | l1_norm_y = 0.0
12 | dim = x.shape[0]
13 |
14 | for i in range(dim):
15 | result += np.sqrt(x[i] * y[i])
16 | l1_norm_x += x[i]
17 | l1_norm_y += y[i]
18 |
19 | if l1_norm_x == 0 and l1_norm_y == 0:
20 | return 0.0
21 | elif l1_norm_x == 0 or l1_norm_y == 0:
22 | return 1.0
23 | else:
24 | return np.sqrt(1 - result / np.sqrt(l1_norm_x * l1_norm_y))
25 |
26 |
27 | @numba.njit()
28 | def kantorovich1d(x, y, p=1):
29 |
30 | # Normalize and do a cumulative sum trick
31 |
32 | x_sum = 0.0
33 | y_sum = 0.0
34 | for i in range(x.shape[0]):
35 | x_sum += x[i]
36 | y_sum += y[i]
37 |
38 | x_cdf = x / x_sum
39 | y_cdf = y / y_sum
40 |
41 | for i in range(1, x_cdf.shape[0]):
42 | x_cdf[i] += x_cdf[i - 1]
43 | y_cdf[i] += y_cdf[i - 1]
44 |
45 | # Now we just want minkowski distance on the CDFs
46 | result = 0.0
47 | if p > 2:
48 | for i in range(x_cdf.shape[0]):
49 | result += np.abs(x_cdf[i] - y_cdf[i]) ** p
50 |
51 | return result ** (1.0 / p)
52 |
53 | elif p == 2:
54 | for i in range(x_cdf.shape[0]):
55 | val = x_cdf[i] - y_cdf[i]
56 | result += val * val
57 |
58 | return np.sqrt(result)
59 |
60 | elif p == 1:
61 | for i in range(x_cdf.shape[0]):
62 | result += np.abs(x_cdf[i] - y_cdf[i])
63 |
64 | return result
65 |
66 | else:
67 |         raise ValueError("Invalid p supplied to Kantorovich distance")
68 |
69 |
70 | @numba.njit()
71 | def circular_kantorovich(x, y, p=1):
72 |
73 | x_sum = 0.0
74 | y_sum = 0.0
75 | for i in range(x.shape[0]):
76 | x_sum += x[i]
77 | y_sum += y[i]
78 |
79 | x_cdf = x / x_sum
80 | y_cdf = y / y_sum
81 |
82 | for i in range(1, x_cdf.shape[0]):
83 | x_cdf[i] += x_cdf[i - 1]
84 | y_cdf[i] += y_cdf[i - 1]
85 |
86 | mu = np.median((x_cdf - y_cdf) ** p)
87 |
88 | # Now we just want minkowski distance on the CDFs shifted by mu
89 | result = 0.0
90 | if p > 2:
91 | for i in range(x_cdf.shape[0]):
92 | result += np.abs(x_cdf[i] - y_cdf[i] - mu) ** p
93 |
94 | return result ** (1.0 / p)
95 |
96 | elif p == 2:
97 | for i in range(x_cdf.shape[0]):
98 | val = x_cdf[i] - y_cdf[i] - mu
99 | result += val * val
100 |
101 | return np.sqrt(result)
102 |
103 | elif p == 1:
104 | for i in range(x_cdf.shape[0]):
105 | result += np.abs(x_cdf[i] - y_cdf[i] - mu)
106 |
107 | return result
108 |
109 | else:
110 |         raise ValueError("Invalid p supplied to Kantorovich distance")
111 |
112 |
113 | @numba.njit()
114 | def total_variation(x, y):
115 | x_sum = 0.0
116 | y_sum = 0.0
117 | result = 0.0
118 |
119 | for i in range(x.shape[0]):
120 | x_sum += x[i]
121 | y_sum += y[i]
122 |
123 | x_pdf = x / x_sum
124 | y_pdf = y / y_sum
125 |
126 | for i in range(x.shape[0]):
127 | result += 0.5 * np.abs(x_pdf[i] - y_pdf[i])
128 |
129 | return result
130 |
131 |
132 | @numba.njit()
133 | def jensen_shannon_divergence(x, y):
134 | result = 0.0
135 | l1_norm_x = 0.0
136 | l1_norm_y = 0.0
137 | dim = x.shape[0]
138 |
139 | for i in range(dim):
140 | l1_norm_x += x[i]
141 | l1_norm_y += y[i]
142 |
143 | l1_norm_x += EPS * dim
144 | l1_norm_y += EPS * dim
145 |
146 | pdf_x = (x + EPS) / l1_norm_x
147 | pdf_y = (y + EPS) / l1_norm_y
148 | m = 0.5 * (pdf_x + pdf_y)
149 |
150 | for i in range(dim):
151 | result += 0.5 * (
152 | pdf_x[i] * np.log(pdf_x[i] / m[i]) + pdf_y[i] * np.log(pdf_y[i] / m[i])
153 | )
154 | return result
155 |
156 |
157 | @numba.njit()
158 | def symmetric_kl_divergence(x, y):
159 | result = 0.0
160 | l1_norm_x = 0.0
161 | l1_norm_y = 0.0
162 | dim = x.shape[0]
163 |
164 | for i in range(dim):
165 | l1_norm_x += x[i]
166 | l1_norm_y += y[i]
167 |
168 | l1_norm_x += EPS * dim
169 | l1_norm_y += EPS * dim
170 |
171 | pdf_x = (x + EPS) / l1_norm_x
172 | pdf_y = (y + EPS) / l1_norm_y
173 |
174 | for i in range(dim):
175 | result += pdf_x[i] * np.log(pdf_x[i] / pdf_y[i]) + pdf_y[i] * np.log(
176 | pdf_y[i] / pdf_x[i]
177 | )
178 |
179 | return result
180 |
181 |
182 | #
183 | # --- Sparse support functions
184 | #
185 |
186 |
187 | # Reproduce a simpler version of numpy's unique (not yet supported by numba)
188 | @numba.njit()
189 | def arr_unique(arr):
190 | aux = np.sort(arr)
191 | flag = np.concatenate((np.ones(1, dtype=np.bool_), aux[1:] != aux[:-1]))
192 | return aux[flag]
193 |
194 |
195 | # Reproduce a simpler version of numpy's union1d (not yet supported by numba)
196 | @numba.njit()
197 | def arr_union(ar1, ar2):
198 | if ar1.shape[0] == 0:
199 | return ar2
200 | elif ar2.shape[0] == 0:
201 | return ar1
202 | else:
203 | return arr_unique(np.concatenate((ar1, ar2)))
204 |
205 |
206 | @numba.njit()
207 | def arr_intersect(ar1, ar2):
208 | aux = np.concatenate((ar1, ar2))
209 | aux.sort()
210 | return aux[:-1][aux[1:] == aux[:-1]]
211 |
212 |
213 | @numba.njit()
214 | def sparse_sum(ind1, data1, ind2, data2):
215 | result_ind = arr_union(ind1, ind2)
216 | result_data = np.zeros(result_ind.shape[0], dtype=np.float32)
217 |
218 | i1 = 0
219 | i2 = 0
220 | nnz = 0
221 |
222 | # pass through both index lists
223 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]:
224 | j1 = ind1[i1]
225 | j2 = ind2[i2]
226 |
227 | if j1 == j2:
228 | val = data1[i1] + data2[i2]
229 | if val != 0:
230 | result_ind[nnz] = j1
231 | result_data[nnz] = val
232 | nnz += 1
233 | i1 += 1
234 | i2 += 1
235 | elif j1 < j2:
236 | val = data1[i1]
237 | if val != 0:
238 | result_ind[nnz] = j1
239 | result_data[nnz] = val
240 | nnz += 1
241 | i1 += 1
242 | else:
243 | val = data2[i2]
244 | if val != 0:
245 | result_ind[nnz] = j2
246 | result_data[nnz] = val
247 | nnz += 1
248 | i2 += 1
249 |
250 | # pass over the tails
251 | while i1 < ind1.shape[0]:
252 | val = data1[i1]
253 | if val != 0:
254 |             result_ind[nnz] = ind1[i1]
255 | result_data[nnz] = val
256 | nnz += 1
257 | i1 += 1
258 |
259 | while i2 < ind2.shape[0]:
260 | val = data2[i2]
261 | if val != 0:
262 |             result_ind[nnz] = ind2[i2]
263 | result_data[nnz] = val
264 | nnz += 1
265 | i2 += 1
266 |
267 | # truncate to the correct length in case there were zeros created
268 | result_ind = result_ind[:nnz]
269 | result_data = result_data[:nnz]
270 |
271 | return result_ind, result_data
272 |
273 |
274 | @numba.njit()
275 | def sparse_diff(ind1, data1, ind2, data2):
276 | return sparse_sum(ind1, data1, ind2, -data2)
277 |
278 |
279 | @numba.njit()
280 | def sparse_mul(ind1, data1, ind2, data2):
281 | result_ind = arr_intersect(ind1, ind2)
282 | result_data = np.zeros(result_ind.shape[0], dtype=np.float32)
283 |
284 | i1 = 0
285 | i2 = 0
286 | nnz = 0
287 |
288 | # pass through both index lists
289 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]:
290 | j1 = ind1[i1]
291 | j2 = ind2[i2]
292 |
293 | if j1 == j2:
294 | val = data1[i1] * data2[i2]
295 | if val != 0:
296 | result_ind[nnz] = j1
297 | result_data[nnz] = val
298 | nnz += 1
299 | i1 += 1
300 | i2 += 1
301 | elif j1 < j2:
302 | i1 += 1
303 | else:
304 | i2 += 1
305 |
306 | # truncate to the correct length in case there were zeros created
307 | result_ind = result_ind[:nnz]
308 | result_data = result_data[:nnz]
309 |
310 | return result_ind, result_data
311 |
312 |
313 | # Return dense vectors supported on the union of the non-zero valued indices
314 | @numba.njit()
315 | def dense_union(ind1, data1, ind2, data2):
316 | result_ind = arr_union(ind1, ind2)
317 | result_data1 = np.zeros(result_ind.shape[0], dtype=np.float32)
318 | result_data2 = np.zeros(result_ind.shape[0], dtype=np.float32)
319 |
320 | i1 = 0
321 | i2 = 0
322 | nnz = 0
323 |
324 | # pass through both index lists
325 | while i1 < ind1.shape[0] and i2 < ind2.shape[0]:
326 | j1 = ind1[i1]
327 | j2 = ind2[i2]
328 |
329 | if j1 == j2:
330 | val = data1[i1] + data2[i2]
331 | if val != 0:
332 | result_data1[nnz] = data1[i1]
333 | result_data2[nnz] = data2[i2]
334 | nnz += 1
335 | i1 += 1
336 | i2 += 1
337 | elif j1 < j2:
338 | val = data1[i1]
339 | if val != 0:
340 | result_data1[nnz] = data1[i1]
341 | nnz += 1
342 | i1 += 1
343 | else:
344 | val = data2[i2]
345 | if val != 0:
346 | result_data2[nnz] = data2[i2]
347 | nnz += 1
348 | i2 += 1
349 |
350 | # pass over the tails
351 | while i1 < ind1.shape[0]:
352 | val = data1[i1]
353 | if val != 0:
354 | result_data1[nnz] = data1[i1]
355 | nnz += 1
356 | i1 += 1
357 |
358 | while i2 < ind2.shape[0]:
359 | val = data2[i2]
360 | if val != 0:
361 | result_data2[nnz] = data2[i2]
362 | nnz += 1
363 | i2 += 1
364 |
365 | # truncate to the correct length in case there were zeros
366 | result_data1 = result_data1[:nnz]
367 | result_data2 = result_data2[:nnz]
368 |
369 | return result_data1, result_data2
370 |
371 |
372 | #
373 | # --- Sparse distance functions
374 | #
375 |
376 |
377 | @numba.njit()
378 | def sparse_hellinger(ind1, data1, ind2, data2):
379 | aux_inds, aux_data = sparse_mul(ind1, data1, ind2, data2)
380 | result = 0.0
381 | norm1 = np.sum(data1)
382 | norm2 = np.sum(data2)
383 | sqrt_norm_prod = np.sqrt(norm1 * norm2)
384 |
385 | for i in range(aux_data.shape[0]):
386 | result += np.sqrt(aux_data[i])
387 |
388 | if norm1 == 0.0 and norm2 == 0.0:
389 | return 0.0
390 | elif norm1 == 0.0 or norm2 == 0.0:
391 | return 1.0
392 | elif result > sqrt_norm_prod:
393 | return 0.0
394 | else:
395 | return np.sqrt(1.0 - (result / sqrt_norm_prod))
396 |
397 |
398 | @numba.njit()
399 | def sparse_total_variation(ind1, data1, ind2, data2):
400 | norm1 = np.sum(data1)
401 | norm2 = np.sum(data2)
402 | aux_inds, aux_data = sparse_diff(ind1, data1 / norm1, ind2, data2 / norm2)
403 | result = 0.0
404 | for i in range(aux_data.shape[0]):
405 | result += 0.5 * np.abs(aux_data[i])
406 | return result
407 |
408 |
409 | # Because of the EPS values and the need to normalize after adding them (and then average those for jensen_shannon)
410 | # it seems like we might as well just take the dense union (dense vectors supported on the union of indices)
411 | # and call the dense distance functions
412 |
413 |
414 | @numba.njit()
415 | def sparse_jensen_shannon_divergence(ind1, data1, ind2, data2):
416 | dense_data1, dense_data2 = dense_union(ind1, data1, ind2, data2)
417 | return jensen_shannon_divergence(dense_data1, dense_data2)
418 |
419 |
420 | @numba.njit()
421 | def sparse_symmetric_kl_divergence(ind1, data1, ind2, data2):
422 | dense_data1, dense_data2 = dense_union(ind1, data1, ind2, data2)
423 | return symmetric_kl_divergence(dense_data1, dense_data2)
424 |
--------------------------------------------------------------------------------
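
A minimal usage sketch for the distance functions above (the values here match the unit tests in vectorizers/tests/test_distances.py): the dense functions take raw count or probability vectors, while the sparse variants take an (indices, data) pair for each vector, with indices sorted ascending.

    import numpy as np
    from vectorizers.distances import hellinger, total_variation, sparse_hellinger

    p = np.array([0.5, 0.5])
    q = np.array([1.0, 0.0])

    hellinger(p, q)        # ~0.5412
    total_variation(p, q)  # 0.5

    # Sparse variants: (indices, data) for each vector.
    sparse_hellinger(np.array([7, 12]), p, np.array([7, 12]), q)  # ~0.5412
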
/vectorizers/distribution_vectorizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pomegranate as pm
4 |
5 | from sklearn.base import BaseEstimator, TransformerMixin
6 | from sklearn.utils.validation import (
7 | check_array,
8 | check_is_fitted,
9 | check_random_state,
10 | )
11 | from pandas.api.types import is_datetime64_any_dtype as is_datetime
12 |
13 | def distribution_type_from_series(series): # pragma: no cover
14 |     if series.dtype in (np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64):
15 | if series.min() >= 0:
16 | if series.max() == 1:
17 | return pm.BernoulliDistribution
18 | else:
19 | return pm.PoissonDistribution
20 | elif series.unique().shape[0] <= 50:
21 | return pm.DiscreteDistribution
22 | else:
23 | return pm.NormalDistribution
24 |
25 |     elif series.dtype in (np.float16, np.float32, np.float64):
26 | if series.min() >= 0:
27 | if series.max() <= 1:
28 | return pm.BetaDistribution
29 | else:
30 | return pm.GammaDistribution
31 | else:
32 | return pm.NormalDistribution
33 |
34 |     elif isinstance(series.dtype, pd.CategoricalDtype):
35 | return pm.DiscreteDistribution
36 |
37 | else:
38 | raise ValueError(f"Failed to handle series {series}")
39 |
40 |
41 | def preprocess_dataframe(df, time_granularity="1s"): # pragma: no cover
42 | for feature in df:
43 |         if df[feature].dtype == object:
44 |             df[feature] = pd.Categorical(df[feature])
45 |         elif is_datetime(df[feature]):
46 |             df[feature] = (df[feature] - df[feature].min()) / pd.Timedelta(time_granularity)
47 |
48 |     return df
49 |
50 | class DataframeDistributionVectorizer(BaseEstimator, TransformerMixin): # pragma: no cover
51 |
52 | def __init__(self, n_components=100):
53 | self.n_components = n_components
54 |
55 | def fit(self, X, y=None, **fit_params):
56 | if type(X) == pd.DataFrame:
57 | X = preprocess_dataframe(X.copy())
58 | column_models = [
59 | distribution_type_from_series(X[feature])
60 | for feature in X
61 | ]
62 | elif type(X) == np.ndarray:
63 | column_models = [
64 | distribution_type_from_series(X[:, i])
65 | for i in range(X.shape[1])
66 | ]
67 | else:
68 | raise ValueError(f"Input type {type(X)} is not currently supported")
69 |
70 | self.mixture_model_ = pm.GeneralMixtureModel.from_samples(
71 | column_models, n_components=self.n_components, X=X
72 | )
73 |         return self
74 | def transform(self, X):
75 | check_is_fitted(self, ["mixture_model_"])
76 |
77 | if type(X) == pd.DataFrame:
78 | X = preprocess_dataframe(X.copy())
79 |
80 | return self.mixture_model_.predict_proba(X)
81 |
--------------------------------------------------------------------------------
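
A rough usage sketch for the experimental, untested vectorizer above. It assumes a legacy pomegranate release that still provides the distribution classes and GeneralMixtureModel.from_samples used in this file; treat it as a statement of intent rather than a guaranteed working recipe.

    import numpy as np
    import pandas as pd
    from vectorizers.distribution_vectorizer import DataframeDistributionVectorizer

    df = pd.DataFrame({
        "count": np.random.poisson(3, size=200),  # non-negative ints -> Poisson
        "score": np.random.random(size=200),      # floats in [0, 1] -> Beta
    })

    vec = DataframeDistributionVectorizer(n_components=5)
    vec.fit(df)
    proba = vec.transform(df)  # per-sample mixture-component probabilities
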
/vectorizers/edge_list_vectorizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numba
3 |
4 | from sklearn.base import BaseEstimator, TransformerMixin
5 | from sklearn.utils.validation import check_is_fitted
6 | import scipy.sparse
7 |
8 |
9 | def read_edge_data(X):
10 | """
11 |     Read in data of various forms and convert it into an np.array of [row_labels, column_labels, values]
12 |
13 | Returns
14 | -------
15 | N x 3 np.array of [row_labels, column_labels, values]
16 | """
17 | try:
18 | edge_list = np.array(X, dtype=object)
19 |     except Exception:
20 |         raise ValueError("Couldn't convert your data format into a numpy array.")
21 | if (edge_list.shape[1] != 3) & (edge_list.shape[0] == 3):
22 | edge_list = edge_list.T
23 | if edge_list.shape[1] != 3:
24 | raise ValueError(
25 | f"Incorrect format of data passed in. "
26 | f"We expected some format of Nx3 data and received {edge_list.shape[0]} by {edge_list.shape[1]} data"
27 | )
28 |
29 | # TODO: Test if edge_list[:,2] is numeric. We currently just convert it into a float. I'd rather preserve the type.
30 | return edge_list
31 |
32 |
33 | class EdgeListVectorizer(BaseEstimator, TransformerMixin):
34 | """
35 | Takes a weighted edge list of the form row_labels, column_labels, value
36 |     and represents each row_label as a row of a sparse matrix containing the values
37 |     associated with each column_label.
38 |
39 | This might also be thought of as a PivotTableVectorizer or a CrossTabVectorizer.
40 |
41 | Parameters
42 | ----------
43 | column_label_dictionary: dictionary or None (optional, default=None)
44 | A fixed dictionary mapping tokens to indices, or None if the dictionary
45 | should be learned from the training data. If specified this will limit
46 |         the tokens to the keys of this dictionary.
47 | row_label_dictionary: dictionary or None (optional, default=None)
48 | A fixed dictionary mapping row labels to indices, or None if the dictionary
49 | should be learned from the training data. If specified this will limit
50 |         the row labels to the keys of this dictionary.
51 |     joint_space: bool (optional, default=False)
52 |         Whether the first two columns of your edge list are over the same token space. If so, build
53 | a single unified token dictionary over both columns.
54 | pre_indexed: bool (optional, default=False)
55 |         Not yet implemented. Indicates that the first two columns already contain row and column indices
56 |         rather than row and column labels. I'm not sure this feature will be used enough to prioritize
57 |         implementing; please reach out if it would be useful to you.
58 |
59 | """
60 |
61 | def __init__(
62 | self,
63 | column_label_dictionary=None,
64 | row_label_dictionary=None,
65 | joint_space=False,
66 | ):
67 | self.column_label_dictionary = column_label_dictionary
68 | self.row_label_dictionary = row_label_dictionary
69 | self.joint_space = joint_space
70 |
71 | def fit(self, X, y=None, **fit_params):
72 | # Convert data from whatever format it came in into an Nx3 np.array
73 | self.edge_list_ = read_edge_data(X)
74 |
75 | if self.joint_space:
76 | if self.column_label_dictionary is None:
77 | if self.row_label_dictionary is None:
78 | self.row_label_dictionary_ = {
79 | token: index
80 | for index, token in enumerate(
81 | np.unique(
82 | np.append(self.edge_list_[:, 0], self.edge_list_[:, 1])
83 | )
84 | )
85 | }
86 | self.column_label_dictionary_ = self.row_label_dictionary_
87 | elif self.row_label_dictionary is None:
88 | self.column_label_dictionary_ = self.column_label_dictionary
89 | self.row_label_dictionary_ = self.column_label_dictionary
90 | elif self.column_label_dictionary is None:
91 | self.column_label_dictionary_ = self.row_label_dictionary
92 | self.row_label_dictionary_ = self.row_label_dictionary
93 | else:
94 | raise ValueError(
95 | "Joint_space=True: Please specify at most a single label dictionary (either one works)."
96 | )
97 | else: # Not in a joint space
98 | if self.row_label_dictionary is None:
99 | self.row_label_dictionary_ = {
100 | token: index
101 | for index, token in enumerate(np.unique(self.edge_list_[:, 0]))
102 | }
103 | else:
104 | self.row_label_dictionary_ = self.row_label_dictionary
105 | if self.column_label_dictionary is None:
106 | self.column_label_dictionary_ = {
107 | token: index
108 | for index, token in enumerate(np.unique(self.edge_list_[:, 1]))
109 | }
110 | else:
111 | self.column_label_dictionary_ = self.column_label_dictionary
112 | # Build reverse indexes
113 | self.row_index_dictionary_ = {
114 | y: x for (x, y) in self.row_label_dictionary_.items()
115 | }
116 | self.column_index_dictionary_ = {
117 | y: x for (x, y) in self.column_label_dictionary_.items()
118 | }
119 | max_row = np.max(list(self.row_index_dictionary_.keys())) + 1
120 | max_col = np.max(list(self.column_index_dictionary_.keys())) + 1
121 |
122 |         # Get row and column indices for only the edges that have both labels in our dictionary index
123 | # Don't bother checking if rows are valid if you just constructed the row_label_dictionary from the data
124 | if self.row_label_dictionary is None:
125 | valid_rows = np.repeat(True, self.edge_list_.shape[0])
126 | else:
127 | valid_rows = np.isin(
128 | self.edge_list_[:, 0], list(self.row_label_dictionary_.keys())
129 | )
130 |
131 | # Don't bother checking if rows are valid if you just constructed the col_label_dictionary from the data
132 | if self.column_label_dictionary is None:
133 | valid_cols = np.repeat(True, self.edge_list_.shape[0])
134 | else:
135 | valid_cols = np.isin(
136 | self.edge_list_[:, 1], list(self.column_label_dictionary_.keys())
137 | )
138 | valid_edges = valid_rows & valid_cols
139 | row_indices = [
140 | self.row_label_dictionary_[x] for x in self.edge_list_[valid_edges, 0]
141 | ]
142 | col_indices = [
143 | self.column_label_dictionary_[x] for x in self.edge_list_[valid_edges, 1]
144 | ]
145 |         # Must specify the shape to ensure that trailing zero rows/cols aren't suppressed.
146 | self._train_matrix = scipy.sparse.coo_matrix(
147 | (self.edge_list_[valid_edges, 2].astype(float), (row_indices, col_indices)),
148 | shape=(max_row, max_col),
149 | ).tocsr()
150 | self._train_matrix.sum_duplicates()
151 |
152 | return self
153 |
154 | def fit_transform(self, X, y=None, **fit_params):
155 | self.fit(X, y, **fit_params)
156 | return self._train_matrix
157 |
158 | def transform(self, X):
159 | check_is_fitted(
160 | self,
161 | [
162 | "column_label_dictionary_",
163 | "row_label_dictionary_",
164 | ],
165 | )
166 |
167 | edge_list = read_edge_data(X)
168 |
169 |         # Get row and column indices for only the edges that have both labels in our dictionary index
170 | valid_rows = np.isin(edge_list[:, 0], list(self.row_label_dictionary_.keys()))
171 | valid_cols = np.isin(
172 | edge_list[:, 1], list(self.column_label_dictionary_.keys())
173 | )
174 | valid_edges = valid_rows & valid_cols
175 | row_indices = [self.row_label_dictionary_[x] for x in edge_list[valid_edges, 0]]
176 | col_indices = [
177 | self.column_label_dictionary_[x] for x in edge_list[valid_edges, 1]
178 | ]
179 |
180 | matrix = scipy.sparse.coo_matrix(
181 | (edge_list[valid_edges, 2].astype(float), (row_indices, col_indices))
182 | ).tocsr()
183 | matrix.sum_duplicates()
184 | return matrix
185 |
--------------------------------------------------------------------------------
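
A small usage sketch, mirroring the data used in vectorizers/tests/test_edge_list_vectorizer.py: the vectorizer pivots a (row_label, column_label, value) edge list into a sparse matrix with one row per row label and one column per column label.

    import numpy as np
    from vectorizers import EdgeListVectorizer

    rows = np.array(["a", "b", "c", "d", "d"])
    cols = np.array(["b", "c", "d", "b", "c"])
    vals = np.array([1, 2, 3, 4, 8])

    matrix = EdgeListVectorizer().fit_transform((rows, cols, vals))
    matrix.shape  # (4, 3): rows a-d, columns b-d
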
/vectorizers/kde_vectorizer.py:
--------------------------------------------------------------------------------
1 | import numba
2 | import numpy as np
3 |
4 | from sklearn.base import BaseEstimator, TransformerMixin
5 | from sklearn.utils.validation import check_is_fitted
6 | from warnings import warn
7 | from sklearn.neighbors import KernelDensity
8 | from .utils import flatten
9 |
10 |
11 | @numba.njit(nogil=True)
12 | def min_non_zero_difference(data):
13 | """Find the minimum non-zero sequential difference in a single dimensional
14 | array of values. This is useful for determining the minimal reasonable kernel
15 | bandwidth for a 1-dimensional KDE over a dataset.
16 |
17 | Parameters
18 | ----------
19 | data: array
20 | One dimensional array of values
21 |
22 | Returns
23 | -------
24 | min_difference: float
25 | The minimal difference between sequential values.
26 | """
27 | sorted_data = np.sort(data)
28 | differences = sorted_data[1:] - sorted_data[:-1]
29 | return np.min(differences[differences > 0])
30 |
31 |
32 | def jackknife_bandwidths(data, bandwidths, kernel="gaussian"):
33 | """Perform jack-knife sampling over different bandwidths for KDEs for each
34 | time-series in the dataset.
35 |
36 | Parameters
37 | ----------
38 | data: list of arrays
39 | A list of (variable length) arrays of values. The values should represent
40 | "times" of "events".
41 |
42 | bandwidths: array
43 | The possible bandwidths to try
44 |
45 | kernel: string (optional, default="gaussian")
46 | The kernel to use for the KDE. Should be accepted by sklearn's KernelDensity
47 | class.
48 |
49 | Returns
50 | -------
51 | result: array of shape (n_bandwidths,)
52 | The total likelihood of unobserved data over all jackknife samplings and all
53 | time series in the dataset for each bandwidth.
54 | """
55 | result = np.zeros(bandwidths.shape[0])
56 | for j in range(bandwidths.shape[0]):
57 | kde = KernelDensity(bandwidth=bandwidths[j], kernel=kernel)
58 | for i in range(len(data)):
59 | likelihood = 0.0
60 | for k in range(len(data[i])):
61 | if k < len(data[i]) - 1:
62 | jackknife_sample = np.hstack([data[i][:k], data[i][k + 1 :]])
63 | else:
64 | jackknife_sample = data[i][:k]
65 | kde.fit(jackknife_sample[:, None])
66 | likelihood += np.exp(kde.score(np.array([[data[i][k]]])))
67 |
68 | result[j] += likelihood
69 |
70 | return result
71 |
72 |
73 | class KDEVectorizer(BaseEstimator, TransformerMixin):
74 | def __init__(
75 | self,
76 | bandwidth=None,
77 | n_components=50,
78 | kernel="gaussian",
79 | evaluation_grid_strategy="uniform",
80 | ):
81 | self.n_components = n_components
82 | self.evaluation_grid_strategy = evaluation_grid_strategy
83 | self.bandwidth = bandwidth
84 | self.kernel = kernel
85 |
86 | def fit(self, X, y=None, **fit_params):
87 |
88 | combined_data = np.array(flatten(X))
89 |
90 | if self.bandwidth is None:
91 | # Estimate the bandwidth by looking at training data
92 | # We do a jack-knife across each time series and
93 | # find the bandwidth choice that works best over all
94 | # time series
95 | min, max = np.min(combined_data), np.max(combined_data)
96 | avg_n_events = np.mean([len(x) for x in X])
97 | max_bandwidth = (max - min) / avg_n_events
98 | min_bandwidth = min_non_zero_difference(combined_data)
99 | bandwidths = 10.0 ** np.linspace(
100 | np.log10(min_bandwidth), np.log10(max_bandwidth), 50
101 | )
102 | jackknifed_total_likelihoods = jackknife_bandwidths(X, bandwidths)
103 | self.bandwidth_ = bandwidths[np.argmax(jackknifed_total_likelihoods)]
104 | else:
105 | self.bandwidth_ = self.bandwidth
106 |
107 | if self.evaluation_grid_strategy == "uniform":
108 | min, max = np.min(combined_data), np.max(combined_data)
109 | self.evaluation_grid_ = np.linspace(min, max, self.n_components)
110 | elif self.evaluation_grid_strategy == "density":
111 | uniform_quantile_grid = np.linspace(0, 1.0, self.n_components)
112 | self.evaluation_grid_ = np.quantile(combined_data, uniform_quantile_grid)
113 | else:
114 | raise ValueError(
115 | "Unrecognized evaluation_grid_strategy; should be one "
116 | 'of: "uniform" or "density"'
117 | )
118 |
119 | return self
120 |
121 | def transform(self, X):
122 | check_is_fitted(self, ["bandwidth_", "evaluation_grid_"])
123 |
124 | result = np.empty((len(X), self.n_components), dtype=np.float64)
125 |
126 | for i, sample in enumerate(X):
127 | kde = KernelDensity(bandwidth=self.bandwidth_, kernel=self.kernel)
128 | kde.fit(sample[:, None])
129 | log_probability = kde.score_samples(self.evaluation_grid_[:, None])
130 | result[i] = np.exp(log_probability)
131 |
132 | return result
133 |
134 | def fit_transform(self, X, y=None, **fit_params):
135 | self.fit(X, y, **fit_params)
136 | return self.transform(X)
137 |
--------------------------------------------------------------------------------
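
A minimal usage sketch: each sample is a 1-d array of event times, and each output row is that sample's kernel density estimate evaluated on a shared grid of n_components points. The fixed bandwidth here is arbitrary and simply skips the jackknife bandwidth search.

    import numpy as np
    from vectorizers.kde_vectorizer import KDEVectorizer

    event_times = [np.sort(np.random.random(size=n)) for n in (23, 56, 71)]

    vec = KDEVectorizer(n_components=20, bandwidth=0.05)
    densities = vec.fit_transform(event_times)
    densities.shape  # (3, 20)
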
/vectorizers/signature_vectorizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from sklearn.base import BaseEstimator, TransformerMixin
4 | from sklearn.utils.validation import check_is_fitted
5 |
6 | #import iisignature
7 |
8 | NUMPY_SHAPE_ERROR_MSG = """
9 | Error: SignatureVectorizer expects numpy arrays to be of shape (num_samples x path_len x path_dim).
10 | """
11 | LIST_SHAPE_ERROR_MSG = """
12 | Error: Expecting list entries to be numpy arrays of shape (path_len x path_dim).
13 | """
14 |
15 |
16 | class SignatureVectorizer(BaseEstimator, TransformerMixin):
17 | """Transforms a list or array of paths into their path signatures.
18 |
19 | Uses the iisignature library (https://pypi.org/project/iisignature/)
20 | * pip install iisignature
21 |
22 | For more details on the path signature technique, please refer to:
23 | * Rough paths, Signatures and the modelling of functions on streams. Lyons, T. (2014)
24 | https://arxiv.org/pdf/1405.4537.pdf
25 |     * A Primer on the Signature Method in Machine Learning. Chevyrev, I. and Kormilitzin, A. (2016)
26 | https://arxiv.org/pdf/1603.03788.pdf
27 |
28 | Parameters
29 | ----------
30 | truncation_level: int (default = 2)
31 | The level at which we truncate the infinite signature.
32 |
33 | log: bool (default=False)
34 |         If True returns the log-signature (a compressed version of the path signature).
35 | Otherwise returns the path signature.
36 |
37 | basepoint: bool (default=False)
38 | If True, prepends each path with the zero vector. The default path signature is blind
39 | to translational shifts in the paths; use this flag if you care about path translations.
40 | """
41 |
42 | def __init__(
43 | self, truncation_level: int = 2, log: bool = False, basepoint: bool = False
44 | ):
45 | try:
46 | global iisignature
47 | import iisignature as ii
48 | iisignature = ii
49 | except ImportError as err:
50 | from textwrap import dedent
51 | err.msg += dedent(
52 | """
53 |
54 |             A small bug in the install script of the iisignature package makes it
55 | impossible to install into an environment where its Numpy dependency
56 | has not yet been installed. Thus, the Vectorizers library does not
57 | make it an explicit dependency. However, you may install this package
58 | yourself into this environment now, by running the command
59 |
60 | pip install iisignature
61 |
62 | The problem has been reported to the maintainers of iisignature, and
63 | this inconvenience will disappear in future releases.
64 | """
65 | )
66 | raise
67 |
68 | assert (
69 | type(truncation_level) is int
70 | ), "Error: expecting int type for truncation_level."
71 | assert type(log) is bool, "Error: expecting bool type for log."
72 | assert type(basepoint) is bool, "Error: expecting bool type for basepoint"
73 |
74 | self.truncation_level = truncation_level
75 | self.log = log
76 | self.basepoint = basepoint
77 |
78 | def fit(self, X, y=None, **fit_params):
79 | """
80 | Parameters
81 | ----------
82 | X: np.array of shape (n_samples, path_len, path_dim) or list of np.arrays of shape (?, path_dim)
83 | The path data on which we fit the vectorizer.
84 | If paths are all the same length, then we can pass them to fit as a numpy array (n_samples, path_len, path_dim).
85 |             If paths are varying length, then we can pass a list of length n_samples, where each entry is a numpy array
86 |             with shape (path_len_i, path_dim). The path_dim should be consistent across the list, but the path length
87 |             can vary.
88 | """
89 | if type(X) is np.ndarray:
90 | assert len(X.shape) == 3, NUMPY_SHAPE_ERROR_MSG
91 | # We have an array N x p x d of paths
92 | # all paths have the same length -> batch vectorize
93 | self.in_dim_ = X.shape[2]
94 | else:
95 | assert type(X) is list, "Error: Expecting numpy array or list of paths."
96 | assert (
97 | type(X[0]) is np.ndarray
98 | ), "Error: Expecting list entries to be numpy arrays."
99 | assert (
100 | type(X[0]) is np.ndarray and len(X[0].shape) == 2
101 | ), LIST_SHAPE_ERROR_MSG
102 | # Accepts a list of paths with differing lengths
103 | self.in_dim_ = X[0].shape[1]
104 |
105 | if self.log:
106 | self.s_ = iisignature.prepare(self.in_dim_, self.truncation_level)
107 | self.out_dim_ = iisignature.logsiglength(
108 | self.in_dim_, self.truncation_level
109 | )
110 | else:
111 | self.s_ = None
112 | self.out_dim_ = iisignature.siglength(self.in_dim_, self.truncation_level)
113 |         return self
114 | def transform(self, X):
115 | """
116 | Parameters
117 | ----------
118 | X: np.array of shape (n_samples, path_len, path_dim) or list of np.arrays of shape (?, path_dim)
119 |             The path data to be transformed into path signatures.
120 | If paths are all the same length, then we can pass them to fit as a numpy array (n_samples, path_len, path_dim).
121 |             If paths are varying length, then we can pass a list of length n_samples, where each entry is a numpy array
122 | with shape (path_len_i, path_dim). The path_dim should be consistent across the list, but the path length
123 | can vary.
124 |
125 | Returns
126 | -------
127 | sigs: np.array of shape (n_samples, self.out_dim_)
128 | The array of signatures corresponding to the paths given in X, truncated at the truncation level specified
129 | at initialisation.
130 |
131 | """
132 | check_is_fitted(
133 | self,
134 | [
135 | "in_dim_",
136 | "out_dim_",
137 | "s_",
138 | ],
139 | )
140 |
141 | if type(X) is np.ndarray:
142 | assert len(X.shape) == 3, NUMPY_SHAPE_ERROR_MSG
143 | # We have an array N x p x d of paths
144 | # all paths have the same length -> batch vectorize
145 | assert (
146 | X.shape[2] == self.in_dim_
147 | ), "Error: Expecting path_dim to be %d, got path_dim %d." % (
148 | self.in_dim_,
149 | X.shape[2],
150 | )
151 | if self.basepoint:
152 | basepoint = np.zeros((X.shape[0], 1, X.shape[2]))
153 | X = np.concatenate([basepoint, np.array(X)], axis=1)
154 |
155 | if self.log:
156 | v = iisignature.logsig(X, self.s_)
157 | else:
158 | v = iisignature.sig(X, self.truncation_level)
159 | else:
160 | # Accepts a list of paths with differing lengths
161 | assert type(X) is list, "Error: Expecting numpy array or list of paths."
162 | assert (
163 | type(X[0]) is np.ndarray
164 | ), "Error: Expecting list entries to be numpy arrays."
165 | assert len(X[0].shape) == 2, LIST_SHAPE_ERROR_MSG
166 | assert (
167 | X[0].shape[1] == self.in_dim_
168 | ), "Error: Expecting path_dim to be %d, got path_dim %d." % (
169 | self.in_dim_,
170 | X[0].shape[1],
171 | )
172 | N = len(X)
173 | if self.basepoint:
174 | basepoint = np.zeros(shape=(1, self.in_dim_))
175 | X = [np.concatenate([basepoint, x], axis=0) for x in X]
176 |
177 | if self.log:
178 | sig_vectorizer = lambda path: iisignature.logsig(path, self.s_)
179 | else:
180 | sig_vectorizer = lambda path: iisignature.sig(
181 | path, self.truncation_level
182 | )
183 |
184 | v = np.empty(shape=(N, self.out_dim_))
185 |
186 | for i, path in enumerate(X):
187 | assert (
188 | path.shape[-1] == self.in_dim_
189 | ), "Error: Not all paths share the same dimension."
190 | v[i] = sig_vectorizer(path)
191 |
192 | return v
193 |
194 | def fit_transform(self, X, y=None, **fit_params):
195 | self.fit(X, y, **fit_params)
196 | return self.transform(X)
197 |
--------------------------------------------------------------------------------
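
A brief usage sketch (requires the optional iisignature dependency, pip install iisignature): for truncation level m and path dimension d, the signature has d + d**2 + ... + d**m components, so level 2 in three dimensions gives 12 features per path.

    import numpy as np
    from vectorizers import SignatureVectorizer  # needs iisignature installed

    paths = np.random.normal(size=(10, 50, 3))  # 10 paths of length 50 in R^3

    sigs = SignatureVectorizer(truncation_level=2).fit_transform(paths)
    sigs.shape  # (10, 12)
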
/vectorizers/tests/__init__.py:
--------------------------------------------------------------------------------
1 | raw_string_data = [
2 | "asdfj;afoosdaflksapokwerfoobarpokwersdfsadfsadfnbkajyfoopokwer",
3 | "pokfoo;ohnASDbarfoobarpoksdf sgn;asregtjpoksdfpokpokwer",
4 | "werqweoijsdcasdfpoktrfoobarpokqwernasdfasdpokpokpok",
5 | "pokwerpokwqerpokwersadfpokqwepokwerpokpok",
6 | "foobarfoofooasdfsdfgasdffoobarbazcabfoobarbarbazfoobaz",
7 | "pokfoopokbarpokwerpokbazgfniusnvbgasgbabgsadfjnkr[pko",
8 | ]
9 |
--------------------------------------------------------------------------------
/vectorizers/tests/test_bpe.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest # noqa
3 |
4 | from vectorizers import BytePairEncodingVectorizer
5 | from vectorizers import NgramVectorizer
6 | from vectorizers.mixed_gram_vectorizer import to_unicode
7 |
8 | from . import raw_string_data
9 |
10 |
11 | def test_bpe_vectorizer_basic():
12 | bpe = BytePairEncodingVectorizer()
13 | result1 = bpe.fit_transform(raw_string_data)
14 | result2 = bpe.transform(raw_string_data)
15 | assert np.allclose(result1.toarray(), result2.toarray())
16 |
17 |
18 | def test_bpe_tokens_ngram_matches():
19 | bpe1 = BytePairEncodingVectorizer(return_type="matrix")
20 | bpe2 = BytePairEncodingVectorizer(return_type="tokens")
21 |
22 | result1 = bpe1.fit_transform(raw_string_data)
23 | token_dictionary = {
24 | to_unicode(code, bpe1.tokens_, bpe1.max_char_code_): n
25 | for code, n in bpe1.column_label_dictionary_.items()
26 | }
27 |
28 | tokens = bpe2.fit_transform(raw_string_data)
29 | result2 = NgramVectorizer(token_dictionary=token_dictionary).fit_transform(tokens)
30 |
31 | assert np.allclose(result1.toarray(), result2.toarray())
32 |
33 |
34 | def test_bpe_bad_params():
35 | with pytest.raises(ValueError):
36 | bpe = BytePairEncodingVectorizer(max_vocab_size=-1)
37 | bpe.fit(raw_string_data)
38 |
39 | with pytest.raises(ValueError):
40 | bpe = BytePairEncodingVectorizer(min_token_occurrence=-1)
41 | bpe.fit(raw_string_data)
42 |
43 | with pytest.raises(ValueError):
44 | bpe = BytePairEncodingVectorizer(return_type=-1)
45 | bpe.fit(raw_string_data)
46 |
47 | with pytest.raises(ValueError):
48 | bpe = BytePairEncodingVectorizer(return_type="nonsense")
49 | bpe.fit(raw_string_data)
50 |
51 |
52 | def test_bpe_trash_token():
53 | bpe = BytePairEncodingVectorizer(return_type="sequences").fit(raw_string_data)
54 | tokenized_no_trash = bpe.transform(raw_string_data)
55 | assert len(tokenized_no_trash) == len(raw_string_data)
56 | assert not any(0 in tokens for tokens in tokenized_no_trash)
57 | tokenized_with_trash = bpe.transform(["asdf{qwer"])
58 | assert len(tokenized_with_trash) == 1
59 | assert 0 in tokenized_with_trash[0]
60 |
61 |
62 | def test_bpe_set_max_char_code():
63 | MCC = 65535
64 | bpe = BytePairEncodingVectorizer(
65 | max_char_code=MCC,
66 | return_type="sequences"
67 | ).fit(raw_string_data)
68 | tokens = bpe.transform(raw_string_data)
69 | largest_char = max(max(ord(c) for c in s) for s in raw_string_data)
70 | assert largest_char < 126
71 | assert all(
72 | all(
73 | token <= largest_char or token > MCC
74 | for token in seq
75 | )
76 | for seq in tokens
77 | )
78 | tokens_strange = bpe.transform([chr(126) + chr(2000) + chr(60000)])
79 | assert 1 == len(tokens_strange)
80 | assert np.all([126, 2000, 60000] == tokens_strange[0])
81 |
82 |
83 | def test_bpe_set_max_char_code_too_low():
84 | bpe = BytePairEncodingVectorizer(max_char_code=50).fit(raw_string_data)
85 | assert max(max(ord(c) for c in s) for s in raw_string_data) == bpe.max_char_code_
86 |
87 |
88 | @pytest.mark.parametrize(
89 | "name,max_expected",
90 | [
91 | ("ascii", 127),
92 | ("common", 2047),
93 | ("bmp", 65535),
94 | ("unicode", 1_114_111),
95 | ]
96 | )
97 | def test_bpe_max_char_code_limits(name, max_expected):
98 | assert max_expected == BytePairEncodingVectorizer(
99 | max_char_code=name
100 | ).fit(raw_string_data).max_char_code_
101 |
102 |
103 | def test_bpe_max_char_code_limit_wrong():
104 | with pytest.raises(ValueError):
105 | BytePairEncodingVectorizer(max_char_code="utf8").fit(raw_string_data)
106 |
107 |
108 | def test_bpe_contract_pair_single_token_training():
109 | seqs_tokens = BytePairEncodingVectorizer(return_type="tokens").fit_transform([
110 | "asdfqwerty",
111 | "asdf",
112 | "qwzxasdfcv"
113 | ])
114 | assert [
115 | ["asdf", "qw", "e", "r", "t", "y"],
116 | ["asdf"],
117 | ["qw", "z", "x", "asdf", "c", "v"],
118 | ] == seqs_tokens
119 |
120 |
121 | def test_bpe_contract_pair_single_token_inference():
122 | bpe = BytePairEncodingVectorizer(return_type="tokens").fit([
123 | "asdfqwerty",
124 | "asdfg",
125 | "qwzxasdfcv",
126 | ])
127 | assert [["asdf"]] == bpe.transform(["asdf"])
128 |
--------------------------------------------------------------------------------
/vectorizers/tests/test_distances.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import numpy as np
4 | import scipy.sparse
5 | from sklearn.preprocessing import normalize
6 |
7 | from vectorizers.distances import hellinger, sparse_hellinger
8 | from vectorizers.distances import total_variation, sparse_total_variation
9 | from vectorizers.distances import (
10 | jensen_shannon_divergence,
11 | sparse_jensen_shannon_divergence,
12 | )
13 |
14 |
15 | def test_hellinger():
16 | assert hellinger(np.array([0.0, 0.0]), np.array([0.0, 0.0])) == 0.0
17 | assert hellinger(np.array([0.0, 0.0]), np.array([1.0, 0.0])) == 1.0
18 | assert hellinger(np.array([0.5, 0.5]), np.array([0.5, 0.5])) == 0.0
19 | assert hellinger(np.array([0.5, 0.5]), np.array([1.0, 0.0])) == 0.5411961001461969
20 | assert hellinger(np.array([0.1, 0.9]), np.array([1.0, 0.0])) == 0.8269052146305295
21 |
22 |
23 | def test_sparse_hellinger():
24 | assert np.isclose(
25 | sparse_hellinger(
26 | np.array([7, 12]),
27 | np.array([0.0, 0.0]),
28 | np.array([8, 13]),
29 | np.array([0.0, 0.0]),
30 | ),
31 | 0.0,
32 | )
33 | assert np.isclose(
34 | sparse_hellinger(
35 | np.array([7, 12]),
36 | np.array([0.0, 0.0]),
37 | np.array([8, 13]),
38 | np.array([1.0, 0.0]),
39 | ),
40 | 1.0,
41 | )
42 | assert np.isclose(
43 | sparse_hellinger(
44 | np.array([7, 12]),
45 | np.array([0.5, 0.5]),
46 | np.array([7, 12]),
47 | np.array([0.5, 0.5]),
48 | ),
49 | 0.0,
50 | )
51 | assert np.isclose(
52 | sparse_hellinger(
53 | np.array([7, 12]),
54 | np.array([0.5, 0.5]),
55 | np.array([7, 12]),
56 | np.array([1.0, 0.0]),
57 | ),
58 | 0.5411961001461969,
59 | )
60 | assert np.isclose(
61 | sparse_hellinger(
62 | np.array([7, 12]),
63 | np.array([0.1, 0.9]),
64 | np.array([7, 12]),
65 | np.array([1.0, 0.0]),
66 | ),
67 | 0.8269052146305295,
68 | )
69 |
70 |
71 | # Test using inequalities with Hellinger distance from Wikipedia
72 | # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance
73 | def test_total_variation():
74 | test_data = np.random.random(size=(10, 50))
75 | test_data = normalize(test_data, norm="l1")
76 | for i in range(test_data.shape[0]):
77 | for j in range(i + 1, test_data.shape[0]):
78 | hd = hellinger(test_data[i], test_data[j])
79 | tvd = total_variation(test_data[i], test_data[j])
80 | assert hd ** 2 <= tvd
81 | assert tvd <= np.sqrt(2) * hd
82 |
83 |
84 | # Test using inequalities with Hellinger distance from Wikipedia
85 | # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance
86 | def test_sparse_total_variation():
87 | test_data = np.random.random(size=(10, 100))
88 | # sparsify
89 | test_data[test_data <= 0.5] = 0.0
90 | test_data = scipy.sparse.csr_matrix(test_data)
91 | test_data = normalize(test_data, norm="l1")
92 |
93 | for i in range(test_data.shape[0]):
94 | for j in range(i + 1, test_data.shape[0]):
95 | hd = sparse_hellinger(
96 | test_data[i].indices,
97 | test_data[i].data,
98 | test_data[j].indices,
99 | test_data[j].data,
100 | )
101 | tvd = sparse_total_variation(
102 | test_data[i].indices,
103 | test_data[i].data,
104 | test_data[j].indices,
105 | test_data[j].data,
106 | )
107 | assert hd ** 2 <= tvd
108 | assert tvd <= np.sqrt(2) * hd
109 |
110 |
111 | def test_jensen_shannon():
112 | test_data = np.random.random(size=(10, 50))
113 | test_data = normalize(test_data, norm="l1")
114 | for i in range(test_data.shape[0]):
115 | for j in range(i + 1, test_data.shape[0]):
116 | m = (test_data[i] + test_data[j]) / 2.0
117 | p = test_data[i]
118 | q = test_data[j]
119 | d = (
120 | -np.sum(m * np.log(m))
121 | + (np.sum(p * np.log(p)) + np.sum(q * np.log(q))) / 2.0
122 | )
123 | assert np.isclose(d, jensen_shannon_divergence(p, q))
124 |
125 |
126 | def test_sparse_jensen_shannon():
127 | test_data = np.random.random(size=(10, 100))
128 | # sparsify
129 | test_data[test_data <= 0.5] = 0.0
130 | sparse_test_data = scipy.sparse.csr_matrix(test_data)
131 | sparse_test_data = normalize(sparse_test_data, norm="l1")
132 | test_data = normalize(test_data, norm="l1")
133 |
134 | for i in range(test_data.shape[0]):
135 | for j in range(i + 1, test_data.shape[0]):
136 | m = (test_data[i] + test_data[j]) / 2.0
137 | p = test_data[i]
138 | q = test_data[j]
139 | d = (
140 | -np.sum(m[m > 0] * np.log(m[m > 0]))
141 | + (
142 | np.sum(p[p > 0] * np.log(p[p > 0]))
143 | + np.sum(q[q > 0] * np.log(q[q > 0]))
144 | )
145 | / 2.0
146 | )
147 | assert np.isclose(
148 | d,
149 | sparse_jensen_shannon_divergence(
150 | sparse_test_data[i].indices,
151 | sparse_test_data[i].data,
152 | sparse_test_data[j].indices,
153 | sparse_test_data[j].data,
154 | ),
155 | )
156 |
--------------------------------------------------------------------------------
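
For reference, the total-variation tests above lean on the standard inequalities relating the Hellinger distance H and the total variation distance delta for probability measures P and Q:

$$ H^{2}(P, Q) \;\le\; \delta(P, Q) \;\le\; \sqrt{2}\, H(P, Q) $$

which is exactly what the assertions hd ** 2 <= tvd and tvd <= np.sqrt(2) * hd check.
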
/vectorizers/tests/test_edge_list_vectorizer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from vectorizers import EdgeListVectorizer
4 |
5 | # from vectorizers.edge_list_vectorizer import read_edge_data
6 | import numpy as np
7 | import pandas as pd
8 |
9 | rows = np.array(["a", "b", "c", "d", "d"])
10 | cols = np.array(["b", "c", "d", "b", "c"])
11 | vals = np.array([1, 2, 3, 4, 8])
12 | test_data = (rows, cols, vals)
13 | list_of_edges = [
14 | ["a", "b", 1],
15 | ["b", "c", 2],
16 | ["c", "d", 3],
17 | ["d", "b", 4],
18 | ["d", "c", 8],
19 | ]
20 | df_of_edges = pd.DataFrame({"r": rows, "c": cols, "v": vals})
21 |
22 |
23 | # Tuple or list of columns, data frame, list of edges
24 | @pytest.mark.parametrize(
25 | "data", [(rows, cols, vals), [rows, cols, vals], list_of_edges, df_of_edges]
26 | )
27 | def test_edgelist_input(data):
28 | model = EdgeListVectorizer().fit(data)
29 | result = model.transform(data)
30 | result1 = EdgeListVectorizer().fit_transform(data)
31 | assert np.allclose(result.toarray(), result1.toarray())
32 | assert result.shape == (4, 3)
33 | assert np.allclose(result.toarray()[:, 1], np.array([0, 2, 0, 8]))
34 |
35 |
36 | def test_edgelist_specified_rows():
37 | row_dict = {"a": 0, "d": 1}
38 | result = EdgeListVectorizer(row_label_dictionary=row_dict).fit_transform(test_data)
39 | assert result.shape == (2, 3)
40 | assert np.allclose(result.toarray()[1, :], np.array([4, 8, 0]))
41 |
42 |
43 | def test_edgelist_specified_columns():
44 | column_dict = {"b": 0, "c": 1}
45 | result = EdgeListVectorizer(column_label_dictionary=column_dict).fit_transform(
46 | test_data
47 | )
48 | assert result.shape == (4, 2)
49 | assert np.allclose(result.toarray()[:, 1], np.array([0, 2, 0, 8]))
50 |
51 |
52 | def test_edgelist_specified_rows_missing_index():
53 | row_dict = {"a": 2, "d": 4}
54 | result = EdgeListVectorizer(row_label_dictionary=row_dict).fit_transform(test_data)
55 | assert result.shape == (5, 3)
56 | assert np.allclose(result.toarray()[:, 0], np.array([0, 0, 1, 0, 4]))
57 |
58 |
59 | def test_edgelist_specified_column_missing_index():
60 | column_dict = {"b": 2, "c": 4}
61 | result = EdgeListVectorizer(column_label_dictionary=column_dict).fit_transform(
62 | test_data
63 | )
64 | assert result.shape == (4, 5)
65 | assert np.allclose(result.toarray()[:, 4], np.array([0, 2, 0, 8]))
66 |
67 |
68 | # TODO: Write a unit test for joint_space=True
69 |
--------------------------------------------------------------------------------
/vectorizers/tests/test_signature_vectorizer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from vectorizers import SignatureVectorizer
4 |
5 | import numpy as np
6 | iisignature = pytest.importorskip("iisignature")
7 | import re
8 |
9 | NUMPY_SHAPE_ERROR_MSG = """
10 | Error: SignatureVectorizer expects numpy arrays to be of shape (num_samples x path_len x path_dim).
11 | """
12 | LIST_SHAPE_ERROR_MSG = """
13 | Error: Expecting list entries to be numpy arrays of shape (path_len x path_dim).
14 | """
15 |
16 | # Check numpy and list vectorizers return the same output as iisignature
17 | @pytest.mark.parametrize("truncation_level", [2, 3, 5])
18 | @pytest.mark.parametrize("log", [True, False])
19 | @pytest.mark.parametrize("basepoint", [True, False])
20 | def test_numpy_vs_list_vs_iisig(truncation_level, log, basepoint, seed=1):
21 |
22 | n_paths = 100
23 | path_len = 50
24 | path_dim = 5
25 |
26 | np.random.seed(seed)
27 | test_paths_list = [
28 | np.random.normal(size=(path_len, path_dim)) for i in range(n_paths)
29 | ]
30 | test_paths_numpy = np.array(test_paths_list)
31 |
32 | sigs_numpy = SignatureVectorizer(
33 | truncation_level=truncation_level, log=log, basepoint=basepoint
34 | ).fit_transform(test_paths_numpy)
35 | sigs_list = SignatureVectorizer(
36 | truncation_level=truncation_level, log=log, basepoint=basepoint
37 | ).fit_transform(test_paths_list)
38 |
39 | if basepoint:
40 | concat_shape = (test_paths_numpy.shape[0], 1, test_paths_numpy.shape[2])
41 | X = np.concatenate([np.zeros(shape=concat_shape), test_paths_numpy], axis=1)
42 | else:
43 | X = test_paths_numpy
44 |
45 | if log:
46 | s = iisignature.prepare(X.shape[-1], truncation_level)
47 | sigs_iisig = iisignature.logsig(X, s)
48 | else:
49 | sigs_iisig = iisignature.sig(X, truncation_level)
50 | assert np.all(np.isclose(sigs_numpy, sigs_list))
51 | assert np.all(np.isclose(sigs_list, sigs_iisig))
52 |
53 |
54 | # Check bad initialisation returns appropriate error messages
55 | def test_bad_init_params():
56 | with pytest.raises(
57 | AssertionError, match="Error: expecting int type for truncation_level."
58 | ):
59 | vectorizer = SignatureVectorizer(truncation_level="three")
60 |
61 | with pytest.raises(AssertionError, match="Error: expecting bool type for log."):
62 | vectorizer = SignatureVectorizer(log=1)
63 |
64 | with pytest.raises(
65 | AssertionError, match="Error: expecting bool type for basepoint"
66 | ):
67 | vectorizer = SignatureVectorizer(basepoint=np.zeros(10))
68 |
69 |
70 | # Check bad fit returns appropriate error messages
71 | def test_bad_fit_params():
72 |
73 | vectorizer = SignatureVectorizer()
74 |
75 | with pytest.raises(AssertionError, match=re.escape(NUMPY_SHAPE_ERROR_MSG)):
76 | vectorizer.fit(np.random.random(size=(2, 10, 3, 5)))
77 |
78 | with pytest.raises(AssertionError, match=re.escape(NUMPY_SHAPE_ERROR_MSG)):
79 | vectorizer.fit(np.random.random(size=(2, 10)))
80 |
81 | with pytest.raises(
82 | AssertionError, match="Error: Expecting numpy array or list of paths."
83 | ):
84 | vectorizer.fit("Not a list or numpy array")
85 |
86 | with pytest.raises(
87 | AssertionError, match="Error: Expecting list entries to be numpy arrays."
88 | ):
89 | vectorizer.fit(["List", "of", "nonsense"])
90 |
91 | with pytest.raises(AssertionError, match=re.escape(LIST_SHAPE_ERROR_MSG)):
92 | vectorizer.fit([np.random.random(size=(3, 10, 5))])
93 |
94 |
95 | # Check bad transform returns appropriate error messages
96 | def test_bad_transform_parameters():
97 |
98 | vectorizer = SignatureVectorizer()
99 | vectorizer.fit(np.random.random(size=(20, 50, 3)))
100 | with pytest.raises(AssertionError, match=re.escape(NUMPY_SHAPE_ERROR_MSG)):
101 | vectorizer.transform(np.random.random(size=(2, 10, 3, 5)))
102 |
103 | with pytest.raises(AssertionError, match=re.escape(NUMPY_SHAPE_ERROR_MSG)):
104 | vectorizer.transform(np.random.random(size=(2, 10)))
105 |
106 | with pytest.raises(
107 | AssertionError, match="Error: Expecting numpy array or list of paths."
108 | ):
109 | vectorizer.transform("Not a list or numpy array")
110 |
111 | with pytest.raises(
112 | AssertionError, match="Error: Expecting list entries to be numpy arrays."
113 | ):
114 | vectorizer.transform(["List", "of", "nonsense"])
115 |
116 | with pytest.raises(AssertionError, match=re.escape(LIST_SHAPE_ERROR_MSG)):
117 | vectorizer.transform([np.random.random(size=(3, 10, 5))])
118 |
119 | # Mismatch from fit shape
120 | with pytest.raises(AssertionError, match="Error: Expecting path_dim to be"):
121 | vectorizer.transform([np.random.random(size=(50, 5))])
122 | with pytest.raises(AssertionError, match="Error: Expecting path_dim to be "):
123 | vectorizer.transform(np.random.random(size=(30, 50, 5)))
124 | with pytest.raises(
125 | AssertionError, match="Error: Not all paths share the same dimension."
126 | ):
127 | vectorizer.transform(
128 | [
129 | np.random.random(size=(10, 3)),
130 | np.random.random(size=(10, 3)),
131 | np.random.random(size=(10, 5)),
132 | np.random.random(size=(10, 3)),
133 | ]
134 | )
135 |
--------------------------------------------------------------------------------
/vectorizers/tests/test_template.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 |
4 | # from sklearn.datasets import load_iris
5 | # from sklearn.utils.testing import assert_array_equal
6 | # from sklearn.utils.testing import assert_allclose
7 | #
8 | # from vectorizers import TemplateEstimator
9 | # from vectorizers import TemplateTransformer
10 | # from vectorizers import TemplateClassifier
11 | #
12 | #
13 | # @pytest.fixture
14 | # def data():
15 | # return load_iris(return_X_y=True)
16 | #
17 | # def test_template_estimator(data):
18 | # est = TemplateEstimator()
19 | # assert est.demo_param == 'demo_param'
20 | #
21 | # est.fit(*data)
22 | # assert hasattr(est, 'is_fitted_')
23 | #
24 | # X = data[0]
25 | # y_pred = est.predict(X)
26 | # assert_array_equal(y_pred, np.ones(X.shape[0], dtype=np.int64))
27 | #
28 | #
29 | # def test_template_transformer_error(data):
30 | # X, y = data
31 | # trans = TemplateTransformer()
32 | # trans.fit(X)
33 | # with pytest.raises(ValueError, match="Shape of input is different"):
34 | # X_diff_size = np.ones((10, X.shape[1] + 1))
35 | # trans.transform(X_diff_size)
36 | #
37 | #
38 | # def test_template_transformer(data):
39 | # X, y = data
40 | # trans = TemplateTransformer()
41 | # assert trans.demo_param == 'demo'
42 | #
43 | # trans.fit(X)
44 | # assert trans.n_features_ == X.shape[1]
45 | #
46 | # X_trans = trans.transform(X)
47 | # assert_allclose(X_trans, np.sqrt(X))
48 | #
49 | # X_trans = trans.fit_transform(X)
50 | # assert_allclose(X_trans, np.sqrt(X))
51 | #
52 | #
53 | # def test_template_classifier(data):
54 | # X, y = data
55 | # clf = TemplateClassifier()
56 | # assert clf.demo_param == 'demo'
57 | #
58 | # clf.fit(X, y)
59 | # assert hasattr(clf, 'classes_')
60 | # assert hasattr(clf, 'X_')
61 | # assert hasattr(clf, 'y_')
62 | #
63 | # y_pred = clf.predict(X)
64 | # assert y_pred.shape == (X.shape[0],)
65 |
--------------------------------------------------------------------------------
/vectorizers/tests/test_transformers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from vectorizers.transformers import (
4 | RowDenoisingTransformer,
5 | InformationWeightTransformer,
6 | CategoricalColumnTransformer,
7 | CountFeatureCompressionTransformer,
8 | SlidingWindowTransformer,
9 | SequentialDifferenceTransformer,
10 | sliding_window_generator,
11 | )
12 | import numpy as np
13 | import scipy.sparse
14 | import pandas as pd
15 | import numba
16 |
17 | test_matrix = scipy.sparse.csr_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
18 | test_matrix_zero_row = scipy.sparse.csr_matrix([[1, 2, 3], [4, 5, 6], [0, 0, 0]])
19 | test_matrix_zero_row.eliminate_zeros()
20 | test_matrix_zero_column = scipy.sparse.csr_matrix([[1, 2, 0], [4, 5, 0], [7, 8, 0]])
21 | test_matrix_zero_column.eliminate_zeros()
22 |
23 | test_df = pd.DataFrame(
24 | {
25 | "id": ["one", "two", "one", "two"],
26 | "A": ["foo", "bar", "pok", "bar"],
27 | "B": ["x", "k", "c", "d"],
28 | }
29 | )
30 |
31 | test_time_series = [
32 | np.random.random(size=23),
33 | np.random.random(size=56),
34 | np.random.random(size=71),
35 | np.random.random(size=64),
36 | np.random.random(size=35),
37 | np.random.random(size=44),
38 | ]
39 |
40 | changepoint_position = np.random.randint(11, 100)  # changepoint position must be at least window_width into the sequence
41 | changepoint_sequence = np.random.poisson(0.75, size=100)
42 | changepoint_sequence[changepoint_position] = 10
43 |
44 |
45 | @pytest.mark.parametrize("include_column_name", [True, False])
46 | @pytest.mark.parametrize("unique_values", [True, False])
47 | def test_CategoricalColumnTransformer(include_column_name, unique_values):
48 | result = CategoricalColumnTransformer(
49 | object_column_name="id",
50 | descriptor_column_name="A",
51 | include_column_name=include_column_name,
52 | unique_values=unique_values,
53 | ).fit_transform(test_df)
54 |
55 | if include_column_name:
56 | if unique_values:
57 | expected_result = pd.Series(
58 | [["A:foo", "A:pok"], ["A:bar"]], index=["one", "two"]
59 | )
60 | else:
61 | expected_result = pd.Series(
62 | [["A:foo", "A:pok"], ["A:bar", "A:bar"]], index=["one", "two"]
63 | )
64 | else:
65 | if unique_values:
66 | expected_result = pd.Series([["foo", "pok"], ["bar"]], index=["one", "two"])
67 | else:
68 | expected_result = pd.Series(
69 | [["foo", "pok"], ["bar", "bar"]], index=["one", "two"]
70 | )
71 | assert (result == expected_result).all()
72 |
73 |
74 | @pytest.mark.parametrize("include_column_name", [True, False])
75 | @pytest.mark.parametrize("unique_values", [True, False])
76 | def test_CategoricalColumnTransformer_multi_column(include_column_name, unique_values):
77 | result = CategoricalColumnTransformer(
78 | object_column_name="id",
79 | descriptor_column_name=["A", "B"],
80 | include_column_name=include_column_name,
81 | unique_values=unique_values,
82 | ).fit_transform(test_df)
83 |
84 | if include_column_name:
85 | if unique_values:
86 | expected_result = pd.Series(
87 | [["A:foo", "A:pok", "B:x", "B:c"], ["A:bar", "B:k", "B:d"]],
88 | index=["one", "two"],
89 | )
90 | else:
91 | expected_result = pd.Series(
92 | [["A:foo", "A:pok", "B:x", "B:c"], ["A:bar", "A:bar", "B:k", "B:d"]],
93 | index=["one", "two"],
94 | )
95 | else:
96 | if unique_values:
97 | expected_result = pd.Series(
98 | [["foo", "pok", "x", "c"], ["bar", "k", "d"]], index=["one", "two"]
99 | )
100 | else:
101 | expected_result = pd.Series(
102 | [["foo", "pok", "x", "c"], ["bar", "bar", "k", "d"]],
103 | index=["one", "two"],
104 | )
105 | assert (result == expected_result).all()
106 |
107 |
108 | def test_CategoricalColumnTransformer_bad_param():
109 | with pytest.raises(ValueError):
110 | CategoricalColumnTransformer(
111 | object_column_name="id",
112 | descriptor_column_name=["A", "BAD"],
113 | ).fit_transform(test_df)
114 |
115 |
116 | @pytest.mark.parametrize("em_precision", [1e-3, 1e-4])
117 | @pytest.mark.parametrize("em_background_prior", [0.1, 10.0])
118 | @pytest.mark.parametrize("em_threshold", [1e-4, 1e-5])
119 | @pytest.mark.parametrize("em_prior_strength", [1.0, 10.0])
120 | @pytest.mark.parametrize("normalize", [True, False])
121 | def test_re_transformer(
122 | em_precision,
123 | em_background_prior,
124 | em_threshold,
125 | em_prior_strength,
126 | normalize,
127 | ):
128 | RET = RowDenoisingTransformer(
129 | em_precision=em_precision,
130 | em_background_prior=em_background_prior,
131 | em_threshold=em_threshold,
132 | em_prior_strength=em_prior_strength,
133 | normalize=normalize,
134 | )
135 | result = RET.fit_transform(test_matrix)
136 | transform = RET.transform(test_matrix)
137 | assert np.allclose(result.toarray(), transform.toarray())
138 |
139 |
140 | @pytest.mark.parametrize("em_precision", [1e-3, 1e-4])
141 | @pytest.mark.parametrize("em_background_prior", [0.1, 10.0])
142 | @pytest.mark.parametrize("em_threshold", [1e-4, 1e-5])
143 | @pytest.mark.parametrize("em_prior_strength", [1.0, 10.0])
144 | @pytest.mark.parametrize("normalize", [True, False])
145 | def test_re_transformer_zero_column(
146 | em_precision,
147 | em_background_prior,
148 | em_threshold,
149 | em_prior_strength,
150 | normalize,
151 | ):
152 | RET = RowDenoisingTransformer(
153 | em_precision=em_precision,
154 | em_background_prior=em_background_prior,
155 | em_threshold=em_threshold,
156 | em_prior_strength=em_prior_strength,
157 | normalize=normalize,
158 | )
159 | result = RET.fit_transform(test_matrix_zero_column)
160 | transform = RET.transform(test_matrix_zero_column)
161 | assert np.allclose(result.toarray(), transform.toarray())
162 |
163 |
164 | @pytest.mark.parametrize("em_precision", [1e-3, 1e-4])
165 | @pytest.mark.parametrize("em_background_prior", [0.1, 10.0])
166 | @pytest.mark.parametrize("em_threshold", [1e-4, 1e-5])
167 | @pytest.mark.parametrize("em_prior_strength", [1.0, 10.0])
168 | @pytest.mark.parametrize("normalize", [True, False])
169 | def test_re_transformer_zero_row(
170 | em_precision,
171 | em_background_prior,
172 | em_threshold,
173 | em_prior_strength,
174 | normalize,
175 | ):
176 | RET = RowDenoisingTransformer(
177 | em_precision=em_precision,
178 | em_background_prior=em_background_prior,
179 | em_threshold=em_threshold,
180 | em_prior_strength=em_prior_strength,
181 | normalize=normalize,
182 | )
183 | result = RET.fit_transform(test_matrix_zero_row)
184 | transform = RET.transform(test_matrix_zero_row)
185 | assert np.allclose(result.toarray(), transform.toarray())
186 |
187 |
188 | @pytest.mark.parametrize("prior_strength", [0.1, 1.0])
189 | @pytest.mark.parametrize("approx_prior", [True, False])
190 | def test_iw_transformer(prior_strength, approx_prior):
191 | IWT = InformationWeightTransformer(
192 | prior_strength=prior_strength,
193 | approx_prior=approx_prior,
194 | )
195 | result = IWT.fit_transform(test_matrix)
196 | transform = IWT.transform(test_matrix)
197 | assert np.allclose(result.toarray(), transform.toarray())
198 |
199 |
200 | @pytest.mark.parametrize("prior_strength", [0.1, 1.0])
201 | @pytest.mark.parametrize("approx_prior", [True, False])
202 | def test_iw_transformer_supervised(prior_strength, approx_prior):
203 | IWT = InformationWeightTransformer(
204 | prior_strength=prior_strength,
205 | approx_prior=approx_prior,
206 | )
207 | result = IWT.fit_transform(test_matrix, np.array([0, 1, 1]))
208 | transform = IWT.transform(test_matrix)
209 | assert np.allclose(result.toarray(), transform.toarray())
210 |
211 |
212 | @pytest.mark.parametrize("prior_strength", [0.1, 1.0])
213 | @pytest.mark.parametrize("approx_prior", [True, False])
214 | def test_iw_transformer_zero_column(prior_strength, approx_prior):
215 | IWT = InformationWeightTransformer(
216 | prior_strength=prior_strength,
217 | approx_prior=approx_prior,
218 | )
219 | result = IWT.fit_transform(test_matrix_zero_column)
220 | transform = IWT.transform(test_matrix_zero_column)
221 | assert np.allclose(result.toarray(), transform.toarray())
222 |
223 |
224 | @pytest.mark.parametrize("prior_strength", [0.1, 1.0])
225 | @pytest.mark.parametrize("approx_prior", [True, False])
226 | def test_iw_transformer_zero_row(prior_strength, approx_prior):
227 | IWT = InformationWeightTransformer(
228 | prior_strength=prior_strength,
229 | approx_prior=approx_prior,
230 | )
231 | result = IWT.fit_transform(test_matrix_zero_row)
232 | transform = IWT.transform(test_matrix_zero_row)
233 | assert np.allclose(result.toarray(), transform.toarray())
234 |
235 |
236 | @pytest.mark.parametrize("algorithm", ["randomized", "arpack"])
237 | def test_count_feature_compression_basic(algorithm):
238 | cfc = CountFeatureCompressionTransformer(n_components=2, algorithm=algorithm)
239 | result = cfc.fit_transform(test_matrix)
240 | transform = cfc.transform(test_matrix)
241 | assert np.allclose(result, transform)
242 |
243 |
244 | @pytest.mark.parametrize("algorithm", ["randomized", "arpack"])
245 | def test_count_feature_compression_fit_transform_is_fit_and_transform(algorithm):
246 | make_cfc = lambda: CountFeatureCompressionTransformer(n_components=2, algorithm=algorithm)
247 | cfc_fit = make_cfc().fit(test_matrix)
248 | assert np.allclose(cfc_fit.transform(test_matrix), make_cfc().fit_transform(test_matrix))
249 |
250 |
251 | def test_count_feature_compression_warns():
252 | cfc = CountFeatureCompressionTransformer(n_components=5)
253 | with pytest.warns(UserWarning):
254 | result = cfc.fit_transform(test_matrix)
255 |
256 |
257 | def test_count_feature_compression_bad_input():
258 | cfc = CountFeatureCompressionTransformer(n_components=2)
259 | with pytest.raises(ValueError):
260 | result = cfc.fit_transform(-test_matrix)
261 |
262 | with pytest.raises(ValueError):
263 | result = cfc.fit_transform(-test_matrix.toarray())
264 |
265 | cfc = CountFeatureCompressionTransformer(n_components=2, algorithm="bad_value")
266 | with pytest.raises(ValueError):
267 | result = cfc.fit_transform(test_matrix)
268 |
269 |
270 | @pytest.mark.parametrize("pad_width", [0, 1])
271 | @pytest.mark.parametrize(
272 | "kernel",
273 | [
274 | "average",
275 | ("differences", 0, 1, 1),
276 | ("position_velocity", 2, 1, 1),
277 | ("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])),
278 | ("gaussian_weight", 2),
279 | np.random.random((5, 5)),
280 | numba.njit(lambda x: x.cumsum()),
281 | ],
282 | )
283 | @pytest.mark.parametrize("sample", [None, (0, 1), np.arange(5), [4, 1, 3, 2, 0]])
284 | def test_sliding_window_transformer_basic(pad_width, kernel, sample):
285 | swt = SlidingWindowTransformer(
286 | window_width=5, pad_width=pad_width, kernels=[kernel], window_sample=sample
287 | )
288 | result = swt.fit_transform(test_time_series)
289 | transform = swt.transform(test_time_series)
290 | for i, point_cloud in enumerate(result):
291 | for j, point in enumerate(point_cloud):
292 | assert np.allclose(point, transform[i][j])
293 |
294 |
295 | @pytest.mark.parametrize("pad_width", [0, 1])
296 | @pytest.mark.parametrize(
297 | "kernel",
298 | [
299 | "average",
300 | ("differences", 0, 1, 1),
301 | ("position_velocity", 2, 1, 1),
302 | ("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])),
303 | ("gaussian_weight", 2),
304 | np.random.random((5, 5)),
305 | numba.njit(lambda x: x.cumsum(), cache=True),
306 | ],
307 | )
308 | @pytest.mark.parametrize("sample", [None, np.arange(5), [4, 1, 3, 2, 0]])
309 | def test_sliding_window_generator_matches_transformer(pad_width, kernel, sample):
310 | swt = SlidingWindowTransformer(
311 | window_width=5, pad_width=pad_width, kernels=[kernel], window_sample=sample
312 | )
313 | transformer_result = swt.fit_transform(test_time_series)
314 | test_window = (
315 | None
316 | if not callable(kernel)
317 | else np.asarray(test_time_series[0])[: swt.window_width][swt.window_sample_]
318 | )
319 | generator_result = list(
320 | sliding_window_generator(
321 | test_time_series,
322 | test_time_series[0].shape,
323 | window_width=5,
324 | pad_width=pad_width,
325 | kernels=[kernel],
326 | window_sample=sample,
327 | test_window=test_window,
328 | )
329 | )
330 | for i, point_cloud in enumerate(transformer_result):
331 | for j, point in enumerate(point_cloud):
332 | assert np.allclose(point, generator_result[i][j])
333 |
334 | @pytest.mark.parametrize("window_width", [5, 10])
335 | def test_sliding_window_count_changepoint(window_width):
336 | swt = SlidingWindowTransformer(
337 | window_width=window_width, kernels=[("count_changepoint", 1.0, 2.0)],
338 | )
339 | changepoint_scores = swt.fit_transform([changepoint_sequence])[0].flatten()
340 | assert np.argmax(changepoint_scores) + window_width - 1 == changepoint_position
341 |
342 | @pytest.mark.parametrize("pad_width", [0, 1])
343 | @pytest.mark.parametrize(
344 | "kernel",
345 | [
346 | "average",
347 | ("differences", 0, 1, 1),
348 | ("position_velocity", 2, 1, 1),
349 | ("weight", np.array([0.1, 0.75, 1.5, 1.0, 0.25])),
350 | np.random.random((5, 5)),
351 | numba.njit(lambda x: x.cumsum()),
352 | ],
353 | )
354 | @pytest.mark.parametrize("sample", [None, np.arange(5), [4, 1, 3, 2, 0]])
355 | def test_sliding_window_transformer_basic_w_lists(pad_width, kernel, sample):
356 | swt = SlidingWindowTransformer(
357 | window_width=5, pad_width=pad_width, kernels=[kernel], window_sample=sample
358 | )
359 | result = swt.fit_transform([list(x) for x in test_time_series])
360 | transform = swt.transform([list(x) for x in test_time_series])
361 | for i, point_cloud in enumerate(result):
362 | for j, point in enumerate(point_cloud):
363 | assert np.allclose(point, transform[i][j])
364 |
365 |
366 | def test_sliding_window_transformer_w_sampling():
367 | swt = SlidingWindowTransformer(window_sample="random", window_sample_size=5)
368 | result = swt.fit_transform(test_time_series)
369 | transform = swt.transform(test_time_series)
370 | for i, point_cloud in enumerate(result):
371 | for j, point in enumerate(point_cloud):
372 | assert np.allclose(point, transform[i][j])
373 |
374 |
375 | def test_sliding_window_transformer_bad_params():
376 | swt = SlidingWindowTransformer(window_sample="foo")
377 | with pytest.raises(ValueError):
378 | result = swt.fit_transform(test_time_series)
379 |
380 | swt = SlidingWindowTransformer(window_sample=("foo", "bar"))
381 | with pytest.raises(ValueError):
382 | result = swt.fit_transform(test_time_series)
383 |
384 | swt = SlidingWindowTransformer(window_sample=1.105)
385 | with pytest.raises(ValueError):
386 | result = swt.fit_transform(test_time_series)
387 |
388 | swt = SlidingWindowTransformer(window_width=-1)
389 | with pytest.raises(ValueError):
390 | result = swt.fit_transform(test_time_series)
391 |
392 | swt = SlidingWindowTransformer(kernels=["not a kernel"])
393 | with pytest.raises(ValueError):
394 | result = swt.fit_transform(test_time_series)
395 |
396 | swt = SlidingWindowTransformer(kernels=-1)
397 | with pytest.raises(ValueError):
398 | result = swt.fit_transform(test_time_series)
399 |
400 | swt = SlidingWindowTransformer(kernels=np.array([[1, 2, 3], [1, 2, 3]]))
401 | with pytest.raises(ValueError):
402 | result = swt.fit_transform(test_time_series)
403 |
404 | def test_seq_diff_transformer_basic():
405 | sdt = SequentialDifferenceTransformer()
406 | diffs = sdt.fit_transform(test_time_series)
407 | transform_diffs = sdt.transform(test_time_series)
408 | for i, seq_diffs in enumerate(diffs):
409 | assert np.allclose(np.array(seq_diffs), np.array(transform_diffs[i]))
410 | assert np.allclose(test_time_series[i][:-1] + np.ravel(seq_diffs), test_time_series[i][1:])
--------------------------------------------------------------------------------
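The sliding-window and sequential-difference tests above correspond to usage along these lines (a minimal sketch with arbitrary toy series; parameter values mirror those exercised in the tests):

import numpy as np
from vectorizers.transformers import (
    SlidingWindowTransformer,
    SequentialDifferenceTransformer,
)

# Two arbitrary 1D time series of different lengths.
series = [np.random.random(size=30), np.random.random(size=45)]

# Turn each series into a point cloud of kernel-smoothed sliding windows.
swt = SlidingWindowTransformer(window_width=5, kernels=["average"])
point_clouds = swt.fit_transform(series)  # one point cloud per input series

# First differences of each series, one list of diffs per input series.
sdt = SequentialDifferenceTransformer()
diffs = sdt.fit_transform(series)
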
/vectorizers/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | from vectorizers.transformers.categorical_columns import CategoricalColumnTransformer
2 | from vectorizers.transformers.info_weight import (
3 | InformationWeightTransformer,
4 | information_weight,
5 | )
6 | from vectorizers.transformers.row_desnoise import RowDenoisingTransformer
7 | from vectorizers.transformers.sliding_windows import (
8 | SlidingWindowTransformer,
9 | SequentialDifferenceTransformer,
10 | sliding_window_generator,
11 | )
12 | from vectorizers.transformers.count_feature_compression import (
13 | CountFeatureCompressionTransformer,
14 | )
15 |
16 | __all__ = [
17 | "CategoricalColumnTransformer",
18 | "InformationWeightTransformer",
19 | "RowDenoisingTransformer",
20 | "SlidingWindowTransformer",
21 | "SequentialDifferenceTransformer",
22 | "CountFeatureCompressionTransformer",
23 | "information_weight",
24 | "sliding_window_generator",
25 | ]
26 |
--------------------------------------------------------------------------------
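Everything exported here follows the scikit-learn transformer API, so the pieces compose with standard pipelines; a minimal sketch with arbitrary toy data and parameter choices:

import scipy.sparse
from sklearn.pipeline import make_pipeline
from vectorizers.transformers import (
    InformationWeightTransformer,
    CountFeatureCompressionTransformer,
)

# A random non-negative sparse matrix standing in for count data.
counts = scipy.sparse.random(200, 500, density=0.05, format="csr", random_state=0)

# Re-weight columns by information content, then compress to a dense embedding.
pipeline = make_pipeline(
    InformationWeightTransformer(),
    CountFeatureCompressionTransformer(n_components=32, random_state=0),
)
embedding = pipeline.fit_transform(counts)  # shape (200, 32)
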
/vectorizers/transformers/categorical_columns.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.base import BaseEstimator, TransformerMixin
3 |
4 | from warnings import warn
5 |
6 |
7 | class CategoricalColumnTransformer(BaseEstimator, TransformerMixin):
8 | """
9 | This transformer is useful for describing an object as a bag of the categorical values that
10 | have been used to represent it within a pandas DataFrame.
11 |
12 |     It takes a categorical column name to group by, object_column_name, and one
13 | or more categorical columns to be used to describe these objects,
14 | descriptor_column_name. Then it returns a Series with an index being the
15 | unique entries of your object_column_name and the values being a list of
16 | the appropriate categorical values from your descriptor_column_name.
17 |
18 | It can be thought of as a PivotTableTransformer if you'd like.
19 |
20 | Parameters
21 | ----------
22 | object_column_name: string
23 | The column name from the DataFrame where our object values can be found.
24 | This will be the thing we are grouping by.
25 |
26 | descriptor_column_name: string or list
27 |         The name or names of the categorical column(s) whose values will be used for describing our
28 | objects. If you are using multiple names it's recommended that you set include_column_name=True.
29 |
30 | include_column_name: bool (default = False)
31 |         Should the column name be prepended to each value?
32 | This is useful if you intend to combine values from multiple categorical columns
33 | after the fact.
34 |
35 | unique_values: bool (default = False)
36 |         Should we restrict to the unique values in each column before building our list representation?
37 |
38 | """
39 |
40 | def __init__(
41 | self,
42 | object_column_name,
43 | descriptor_column_name,
44 | include_column_name=False,
45 | unique_values=False,
46 | ):
47 | self.object_column_name = object_column_name
48 | self.descriptor_column_name = descriptor_column_name
49 | # Get everything on consistent footing so we don't have to handle multiple cases.
50 | if type(self.descriptor_column_name) == str:
51 | self.descriptor_column_name_ = [self.descriptor_column_name]
52 | else:
53 | self.descriptor_column_name_ = self.descriptor_column_name
54 | self.include_column_name = include_column_name
55 | self.unique_values = unique_values
56 |
57 | if (
58 | (self.include_column_name is False)
59 | and (type(self.descriptor_column_name) == list)
60 | and (len(self.descriptor_column_name) > 1)
61 | ):
62 | warn(
63 | "It is recommended that if you are aggregating "
64 | "multiple columns that you set include_column_name=True"
65 | )
66 |
67 | def fit_transform(self, X, y=None, **fit_params):
68 | """
69 | This transformer is useful for describing an object as a bag of the categorical values that
70 | have been used to represent it within a pandas DataFrame.
71 |
72 |         It takes a categorical column name to group by, object_column_name, and one or more
73 | categorical columns to be used to describe these objects, descriptor_column_name.
74 | Then it returns a Series with an index being the unique entries of your object_column_name
75 | and the values being a list of the appropriate categorical values from your descriptor_column_name.
76 |
77 | Parameters
78 | ----------
79 | X: pd.DataFrame
80 |             A pandas DataFrame with columns whose names match those specified in the object_column_name and
81 | descriptor_column_name of the constructor.
82 |
83 | Returns
84 | -------
85 | pandas Series
86 | Series with an index being the unique entries of your object_column_name
87 | and the values being a list of the appropriate categorical values from your descriptor_column_name.
88 | """
89 | # Check that the dataframe has the appropriate columns
90 | required_columns = set([self.object_column_name] + self.descriptor_column_name_)
91 | if not required_columns.issubset(X.columns):
92 | raise ValueError(
93 | f"Sorry the required column(s) {set(required_columns).difference(set(X.columns))} are not "
94 | f"present in your data frame. \n"
95 | f"Please either specify a new instance or apply to a different data frame. "
96 | )
97 |
98 | # Compute a single groupby ahead of time to save on compute
99 | grouped_frame = X.groupby(self.object_column_name)
100 | aggregated_columns = []
101 | for column in self.descriptor_column_name_:
102 | if self.include_column_name:
103 | if self.unique_values:
104 | aggregated_columns.append(
105 | grouped_frame[column].agg(
106 | lambda x: [
107 | column + ":" + value
108 | for value in x.unique()
109 | if pd.notna(value)
110 | ]
111 | )
112 | )
113 | else:
114 | aggregated_columns.append(
115 | grouped_frame[column].agg(
116 | lambda x: [
117 | column + ":" + value for value in x if pd.notna(value)
118 | ]
119 | )
120 | )
121 | else:
122 | if self.unique_values:
123 | aggregated_columns.append(
124 | grouped_frame[column].agg(
125 | lambda x: [value for value in x.unique() if pd.notna(value)]
126 | )
127 | )
128 | else:
129 | aggregated_columns.append(
130 | grouped_frame[column].agg(
131 | lambda x: [value for value in x if pd.notna(value)]
132 | )
133 | )
134 | reduced = pd.concat(aggregated_columns, axis="columns").sum(axis=1)
135 | return reduced
136 |
137 | def fit(self, X, y=None, **fit_params):
138 | self.fit_transform(X, y, **fit_params)
139 | return self
140 |
--------------------------------------------------------------------------------
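A small usage sketch of the transformer above, using a toy DataFrame of the same shape as the one in the tests (output ordering follows the pandas groupby of the id column):

import pandas as pd
from vectorizers.transformers import CategoricalColumnTransformer

df = pd.DataFrame(
    {
        "id": ["one", "two", "one", "two"],
        "A": ["foo", "bar", "pok", "bar"],
        "B": ["x", "k", "c", "d"],
    }
)

transformer = CategoricalColumnTransformer(
    object_column_name="id",
    descriptor_column_name=["A", "B"],
    include_column_name=True,
)
result = transformer.fit_transform(df)
# result is a pandas Series indexed by "id":
# one    [A:foo, A:pok, B:x, B:c]
# two    [A:bar, A:bar, B:k, B:d]
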
/vectorizers/transformers/count_feature_compression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.base import BaseEstimator, TransformerMixin
3 | from sklearn.utils.validation import (
4 | check_is_fitted,
5 | check_random_state,
6 | )
7 | from sklearn.preprocessing import normalize
8 | import scipy.sparse
9 | from sklearn.utils.extmath import randomized_svd, svd_flip
10 | from scipy.sparse.linalg import svds
11 |
12 | from warnings import warn
13 |
14 |
15 | class CountFeatureCompressionTransformer(BaseEstimator, TransformerMixin):
16 | """Large sparse high dimensional matrices of count based, or strictly
17 | non-negative features are common. This transformer provides a simple
18 | but often very effective dimension reduction approach to provide a
19 | dense representation of the data that is amenable to cosine based distance
20 | measures.
21 |
22 | Parameters
23 | ----------
24 | n_components: int (optional, default=128)
25 | The number of dimensions to use for the dense reduced representation.
26 |
27 | n_iter: int (optional, default=7)
28 | If using the ``"randomized"`` algorithm for SVD then use this number of
29 |         iterations to estimate the SVD.
30 |
31 | algorithm: string (optional, default="randomized")
32 | The algorithm to use internally for the SVD step. Should be one of
33 | * "arpack"
34 | * "randomized"
35 |
36 | random_state: int, np.random_state or None (optional, default=None)
37 | If using the ``"randomized"`` algorithm for SVD then use this as the
38 | random state (or random seed).
39 | """
40 |
41 | def __init__(
42 | self,
43 | n_components=128,
44 | n_iter=7,
45 | algorithm="randomized",
46 | random_state=None,
47 | rescaling_power=0.5,
48 | ):
49 | self.n_components = n_components
50 | self.n_iter = n_iter
51 | self.algorithm = algorithm
52 | self.random_state = random_state
53 | self.rescaling_power = rescaling_power
54 |
55 | def fit_transform(self, X, y=None, **fit_params):
56 | """
57 |         Given a dataset of count-based features (i.e. strictly non-negative)
58 | perform feature compression / dimension reduction to provide
59 | a dataset with ``self.n_components`` dimensions suitable for
60 | measuring distances using cosine distance.
61 |
62 | Parameters
63 | ----------
64 | X: ndarray or sparse matrix of shape (n_samples, n_features)
65 | The input data to be transformed.
66 |
67 | Returns
68 | -------
69 | result: ndarray of shape (n_samples, n_components)
70 | The dimension reduced representation of the input.
71 | """
72 | # Handle too large an n_components value somewhat gracefully
73 | if self.n_components >= X.shape[1]:
74 | warn(
75 | f"Warning: n_components is {self.n_components} but input has only {X.shape[1]} features!"
76 | f"No compression will be performed."
77 | )
78 | self.components_ = np.eye(X.shape[1])
79 | self.component_scaling_ = np.ones(X.shape[1])
80 | return X
81 |
82 | if scipy.sparse.isspmatrix(X):
83 | if np.any(X.data < 0.0):
84 |                 raise ValueError("All entries in input must be non-negative!")
85 | else:
86 | if np.any(X < 0.0):
87 |                 raise ValueError("All entries in input must be non-negative!")
88 |
89 | normed_data = normalize(X)
90 | rescaled_data = scipy.sparse.csr_matrix(normed_data)
91 | rescaled_data.data = np.power(normed_data.data, self.rescaling_power)
92 | if self.algorithm == "arpack":
93 | u, s, v = svds(rescaled_data, k=self.n_components)
94 | elif self.algorithm == "randomized":
95 | random_state = check_random_state(self.random_state)
96 | u, s, v = randomized_svd(
97 | rescaled_data,
98 | n_components=self.n_components,
99 | n_iter=self.n_iter,
100 | random_state=random_state,
101 | )
102 | else:
103 | raise ValueError("algorithm should be one of 'arpack' or 'randomized'")
104 |
105 | u, v = svd_flip(u, v)
106 | self.component_scaling_ = np.sqrt(s)
107 | self.components_ = v
108 | self.metric_ = "cosine"
109 |
110 | result = u * self.component_scaling_
111 |
112 | return result
113 |
114 | def fit(self, X, y=None, **fit_params):
115 | """
116 |         Given a dataset of count-based features (i.e. strictly non-negative)
117 | learn a feature compression / dimension reduction to provide
118 | a dataset with ``self.n_components`` dimensions suitable for
119 | measuring distances using cosine distance.
120 |
121 | Parameters
122 | ----------
123 | X: ndarray or sparse matrix of shape (n_samples, n_features)
124 | The input data to be transformed.
125 | """
126 | self.fit_transform(X, y, **fit_params)
127 | return self
128 |
129 | def transform(self, X, y=None):
130 | """
131 |         Given a dataset of count-based features (i.e. strictly non-negative)
132 | perform the learned feature compression / dimension reduction.
133 |
134 | Parameters
135 | ----------
136 | X: ndarray or sparse matrix of shape (n_samples, n_features)
137 | The input data to be transformed.
138 |
139 | Returns
140 | -------
141 | result: ndarray of shape (n_samples, n_components)
142 | The dimension reduced representation of the input.
143 | """
144 | check_is_fitted(
145 | self,
146 | ["components_", "component_scaling_"],
147 | )
148 | normed_data = normalize(X)
149 | rescaled_data = scipy.sparse.csr_matrix(normed_data)
150 | rescaled_data.data = np.power(normed_data.data, self.rescaling_power)
151 |
152 | result = (rescaled_data @ self.components_.T) / self.component_scaling_
153 |
154 | return result
155 |
--------------------------------------------------------------------------------
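A minimal usage sketch of the transformer above (arbitrary toy matrices; the embedding values themselves are data dependent):

import scipy.sparse
from vectorizers.transformers import CountFeatureCompressionTransformer

# Random non-negative sparse matrices standing in for count data.
train = scipy.sparse.random(100, 300, density=0.02, format="csr", random_state=1)
new = scipy.sparse.random(10, 300, density=0.02, format="csr", random_state=2)

cfc = CountFeatureCompressionTransformer(n_components=16, random_state=1)
embedding = cfc.fit_transform(train)  # dense, shape (100, 16)
new_embedding = cfc.transform(new)    # dense, shape (10, 16), same learned components
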
/vectorizers/transformers/info_weight.py:
--------------------------------------------------------------------------------
1 | import numba
2 | import numpy as np
3 | from sklearn.base import BaseEstimator, TransformerMixin
4 | import scipy.sparse
5 |
6 | MOCK_TARGET = np.ones(1, dtype=np.int64)
7 |
8 |
9 | @numba.njit(nogil=True)
10 | def column_kl_divergence_exact_prior(
11 | count_indices,
12 | count_data,
13 | baseline_probabilities,
14 | prior_strength=0.1,
15 | target=MOCK_TARGET,
16 | ):
17 | observed_norm = count_data.sum() + prior_strength
18 | observed_zero_constant = (prior_strength / observed_norm) * np.log(
19 | prior_strength / observed_norm
20 | )
21 | result = 0.0
22 | count_indices_set = set(count_indices)
23 | for i in range(baseline_probabilities.shape[0]):
24 | if i in count_indices_set:
25 | idx = np.searchsorted(count_indices, i)
26 | observed_probability = (
27 | count_data[idx] + prior_strength * baseline_probabilities[i]
28 | ) / observed_norm
29 | if observed_probability > 0.0:
30 | result += observed_probability * np.log(
31 | observed_probability / baseline_probabilities[i]
32 | )
33 | else:
34 | result += baseline_probabilities[i] * observed_zero_constant
35 |
36 | return result
37 |
38 |
39 | @numba.njit(nogil=True)
40 | def column_kl_divergence_approx_prior(
41 | count_indices,
42 | count_data,
43 | baseline_probabilities,
44 | prior_strength=0.1,
45 | target=MOCK_TARGET,
46 | ):
47 | observed_norm = count_data.sum() + prior_strength
48 | observed_zero_constant = (prior_strength / observed_norm) * np.log(
49 | prior_strength / observed_norm
50 | )
51 | result = 0.0
52 | zero_count_component_estimate = (
53 | np.mean(baseline_probabilities)
54 | * observed_zero_constant
55 | * (baseline_probabilities.shape[0] - count_indices.shape[0])
56 | )
57 | result += zero_count_component_estimate
58 | for i in range(count_indices.shape[0]):
59 | idx = count_indices[i]
60 | observed_probability = (
61 | count_data[i] + prior_strength * baseline_probabilities[idx]
62 | ) / observed_norm
63 | if observed_probability > 0.0 and baseline_probabilities[idx] > 0:
64 | result += observed_probability * np.log(
65 | observed_probability / baseline_probabilities[idx]
66 | )
67 |
68 | return result
69 |
70 |
71 | @numba.njit(nogil=True)
72 | def supervised_column_kl(
73 | count_indices,
74 | count_data,
75 | baseline_probabilities,
76 | prior_strength=0.1,
77 | target=MOCK_TARGET,
78 | ):
79 | observed = np.zeros_like(baseline_probabilities)
80 | for i in range(count_indices.shape[0]):
81 | idx = count_indices[i]
82 | label = target[idx]
83 | observed[label] += count_data[i]
84 |
85 | observed += prior_strength * baseline_probabilities
86 | observed /= observed.sum()
87 |
88 | return np.sum(observed * np.log(observed / baseline_probabilities))
89 |
90 |
91 | @numba.njit(nogil=True, parallel=True)
92 | def column_weights(
93 | indptr,
94 | indices,
95 | data,
96 | baseline_probabilities,
97 | column_kl_divergence_func,
98 | prior_strength=0.1,
99 | target=MOCK_TARGET,
100 | ):
101 | n_cols = indptr.shape[0] - 1
102 | weights = np.ones(n_cols)
103 | for i in numba.prange(n_cols):
104 | weights[i] = column_kl_divergence_func(
105 | indices[indptr[i] : indptr[i + 1]],
106 | data[indptr[i] : indptr[i + 1]],
107 | baseline_probabilities,
108 | prior_strength=prior_strength,
109 | target=target,
110 | )
111 | return weights
112 |
113 |
114 | def information_weight(data, prior_strength=0.1, approximate_prior=False, target=None):
115 | """Compute information based weights for columns. The information weight
116 | is estimated as the amount of information gained by moving from a baseline
117 | model to a model derived from the observed counts. In practice this can be
118 | computed as the KL-divergence between distributions. For the baseline model
119 | we assume data will be distributed according to the row sums -- i.e.
120 | proportional to the frequency of the row. For the observed counts we use
121 | a background prior of pseudo counts equal to ``prior_strength`` times the
122 | baseline prior distribution. The Bayesian prior can either be computed
123 | exactly (the default) at some computational expense, or estimated for a much
124 |     faster computation, often suitable for large or very sparse datasets.
125 |
126 | Parameters
127 | ----------
128 | data: scipy sparse matrix (n_samples, n_features)
129 | A matrix of count data where rows represent observations and
130 | columns represent features. Column weightings will be learned
131 | from this data.
132 |
133 | prior_strength: float (optional, default=0.1)
134 | How strongly to weight the prior when doing a Bayesian update to
135 | derive a model based on observed counts of a column.
136 |
137 | approximate_prior: bool (optional, default=False)
138 | Whether to approximate weights based on the Bayesian prior or perform
139 |         exact computations. Approximations are much faster, especially for very
140 | large or very sparse datasets.
141 |
142 | target: ndarray or None (optional, default=None)
143 | If supervised target labels are available, these can be used to define distributions
144 | over the target classes rather than over rows, allowing weights to be
145 | supervised and target based. If None then unsupervised weighting is used.
146 |
147 | Returns
148 | -------
149 | weights: ndarray of shape (n_features,)
150 | The learned weights to be applied to columns based on the amount
151 | of information provided by the column.
152 | """
153 | if approximate_prior:
154 | column_kl_divergence_func = column_kl_divergence_approx_prior
155 | else:
156 | column_kl_divergence_func = column_kl_divergence_exact_prior
157 |
158 | baseline_counts = np.squeeze(np.array(data.sum(axis=1)))
159 | if target is None:
160 | baseline_probabilities = baseline_counts / baseline_counts.sum()
161 | else:
162 | baseline_probabilities = np.zeros(target.max() + 1)
163 | for i in range(baseline_probabilities.shape[0]):
164 | baseline_probabilities[i] = baseline_counts[target == i].sum()
165 | baseline_probabilities /= baseline_probabilities.sum()
166 | column_kl_divergence_func = supervised_column_kl
167 |
168 | csc_data = data.tocsc()
169 | csc_data.sort_indices()
170 |
171 | weights = column_weights(
172 | csc_data.indptr,
173 | csc_data.indices,
174 | csc_data.data,
175 | baseline_probabilities,
176 | column_kl_divergence_func,
177 | prior_strength=prior_strength,
178 | target=target,
179 | )
180 | return weights
181 |
182 |
183 | class InformationWeightTransformer(BaseEstimator, TransformerMixin):
184 | """A data transformer that re-weights columns of count data. Column weights
185 | are computed as information based weights for columns. The information weight
186 | is estimated as the amount of information gained by moving from a baseline
187 | model to a model derived from the observed counts. In practice this can be
188 | computed as the KL-divergence between distributions. For the baseline model
189 | we assume data will be distributed according to the row sums -- i.e.
190 | proportional to the frequency of the row. For the observed counts we use
191 | a background prior of pseudo counts equal to ``prior_strength`` times the
192 | baseline prior distribution. The Bayesian prior can either be computed
193 |     exactly at some computational expense, or estimated (the default here) for
194 |     a much faster computation, often suitable for large or very sparse datasets.
195 |
196 | Parameters
197 | ----------
198 |     prior_strength: float (optional, default=1e-4)
199 |         How strongly to weight the prior when doing a Bayesian update to
200 |         derive a model based on observed counts of a column.
201 |
202 |     approx_prior: bool (optional, default=True)
203 |         Whether to approximate weights based on the Bayesian prior or perform
204 |         exact computations. Approximations are much faster, especially for very
205 |         large or very sparse datasets.
206 |
207 | Attributes
208 | ----------
209 |
210 | information_weights_: ndarray of shape (n_features,)
211 | The learned weights to be applied to columns based on the amount
212 | of information provided by the column.
213 | """
214 |
215 | def __init__(
216 | self,
217 | prior_strength=1e-4,
218 | approx_prior=True,
219 | weight_power=2.0,
220 | supervision_weight=0.95,
221 | ):
222 | self.prior_strength = prior_strength
223 | self.approx_prior = approx_prior
224 | self.weight_power = weight_power
225 | self.supervision_weight = supervision_weight
226 |
227 | def fit(self, X, y=None, **fit_kwds):
228 | """Learn the appropriate column weighting as information weights
229 | from the observed count data ``X``.
230 |
231 | Parameters
232 | ----------
233 | X: ndarray of scipy sparse matrix of shape (n_samples, n_features)
234 | The count data to be trained on. Note that, as count data all
235 | entries should be positive or zero.
236 |
237 | Returns
238 | -------
239 | self:
240 | The trained model.
241 | """
242 | if not scipy.sparse.isspmatrix(X):
243 | X = scipy.sparse.csc_matrix(X)
244 |
245 | self.information_weights_ = information_weight(
246 | X, self.prior_strength, self.approx_prior
247 | )
248 |
249 | if y is not None:
250 | unsupervised_power = (1.0 - self.supervision_weight) * self.weight_power
251 | supervised_power = self.supervision_weight * self.weight_power
252 |
253 | self.information_weights_ /= np.mean(self.information_weights_)
254 | self.information_weights_ = np.maximum(self.information_weights_, 0.0)
255 | self.information_weights_ = np.power(
256 | self.information_weights_, unsupervised_power
257 | )
258 |
259 | target_classes = np.unique(y)
260 | target_dict = dict(
261 | np.vstack((target_classes, np.arange(target_classes.shape[0]))).T
262 | )
263 | target = np.array(
264 | [np.int64(target_dict[label]) for label in y], dtype=np.int64
265 | )
266 | self.supervised_weights_ = information_weight(
267 | X, self.prior_strength, self.approx_prior, target=target
268 | )
269 | self.supervised_weights_ /= np.mean(self.supervised_weights_)
270 | self.supervised_weights_ = np.maximum(self.supervised_weights_, 0.0)
271 | self.supervised_weights_ = np.power(
272 | self.supervised_weights_, supervised_power
273 | )
274 |
275 | self.information_weights_ = (
276 | self.information_weights_ * self.supervised_weights_
277 | )
278 | else:
279 | self.information_weights_ /= np.mean(self.information_weights_)
280 | self.information_weights_ = np.maximum(self.information_weights_, 0.0)
281 | self.information_weights_ = np.power(
282 | self.information_weights_, self.weight_power
283 | )
284 |
285 | return self
286 |
287 | def transform(self, X):
288 | """Reweight data ``X`` based on learned information weights of columns.
289 |
290 | Parameters
291 | ----------
292 | X: ndarray of scipy sparse matrix of shape (n_samples, n_features)
293 | The count data to be transformed. Note that, as count data all
294 | entries should be positive or zero.
295 |
296 | Returns
297 | -------
298 | result: ndarray of scipy sparse matrix of shape (n_samples, n_features)
299 | The reweighted data.
300 | """
301 | result = X @ scipy.sparse.diags(self.information_weights_)
302 | return result
303 |
--------------------------------------------------------------------------------
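A minimal usage sketch of the function and transformer above, on the same small dense count matrix the tests use (the supervised labels are just an illustration):

import numpy as np
import scipy.sparse
from vectorizers.transformers import InformationWeightTransformer, information_weight

counts = scipy.sparse.csr_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Raw per-column information weights from the underlying function.
weights = information_weight(counts, prior_strength=0.1, approximate_prior=False)

# Unsupervised transformer usage: columns are rescaled by their learned weights.
iwt = InformationWeightTransformer(prior_strength=0.1, approx_prior=True)
reweighted = iwt.fit_transform(counts)

# Supervised usage: pass class labels so the weights also reflect how much a
# column tells us about the target classes.
labels = np.array([0, 1, 1])
reweighted_supervised = InformationWeightTransformer().fit_transform(counts, labels)
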
/vectorizers/transformers/row_desnoise.py:
--------------------------------------------------------------------------------
1 | import numba
2 | import numpy as np
3 | from sklearn.base import BaseEstimator, TransformerMixin
4 | from sklearn.utils.validation import check_is_fitted
5 | from sklearn.preprocessing import normalize
6 | import scipy.sparse
7 |
8 | from warnings import warn
9 |
10 |
11 | @numba.njit()
12 | def numba_multinomial_em_sparse(
13 | indptr,
14 | inds,
15 | data,
16 | background,
17 | precision=1e-7,
18 | low_thresh=1e-5,
19 | bg_prior=5.0,
20 | prior_strength=0.3,
21 | ):
22 | result = np.zeros(data.shape[0], dtype=np.float32)
23 | mix_weights = np.zeros(indptr.shape[0] - 1, dtype=np.float32)
24 |
25 | prior = np.array([1.0, bg_prior]) * prior_strength
26 | mp = 1.0 + 1.0 * np.sum(prior)
27 |
28 | for i in range(indptr.shape[0] - 1):
29 | indices = inds[indptr[i] : indptr[i + 1]]
30 | row_data = data[indptr[i] : indptr[i + 1]]
31 |
32 | row_background = np.zeros_like(row_data)
33 | for idx in range(indices.shape[0]):
34 | j = indices[idx]
35 | row_background[idx] = background[j]
36 |
37 | row_background = row_background / row_background.sum()
38 |
39 | mix_param = 0.5
40 | current_dist = mix_param * row_data + (1.0 - mix_param) * row_background
41 |
42 | last_mix_param = mix_param
43 | change_magnitude = 1.0
44 |
45 | while (
46 | change_magnitude > precision
47 | and mix_param > precision
48 | and mix_param < 1.0 - precision
49 | ):
50 |
51 | posterior_dist = current_dist * mix_param
52 | posterior_dist /= current_dist * mix_param + row_background * (
53 | 1.0 - mix_param
54 | )
55 |
56 | current_dist = posterior_dist * row_data
57 | mix_param = (current_dist.sum() + prior[0]) / mp
58 | current_dist = current_dist / current_dist.sum()
59 |
60 | change_magnitude = np.abs(mix_param - last_mix_param)
61 | last_mix_param = mix_param
62 |
63 | # zero out any small values
64 | norm = 0.0
65 | for n in range(current_dist.shape[0]):
66 | if current_dist[n] < low_thresh:
67 | current_dist[n] = 0.0
68 | else:
69 | norm += current_dist[n]
70 | current_dist /= norm
71 |
72 | result[indptr[i] : indptr[i + 1]] = current_dist
73 | mix_weights[i] = mix_param
74 |
75 | return result, mix_weights
76 |
77 |
78 | def multinomial_em_sparse(
79 | matrix,
80 | background,
81 | precision=1e-7,
82 | low_thresh=1e-5,
83 | bg_prior=5.0,
84 | prior_strength=0.3,
85 | ):
86 | if scipy.sparse.isspmatrix_csr(matrix):
87 | result = matrix.copy().astype(np.float32)
88 | else:
89 | result = matrix.tocsr().astype(np.float32)
90 | new_data, mix_weights = numba_multinomial_em_sparse(
91 | result.indptr,
92 | result.indices,
93 | result.data,
94 | background,
95 | precision,
96 | low_thresh,
97 | bg_prior,
98 | prior_strength,
99 | )
100 | result.data = new_data
101 |
102 | return result, mix_weights
103 |
104 |
105 | class RowDenoisingTransformer(BaseEstimator, TransformerMixin):
106 | """
107 |
108 | Parameters
109 | ----------
110 | normalize = False
111 | Return the modified count matrix (default) or the L_1 normalization of each row.
112 |
113 | optional EM params:
114 | * em_precision = 1e-7, (halt EM when the mix_param changes less than this)
115 | * em_threshold = 1e-5, (set to zero any values below this)
116 | * em_background_prior = 5.0, (a non-negative number)
117 | * em_prior_strength = 0.3 (a non-negative number)
118 | """
119 |
120 | def __init__(
121 | self,
122 | em_precision=1.0e-7,
123 | em_background_prior=1.0,
124 | em_threshold=1.0e-8,
125 | em_prior_strength=0.5,
126 | normalize=False,
127 | ):
128 | self.em_threshold = em_threshold
129 | self.em_background_prior = em_background_prior
130 | self.em_precision = em_precision
131 | self.em_prior_strength = em_prior_strength
132 | self.normalize = normalize
133 |
134 | def fit(self, X, y=None, **fit_params):
135 | """
136 |
137 | Parameters
138 | ----------
139 | X: sparse matrix of shape (n_docs, n_words)
140 |             The data matrix used to find the low-rank effects
141 |
142 | y: Ignored
143 |
144 | fit_params:
145 | optional model params
146 |
147 | Returns
148 | -------
149 | self
150 |
151 | """
152 | if scipy.sparse.issparse(X):
153 | X.eliminate_zeros()
154 | if X.nnz == 0:
155 | warn("Cannot fit an empty matrix")
156 | return self
157 | self.background_model_ = np.squeeze(
158 | np.array(X.sum(axis=0), dtype=np.float32)
159 | )
160 | else:
161 | self.background_model_ = X.sum(axis=0)
162 |
163 | self.background_model_ /= self.background_model_.sum()
164 |
165 | return self
166 |
167 | def transform(self, X, y=None):
168 | """
169 |
170 | X: sparse matrix of shape (n_docs, n_words)
171 |             The data matrix from which the low-rank effects will be removed
172 |
173 | y: Ignored
174 |
175 | fit_params:
176 | optional model params
177 |
178 | Returns
179 | -------
180 | X: scipy.sparse csr_matrix
181 | The matrix X with the low-rank effects removed.
182 |
183 | """
184 |
185 | check_is_fitted(self, ["background_model_"])
186 |
187 | row_sums = np.array(X.sum(axis=1)).T[0]
188 |
189 | result, weights = multinomial_em_sparse(
190 | normalize(X, norm="l1"),
191 | self.background_model_,
192 | low_thresh=self.em_threshold,
193 | bg_prior=self.em_background_prior,
194 | precision=self.em_precision,
195 | prior_strength=self.em_prior_strength,
196 | )
197 | self.mix_weights_ = weights
198 | if not self.normalize:
199 | result = scipy.sparse.diags(row_sums * weights) * result
200 |
201 | result.eliminate_zeros()
202 |
203 | return result
204 |
205 | def fit_transform(self, X, y=None, **fit_params):
206 | """
207 |
208 | Parameters
209 | ----------
210 | X: sparse matrix of shape (n_docs, n_words)
211 | The data matrix that is used to deduce the low-rank effects and then has them removed
212 |
213 | y: Ignored
214 |
215 | fit_params:
216 | optional model params
217 |
218 | Returns
219 | -------
220 | X: scipy.sparse csr_matrix
221 | The matrix X with the low-rank effects removed.
222 |
223 | """
224 | self.fit(X, **fit_params)
225 | if X.nnz == 0:
226 | return X
227 | return self.transform(X)
228 |
--------------------------------------------------------------------------------
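Finally, a minimal usage sketch of the transformer above on the same toy count matrix used in the tests (parameter values chosen arbitrarily; numba compiles the EM routine on first use):

import scipy.sparse
from vectorizers.transformers import RowDenoisingTransformer

counts = scipy.sparse.csr_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Learn the column-frequency background from the data, then strip it from
# each row via the multinomial EM mixture above.
ret = RowDenoisingTransformer(em_background_prior=1.0, normalize=False)
denoised = ret.fit_transform(counts)  # sparse CSR with the background removed
per_row_mix = ret.mix_weights_        # estimated signal proportion per row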