├── .flake8 ├── .github └── workflows │ └── unittest.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── data └── oos-intent.jsonl ├── docs ├── api │ ├── component.md │ ├── extension.md │ ├── model.md │ ├── pipeline.md │ ├── textprep.md │ └── wabbit.md ├── faq.md ├── guide │ ├── sklearn.md │ └── spacy.md ├── images │ ├── how-it-works.png │ ├── huge_sparse_array.png │ ├── make_concat.png │ ├── minipipe.png │ └── pipeline.png ├── index.md ├── logo-tokw.png └── token.png ├── mkdocs.yml ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── data │ ├── en.vs5000.model │ └── nlp.txt ├── pipeline │ ├── __init__.py │ ├── test_concat.py │ ├── test_slice.py │ └── test_union.py ├── test_common.py ├── test_docs.py ├── test_extension.py ├── test_spacy_models │ ├── __init__.py │ └── test_base_usage_architectures.py ├── test_textprep │ ├── test_hyphen.py │ ├── test_phonetic.py │ └── test_sklearn.py ├── test_tfm.py ├── test_tok │ ├── __init__.py │ └── test_whitespace.py └── test_wabbit.py ├── theme ├── token.png └── token.svg ├── token.png └── tokenwiser ├── __init__.py ├── __main__.py ├── common.py ├── component ├── __init__.py └── _sklearn.py ├── extension ├── __init__.py └── _extension.py ├── model ├── __init__.py └── sklearnmod.py ├── pipeline ├── __init__.py ├── _concat.py ├── _pipe.py └── _union.py ├── proj └── __init__.py ├── textprep ├── __init__.py ├── _cleaner.py ├── _hyphen.py ├── _identity.py ├── _morph.py ├── _phonetic.py ├── _prep.py ├── _sentpiece.py ├── _snowball.py └── _yake.py ├── tok ├── __init__.py ├── _spacy.py ├── _tok.py └── _whitespace.py └── wabbit ├── __init__.py └── _vowpal.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | per-file-ignores = 3 | clumper/__init__.py: F401 4 | max-line-length = 160 5 | ignore = E203 -------------------------------------------------------------------------------- /.github/workflows/unittest.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest] 17 | python-version: [3.7, 3.8, 3.9] 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install General Dependencies 26 | run: | 27 | python -m pip install --upgrade pip setuptools wheel 28 | pip install -e ".[dev]" 29 | python -m spacy download en_core_web_sm 30 | - name: Test with pytest 31 | run: | 32 | pytest --verbose tests 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .idea 131 | *.ipynb 132 | *.model 133 | *.csv 134 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | black: 2 | black tokenwiser tests setup.py --check 3 | 4 | flake: 5 | flake8 tokenwiser tests setup.py 6 | 7 | test: 8 | pytest 9 | 10 | check: black flake test 11 | 12 | install: 13 | python -m pip install -e . 14 | 15 | install-dev: 16 | python -m pip install -e ".[dev]" 17 | pre-commit install 18 | 19 | install-test: 20 | python -m pip install -e ".[test]" 21 | python -m pip install -e ".[all]" 22 | 23 | pypi: 24 | python setup.py sdist 25 | python setup.py bdist_wheel --universal 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # tokenwiser 4 | 5 | > Bag of, not words, but tricks! 6 | 7 | This project contains a couple of "tricks" on tokens. It's a collection 8 | of tricks for sparse data that might be trained on a stream of data too. 9 | 10 | While exploring these tricks was super fun, I do feel like there are plenty 11 | of better alternatives than the ideas I explore here. In the end, TfIDF + LogReg 12 | can be "fine" for a bunch of tasks that don't require embeddings. 13 | 14 | And for embeddings ... there's [embetter](https://github.com/koaning/embetter). 15 | 16 | So I archived this repo. Bit of a shame, because I _really_ liked the name of this package. 17 | -------------------------------------------------------------------------------- /docs/api/component.md: -------------------------------------------------------------------------------- 1 | # `component` 2 | 3 | ```python 4 | from tokenwiser.component import * 5 | ``` 6 | 7 | In the `component` submodule you can find spaCy compatible components. 8 | 9 | ::: tokenwiser.component.attach_sklearn_categoriser 10 | rendering: 11 | show_root_full_path: false 12 | show_root_heading: true 13 | -------------------------------------------------------------------------------- /docs/api/extension.md: -------------------------------------------------------------------------------- 1 | # `extension` 2 | 3 | ```python 4 | from tokenwiser.extension import * 5 | ``` 6 | 7 | In the `extension` submodule you can find spaCy compatible extensions. 8 | 9 | ::: tokenwiser.extension.attach_hyphen_extension 10 | rendering: 11 | show_root_full_path: false 12 | show_root_heading: true 13 | 14 | 15 | ::: tokenwiser.extension.sklearn_method 16 | rendering: 17 | show_root_full_path: false 18 | show_root_heading: true 19 | -------------------------------------------------------------------------------- /docs/api/model.md: -------------------------------------------------------------------------------- 1 | # `model` 2 | 3 | ```python 4 | from tokenwiser.model import * 5 | ``` 6 | 7 | In the `model` submodule you can find scikit-learn pipelines that are trainable via spaCy. 8 | These pipelines apply the `.partial_fit().predict()`-design which makes them compliant with 9 | the `spacy train` command. 
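The scikit-learn side of a `SklearnCat` component is a registered spaCy architecture that returns a `PartialPipeline` built from estimators that implement `.partial_fit`. A minimal sketch of such a registration, mirroring the `sklearn_model_basic_sgd.v1` architecture shown in the FAQ (the name `my_sgd_model.v1` is only an illustration):

```python
from spacy import registry
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

from tokenwiser.pipeline import PartialPipeline


@registry.architectures("my_sgd_model.v1")
def make_my_sgd_model():
    # Every step must support `.partial_fit` so the component can learn via `nlp.update`.
    return PartialPipeline([("hash", HashingVectorizer()), ("lr", SGDClassifier(loss="log"))])
```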
10 | 11 | ::: tokenwiser.model.SklearnCat 12 | rendering: 13 | show_root_full_path: false 14 | show_root_heading: true 15 | -------------------------------------------------------------------------------- /docs/api/pipeline.md: -------------------------------------------------------------------------------- 1 | # `pipeline` 2 | 3 | ```python 4 | from tokenwiser.pipeline import * 5 | ``` 6 | 7 | In the `pipeline` submodule you can find scikit-learn compatible 8 | pipelines that extend the standard behavior. 9 | 10 | ::: tokenwiser.pipeline.PartialPipeline 11 | rendering: 12 | show_root_full_path: false 13 | show_root_heading: true 14 | 15 | ::: tokenwiser.pipeline.TextConcat 16 | rendering: 17 | show_root_full_path: false 18 | show_root_heading: true 19 | selection: 20 | members: 21 | - partial_fit 22 | 23 | ::: tokenwiser.pipeline.PartialFeatureUnion 24 | rendering: 25 | show_root_full_path: false 26 | show_root_heading: true 27 | 28 | ::: tokenwiser.pipeline.make_partial_pipeline 29 | rendering: 30 | show_root_full_path: false 31 | show_root_heading: true 32 | 33 | ::: tokenwiser.pipeline.make_concat 34 | rendering: 35 | show_root_full_path: false 36 | show_root_heading: true 37 | 38 | ::: tokenwiser.pipeline.make_partial_union 39 | rendering: 40 | show_root_full_path: false 41 | show_root_heading: true 42 | -------------------------------------------------------------------------------- /docs/api/textprep.md: -------------------------------------------------------------------------------- 1 | # `textprep` 2 | 3 | ```python 4 | from tokenwiser.textprep import * 5 | ``` 6 | 7 | In the `textprep` submodule you can find scikit-learn compatible 8 | components that transform text into another type of text. The idea 9 | is that these may be combined in interesting ways inside a `CountVectorizer`.
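As a small illustrative sketch (based on the examples in the guide), a text preparation step can sit right in front of a `CountVectorizer`, because text goes in and text comes out:

```python
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

from tokenwiser.textprep import HyphenTextPrep

# "geology" becomes "geo logy", so the CountVectorizer also counts subword tokens.
pipe = make_pipeline(HyphenTextPrep(), CountVectorizer())
X_sparse = pipe.fit_transform(["geology", "astrology"])
```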
10 | 11 | ::: tokenwiser.textprep.Cleaner 12 | rendering: 13 | show_root_full_path: false 14 | show_root_heading: true 15 | 16 | ::: tokenwiser.textprep.Identity 17 | selection: 18 | members: 19 | - no 20 | rendering: 21 | show_root_full_path: false 22 | show_root_heading: true 23 | 24 | ::: tokenwiser.textprep.HyphenTextPrep 25 | selection: 26 | members: 27 | - fit 28 | - transform 29 | rendering: 30 | show_root_full_path: false 31 | show_root_heading: true 32 | 33 | ::: tokenwiser.textprep.SentencePiecePrep 34 | rendering: 35 | show_root_full_path: false 36 | show_root_heading: true 37 | 38 | ::: tokenwiser.textprep.PhoneticTextPrep 39 | rendering: 40 | show_root_full_path: false 41 | show_root_heading: true 42 | 43 | ::: tokenwiser.textprep.YakeTextPrep 44 | rendering: 45 | show_root_full_path: false 46 | show_root_heading: true 47 | 48 | ::: tokenwiser.textprep.SpacyMorphTextPrep 49 | rendering: 50 | show_root_full_path: false 51 | show_root_heading: true 52 | 53 | ::: tokenwiser.textprep.SpacyPosTextPrep 54 | rendering: 55 | show_root_full_path: false 56 | show_root_heading: true 57 | 58 | ::: tokenwiser.textprep.SpacyLemmaTextPrep 59 | rendering: 60 | show_root_full_path: false 61 | show_root_heading: true 62 | 63 | ::: tokenwiser.textprep.SnowballTextPrep 64 | rendering: 65 | show_root_full_path: false 66 | show_root_heading: true 67 | -------------------------------------------------------------------------------- /docs/api/wabbit.md: -------------------------------------------------------------------------------- 1 | # `wabbit` 2 | 3 | ```python 4 | from tokenwiser.wabbit import * 5 | ``` 6 | 7 | In the `wabbit` submodule you can find a scikit-learn 8 | component based on [vowpal wabbit](https://vowpalwabbit.org/). 9 | 10 | ::: tokenwiser.wabbit.VowpalWabbitClassifier 11 | rendering: 12 | show_root_full_path: false 13 | show_root_heading: true 14 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | ## Why can't I use normal `Pipeline` objects with the spaCy API? 2 | 3 | Scikit-Learn assumes that data is trained via `.fit(X, y).predict(X)`. This is great 4 | when you've got a dataset fully in memory but it's not so great when your dataset is 5 | too big to fit in one go. This is one of the main reasons why spaCy has an `.update()` 6 | API for their trainable pipeline components. It's similar to `.partial_fit(X)` in 7 | scikit-learn. You wouldn't train on a single batch of data. Instead you would iteratively 8 | train on subsets of the dataset. 9 | 10 | A big downside of the `Pipeline` API is that it cannot use `.partial_fit(X)`. 11 | Even if all the components on the inside are compatible, it forces you to use `.fit(X)`. 12 | That is why this library offers a `PartialPipeline`. It only allows for components that have `.partial_fit` 13 | implemented and it's these pipelines that can also comply with spaCy's `.update()` 14 | API. 15 | 16 | Note that all scikit-learn components offered by this library are compatible with 17 | the `PartialPipeline`. This includes everything from the `tokenwiser.textprep` submodule. 18 | 19 | ## Can I train spaCy with scikit-learn from Jupyter? 20 | 21 | It's not our favorite way of doing things, but nobody is stopping you.
22 | 23 | ```python 24 | import spacy 25 | from spacy import registry 26 | from spacy.training import Example 27 | from spacy.language import Language 28 | 29 | from tokenwiser.pipeline import PartialPipeline 30 | from tokenwiser.model.sklearnmod import SklearnCat 31 | from sklearn.feature_extraction.text import HashingVectorizer 32 | from sklearn.linear_model import SGDClassifier 33 | 34 | @Language.factory("custom-sklearn-cat") 35 | def make_sklearn_cat(nlp, name, sklearn_model, label, classes): 36 | return SklearnCat(nlp, name, sklearn_model, label, classes) 37 | 38 | @registry.architectures("sklearn_model_basic_sgd.v1") 39 | def make_sklearn_cat_basic_sgd(): 40 | """This creates a *partial* pipeline. We can't use a standard pipeline from scikit-learn.""" 41 | return PartialPipeline([("hash", HashingVectorizer()), ("lr", SGDClassifier(loss="log"))]) 42 | 43 | 44 | nlp = spacy.load("en_core_web_sm") 45 | config = { 46 | "sklearn_model": "@sklearn_model_basic_sgd.v1", 47 | "label": "pos", 48 | "classes": ["pos", "neg"] 49 | } 50 | nlp.add_pipe("custom-sklearn-cat", config=config) 51 | 52 | texts = [ 53 | "you are a nice person", 54 | "this is a great movie", 55 | "i do not like coffee", 56 | "i hate tea" 57 | ] 58 | labels = ["pos", "pos", "neg", "neg"] 59 | 60 | # This is the training loop just for our categorizer model. 61 | with nlp.select_pipes(enable="custom-sklearn-cat"): 62 | optimizer = nlp.resume_training() 63 | for loop in range(10): 64 | for t, lab in zip(texts, labels): 65 | doc = nlp.make_doc(t) 66 | example = Example.from_dict(doc, {"cats": {"pos": lab}}) 67 | nlp.update([example], sgd=optimizer) 68 | 69 | nlp("you are a nice person").cats # {'pos': 0.9979167909733176} 70 | nlp("coffee i do not like").cats # {'neg': 0.990049724779963} 71 | ``` -------------------------------------------------------------------------------- /docs/guide/sklearn.md: -------------------------------------------------------------------------------- 1 | Scikit-Learn pipelines are amazing but they are not perfect for simple text use-cases. 2 | 3 | - The standard pipeline does not allow for interactive learning. You can 4 | apply `.fit` but that's it. Even if the tools inside of the pipeline have 5 | a `.partial_fit` available, the pipeline doesn't allow it. 6 | - The `CountVectorizer` is great, but we could use some extra tricks 7 | that are specialized towards text to make this object more effective. 8 | 9 | Part of what this library does is provide extra tools that extend scikit-learn for simple 10 | text classification problems. In this document we will showcase some of the main features. 11 | 12 | ## Text Preparation Tools 13 | 14 | Let's first discuss a basic pipeline for text inside of scikit-learn. 15 | 16 | ### Base Pipeline 17 | 18 | The simplest text classification pipeline in scikit-learn looks like this: 19 | 20 | ![](../images/minipipe.png) 21 | 22 | ```python 23 | from sklearn.pipeline import make_pipeline 24 | from sklearn.feature_extraction.text import CountVectorizer 25 | from sklearn.linear_model import SGDClassifier 26 | 27 | pipe = make_pipeline( 28 | CountVectorizer(), 29 | SGDClassifier() 30 | ) 31 | ``` 32 | 33 | This pipeline will encode words as sparse features before passing them on to the linear classifier. 34 | This pattern is very common and has proven to work well enough for many English text classification tasks.
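As a quick illustration with a tiny made-up dataset, this `pipe` trains and predicts like any other scikit-learn estimator:

```python
# `pipe` is the CountVectorizer + SGDClassifier pipeline defined above.
X = [
    "i really like this post",
    "thanks for that comment",
    "this is a bad post",
    "i dislike this article",
]
y = [1, 1, 0, 0]

pipe.fit(X, y)
pipe.predict(["what a friendly comment"])  # an array with a single predicted label
```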
35 | 36 | ![](../images/how-it-works.png) 37 | 38 | The nice thing about using an `SGDClassifier` is that we're able to learn from our data even if the dataset 39 | does not fit in memory. We can call `.partial_fit` instead of `.fit` and learn in a more "online" setting. 40 | 41 | That said, there are things we can do even to this pipeline to make it better. 42 | 43 | ### Spelling Errors 44 | 45 | When you are classifying online texts you are often confronted with spelling errors. To 46 | deal with this you'd typically use a [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) 47 | with a character-level analyzer such that you also encode subwords. 48 | 49 | ![](../images/huge_sparse_array.png) 50 | 51 | With all of these subwords around, we'll be more robust against spelling errors. 52 | The downside of this approach is that you might wonder if we really *need* all these subwords. So how about this: 53 | let's add a step that will turn our text into subwords by splitting words at their hyphenation points. 54 | 55 | ```python 56 | from tokenwiser.textprep import HyphenTextPrep 57 | 58 | multi = HyphenTextPrep().transform(["geology", "astrology"]) 59 | 60 | assert multi == ['geo logy', 'as tro logy'] 61 | ``` 62 | 63 | The `HyphenTextPrep` preprocessor is a `TextPrep`-object. For all intents and purposes these are 64 | scikit-learn compatible preprocessing components but they all output strings instead of arrays. What's 65 | nice about these though is that you can "retokenize" the original text. This allows you to use the 66 | subtokens as if they were tokens, which might help keep your pipelines lightweight while still keeping 67 | them robust against certain spelling errors. 68 | 69 | ### Long Texts 70 | 71 | There are some other tricks that you might want to apply for longer texts. Maybe you want to summarise a text before 72 | vectorizing it, so it'd be nice to have a transformer that keeps only the most important tokens. 73 | 74 | A neat heuristic toolkit for this is [yake](https://github.com/LIAAD/yake) (you can find a demo 75 | [here](http://yake.inesctec.pt/demo/sample/sample1)). This package also features a scikit-learn compatible component for it. 76 | 77 | ```python 78 | from tokenwiser.textprep import YakeTextPrep 79 | 80 | text = [ 81 | "Sources tell us that Google is acquiring Kaggle, \ 82 | a platform that hosts data science and machine learning" 83 | ] 84 | example = YakeTextPrep(top_n=3, unique=False).transform(text) 85 | 86 | assert example[0] == 'hosts data science acquiring kaggle google is acquiring' 87 | ``` 88 | 89 | The idea here is to reduce the text down to only the most important words. Again, this trick 90 | might keep the pipeline lightweight and it will go a lot further than most "stopword"-lists. 91 | 92 | ### Bag of Tricks! 93 | 94 | The goal of this library is to host a few meaningful tricks that might be helpful. Here are some more: 95 | 96 | - `Cleaner` lowercases text and removes all non-alphanumeric characters. 97 | - `Identity` just keeps the text as-is, which is useful when constructing elaborate pipelines. 98 | - `PhoneticTextPrep` translates text into a phonetic encoding. 99 | - `SpacyPosTextPrep` adds part-of-speech information to the text using spaCy. 100 | - `SpacyLemmaTextPrep` lemmatizes the text using spaCy. 101 | 102 | All of these tools are part of the `textprep` submodule and are documented in detail 103 | [here](https://koaning.github.io/tokenwiser/api/textprep.html).
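To give a quick feel for these tools, here is a small sketch of two of them in isolation; the outputs in the comments are indicative:

```python
from tokenwiser.textprep import Cleaner, PhoneticTextPrep

# Lowercase the text and strip non-alphanumeric characters.
Cleaner().transform(["Hello WORLD!!"])  # ['hello world']

# Encode every word phonetically, here with the soundex algorithm.
PhoneticTextPrep(kind="soundex").transform(["hello there world"])  # ['H400 T600 W643']
```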
104 | 105 | ## Pipeline Tools 106 | 107 | Pipeline components are certainly nice, but maybe we can go a step further and make 108 | better pipelines for text too! 109 | 110 | ### Concatenate Text 111 | 112 | In scikit-learn you would use `FeatureUnion` or `make_union` to concatenate features in 113 | a pipeline. It is assumed that transformers output arrays that need to be concatenated, so the 114 | result of a concatenation is always a 2D array. This can be a bit awkward if you're using text preprocessors. 115 | 116 | ![](../images/make_concat.png) 117 | 118 | The reason why we want to keep everything a string is so that the `CountVectorizer` from scikit-learn 119 | can properly encode it. That is why this library comes with a special union 120 | component: `TextConcat`. It concatenates the output of text-prep tools into a string instead of 121 | an array. Note that we also ship a convenient `make_concat` helper function. 122 | 123 | ```python 124 | from sklearn.pipeline import make_pipeline 125 | 126 | from tokenwiser.pipeline import make_concat 127 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 128 | 129 | pipe = make_pipeline( 130 | Cleaner(), 131 | make_concat(Identity(), HyphenTextPrep()), 132 | ) 133 | 134 | output = pipe.fit_transform(["hello astrology!!!!"]) 135 | assert output == ['hello astrology hel lo astro logy'] 136 | ``` 137 | 138 | Again, we see that we're taking a text input and generating a text output. The `make_concat` step 139 | makes sure that we concatenate strings, not arrays! This is great when we want to follow up with 140 | a `CountVectorizer`! 141 | 142 | ```python 143 | from sklearn.pipeline import make_pipeline 144 | from sklearn.linear_model import LogisticRegression 145 | from sklearn.feature_extraction.text import CountVectorizer 146 | 147 | from tokenwiser.pipeline import make_concat 148 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 149 | 150 | pipe = make_pipeline( 151 | Cleaner(), 152 | make_concat(Identity(), HyphenTextPrep()), 153 | CountVectorizer(), 154 | LogisticRegression() 155 | ) 156 | ``` 157 | 158 | The mental picture for this `pipe` pipeline looks like the diagram below. 159 | 160 | ![](../images/pipeline.png) 161 | 162 | ### Partial Fit 163 | 164 | We can go a step further though. The scikit-learn pipeline follows the `fit/predict` API, which 165 | means that we cannot use `.partial_fit()`, even if all the components in the pipeline are compatible 166 | with the `partial_fit/predict` API. That is why this library also introduces components for mini-batch 167 | learning: `PartialPipeline` and `make_partial_pipeline`. 168 | 169 | In these scenarios you will need to swap out the `CountVectorizer` for a `HashingVectorizer` in order to 170 | be able to learn from new data coming in. 171 | 172 | ```python 173 | from sklearn.linear_model import SGDClassifier 174 | from sklearn.feature_extraction.text import HashingVectorizer 175 | 176 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 177 | from tokenwiser.pipeline import make_concat, make_partial_pipeline 178 | 179 | pipe = make_partial_pipeline( 180 | Cleaner(), 181 | make_concat(Identity(), HyphenTextPrep()), 182 | HashingVectorizer(), 183 | SGDClassifier() 184 | ) 185 | ``` 186 | 187 | This `pipe` pipeline is scikit-learn compatible for all intents and purposes, 188 | but it has the option of learning from batches of data via `partial_fit`. This is great
This is great 189 | because it means that you're able to classify text even when it doesn't fit into memory! 190 | 191 | > Note that all of the `TextPrep`-components in this library allow for `partial_fit`. 192 | 193 | To make a `partial_fit` actually work you will need to supply the names of the `classes` 194 | at learning time. Otherwise you might accidentally get a batch that only contains one class 195 | and the algorithm would become numerically unstable. 196 | 197 | ```python 198 | import numpy as np 199 | from sklearn.linear_model import SGDClassifier 200 | from sklearn.feature_extraction.text import HashingVectorizer 201 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 202 | from tokenwiser.pipeline import make_partial_pipeline, make_partial_union 203 | 204 | pipe = make_partial_pipeline( 205 | Cleaner(), 206 | make_partial_union( 207 | make_partial_pipeline(Identity(), HashingVectorizer()), 208 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 209 | ), 210 | SGDClassifier() 211 | ) 212 | 213 | X = [ 214 | "i really like this post", 215 | "thanks for that comment", 216 | "i enjoy this friendly forum", 217 | "this is a bad post", 218 | "i dislike this article", 219 | "this is not well written" 220 | ] 221 | 222 | y = np.array([1, 1, 1, 0, 0, 0]) 223 | 224 | for loop in range(3): 225 | # It might make sense to loop over the same dataset multiple times 226 | # if the dataset is small. For larger datasets this isn't recommended. 227 | pipe.partial_fit(X, y, classes=[0, 1]) 228 | 229 | assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0])) 230 | ``` 231 | 232 | ### Concatenate Features 233 | 234 | The standard `FeatureUnion` from scikit-learn also does not allow for `.partial_fit`. So we've 235 | added a `PartialFeatureUnion` class and a `make_partial_union` function to this library as well. 236 | 237 | ```python 238 | import numpy as np 239 | from sklearn.linear_model import SGDClassifier 240 | from sklearn.feature_extraction.text import HashingVectorizer 241 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 242 | from tokenwiser.pipeline import make_partial_pipeline, make_partial_union 243 | 244 | pipe = make_partial_pipeline( 245 | Cleaner(), 246 | make_partial_union( 247 | make_partial_pipeline(Identity(), HashingVectorizer()), 248 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 249 | ), 250 | SGDClassifier() 251 | ) 252 | 253 | X = [ 254 | "i really like this post", 255 | "thanks for that comment", 256 | "i enjoy this friendly forum", 257 | "this is a bad post", 258 | "i dislike this article", 259 | "this is not well written" 260 | ] 261 | 262 | y = np.array([1, 1, 1, 0, 0, 0]) 263 | 264 | for loop in range(3): 265 | pipe.partial_fit(X, y, classes=[0, 1]) 266 | 267 | assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0])) 268 | ``` -------------------------------------------------------------------------------- /docs/guide/spacy.md: -------------------------------------------------------------------------------- 1 | This is where we'll elaborate on the `spaCy` tools. 2 | 3 | Under construction. 
-------------------------------------------------------------------------------- /docs/images/how-it-works.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/how-it-works.png -------------------------------------------------------------------------------- /docs/images/huge_sparse_array.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/huge_sparse_array.png -------------------------------------------------------------------------------- /docs/images/make_concat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/make_concat.png -------------------------------------------------------------------------------- /docs/images/minipipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/minipipe.png -------------------------------------------------------------------------------- /docs/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/pipeline.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

# tokenwiser
4 | 5 | > Bag of, not words, but tricks! 6 | 7 | ## Goal 8 | 9 | We noticed that a lot of benchmarks relied on heavy-weight tools while they did not 10 | check if something more lightweight would also work. Maybe if we just apply some simple 11 | tricks on our tokens we won't need massive language models. The goal of this package is 12 | to contribute tricks to keep your NLP pipelines simple. These tricks are made available 13 | for spaCy, scikit-learn and vowpal wabbit. 14 | 15 | > If you're looking for a tool that can add pretrained language models to scikit-learn 16 | pipelines as a benchmark you'll want to explore another tool: [whatlies](https://rasahq.github.io/whatlies/tutorial/scikit-learn/). 17 | 18 | ## Features 19 | 20 | ### Scikit-Learn Tools 21 | 22 | The following submodules contain features that might be useful. 23 | 24 | - `.textprep`: Contains string pre-processing tools for scikit-learn. 25 | - `.pipeline`: Contains extra pipeline components for scikit-learn. 26 | - `.wabbit`: Contains a scikit-learn component based on [vowpal wabbit](https://vowpalwabbit.org/). 27 | 28 | ### SpaCy Tools 29 | 30 | - `.component`: Contains spaCy compatible components that might be added as a pipeline step. 31 | - `.extension`: Contains spaCy compatible extensions that might be added manually. 32 | -------------------------------------------------------------------------------- /docs/logo-tokw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/logo-tokw.png -------------------------------------------------------------------------------- /docs/token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/token.png -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "tokenwiser" 2 | extra_css: [style.css] 3 | repo_url: https://github.com/koaning/tokenwiser 4 | site_url: https://koaning.github.io/tokenwiser/ 5 | site_description: Bag of, not words, but tricks! 6 | site_author: Vincent D. Warmerdam 7 | use_directory_urls: false 8 | nav: 9 | - Home: index.md 10 | - Scikit-Learn: guide/sklearn.md 11 | - spaCy: guide/spacy.md 12 | - API: 13 | - textprep: api/textprep.md 14 | - pipeline: api/pipeline.md 15 | - extension: api/extension.md 16 | - component: api/component.md 17 | - wabbit: api/wabbit.md 18 | - FAQ: faq.md 19 | plugins: 20 | - mkdocstrings: 21 | handlers: 22 | python: 23 | setup_commands: 24 | - from tokenwiser.textprep import * 25 | - from tokenwiser.pipeline import * 26 | watch: 27 | - tokenwiser 28 | - search 29 | copyright: Copyright © 2020 Maintained by Vincent. 
30 | theme: 31 | name: material 32 | logo: token.png 33 | font: 34 | text: Ubuntu 35 | code: Ubuntu Mono 36 | feature: 37 | tabs: true 38 | palette: 39 | primary: white 40 | accent: teal 41 | features: 42 | - navigation.tabs 43 | markdown_extensions: 44 | - admonition 45 | - codehilite 46 | - pymdownx.inlinehilite 47 | - pymdownx.details 48 | - pymdownx.tabbed 49 | - pymdownx.superfences 50 | - pymdownx.highlight: 51 | use_pygments: true 52 | guess_lang: true 53 | - toc: 54 | permalink: true 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from tokenwiser import __version__ 2 | from setuptools import setup, find_packages 3 | 4 | base_packages = [ 5 | "jellyfish>=0.8.2", 6 | "Pyphen>=0.10.0", 7 | "scikit-learn>=0.24.0", 8 | "PyYAML>=5.3.1", 9 | "spacy>=3.2.0", 10 | "yake-github>=0.4.0", 11 | "vowpalwabbit>=8.9.0", 12 | "sentencepiece>=0.1.95", 13 | "snowballstemmer>=2.1.0", 14 | "h5py>=2.10.0" 15 | ] 16 | 17 | dev_packages = [ 18 | "flake8>=3.6.0", 19 | "pytest>=4.0.2", 20 | "jupyter>=1.0.0", 21 | "jupyterlab>=0.35.4", 22 | "mktestdocs>=0.1.0", 23 | ] 24 | 25 | docs_packages = [ 26 | "mkdocs>=1.1.2", 27 | "mkdocs-material>=6.2.8", 28 | "mkdocstrings>=0.14.0" 29 | ] 30 | 31 | 32 | setup( 33 | name="tokenwiser", 34 | version=__version__, 35 | packages=find_packages(exclude=["notebooks"]), 36 | install_requires=base_packages, 37 | extras_require={"dev": dev_packages + docs_packages, "docs": docs_packages}, 38 | ) 39 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils import estimator_checks 2 | 3 | transformer_checks = ( 4 | estimator_checks.check_transformer_data_not_an_array, 5 | estimator_checks.check_transformer_general, 6 | estimator_checks.check_transformers_unfitted, 7 | ) 8 | 9 | general_checks = ( 10 | estimator_checks.check_fit2d_predict1d, 11 | estimator_checks.check_methods_subset_invariance, 12 | estimator_checks.check_fit2d_1sample, 13 | estimator_checks.check_fit2d_1feature, 14 | estimator_checks.check_fit1d, 15 | estimator_checks.check_get_params_invariance, 16 | estimator_checks.check_set_params, 17 | estimator_checks.check_dict_unchanged, 18 | estimator_checks.check_dont_overwrite_parameters, 19 | ) 20 | 21 | nonmeta_checks = ( 22 | estimator_checks.check_estimators_dtypes, 23 | estimator_checks.check_fit_score_takes_y, 24 | estimator_checks.check_dtype_object, 25 | estimator_checks.check_sample_weights_pandas_series, 26 | estimator_checks.check_sample_weights_list, 27 | estimator_checks.check_sample_weights_invariance, 28 | estimator_checks.check_estimators_fit_returns_self, 29 | estimator_checks.check_complex_data, 30 | estimator_checks.check_estimators_empty_data_messages, 31 | estimator_checks.check_pipeline_consistency, 32 | estimator_checks.check_estimators_nan_inf, 33 | estimator_checks.check_estimators_overwrite_params, 34 | estimator_checks.check_estimator_sparse_data, 35 | estimator_checks.check_estimators_pickle, 36 | ) 37 | 38 | classifier_checks = ( 39 | 
estimator_checks.check_classifier_data_not_an_array, 40 | estimator_checks.check_classifiers_one_label, 41 | estimator_checks.check_classifiers_classes, 42 | estimator_checks.check_estimators_partial_fit_n_features, 43 | estimator_checks.check_classifiers_train, 44 | estimator_checks.check_supervised_y_2d, 45 | estimator_checks.check_supervised_y_no_nan, 46 | estimator_checks.check_estimators_unfitted, 47 | estimator_checks.check_non_transformer_estimators_n_iter, 48 | estimator_checks.check_decision_proba_consistency, 49 | ) 50 | 51 | regressor_checks = ( 52 | estimator_checks.check_regressors_train, 53 | estimator_checks.check_regressor_data_not_an_array, 54 | estimator_checks.check_estimators_partial_fit_n_features, 55 | estimator_checks.check_regressors_no_decision_function, 56 | estimator_checks.check_supervised_y_2d, 57 | estimator_checks.check_supervised_y_no_nan, 58 | estimator_checks.check_regressors_int, 59 | estimator_checks.check_estimators_unfitted, 60 | ) 61 | 62 | outlier_checks = ( 63 | estimator_checks.check_outliers_fit_predict, 64 | estimator_checks.check_outliers_train, 65 | estimator_checks.check_classifier_data_not_an_array, 66 | estimator_checks.check_estimators_unfitted, 67 | ) 68 | 69 | 70 | def select_tests(include, exclude=[]): 71 | """Return an iterable of include with all tests whose name is not in exclude""" 72 | for test in include: 73 | if test.__name__ not in exclude: 74 | yield test 75 | 76 | 77 | def id_func(param): 78 | """Returns the repr of an object for usage in pytest parametrize""" 79 | return repr(param) 80 | -------------------------------------------------------------------------------- /tests/data/en.vs5000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/data/en.vs5000.model -------------------------------------------------------------------------------- /tests/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/pipeline/__init__.py -------------------------------------------------------------------------------- /tests/pipeline/test_concat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/pipeline/test_concat.py -------------------------------------------------------------------------------- /tests/pipeline/test_slice.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import HashingVectorizer 2 | from sklearn.linear_model import LogisticRegression 3 | 4 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 5 | from tokenwiser.pipeline import PartialPipeline, make_partial_pipeline, make_concat 6 | 7 | 8 | def test_can_slice_pipeline(): 9 | """If we slice a pipeline, we should get a new pipeline object""" 10 | pipe1 = make_partial_pipeline( 11 | Cleaner(), 12 | make_concat( 13 | Identity(), 14 | HyphenTextPrep(), 15 | ), 16 | HashingVectorizer(), 17 | LogisticRegression() 18 | ) 19 | 20 | slice = pipe1[:-1] 21 | assert isinstance(slice, PartialPipeline) 22 | -------------------------------------------------------------------------------- /tests/pipeline/test_union.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_extraction.text import HashingVectorizer 3 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 4 | from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion 5 | 6 | 7 | def test_shape_doubles(): 8 | """If we concatenate using a partial union. It should increase in size.""" 9 | pipe1 = PartialPipeline( 10 | [ 11 | ("clean", Cleaner()), 12 | ( 13 | "union", 14 | PartialFeatureUnion( 15 | [ 16 | ( 17 | "full_text_pipe", 18 | PartialPipeline( 19 | [ 20 | ("identity", Identity()), 21 | ("hash1", HashingVectorizer()), 22 | ] 23 | ), 24 | ) 25 | ] 26 | ), 27 | ), 28 | ] 29 | ) 30 | 31 | pipe2 = PartialPipeline( 32 | [ 33 | ("clean", Cleaner()), 34 | ( 35 | "union", 36 | PartialFeatureUnion( 37 | [ 38 | ( 39 | "full_text_pipe", 40 | PartialPipeline( 41 | [ 42 | ("identity", Identity()), 43 | ("hash1", HashingVectorizer()), 44 | ] 45 | ), 46 | ), 47 | ( 48 | "hyphen_pipe", 49 | PartialPipeline( 50 | [ 51 | ("hyphen", HyphenTextPrep()), 52 | ("hash2", HashingVectorizer()), 53 | ] 54 | ), 55 | ), 56 | ] 57 | ), 58 | ), 59 | ] 60 | ) 61 | 62 | X = [ 63 | "i really like this post", 64 | "thanks for that comment", 65 | "i enjoy this friendly forum", 66 | "this is a bad post", 67 | "i dislike this article", 68 | "this is not well written", 69 | ] 70 | 71 | y = np.array([1, 1, 1, 0, 0, 0]) 72 | 73 | p1 = pipe1.partial_fit(X, y, classes=[0, 1]).transform(X) 74 | p2 = pipe2.partial_fit(X, y, classes=[0, 1]).transform(X) 75 | 76 | assert p1.shape[1] * 2 == p2.shape[1] 77 | -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pathlib 3 | from tokenwiser.common import load_coefficients, save_coefficients 4 | 5 | import numpy as np 6 | from sklearn.linear_model import SGDClassifier, LogisticRegression, PassiveAggressiveClassifier 7 | from sklearn.feature_extraction.text import HashingVectorizer 8 | from sklearn.pipeline import make_pipeline 9 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 10 | from tokenwiser.pipeline import make_partial_pipeline, make_partial_union 11 | 12 | 13 | @pytest.mark.parametrize("clf_train", [LogisticRegression, SGDClassifier, PassiveAggressiveClassifier]) 14 | @pytest.mark.parametrize("clf_target", [LogisticRegression, SGDClassifier, PassiveAggressiveClassifier]) 15 | def test_load_save(clf_train, clf_target, tmpdir): 16 | """ 17 | Ensure that we can save/load vectors. 18 | """ 19 | clf = clf_train() 20 | pipe = make_pipeline( 21 | Cleaner(), 22 | make_partial_union( 23 | make_partial_pipeline(Identity(), HashingVectorizer()), 24 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 25 | ), 26 | clf 27 | ) 28 | 29 | X = [ 30 | "i really like this post", 31 | "thanks for that comment", 32 | "i enjoy this friendly forum", 33 | "this is a bad post", 34 | "i dislike this article", 35 | "this is not well written" 36 | ] 37 | 38 | y = np.array([1, 1, 1, 0, 0, 0]) 39 | 40 | pipe.fit(X, y) 41 | 42 | assert np.all(pipe.predict(X) == y) 43 | 44 | # Here we create in the new pipeline. 
45 | clf_new = clf_target() 46 | pipe_new = make_partial_pipeline( 47 | Cleaner(), 48 | make_partial_union( 49 | make_partial_pipeline(Identity(), HashingVectorizer()), 50 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 51 | ), 52 | clf 53 | ) 54 | path = pathlib.Path(tmpdir, "coefs.h5") 55 | save_coefficients(clf, path) 56 | load_coefficients(clf_new, path) 57 | assert np.all(clf.intercept_ == clf_new.intercept_) 58 | assert np.all(clf.coef_ == clf_new.coef_) 59 | assert np.all(clf.classes_ == clf_new.classes_) 60 | assert np.all(pipe_new.predict(X) == y) 61 | -------------------------------------------------------------------------------- /tests/test_docs.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from tokenwiser.textprep import ( 3 | Cleaner, 4 | Identity, 5 | HyphenTextPrep, 6 | SpacyMorphTextPrep, 7 | SpacyPosTextPrep, 8 | SpacyLemmaTextPrep, 9 | YakeTextPrep, 10 | PhoneticTextPrep, 11 | ) 12 | from tokenwiser.pipeline import ( 13 | TextConcat, 14 | PartialPipeline, 15 | PartialFeatureUnion, 16 | make_partial_pipeline, 17 | make_concat, 18 | make_partial_union, 19 | ) 20 | from tokenwiser.extension import ( 21 | attach_hyphen_extension, 22 | attach_sklearn_extension, 23 | sklearn_method, 24 | ) 25 | from tokenwiser.component import attach_sklearn_categoriser 26 | 27 | import pytest 28 | from mktestdocs import check_docstring, check_md_file 29 | 30 | components = [ 31 | Cleaner, 32 | Identity, 33 | HyphenTextPrep, 34 | SpacyMorphTextPrep, 35 | SpacyPosTextPrep, 36 | SpacyLemmaTextPrep, 37 | PhoneticTextPrep, 38 | YakeTextPrep, 39 | TextConcat, 40 | PartialPipeline, 41 | PartialFeatureUnion, 42 | make_partial_pipeline, 43 | make_concat, 44 | make_partial_union, 45 | attach_hyphen_extension, 46 | attach_sklearn_extension, 47 | sklearn_method, 48 | attach_sklearn_categoriser, 49 | ] 50 | 51 | 52 | @pytest.mark.parametrize("obj", components, ids=lambda d: d.__qualname__) 53 | def test_member(obj): 54 | check_docstring(obj) 55 | 56 | 57 | @pytest.mark.parametrize( 58 | "fpath", [str(p) for p in pathlib.Path("docs").glob("**/*.md")] 59 | ) 60 | def test_fpath(fpath): 61 | check_md_file(fpath) 62 | -------------------------------------------------------------------------------- /tests/test_extension.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from tokenwiser.extension import attach_hyphen_extension 3 | 4 | 5 | def test_hyphen_works(): 6 | nlp = spacy.load("en_core_web_sm") 7 | doc = nlp("this is a dinosaurhead") 8 | tok = doc[-1] 9 | attach_hyphen_extension() 10 | assert tok._.hyphen == ["di", "no", "saur", "head"] 11 | -------------------------------------------------------------------------------- /tests/test_spacy_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/test_spacy_models/__init__.py -------------------------------------------------------------------------------- /tests/test_spacy_models/test_base_usage_architectures.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import spacy 3 | from spacy.training import Example 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "model", ["@sklearn_model_basic_sgd.v1", "@sklearn_model_basic_naive_bayes.v1"] 8 | ) 9 | def test_model_config_inline(model): 10 | nlp = spacy.load("en_core_web_sm") 11 | conf = 
{"sklearn_model": model, "label": "pos", "classes": ["pos", "neg"]} 12 | nlp.add_pipe("sklearn-cat", config=conf) 13 | 14 | texts = ["you are a nice person", "this is a great movie", "i do not like coffee"] 15 | labels = ["pos", "pos", "neg"] 16 | 17 | with nlp.select_pipes(enable="sklearn-cat"): 18 | optimizer = nlp.resume_training() 19 | for itn in range(100): 20 | for t, lab in zip(texts, labels): 21 | doc = nlp.make_doc(t) 22 | example = Example.from_dict(doc, {"cats": {"pos": lab}}) 23 | nlp.update([example], sgd=optimizer) 24 | 25 | assert len(nlp("you are a nice person").cats.keys()) > 0 26 | assert len(nlp("coffee i do not like").cats.keys()) > 0 27 | -------------------------------------------------------------------------------- /tests/test_textprep/test_hyphen.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tokenwiser.textprep import HyphenTextPrep 4 | 5 | 6 | @pytest.mark.parametrize("x_in,x_out", [("haleluja", "hale lu ja"), ("hello", "hello")]) 7 | def test_basic(x_in, x_out): 8 | assert HyphenTextPrep().encode_single(x_in) == x_out 9 | -------------------------------------------------------------------------------- /tests/test_textprep/test_phonetic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tokenwiser.textprep import PhoneticTextPrep 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "x_in,x_out", [("haleluja", "H442"), ("hello there world", "H400 T600 W643")] 8 | ) 9 | def test_soundex(x_in, x_out): 10 | assert PhoneticTextPrep(kind="soundex").encode_single(x_in) == x_out 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "x_in,x_out", [("haleluja", "HLLJ"), ("hello there world", "HL 0R WRLT")] 15 | ) 16 | def test_metaphone(x_in, x_out): 17 | assert PhoneticTextPrep(kind="metaphone").encode_single(x_in) == x_out 18 | 19 | 20 | @pytest.mark.parametrize( 21 | "x_in,x_out", [("haleluja", "HALALAJ"), ("hello there world", "HAL TAR WARLD")] 22 | ) 23 | def test_nysiis(x_in, x_out): 24 | assert PhoneticTextPrep(kind="nysiis").encode_single(x_in) == x_out 25 | -------------------------------------------------------------------------------- /tests/test_textprep/test_sklearn.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.pipeline import Pipeline 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | 5 | from tokenwiser.textprep import ( 6 | Cleaner, 7 | HyphenTextPrep, 8 | SpacyMorphTextPrep, 9 | SpacyPosTextPrep, 10 | SpacyLemmaTextPrep, 11 | YakeTextPrep, 12 | PhoneticTextPrep, 13 | Identity, 14 | SentencePiecePrep, 15 | ) 16 | import spacy 17 | 18 | nlp = spacy.load("en_core_web_sm") 19 | 20 | 21 | prep_list = [ 22 | Cleaner(), 23 | HyphenTextPrep(), 24 | PhoneticTextPrep(kind="soundex"), 25 | PhoneticTextPrep(kind="metaphone"), 26 | PhoneticTextPrep(kind="nysiis"), 27 | YakeTextPrep(), 28 | SpacyLemmaTextPrep(nlp), 29 | SpacyMorphTextPrep(nlp), 30 | SpacyPosTextPrep(nlp), 31 | Identity(), 32 | SentencePiecePrep(model_file="tests/data/en.vs5000.model"), 33 | ] 34 | 35 | 36 | @pytest.mark.parametrize("prep", prep_list, ids=[str(d) for d in prep_list]) 37 | def test_pipeline_single(prep): 38 | X = ["hello world", "this is dog", "it should work"] 39 | pipe = Pipeline([("prep", prep), ("cv", CountVectorizer())]) 40 | assert pipe.fit_transform(X).shape[0] == 3 41 | 42 | 43 | @pytest.mark.parametrize("prep", prep_list, ids=[str(d) for d in prep_list]) 44 | def 
test_pipeline_single_clean_first(prep): 45 | X = ["hello world", "this is dog", "it should work"] 46 | pipe = Pipeline([("clean", Cleaner()), ("prep", prep), ("cv", CountVectorizer())]) 47 | assert pipe.fit_transform(X).shape[0] == 3 48 | -------------------------------------------------------------------------------- /tests/test_tfm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tokenwiser.common import flatten 4 | from tokenwiser.proj import BinaryRandomProjection, PointSplitProjection 5 | 6 | from tests.conftest import ( 7 | nonmeta_checks, 8 | general_checks, 9 | transformer_checks, 10 | select_tests, 11 | ) 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "test_fn", 16 | select_tests( 17 | flatten([nonmeta_checks, transformer_checks, general_checks]), 18 | exclude=[ 19 | "check_transformer_data_not_an_array", 20 | "check_estimators_nan_inf", 21 | "check_fit2d_predict1d", 22 | "check_sample_weights_invariance", 23 | "check_sample_weights_list" 24 | ], 25 | ), 26 | ) 27 | def test_estimator_checks_binary(test_fn): 28 | random_proj = BinaryRandomProjection(random_seed=42) 29 | test_fn(random_proj, random_proj) 30 | 31 | 32 | @pytest.mark.parametrize( 33 | "test_fn", 34 | select_tests( 35 | flatten([nonmeta_checks, transformer_checks, general_checks]), 36 | exclude=[ 37 | "check_transformer_data_not_an_array", 38 | "check_sample_weights_invariance", 39 | "check_estimators_nan_inf", 40 | "check_fit2d_predict1d", 41 | "check_transformer_general", 42 | "check_pipeline_consistency", 43 | "check_sample_weights_list" 44 | ], 45 | ), 46 | ) 47 | def test_estimator_checks_split(test_fn): 48 | random_proj = PointSplitProjection(random_seed=42) 49 | test_fn(random_proj, random_proj) 50 | -------------------------------------------------------------------------------- /tests/test_tok/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/test_tok/__init__.py -------------------------------------------------------------------------------- /tests/test_tok/test_whitespace.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tokenwiser.tok import WhiteSpaceTokenizer 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "x_in,x_out", 8 | [("haleluja", ["haleluja"]), ("hello there world", ["hello", "there", "world"])], 9 | ) 10 | def test_basic(x_in, x_out): 11 | assert WhiteSpaceTokenizer()(x_in) == x_out 12 | -------------------------------------------------------------------------------- /tests/test_wabbit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tokenwiser.wabbit import VowpalWabbitClassifier 3 | from tokenwiser.pipeline import make_partial_pipeline 4 | from tokenwiser.textprep import Cleaner 5 | 6 | X = [ 7 | "i really like this post", 8 | "thanks for that comment", 9 | "i enjoy this friendly forum", 10 | "this is a bad post", 11 | "i dislike this article", 12 | "this is not well written", 13 | ] 14 | 15 | y = np.array([1, 1, 1, 0, 0, 0]) 16 | 17 | 18 | def test_wabbit_fit_shape_sensible(): 19 | assert VowpalWabbitClassifier().fit(X, y).predict(X).shape[0] == 6 20 | assert VowpalWabbitClassifier().fit(X, y).predict_proba(X).shape == (6, 2) 21 | 22 | 23 | def test_wabbit_pipeline(): 24 | pipe = make_partial_pipeline( 25 | Cleaner(), VowpalWabbitClassifier(n_loop=1, n_gram=1, 
learning_rate=0.1) 26 | ) 27 | for i in range(5): 28 | pipe.partial_fit(X, y, classes=list(set(y))) 29 | -------------------------------------------------------------------------------- /theme/token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/theme/token.png -------------------------------------------------------------------------------- /theme/token.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/token.png -------------------------------------------------------------------------------- /tokenwiser/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.0" 2 | -------------------------------------------------------------------------------- /tokenwiser/__main__.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | from tokenwiser import __version__ 4 | 5 | app = typer.Typer( 6 | add_completion=False, 7 | help="Tokenwiser CLI. Allows you to train embeddings from the commandline.", 8 | ) 9 | 10 | 11 | @app.command("version", help="show the version of tokenwise") 12 | def version(): 13 | typer.echo(f"{__version__}") 14 | 15 | 16 | @app.command() 17 | def init(): 18 | pass 19 | 20 | 21 | if __name__ == "__main__": 22 | app() 23 | -------------------------------------------------------------------------------- /tokenwiser/common.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | 3 | def save_coefficients(classifier, filename): 4 | """Save the coefficients of a linear model into a .h5 file.""" 5 | with h5py.File(filename, 'w') as hf: 6 | hf.create_dataset("coef", data=classifier.coef_) 7 | hf.create_dataset("intercept", data=classifier.intercept_) 8 | hf.create_dataset("classes", data=classifier.classes_) 9 | 10 | def load_coefficients(classifier, filename): 11 | """Attach the saved coefficients to a linear model.""" 12 | with h5py.File(filename, 'r') as hf: 13 | coef = hf['coef'][:] 14 | intercept = hf['intercept'][:] 15 | classes = hf['classes'][:] 16 | classifier.coef_ = coef 17 | classifier.intercept_ = intercept 18 | classifier.classes_ = classes 19 | 20 | def flatten(nested): 21 | """Flatten a nested list.""" 22 | return [item for li in nested for item in li] 23 | -------------------------------------------------------------------------------- /tokenwiser/component/__init__.py: -------------------------------------------------------------------------------- 1 | from ._sklearn import attach_sklearn_categoriser 2 | 3 | __all__ = ["attach_sklearn_categoriser"] 4 | -------------------------------------------------------------------------------- /tokenwiser/component/_sklearn.py: -------------------------------------------------------------------------------- 1 | from spacy.language import Language 2 | 3 | 4 | def attach_sklearn_categoriser(nlp, pipe_name, estimator): 5 | """ 6 | This function will attach a scikit-learn compatible estimator to 7 | the pipeline which will feed predictions to the `.cats` property. 
8 | 9 | This is useful if you're interesting in added a pre-trained sklearn 10 | model to the pipeline. This is **not** useful if you're interested 11 | in training a new model via spaCy, check out the `tokenwiser.model` 12 | submodule for that. 13 | 14 | Usage: 15 | 16 | ```python 17 | import spacy 18 | 19 | from sklearn.pipeline import make_pipeline 20 | from sklearn.feature_extraction.text import CountVectorizer 21 | from sklearn.linear_model import LogisticRegression 22 | 23 | from tokenwiser.component import attach_sklearn_categoriser 24 | 25 | X = [ 26 | "i really like this post", 27 | "thanks for that comment", 28 | "i enjoy this friendly forum", 29 | "this is a bad post", 30 | "i dislike this article", 31 | "this is not well written" 32 | ] 33 | 34 | y = ["pos", "pos", "pos", "neg", "neg", "neg"] 35 | 36 | # Note that we're training a pipeline here via a single-batch `.fit()` method 37 | pipe = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y) 38 | 39 | nlp = spacy.load("en_core_web_sm") 40 | # This is where we attach our pre-trained model as a pipeline step. 41 | attach_sklearn_categoriser(nlp, pipe_name="silly_sentiment", estimator=pipe) 42 | 43 | assert nlp.pipe_names[-1] == "silly_sentiment" 44 | assert nlp("this post i really like").cats["pos"] > 0.5 45 | ``` 46 | """ 47 | 48 | @Language.component(pipe_name) 49 | def my_component(doc): 50 | pred = estimator.predict([doc.text])[0] 51 | proba = estimator.predict_proba([doc.text]).max() 52 | doc.cats[pred] = proba 53 | return doc 54 | 55 | nlp.add_pipe(pipe_name) 56 | -------------------------------------------------------------------------------- /tokenwiser/extension/__init__.py: -------------------------------------------------------------------------------- 1 | from ._extension import ( 2 | attach_hyphen_extension, 3 | attach_sklearn_extension, 4 | sklearn_method, 5 | ) 6 | 7 | __all__ = ["attach_hyphen_extension", "attach_sklearn_extension", "sklearn_method"] 8 | -------------------------------------------------------------------------------- /tokenwiser/extension/_extension.py: -------------------------------------------------------------------------------- 1 | from spacy.tokens import Doc, Token 2 | 3 | from tokenwiser.textprep import HyphenTextPrep 4 | 5 | 6 | def attach_hyphen_extension(): 7 | """ 8 | This function will attach an extension `._.hyphen` to the `Token`s. 9 | 10 | ```python 11 | import spacy 12 | from tokenwiser.extension import attach_hyphen_extension 13 | 14 | nlp = spacy.load("en_core_web_sm") 15 | # Attach the Hyphen extensions. 16 | attach_hyphen_extension() 17 | 18 | # Now you can query hyphens on the tokens. 19 | doc = nlp("this is a dinosaurhead") 20 | tok = doc[-1] 21 | 22 | assert tok._.hyphen == ["di", "no", "saur", "head"] 23 | ``` 24 | """ 25 | Token.set_extension( 26 | "hyphen", 27 | getter=lambda t: HyphenTextPrep().encode_single(t.text).split(" "), 28 | force=True, 29 | ) 30 | 31 | 32 | def attach_sklearn_extension(attribute_name, estimator): 33 | """ 34 | This function will attach an extension `._.attribute_name` to the `Token`s. 
35 | 36 | ```python 37 | import spacy 38 | from spacy.tokens import Doc 39 | 40 | from sklearn.pipeline import make_pipeline 41 | from sklearn.feature_extraction.text import CountVectorizer 42 | from sklearn.linear_model import LogisticRegression 43 | 44 | from tokenwiser.extension import attach_sklearn_extension 45 | 46 | X = [ 47 | "i really like this post", 48 | "thanks for that comment", 49 | "i enjoy this friendly forum", 50 | "this is a bad post", 51 | "i dislike this article", 52 | "this is not well written" 53 | ] 54 | 55 | y = ["pos", "pos", "pos", "neg", "neg", "neg"] 56 | 57 | # First we train a (silly) model. 58 | mod = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y) 59 | 60 | # Demo 61 | nlp = spacy.load("en_core_web_sm") 62 | doc = nlp("thank you, really nice") 63 | attach_sklearn_extension("sillysent", mod) 64 | doc._.sillysent # {"neg: 0.4446964938410244, "pos": 0.5553035061589756} 65 | ``` 66 | """ 67 | Doc.set_extension( 68 | attribute_name, 69 | getter=lambda t: sklearn_method(estimator=estimator), 70 | force=True, 71 | ) 72 | 73 | 74 | def sklearn_method(estimator): 75 | """ 76 | A helper to turn a scikit-learn estimator into a spaCy extension. 77 | 78 | Just in case you *really* wanted to do it manually. 79 | 80 | ```python 81 | import spacy 82 | from spacy.tokens import Doc 83 | 84 | from sklearn.pipeline import make_pipeline 85 | from sklearn.feature_extraction.text import CountVectorizer 86 | from sklearn.linear_model import LogisticRegression 87 | 88 | from tokenwiser.extension import sklearn_method 89 | 90 | X = [ 91 | "i really like this post", 92 | "thanks for that comment", 93 | "i enjoy this friendly forum", 94 | "this is a bad post", 95 | "i dislike this article", 96 | "this is not well written" 97 | ] 98 | 99 | y = ["pos", "pos", "pos", "neg", "neg", "neg"] 100 | 101 | # First we train a (silly) model. 102 | mod = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y) 103 | 104 | # This is where we attach the scikit-learn model to spaCy as a method extension. 105 | Doc.set_extension("sillysent_method", method=sklearn_method(mod)) 106 | # This is where we attach the scikit-learn model to spaCy as a property extension. 
107 | Doc.set_extension("sillysent_prop", getter=sklearn_method(mod)) 108 | 109 | # Demo 110 | nlp = spacy.load("en_core_web_sm") 111 | doc = nlp("thank you, really nice") 112 | 113 | doc._.sillysent_method() # {"neg": 0.4446964938410244, "pos: 0.5553035061589756} 114 | doc._.sillysent_prop # {"neg: 0.4446964938410244, "pos": 0.5553035061589756} 115 | ``` 116 | """ 117 | 118 | def method(doc): 119 | proba = estimator.predict_proba([doc.text])[0] 120 | return {c: p for c, p in zip(estimator.classes_, proba)} 121 | 122 | return method 123 | -------------------------------------------------------------------------------- /tokenwiser/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .sklearnmod import SklearnCat 2 | 3 | __all__ = ["SklearnCat"] 4 | -------------------------------------------------------------------------------- /tokenwiser/model/sklearnmod.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pathlib 3 | from typing import Iterable 4 | 5 | import spacy 6 | from spacy import registry 7 | from spacy.tokens import Doc 8 | from spacy.training import Example 9 | from spacy.language import Language 10 | from sklearn.feature_extraction.text import HashingVectorizer 11 | from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier 12 | from sklearn.naive_bayes import MultinomialNB 13 | from joblib import dump, load 14 | 15 | from tokenwiser.pipeline import PartialPipeline 16 | 17 | 18 | class SklearnCat: 19 | """ 20 | This is a spaCy pipeline component object that can train specific scikit-learn pipelines. 21 | 22 | This allows you to run a simple benchmark via spaCy on simple text-based scikit-learn models. 23 | One should not expect these models to have state of the art accuracy. But they should have 24 | "pretty good" accuracy while being substantially faster to train than most deep-learning 25 | based models. 26 | 27 | The intended use-case for these models is to offer a base benchmark. If these models perform well 28 | one your task, it's an indication that you're in luck and that you've got a simple task that 29 | doesn't require state of the art models. 
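A minimal usage sketch, mirroring `tests/test_spacy_models/test_base_usage_architectures.py` (the spaCy model and the `pos`/`neg` labels are only illustrative choices):

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Any architecture registered below works here, e.g. "@sklearn_model_basic_naive_bayes.v1".
config = {
    "sklearn_model": "@sklearn_model_basic_sgd.v1",
    "label": "pos",
    "classes": ["pos", "neg"],
}
nlp.add_pipe("sklearn-cat", config=config)
```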
30 | """ 31 | 32 | def __init__(self, nlp, name, sklearn_model, label, classes): 33 | self.nlp = nlp 34 | self.name = name 35 | self.label = label 36 | self.classes = classes 37 | self.sklearn_model = spacy.registry.architectures.get( 38 | sklearn_model.replace("@", "") 39 | )() 40 | 41 | def __call__(self, doc: Doc): 42 | scores = self.predict([doc]) 43 | self.set_annotations([doc], scores) 44 | return doc 45 | 46 | def update( 47 | self, examples: Iterable[Example], *, drop: float = 0.0, sgd=None, losses=None 48 | ): 49 | texts = [ 50 | ex.reference.text 51 | for ex in examples 52 | if self.label in ex.reference.cats.keys() 53 | ] 54 | labels = [ 55 | ex.reference.cats[self.label] 56 | for ex in examples 57 | if self.label in ex.reference.cats.keys() 58 | ] 59 | self.sklearn_model.partial_fit(texts, labels, classes=self.classes) 60 | 61 | def predict(self, docs: Iterable[Doc]): 62 | return self.sklearn_model.predict_proba([d.text for d in docs]).max(axis=1) 63 | 64 | def set_annotations(self, docs: Iterable[Doc], scores): 65 | preds = self.sklearn_model.predict([d.text for d in docs]) 66 | for doc, pred, proba in zip(docs, preds, scores): 67 | doc.cats[pred] = proba 68 | return docs 69 | 70 | def score(self): 71 | return random.random() 72 | 73 | def to_disk(self, path, exclude=None): 74 | pathlib.Path(path).mkdir(parents=True, exist_ok=True) 75 | dump(self.sklearn_model, str(pathlib.Path(path) / "filename.joblib")) 76 | 77 | def from_disk(self, path, exclude=None): 78 | self.sklearn_model = load(str(pathlib.Path(path) / "filename.joblib")) 79 | return self 80 | 81 | 82 | @Language.factory("sklearn-cat") 83 | def make_sklearn_cat(nlp, name, sklearn_model, label, classes): 84 | return SklearnCat(nlp, name, sklearn_model, label, classes) 85 | 86 | 87 | @registry.architectures("sklearn_model_basic_sgd.v1") 88 | def make_sklearn_cat_basic_sgd(): 89 | return PartialPipeline( 90 | [("hash", HashingVectorizer()), ("lr", SGDClassifier(loss="log"))] 91 | ) 92 | 93 | 94 | @registry.architectures("sklearn_model_basic_pa.v1") 95 | def make_sklearn_cat_basic_pa(): 96 | return PartialPipeline( 97 | [("hash", HashingVectorizer()), ("lr", PassiveAggressiveClassifier())] 98 | ) 99 | 100 | 101 | @registry.architectures("sklearn_model_basic_naive_bayes.v1") 102 | def make_sklearn_cat_basic_naive_bayes(): 103 | return PartialPipeline( 104 | [("hash", HashingVectorizer(binary=True)), ("nb", MultinomialNB())] 105 | ) 106 | -------------------------------------------------------------------------------- /tokenwiser/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from ._concat import TextConcat, make_concat 2 | from ._pipe import PartialPipeline, make_partial_pipeline 3 | from ._union import PartialFeatureUnion, make_partial_union 4 | 5 | __all__ = [ 6 | "TextConcat", 7 | "make_concat", 8 | "PartialPipeline", 9 | "make_partial_pipeline", 10 | "PartialFeatureUnion", 11 | "make_partial_union", 12 | ] 13 | -------------------------------------------------------------------------------- /tokenwiser/pipeline/_concat.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import _name_estimators 2 | from sklearn.base import BaseEstimator 3 | 4 | 5 | class TextConcat(BaseEstimator): 6 | """ 7 | A component like `FeatureUnion` but this also concatenates the text. 
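Where a plain `FeatureUnion` stacks numeric features side by side, `TextConcat` joins the transformed strings with a single space, so the combined text can still be fed to one text vectorizer (such as `CountVectorizer` or `HashingVectorizer`) downstream.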
8 | 9 | Arguments: 10 | transformer_list: list of (name, text-transformer)-tuples 11 | 12 | Example: 13 | 14 | ```python 15 | from tokenwiser.textprep import HyphenTextPrep, Cleaner 16 | from tokenwiser.pipeline import TextConcat 17 | 18 | tc = TextConcat([("hyp", HyphenTextPrep()), ("clean", Cleaner())]) 19 | results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"]) 20 | expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence'] 21 | 22 | assert results == expected 23 | ``` 24 | """ 25 | 26 | def __init__(self, transformer_list): 27 | self.transformer_list = transformer_list 28 | 29 | def fit(self, X, y=None): 30 | """ 31 | Fits the components in a single batch. 32 | """ 33 | names = [n for n, t in self.transformer_list] 34 | if len(names) != len(set(names)): 35 | raise ValueError("Make sure that the names of each step are unique.") 36 | return self 37 | 38 | def partial_fit(self, X, y=None): 39 | """ 40 | Fits the components, but allow for batches. 41 | """ 42 | names = [n for n, t in self.transformer_list] 43 | if len(names) != len(set(names)): 44 | raise ValueError("Make sure that the names of each step are unique.") 45 | return self 46 | 47 | def transform(self, X, y=None): 48 | """ 49 | Transformers the text. 50 | """ 51 | names = [n for n, t in self.transformer_list] 52 | if len(names) != len(set(names)): 53 | raise ValueError("Make sure that the names of each step are unique.") 54 | results = {} 55 | for name, tfm in self.transformer_list: 56 | results[name] = tfm.transform(X) 57 | return [" ".join([results[n][i] for n in names]) for i in range(len(X))] 58 | 59 | def fit_transform(self, X, y=None): 60 | """ 61 | Fits the components and transforms the text in one step. 62 | """ 63 | return self.fit(X, y).transform(X, y) 64 | 65 | 66 | def make_concat(*steps): 67 | """ 68 | Utility function to generate a `TextConcat` 69 | 70 | Arguments: 71 | steps: a collection of text-transformers 72 | 73 | ```python 74 | from tokenwiser.textprep import HyphenTextPrep, Cleaner 75 | from tokenwiser.pipeline import make_concat 76 | 77 | tc = make_concat(HyphenTextPrep(), Cleaner()) 78 | results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"]) 79 | expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence'] 80 | 81 | assert results == expected 82 | ``` 83 | """ 84 | return TextConcat(_name_estimators(steps)) 85 | -------------------------------------------------------------------------------- /tokenwiser/pipeline/_pipe.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import Pipeline, _name_estimators 2 | 3 | 4 | class PartialPipeline(Pipeline): 5 | """ 6 | Utility function to generate a `PartialPipeline` 7 | 8 | Arguments: 9 | steps: a collection of text-transformers 10 | 11 | ```python 12 | from tokenwiser.pipeline import PartialPipeline 13 | from tokenwiser.textprep import HyphenTextPrep, Cleaner 14 | 15 | tc = PartialPipeline([('clean', Cleaner()), ('hyp', HyphenTextPrep())]) 16 | data = ["dinosaurhead", "another$$ sentence$$"] 17 | results = tc.partial_fit(data).transform(data) 18 | expected = ['di no saur head', 'an other sen tence'] 19 | 20 | assert results == expected 21 | ``` 22 | """ 23 | def partial_fit(self, X, y=None, classes=None, **kwargs): 24 | """ 25 | Fits the components, but allow for batches. 
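Every step must implement `.partial_fit`; the `classes` keyword is only forwarded to steps that also expose `.predict` (typically the final estimator).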
26 | """ 27 | for name, step in self.steps: 28 | if not hasattr(step, "partial_fit"): 29 | raise ValueError( 30 | f"Step {name} is a {step} which does not have `.partial_fit` implemented." 31 | ) 32 | for name, step in self.steps: 33 | if hasattr(step, "predict"): 34 | step.partial_fit(X, y, classes=classes, **kwargs) 35 | else: 36 | step.partial_fit(X, y) 37 | if hasattr(step, "transform"): 38 | X = step.transform(X) 39 | return self 40 | 41 | 42 | def make_partial_pipeline(*steps): 43 | """ 44 | Utility function to generate a `PartialPipeline` 45 | 46 | Arguments: 47 | steps: a collection of text-transformers 48 | 49 | ```python 50 | from tokenwiser.pipeline import make_partial_pipeline 51 | from tokenwiser.textprep import HyphenTextPrep, Cleaner 52 | 53 | tc = make_partial_pipeline(Cleaner(), HyphenTextPrep()) 54 | data = ["dinosaurhead", "another$$ sentence$$"] 55 | results = tc.partial_fit(data).transform(data) 56 | expected = ['di no saur head', 'an other sen tence'] 57 | 58 | assert results == expected 59 | ``` 60 | """ 61 | return PartialPipeline(_name_estimators(steps)) 62 | -------------------------------------------------------------------------------- /tokenwiser/pipeline/_union.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import FeatureUnion, _name_estimators 2 | 3 | 4 | class PartialFeatureUnion(FeatureUnion): 5 | """ 6 | A `PartialFeatureUnion` is a `FeatureUnion` but able to `.partial_fit`. 7 | 8 | Arguments: 9 | transformer_list: a list of transformers to apply and concatenate 10 | 11 | Example: 12 | 13 | ```python 14 | import numpy as np 15 | from sklearn.linear_model import SGDClassifier 16 | from sklearn.feature_extraction.text import HashingVectorizer 17 | 18 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 19 | from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion 20 | 21 | pipe = PartialPipeline([ 22 | ("clean", Cleaner()), 23 | ("union", PartialFeatureUnion([ 24 | ("full_text_pipe", PartialPipeline([ 25 | ("identity", Identity()), 26 | ("hash1", HashingVectorizer()), 27 | ])), 28 | ("hyphen_pipe", PartialPipeline([ 29 | ("hyphen", HyphenTextPrep()), 30 | ("hash2", HashingVectorizer()), 31 | ])) 32 | ])), 33 | ("clf", SGDClassifier()) 34 | ]) 35 | 36 | X = [ 37 | "i really like this post", 38 | "thanks for that comment", 39 | "i enjoy this friendly forum", 40 | "this is a bad post", 41 | "i dislike this article", 42 | "this is not well written" 43 | ] 44 | 45 | y = np.array([1, 1, 1, 0, 0, 0]) 46 | 47 | for loop in range(3): 48 | pipe.partial_fit(X, y, classes=[0, 1]) 49 | 50 | assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0])) 51 | ``` 52 | """ 53 | 54 | def partial_fit(self, X, y=None, classes=None, **kwargs): 55 | """ 56 | Fits the components, but allow for batches. 57 | """ 58 | for name, step in self.transformer_list: 59 | if not hasattr(step, "partial_fit"): 60 | raise ValueError( 61 | f"Step {name} is a {step} which does not have `.partial_fit` implemented." 
62 | ) 63 | for name, step in self.transformer_list: 64 | if hasattr(step, "predict"): 65 | step.partial_fit(X, y, classes=classes, **kwargs) 66 | else: 67 | step.partial_fit(X, y) 68 | return self 69 | 70 | 71 | def make_partial_union(*transformer_list): 72 | """ 73 | Utility function to generate a `PartialFeatureUnion` 74 | 75 | Arguments: 76 | transformer_list: a list of transformers to apply and concatenate 77 | 78 | Example: 79 | 80 | ```python 81 | import numpy as np 82 | from sklearn.linear_model import SGDClassifier 83 | from sklearn.feature_extraction.text import HashingVectorizer 84 | 85 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 86 | from tokenwiser.pipeline import make_partial_pipeline, make_partial_union 87 | 88 | pipe = make_partial_pipeline( 89 | Cleaner(), 90 | make_partial_union( 91 | make_partial_pipeline(Identity(), HashingVectorizer()), 92 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 93 | ), 94 | SGDClassifier() 95 | ) 96 | 97 | X = [ 98 | "i really like this post", 99 | "thanks for that comment", 100 | "i enjoy this friendly forum", 101 | "this is a bad post", 102 | "i dislike this article", 103 | "this is not well written" 104 | ] 105 | 106 | y = np.array([1, 1, 1, 0, 0, 0]) 107 | 108 | for loop in range(3): 109 | pipe.partial_fit(X, y, classes=[0, 1]) 110 | 111 | assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0])) 112 | ``` 113 | """ 114 | return PartialFeatureUnion(_name_estimators(transformer_list)) 115 | -------------------------------------------------------------------------------- /tokenwiser/proj/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import TransformerMixin, BaseEstimator 3 | from sklearn.utils import check_array 4 | from sklearn.utils.validation import check_is_fitted 5 | 6 | 7 | class BinaryRandomProjection(BaseEstimator, TransformerMixin): 8 | def __init__(self, n_components=100, random_seed=42, threshold=0.0): 9 | self.n_components = n_components 10 | self.random_seed = random_seed 11 | self.threshold = threshold 12 | 13 | def fit(self, X, y=None): 14 | X = check_array(X) 15 | np.random.seed(self.random_seed) 16 | self.proj_ = np.random.normal(0, 1, (X.shape[1], self.n_components)) 17 | return self 18 | 19 | def transform(self, X, y=None): 20 | check_is_fitted(self, ["proj_"]) 21 | return (X @ self.proj_ > self.threshold).astype(np.int8) 22 | 23 | 24 | def proj_away(x, y): 25 | """project y away from x""" 26 | return x.dot(x) / y.dot(y) * x 27 | 28 | 29 | def select_random_rows(X): 30 | i1, i2 = np.random.randint(0, X.shape[0], 2) 31 | return X[i1, :], X[i2, :] 32 | 33 | 34 | class PointSplitProjection(BaseEstimator, TransformerMixin): 35 | def __init__(self, n_components=100, random_seed=42): 36 | self.n_components = n_components 37 | self.random_seed = random_seed 38 | 39 | def fit(self, X, y=None): 40 | X = check_array(X) 41 | self.X_ = X 42 | self.indices_ = [ 43 | tuple(np.random.randint(0, X.shape[0], 2)) for t in range(self.n_components) 44 | ] 45 | return self 46 | 47 | def generate_feature_(self, new_X, i): 48 | i1, i2 = self.indices_[i] 49 | v1, v2 = self.X_[i1, :], self.X_[i2, :] 50 | m = np.array([v1, v2]).mean(axis=0) 51 | return new_X @ (proj_away(v2 - v1, m)) > m.dot(proj_away(v2 - v1, m)) 52 | 53 | def transform(self, X, y=None): 54 | check_is_fitted(self, ["X_", "indices_"]) 55 | if X.shape[1] != self.X_.shape[1]: 56 | raise ValueError( 57 | f"shapes train/transform do not match. 
{X.shape[1]} vs {self.X_.shape[1]}" 58 | ) 59 | result = np.zeros((X.shape[0], self.n_components)) 60 | for col in range(self.n_components): 61 | result[:, col] = self.generate_feature_(X, col) 62 | return result 63 | -------------------------------------------------------------------------------- /tokenwiser/textprep/__init__.py: -------------------------------------------------------------------------------- 1 | from ._hyphen import HyphenTextPrep 2 | from ._phonetic import PhoneticTextPrep 3 | from ._cleaner import Cleaner 4 | from ._morph import SpacyMorphTextPrep, SpacyLemmaTextPrep, SpacyPosTextPrep 5 | from ._yake import YakeTextPrep 6 | from ._sentpiece import SentencePiecePrep 7 | from ._identity import Identity 8 | from ._snowball import SnowballTextPrep 9 | 10 | __all__ = [ 11 | "HyphenTextPrep", 12 | "PhoneticTextPrep", 13 | "Cleaner", 14 | "SpacyMorphTextPrep", 15 | "SpacyLemmaTextPrep", 16 | "SpacyPosTextPrep", 17 | "YakeTextPrep", 18 | "Identity", 19 | "SentencePiecePrep", 20 | "SnowballTextPrep", 21 | ] 22 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_cleaner.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | 3 | from ._prep import TextPrep 4 | 5 | 6 | class Cleaner(TextPrep, BaseEstimator): 7 | """ 8 | Applies a lowercase and removes non-alphanum. 9 | 10 | Usage: 11 | 12 | ```python 13 | from tokenwiser.textprep import Cleaner 14 | 15 | single = Cleaner().encode_single("$$$5 dollars") 16 | assert single == "5 dollars" 17 | multi = Cleaner().transform(["$$$5 dollars", "#hashtag!"]) 18 | assert multi == ["5 dollars", "hashtag"] 19 | ``` 20 | """ 21 | 22 | def __init__(self): 23 | pass 24 | 25 | def encode_single(self, x: str): 26 | return "".join([c.lower() for c in x if c.isalnum() or c == " "]) 27 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_hyphen.py: -------------------------------------------------------------------------------- 1 | import pyphen 2 | from sklearn.base import BaseEstimator 3 | 4 | from ._prep import TextPrep 5 | 6 | 7 | class HyphenTextPrep(TextPrep, BaseEstimator): 8 | """ 9 | Hyphenate the text going in. 10 | 11 | Usage: 12 | 13 | ```python 14 | from tokenwiser.textprep import HyphenTextPrep 15 | 16 | multi = HyphenTextPrep().transform(["geology", "astrology"]) 17 | assert multi == ['geo logy', 'as tro logy'] 18 | ``` 19 | """ 20 | 21 | def __init__(self, lang="en_GB"): 22 | self.lang = lang 23 | self.dic = pyphen.Pyphen(lang=lang) 24 | 25 | def encode_single(self, x): 26 | return " ".join(self.dic.inserted(x).split("-", -1)) 27 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_identity.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from ._prep import TextPrep 3 | 4 | 5 | class Identity(TextPrep, BaseEstimator): 6 | """ 7 | Keeps the text as is. Can be used as a placeholder in a pipeline. 8 | 9 | Usage: 10 | 11 | ```python 12 | from tokenwiser.textprep import Identity 13 | 14 | text = ["hello", "world"] 15 | example = Identity().transform(text) 16 | 17 | assert example == ["hello", "world"] 18 | ``` 19 | 20 | The main use-case is as a placeholder. 
21 | 22 | ``` 23 | from tokenwiser.pipeline import make_concat 24 | from sklearn.pipeline import make_pipeline, make_union 25 | 26 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 27 | 28 | pipe = make_pipeline( 29 | Cleaner(), 30 | make_concat(Identity(), HyphenTextPrep()), 31 | ) 32 | ``` 33 | """ 34 | 35 | def __init__(self): 36 | pass 37 | 38 | def encode_single(self, x): 39 | return x 40 | 41 | def transform(self, X, y=None): 42 | return X 43 | 44 | def fit(self, X, y=None): 45 | return self 46 | 47 | def partial_fit(self, X, y=None): 48 | return self 49 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_morph.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | 3 | from ._prep import TextPrep 4 | 5 | 6 | class SpacyMorphTextPrep(TextPrep, BaseEstimator): 7 | """ 8 | Adds morphologic information to tokens in text. 9 | 10 | Usage: 11 | 12 | ```python 13 | import spacy 14 | from tokenwiser.textprep import SpacyMorphTextPrep 15 | 16 | nlp = spacy.load("en_core_web_sm") 17 | example1 = SpacyMorphTextPrep(nlp).encode_single("quick! duck!") 18 | example2 = SpacyMorphTextPrep(nlp).encode_single("hey look a duck") 19 | 20 | assert example1 == "quick|Degree=Pos !|PunctType=Peri duck|Number=Sing !|PunctType=Peri" 21 | assert example2 == "hey| look|VerbForm=Inf a|Definite=Ind|PronType=Art duck|Number=Sing" 22 | ``` 23 | """ 24 | 25 | def __init__(self, model, lemma: bool = False): 26 | self.model = model 27 | self.lemma = lemma 28 | 29 | def encode_single(self, text): 30 | return " ".join( 31 | [ 32 | f"{t.text if not self.lemma else t.lemma_}|{t.morph}" 33 | for t in self.model(text) 34 | ] 35 | ) 36 | 37 | 38 | class SpacyPosTextPrep(TextPrep, BaseEstimator): 39 | """ 40 | Adds part of speech information per token using spaCy. 41 | 42 | Arguments: 43 | model: the spaCy model to use 44 | lemma: also lemmatize the text 45 | fine_grained: use fine grained parts of speech 46 | 47 | Usage: 48 | 49 | ```python 50 | import spacy 51 | from tokenwiser.textprep import SpacyPosTextPrep 52 | 53 | nlp = spacy.load("en_core_web_sm") 54 | example1 = SpacyPosTextPrep(nlp).encode_single("we need to duck") 55 | example2 = SpacyPosTextPrep(nlp).encode_single("hey look a duck") 56 | 57 | assert example1 == "we|PRON need|VERB to|PART duck|VERB" 58 | assert example2 == "hey|INTJ look|VERB a|DET duck|NOUN" 59 | ``` 60 | """ 61 | 62 | def __init__(self, model, lemma: bool = False, fine_grained: bool = False): 63 | self.model = model 64 | self.lemma = lemma 65 | self.fine_grained = fine_grained 66 | 67 | def encode_single(self, text): 68 | return " ".join( 69 | [ 70 | f"{t.text if not self.lemma else t.lemma_}|{t.tag_ if self.fine_grained else t.pos_}" 71 | for t in self.model(text) 72 | ] 73 | ) 74 | 75 | 76 | class SpacyLemmaTextPrep(TextPrep, BaseEstimator): 77 | """ 78 | Turns each token into a lemmatizer version using spaCy. 
79 | 80 | Usage: 81 | 82 | ```python 83 | import spacy 84 | from tokenwiser.textprep import SpacyLemmaTextPrep 85 | 86 | nlp = spacy.load("en_core_web_sm") 87 | example1 = SpacyLemmaTextPrep(nlp).encode_single("we are running") 88 | example2 = SpacyLemmaTextPrep(nlp).encode_single("these are dogs") 89 | 90 | assert example1 == 'we be run' 91 | assert example2 == 'these be dog' 92 | ``` 93 | """ 94 | 95 | def __init__(self, model, stop=False): 96 | self.stop = stop 97 | self.model = model 98 | 99 | def encode_single(self, text): 100 | if self.stop: 101 | return " ".join([t.lemma_ for t in self.model(text) if not t.is_stop]) 102 | return " ".join([t.lemma_ for t in self.model(text)]) 103 | 104 | def transform(self, X, y=None): 105 | return [" ".join([t.lemma_ for t in d]) for d in self.model.pipe(X)] 106 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_phonetic.py: -------------------------------------------------------------------------------- 1 | import jellyfish 2 | from sklearn.base import BaseEstimator 3 | 4 | from ._prep import TextPrep 5 | 6 | 7 | class PhoneticTextPrep(TextPrep, BaseEstimator): 8 | """ 9 | The ProneticPrep object prepares strings by encoding them phonetically. 10 | 11 | Arguments: 12 | kind: type of encoding, either `"soundex"`, "`metaphone`" or `"nysiis"` 13 | 14 | Usage: 15 | 16 | ```python 17 | import spacy 18 | from tokenwiser.textprep import PhoneticTextPrep 19 | 20 | nlp = spacy.load("en_core_web_sm") 21 | example1 = PhoneticTextPrep(kind="soundex").transform(["dinosaurus book"]) 22 | example2 = PhoneticTextPrep(kind="metaphone").transform(["dinosaurus book"]) 23 | example3 = PhoneticTextPrep(kind="nysiis").transform(["dinosaurus book"]) 24 | 25 | assert example1[0] == 'D526 B200' 26 | assert example2[0] == 'TNSRS BK' 27 | assert example3[0] == 'DANASAR BAC' 28 | ``` 29 | """ 30 | 31 | def __init__(self, kind="soundex"): 32 | methods = { 33 | "soundex": jellyfish.soundex, 34 | "metaphone": jellyfish.metaphone, 35 | "nysiis": jellyfish.nysiis, 36 | } 37 | self.kind = kind 38 | self.method = methods[kind] 39 | 40 | def encode_single(self, x): 41 | return " ".join([self.method(d) for d in x.split(" ")]) 42 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_prep.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class TextPrep(ABC): 5 | def fit(self, X, y=None): 6 | """Fits the `TextPrep` step. Considered a no-op.""" 7 | return self 8 | 9 | def partial_fit(self, X, y=None): 10 | """Partially fits the `TextPrep` step. 
Considered a no-op.""" 11 | return self 12 | 13 | @abstractmethod 14 | def encode_single(self, x): 15 | pass 16 | 17 | def pipe(self, X): 18 | for x in X: 19 | yield self.encode_single(x) 20 | 21 | def transform(self, X, y=None): 22 | return [self.encode_single(x) for x in X] 23 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_sentpiece.py: -------------------------------------------------------------------------------- 1 | from urllib.error import HTTPError 2 | import urllib.request 3 | from typing import Union 4 | from pathlib import Path 5 | 6 | import sentencepiece as spm 7 | from sklearn.base import BaseEstimator 8 | 9 | from ._prep import TextPrep 10 | 11 | 12 | class SentencePiecePrep(TextPrep, BaseEstimator): 13 | """ 14 | The SentencePiecePrep object splits text into subtokens based on a pre-trained model. 15 | 16 | You can find many pre-trained subtokenizers via the [bpemb](https://nlp.h-its.org/bpemb/) project. 17 | For example, on the [English](https://nlp.h-its.org/bpemb/en/) sub-site you can find many 18 | models for different vocabulary sizes. Note that this site supports 275 pre-trained 19 | subword tokenizers. 20 | 21 | Note that you can train your own sentencepiece tokenizer as well. 22 | 23 | ```python 24 | import sentencepiece as spm 25 | 26 | # This saves a file named `mod.model` which can be read in later. 27 | spm.SentencePieceTrainer.train('--input=tests/data/nlp.txt --model_prefix=mod --vocab_size=2000') 28 | ``` 29 | 30 | Arguments: 31 | model_file: pre-trained model file 32 | 33 | Usage: 34 | 35 | ```python 36 | from tokenwiser.textprep import SentencePiecePrep 37 | sp_tfm = SentencePiecePrep(model_file="tests/data/en.vs5000.model") 38 | 39 | texts = ["talking about geology"] 40 | example = sp_tfm.transform(texts) 41 | assert example == ['▁talk ing ▁about ▁ge ology'] 42 | ``` 43 | """ 44 | 45 | def __init__(self, model_file: Union[str, Path]): 46 | self.model_file = model_file 47 | self.spm = spm.SentencePieceProcessor(model_file=str(model_file)) 48 | 49 | def encode_single(self, x): 50 | return " ".join(self.spm.encode_as_pieces(x)) 51 | 52 | @classmethod 53 | def download(self, lang: str, vocab_size: int, filename: str = None): 54 | """ 55 | Download a pre-trained model from the bpemb project. 56 | 57 | You can see some examples of pre-trained models on the [English](https://nlp.h-its.org/bpemb/en/) sub-site. 58 | There are many languages available, but you should take care that you pick the right 59 | vocabulary size. 60 | 61 | Arguments: 62 | lang: language code 63 | vocab_size: vocab size, can be 1000, 3000, 5000, 10000, 25000, 50000, 100000, 200000 64 | """ 65 | url = f"https://bpemb.h-its.org/{lang}/{lang}.wiki.bpe.vs{vocab_size}.model" 66 | if not filename: 67 | filename = f"{lang}.wiki.bpe.vs{vocab_size}.model" 68 | try: 69 | urllib.request.urlretrieve(url=url, filename=filename) 70 | except HTTPError: 71 | raise ValueError(f"Double check if the language ({lang}) and voacb size ({vocab_size}) combo exist.") 72 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_snowball.py: -------------------------------------------------------------------------------- 1 | import snowballstemmer 2 | from sklearn.base import BaseEstimator 3 | 4 | from ._prep import TextPrep 5 | 6 | 7 | 8 | 9 | class SnowballTextPrep(TextPrep, BaseEstimator): 10 | """ 11 | Applies the snowball stemmer to the text. 
12 | 13 | There are 26 languages supported, for the full list check the list on the 14 | lefthand side on [pypi](https://pypi.org/project/snowballstemmer/). 15 | 16 | Usage: 17 | 18 | ```python 19 | from tokenwiser.textprep import SnowballTextPrep 20 | 21 | single = SnowballTextPrep(language='english').encode_single("Dogs like running") 22 | assert single == "Dog like run" 23 | multi = Cleaner().transform(["Dogs like running", "Cats like sleeping"]) 24 | assert multi == ["Dog like run", "Cat like sleep"] 25 | ``` 26 | """ 27 | 28 | def __init__(self, language='english'): 29 | self.stemmer = snowballstemmer.stemmer(language) 30 | 31 | def encode_single(self, x: str): 32 | return " ".join(self.stemmer.stemWords(x)) 33 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_yake.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | import yake 3 | 4 | from ._prep import TextPrep 5 | 6 | 7 | class YakeTextPrep(TextPrep, BaseEstimator): 8 | """ 9 | Remove all text except meaningful key-phrases. Uses [yake](https://github.com/LIAAD/yake). 10 | 11 | Arguments: 12 | top_n: number of key-phrases to select 13 | unique: only return unique keywords from the key-phrases 14 | 15 | Usage: 16 | 17 | ```python 18 | from tokenwiser.textprep import YakeTextPrep 19 | 20 | text = ["Sources tell us that Google is acquiring Kaggle, a platform that hosts data science and machine learning"] 21 | example = YakeTextPrep(top_n=3, unique=False).transform(text) 22 | 23 | assert example[0] == 'hosts data science acquiring kaggle google is acquiring' 24 | ``` 25 | """ 26 | 27 | def __init__(self, top_n: int = 5, unique: bool = False): 28 | self.top_n = top_n 29 | self.unique = unique 30 | self.extractor = yake.KeywordExtractor(top=self.top_n) 31 | 32 | def encode_single(self, text): 33 | texts = " ".join([t[0] for t in self.extractor.extract_keywords(text)]) 34 | if not self.unique: 35 | return texts 36 | return " ".join(set(texts.split(" "))) 37 | -------------------------------------------------------------------------------- /tokenwiser/tok/__init__.py: -------------------------------------------------------------------------------- 1 | from ._whitespace import WhiteSpaceTokenizer 2 | from ._spacy import SpacyTokenizer 3 | 4 | __all__ = ["WhiteSpaceTokenizer", "SpacyTokenizer"] 5 | -------------------------------------------------------------------------------- /tokenwiser/tok/_spacy.py: -------------------------------------------------------------------------------- 1 | from tokenwiser.tok._tok import Tok 2 | 3 | from sklearn.base import BaseEstimator 4 | 5 | 6 | class SpacyTokenizer(Tok, BaseEstimator): 7 | """ 8 | A tokenizer that uses spaCy under the hood for the tokenization. 9 | 10 | Arguments: 11 | model: reference to the spaCy model 12 | lemma: weather or not to also apply lemmatization 13 | stop: weather or not to remove stopwords 14 | 15 | Usage: 16 | 17 | ```python 18 | import spacy 19 | from tokenwiser.tok import SpacyTokenizer 20 | 21 | # This can also be a Non-English model. 
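# e.g. a German pipeline such as "de_core_news_sm" works just as well.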
22 | nlp = spacy.load("en_core_web_sm") 23 | tok = SpacyTokenizer(model=nlp) 24 | 25 | single = tok("hello world") 26 | assert single == ["hello", "world"] 27 | ``` 28 | """ 29 | 30 | def __init__(self, model, lemma=False, stop=False): 31 | self.model = model 32 | self.lemma = lemma 33 | self.stop = stop 34 | 35 | def __call__(self, text): 36 | if self.stop: 37 | return [ 38 | t.lemma_ if self.lemma else t.text 39 | for t in self.model(text) 40 | if not t.is_stop 41 | ] 42 | return [t.lemma_ if self.lemma else t.text for t in self.model(text)] 43 | -------------------------------------------------------------------------------- /tokenwiser/tok/_tok.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Tok(ABC): 5 | @abstractmethod 6 | def __call__(self, x): 7 | pass 8 | -------------------------------------------------------------------------------- /tokenwiser/tok/_whitespace.py: -------------------------------------------------------------------------------- 1 | from tokenwiser.tok._tok import Tok 2 | 3 | from sklearn.base import BaseEstimator 4 | 5 | 6 | class WhiteSpaceTokenizer(Tok, BaseEstimator): 7 | """ 8 | A simple tokenizer that simple splits on whitespace. 9 | 10 | Usage: 11 | 12 | ```python 13 | from tokenwiser.tok import WhiteSpaceTokenizer 14 | 15 | tok = WhiteSpaceTokenizer() 16 | single = tok("hello world") 17 | assert single == ["hello", "world"] 18 | ``` 19 | """ 20 | 21 | def __init__(self): 22 | pass 23 | 24 | def __call__(self, text): 25 | return [r for r in text.split(" ") if r != ""] 26 | -------------------------------------------------------------------------------- /tokenwiser/wabbit/__init__.py: -------------------------------------------------------------------------------- 1 | from ._vowpal import VowpalWabbitClassifier 2 | 3 | __all__ = ["VowpalWabbitClassifier"] 4 | -------------------------------------------------------------------------------- /tokenwiser/wabbit/_vowpal.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from vowpalwabbit import pyvw 3 | from sklearn.utils.validation import check_is_fitted 4 | from sklearn.base import BaseEstimator, ClassifierMixin 5 | 6 | 7 | class VowpalWabbitClassifier(BaseEstimator, ClassifierMixin): 8 | """ 9 | Vowpal Wabbit based text classifier. 10 | 11 | This object represents a simplified [Vowpal Wabbit](https://vowpalwabbit.org/) classifier that is 12 | compatible with scikit-learn. The only caveat is that the model expects 13 | text-arrays as opposed to numeric arrays. 
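Under the hood the classifier runs Vowpal Wabbit in one-against-all (`oaa`) mode with logistic loss and `probabilities=True`, which is what allows `predict_proba` to return per-class probabilities.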
14 | 15 | Arguments: 16 | n_loop: the number of times the fit step should apply to the training data 17 | n_gram: number of n_grams to encode as well 18 | learning_rate: the learning rate to apply while training 19 | 20 | Usage: 21 | 22 | ```python 23 | from tokenwiser.wabbit import VowpalWabbitClassifier 24 | 25 | clf = VowpalWabbitClassifier() 26 | 27 | X = [ 28 | "this is friendly", 29 | "very friendly", 30 | "i do not like you", 31 | "the sky is blue" 32 | ] 33 | 34 | y = ["pos", "pos", "neg", "neutral"] 35 | 36 | # partial fitting 37 | for x_, y_ in zip(X, y): 38 | clf.partial_fit(x_, y_, classes=["pos", "neg", "neutral"]) 39 | clf.predict(X) 40 | 41 | # batch fitting 42 | clf.fit(X, y).predict(X) 43 | ``` 44 | """ 45 | 46 | def __init__(self, n_loop: int = 1, n_gram: int = 1, learning_rate: float = 0.5): 47 | self.model = None 48 | self.n_loop = n_loop 49 | self.n_gram = n_gram 50 | self.learning_rate = learning_rate 51 | 52 | def fit(self, X, y): 53 | """ 54 | Fit the model using X, y as training data. 55 | 56 | Arguments: 57 | X: array-like, shape=(n_columns, n_samples, ) training data, must be text. 58 | y: labels 59 | """ 60 | return self.partial_fit(X, y, classes=list(set(y))) 61 | 62 | def partial_fit(self, X, y, classes): 63 | """ 64 | Incremental fit on a batch of samples. 65 | 66 | Arguments: 67 | X: array-like, shape=(n_columns, n_samples, ) training data, must be text. 68 | y: labels 69 | classes: list of all the classes in the dataset 70 | """ 71 | if not isinstance(X[0], str): 72 | raise ValueError("This model only accepts text as input.") 73 | if not self.model: 74 | self.classes_ = classes 75 | self.idx_to_cls_ = {i + 1: c for i, c in enumerate(self.classes_)} 76 | self.cls_to_idx_ = {c: i + 1 for i, c in enumerate(self.classes_)} 77 | self.model = pyvw.vw( 78 | quiet=True, 79 | oaa=len(classes), 80 | ngram=self.n_gram, 81 | learning_rate=self.learning_rate, 82 | loss_function="logistic", 83 | probabilities=True, 84 | ) 85 | for loop in range(self.n_loop): 86 | for x_, y_ in zip(X, y): 87 | try: 88 | self.model.learn(f"{self.cls_to_idx_[y_]} | {x_}") 89 | except RuntimeError as e: 90 | ex = f"{self.cls_to_idx_[y_]} | {x_}" 91 | raise RuntimeError(f"{e}\nculprit: {ex}") 92 | return self 93 | 94 | def predict_proba(self, X): 95 | """ 96 | Return probability estimates for the test vector X. 97 | 98 | Arguments: 99 | X: array-like, shape=(n_columns, n_samples, ) training data, must be text. 100 | """ 101 | check_is_fitted(self, ["classes_", "cls_to_idx_", "idx_to_cls_"]) 102 | r = np.array([self.model.predict(f"| {x}") for x in X]) 103 | return r / r.sum(axis=1).reshape(-1, 1) 104 | 105 | def predict(self, X): 106 | """ 107 | Perform classification on an array of test vectors X. 108 | 109 | Arguments: 110 | X: array-like, shape=(n_columns, n_samples, ) training data, must be text. 111 | """ 112 | argmax = self.predict_proba(X).argmax(axis=1) 113 | return np.array([self.idx_to_cls_[a + 1] for a in argmax]) 114 | --------------------------------------------------------------------------------