├── .flake8 ├── .github └── workflows │ └── unittest.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── data └── oos-intent.jsonl ├── docs ├── api │ ├── component.md │ ├── extension.md │ ├── model.md │ ├── pipeline.md │ ├── textprep.md │ └── wabbit.md ├── faq.md ├── guide │ ├── sklearn.md │ └── spacy.md ├── images │ ├── how-it-works.png │ ├── huge_sparse_array.png │ ├── make_concat.png │ ├── minipipe.png │ └── pipeline.png ├── index.md ├── logo-tokw.png └── token.png ├── mkdocs.yml ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── data │ ├── en.vs5000.model │ └── nlp.txt ├── pipeline │ ├── __init__.py │ ├── test_concat.py │ ├── test_slice.py │ └── test_union.py ├── test_common.py ├── test_docs.py ├── test_extension.py ├── test_spacy_models │ ├── __init__.py │ └── test_base_usage_architectures.py ├── test_textprep │ ├── test_hyphen.py │ ├── test_phonetic.py │ └── test_sklearn.py ├── test_tfm.py ├── test_tok │ ├── __init__.py │ └── test_whitespace.py └── test_wabbit.py ├── theme ├── token.png └── token.svg ├── token.png └── tokenwiser ├── __init__.py ├── __main__.py ├── common.py ├── component ├── __init__.py └── _sklearn.py ├── extension ├── __init__.py └── _extension.py ├── model ├── __init__.py └── sklearnmod.py ├── pipeline ├── __init__.py ├── _concat.py ├── _pipe.py └── _union.py ├── proj └── __init__.py ├── textprep ├── __init__.py ├── _cleaner.py ├── _hyphen.py ├── _identity.py ├── _morph.py ├── _phonetic.py ├── _prep.py ├── _sentpiece.py ├── _snowball.py └── _yake.py ├── tok ├── __init__.py ├── _spacy.py ├── _tok.py └── _whitespace.py └── wabbit ├── __init__.py └── _vowpal.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | per-file-ignores = 3 | clumper/__init__.py: F401 4 | max-line-length = 160 5 | ignore = E203 -------------------------------------------------------------------------------- /.github/workflows/unittest.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | strategy: 15 | matrix: 16 | os: [ubuntu-latest] 17 | python-version: [3.7, 3.8, 3.9] 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install General Dependencies 26 | run: | 27 | python -m pip install --upgrade pip setuptools wheel 28 | pip install -e ".[dev]" 29 | python -m spacy download en_core_web_sm 30 | - name: Test with pytest 31 | run: | 32 | pytest --verbose tests 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .idea 131 | *.ipynb 132 | *.model 133 | *.csv 134 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | black: 2 | black tokenwiser tests setup.py --check 3 | 4 | flake: 5 | flake8 tokenwiser tests setup.py 6 | 7 | test: 8 | pytest 9 | 10 | check: black flake test 11 | 12 | install: 13 | python -m pip install -e . 14 | 15 | install-dev: 16 | python -m pip install -e ".[dev]" 17 | pre-commit install 18 | 19 | install-test: 20 | python -m pip install -e ".[test]" 21 | python -m pip install -e ".[all]" 22 | 23 | pypi: 24 | python setup.py sdist 25 | python setup.py bdist_wheel --universal 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # tokenwiser 4 | 5 | > Bag of, not words, but tricks! 6 | 7 | This project contains a couple of "tricks" on tokens. It's a collection 8 | of tricks for sparse data that might be trained on a stream of data too. 9 | 10 | While exploring these tricks was super fun, I do feel like there are plenty 11 | of better alternatives than the ideas I explore here. In the end, TfIDF + LogReg 12 | can be "fine" for a bunch of tasks that don't require embeddings. 13 | 14 | And for embeddings ... there's [embetter](https://github.com/koaning/embetter). 15 | 16 | So I archived this repo. Bit of a shame, because I _really_ liked the name of this package. 17 | -------------------------------------------------------------------------------- /docs/api/component.md: -------------------------------------------------------------------------------- 1 | # `component` 2 | 3 | ```python 4 | from tokenwiser.component import * 5 | ``` 6 | 7 | In the `component` submodule you can find spaCy compatible components. 8 | 9 | ::: tokenwiser.component.attach_sklearn_categoriser 10 | rendering: 11 | show_root_full_path: false 12 | show_root_heading: true 13 | -------------------------------------------------------------------------------- /docs/api/extension.md: -------------------------------------------------------------------------------- 1 | # `extension` 2 | 3 | ```python 4 | from tokenwiser.extension import * 5 | ``` 6 | 7 | In the `extension` submodule you can find spaCy compatible extensions. 8 | 9 | ::: tokenwiser.extension.attach_hyphen_extension 10 | rendering: 11 | show_root_full_path: false 12 | show_root_heading: true 13 | 14 | 15 | ::: tokenwiser.extension.sklearn_method 16 | rendering: 17 | show_root_full_path: false 18 | show_root_heading: true 19 | -------------------------------------------------------------------------------- /docs/api/model.md: -------------------------------------------------------------------------------- 1 | # `model` 2 | 3 | ```python 4 | from tokenwiser.model import * 5 | ``` 6 | 7 | In the `model` submodule you can find scikit-learn pipelines that are trainable via spaCy. 8 | These pipelines apply the `.partial_fit().predict()`-design which makes them compliant with 9 | the `spacy train` command. 
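The scikit-learn side of a `SklearnCat` component is a registered spaCy architecture that returns a `PartialPipeline` built from estimators that implement `.partial_fit`. A minimal sketch of such a registration, mirroring the `sklearn_model_basic_sgd.v1` architecture shown in the FAQ (the name `my_sgd_model.v1` is only an illustration):

```python
from spacy import registry
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

from tokenwiser.pipeline import PartialPipeline


@registry.architectures("my_sgd_model.v1")
def make_my_sgd_model():
    # Every step must support `.partial_fit` so the component can learn via `nlp.update`.
    return PartialPipeline([("hash", HashingVectorizer()), ("lr", SGDClassifier(loss="log"))])
```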
10 | 11 | ::: tokenwiser.model.SklearnCat 12 | rendering: 13 | show_root_full_path: false 14 | show_root_heading: true 15 | -------------------------------------------------------------------------------- /docs/api/pipeline.md: -------------------------------------------------------------------------------- 1 | # `pipeline` 2 | 3 | ```python 4 | from tokenwiser.pipeline import * 5 | ``` 6 | 7 | In the `pipeline` submodule you can find scikit-learn compatible 8 | pipelines that extend the standard behavior. 9 | 10 | ::: tokenwiser.pipeline.PartialPipeline 11 | rendering: 12 | show_root_full_path: false 13 | show_root_heading: true 14 | 15 | ::: tokenwiser.pipeline.TextConcat 16 | rendering: 17 | show_root_full_path: false 18 | show_root_heading: true 19 | selection: 20 | members: 21 | - partial_fit 22 | 23 | ::: tokenwiser.pipeline.PartialFeatureUnion 24 | rendering: 25 | show_root_full_path: false 26 | show_root_heading: true 27 | 28 | ::: tokenwiser.pipeline.make_partial_pipeline 29 | rendering: 30 | show_root_full_path: false 31 | show_root_heading: true 32 | 33 | ::: tokenwiser.pipeline.make_concat 34 | rendering: 35 | show_root_full_path: false 36 | show_root_heading: true 37 | 38 | ::: tokenwiser.pipeline.make_partial_union 39 | rendering: 40 | show_root_full_path: false 41 | show_root_heading: true 42 | -------------------------------------------------------------------------------- /docs/api/textprep.md: -------------------------------------------------------------------------------- 1 | # `textprep` 2 | 3 | ```python 4 | from tokenwiser.textprep import * 5 | ``` 6 | 7 | In the `textprep` submodule you can find scikit-learn compatible 8 | components that transform text into another type of text. The idea 9 | is that these may be combined in interesting ways inside a `CountVectorizer`.
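As a small illustrative sketch (based on the examples in the guide), a text preparation step can sit right in front of a `CountVectorizer`, because text goes in and text comes out:

```python
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

from tokenwiser.textprep import HyphenTextPrep

# "geology" becomes "geo logy", so the CountVectorizer also counts subword tokens.
pipe = make_pipeline(HyphenTextPrep(), CountVectorizer())
X_sparse = pipe.fit_transform(["geology", "astrology"])
```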
10 | 11 | ::: tokenwiser.textprep.Cleaner 12 | rendering: 13 | show_root_full_path: false 14 | show_root_heading: true 15 | 16 | ::: tokenwiser.textprep.Identity 17 | selection: 18 | members: 19 | - no 20 | rendering: 21 | show_root_full_path: false 22 | show_root_heading: true 23 | 24 | ::: tokenwiser.textprep.HyphenTextPrep 25 | selection: 26 | members: 27 | - fit 28 | - transform 29 | rendering: 30 | show_root_full_path: false 31 | show_root_heading: true 32 | 33 | ::: tokenwiser.textprep.SentencePiecePrep 34 | rendering: 35 | show_root_full_path: false 36 | show_root_heading: true 37 | 38 | ::: tokenwiser.textprep.PhoneticTextPrep 39 | rendering: 40 | show_root_full_path: false 41 | show_root_heading: true 42 | 43 | ::: tokenwiser.textprep.YakeTextPrep 44 | rendering: 45 | show_root_full_path: false 46 | show_root_heading: true 47 | 48 | ::: tokenwiser.textprep.SpacyMorphTextPrep 49 | rendering: 50 | show_root_full_path: false 51 | show_root_heading: true 52 | 53 | ::: tokenwiser.textprep.SpacyPosTextPrep 54 | rendering: 55 | show_root_full_path: false 56 | show_root_heading: true 57 | 58 | ::: tokenwiser.textprep.SpacyLemmaTextPrep 59 | rendering: 60 | show_root_full_path: false 61 | show_root_heading: true 62 | 63 | ::: tokenwiser.textprep.SnowballTextPrep 64 | rendering: 65 | show_root_full_path: false 66 | show_root_heading: true 67 | -------------------------------------------------------------------------------- /docs/api/wabbit.md: -------------------------------------------------------------------------------- 1 | # `wabbit` 2 | 3 | ```python 4 | from tokenwiser.wabbit import * 5 | ``` 6 | 7 | In the `wabbit` submodule you can find a scikit-learn 8 | component based on [vowpal wabbit](https://vowpalwabbit.org/). 9 | 10 | ::: tokenwiser.wabbit.VowpalWabbitClassifier 11 | rendering: 12 | show_root_full_path: false 13 | show_root_heading: true 14 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | ## Why can't I use normal `Pipeline` objects with the spaCy API? 2 | 3 | Scikit-Learn assumes that data is trained via `.fit(X, y).predict(X)`. This is great 4 | when you've got a dataset fully in memory but it's not so great when your dataset is 5 | too big to fit in one go. This is one of the main reasons why spaCy has an `.update()` 6 | API for their trainable pipeline components. It's similar to `.partial_fit(X)` in 7 | scikit-learn. You wouldn't train on a single batch of data. Instead you would iteratively 8 | train on subsets of the dataset. 9 | 10 | A big downside of the `Pipeline` API is that it cannot use `.partial_fit(X)`. 11 | Even if all the components on the inside are compatible, it forces you to use `.fit(X)`. 12 | That is why this library offers a `PartialPipeline`. It only allows for components that have `.partial_fit` 13 | implemented and it's these pipelines that can also comply with spaCy's `.update()` 14 | API. 15 | 16 | Note that all scikit-learn components offered by this library are compatible with 17 | the `PartialPipeline`. This includes everything from the `tokenwiser.textprep` submodule. 18 | 19 | ## Can I train spaCy with scikit-learn from Jupyter? 20 | 21 | It's not our favorite way of doing things, but nobody is stopping you.
22 | 23 | ```python 24 | import spacy 25 | from spacy import registry 26 | from spacy.training import Example 27 | from spacy.language import Language 28 | 29 | from tokenwiser.pipeline import PartialPipeline 30 | from tokenwiser.model.sklearnmod import SklearnCat 31 | from sklearn.feature_extraction.text import HashingVectorizer 32 | from sklearn.linear_model import SGDClassifier 33 | 34 | @Language.factory("custom-sklearn-cat") 35 | def make_sklearn_cat(nlp, name, sklearn_model, label, classes): 36 | return SklearnCat(nlp, name, sklearn_model, label, classes) 37 | 38 | @registry.architectures("sklearn_model_basic_sgd.v1") 39 | def make_sklearn_cat_basic_sgd(): 40 | """This creates a *partial* pipeline. We can't use a standard pipeline from scikit-learn.""" 41 | return PartialPipeline([("hash", HashingVectorizer()), ("lr", SGDClassifier(loss="log"))]) 42 | 43 | 44 | nlp = spacy.load("en_core_web_sm") 45 | config = { 46 | "sklearn_model": "@sklearn_model_basic_sgd.v1", 47 | "label": "pos", 48 | "classes": ["pos", "neg"] 49 | } 50 | nlp.add_pipe("custom-sklearn-cat", config=config) 51 | 52 | texts = [ 53 | "you are a nice person", 54 | "this is a great movie", 55 | "i do not like coffee", 56 | "i hate tea" 57 | ] 58 | labels = ["pos", "pos", "neg", "neg"] 59 | 60 | # This is the training loop just for our categorizer model. 61 | with nlp.select_pipes(enable="custom-sklearn-cat"): 62 | optimizer = nlp.resume_training() 63 | for loop in range(10): 64 | for t, lab in zip(texts, labels): 65 | doc = nlp.make_doc(t) 66 | example = Example.from_dict(doc, {"cats": {"pos": lab}}) 67 | nlp.update([example], sgd=optimizer) 68 | 69 | nlp("you are a nice person").cats # {'pos': 0.9979167909733176} 70 | nlp("coffee i do not like").cats # {'neg': 0.990049724779963} 71 | ``` -------------------------------------------------------------------------------- /docs/guide/sklearn.md: -------------------------------------------------------------------------------- 1 | Scikit-Learn pipelines are amazing but they are not perfect for simple text use-cases. 2 | 3 | - The standard pipeline does not allow for interactive learning. You can 4 | apply `.fit` but that's it. Even if the tools inside of the pipeline have 5 | a `.partial_fit` available, the pipeline doesn't allow it. 6 | - The `CountVectorizer` is great, but we could use some extra tricks 7 | that are specialized towards text to make this object more effective. 8 | 9 | Part of what this library does is provide extra tools that extend scikit-learn for simple 10 | text classification problems. In this document we will showcase some of the main features. 11 | 12 | ## Text Preparation Tools 13 | 14 | Let's first discuss a basic pipeline for text inside of scikit-learn. 15 | 16 | ### Base Pipeline 17 | 18 | The simplest text classification pipeline in scikit-learn looks like this: 19 | 20 | ![](../images/minipipe.png) 21 | 22 | ```python 23 | from sklearn.pipeline import make_pipeline 24 | from sklearn.feature_extraction.text import CountVectorizer 25 | from sklearn.linear_model import SGDClassifier 26 | 27 | pipe = make_pipeline( 28 | CountVectorizer(), 29 | SGDClassifier() 30 | ) 31 | ``` 32 | 33 | This pipeline will encode words as sparse features before passing them on to the linear classifier. 34 | This pattern is very common and has proven to work well enough for many English text classification tasks.
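As a quick illustration with a tiny made-up dataset, this `pipe` trains and predicts like any other scikit-learn estimator:

```python
# `pipe` is the CountVectorizer + SGDClassifier pipeline defined above.
X = [
    "i really like this post",
    "thanks for that comment",
    "this is a bad post",
    "i dislike this article",
]
y = [1, 1, 0, 0]

pipe.fit(X, y)
pipe.predict(["what a friendly comment"])  # an array with a single predicted label
```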
35 | 36 | ![](../images/how-it-works.png) 37 | 38 | The nice thing about using an `SGDClassifier` is that we're able to learn from our data even if the dataset 39 | does not fit in memory. We can call `.partial_fit` instead of `.fit` and learn in a more "online" setting. 40 | 41 | That said, there are things we can do even to this pipeline to make it better. 42 | 43 | ### Spelling Errors 44 | 45 | When you are classifying online texts you are often confronted with spelling errors. To 46 | deal with this you'd typically use a [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) 47 | with a character-level analyzer such that you also encode subwords. 48 | 49 | ![](../images/huge_sparse_array.png) 50 | 51 | With all of these subwords around, we'll be more robust against spelling errors. 52 | The downside of this approach is that you might wonder if we really *need* all these subwords. So how about this: 53 | let's add a step that will turn our text into subwords by splitting words at their hyphenation points. 54 | 55 | ```python 56 | from tokenwiser.textprep import HyphenTextPrep 57 | 58 | multi = HyphenTextPrep().transform(["geology", "astrology"]) 59 | 60 | assert multi == ['geo logy', 'as tro logy'] 61 | ``` 62 | 63 | The `HyphenTextPrep` preprocessor is a `TextPrep`-object. For all intents and purposes these are 64 | scikit-learn compatible preprocessing components but they all output strings instead of arrays. What's 65 | nice about these though is that you can "retokenize" the original text. This allows you to use the 66 | subtokens as if they were tokens, which might help keep your pipelines lightweight while still keeping 67 | them robust against certain spelling errors. 68 | 69 | ### Long Texts 70 | 71 | There are some other tricks that you might want to apply for longer texts. Maybe you want to summarise a text before 72 | vectorizing it, so it'd be nice to have a transformer that keeps only the most important tokens. 73 | 74 | A neat heuristic toolkit for this is [yake](https://github.com/LIAAD/yake) (you can find a demo 75 | [here](http://yake.inesctec.pt/demo/sample/sample1)). This package also features a scikit-learn compatible component for it. 76 | 77 | ```python 78 | from tokenwiser.textprep import YakeTextPrep 79 | 80 | text = [ 81 | "Sources tell us that Google is acquiring Kaggle, \ 82 | a platform that hosts data science and machine learning" 83 | ] 84 | example = YakeTextPrep(top_n=3, unique=False).transform(text) 85 | 86 | assert example[0] == 'hosts data science acquiring kaggle google is acquiring' 87 | ``` 88 | 89 | The idea here is to reduce the text down to only the most important words. Again, this trick 90 | might keep the pipeline lightweight and it will go a lot further than most "stopword"-lists. 91 | 92 | ### Bag of Tricks! 93 | 94 | The goal of this library is to host a few meaningful tricks that might be helpful. Here are some more: 95 | 96 | - `Cleaner` lowercases text and removes all non-alphanumeric characters. 97 | - `Identity` just keeps the text as-is, which is useful when constructing elaborate pipelines. 98 | - `PhoneticTextPrep` translates text into a phonetic encoding. 99 | - `SpacyPosTextPrep` adds part-of-speech information to the text using spaCy. 100 | - `SpacyLemmaTextPrep` lemmatizes the text using spaCy. 101 | 102 | All of these tools are part of the `textprep` submodule and are documented in detail 103 | [here](https://koaning.github.io/tokenwiser/api/textprep.html).
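To give a quick feel for these tools, here is a small sketch of two of them in isolation; the outputs in the comments are indicative:

```python
from tokenwiser.textprep import Cleaner, PhoneticTextPrep

# Lowercase the text and strip non-alphanumeric characters.
Cleaner().transform(["Hello WORLD!!"])  # ['hello world']

# Encode every word phonetically, here with the soundex algorithm.
PhoneticTextPrep(kind="soundex").transform(["hello there world"])  # ['H400 T600 W643']
```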
104 | 105 | ## Pipeline Tools 106 | 107 | Pipeline components are certainly nice, but maybe we can go a step further and make 108 | better pipelines for text too! 109 | 110 | ### Concatenate Text 111 | 112 | In scikit-learn you would use `FeatureUnion` or `make_union` to concatenate features in 113 | a pipeline. It is assumed that transformers output arrays that need to be concatenated, so the 114 | result of a concatenation is always a 2D array. This can be a bit awkward if you're using text preprocessors. 115 | 116 | ![](../images/make_concat.png) 117 | 118 | The reason why we want to keep everything a string is so that the `CountVectorizer` from scikit-learn 119 | can properly encode it. That is why this library comes with a special union 120 | component: `TextConcat`. It concatenates the output of text-prep tools into a string instead of 121 | an array. Note that we also ship a convenient `make_concat` helper function. 122 | 123 | ```python 124 | from sklearn.pipeline import make_pipeline 125 | 126 | from tokenwiser.pipeline import make_concat 127 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 128 | 129 | pipe = make_pipeline( 130 | Cleaner(), 131 | make_concat(Identity(), HyphenTextPrep()), 132 | ) 133 | 134 | output = pipe.fit_transform(["hello astrology!!!!"]) 135 | assert output == ['hello astrology hel lo astro logy'] 136 | ``` 137 | 138 | Again, we see that we're taking a text input and generating a text output. The `make_concat` step 139 | makes sure that we concatenate strings, not arrays! This is great when we want to follow up with 140 | a `CountVectorizer`! 141 | 142 | ```python 143 | from sklearn.pipeline import make_pipeline 144 | from sklearn.linear_model import LogisticRegression 145 | from sklearn.feature_extraction.text import CountVectorizer 146 | 147 | from tokenwiser.pipeline import make_concat 148 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 149 | 150 | pipe = make_pipeline( 151 | Cleaner(), 152 | make_concat(Identity(), HyphenTextPrep()), 153 | CountVectorizer(), 154 | LogisticRegression() 155 | ) 156 | ``` 157 | 158 | The mental picture for this `pipe` pipeline looks like the diagram below. 159 | 160 | ![](../images/pipeline.png) 161 | 162 | ### Partial Fit 163 | 164 | We can go a step further though. The scikit-learn pipeline follows the `fit/predict` API, which 165 | means that we cannot use `.partial_fit()`, even if all the components in the pipeline are compatible 166 | with the `partial_fit/predict` API. That is why this library also introduces components for mini-batch 167 | learning: `PartialPipeline` and `make_partial_pipeline`. 168 | 169 | In these scenarios you will need to swap out the `CountVectorizer` for a `HashingVectorizer` in order to 170 | be able to learn from new data coming in. 171 | 172 | ```python 173 | from sklearn.linear_model import SGDClassifier 174 | from sklearn.feature_extraction.text import HashingVectorizer 175 | 176 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 177 | from tokenwiser.pipeline import make_concat, make_partial_pipeline 178 | 179 | pipe = make_partial_pipeline( 180 | Cleaner(), 181 | make_concat(Identity(), HyphenTextPrep()), 182 | HashingVectorizer(), 183 | SGDClassifier() 184 | ) 185 | ``` 186 | 187 | This `pipe` pipeline is scikit-learn compatible for all intents and purposes, 188 | but it has the option of learning from batches of data via `partial_fit`. This is great
This is great 189 | because it means that you're able to classify text even when it doesn't fit into memory! 190 | 191 | > Note that all of the `TextPrep`-components in this library allow for `partial_fit`. 192 | 193 | To make a `partial_fit` actually work you will need to supply the names of the `classes` 194 | at learning time. Otherwise you might accidentally get a batch that only contains one class 195 | and the algorithm would become numerically unstable. 196 | 197 | ```python 198 | import numpy as np 199 | from sklearn.linear_model import SGDClassifier 200 | from sklearn.feature_extraction.text import HashingVectorizer 201 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 202 | from tokenwiser.pipeline import make_partial_pipeline, make_partial_union 203 | 204 | pipe = make_partial_pipeline( 205 | Cleaner(), 206 | make_partial_union( 207 | make_partial_pipeline(Identity(), HashingVectorizer()), 208 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 209 | ), 210 | SGDClassifier() 211 | ) 212 | 213 | X = [ 214 | "i really like this post", 215 | "thanks for that comment", 216 | "i enjoy this friendly forum", 217 | "this is a bad post", 218 | "i dislike this article", 219 | "this is not well written" 220 | ] 221 | 222 | y = np.array([1, 1, 1, 0, 0, 0]) 223 | 224 | for loop in range(3): 225 | # It might make sense to loop over the same dataset multiple times 226 | # if the dataset is small. For larger datasets this isn't recommended. 227 | pipe.partial_fit(X, y, classes=[0, 1]) 228 | 229 | assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0])) 230 | ``` 231 | 232 | ### Concatenate Features 233 | 234 | The standard `FeatureUnion` from scikit-learn also does not allow for `.partial_fit`. So we've 235 | added a `PartialFeatureUnion` class and a `make_partial_union` function to this library as well. 236 | 237 | ```python 238 | import numpy as np 239 | from sklearn.linear_model import SGDClassifier 240 | from sklearn.feature_extraction.text import HashingVectorizer 241 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 242 | from tokenwiser.pipeline import make_partial_pipeline, make_partial_union 243 | 244 | pipe = make_partial_pipeline( 245 | Cleaner(), 246 | make_partial_union( 247 | make_partial_pipeline(Identity(), HashingVectorizer()), 248 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 249 | ), 250 | SGDClassifier() 251 | ) 252 | 253 | X = [ 254 | "i really like this post", 255 | "thanks for that comment", 256 | "i enjoy this friendly forum", 257 | "this is a bad post", 258 | "i dislike this article", 259 | "this is not well written" 260 | ] 261 | 262 | y = np.array([1, 1, 1, 0, 0, 0]) 263 | 264 | for loop in range(3): 265 | pipe.partial_fit(X, y, classes=[0, 1]) 266 | 267 | assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0])) 268 | ``` -------------------------------------------------------------------------------- /docs/guide/spacy.md: -------------------------------------------------------------------------------- 1 | This is where we'll elaborate on the `spaCy` tools. 2 | 3 | Under construction. 
-------------------------------------------------------------------------------- /docs/images/how-it-works.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/how-it-works.png -------------------------------------------------------------------------------- /docs/images/huge_sparse_array.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/huge_sparse_array.png -------------------------------------------------------------------------------- /docs/images/make_concat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/make_concat.png -------------------------------------------------------------------------------- /docs/images/minipipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/minipipe.png -------------------------------------------------------------------------------- /docs/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/images/pipeline.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

# tokenwiser
4 | 5 | > Bag of, not words, but tricks! 6 | 7 | ## Goal 8 | 9 | We noticed that a lot of benchmarks relied on heavy-weight tools while they did not 10 | check if something more lightweight would also work. Maybe if we just apply some simple 11 | tricks on our tokens we won't need massive language models. The goal of this package is 12 | to contribute tricks to keep your NLP pipelines simple. These tricks are made available 13 | for spaCy, scikit-learn and vowpal wabbit. 14 | 15 | > If you're looking for a tool that can add pretrained language models to scikit-learn 16 | pipelines as a benchmark you'll want to explore another tool: [whatlies](https://rasahq.github.io/whatlies/tutorial/scikit-learn/). 17 | 18 | ## Features 19 | 20 | ### Scikit-Learn Tools 21 | 22 | The following submodules contain features that might be useful. 23 | 24 | - `.textprep`: Contains string pre-processing tools for scikit-learn. 25 | - `.pipeline`: Contains extra pipeline components for scikit-learn. 26 | - `.wabbit`: Contains a scikit-learn component based on [vowpal wabbit](https://vowpalwabbit.org/). 27 | 28 | ### SpaCy Tools 29 | 30 | - `.component`: Contains spaCy compatible components that might be added as a pipeline step. 31 | - `.extension`: Contains spaCy compatible extensions that might be added manually. 32 | -------------------------------------------------------------------------------- /docs/logo-tokw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/logo-tokw.png -------------------------------------------------------------------------------- /docs/token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/docs/token.png -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "tokenwiser" 2 | extra_css: [style.css] 3 | repo_url: https://github.com/koaning/tokenwiser 4 | site_url: https://koaning.github.io/tokenwiser/ 5 | site_description: Bag of, not words, but tricks! 6 | site_author: Vincent D. Warmerdam 7 | use_directory_urls: false 8 | nav: 9 | - Home: index.md 10 | - Scikit-Learn: guide/sklearn.md 11 | - spaCy: guide/spacy.md 12 | - API: 13 | - textprep: api/textprep.md 14 | - pipeline: api/pipeline.md 15 | - extension: api/extension.md 16 | - component: api/component.md 17 | - wabbit: api/wabbit.md 18 | - FAQ: faq.md 19 | plugins: 20 | - mkdocstrings: 21 | handlers: 22 | python: 23 | setup_commands: 24 | - from tokenwiser.textprep import * 25 | - from tokenwiser.pipeline import * 26 | watch: 27 | - tokenwiser 28 | - search 29 | copyright: Copyright © 2020 Maintained by Vincent. 
30 | theme: 31 | name: material 32 | logo: token.png 33 | font: 34 | text: Ubuntu 35 | code: Ubuntu Mono 36 | feature: 37 | tabs: true 38 | palette: 39 | primary: white 40 | accent: teal 41 | features: 42 | - navigation.tabs 43 | markdown_extensions: 44 | - admonition 45 | - codehilite 46 | - pymdownx.inlinehilite 47 | - pymdownx.details 48 | - pymdownx.tabbed 49 | - pymdownx.superfences 50 | - pymdownx.highlight: 51 | use_pygments: true 52 | guess_lang: true 53 | - toc: 54 | permalink: true 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from tokenwiser import __version__ 2 | from setuptools import setup, find_packages 3 | 4 | base_packages = [ 5 | "jellyfish>=0.8.2", 6 | "Pyphen>=0.10.0", 7 | "scikit-learn>=0.24.0", 8 | "PyYAML>=5.3.1", 9 | "spacy>=3.2.0", 10 | "yake-github>=0.4.0", 11 | "vowpalwabbit>=8.9.0", 12 | "sentencepiece>=0.1.95", 13 | "snowballstemmer>=2.1.0", 14 | "h5py>=2.10.0" 15 | ] 16 | 17 | dev_packages = [ 18 | "flake8>=3.6.0", 19 | "pytest>=4.0.2", 20 | "jupyter>=1.0.0", 21 | "jupyterlab>=0.35.4", 22 | "mktestdocs>=0.1.0", 23 | ] 24 | 25 | docs_packages = [ 26 | "mkdocs>=1.1.2", 27 | "mkdocs-material>=6.2.8", 28 | "mkdocstrings>=0.14.0" 29 | ] 30 | 31 | 32 | setup( 33 | name="tokenwiser", 34 | version=__version__, 35 | packages=find_packages(exclude=["notebooks"]), 36 | install_requires=base_packages, 37 | extras_require={"dev": dev_packages + docs_packages, "docs": docs_packages}, 38 | ) 39 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils import estimator_checks 2 | 3 | transformer_checks = ( 4 | estimator_checks.check_transformer_data_not_an_array, 5 | estimator_checks.check_transformer_general, 6 | estimator_checks.check_transformers_unfitted, 7 | ) 8 | 9 | general_checks = ( 10 | estimator_checks.check_fit2d_predict1d, 11 | estimator_checks.check_methods_subset_invariance, 12 | estimator_checks.check_fit2d_1sample, 13 | estimator_checks.check_fit2d_1feature, 14 | estimator_checks.check_fit1d, 15 | estimator_checks.check_get_params_invariance, 16 | estimator_checks.check_set_params, 17 | estimator_checks.check_dict_unchanged, 18 | estimator_checks.check_dont_overwrite_parameters, 19 | ) 20 | 21 | nonmeta_checks = ( 22 | estimator_checks.check_estimators_dtypes, 23 | estimator_checks.check_fit_score_takes_y, 24 | estimator_checks.check_dtype_object, 25 | estimator_checks.check_sample_weights_pandas_series, 26 | estimator_checks.check_sample_weights_list, 27 | estimator_checks.check_sample_weights_invariance, 28 | estimator_checks.check_estimators_fit_returns_self, 29 | estimator_checks.check_complex_data, 30 | estimator_checks.check_estimators_empty_data_messages, 31 | estimator_checks.check_pipeline_consistency, 32 | estimator_checks.check_estimators_nan_inf, 33 | estimator_checks.check_estimators_overwrite_params, 34 | estimator_checks.check_estimator_sparse_data, 35 | estimator_checks.check_estimators_pickle, 36 | ) 37 | 38 | classifier_checks = ( 39 | 
estimator_checks.check_classifier_data_not_an_array, 40 | estimator_checks.check_classifiers_one_label, 41 | estimator_checks.check_classifiers_classes, 42 | estimator_checks.check_estimators_partial_fit_n_features, 43 | estimator_checks.check_classifiers_train, 44 | estimator_checks.check_supervised_y_2d, 45 | estimator_checks.check_supervised_y_no_nan, 46 | estimator_checks.check_estimators_unfitted, 47 | estimator_checks.check_non_transformer_estimators_n_iter, 48 | estimator_checks.check_decision_proba_consistency, 49 | ) 50 | 51 | regressor_checks = ( 52 | estimator_checks.check_regressors_train, 53 | estimator_checks.check_regressor_data_not_an_array, 54 | estimator_checks.check_estimators_partial_fit_n_features, 55 | estimator_checks.check_regressors_no_decision_function, 56 | estimator_checks.check_supervised_y_2d, 57 | estimator_checks.check_supervised_y_no_nan, 58 | estimator_checks.check_regressors_int, 59 | estimator_checks.check_estimators_unfitted, 60 | ) 61 | 62 | outlier_checks = ( 63 | estimator_checks.check_outliers_fit_predict, 64 | estimator_checks.check_outliers_train, 65 | estimator_checks.check_classifier_data_not_an_array, 66 | estimator_checks.check_estimators_unfitted, 67 | ) 68 | 69 | 70 | def select_tests(include, exclude=[]): 71 | """Return an iterable of include with all tests whose name is not in exclude""" 72 | for test in include: 73 | if test.__name__ not in exclude: 74 | yield test 75 | 76 | 77 | def id_func(param): 78 | """Returns the repr of an object for usage in pytest parametrize""" 79 | return repr(param) 80 | -------------------------------------------------------------------------------- /tests/data/en.vs5000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/data/en.vs5000.model -------------------------------------------------------------------------------- /tests/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/pipeline/__init__.py -------------------------------------------------------------------------------- /tests/pipeline/test_concat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/pipeline/test_concat.py -------------------------------------------------------------------------------- /tests/pipeline/test_slice.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import HashingVectorizer 2 | from sklearn.linear_model import LogisticRegression 3 | 4 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 5 | from tokenwiser.pipeline import PartialPipeline, make_partial_pipeline, make_concat 6 | 7 | 8 | def test_can_slice_pipeline(): 9 | """If we slice a pipeline, we should get a new pipeline object""" 10 | pipe1 = make_partial_pipeline( 11 | Cleaner(), 12 | make_concat( 13 | Identity(), 14 | HyphenTextPrep(), 15 | ), 16 | HashingVectorizer(), 17 | LogisticRegression() 18 | ) 19 | 20 | slice = pipe1[:-1] 21 | assert isinstance(slice, PartialPipeline) 22 | -------------------------------------------------------------------------------- /tests/pipeline/test_union.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_extraction.text import HashingVectorizer 3 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 4 | from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion 5 | 6 | 7 | def test_shape_doubles(): 8 | """If we concatenate using a partial union. It should increase in size.""" 9 | pipe1 = PartialPipeline( 10 | [ 11 | ("clean", Cleaner()), 12 | ( 13 | "union", 14 | PartialFeatureUnion( 15 | [ 16 | ( 17 | "full_text_pipe", 18 | PartialPipeline( 19 | [ 20 | ("identity", Identity()), 21 | ("hash1", HashingVectorizer()), 22 | ] 23 | ), 24 | ) 25 | ] 26 | ), 27 | ), 28 | ] 29 | ) 30 | 31 | pipe2 = PartialPipeline( 32 | [ 33 | ("clean", Cleaner()), 34 | ( 35 | "union", 36 | PartialFeatureUnion( 37 | [ 38 | ( 39 | "full_text_pipe", 40 | PartialPipeline( 41 | [ 42 | ("identity", Identity()), 43 | ("hash1", HashingVectorizer()), 44 | ] 45 | ), 46 | ), 47 | ( 48 | "hyphen_pipe", 49 | PartialPipeline( 50 | [ 51 | ("hyphen", HyphenTextPrep()), 52 | ("hash2", HashingVectorizer()), 53 | ] 54 | ), 55 | ), 56 | ] 57 | ), 58 | ), 59 | ] 60 | ) 61 | 62 | X = [ 63 | "i really like this post", 64 | "thanks for that comment", 65 | "i enjoy this friendly forum", 66 | "this is a bad post", 67 | "i dislike this article", 68 | "this is not well written", 69 | ] 70 | 71 | y = np.array([1, 1, 1, 0, 0, 0]) 72 | 73 | p1 = pipe1.partial_fit(X, y, classes=[0, 1]).transform(X) 74 | p2 = pipe2.partial_fit(X, y, classes=[0, 1]).transform(X) 75 | 76 | assert p1.shape[1] * 2 == p2.shape[1] 77 | -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pathlib 3 | from tokenwiser.common import load_coefficients, save_coefficients 4 | 5 | import numpy as np 6 | from sklearn.linear_model import SGDClassifier, LogisticRegression, PassiveAggressiveClassifier 7 | from sklearn.feature_extraction.text import HashingVectorizer 8 | from sklearn.pipeline import make_pipeline 9 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 10 | from tokenwiser.pipeline import make_partial_pipeline, make_partial_union 11 | 12 | 13 | @pytest.mark.parametrize("clf_train", [LogisticRegression, SGDClassifier, PassiveAggressiveClassifier]) 14 | @pytest.mark.parametrize("clf_target", [LogisticRegression, SGDClassifier, PassiveAggressiveClassifier]) 15 | def test_load_save(clf_train, clf_target, tmpdir): 16 | """ 17 | Ensure that we can save/load vectors. 18 | """ 19 | clf = clf_train() 20 | pipe = make_pipeline( 21 | Cleaner(), 22 | make_partial_union( 23 | make_partial_pipeline(Identity(), HashingVectorizer()), 24 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 25 | ), 26 | clf 27 | ) 28 | 29 | X = [ 30 | "i really like this post", 31 | "thanks for that comment", 32 | "i enjoy this friendly forum", 33 | "this is a bad post", 34 | "i dislike this article", 35 | "this is not well written" 36 | ] 37 | 38 | y = np.array([1, 1, 1, 0, 0, 0]) 39 | 40 | pipe.fit(X, y) 41 | 42 | assert np.all(pipe.predict(X) == y) 43 | 44 | # Here we create in the new pipeline. 
45 | clf_new = clf_target() 46 | pipe_new = make_partial_pipeline( 47 | Cleaner(), 48 | make_partial_union( 49 | make_partial_pipeline(Identity(), HashingVectorizer()), 50 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 51 | ), 52 | clf 53 | ) 54 | path = pathlib.Path(tmpdir, "coefs.h5") 55 | save_coefficients(clf, path) 56 | load_coefficients(clf_new, path) 57 | assert np.all(clf.intercept_ == clf_new.intercept_) 58 | assert np.all(clf.coef_ == clf_new.coef_) 59 | assert np.all(clf.classes_ == clf_new.classes_) 60 | assert np.all(pipe_new.predict(X) == y) 61 | -------------------------------------------------------------------------------- /tests/test_docs.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from tokenwiser.textprep import ( 3 | Cleaner, 4 | Identity, 5 | HyphenTextPrep, 6 | SpacyMorphTextPrep, 7 | SpacyPosTextPrep, 8 | SpacyLemmaTextPrep, 9 | YakeTextPrep, 10 | PhoneticTextPrep, 11 | ) 12 | from tokenwiser.pipeline import ( 13 | TextConcat, 14 | PartialPipeline, 15 | PartialFeatureUnion, 16 | make_partial_pipeline, 17 | make_concat, 18 | make_partial_union, 19 | ) 20 | from tokenwiser.extension import ( 21 | attach_hyphen_extension, 22 | attach_sklearn_extension, 23 | sklearn_method, 24 | ) 25 | from tokenwiser.component import attach_sklearn_categoriser 26 | 27 | import pytest 28 | from mktestdocs import check_docstring, check_md_file 29 | 30 | components = [ 31 | Cleaner, 32 | Identity, 33 | HyphenTextPrep, 34 | SpacyMorphTextPrep, 35 | SpacyPosTextPrep, 36 | SpacyLemmaTextPrep, 37 | PhoneticTextPrep, 38 | YakeTextPrep, 39 | TextConcat, 40 | PartialPipeline, 41 | PartialFeatureUnion, 42 | make_partial_pipeline, 43 | make_concat, 44 | make_partial_union, 45 | attach_hyphen_extension, 46 | attach_sklearn_extension, 47 | sklearn_method, 48 | attach_sklearn_categoriser, 49 | ] 50 | 51 | 52 | @pytest.mark.parametrize("obj", components, ids=lambda d: d.__qualname__) 53 | def test_member(obj): 54 | check_docstring(obj) 55 | 56 | 57 | @pytest.mark.parametrize( 58 | "fpath", [str(p) for p in pathlib.Path("docs").glob("**/*.md")] 59 | ) 60 | def test_fpath(fpath): 61 | check_md_file(fpath) 62 | -------------------------------------------------------------------------------- /tests/test_extension.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from tokenwiser.extension import attach_hyphen_extension 3 | 4 | 5 | def test_hyphen_works(): 6 | nlp = spacy.load("en_core_web_sm") 7 | doc = nlp("this is a dinosaurhead") 8 | tok = doc[-1] 9 | attach_hyphen_extension() 10 | assert tok._.hyphen == ["di", "no", "saur", "head"] 11 | -------------------------------------------------------------------------------- /tests/test_spacy_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/test_spacy_models/__init__.py -------------------------------------------------------------------------------- /tests/test_spacy_models/test_base_usage_architectures.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import spacy 3 | from spacy.training import Example 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "model", ["@sklearn_model_basic_sgd.v1", "@sklearn_model_basic_naive_bayes.v1"] 8 | ) 9 | def test_model_config_inline(model): 10 | nlp = spacy.load("en_core_web_sm") 11 | conf = 
{"sklearn_model": model, "label": "pos", "classes": ["pos", "neg"]} 12 | nlp.add_pipe("sklearn-cat", config=conf) 13 | 14 | texts = ["you are a nice person", "this is a great movie", "i do not like coffee"] 15 | labels = ["pos", "pos", "neg"] 16 | 17 | with nlp.select_pipes(enable="sklearn-cat"): 18 | optimizer = nlp.resume_training() 19 | for itn in range(100): 20 | for t, lab in zip(texts, labels): 21 | doc = nlp.make_doc(t) 22 | example = Example.from_dict(doc, {"cats": {"pos": lab}}) 23 | nlp.update([example], sgd=optimizer) 24 | 25 | assert len(nlp("you are a nice person").cats.keys()) > 0 26 | assert len(nlp("coffee i do not like").cats.keys()) > 0 27 | -------------------------------------------------------------------------------- /tests/test_textprep/test_hyphen.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tokenwiser.textprep import HyphenTextPrep 4 | 5 | 6 | @pytest.mark.parametrize("x_in,x_out", [("haleluja", "hale lu ja"), ("hello", "hello")]) 7 | def test_basic(x_in, x_out): 8 | assert HyphenTextPrep().encode_single(x_in) == x_out 9 | -------------------------------------------------------------------------------- /tests/test_textprep/test_phonetic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tokenwiser.textprep import PhoneticTextPrep 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "x_in,x_out", [("haleluja", "H442"), ("hello there world", "H400 T600 W643")] 8 | ) 9 | def test_soundex(x_in, x_out): 10 | assert PhoneticTextPrep(kind="soundex").encode_single(x_in) == x_out 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "x_in,x_out", [("haleluja", "HLLJ"), ("hello there world", "HL 0R WRLT")] 15 | ) 16 | def test_metaphone(x_in, x_out): 17 | assert PhoneticTextPrep(kind="metaphone").encode_single(x_in) == x_out 18 | 19 | 20 | @pytest.mark.parametrize( 21 | "x_in,x_out", [("haleluja", "HALALAJ"), ("hello there world", "HAL TAR WARLD")] 22 | ) 23 | def test_nysiis(x_in, x_out): 24 | assert PhoneticTextPrep(kind="nysiis").encode_single(x_in) == x_out 25 | -------------------------------------------------------------------------------- /tests/test_textprep/test_sklearn.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.pipeline import Pipeline 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | 5 | from tokenwiser.textprep import ( 6 | Cleaner, 7 | HyphenTextPrep, 8 | SpacyMorphTextPrep, 9 | SpacyPosTextPrep, 10 | SpacyLemmaTextPrep, 11 | YakeTextPrep, 12 | PhoneticTextPrep, 13 | Identity, 14 | SentencePiecePrep, 15 | ) 16 | import spacy 17 | 18 | nlp = spacy.load("en_core_web_sm") 19 | 20 | 21 | prep_list = [ 22 | Cleaner(), 23 | HyphenTextPrep(), 24 | PhoneticTextPrep(kind="soundex"), 25 | PhoneticTextPrep(kind="metaphone"), 26 | PhoneticTextPrep(kind="nysiis"), 27 | YakeTextPrep(), 28 | SpacyLemmaTextPrep(nlp), 29 | SpacyMorphTextPrep(nlp), 30 | SpacyPosTextPrep(nlp), 31 | Identity(), 32 | SentencePiecePrep(model_file="tests/data/en.vs5000.model"), 33 | ] 34 | 35 | 36 | @pytest.mark.parametrize("prep", prep_list, ids=[str(d) for d in prep_list]) 37 | def test_pipeline_single(prep): 38 | X = ["hello world", "this is dog", "it should work"] 39 | pipe = Pipeline([("prep", prep), ("cv", CountVectorizer())]) 40 | assert pipe.fit_transform(X).shape[0] == 3 41 | 42 | 43 | @pytest.mark.parametrize("prep", prep_list, ids=[str(d) for d in prep_list]) 44 | def 
test_pipeline_single_clean_first(prep): 45 | X = ["hello world", "this is dog", "it should work"] 46 | pipe = Pipeline([("clean", Cleaner()), ("prep", prep), ("cv", CountVectorizer())]) 47 | assert pipe.fit_transform(X).shape[0] == 3 48 | -------------------------------------------------------------------------------- /tests/test_tfm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tokenwiser.common import flatten 4 | from tokenwiser.proj import BinaryRandomProjection, PointSplitProjection 5 | 6 | from tests.conftest import ( 7 | nonmeta_checks, 8 | general_checks, 9 | transformer_checks, 10 | select_tests, 11 | ) 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "test_fn", 16 | select_tests( 17 | flatten([nonmeta_checks, transformer_checks, general_checks]), 18 | exclude=[ 19 | "check_transformer_data_not_an_array", 20 | "check_estimators_nan_inf", 21 | "check_fit2d_predict1d", 22 | "check_sample_weights_invariance", 23 | "check_sample_weights_list" 24 | ], 25 | ), 26 | ) 27 | def test_estimator_checks_binary(test_fn): 28 | random_proj = BinaryRandomProjection(random_seed=42) 29 | test_fn(random_proj, random_proj) 30 | 31 | 32 | @pytest.mark.parametrize( 33 | "test_fn", 34 | select_tests( 35 | flatten([nonmeta_checks, transformer_checks, general_checks]), 36 | exclude=[ 37 | "check_transformer_data_not_an_array", 38 | "check_sample_weights_invariance", 39 | "check_estimators_nan_inf", 40 | "check_fit2d_predict1d", 41 | "check_transformer_general", 42 | "check_pipeline_consistency", 43 | "check_sample_weights_list" 44 | ], 45 | ), 46 | ) 47 | def test_estimator_checks_split(test_fn): 48 | random_proj = PointSplitProjection(random_seed=42) 49 | test_fn(random_proj, random_proj) 50 | -------------------------------------------------------------------------------- /tests/test_tok/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/tests/test_tok/__init__.py -------------------------------------------------------------------------------- /tests/test_tok/test_whitespace.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from tokenwiser.tok import WhiteSpaceTokenizer 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "x_in,x_out", 8 | [("haleluja", ["haleluja"]), ("hello there world", ["hello", "there", "world"])], 9 | ) 10 | def test_basic(x_in, x_out): 11 | assert WhiteSpaceTokenizer()(x_in) == x_out 12 | -------------------------------------------------------------------------------- /tests/test_wabbit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tokenwiser.wabbit import VowpalWabbitClassifier 3 | from tokenwiser.pipeline import make_partial_pipeline 4 | from tokenwiser.textprep import Cleaner 5 | 6 | X = [ 7 | "i really like this post", 8 | "thanks for that comment", 9 | "i enjoy this friendly forum", 10 | "this is a bad post", 11 | "i dislike this article", 12 | "this is not well written", 13 | ] 14 | 15 | y = np.array([1, 1, 1, 0, 0, 0]) 16 | 17 | 18 | def test_wabbit_fit_shape_sensible(): 19 | assert VowpalWabbitClassifier().fit(X, y).predict(X).shape[0] == 6 20 | assert VowpalWabbitClassifier().fit(X, y).predict_proba(X).shape == (6, 2) 21 | 22 | 23 | def test_wabbit_pipeline(): 24 | pipe = make_partial_pipeline( 25 | Cleaner(), VowpalWabbitClassifier(n_loop=1, n_gram=1, 
learning_rate=0.1) 26 | ) 27 | for i in range(5): 28 | pipe.partial_fit(X, y, classes=list(set(y))) 29 | -------------------------------------------------------------------------------- /theme/token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/theme/token.png -------------------------------------------------------------------------------- /theme/token.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /token.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/tokenwiser/1b6a2a28f8e520fd9e8c670cb58a5ceb3f6c08ef/token.png -------------------------------------------------------------------------------- /tokenwiser/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.0" 2 | -------------------------------------------------------------------------------- /tokenwiser/__main__.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | from tokenwiser import __version__ 4 | 5 | app = typer.Typer( 6 | add_completion=False, 7 | help="Tokenwiser CLI. Allows you to train embeddings from the commandline.", 8 | ) 9 | 10 | 11 | @app.command("version", help="show the version of tokenwise") 12 | def version(): 13 | typer.echo(f"{__version__}") 14 | 15 | 16 | @app.command() 17 | def init(): 18 | pass 19 | 20 | 21 | if __name__ == "__main__": 22 | app() 23 | -------------------------------------------------------------------------------- /tokenwiser/common.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | 3 | def save_coefficients(classifier, filename): 4 | """Save the coefficients of a linear model into a .h5 file.""" 5 | with h5py.File(filename, 'w') as hf: 6 | hf.create_dataset("coef", data=classifier.coef_) 7 | hf.create_dataset("intercept", data=classifier.intercept_) 8 | hf.create_dataset("classes", data=classifier.classes_) 9 | 10 | def load_coefficients(classifier, filename): 11 | """Attach the saved coefficients to a linear model.""" 12 | with h5py.File(filename, 'r') as hf: 13 | coef = hf['coef'][:] 14 | intercept = hf['intercept'][:] 15 | classes = hf['classes'][:] 16 | classifier.coef_ = coef 17 | classifier.intercept_ = intercept 18 | classifier.classes_ = classes 19 | 20 | def flatten(nested): 21 | """Flatten a nested list.""" 22 | return [item for li in nested for item in li] 23 | -------------------------------------------------------------------------------- /tokenwiser/component/__init__.py: -------------------------------------------------------------------------------- 1 | from ._sklearn import attach_sklearn_categoriser 2 | 3 | __all__ = ["attach_sklearn_categoriser"] 4 | -------------------------------------------------------------------------------- /tokenwiser/component/_sklearn.py: -------------------------------------------------------------------------------- 1 | from spacy.language import Language 2 | 3 | 4 | def attach_sklearn_categoriser(nlp, pipe_name, estimator): 5 | """ 6 | This function will attach a scikit-learn compatible estimator to 7 | the pipeline which will feed predictions to the `.cats` property. 
8 | 9 | This is useful if you're interesting in added a pre-trained sklearn 10 | model to the pipeline. This is **not** useful if you're interested 11 | in training a new model via spaCy, check out the `tokenwiser.model` 12 | submodule for that. 13 | 14 | Usage: 15 | 16 | ```python 17 | import spacy 18 | 19 | from sklearn.pipeline import make_pipeline 20 | from sklearn.feature_extraction.text import CountVectorizer 21 | from sklearn.linear_model import LogisticRegression 22 | 23 | from tokenwiser.component import attach_sklearn_categoriser 24 | 25 | X = [ 26 | "i really like this post", 27 | "thanks for that comment", 28 | "i enjoy this friendly forum", 29 | "this is a bad post", 30 | "i dislike this article", 31 | "this is not well written" 32 | ] 33 | 34 | y = ["pos", "pos", "pos", "neg", "neg", "neg"] 35 | 36 | # Note that we're training a pipeline here via a single-batch `.fit()` method 37 | pipe = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y) 38 | 39 | nlp = spacy.load("en_core_web_sm") 40 | # This is where we attach our pre-trained model as a pipeline step. 41 | attach_sklearn_categoriser(nlp, pipe_name="silly_sentiment", estimator=pipe) 42 | 43 | assert nlp.pipe_names[-1] == "silly_sentiment" 44 | assert nlp("this post i really like").cats["pos"] > 0.5 45 | ``` 46 | """ 47 | 48 | @Language.component(pipe_name) 49 | def my_component(doc): 50 | pred = estimator.predict([doc.text])[0] 51 | proba = estimator.predict_proba([doc.text]).max() 52 | doc.cats[pred] = proba 53 | return doc 54 | 55 | nlp.add_pipe(pipe_name) 56 | -------------------------------------------------------------------------------- /tokenwiser/extension/__init__.py: -------------------------------------------------------------------------------- 1 | from ._extension import ( 2 | attach_hyphen_extension, 3 | attach_sklearn_extension, 4 | sklearn_method, 5 | ) 6 | 7 | __all__ = ["attach_hyphen_extension", "attach_sklearn_extension", "sklearn_method"] 8 | -------------------------------------------------------------------------------- /tokenwiser/extension/_extension.py: -------------------------------------------------------------------------------- 1 | from spacy.tokens import Doc, Token 2 | 3 | from tokenwiser.textprep import HyphenTextPrep 4 | 5 | 6 | def attach_hyphen_extension(): 7 | """ 8 | This function will attach an extension `._.hyphen` to the `Token`s. 9 | 10 | ```python 11 | import spacy 12 | from tokenwiser.extension import attach_hyphen_extension 13 | 14 | nlp = spacy.load("en_core_web_sm") 15 | # Attach the Hyphen extensions. 16 | attach_hyphen_extension() 17 | 18 | # Now you can query hyphens on the tokens. 19 | doc = nlp("this is a dinosaurhead") 20 | tok = doc[-1] 21 | 22 | assert tok._.hyphen == ["di", "no", "saur", "head"] 23 | ``` 24 | """ 25 | Token.set_extension( 26 | "hyphen", 27 | getter=lambda t: HyphenTextPrep().encode_single(t.text).split(" "), 28 | force=True, 29 | ) 30 | 31 | 32 | def attach_sklearn_extension(attribute_name, estimator): 33 | """ 34 | This function will attach an extension `._.attribute_name` to the `Token`s. 
35 | 36 | ```python 37 | import spacy 38 | from spacy.tokens import Doc 39 | 40 | from sklearn.pipeline import make_pipeline 41 | from sklearn.feature_extraction.text import CountVectorizer 42 | from sklearn.linear_model import LogisticRegression 43 | 44 | from tokenwiser.extension import attach_sklearn_extension 45 | 46 | X = [ 47 | "i really like this post", 48 | "thanks for that comment", 49 | "i enjoy this friendly forum", 50 | "this is a bad post", 51 | "i dislike this article", 52 | "this is not well written" 53 | ] 54 | 55 | y = ["pos", "pos", "pos", "neg", "neg", "neg"] 56 | 57 | # First we train a (silly) model. 58 | mod = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y) 59 | 60 | # Demo 61 | nlp = spacy.load("en_core_web_sm") 62 | doc = nlp("thank you, really nice") 63 | attach_sklearn_extension("sillysent", mod) 64 | doc._.sillysent # {"neg: 0.4446964938410244, "pos": 0.5553035061589756} 65 | ``` 66 | """ 67 | Doc.set_extension( 68 | attribute_name, 69 | getter=lambda t: sklearn_method(estimator=estimator), 70 | force=True, 71 | ) 72 | 73 | 74 | def sklearn_method(estimator): 75 | """ 76 | A helper to turn a scikit-learn estimator into a spaCy extension. 77 | 78 | Just in case you *really* wanted to do it manually. 79 | 80 | ```python 81 | import spacy 82 | from spacy.tokens import Doc 83 | 84 | from sklearn.pipeline import make_pipeline 85 | from sklearn.feature_extraction.text import CountVectorizer 86 | from sklearn.linear_model import LogisticRegression 87 | 88 | from tokenwiser.extension import sklearn_method 89 | 90 | X = [ 91 | "i really like this post", 92 | "thanks for that comment", 93 | "i enjoy this friendly forum", 94 | "this is a bad post", 95 | "i dislike this article", 96 | "this is not well written" 97 | ] 98 | 99 | y = ["pos", "pos", "pos", "neg", "neg", "neg"] 100 | 101 | # First we train a (silly) model. 102 | mod = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y) 103 | 104 | # This is where we attach the scikit-learn model to spaCy as a method extension. 105 | Doc.set_extension("sillysent_method", method=sklearn_method(mod)) 106 | # This is where we attach the scikit-learn model to spaCy as a property extension. 
107 | Doc.set_extension("sillysent_prop", getter=sklearn_method(mod)) 108 | 109 | # Demo 110 | nlp = spacy.load("en_core_web_sm") 111 | doc = nlp("thank you, really nice") 112 | 113 | doc._.sillysent_method() # {"neg": 0.4446964938410244, "pos: 0.5553035061589756} 114 | doc._.sillysent_prop # {"neg: 0.4446964938410244, "pos": 0.5553035061589756} 115 | ``` 116 | """ 117 | 118 | def method(doc): 119 | proba = estimator.predict_proba([doc.text])[0] 120 | return {c: p for c, p in zip(estimator.classes_, proba)} 121 | 122 | return method 123 | -------------------------------------------------------------------------------- /tokenwiser/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .sklearnmod import SklearnCat 2 | 3 | __all__ = ["SklearnCat"] 4 | -------------------------------------------------------------------------------- /tokenwiser/model/sklearnmod.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pathlib 3 | from typing import Iterable 4 | 5 | import spacy 6 | from spacy import registry 7 | from spacy.tokens import Doc 8 | from spacy.training import Example 9 | from spacy.language import Language 10 | from sklearn.feature_extraction.text import HashingVectorizer 11 | from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier 12 | from sklearn.naive_bayes import MultinomialNB 13 | from joblib import dump, load 14 | 15 | from tokenwiser.pipeline import PartialPipeline 16 | 17 | 18 | class SklearnCat: 19 | """ 20 | This is a spaCy pipeline component object that can train specific scikit-learn pipelines. 21 | 22 | This allows you to run a simple benchmark via spaCy on simple text-based scikit-learn models. 23 | One should not expect these models to have state of the art accuracy. But they should have 24 | "pretty good" accuracy while being substantially faster to train than most deep-learning 25 | based models. 26 | 27 | The intended use-case for these models is to offer a base benchmark. If these models perform well 28 | one your task, it's an indication that you're in luck and that you've got a simple task that 29 | doesn't require state of the art models. 
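A minimal usage sketch, mirroring `tests/test_spacy_models/test_base_usage_architectures.py` (the spaCy model and the `pos`/`neg` labels are only illustrative choices):

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Any architecture registered below works here, e.g. "@sklearn_model_basic_naive_bayes.v1".
config = {
    "sklearn_model": "@sklearn_model_basic_sgd.v1",
    "label": "pos",
    "classes": ["pos", "neg"],
}
nlp.add_pipe("sklearn-cat", config=config)
```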
30 | """ 31 | 32 | def __init__(self, nlp, name, sklearn_model, label, classes): 33 | self.nlp = nlp 34 | self.name = name 35 | self.label = label 36 | self.classes = classes 37 | self.sklearn_model = spacy.registry.architectures.get( 38 | sklearn_model.replace("@", "") 39 | )() 40 | 41 | def __call__(self, doc: Doc): 42 | scores = self.predict([doc]) 43 | self.set_annotations([doc], scores) 44 | return doc 45 | 46 | def update( 47 | self, examples: Iterable[Example], *, drop: float = 0.0, sgd=None, losses=None 48 | ): 49 | texts = [ 50 | ex.reference.text 51 | for ex in examples 52 | if self.label in ex.reference.cats.keys() 53 | ] 54 | labels = [ 55 | ex.reference.cats[self.label] 56 | for ex in examples 57 | if self.label in ex.reference.cats.keys() 58 | ] 59 | self.sklearn_model.partial_fit(texts, labels, classes=self.classes) 60 | 61 | def predict(self, docs: Iterable[Doc]): 62 | return self.sklearn_model.predict_proba([d.text for d in docs]).max(axis=1) 63 | 64 | def set_annotations(self, docs: Iterable[Doc], scores): 65 | preds = self.sklearn_model.predict([d.text for d in docs]) 66 | for doc, pred, proba in zip(docs, preds, scores): 67 | doc.cats[pred] = proba 68 | return docs 69 | 70 | def score(self): 71 | return random.random() 72 | 73 | def to_disk(self, path, exclude=None): 74 | pathlib.Path(path).mkdir(parents=True, exist_ok=True) 75 | dump(self.sklearn_model, str(pathlib.Path(path) / "filename.joblib")) 76 | 77 | def from_disk(self, path, exclude=None): 78 | self.sklearn_model = load(str(pathlib.Path(path) / "filename.joblib")) 79 | return self 80 | 81 | 82 | @Language.factory("sklearn-cat") 83 | def make_sklearn_cat(nlp, name, sklearn_model, label, classes): 84 | return SklearnCat(nlp, name, sklearn_model, label, classes) 85 | 86 | 87 | @registry.architectures("sklearn_model_basic_sgd.v1") 88 | def make_sklearn_cat_basic_sgd(): 89 | return PartialPipeline( 90 | [("hash", HashingVectorizer()), ("lr", SGDClassifier(loss="log"))] 91 | ) 92 | 93 | 94 | @registry.architectures("sklearn_model_basic_pa.v1") 95 | def make_sklearn_cat_basic_pa(): 96 | return PartialPipeline( 97 | [("hash", HashingVectorizer()), ("lr", PassiveAggressiveClassifier())] 98 | ) 99 | 100 | 101 | @registry.architectures("sklearn_model_basic_naive_bayes.v1") 102 | def make_sklearn_cat_basic_naive_bayes(): 103 | return PartialPipeline( 104 | [("hash", HashingVectorizer(binary=True)), ("nb", MultinomialNB())] 105 | ) 106 | -------------------------------------------------------------------------------- /tokenwiser/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from ._concat import TextConcat, make_concat 2 | from ._pipe import PartialPipeline, make_partial_pipeline 3 | from ._union import PartialFeatureUnion, make_partial_union 4 | 5 | __all__ = [ 6 | "TextConcat", 7 | "make_concat", 8 | "PartialPipeline", 9 | "make_partial_pipeline", 10 | "PartialFeatureUnion", 11 | "make_partial_union", 12 | ] 13 | -------------------------------------------------------------------------------- /tokenwiser/pipeline/_concat.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import _name_estimators 2 | from sklearn.base import BaseEstimator 3 | 4 | 5 | class TextConcat(BaseEstimator): 6 | """ 7 | A component like `FeatureUnion` but this also concatenates the text. 
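Where a plain `FeatureUnion` stacks numeric features side by side, `TextConcat` joins the transformed strings with a single space, so the combined text can still be fed to one text vectorizer (such as `CountVectorizer` or `HashingVectorizer`) downstream.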
8 | 9 | Arguments: 10 | transformer_list: list of (name, text-transformer)-tuples 11 | 12 | Example: 13 | 14 | ```python 15 | from tokenwiser.textprep import HyphenTextPrep, Cleaner 16 | from tokenwiser.pipeline import TextConcat 17 | 18 | tc = TextConcat([("hyp", HyphenTextPrep()), ("clean", Cleaner())]) 19 | results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"]) 20 | expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence'] 21 | 22 | assert results == expected 23 | ``` 24 | """ 25 | 26 | def __init__(self, transformer_list): 27 | self.transformer_list = transformer_list 28 | 29 | def fit(self, X, y=None): 30 | """ 31 | Fits the components in a single batch. 32 | """ 33 | names = [n for n, t in self.transformer_list] 34 | if len(names) != len(set(names)): 35 | raise ValueError("Make sure that the names of each step are unique.") 36 | return self 37 | 38 | def partial_fit(self, X, y=None): 39 | """ 40 | Fits the components, but allow for batches. 41 | """ 42 | names = [n for n, t in self.transformer_list] 43 | if len(names) != len(set(names)): 44 | raise ValueError("Make sure that the names of each step are unique.") 45 | return self 46 | 47 | def transform(self, X, y=None): 48 | """ 49 | Transformers the text. 50 | """ 51 | names = [n for n, t in self.transformer_list] 52 | if len(names) != len(set(names)): 53 | raise ValueError("Make sure that the names of each step are unique.") 54 | results = {} 55 | for name, tfm in self.transformer_list: 56 | results[name] = tfm.transform(X) 57 | return [" ".join([results[n][i] for n in names]) for i in range(len(X))] 58 | 59 | def fit_transform(self, X, y=None): 60 | """ 61 | Fits the components and transforms the text in one step. 62 | """ 63 | return self.fit(X, y).transform(X, y) 64 | 65 | 66 | def make_concat(*steps): 67 | """ 68 | Utility function to generate a `TextConcat` 69 | 70 | Arguments: 71 | steps: a collection of text-transformers 72 | 73 | ```python 74 | from tokenwiser.textprep import HyphenTextPrep, Cleaner 75 | from tokenwiser.pipeline import make_concat 76 | 77 | tc = make_concat(HyphenTextPrep(), Cleaner()) 78 | results = tc.fit_transform(["dinosaurhead", "another$$ sentence$$"]) 79 | expected = ['di no saur head dinosaurhead', 'an other $$ sen tence$$ another sentence'] 80 | 81 | assert results == expected 82 | ``` 83 | """ 84 | return TextConcat(_name_estimators(steps)) 85 | -------------------------------------------------------------------------------- /tokenwiser/pipeline/_pipe.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import Pipeline, _name_estimators 2 | 3 | 4 | class PartialPipeline(Pipeline): 5 | """ 6 | Utility function to generate a `PartialPipeline` 7 | 8 | Arguments: 9 | steps: a collection of text-transformers 10 | 11 | ```python 12 | from tokenwiser.pipeline import PartialPipeline 13 | from tokenwiser.textprep import HyphenTextPrep, Cleaner 14 | 15 | tc = PartialPipeline([('clean', Cleaner()), ('hyp', HyphenTextPrep())]) 16 | data = ["dinosaurhead", "another$$ sentence$$"] 17 | results = tc.partial_fit(data).transform(data) 18 | expected = ['di no saur head', 'an other sen tence'] 19 | 20 | assert results == expected 21 | ``` 22 | """ 23 | def partial_fit(self, X, y=None, classes=None, **kwargs): 24 | """ 25 | Fits the components, but allow for batches. 
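Every step must implement `.partial_fit`; the `classes` keyword is only forwarded to steps that also expose `.predict` (typically the final estimator).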
26 | """ 27 | for name, step in self.steps: 28 | if not hasattr(step, "partial_fit"): 29 | raise ValueError( 30 | f"Step {name} is a {step} which does not have `.partial_fit` implemented." 31 | ) 32 | for name, step in self.steps: 33 | if hasattr(step, "predict"): 34 | step.partial_fit(X, y, classes=classes, **kwargs) 35 | else: 36 | step.partial_fit(X, y) 37 | if hasattr(step, "transform"): 38 | X = step.transform(X) 39 | return self 40 | 41 | 42 | def make_partial_pipeline(*steps): 43 | """ 44 | Utility function to generate a `PartialPipeline` 45 | 46 | Arguments: 47 | steps: a collection of text-transformers 48 | 49 | ```python 50 | from tokenwiser.pipeline import make_partial_pipeline 51 | from tokenwiser.textprep import HyphenTextPrep, Cleaner 52 | 53 | tc = make_partial_pipeline(Cleaner(), HyphenTextPrep()) 54 | data = ["dinosaurhead", "another$$ sentence$$"] 55 | results = tc.partial_fit(data).transform(data) 56 | expected = ['di no saur head', 'an other sen tence'] 57 | 58 | assert results == expected 59 | ``` 60 | """ 61 | return PartialPipeline(_name_estimators(steps)) 62 | -------------------------------------------------------------------------------- /tokenwiser/pipeline/_union.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import FeatureUnion, _name_estimators 2 | 3 | 4 | class PartialFeatureUnion(FeatureUnion): 5 | """ 6 | A `PartialFeatureUnion` is a `FeatureUnion` but able to `.partial_fit`. 7 | 8 | Arguments: 9 | transformer_list: a list of transformers to apply and concatenate 10 | 11 | Example: 12 | 13 | ```python 14 | import numpy as np 15 | from sklearn.linear_model import SGDClassifier 16 | from sklearn.feature_extraction.text import HashingVectorizer 17 | 18 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 19 | from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion 20 | 21 | pipe = PartialPipeline([ 22 | ("clean", Cleaner()), 23 | ("union", PartialFeatureUnion([ 24 | ("full_text_pipe", PartialPipeline([ 25 | ("identity", Identity()), 26 | ("hash1", HashingVectorizer()), 27 | ])), 28 | ("hyphen_pipe", PartialPipeline([ 29 | ("hyphen", HyphenTextPrep()), 30 | ("hash2", HashingVectorizer()), 31 | ])) 32 | ])), 33 | ("clf", SGDClassifier()) 34 | ]) 35 | 36 | X = [ 37 | "i really like this post", 38 | "thanks for that comment", 39 | "i enjoy this friendly forum", 40 | "this is a bad post", 41 | "i dislike this article", 42 | "this is not well written" 43 | ] 44 | 45 | y = np.array([1, 1, 1, 0, 0, 0]) 46 | 47 | for loop in range(3): 48 | pipe.partial_fit(X, y, classes=[0, 1]) 49 | 50 | assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0])) 51 | ``` 52 | """ 53 | 54 | def partial_fit(self, X, y=None, classes=None, **kwargs): 55 | """ 56 | Fits the components, but allow for batches. 57 | """ 58 | for name, step in self.transformer_list: 59 | if not hasattr(step, "partial_fit"): 60 | raise ValueError( 61 | f"Step {name} is a {step} which does not have `.partial_fit` implemented." 
62 | ) 63 | for name, step in self.transformer_list: 64 | if hasattr(step, "predict"): 65 | step.partial_fit(X, y, classes=classes, **kwargs) 66 | else: 67 | step.partial_fit(X, y) 68 | return self 69 | 70 | 71 | def make_partial_union(*transformer_list): 72 | """ 73 | Utility function to generate a `PartialFeatureUnion` 74 | 75 | Arguments: 76 | transformer_list: a list of transformers to apply and concatenate 77 | 78 | Example: 79 | 80 | ```python 81 | import numpy as np 82 | from sklearn.linear_model import SGDClassifier 83 | from sklearn.feature_extraction.text import HashingVectorizer 84 | 85 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 86 | from tokenwiser.pipeline import make_partial_pipeline, make_partial_union 87 | 88 | pipe = make_partial_pipeline( 89 | Cleaner(), 90 | make_partial_union( 91 | make_partial_pipeline(Identity(), HashingVectorizer()), 92 | make_partial_pipeline(HyphenTextPrep(), HashingVectorizer()) 93 | ), 94 | SGDClassifier() 95 | ) 96 | 97 | X = [ 98 | "i really like this post", 99 | "thanks for that comment", 100 | "i enjoy this friendly forum", 101 | "this is a bad post", 102 | "i dislike this article", 103 | "this is not well written" 104 | ] 105 | 106 | y = np.array([1, 1, 1, 0, 0, 0]) 107 | 108 | for loop in range(3): 109 | pipe.partial_fit(X, y, classes=[0, 1]) 110 | 111 | assert np.all(pipe.predict(X) == np.array([1, 1, 1, 0, 0, 0])) 112 | ``` 113 | """ 114 | return PartialFeatureUnion(_name_estimators(transformer_list)) 115 | -------------------------------------------------------------------------------- /tokenwiser/proj/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import TransformerMixin, BaseEstimator 3 | from sklearn.utils import check_array 4 | from sklearn.utils.validation import check_is_fitted 5 | 6 | 7 | class BinaryRandomProjection(BaseEstimator, TransformerMixin): 8 | def __init__(self, n_components=100, random_seed=42, threshold=0.0): 9 | self.n_components = n_components 10 | self.random_seed = random_seed 11 | self.threshold = threshold 12 | 13 | def fit(self, X, y=None): 14 | X = check_array(X) 15 | np.random.seed(self.random_seed) 16 | self.proj_ = np.random.normal(0, 1, (X.shape[1], self.n_components)) 17 | return self 18 | 19 | def transform(self, X, y=None): 20 | check_is_fitted(self, ["proj_"]) 21 | return (X @ self.proj_ > self.threshold).astype(np.int8) 22 | 23 | 24 | def proj_away(x, y): 25 | """project y away from x""" 26 | return x.dot(x) / y.dot(y) * x 27 | 28 | 29 | def select_random_rows(X): 30 | i1, i2 = np.random.randint(0, X.shape[0], 2) 31 | return X[i1, :], X[i2, :] 32 | 33 | 34 | class PointSplitProjection(BaseEstimator, TransformerMixin): 35 | def __init__(self, n_components=100, random_seed=42): 36 | self.n_components = n_components 37 | self.random_seed = random_seed 38 | 39 | def fit(self, X, y=None): 40 | X = check_array(X) 41 | self.X_ = X 42 | self.indices_ = [ 43 | tuple(np.random.randint(0, X.shape[0], 2)) for t in range(self.n_components) 44 | ] 45 | return self 46 | 47 | def generate_feature_(self, new_X, i): 48 | i1, i2 = self.indices_[i] 49 | v1, v2 = self.X_[i1, :], self.X_[i2, :] 50 | m = np.array([v1, v2]).mean(axis=0) 51 | return new_X @ (proj_away(v2 - v1, m)) > m.dot(proj_away(v2 - v1, m)) 52 | 53 | def transform(self, X, y=None): 54 | check_is_fitted(self, ["X_", "indices_"]) 55 | if X.shape[1] != self.X_.shape[1]: 56 | raise ValueError( 57 | f"shapes train/transform do not match. 
{X.shape[1]} vs {self.X_.shape[1]}" 58 | ) 59 | result = np.zeros((X.shape[0], self.n_components)) 60 | for col in range(self.n_components): 61 | result[:, col] = self.generate_feature_(X, col) 62 | return result 63 | -------------------------------------------------------------------------------- /tokenwiser/textprep/__init__.py: -------------------------------------------------------------------------------- 1 | from ._hyphen import HyphenTextPrep 2 | from ._phonetic import PhoneticTextPrep 3 | from ._cleaner import Cleaner 4 | from ._morph import SpacyMorphTextPrep, SpacyLemmaTextPrep, SpacyPosTextPrep 5 | from ._yake import YakeTextPrep 6 | from ._sentpiece import SentencePiecePrep 7 | from ._identity import Identity 8 | from ._snowball import SnowballTextPrep 9 | 10 | __all__ = [ 11 | "HyphenTextPrep", 12 | "PhoneticTextPrep", 13 | "Cleaner", 14 | "SpacyMorphTextPrep", 15 | "SpacyLemmaTextPrep", 16 | "SpacyPosTextPrep", 17 | "YakeTextPrep", 18 | "Identity", 19 | "SentencePiecePrep", 20 | "SnowballTextPrep", 21 | ] 22 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_cleaner.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | 3 | from ._prep import TextPrep 4 | 5 | 6 | class Cleaner(TextPrep, BaseEstimator): 7 | """ 8 | Applies a lowercase and removes non-alphanum. 9 | 10 | Usage: 11 | 12 | ```python 13 | from tokenwiser.textprep import Cleaner 14 | 15 | single = Cleaner().encode_single("$$$5 dollars") 16 | assert single == "5 dollars" 17 | multi = Cleaner().transform(["$$$5 dollars", "#hashtag!"]) 18 | assert multi == ["5 dollars", "hashtag"] 19 | ``` 20 | """ 21 | 22 | def __init__(self): 23 | pass 24 | 25 | def encode_single(self, x: str): 26 | return "".join([c.lower() for c in x if c.isalnum() or c == " "]) 27 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_hyphen.py: -------------------------------------------------------------------------------- 1 | import pyphen 2 | from sklearn.base import BaseEstimator 3 | 4 | from ._prep import TextPrep 5 | 6 | 7 | class HyphenTextPrep(TextPrep, BaseEstimator): 8 | """ 9 | Hyphenate the text going in. 10 | 11 | Usage: 12 | 13 | ```python 14 | from tokenwiser.textprep import HyphenTextPrep 15 | 16 | multi = HyphenTextPrep().transform(["geology", "astrology"]) 17 | assert multi == ['geo logy', 'as tro logy'] 18 | ``` 19 | """ 20 | 21 | def __init__(self, lang="en_GB"): 22 | self.lang = lang 23 | self.dic = pyphen.Pyphen(lang=lang) 24 | 25 | def encode_single(self, x): 26 | return " ".join(self.dic.inserted(x).split("-", -1)) 27 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_identity.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from ._prep import TextPrep 3 | 4 | 5 | class Identity(TextPrep, BaseEstimator): 6 | """ 7 | Keeps the text as is. Can be used as a placeholder in a pipeline. 8 | 9 | Usage: 10 | 11 | ```python 12 | from tokenwiser.textprep import Identity 13 | 14 | text = ["hello", "world"] 15 | example = Identity().transform(text) 16 | 17 | assert example == ["hello", "world"] 18 | ``` 19 | 20 | The main use-case is as a placeholder. 
21 | 22 | ``` 23 | from tokenwiser.pipeline import make_concat 24 | from sklearn.pipeline import make_pipeline, make_union 25 | 26 | from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep 27 | 28 | pipe = make_pipeline( 29 | Cleaner(), 30 | make_concat(Identity(), HyphenTextPrep()), 31 | ) 32 | ``` 33 | """ 34 | 35 | def __init__(self): 36 | pass 37 | 38 | def encode_single(self, x): 39 | return x 40 | 41 | def transform(self, X, y=None): 42 | return X 43 | 44 | def fit(self, X, y=None): 45 | return self 46 | 47 | def partial_fit(self, X, y=None): 48 | return self 49 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_morph.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | 3 | from ._prep import TextPrep 4 | 5 | 6 | class SpacyMorphTextPrep(TextPrep, BaseEstimator): 7 | """ 8 | Adds morphologic information to tokens in text. 9 | 10 | Usage: 11 | 12 | ```python 13 | import spacy 14 | from tokenwiser.textprep import SpacyMorphTextPrep 15 | 16 | nlp = spacy.load("en_core_web_sm") 17 | example1 = SpacyMorphTextPrep(nlp).encode_single("quick! duck!") 18 | example2 = SpacyMorphTextPrep(nlp).encode_single("hey look a duck") 19 | 20 | assert example1 == "quick|Degree=Pos !|PunctType=Peri duck|Number=Sing !|PunctType=Peri" 21 | assert example2 == "hey| look|VerbForm=Inf a|Definite=Ind|PronType=Art duck|Number=Sing" 22 | ``` 23 | """ 24 | 25 | def __init__(self, model, lemma: bool = False): 26 | self.model = model 27 | self.lemma = lemma 28 | 29 | def encode_single(self, text): 30 | return " ".join( 31 | [ 32 | f"{t.text if not self.lemma else t.lemma_}|{t.morph}" 33 | for t in self.model(text) 34 | ] 35 | ) 36 | 37 | 38 | class SpacyPosTextPrep(TextPrep, BaseEstimator): 39 | """ 40 | Adds part of speech information per token using spaCy. 41 | 42 | Arguments: 43 | model: the spaCy model to use 44 | lemma: also lemmatize the text 45 | fine_grained: use fine grained parts of speech 46 | 47 | Usage: 48 | 49 | ```python 50 | import spacy 51 | from tokenwiser.textprep import SpacyPosTextPrep 52 | 53 | nlp = spacy.load("en_core_web_sm") 54 | example1 = SpacyPosTextPrep(nlp).encode_single("we need to duck") 55 | example2 = SpacyPosTextPrep(nlp).encode_single("hey look a duck") 56 | 57 | assert example1 == "we|PRON need|VERB to|PART duck|VERB" 58 | assert example2 == "hey|INTJ look|VERB a|DET duck|NOUN" 59 | ``` 60 | """ 61 | 62 | def __init__(self, model, lemma: bool = False, fine_grained: bool = False): 63 | self.model = model 64 | self.lemma = lemma 65 | self.fine_grained = fine_grained 66 | 67 | def encode_single(self, text): 68 | return " ".join( 69 | [ 70 | f"{t.text if not self.lemma else t.lemma_}|{t.tag_ if self.fine_grained else t.pos_}" 71 | for t in self.model(text) 72 | ] 73 | ) 74 | 75 | 76 | class SpacyLemmaTextPrep(TextPrep, BaseEstimator): 77 | """ 78 | Turns each token into a lemmatizer version using spaCy. 
79 | 80 | Usage: 81 | 82 | ```python 83 | import spacy 84 | from tokenwiser.textprep import SpacyLemmaTextPrep 85 | 86 | nlp = spacy.load("en_core_web_sm") 87 | example1 = SpacyLemmaTextPrep(nlp).encode_single("we are running") 88 | example2 = SpacyLemmaTextPrep(nlp).encode_single("these are dogs") 89 | 90 | assert example1 == 'we be run' 91 | assert example2 == 'these be dog' 92 | ``` 93 | """ 94 | 95 | def __init__(self, model, stop=False): 96 | self.stop = stop 97 | self.model = model 98 | 99 | def encode_single(self, text): 100 | if self.stop: 101 | return " ".join([t.lemma_ for t in self.model(text) if not t.is_stop]) 102 | return " ".join([t.lemma_ for t in self.model(text)]) 103 | 104 | def transform(self, X, y=None): 105 | return [" ".join([t.lemma_ for t in d]) for d in self.model.pipe(X)] 106 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_phonetic.py: -------------------------------------------------------------------------------- 1 | import jellyfish 2 | from sklearn.base import BaseEstimator 3 | 4 | from ._prep import TextPrep 5 | 6 | 7 | class PhoneticTextPrep(TextPrep, BaseEstimator): 8 | """ 9 | The ProneticPrep object prepares strings by encoding them phonetically. 10 | 11 | Arguments: 12 | kind: type of encoding, either `"soundex"`, "`metaphone`" or `"nysiis"` 13 | 14 | Usage: 15 | 16 | ```python 17 | import spacy 18 | from tokenwiser.textprep import PhoneticTextPrep 19 | 20 | nlp = spacy.load("en_core_web_sm") 21 | example1 = PhoneticTextPrep(kind="soundex").transform(["dinosaurus book"]) 22 | example2 = PhoneticTextPrep(kind="metaphone").transform(["dinosaurus book"]) 23 | example3 = PhoneticTextPrep(kind="nysiis").transform(["dinosaurus book"]) 24 | 25 | assert example1[0] == 'D526 B200' 26 | assert example2[0] == 'TNSRS BK' 27 | assert example3[0] == 'DANASAR BAC' 28 | ``` 29 | """ 30 | 31 | def __init__(self, kind="soundex"): 32 | methods = { 33 | "soundex": jellyfish.soundex, 34 | "metaphone": jellyfish.metaphone, 35 | "nysiis": jellyfish.nysiis, 36 | } 37 | self.kind = kind 38 | self.method = methods[kind] 39 | 40 | def encode_single(self, x): 41 | return " ".join([self.method(d) for d in x.split(" ")]) 42 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_prep.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class TextPrep(ABC): 5 | def fit(self, X, y=None): 6 | """Fits the `TextPrep` step. Considered a no-op.""" 7 | return self 8 | 9 | def partial_fit(self, X, y=None): 10 | """Partially fits the `TextPrep` step. 
Considered a no-op.""" 11 | return self 12 | 13 | @abstractmethod 14 | def encode_single(self, x): 15 | pass 16 | 17 | def pipe(self, X): 18 | for x in X: 19 | yield self.encode_single(x) 20 | 21 | def transform(self, X, y=None): 22 | return [self.encode_single(x) for x in X] 23 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_sentpiece.py: -------------------------------------------------------------------------------- 1 | from urllib.error import HTTPError 2 | import urllib.request 3 | from typing import Union 4 | from pathlib import Path 5 | 6 | import sentencepiece as spm 7 | from sklearn.base import BaseEstimator 8 | 9 | from ._prep import TextPrep 10 | 11 | 12 | class SentencePiecePrep(TextPrep, BaseEstimator): 13 | """ 14 | The SentencePiecePrep object splits text into subtokens based on a pre-trained model. 15 | 16 | You can find many pre-trained subtokenizers via the [bpemb](https://nlp.h-its.org/bpemb/) project. 17 | For example, on the [English](https://nlp.h-its.org/bpemb/en/) sub-site you can find many 18 | models for different vocabulary sizes. Note that this site supports 275 pre-trained 19 | subword tokenizers. 20 | 21 | Note that you can train your own sentencepiece tokenizer as well. 22 | 23 | ```python 24 | import sentencepiece as spm 25 | 26 | # This saves a file named `mod.model` which can be read in later. 27 | spm.SentencePieceTrainer.train('--input=tests/data/nlp.txt --model_prefix=mod --vocab_size=2000') 28 | ``` 29 | 30 | Arguments: 31 | model_file: pre-trained model file 32 | 33 | Usage: 34 | 35 | ```python 36 | from tokenwiser.textprep import SentencePiecePrep 37 | sp_tfm = SentencePiecePrep(model_file="tests/data/en.vs5000.model") 38 | 39 | texts = ["talking about geology"] 40 | example = sp_tfm.transform(texts) 41 | assert example == ['▁talk ing ▁about ▁ge ology'] 42 | ``` 43 | """ 44 | 45 | def __init__(self, model_file: Union[str, Path]): 46 | self.model_file = model_file 47 | self.spm = spm.SentencePieceProcessor(model_file=str(model_file)) 48 | 49 | def encode_single(self, x): 50 | return " ".join(self.spm.encode_as_pieces(x)) 51 | 52 | @classmethod 53 | def download(self, lang: str, vocab_size: int, filename: str = None): 54 | """ 55 | Download a pre-trained model from the bpemb project. 56 | 57 | You can see some examples of pre-trained models on the [English](https://nlp.h-its.org/bpemb/en/) sub-site. 58 | There are many languages available, but you should take care that you pick the right 59 | vocabulary size. 60 | 61 | Arguments: 62 | lang: language code 63 | vocab_size: vocab size, can be 1000, 3000, 5000, 10000, 25000, 50000, 100000, 200000 64 | """ 65 | url = f"https://bpemb.h-its.org/{lang}/{lang}.wiki.bpe.vs{vocab_size}.model" 66 | if not filename: 67 | filename = f"{lang}.wiki.bpe.vs{vocab_size}.model" 68 | try: 69 | urllib.request.urlretrieve(url=url, filename=filename) 70 | except HTTPError: 71 | raise ValueError(f"Double check if the language ({lang}) and voacb size ({vocab_size}) combo exist.") 72 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_snowball.py: -------------------------------------------------------------------------------- 1 | import snowballstemmer 2 | from sklearn.base import BaseEstimator 3 | 4 | from ._prep import TextPrep 5 | 6 | 7 | 8 | 9 | class SnowballTextPrep(TextPrep, BaseEstimator): 10 | """ 11 | Applies the snowball stemmer to the text. 
12 | 13 | There are 26 languages supported, for the full list check the list on the 14 | lefthand side on [pypi](https://pypi.org/project/snowballstemmer/). 15 | 16 | Usage: 17 | 18 | ```python 19 | from tokenwiser.textprep import SnowballTextPrep 20 | 21 | single = SnowballTextPrep(language='english').encode_single("Dogs like running") 22 | assert single == "Dog like run" 23 | multi = Cleaner().transform(["Dogs like running", "Cats like sleeping"]) 24 | assert multi == ["Dog like run", "Cat like sleep"] 25 | ``` 26 | """ 27 | 28 | def __init__(self, language='english'): 29 | self.stemmer = snowballstemmer.stemmer(language) 30 | 31 | def encode_single(self, x: str): 32 | return " ".join(self.stemmer.stemWords(x)) 33 | -------------------------------------------------------------------------------- /tokenwiser/textprep/_yake.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | import yake 3 | 4 | from ._prep import TextPrep 5 | 6 | 7 | class YakeTextPrep(TextPrep, BaseEstimator): 8 | """ 9 | Remove all text except meaningful key-phrases. Uses [yake](https://github.com/LIAAD/yake). 10 | 11 | Arguments: 12 | top_n: number of key-phrases to select 13 | unique: only return unique keywords from the key-phrases 14 | 15 | Usage: 16 | 17 | ```python 18 | from tokenwiser.textprep import YakeTextPrep 19 | 20 | text = ["Sources tell us that Google is acquiring Kaggle, a platform that hosts data science and machine learning"] 21 | example = YakeTextPrep(top_n=3, unique=False).transform(text) 22 | 23 | assert example[0] == 'hosts data science acquiring kaggle google is acquiring' 24 | ``` 25 | """ 26 | 27 | def __init__(self, top_n: int = 5, unique: bool = False): 28 | self.top_n = top_n 29 | self.unique = unique 30 | self.extractor = yake.KeywordExtractor(top=self.top_n) 31 | 32 | def encode_single(self, text): 33 | texts = " ".join([t[0] for t in self.extractor.extract_keywords(text)]) 34 | if not self.unique: 35 | return texts 36 | return " ".join(set(texts.split(" "))) 37 | -------------------------------------------------------------------------------- /tokenwiser/tok/__init__.py: -------------------------------------------------------------------------------- 1 | from ._whitespace import WhiteSpaceTokenizer 2 | from ._spacy import SpacyTokenizer 3 | 4 | __all__ = ["WhiteSpaceTokenizer", "SpacyTokenizer"] 5 | -------------------------------------------------------------------------------- /tokenwiser/tok/_spacy.py: -------------------------------------------------------------------------------- 1 | from tokenwiser.tok._tok import Tok 2 | 3 | from sklearn.base import BaseEstimator 4 | 5 | 6 | class SpacyTokenizer(Tok, BaseEstimator): 7 | """ 8 | A tokenizer that uses spaCy under the hood for the tokenization. 9 | 10 | Arguments: 11 | model: reference to the spaCy model 12 | lemma: weather or not to also apply lemmatization 13 | stop: weather or not to remove stopwords 14 | 15 | Usage: 16 | 17 | ```python 18 | import spacy 19 | from tokenwiser.tok import SpacyTokenizer 20 | 21 | # This can also be a Non-English model. 
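# e.g. a German pipeline such as "de_core_news_sm" works just as well.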
22 | nlp = spacy.load("en_core_web_sm") 23 | tok = SpacyTokenizer(model=nlp) 24 | 25 | single = tok("hello world") 26 | assert single == ["hello", "world"] 27 | ``` 28 | """ 29 | 30 | def __init__(self, model, lemma=False, stop=False): 31 | self.model = model 32 | self.lemma = lemma 33 | self.stop = stop 34 | 35 | def __call__(self, text): 36 | if self.stop: 37 | return [ 38 | t.lemma_ if self.lemma else t.text 39 | for t in self.model(text) 40 | if not t.is_stop 41 | ] 42 | return [t.lemma_ if self.lemma else t.text for t in self.model(text)] 43 | -------------------------------------------------------------------------------- /tokenwiser/tok/_tok.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Tok(ABC): 5 | @abstractmethod 6 | def __call__(self, x): 7 | pass 8 | -------------------------------------------------------------------------------- /tokenwiser/tok/_whitespace.py: -------------------------------------------------------------------------------- 1 | from tokenwiser.tok._tok import Tok 2 | 3 | from sklearn.base import BaseEstimator 4 | 5 | 6 | class WhiteSpaceTokenizer(Tok, BaseEstimator): 7 | """ 8 | A simple tokenizer that simple splits on whitespace. 9 | 10 | Usage: 11 | 12 | ```python 13 | from tokenwiser.tok import WhiteSpaceTokenizer 14 | 15 | tok = WhiteSpaceTokenizer() 16 | single = tok("hello world") 17 | assert single == ["hello", "world"] 18 | ``` 19 | """ 20 | 21 | def __init__(self): 22 | pass 23 | 24 | def __call__(self, text): 25 | return [r for r in text.split(" ") if r != ""] 26 | -------------------------------------------------------------------------------- /tokenwiser/wabbit/__init__.py: -------------------------------------------------------------------------------- 1 | from ._vowpal import VowpalWabbitClassifier 2 | 3 | __all__ = ["VowpalWabbitClassifier"] 4 | -------------------------------------------------------------------------------- /tokenwiser/wabbit/_vowpal.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from vowpalwabbit import pyvw 3 | from sklearn.utils.validation import check_is_fitted 4 | from sklearn.base import BaseEstimator, ClassifierMixin 5 | 6 | 7 | class VowpalWabbitClassifier(BaseEstimator, ClassifierMixin): 8 | """ 9 | Vowpal Wabbit based text classifier. 10 | 11 | This object represents a simplified [Vowpal Wabbit](https://vowpalwabbit.org/) classifier that is 12 | compatible with scikit-learn. The only caveat is that the model expects 13 | text-arrays as opposed to numeric arrays. 
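Under the hood the classifier runs Vowpal Wabbit in one-against-all (`oaa`) mode with logistic loss and `probabilities=True`, which is what allows `predict_proba` to return per-class probabilities.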
14 | 15 | Arguments: 16 | n_loop: the number of times the fit step should apply to the training data 17 | n_gram: number of n_grams to encode as well 18 | learning_rate: the learning rate to apply while training 19 | 20 | Usage: 21 | 22 | ```python 23 | from tokenwiser.wabbit import VowpalWabbitClassifier 24 | 25 | clf = VowpalWabbitClassifier() 26 | 27 | X = [ 28 | "this is friendly", 29 | "very friendly", 30 | "i do not like you", 31 | "the sky is blue" 32 | ] 33 | 34 | y = ["pos", "pos", "neg", "neutral"] 35 | 36 | # partial fitting 37 | for x_, y_ in zip(X, y): 38 | clf.partial_fit(x_, y_, classes=["pos", "neg", "neutral"]) 39 | clf.predict(X) 40 | 41 | # batch fitting 42 | clf.fit(X, y).predict(X) 43 | ``` 44 | """ 45 | 46 | def __init__(self, n_loop: int = 1, n_gram: int = 1, learning_rate: float = 0.5): 47 | self.model = None 48 | self.n_loop = n_loop 49 | self.n_gram = n_gram 50 | self.learning_rate = learning_rate 51 | 52 | def fit(self, X, y): 53 | """ 54 | Fit the model using X, y as training data. 55 | 56 | Arguments: 57 | X: array-like, shape=(n_columns, n_samples, ) training data, must be text. 58 | y: labels 59 | """ 60 | return self.partial_fit(X, y, classes=list(set(y))) 61 | 62 | def partial_fit(self, X, y, classes): 63 | """ 64 | Incremental fit on a batch of samples. 65 | 66 | Arguments: 67 | X: array-like, shape=(n_columns, n_samples, ) training data, must be text. 68 | y: labels 69 | classes: list of all the classes in the dataset 70 | """ 71 | if not isinstance(X[0], str): 72 | raise ValueError("This model only accepts text as input.") 73 | if not self.model: 74 | self.classes_ = classes 75 | self.idx_to_cls_ = {i + 1: c for i, c in enumerate(self.classes_)} 76 | self.cls_to_idx_ = {c: i + 1 for i, c in enumerate(self.classes_)} 77 | self.model = pyvw.vw( 78 | quiet=True, 79 | oaa=len(classes), 80 | ngram=self.n_gram, 81 | learning_rate=self.learning_rate, 82 | loss_function="logistic", 83 | probabilities=True, 84 | ) 85 | for loop in range(self.n_loop): 86 | for x_, y_ in zip(X, y): 87 | try: 88 | self.model.learn(f"{self.cls_to_idx_[y_]} | {x_}") 89 | except RuntimeError as e: 90 | ex = f"{self.cls_to_idx_[y_]} | {x_}" 91 | raise RuntimeError(f"{e}\nculprit: {ex}") 92 | return self 93 | 94 | def predict_proba(self, X): 95 | """ 96 | Return probability estimates for the test vector X. 97 | 98 | Arguments: 99 | X: array-like, shape=(n_columns, n_samples, ) training data, must be text. 100 | """ 101 | check_is_fitted(self, ["classes_", "cls_to_idx_", "idx_to_cls_"]) 102 | r = np.array([self.model.predict(f"| {x}") for x in X]) 103 | return r / r.sum(axis=1).reshape(-1, 1) 104 | 105 | def predict(self, X): 106 | """ 107 | Perform classification on an array of test vectors X. 108 | 109 | Arguments: 110 | X: array-like, shape=(n_columns, n_samples, ) training data, must be text. 111 | """ 112 | argmax = self.predict_proba(X).argmax(axis=1) 113 | return np.array([self.idx_to_cls_[a + 1] for a in argmax]) 114 | --------------------------------------------------------------------------------