├── .gitignore ├── LICENSE ├── README.md ├── embedders ├── __init__.py ├── classification │ ├── __init__.py │ ├── contextual.py │ ├── count_based.py │ └── reduce.py ├── enums.py ├── extraction │ ├── __init__.py │ ├── contextual.py │ ├── count_based.py │ └── reduce.py ├── samples │ ├── __init__.py │ └── clickbait.py └── util.py ├── publish ├── requirements.txt ├── setup.py └── tutorials └── Finding similar sentences within a text corpus.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .DS_Store 3 | debugging.py 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright (c) 2022 kern.ai 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![embedders](https://uploads-ssl.webflow.com/61e47fafb12bd56b40022a49/626ee1c35a3abf0ca872486d_embedder-banner.png) 2 | [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/) 3 | [![pypi 0.1.8](https://img.shields.io/badge/pypi-0.1.8-red.svg)](https://pypi.org/project/embedders/0.1.8/) 4 | 5 | # ⚗️ embedders 6 | 7 | With `embedders`, you can easily convert your texts into sentence- or token-level embeddings within a few lines of code. Use cases for this include similarity search between texts, information extraction such as named entity recognition, or basic text classification. 8 | 9 | ## Prerequisites 10 | 11 | This library uses [spaCy](https://github.com/explosion/spaCy) for tokenization; to apply it, please download the [respective language model](https://spacy.io/models) first. 12 | 13 | ## Installation 14 | 15 | You can set up this library via either running `$ pip install embedders`, or via cloning this repository and running `$ pip install -r requirements.txt` in your repository. 16 | 17 | A sample installation would be: 18 | 19 | ``` 20 | $ conda create --name embedders python=3.9 21 | $ conda activate embedders 22 | $ pip install embedders 23 | $ python -m spacy download en_core_web_sm 24 | ``` 25 | 26 | ## Usage 27 | 28 | Once you installed the package, you can apply the embedders with a few lines of code. You can apply embedders on sentence- or token-level. 29 | 30 | ### Sentence embeddings 31 | 32 | `"Wow, what a cool tool!"` is embedded to 33 | 34 | ``` 35 | [ 36 | 2.453, 8.325, ..., 3.863 37 | ] 38 | ``` 39 | 40 | Currently, we provide the following sentence embeddings: 41 | | **Path** | **Name** | **Embeds documents using ...** | 42 | | ------------------------------------ | --------------------------- | ------------------------------------------------------------ | 43 | | embedders.classification.contextual | HuggingFaceSentenceEmbedder | large, pre-trained transformers from https://huggingface.co | 44 | | embedders.classification.contextual | OpenAISentenceEmbedder | large, pre-trained transformers from https://openai.com | 45 | | embedders.classification.contextual | CohereSentenceEmbedder | large, pre-trained transformers from https://cohere.com | 46 | | embedders.classification.count_based | BagOfCharsSentenceEmbedder | plain Bag of Chars approach | 47 | | embedders.classification.count_based | BagOfWordsSentenceEmbedder | plain Bag of Words approach | 48 | | embedders.classification.count_based | TfidfSentenceEmbedder | Term Frequency - Inverse Document Frequency (TFIDF) approach | 49 | 50 | ### Token embeddings 51 | 52 | `"Wow, what a cool tool!"` is embedded to 53 | 54 | ``` 55 | [ 56 | [8.453, 1.853, ...], 57 | [3.623, 2.023, ...], 58 | [1.906, 9.604, ...], 59 | [7.306, 2.325, ...], 60 | [6.630, 1.643, ...], 61 | [3.023, 4.974, ...] 
62 | ] 63 | ``` 64 | 65 | Currently, we provide the following token embeddings: 66 | 67 | | **Path** | **Name** | **Embeds documents using ...** | 68 | | -------------------------------- | ------------------------ | ----------------------------------------------------------- | 69 | | embedders.extraction.contextual | TransformerTokenEmbedder | large, pre-trained transformers from https://huggingface.co | 70 | | embedders.extraction.count_based | BagOfCharsTokenEmbedder | plain Bag of Characters approach | 71 | 72 | You can choose the embedding category depending on the task at hand. To use them, just pick one of the available embedders and apply it to your text corpus as follows (shown for sentence embeddings, but the same works for token embeddings): 73 | 74 | ```python 75 | from embedders.classification.contextual import TransformerSentenceEmbedder 76 | from embedders.classification.reduce import PCASentenceReducer 77 | 78 | corpus = [ 79 | "I went to Cologne in 2009", 80 | "My favorite number is 41", 81 | # ... 82 | ] 83 | 84 | embedder = TransformerSentenceEmbedder("bert-base-cased") 85 | embeddings = embedder.fit_transform(corpus) # contains a list of shape [num_texts, embedding_dimension] 86 | ``` 87 | 88 | Sometimes, you may want to reduce the size of the embeddings you receive. To do so, you can simply wrap your embedder with a dimensionality reduction technique. 89 | 90 | ```python 91 | # if the dimension is too large, you can also apply dimensionality reduction 92 | reducer = PCASentenceReducer(embedder) 93 | embeddings_reduced = reducer.fit_transform(corpus) 94 | ``` 95 | 96 | Currently, we provide the following dimensionality reductions: 97 | | **Path** | **Name** | **Description** | 98 | | ------------------------------- | ------------------- | --------------------------------------------------------------------------------- | 99 | | embedders.classification.reduce | PCASentenceReducer | Wraps the embedder into a principal component analysis to reduce the dimensionality | 100 | | embedders.extraction.reduce | PCATokenReducer | Wraps the embedder into a principal component analysis to reduce the dimensionality | 101 | 102 | ## Pre-trained embedders 103 | 104 | With the growing availability of large, pre-trained models such as those provided by [🤗 Hugging Face](https://huggingface.co/), embedding complex sentences in a wide variety of languages and domains becomes much more accessible. If you want to make use of transformer models, you can just use the configuration string of the respective model, which will automatically pull the correct model from the [🤗 Hugging Face Hub](https://huggingface.co/models). 105 | 106 | ## Contributing 107 | 108 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**. 109 | 110 | If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement". 111 | Don't forget to give the project a star! Thanks again! 112 | 113 | 1. Fork the Project 114 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) 115 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) 116 | 4. Push to the Branch (`git push origin feature/AmazingFeature`) 117 | 5. Open a Pull Request 118 | 119 | And please don't forget to leave a ⭐ if you like the work! 120 | 121 | ## License 122 | 123 | Distributed under the Apache 2.0 License.
See LICENSE.txt for more information. 124 | 125 | ## Contact 126 | 127 | This library is developed and maintained by [kern.ai](https://github.com/code-kern-ai). If you want to provide us with feedback or have some questions, don't hesitate to contact us. We're super happy to help ✌️ 128 | -------------------------------------------------------------------------------- /embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from typing import Dict, List, Generator, Optional, Union 3 | from spacy.tokens.doc import Doc 4 | from sklearn.decomposition import PCA 5 | from tqdm import tqdm 6 | from embedders import util 7 | from joblib import dump, load 8 | 9 | 10 | class Transformer(metaclass=ABCMeta): 11 | def __init__(self): 12 | self._warnings = {} 13 | 14 | @abstractmethod 15 | def fit_transform( 16 | self, documents: List[Union[str, Doc]], as_generator: bool 17 | ) -> Union[List, Generator]: 18 | """Trains the given algorithm to embed textual documents into semantic vector-spacy representations. 19 | 20 | Args: 21 | documents (List[Union[str, Doc]]): List of plain strings or spaCy documents. 22 | as_generator (bool): Embeddings are calculated batch-wise. If this is set to False, the results will be summarized in one list, else a generator will yield the values. 23 | 24 | Returns: 25 | Union[List, Generator]: List with all embeddings or generator that yields the embeddings. 26 | """ 27 | pass 28 | 29 | @abstractmethod 30 | def transform( 31 | self, documents: List[Union[str, Doc]], as_generator: bool 32 | ) -> Union[List, Generator]: 33 | """Uses the trained algorithm to embed textual documents into semantic vector-spacy representations. 34 | 35 | Args: 36 | documents (List[Union[str, Doc]]): List of plain strings or spaCy documents. 37 | as_generator (bool): Embeddings are calculated batch-wise. If this is set to False, the results will be summarized in one list, else a generator will yield the values. 38 | 39 | Returns: 40 | Union[List, Generator]: List with all embeddings or generator that yields the embeddings. 41 | """ 42 | pass 43 | 44 | @abstractmethod 45 | def get_warnings(self) -> Dict: 46 | """Collects all warnings reported during the embedding creation or PCA. 
47 | 48 | Returns: 49 | List: List with all warnings 50 | """ 51 | pass 52 | 53 | 54 | class Embedder(Transformer, metaclass=ABCMeta): 55 | def __init__(self): 56 | super().__init__() 57 | 58 | @abstractmethod 59 | def _encode(self, documents: List[Union[str, Doc]], fit_model: bool) -> Generator: 60 | pass 61 | 62 | def _encode_batch( 63 | self, 64 | documents: List[Union[str, Doc]], 65 | as_generator: bool, 66 | fit_model: bool, 67 | show_progress: Optional[bool] = True, 68 | ) -> Union[List, Generator]: 69 | if as_generator: 70 | return self._encode(documents, fit_model) 71 | else: 72 | embeddings = [] 73 | if show_progress: 74 | num_batches = util.num_batches(documents, self.batch_size) 75 | print("Initializing model, might take some time...") 76 | for embedding_batch in tqdm( 77 | self._encode(documents, fit_model), 78 | total=num_batches, 79 | desc="Encoding batches ...", 80 | ): 81 | embeddings.extend(embedding_batch) 82 | else: 83 | for embedding_batch in self._encode(documents, fit_model): 84 | embeddings.extend(embedding_batch) 85 | return embeddings 86 | 87 | def fit_transform( 88 | self, documents: List[Union[str, Doc]], as_generator: bool = False 89 | ) -> Union[List, Generator]: 90 | return self._encode_batch(documents, as_generator, True) 91 | 92 | def transform( 93 | self, documents: List[Union[str, Doc]], as_generator: bool = False 94 | ) -> Union[List, Generator]: 95 | return self._encode_batch(documents, as_generator, False) 96 | 97 | def get_warnings(self) -> Dict: 98 | return self._warnings 99 | 100 | 101 | class PCAReducer(Transformer, metaclass=ABCMeta): 102 | """Wraps embedder into a principial component analysis to reduce the dimensionality. 103 | 104 | Args: 105 | embedder (Embedder): Algorithm to embed the documents. 106 | n_components (int, optional): Number of principal components to keep. Defaults to 8. 107 | autocorrect_n_components (bool, optional): If there are less data samples than specified components, this will automatically reduce the number of principial components. Defaults to True. 108 | """ 109 | 110 | def __init__( 111 | self, 112 | embedder: Embedder, 113 | n_components: int = 8, 114 | autocorrect_n_components: bool = True, 115 | **kwargs 116 | ): 117 | super().__init__() 118 | self.embedder = embedder 119 | self.reducer = PCA(n_components=n_components, **kwargs) 120 | self.batch_size = self.embedder.batch_size 121 | self.autocorrect_n_components = autocorrect_n_components 122 | 123 | def store_pca_weights(self, file_name: str): 124 | """Stores the PCA weights to a file. 125 | 126 | Args: 127 | file_name (str): Path to the file without any file endings. 128 | """ 129 | dump(self.reducer, f'{file_name}.joblib') 130 | 131 | def load_pca_weights(self, file_name: str): 132 | """Loads the PCA weights from a file. 133 | 134 | Args: 135 | file_name (str): Path to the file without any file endings. 
136 | """ 137 | self.reducer = load(f'{file_name}.joblib') 138 | 139 | @abstractmethod 140 | def _reduce( 141 | self, 142 | documents: List[Union[str, Doc]], 143 | fit_model: bool, 144 | fit_after_n_batches: int, 145 | ): 146 | pass 147 | 148 | def _reduce_batch( 149 | self, 150 | documents: List[Union[str, Doc]], 151 | as_generator: bool, 152 | fit_model: bool, 153 | fit_after_n_batches: int, 154 | ) -> Union[List, Generator]: 155 | if as_generator: 156 | return self._reduce(documents, fit_model, fit_after_n_batches) 157 | else: 158 | embeddings = [] 159 | for embedding_batch in self._reduce( 160 | documents, fit_model, fit_after_n_batches 161 | ): 162 | embeddings.extend(embedding_batch) 163 | return embeddings 164 | 165 | def fit_transform( 166 | self, 167 | documents: List[Union[str, Doc]], 168 | as_generator: bool = False, 169 | fit_after_n_batches: int = 5, 170 | ) -> Union[List, Generator]: 171 | """Trains the given algorithm to embed textual documents into semantic vector-spacy representations. 172 | 173 | Args: 174 | documents (List[Union[str, Doc]]): List of plain strings or spaCy documents. 175 | as_generator (bool, optional): Embeddings are calculated batch-wise. If this is set to False, the results will be summarized in one list, else a generator will yield the values.. Defaults to False. 176 | fit_after_n_batches (int, optional): Maximal batch iteration, after which the PCA is fitted. Defaults to 5. 177 | 178 | Returns: 179 | Union[List, Generator]: List with all embeddings or generator that yields the embeddings. 180 | """ 181 | 182 | return self._reduce_batch( 183 | documents, 184 | as_generator, 185 | True, 186 | fit_after_n_batches, 187 | ) 188 | 189 | def transform(self, documents, as_generator=False) -> Union[List, Generator]: 190 | return self._reduce_batch(documents, as_generator, False, 0) 191 | 192 | def get_warnings(self) -> Dict: 193 | return {**self._warnings, **self.embedder.get_warnings()} 194 | -------------------------------------------------------------------------------- /embedders/classification/__init__.py: -------------------------------------------------------------------------------- 1 | from embedders import Embedder 2 | 3 | 4 | class SentenceEmbedder(Embedder): 5 | def __init__(self, batch_size: int = 128): 6 | super().__init__() 7 | self.batch_size = batch_size 8 | -------------------------------------------------------------------------------- /embedders/classification/contextual.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union, Generator 2 | from sentence_transformers import SentenceTransformer 3 | from embedders import util 4 | from embedders.classification import SentenceEmbedder 5 | from spacy.tokens.doc import Doc 6 | import torch 7 | import openai 8 | from openai import error as openai_error 9 | import cohere 10 | import time 11 | 12 | 13 | class TransformerSentenceEmbedder(SentenceEmbedder): 14 | """Embeds documents using large, pre-trained transformers from https://huggingface.co 15 | 16 | Args: 17 | config_string (str): Name of the model listed on https://huggingface.co/models 18 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 
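
    Example (illustrative sketch; any model name listed on https://huggingface.co/models can be used):
        >>> from embedders.classification.contextual import TransformerSentenceEmbedder
        >>> embedder = TransformerSentenceEmbedder("bert-base-cased")
        >>> embeddings = embedder.fit_transform(["This is a test", "This is another test"])
        >>> # one embedding vector (list of floats) per input document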
19 | """ 20 | 21 | def __init__(self, config_string: str, batch_size: int = 128): 22 | super().__init__(batch_size) 23 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 24 | self.model = SentenceTransformer(config_string).to(self.device) 25 | 26 | def _encode( 27 | self, documents: List[Union[str, Doc]], fit_model: bool 28 | ) -> Generator[List[List[float]], None, None]: 29 | for documents_batch in util.batch(documents, self.batch_size): 30 | yield self.model.encode(documents_batch, show_progress_bar=False).tolist() 31 | 32 | 33 | class HuggingFaceSentenceEmbedder(TransformerSentenceEmbedder): 34 | def __init__(self, config_string: str, batch_size: int = 128): 35 | super().__init__(config_string, batch_size) 36 | 37 | 38 | class OpenAISentenceEmbedder(SentenceEmbedder): 39 | def __init__( 40 | self, 41 | openai_api_key: str, 42 | model_name: str, 43 | batch_size: int = 128, 44 | api_base: Optional[str] = None, 45 | api_type: Optional[str] = None, 46 | api_version: Optional[str] = None, 47 | ): 48 | """ 49 | Embeds documents using large language models from https://openai.com or https://azure.microsoft.com 50 | 51 | Args: 52 | openai_api_key (str): API key from OpenAI or Azure 53 | model_name (str): Name of the embedding model from OpenAI (e.g. text-embedding-ada-002) or the name of your Azure endpoint 54 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 55 | api_base (str, optional): If you use Azure, you need to provide the base URL of your Azure endpoint (e.g. 'https://azureopenkernai.openai.azure.com/'). Defaults to None. 56 | api_type (str, optional): If you use Azure, you need to provide the type of your Azure endpoint (e.g. 'azure'). Defaults to None. 57 | api_version (str, optional): If you use Azure, you need to provide the version of your Azure endpoint (e.g. '2023-05-15'). Defaults to None. 58 | 59 | Raises: 60 | Exception: If you use Azure, you need to provide api_type, api_version and api_base. 61 | 62 | Examples: 63 | >>> from embedders.classification.contextual import OpenAISentenceEmbedder 64 | >>> embedder_openai = OpenAISentenceEmbedder( 65 | ... "my-key-from-openai", 66 | ... "text-embedding-ada-002", 67 | ... ) 68 | >>> embeddings = embedder_openai.transform(["This is a test", "This is another test"]) 69 | >>> print(embeddings) 70 | [[-0.0001, 0.0002, ...], [-0.0001, 0.0002, ...]] 71 | 72 | >>> from embedders.classification.contextual import OpenAISentenceEmbedder 73 | >>> embedder_azure = OpenAISentenceEmbedder( 74 | ... "my-key-from-azure", 75 | ... "my-endpoint-name", 76 | ... api_base="https://azureopenkernai.openai.azure.com/", 77 | ... api_type="azure", 78 | ... api_version="2023-05-15", 79 | ... 
) 80 | >>> embeddings = embedder_azure.transform(["This is a test", "This is another test"]) 81 | >>> print(embeddings) 82 | [[-0.0001, 0.0002, ...], [-0.0001, 0.0002, ...]] 83 | 84 | """ 85 | super().__init__(batch_size) 86 | self.model_name = model_name 87 | self.openai_api_key = openai_api_key 88 | openai.api_key = self.openai_api_key 89 | self.api_base = api_base 90 | self.api_type = api_type 91 | self.api_version = api_version 92 | 93 | self.use_azure = any( 94 | [ 95 | api_base is not None, 96 | api_type is not None, 97 | api_version is not None, 98 | ] 99 | ) 100 | if self.use_azure: 101 | assert ( 102 | api_type is not None 103 | and api_version is not None 104 | and api_base is not None 105 | ), "If you want to use Azure, you need to provide api_type, api_version and api_base." 106 | 107 | openai.api_base = api_base 108 | openai.api_type = api_type 109 | openai.api_version = api_version 110 | 111 | def __getstate__(self): 112 | state = self.__dict__.copy() 113 | return state 114 | 115 | def __setstate__(self, state): 116 | self.__dict__.update(state) 117 | self.model_name = state["model_name"] 118 | self.openai_api_key = state["openai_api_key"] 119 | openai.api_key = self.openai_api_key 120 | self.use_azure = state.get("use_azure") 121 | if self.use_azure: 122 | self.api_base = state["api_base"] 123 | self.api_type = state["api_type"] 124 | self.api_version = state["api_version"] 125 | openai.api_base = self.api_base 126 | openai.api_type = self.api_type 127 | openai.api_version = self.api_version 128 | 129 | def _encode( 130 | self, documents: List[Union[str, Doc]], fit_model: bool 131 | ) -> Generator[List[List[float]], None, None]: 132 | for documents_batch in util.batch(documents, self.batch_size): 133 | documents_batch = [doc.replace("\n", " ") for doc in documents_batch] 134 | try: 135 | if self.use_azure: 136 | embeddings = [] 137 | for azure_batch in util.batch(documents_batch, 16): 138 | # azure only allows up to 16 documents per request 139 | count = 0 140 | while True and count < 60: 141 | try: 142 | count += 1 143 | response = openai.Embedding.create( 144 | input=azure_batch, engine=self.model_name 145 | ) 146 | break 147 | except openai.error.RateLimitError as e: 148 | if count >= 60: 149 | raise e 150 | if count == 1: 151 | print( 152 | "Rate limit exceeded. Waiting 10 seconds...", 153 | flush=True, 154 | ) 155 | time.sleep(10.05) 156 | else: 157 | time.sleep(1) 158 | embeddings += [entry["embedding"] for entry in response["data"]] 159 | else: 160 | response = openai.Embedding.create( 161 | input=documents_batch, engine=self.model_name 162 | ) 163 | embeddings = [entry["embedding"] for entry in response["data"]] 164 | yield embeddings 165 | except openai_error.AuthenticationError: 166 | raise Exception( 167 | "OpenAI API key is invalid. Please provide a valid API key in the constructor of OpenAISentenceEmbedder." 
168 | ) 169 | 170 | 171 | class CohereSentenceEmbedder(SentenceEmbedder): 172 | def __init__(self, cohere_api_key: str, batch_size: int = 128): 173 | super().__init__(batch_size) 174 | self.cohere_api_key = cohere_api_key 175 | self.model = cohere.Client(self.cohere_api_key) 176 | 177 | def __getstate__(self): 178 | state = self.__dict__.copy() 179 | # Don't pickle 'model' 180 | del state["model"] 181 | return state 182 | 183 | def __setstate__(self, state): 184 | self.__dict__.update(state) 185 | # Restore 'model' after unpickling 186 | self.model = cohere.Client(self.cohere_api_key) 187 | 188 | def _encode( 189 | self, documents: List[Union[str, Doc]], fit_model: bool 190 | ) -> Generator[List[List[float]], None, None]: 191 | for documents_batch in util.batch(documents, self.batch_size): 192 | embeddings = self.model.embed(documents_batch).embeddings 193 | yield embeddings 194 | -------------------------------------------------------------------------------- /embedders/classification/count_based.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union, Generator 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | from embedders.classification import SentenceEmbedder 5 | from embedders import util 6 | 7 | 8 | class CountSentenceEmbedder(SentenceEmbedder): 9 | def __init__(self, batch_size: int, min_df: float, **kwargs): 10 | super().__init__(batch_size) 11 | 12 | def _encode( 13 | self, documents: List[str], fit_model: bool 14 | ) -> Generator[List[List[Union[float, int]]], None, None]: 15 | if fit_model: 16 | self.model.fit(documents) 17 | 18 | for documents_batch in util.batch(documents, self.batch_size): 19 | documents_batch_embedded = [] 20 | for doc in documents_batch: 21 | documents_batch_embedded.append( 22 | self.model.transform([doc]).toarray().tolist()[0] 23 | ) 24 | yield documents_batch_embedded 25 | 26 | 27 | class BagOfCharsSentenceEmbedder(CountSentenceEmbedder): 28 | """Embeds documents using plain Bag of Characters approach. 29 | 30 | Args: 31 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 32 | min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. 33 | """ 34 | 35 | def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): 36 | super().__init__(batch_size, min_df) 37 | self.model = CountVectorizer(analyzer="char", min_df=min_df, **kwargs) 38 | 39 | 40 | class BagOfWordsSentenceEmbedder(CountSentenceEmbedder): 41 | """Embeds documents using plain Bag of Words approach. 42 | 43 | Args: 44 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 45 | min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. 
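
    Example (illustrative sketch):
        >>> from embedders.classification.count_based import BagOfWordsSentenceEmbedder
        >>> corpus = ["I went to Cologne in 2009", "My favorite number is 41"]
        >>> embedder = BagOfWordsSentenceEmbedder(min_df=0.1)
        >>> embeddings = embedder.fit_transform(corpus)  # one count vector per document
        >>> # batch-wise processing is also possible:
        >>> for batch in embedder.fit_transform(corpus, as_generator=True):
        ...     pass  # each batch holds up to `batch_size` embedding vectors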
46 | """ 47 | 48 | def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): 49 | super().__init__(batch_size, min_df) 50 | self.model = CountVectorizer(min_df=min_df, **kwargs) 51 | 52 | 53 | class TfidfSentenceEmbedder(CountSentenceEmbedder): 54 | """Embeds documents using Term Frequency - Inverse Document Frequency (TFIDF) approach. 55 | 56 | Args: 57 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 58 | min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. 59 | """ 60 | 61 | def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): 62 | super().__init__(batch_size, min_df) 63 | self.model = TfidfVectorizer(min_df=min_df, **kwargs) 64 | -------------------------------------------------------------------------------- /embedders/classification/reduce.py: -------------------------------------------------------------------------------- 1 | from spacy.tokens.doc import Doc 2 | from typing import Union, List, Generator 3 | import numpy as np 4 | from embedders import PCAReducer, util 5 | 6 | 7 | class PCASentenceReducer(PCAReducer): 8 | def _transform( 9 | self, embeddings: List[List[Union[int, float]]] 10 | ) -> List[List[Union[float, int]]]: 11 | return self.reducer.transform(embeddings).tolist() 12 | 13 | def _reduce( 14 | self, 15 | documents: List[Union[str, Doc]], 16 | fit_model: bool, 17 | fit_after_n_batches: int, 18 | ) -> Generator[List[List[Union[float, int]]], None, None]: 19 | if fit_model: 20 | embeddings_training = [] 21 | num_batches = util.num_batches(documents, self.embedder.batch_size) 22 | fit_after_n_batches = min(num_batches, fit_after_n_batches) - 1 23 | for batch_idx, batch in enumerate( 24 | self.embedder.fit_transform(documents, as_generator=True) 25 | ): 26 | if batch_idx <= fit_after_n_batches: 27 | embeddings_training.append(batch) 28 | 29 | if batch_idx == fit_after_n_batches: 30 | embeddings_training_flattened = [] 31 | for batch_training in embeddings_training: 32 | embeddings_training_flattened.extend(batch_training) 33 | embeddings_training_flattened = np.array( 34 | embeddings_training_flattened 35 | ) 36 | if ( 37 | embeddings_training_flattened.shape[1] 38 | < self.reducer.n_components 39 | and self.autocorrect_n_components 40 | ): 41 | self.reducer.n_components = embeddings_training_flattened.shape[ 42 | 1 43 | ] 44 | self.reducer.fit(embeddings_training_flattened) 45 | 46 | for batch_training in embeddings_training: 47 | yield self._transform(batch_training) 48 | if batch_idx > fit_after_n_batches: 49 | yield self._transform(batch) 50 | else: 51 | embeddings = self.embedder.transform(documents) 52 | yield self._transform(embeddings) 53 | -------------------------------------------------------------------------------- /embedders/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class WarningType(Enum): 5 | DOCUMENT_IS_SPLITTED = "DOCUMENT_IS_SPLITTED" 6 | TOKEN_MISMATCHING = "TOKEN_MISMATCHING" 7 | -------------------------------------------------------------------------------- /embedders/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from 
embedders import Embedder 3 | from spacy.tokens.doc import Doc 4 | from typing import Union 5 | 6 | 7 | class TokenEmbedder(Embedder): 8 | def __init__( 9 | self, language_code: str, precomputed_docs: bool = False, batch_size: int = 128 10 | ): 11 | super().__init__() 12 | self.preloaded = precomputed_docs 13 | if precomputed_docs: 14 | self.nlp = spacy.blank(language_code) 15 | else: 16 | self.nlp = spacy.load(language_code) 17 | self.batch_size = batch_size 18 | 19 | def _get_tokenized_document(self, document: Union[str, Doc]): 20 | if self.preloaded: 21 | return document 22 | else: 23 | return self.nlp(document) 24 | -------------------------------------------------------------------------------- /embedders/extraction/contextual.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union, Iterator 2 | import torch 3 | import math 4 | import numpy as np 5 | import re 6 | from transformers import AutoTokenizer, AutoModel 7 | from collections import defaultdict 8 | from embedders import util 9 | from spacy.tokens.doc import Doc 10 | 11 | 12 | from embedders.enums import WarningType 13 | from embedders.extraction import TokenEmbedder 14 | 15 | 16 | class TransformerTokenEmbedder(TokenEmbedder): 17 | """Embeds documents using large, pre-trained transformers from https://huggingface.co 18 | 19 | Args: 20 | config_string (str): Name of the model listed on https://huggingface.co/models 21 | language_code (str): Name of the spaCy language model 22 | precomputed_docs (bool, optional): If you have a large text corpus, it might make sense to precompute the data and input tokenized spaCy documents. Defaults to False. 23 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 24 | """ 25 | 26 | _NL_TOKEN = "[NL]" 27 | 28 | def __init__( 29 | self, 30 | config_string: str, 31 | language_code: str, 32 | precomputed_docs: bool = False, 33 | batch_size: int = 128, 34 | ): 35 | super().__init__(language_code, precomputed_docs, batch_size) 36 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 37 | 38 | self.transformer_tokenizer = AutoTokenizer.from_pretrained(config_string) 39 | self.transformer_tokenizer.add_special_tokens( 40 | {"additional_special_tokens": [self._NL_TOKEN]} 41 | ) 42 | 43 | self.model = AutoModel.from_pretrained( 44 | config_string, output_hidden_states=True 45 | ).to(self.device) 46 | self.model.resize_token_embeddings(len(self.transformer_tokenizer)) 47 | 48 | def _encode( 49 | self, documents: Union[List[str], List[Doc]], fit_model: bool 50 | ) -> Iterator[List[List[List[float]]]]: 51 | for batch_number, documents_batch in enumerate( 52 | util.batch(documents, self.batch_size) 53 | ): 54 | """ 55 | Computation of embeddings for each spacy token of each document. For the 56 | embedding creation transformer models are used. Embeddings are calculated 57 | for the transformer tokens of the document, then these token embeddings are 58 | matched to the spacy tokens. 59 | 60 | Args: 61 | documents: list of strings or spacy documents 62 | fit_model: not used, required by base class 63 | Return: 64 | Token embeddings for each document 65 | """ 66 | 67 | documents_batch_embedded = [] 68 | for document_number, document in enumerate(documents_batch): 69 | doc = self._get_tokenized_document(document) 70 | 71 | # no spacy token. 72 | # set special token as text, so that an embedding 73 | # is created that can be processed by the PCA. 
74 | if len(doc) == 0: 75 | doc = self.nlp(self._NL_TOKEN) 76 | 77 | # spacy creates tokens which only contain whitespace characters. 78 | # the transformer tokenizer ignores these tokens. 79 | # in order to avoid problems while matching the tokens the text is 80 | # preprocessed. 81 | text, prep_offsets = self._preprocess_doc_text(doc) 82 | 83 | # transformer models have a maximum number of tokens which can be 84 | # processed at the same time. 85 | # in this case the text is splitted in mutliple subparts 86 | number_est_tokens = self._estimate_token_number(text) 87 | idx_document = batch_number * self.batch_size + document_number 88 | if self.model.config.max_position_embeddings < number_est_tokens: 89 | if WarningType.DOCUMENT_IS_SPLITTED.value in self._warnings: 90 | self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value].append( 91 | idx_document 92 | ) 93 | else: 94 | self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value] = [ 95 | idx_document 96 | ] 97 | 98 | transformer_embs = [] 99 | for doc_part, index_offset in self._split_document( 100 | text, number_est_tokens 101 | ): 102 | transformer_embs.extend( 103 | self._get_transformer_embeddings( 104 | doc_part, idx_document, index_offset 105 | ) 106 | ) 107 | else: 108 | transformer_embs = self._get_transformer_embeddings( 109 | text, idx_document 110 | ) 111 | 112 | document_embedded = self._match_transformer_embeddings_to_spacy_tokens( 113 | transformer_embs, doc, prep_offsets 114 | ) 115 | 116 | if len(document_embedded) != len(doc): 117 | idx_document = batch_number * self.batch_size + document_number 118 | if WarningType.TOKEN_MISMATCHING.value in self._warnings: 119 | self._warnings[WarningType.TOKEN_MISMATCHING.value].append( 120 | idx_document 121 | ) 122 | else: 123 | self._warnings[WarningType.TOKEN_MISMATCHING.value] = [ 124 | idx_document 125 | ] 126 | 127 | documents_batch_embedded.append(document_embedded) 128 | yield documents_batch_embedded 129 | 130 | def _preprocess_doc_text(self, doc: Doc) -> Tuple[str, np.ndarray]: 131 | """Replaces the text of tokens which only consist of whitespace characters 132 | with the special token [NL] (new line). 133 | The special token and the whitespace string can consist of different number of 134 | chars. To match the tokens later these differences are saved as offsets. 135 | 136 | Args: 137 | doc: spacy document 138 | Returns: 139 | Preprocessed text in which whitespace tokens are 140 | replaced by a special token, an array containing the indices of replaced 141 | strings and the resulting offset 142 | """ 143 | 144 | prep_text = "" 145 | idx_already_preprocessed = 0 146 | # pairs of the line number of the preprocessed text and the offset relative to 147 | # the original document, here, offset is the difference btw the preprocessed and 148 | # the original text. 
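        # For example (hypothetical values): if the first whitespace-only token is
        # 6 chars long and starts at original index 10, it is replaced by the 4-char
        # "[NL]", so (14, 2) is appended; from preprocessed index 14 onwards, an
        # offset of 2 must be added to map back to indices in the original text.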
149 | prep_offsets = [(0, 0)] 150 | 151 | for tkn in doc: 152 | if not re.sub(r"[\s]+", "", tkn.text): 153 | # indices of current token which will be replaced by the special token 154 | idx_start, idx_end = tkn.idx, tkn.idx + len(tkn) 155 | # append already processed text and the special token 156 | prep_text += doc.text[idx_already_preprocessed:idx_start] 157 | prep_text += self._NL_TOKEN 158 | 159 | additional_offset = len(tkn) - len(self._NL_TOKEN) 160 | prep_offsets.append( 161 | ( 162 | len(prep_text), # index to apply offset 163 | additional_offset, # offset to be applied 164 | ) 165 | ) 166 | idx_already_preprocessed = idx_end 167 | 168 | # add remaining text 169 | prep_text += doc.text[idx_already_preprocessed:] 170 | 171 | return prep_text, np.array(prep_offsets) 172 | 173 | def _match_transformer_embeddings_to_spacy_tokens( 174 | self, 175 | transformer_embeddings: List[List[Tuple[int, int, List[List[float]]]]], 176 | document_tokenized: Doc, 177 | prep_offsets: np.ndarray = None, 178 | ) -> List[List[float]]: 179 | """ 180 | Transformer and spacy tokens differ. Usual the transformer tokenizer splits 181 | splits the text into smaller subparts in comparison to the spacy tokenizer. 182 | To create embeddings for the spacy tokens the transformer embeddings must be 183 | matched. This is done by comparing the char spans of the tokens and matching the 184 | tokens which overlap. 185 | 186 | Args: 187 | transformer_embeddings: List of start and end indices for each transformer 188 | token and the corresponding embedding 189 | document_tokenized: spacy tokens 190 | prep_offsets: Indices and offsets to match the preprocessed text to the 191 | original document 192 | Returns: 193 | Embeddings for each spacy token in the tokenized document. 194 | """ 195 | 196 | embeddings = defaultdict(list) 197 | 198 | for index_start, index_end, transformer_emb in transformer_embeddings: 199 | 200 | if prep_offsets is not None: 201 | index_start = self._add_offset(index_start, prep_offsets) 202 | index_end = self._add_offset(index_end, prep_offsets) 203 | 204 | span = document_tokenized.char_span( 205 | index_start, index_end, alignment_mode="expand" 206 | ) 207 | if span is not None: 208 | # if a transformer token include multiple spacy tokens, the spacy 209 | # tokens get the same transformer embedding. 210 | for token in span: 211 | embeddings[token.i].extend(transformer_emb) 212 | for key, values in embeddings.items(): 213 | embeddings[key] = np.array(values).mean(0).tolist() 214 | return list(embeddings.values()) 215 | 216 | def _add_offset(self, idx: int, offsets: np.ndarray) -> int: 217 | """ 218 | Adds offset to index according to the offsets array. 219 | 220 | Args: 221 | idx: index to transform 222 | offsets: indices and the corresponding offsets 223 | Returns: 224 | Index customized according to the offset 225 | """ 226 | return idx + np.sum(offsets[np.where(offsets[:, 0] <= idx)][:, 1]) 227 | 228 | def _get_transformer_embeddings( 229 | self, 230 | document: str, 231 | idx_document: int, 232 | idx_offset: int = 0, 233 | ) -> List[List[Tuple[int, int, List[List[float]]]]]: 234 | """ 235 | Calculates embeddings for the given document using a transformer model. 236 | First, the corresponding transformer tokens are computed. The next steps 237 | computes the embeddings. With each embedding the indices of the according 238 | chars are returned. idx_offset is used to return the correct indices if the 239 | document has been split. 
240 | 241 | Args: 242 | document: plain document text 243 | idx_offset: offset if the document has been splitted 244 | Returns: 245 | Start and end index for each transformer token and the calculated 246 | embedding 247 | """ 248 | encoded = self.transformer_tokenizer(document, return_tensors="pt").to( 249 | self.device 250 | ) 251 | tokens = encoded.encodings[0] 252 | 253 | # fallback if the number of tokens is still too big 254 | if len(tokens) > self.model.config.max_position_embeddings: 255 | if WarningType.DOCUMENT_IS_SPLITTED.value in self._warnings: 256 | self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value].append( 257 | idx_document 258 | ) 259 | else: 260 | self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value] = [idx_document] 261 | 262 | token_embs = [] 263 | for doc_part, additional_idx_offset in self._split_document( 264 | document, len(tokens) 265 | ): 266 | token_embs.extend( 267 | self._get_transformer_embeddings( 268 | doc_part, idx_document, idx_offset + additional_idx_offset 269 | ) 270 | ) 271 | return token_embs 272 | 273 | with torch.no_grad(): 274 | output = self.model(**encoded) 275 | 276 | # Get all hidden states 277 | states = output.hidden_states 278 | # Stack and sum last four layers 279 | layers = [-4, -3, -2, -1] 280 | output = torch.stack([states[i] for i in layers]).sum(0).squeeze() 281 | 282 | token_embeddings = [] 283 | # 1 and -1 are [CLS] tokens, and other tokens can be ##subwords 284 | for word_idx in set(tokens.word_ids[1:-1]): 285 | index_begin, index_end = tokens.word_to_chars(word_idx) 286 | token_ids_word = np.where(np.array(encoded.word_ids()) == word_idx) 287 | # Only select the tokens that constitute the requested word 288 | word_tokens_output = output[token_ids_word] 289 | token_embeddings.append( 290 | [ 291 | index_begin + idx_offset, 292 | index_end + idx_offset, 293 | word_tokens_output.tolist(), 294 | ] 295 | ) 296 | return token_embeddings 297 | 298 | def _estimate_token_number(self, document: str) -> int: 299 | """ 300 | Estimates the number of tokens which are generated by the transformer model. 301 | It is based on the rule of thumb that per token 3 subtokens are created by 302 | the transformer tokenizer. Tokens are created by splitting at every 303 | special and whitespace character. 304 | Special Characters are handled seperately according to the assumption that each 305 | special character is treated as a token by the transformer tokenizer. 306 | 307 | Args: 308 | document: plain text document 309 | Returns: 310 | Estimation for the number of transformer tokens included in the document 311 | """ 312 | avg_subtokens_per_token = 3 313 | number_word_tokens = len(re.findall(r"\[NL\]|\w+", document)) 314 | number_special_characters = len(re.sub(r"[\w\s]+", "", document)) 315 | return avg_subtokens_per_token * number_word_tokens + number_special_characters 316 | 317 | def _split_document( 318 | self, document: str, estimated_tokens: int 319 | ) -> Iterator[Tuple[str, int]]: 320 | """ 321 | Splits the documens into subparts, according to the model's max length and the 322 | number of estimated tokens. 
323 | 324 | Args: 325 | document: plain text document 326 | estimated_tokens: estimation for the token number 327 | Returns: 328 | Yields subpart of the document, splitted depending on max model length and 329 | estimated number of tokens 330 | """ 331 | # the regular expression matches the special token [NL], any word consiting of 332 | # numbers and chars and single characters which are no whitespace or word 333 | # character 334 | token_spans = [ 335 | token.span() for token in re.finditer(r"\[NL\]|\w+|[^\w\s]+?", document) 336 | ] 337 | split_into = ( 338 | round(estimated_tokens / self.model.config.max_position_embeddings) + 1 339 | ) 340 | len_part = math.ceil(len(token_spans) / split_into) 341 | 342 | prev_split_idx = 0 343 | for i in range(split_into): 344 | current_split_idx = min( 345 | len(document), 346 | token_spans[min((i + 1) * len_part, len(token_spans) - 1)][1], 347 | ) 348 | yield document[prev_split_idx:current_split_idx], prev_split_idx 349 | prev_split_idx = current_split_idx 350 | -------------------------------------------------------------------------------- /embedders/extraction/count_based.py: -------------------------------------------------------------------------------- 1 | from typing import List, Generator, Union 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | from embedders import util 4 | from spacy.tokens.doc import Doc 5 | 6 | from embedders.extraction import TokenEmbedder 7 | 8 | 9 | class BagOfCharsTokenEmbedder(TokenEmbedder): 10 | """Embeds documents using plain Bag of Characters approach. 11 | 12 | Args: 13 | language_code (str): Name of the spaCy language model 14 | precomputed_docs (bool, optional): If you have a large text corpus, it might make sense to precompute the data and input tokenized spaCy documents. Defaults to False. 15 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 
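
    Example (illustrative sketch; assumes the spaCy model `en_core_web_sm` is installed):
        >>> from embedders.extraction.count_based import BagOfCharsTokenEmbedder
        >>> embedder = BagOfCharsTokenEmbedder("en_core_web_sm")
        >>> embeddings = embedder.fit_transform(["I went to Cologne in 2009"])
        >>> # one list per document, containing one character-count vector per spaCy token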
16 | """ 17 | 18 | def __init__( 19 | self, 20 | language_code: str, 21 | precomputed_docs: bool = False, 22 | batch_size: int = 128, 23 | **kwargs 24 | ): 25 | super().__init__(language_code, precomputed_docs, batch_size) 26 | self.model = CountVectorizer(analyzer="char", min_df=0.01, **kwargs) 27 | 28 | def _encode( 29 | self, documents: List[Union[str, Doc]], fit_model: bool 30 | ) -> Generator[List[List[List[int]]], None, None]: 31 | if fit_model: 32 | if self.preloaded: 33 | self.model.fit([doc.text for doc in documents]) 34 | else: 35 | self.model.fit(documents) 36 | 37 | for documents_batch in util.batch(documents, self.batch_size): 38 | documents_batch_embedded = [] 39 | for doc in documents_batch: 40 | documents_batch_embedded.append( 41 | self.model.transform( 42 | [tok.text for tok in self._get_tokenized_document(doc)] 43 | ) 44 | .toarray() 45 | .tolist() 46 | ) 47 | yield documents_batch_embedded 48 | -------------------------------------------------------------------------------- /embedders/extraction/reduce.py: -------------------------------------------------------------------------------- 1 | from typing import List, Generator, Union 2 | import numpy as np 3 | from embedders import PCAReducer, util 4 | 5 | 6 | class PCATokenReducer(PCAReducer): 7 | def __init__(self, embedder, **kwargs): 8 | super().__init__(embedder=embedder, **kwargs) 9 | self.nlp = embedder.nlp 10 | 11 | def _transform( 12 | self, embeddings: List[List[List[Union[int, float]]]] 13 | ) -> List[List[List[Union[float, int]]]]: 14 | batch_concatenated = np.concatenate(embeddings) 15 | start_idx = 0 16 | batch_unsqueezed = [] 17 | for length in [len(embedding) for embedding in embeddings]: 18 | end_idx = start_idx + length 19 | batch_reduced = self.reducer.transform( 20 | batch_concatenated[start_idx:end_idx] 21 | ) 22 | batch_unsqueezed.append(batch_reduced.tolist()) 23 | start_idx = end_idx 24 | return batch_unsqueezed 25 | 26 | def _reduce( 27 | self, documents, fit_model, fit_after_n_batches 28 | ) -> Generator[List[List[List[Union[float, int]]]], None, None]: 29 | if fit_model: 30 | embeddings_training = [] 31 | num_batches = util.num_batches(documents, self.embedder.batch_size) 32 | fit_after_n_batches = min(num_batches, fit_after_n_batches) - 1 33 | for batch_idx, batch in enumerate( 34 | self.embedder.fit_transform(documents, as_generator=True) 35 | ): 36 | if batch_idx <= fit_after_n_batches: 37 | embeddings_training.append(batch) 38 | 39 | if batch_idx == fit_after_n_batches: 40 | embeddings_training_flattened = [] 41 | for batch_training in embeddings_training: 42 | embeddings_training_flattened.extend( 43 | np.concatenate(batch_training).tolist() 44 | ) 45 | embeddings_training_flattened = np.array( 46 | embeddings_training_flattened 47 | ) 48 | if ( 49 | embeddings_training_flattened.shape[1] 50 | < self.reducer.n_components 51 | and self.autocorrect_n_components 52 | ): 53 | self.reducer.n_components = embeddings_training_flattened.shape[ 54 | 1 55 | ] 56 | self.reducer.fit(embeddings_training_flattened) 57 | 58 | for batch_training in embeddings_training: 59 | yield self._transform(batch_training) 60 | if batch_idx > fit_after_n_batches: 61 | yield self._transform(batch) 62 | else: 63 | embeddings = self.embedder.transform(documents) 64 | yield self._transform(embeddings) 65 | -------------------------------------------------------------------------------- /embedders/samples/__init__.py: -------------------------------------------------------------------------------- 
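
The PCATokenReducer in embedders/extraction/reduce.py above flattens the token embeddings of several documents into one matrix, fits the PCA on that matrix, and then re-splits the transformed rows per document. Below is a minimal sketch of that fit-then-re-split pattern, using assumed toy dimensions and random data rather than library code.

import numpy as np
from sklearn.decomposition import PCA

# token embeddings for two documents: 20 and 30 tokens with 768 dimensions each (toy data)
doc_token_embeddings = [np.random.rand(20, 768), np.random.rand(30, 768)]

reducer = PCA(n_components=8)
flat = np.concatenate(doc_token_embeddings)  # shape (50, 768)
reduced_flat = reducer.fit_transform(flat)   # shape (50, 8)

# re-split the reduced rows so every document keeps one vector per token
reduced, start = [], 0
for doc in doc_token_embeddings:
    reduced.append(reduced_flat[start : start + len(doc)].tolist())
    start += len(doc)
print([np.array(r).shape for r in reduced])  # [(20, 8), (30, 8)]
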
https://raw.githubusercontent.com/code-kern-ai/embedders/9bb17ba7f663be53b0db047b18b6dc017df7c757/embedders/samples/__init__.py -------------------------------------------------------------------------------- /embedders/samples/clickbait.py: -------------------------------------------------------------------------------- 1 | DATA = [ 2 | "UK guinea pig farm to close after owner's family grave robbed", 3 | "18 Sweet Pumpkin Treats You Won't Believe Are Healthy", 4 | 'A Guy Just Did The Most Epic "Cha Cha Slide" Dance Ever', 5 | "Premium gas discounted for a few hours", 6 | "Sanctions on US products introduced by Brazil", 7 | "IPhone sales exceed BlackBerry", 8 | "Administration Seeks to Regulate Derivatives", 9 | "21 Life-Changing Products That Can Actually Make Your Skin Better", 10 | "US raids Iran 'liaison office', Russia says it is unacceptable", 11 | "US House of Representatives rejects bail out bill in vote", 12 | "23 Ways To Give Your Heart To Your Valentine", 13 | "Signs You Grew Up In Southern California", 14 | "5 killed in return bus trip from marching band competition", 15 | "Here's Definitive Proof That Leonardo DiCaprio Is Immortal", 16 | "Signs of Possible Deal in Pakistan Turmoil", 17 | "14 killed in Russian bus-truck collision", 18 | "In Icy Kentucky, Thousands Are Still Without Power", 19 | "Beans Memes Is The Only Twitter Account That Actually Matters", 20 | "Couples Who Prove Opposites Attract", 21 | "Ball State Upsets Tennessee in First Round", 22 | "India and U.S.A. work toward nuclear fuel agreement", 23 | "16 Tweets That Sum Up The Lengths You Would Go To Avoid Other People", 24 | "Gas Is Up, but Drivers May Look the Other Way", 25 | "Which Taylor Swift Track Should Be Your Personal Theme Song", 26 | "Bruno Mars Might Headline Super Bowl 50", 27 | "US clinic plans first face transplant", 28 | "South Korea says North Korea will test more nuclear bombs", 29 | "National Hockey League news: February 28, 2008", 30 | "41 Victoria's Secret Models Show What They Look Like Without Makeup", 31 | "North Korean military fires artillery on populated South Korean island", 32 | "Pilot killed as Su-25 military jet explodes near Vladivostok", 33 | "Conservatives Map Strategies on Court Fight", 34 | "Aziz Ansari's Instagram Post About His Dad Will Make You Cry", 35 | "Who Is Your Dad Actually", 36 | "Climate Research That Might Not Help", 37 | "11 Steamy Lyrics That Will Make See You Selena Gomez In A New Light", 38 | "What Percent Vegan Are You", 39 | "13 Misogynistic Phrases That Need To Die", 40 | "491 Scoreless Minutes Come to an End", 41 | "Oil spewing from crack in seafloor of Gulf of Mexico was fifty feet from Deepwater Horizon well", 42 | "Saudis Delay Local Elections by 2 Years", 43 | "7 Excellent Deals You Can Get This Weekend", 44 | "U.N. Warns of Refugee Crisis in Gaza Strip", 45 | "Executives from IT industry focus on 10-year anniversary of Microsoft Research Asia", 46 | "Independent Member of Australian Parliament calls for better indigenous policy", 47 | "Ayesha Curry Has Sparked A Debate About How Women Dress", 48 | "FCC requires VoIP providers to have 911 service", 49 | "17 Images That Will Only Make Sense To People Obsessed With High Heels", 50 | "Australian rules football: 2010 Gippsland Football League round 1 - Wonthaggi v Leongatha", 51 | "Two Killed in Violence on Gaza Border", 52 | "Stimulus Tour Takes Obama to New Blue States", 53 | "12 Bizarre Christmas Traditions From Around The World", 54 | "9 Differences Between Hanging Out With Your New Friend Vs. 
Your Best Friend", 55 | "Priest Reportedly Suspended For Riding A Hoverboard During Mass", 56 | "15 Songs You Loved (But Forgot About) From 10 Years Ago", 57 | "People Are Using The Hashtag #BurritoSelfie And It Is As Glorious As You'd Imagine", 58 | "This Taco Recipe Will Sexually Awaken Your Taste Buds", 59 | "16 Times Chris Martin Was Really Just An Excited Puppy", 60 | "19 Gorgeous Finnish Baby Names That Will Make You Broody", 61 | "Mugabe spokeperson tells critics to 'go hang'", 62 | "Night Owls Become Early Risers", 63 | "I Tested Pinterest Mug Recipes To See If They Actually Taste Like Food", 64 | "Former U.S. President Clinton stumps for Obama, Franken in Minneapolis", 65 | "Rangers Honor Andy Bathgate and Harry Howell", 66 | "Which Lola From 'Kalyeserye' Are You Based On These Really Hard Questions", 67 | "West African cholera claims more than 500 lives, more deaths feared", 68 | "17 Of The Most Beautifully Illustrated Picture Books In 2015", 69 | "Rwandan genocide investigations to be completed by end of July", 70 | "Mandela discharged from hospital", 71 | "15 Insanely Adorable Pins You Never Knew You Needed", 72 | "Internet Companies and Ad Agencies Find Some Common Ground", 73 | "NBA star Gilbert Arenas pleads guilty to gun possession, could face six months in prison", 74 | "Clothing Makers Exceed Quarterly Expectations", 75 | "22 Mesmerizing, Mundane Photos Of A Day In The Life Of Darth Vader", 76 | "19 Texts That Are Way Too Real For Anyone Who's A Little Bit Greedy", 77 | "What Percentage Do You Have Of Winning The Royal Rumble", 78 | "Students Stand When Called Upon, and When Not", 79 | "Justices Retain Oversight by U.S. on Voting", 80 | "Canadian film academy explains lack of Genie nomination for Juno", 81 | "National Academy of Sciences recommends manned Hubble repair", 82 | "18 Things You Didn't Know About Cold Callers", 83 | "When You Miss Your Friend", 84 | "Robin Cook dead after collapsing", 85 | "12 hurt in San Luis de La Balsa tourist bus accident", 86 | "Wreckage of plane thought to be missing Air France flight found in Atlantic", 87 | 'Which Pink Lady From "Grease" Should Be Your BFF', 88 | "Attack on mosque kills 30 in Rawalpindi, Pakistan", 89 | "Russian Uranium Sale to U.S. Is Planned", 90 | "Billions Withdrawn Before Madoff Arrest", 91 | "Vestas occupation continues; left-wing political parties voice support", 92 | "Television appeal for 1984 murder in Bath, England", 93 | "Myanmar Dissident Testifies at Trial", 94 | "Independent presidential candidates debate this weekend", 95 | "Blake Lively And Ryan Reynolds Continue To Be Actual Relationship Goals", 96 | "F.B.I. Lab Houses Growing Database of DNA Profiles", 97 | "18 Differences Between Snow Days In Canada And America", 98 | "The 27 Most Annoying Things Every Bartender Has To Endure", 99 | "U.S. 
Tells Chrysler to Prepare for Bankruptcy Filing", 100 | "Thieves steal £40 million from London jeweller", 101 | "Obama on Spot Over a Benefit to Gay Couples", 102 | ] 103 | 104 | 105 | def get_sample_data(): 106 | return DATA 107 | -------------------------------------------------------------------------------- /embedders/util.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Generator, List 2 | import numpy as np 3 | 4 | 5 | def batch(documents: List[Any], batch_size: int) -> Generator[List[Any], None, None]: 6 | length = len(documents) 7 | for idx in range(0, length, batch_size): 8 | yield documents[idx : min(idx + batch_size, length)] 9 | 10 | 11 | def num_batches(documents: List[Any], batch_size: int) -> int: 12 | return int(np.ceil(len(documents) / batch_size)) 13 | -------------------------------------------------------------------------------- /publish: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | rm -rf dist/* 3 | python3 setup.py bdist_wheel --universal 4 | twine upload dist/* -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scikit-learn 3 | sentence-transformers 4 | spacy>=3.0.0 5 | torch>=1.6.0 6 | tqdm 7 | transformers>=4.6.0,<5.0.0 8 | openai 9 | cohere 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | from setuptools import setup, find_packages 5 | 6 | this_directory = os.path.abspath(os.path.dirname(__file__)) 7 | with open(os.path.join(this_directory, "README.md")) as file: 8 | long_description = file.read() 9 | 10 | setup( 11 | name="embedders", 12 | version="0.1.8", 13 | author="Johannes Hötter", 14 | author_email="johannes.hoetter@kern.ai", 15 | description="High-level API for creating sentence and token embeddings", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | url="https://github.com/code-kern-ai/embedders", 19 | keywords=["kern", "machine learning", "representation learning", "python"], 20 | classifiers=[ 21 | "Development Status :: 3 - Alpha", 22 | "Programming Language :: Python :: 3", 23 | "License :: OSI Approved :: Apache Software License", 24 | ], 25 | package_dir={"": "."}, 26 | packages=find_packages("."), 27 | install_requires=[ 28 | "numpy", 29 | "scikit-learn", 30 | "sentence-transformers", 31 | "spacy>=3.0.0", 32 | "torch>=1.6.0", 33 | "tqdm", 34 | "transformers>=4.6.0,<5.0.0", 35 | "openai", 36 | "cohere", 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /tutorials/Finding similar sentences within a text corpus.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5252aaca", 6 | "metadata": {}, 7 | "source": [ 8 | "# Finding similar sentences within a text corpus\n", 9 | "\n", 10 | "One great use case for embedding data is to use the resulting vector representations for similarity search (often referred to as \"neural search\").
In this very short example, we'll show you how to build a super simple sentence comparison by computing the cosine similarity of two embeddings.\n", 11 | "\n", 12 | "![A huge pile of embedded data points in vector space](https://miro.medium.com/max/2028/1*1LHBbqmPI0X4I3rio5ujWQ.png)\n", 13 | "\n", 14 | "This notebook is only meant as a tutorial; be aware that there are many fascinating neural search engines, such as [qdrant](https://qdrant.tech).\n", 15 | "\n", 16 | "---\n", 17 | "\n", 18 | "To get started, we just load two things: a function to load some sample data from the library, and a pre-trained sentence embedder based on the transformer architecture." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "5105bfa0", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from embedders.samples.clickbait import get_sample_data\n", 29 | "from embedders.classification.contextual import TransformerSentenceEmbedder" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "17cb52d5", 35 | "metadata": {}, 36 | "source": [ 37 | "The `clickbait` dataset is straightforward and simply consists of short headlines. Here's an example." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "3d2559b6", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "texts = get_sample_data()\n", 48 | "print(texts[0])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "7551608e", 54 | "metadata": {}, 55 | "source": [ 56 | "Next, we just load the embedder via a Hugging Face model string. We make use of `distilbert-base-uncased`. You can input any other model from the Hub." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "f8c5c7f1", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "embedder = TransformerSentenceEmbedder(\"distilbert-base-uncased\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "35567368", 72 | "metadata": {}, 73 | "source": [ 74 | "And now the magic happens: we encode the data. This is as easy as with your favorite sklearn objects - just call `fit_transform`." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "543f6cc3", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "embeddings = embedder.fit_transform(texts)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "7822796f", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, to run a vanilla similarity search, we'll make use of the cosine similarity, which measures how similar two given vectors are." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "7ae1cc92", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import numpy as np\n", 103 | "\n", 104 | "def cosine_similarity(vector_1, vector_2):\n", 105 | " return np.dot(vector_1, vector_2)/(np.linalg.norm(vector_1)*np.linalg.norm(vector_2))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "d678a387", 111 | "metadata": {}, 112 | "source": [ 113 | "And finally, a simplistic nested loop to calculate pairwise similarities (skipping the comparison of a sentence with itself)."
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "84d28194", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from tqdm import tqdm\n", 124 | "\n", 125 | "highest_similarity = float(\"-inf\")\n", 126 | "vector_pair = None, None\n", 127 | "for vector_1_idx, vector_1 in tqdm(enumerate(embeddings), total=len(embeddings)):\n", 128 | " for vector_2_idx, vector_2 in enumerate(embeddings):\n", 129 | " if vector_1_idx != vector_2_idx:\n", 130 | " similarity = cosine_similarity(vector_1, vector_2)\n", 131 | " if similarity > highest_similarity:\n", 132 | " highest_similarity = similarity\n", 133 | " vector_pair = vector_1_idx, vector_2_idx" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "7d5e41ce", 139 | "metadata": {}, 140 | "source": [ 141 | "We can now take a look at the most similar pair in our text corpus." 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "a8bf8107", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "print(texts[vector_pair[0]], texts[vector_pair[1]])" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "cd968e84", 157 | "metadata": {}, 158 | "source": [ 159 | "Wow - isn't that amazing?! Embedding data is one of the most sophisticated and intelligent ways to enrich your records with valuable semantic metadata. The use cases are virtually endless. And `embedders` helps you to quickly generate embeddings for your dataset! 😋\n", 160 | "\n", 161 | "---\n", 162 | "\n", 163 | "If you have further questions, don't hesitate to contact us. If there is anything you want to have added to the library, open an [issue](https://github.com/code-kern-ai/embedders/issues). And please, don't forget to give us a ⭐" 164 | ] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3 (ipykernel)", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.9.12" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 5 188 | } 189 | --------------------------------------------------------------------------------
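
A closing note on the tutorial above: the nested loop performs quadratically many Python-level comparisons. The same most-similar-pair search can be expressed as a single matrix operation. This is a rough sketch, assuming that embeddings and texts are the variables produced in the notebook; it is not part of the tutorial itself.

import numpy as np

matrix = np.asarray(embeddings, dtype=float)
normed = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)  # unit-length rows
similarities = normed @ normed.T                                 # full cosine similarity matrix
np.fill_diagonal(similarities, float("-inf"))                    # ignore self-similarity
vector_pair = np.unravel_index(np.argmax(similarities), similarities.shape)
print(texts[vector_pair[0]], texts[vector_pair[1]])
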