├── .gitignore ├── LICENSE ├── README.md ├── embedders ├── __init__.py ├── classification │ ├── __init__.py │ ├── contextual.py │ ├── count_based.py │ └── reduce.py ├── enums.py ├── extraction │ ├── __init__.py │ ├── contextual.py │ ├── count_based.py │ └── reduce.py ├── samples │ ├── __init__.py │ └── clickbait.py └── util.py ├── publish ├── requirements.txt ├── setup.py └── tutorials └── Finding similar sentences within a text corpus.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .DS_Store 3 | debugging.py 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright (c) 2022 kern.ai 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![embedders](https://uploads-ssl.webflow.com/61e47fafb12bd56b40022a49/626ee1c35a3abf0ca872486d_embedder-banner.png) 2 | [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/) 3 | [![pypi 0.1.8](https://img.shields.io/badge/pypi-0.1.8-red.svg)](https://pypi.org/project/embedders/0.1.8/) 4 | 5 | # ⚗️ embedders 6 | 7 | With `embedders`, you can easily convert your texts into sentence- or token-level embeddings within a few lines of code. Use cases for this include similarity search between texts, information extraction such as named entity recognition, or basic text classification. 8 | 9 | ## Prerequisites 10 | 11 | This library uses [spaCy](https://github.com/explosion/spaCy) for tokenization; to apply it, please download the [respective language model](https://spacy.io/models) first. 12 | 13 | ## Installation 14 | 15 | You can set up this library via either running `$ pip install embedders`, or via cloning this repository and running `$ pip install -r requirements.txt` in your repository. 16 | 17 | A sample installation would be: 18 | 19 | ``` 20 | $ conda create --name embedders python=3.9 21 | $ conda activate embedders 22 | $ pip install embedders 23 | $ python -m spacy download en_core_web_sm 24 | ``` 25 | 26 | ## Usage 27 | 28 | Once you installed the package, you can apply the embedders with a few lines of code. You can apply embedders on sentence- or token-level. 29 | 30 | ### Sentence embeddings 31 | 32 | `"Wow, what a cool tool!"` is embedded to 33 | 34 | ``` 35 | [ 36 | 2.453, 8.325, ..., 3.863 37 | ] 38 | ``` 39 | 40 | Currently, we provide the following sentence embeddings: 41 | | **Path** | **Name** | **Embeds documents using ...** | 42 | | ------------------------------------ | --------------------------- | ------------------------------------------------------------ | 43 | | embedders.classification.contextual | HuggingFaceSentenceEmbedder | large, pre-trained transformers from https://huggingface.co | 44 | | embedders.classification.contextual | OpenAISentenceEmbedder | large, pre-trained transformers from https://openai.com | 45 | | embedders.classification.contextual | CohereSentenceEmbedder | large, pre-trained transformers from https://cohere.com | 46 | | embedders.classification.count_based | BagOfCharsSentenceEmbedder | plain Bag of Chars approach | 47 | | embedders.classification.count_based | BagOfWordsSentenceEmbedder | plain Bag of Words approach | 48 | | embedders.classification.count_based | TfidfSentenceEmbedder | Term Frequency - Inverse Document Frequency (TFIDF) approach | 49 | 50 | ### Token embeddings 51 | 52 | `"Wow, what a cool tool!"` is embedded to 53 | 54 | ``` 55 | [ 56 | [8.453, 1.853, ...], 57 | [3.623, 2.023, ...], 58 | [1.906, 9.604, ...], 59 | [7.306, 2.325, ...], 60 | [6.630, 1.643, ...], 61 | [3.023, 4.974, ...] 
62 | ] 63 | ``` 64 | 65 | Currently, we provide the following token embeddings: 66 | 67 | | **Path** | **Name** | **Embeds documents using ...** | 68 | | -------------------------------- | ------------------------ | ----------------------------------------------------------- | 69 | | embedders.extraction.contextual | TransformerTokenEmbedder | large, pre-trained transformers from https://huggingface.co | 70 | | embedders.extraction.count_based | BagOfCharsTokenEmbedder | plain Bag of Characters approach | 71 | 72 | You can choose the embedding category depending on the task at hand. To use them, just pick one of the available embedders and apply it to your text corpus as follows (shown for sentence embeddings, but the same works for token embeddings): 73 | 74 | ```python 75 | from embedders.classification.contextual import TransformerSentenceEmbedder 76 | from embedders.classification.reduce import PCASentenceReducer 77 | 78 | corpus = [ 79 | "I went to Cologne in 2009", 80 | "My favorite number is 41", 81 | # ... 82 | ] 83 | 84 | embedder = TransformerSentenceEmbedder("bert-base-cased") 85 | embeddings = embedder.fit_transform(corpus) # contains a list of shape [num_texts, embedding_dimension] 86 | ``` 87 | 88 | Sometimes, you may want to reduce the size of the embeddings you receive. To do so, you can simply wrap your embedder with a dimensionality reduction technique. 89 | 90 | ```python 91 | # if the dimension is too large, you can also apply dimensionality reduction 92 | reducer = PCASentenceReducer(embedder) 93 | embeddings_reduced = reducer.fit_transform(corpus) 94 | ``` 95 | 96 | Currently, we provide the following dimensionality reductions: 97 | | **Path** | **Name** | **Description** | 98 | | ------------------------------- | ------------------- | --------------------------------------------------------------------------------- | 99 | | embedders.classification.reduce | PCASentenceReducer | Wraps the embedder into a principal component analysis to reduce the dimensionality | 100 | | embedders.extraction.reduce | PCATokenReducer | Wraps the embedder into a principal component analysis to reduce the dimensionality | 101 | 102 | ## Pre-trained embedders 103 | 104 | With the growing availability of large, pre-trained models such as those provided by [🤗 Hugging Face](https://huggingface.co/), embedding complex sentences in a wide variety of languages and domains becomes much more accessible. If you want to make use of transformer models, you can just use the configuration string of the respective model, which will automatically pull the correct model from the [🤗 Hugging Face Hub](https://huggingface.co/models). 105 | 106 | ## Contributing 107 | 108 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**. 109 | 110 | If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement". 111 | Don't forget to give the project a star! Thanks again! 112 | 113 | 1. Fork the Project 114 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) 115 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) 116 | 4. Push to the Branch (`git push origin feature/AmazingFeature`) 117 | 5. Open a Pull Request 118 | 119 | And please don't forget to leave a ⭐ if you like the work! 120 | 121 | ## License 122 | 123 | Distributed under the Apache 2.0 License.
See LICENSE.txt for more information. 124 | 125 | ## Contact 126 | 127 | This library is developed and maintained by [kern.ai](https://github.com/code-kern-ai). If you want to provide us with feedback or have some questions, don't hesitate to contact us. We're super happy to help ✌️ 128 | -------------------------------------------------------------------------------- /embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from typing import Dict, List, Generator, Optional, Union 3 | from spacy.tokens.doc import Doc 4 | from sklearn.decomposition import PCA 5 | from tqdm import tqdm 6 | from embedders import util 7 | from joblib import dump, load 8 | 9 | 10 | class Transformer(metaclass=ABCMeta): 11 | def __init__(self): 12 | self._warnings = {} 13 | 14 | @abstractmethod 15 | def fit_transform( 16 | self, documents: List[Union[str, Doc]], as_generator: bool 17 | ) -> Union[List, Generator]: 18 | """Trains the given algorithm to embed textual documents into semantic vector-spacy representations. 19 | 20 | Args: 21 | documents (List[Union[str, Doc]]): List of plain strings or spaCy documents. 22 | as_generator (bool): Embeddings are calculated batch-wise. If this is set to False, the results will be summarized in one list, else a generator will yield the values. 23 | 24 | Returns: 25 | Union[List, Generator]: List with all embeddings or generator that yields the embeddings. 26 | """ 27 | pass 28 | 29 | @abstractmethod 30 | def transform( 31 | self, documents: List[Union[str, Doc]], as_generator: bool 32 | ) -> Union[List, Generator]: 33 | """Uses the trained algorithm to embed textual documents into semantic vector-spacy representations. 34 | 35 | Args: 36 | documents (List[Union[str, Doc]]): List of plain strings or spaCy documents. 37 | as_generator (bool): Embeddings are calculated batch-wise. If this is set to False, the results will be summarized in one list, else a generator will yield the values. 38 | 39 | Returns: 40 | Union[List, Generator]: List with all embeddings or generator that yields the embeddings. 41 | """ 42 | pass 43 | 44 | @abstractmethod 45 | def get_warnings(self) -> Dict: 46 | """Collects all warnings reported during the embedding creation or PCA. 
47 | 48 | Returns: 49 | List: List with all warnings 50 | """ 51 | pass 52 | 53 | 54 | class Embedder(Transformer, metaclass=ABCMeta): 55 | def __init__(self): 56 | super().__init__() 57 | 58 | @abstractmethod 59 | def _encode(self, documents: List[Union[str, Doc]], fit_model: bool) -> Generator: 60 | pass 61 | 62 | def _encode_batch( 63 | self, 64 | documents: List[Union[str, Doc]], 65 | as_generator: bool, 66 | fit_model: bool, 67 | show_progress: Optional[bool] = True, 68 | ) -> Union[List, Generator]: 69 | if as_generator: 70 | return self._encode(documents, fit_model) 71 | else: 72 | embeddings = [] 73 | if show_progress: 74 | num_batches = util.num_batches(documents, self.batch_size) 75 | print("Initializing model, might take some time...") 76 | for embedding_batch in tqdm( 77 | self._encode(documents, fit_model), 78 | total=num_batches, 79 | desc="Encoding batches ...", 80 | ): 81 | embeddings.extend(embedding_batch) 82 | else: 83 | for embedding_batch in self._encode(documents, fit_model): 84 | embeddings.extend(embedding_batch) 85 | return embeddings 86 | 87 | def fit_transform( 88 | self, documents: List[Union[str, Doc]], as_generator: bool = False 89 | ) -> Union[List, Generator]: 90 | return self._encode_batch(documents, as_generator, True) 91 | 92 | def transform( 93 | self, documents: List[Union[str, Doc]], as_generator: bool = False 94 | ) -> Union[List, Generator]: 95 | return self._encode_batch(documents, as_generator, False) 96 | 97 | def get_warnings(self) -> Dict: 98 | return self._warnings 99 | 100 | 101 | class PCAReducer(Transformer, metaclass=ABCMeta): 102 | """Wraps embedder into a principial component analysis to reduce the dimensionality. 103 | 104 | Args: 105 | embedder (Embedder): Algorithm to embed the documents. 106 | n_components (int, optional): Number of principal components to keep. Defaults to 8. 107 | autocorrect_n_components (bool, optional): If there are less data samples than specified components, this will automatically reduce the number of principial components. Defaults to True. 108 | """ 109 | 110 | def __init__( 111 | self, 112 | embedder: Embedder, 113 | n_components: int = 8, 114 | autocorrect_n_components: bool = True, 115 | **kwargs 116 | ): 117 | super().__init__() 118 | self.embedder = embedder 119 | self.reducer = PCA(n_components=n_components, **kwargs) 120 | self.batch_size = self.embedder.batch_size 121 | self.autocorrect_n_components = autocorrect_n_components 122 | 123 | def store_pca_weights(self, file_name: str): 124 | """Stores the PCA weights to a file. 125 | 126 | Args: 127 | file_name (str): Path to the file without any file endings. 128 | """ 129 | dump(self.reducer, f'{file_name}.joblib') 130 | 131 | def load_pca_weights(self, file_name: str): 132 | """Loads the PCA weights from a file. 133 | 134 | Args: 135 | file_name (str): Path to the file without any file endings. 
136 | """ 137 | self.reducer = load(f'{file_name}.joblib') 138 | 139 | @abstractmethod 140 | def _reduce( 141 | self, 142 | documents: List[Union[str, Doc]], 143 | fit_model: bool, 144 | fit_after_n_batches: int, 145 | ): 146 | pass 147 | 148 | def _reduce_batch( 149 | self, 150 | documents: List[Union[str, Doc]], 151 | as_generator: bool, 152 | fit_model: bool, 153 | fit_after_n_batches: int, 154 | ) -> Union[List, Generator]: 155 | if as_generator: 156 | return self._reduce(documents, fit_model, fit_after_n_batches) 157 | else: 158 | embeddings = [] 159 | for embedding_batch in self._reduce( 160 | documents, fit_model, fit_after_n_batches 161 | ): 162 | embeddings.extend(embedding_batch) 163 | return embeddings 164 | 165 | def fit_transform( 166 | self, 167 | documents: List[Union[str, Doc]], 168 | as_generator: bool = False, 169 | fit_after_n_batches: int = 5, 170 | ) -> Union[List, Generator]: 171 | """Trains the given algorithm to embed textual documents into semantic vector-spacy representations. 172 | 173 | Args: 174 | documents (List[Union[str, Doc]]): List of plain strings or spaCy documents. 175 | as_generator (bool, optional): Embeddings are calculated batch-wise. If this is set to False, the results will be summarized in one list, else a generator will yield the values.. Defaults to False. 176 | fit_after_n_batches (int, optional): Maximal batch iteration, after which the PCA is fitted. Defaults to 5. 177 | 178 | Returns: 179 | Union[List, Generator]: List with all embeddings or generator that yields the embeddings. 180 | """ 181 | 182 | return self._reduce_batch( 183 | documents, 184 | as_generator, 185 | True, 186 | fit_after_n_batches, 187 | ) 188 | 189 | def transform(self, documents, as_generator=False) -> Union[List, Generator]: 190 | return self._reduce_batch(documents, as_generator, False, 0) 191 | 192 | def get_warnings(self) -> Dict: 193 | return {**self._warnings, **self.embedder.get_warnings()} 194 | -------------------------------------------------------------------------------- /embedders/classification/__init__.py: -------------------------------------------------------------------------------- 1 | from embedders import Embedder 2 | 3 | 4 | class SentenceEmbedder(Embedder): 5 | def __init__(self, batch_size: int = 128): 6 | super().__init__() 7 | self.batch_size = batch_size 8 | -------------------------------------------------------------------------------- /embedders/classification/contextual.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union, Generator 2 | from sentence_transformers import SentenceTransformer 3 | from embedders import util 4 | from embedders.classification import SentenceEmbedder 5 | from spacy.tokens.doc import Doc 6 | import torch 7 | import openai 8 | from openai import error as openai_error 9 | import cohere 10 | import time 11 | 12 | 13 | class TransformerSentenceEmbedder(SentenceEmbedder): 14 | """Embeds documents using large, pre-trained transformers from https://huggingface.co 15 | 16 | Args: 17 | config_string (str): Name of the model listed on https://huggingface.co/models 18 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 
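
    Example (illustrative sketch; any model name listed on https://huggingface.co/models can be used):
        >>> from embedders.classification.contextual import TransformerSentenceEmbedder
        >>> embedder = TransformerSentenceEmbedder("bert-base-cased")
        >>> embeddings = embedder.fit_transform(["This is a test", "This is another test"])
        >>> # one embedding vector (list of floats) per input document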
19 | """ 20 | 21 | def __init__(self, config_string: str, batch_size: int = 128): 22 | super().__init__(batch_size) 23 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 24 | self.model = SentenceTransformer(config_string).to(self.device) 25 | 26 | def _encode( 27 | self, documents: List[Union[str, Doc]], fit_model: bool 28 | ) -> Generator[List[List[float]], None, None]: 29 | for documents_batch in util.batch(documents, self.batch_size): 30 | yield self.model.encode(documents_batch, show_progress_bar=False).tolist() 31 | 32 | 33 | class HuggingFaceSentenceEmbedder(TransformerSentenceEmbedder): 34 | def __init__(self, config_string: str, batch_size: int = 128): 35 | super().__init__(config_string, batch_size) 36 | 37 | 38 | class OpenAISentenceEmbedder(SentenceEmbedder): 39 | def __init__( 40 | self, 41 | openai_api_key: str, 42 | model_name: str, 43 | batch_size: int = 128, 44 | api_base: Optional[str] = None, 45 | api_type: Optional[str] = None, 46 | api_version: Optional[str] = None, 47 | ): 48 | """ 49 | Embeds documents using large language models from https://openai.com or https://azure.microsoft.com 50 | 51 | Args: 52 | openai_api_key (str): API key from OpenAI or Azure 53 | model_name (str): Name of the embedding model from OpenAI (e.g. text-embedding-ada-002) or the name of your Azure endpoint 54 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 55 | api_base (str, optional): If you use Azure, you need to provide the base URL of your Azure endpoint (e.g. 'https://azureopenkernai.openai.azure.com/'). Defaults to None. 56 | api_type (str, optional): If you use Azure, you need to provide the type of your Azure endpoint (e.g. 'azure'). Defaults to None. 57 | api_version (str, optional): If you use Azure, you need to provide the version of your Azure endpoint (e.g. '2023-05-15'). Defaults to None. 58 | 59 | Raises: 60 | Exception: If you use Azure, you need to provide api_type, api_version and api_base. 61 | 62 | Examples: 63 | >>> from embedders.classification.contextual import OpenAISentenceEmbedder 64 | >>> embedder_openai = OpenAISentenceEmbedder( 65 | ... "my-key-from-openai", 66 | ... "text-embedding-ada-002", 67 | ... ) 68 | >>> embeddings = embedder_openai.transform(["This is a test", "This is another test"]) 69 | >>> print(embeddings) 70 | [[-0.0001, 0.0002, ...], [-0.0001, 0.0002, ...]] 71 | 72 | >>> from embedders.classification.contextual import OpenAISentenceEmbedder 73 | >>> embedder_azure = OpenAISentenceEmbedder( 74 | ... "my-key-from-azure", 75 | ... "my-endpoint-name", 76 | ... api_base="https://azureopenkernai.openai.azure.com/", 77 | ... api_type="azure", 78 | ... api_version="2023-05-15", 79 | ... 
) 80 | >>> embeddings = embedder_azure.transform(["This is a test", "This is another test"]) 81 | >>> print(embeddings) 82 | [[-0.0001, 0.0002, ...], [-0.0001, 0.0002, ...]] 83 | 84 | """ 85 | super().__init__(batch_size) 86 | self.model_name = model_name 87 | self.openai_api_key = openai_api_key 88 | openai.api_key = self.openai_api_key 89 | self.api_base = api_base 90 | self.api_type = api_type 91 | self.api_version = api_version 92 | 93 | self.use_azure = any( 94 | [ 95 | api_base is not None, 96 | api_type is not None, 97 | api_version is not None, 98 | ] 99 | ) 100 | if self.use_azure: 101 | assert ( 102 | api_type is not None 103 | and api_version is not None 104 | and api_base is not None 105 | ), "If you want to use Azure, you need to provide api_type, api_version and api_base." 106 | 107 | openai.api_base = api_base 108 | openai.api_type = api_type 109 | openai.api_version = api_version 110 | 111 | def __getstate__(self): 112 | state = self.__dict__.copy() 113 | return state 114 | 115 | def __setstate__(self, state): 116 | self.__dict__.update(state) 117 | self.model_name = state["model_name"] 118 | self.openai_api_key = state["openai_api_key"] 119 | openai.api_key = self.openai_api_key 120 | self.use_azure = state.get("use_azure") 121 | if self.use_azure: 122 | self.api_base = state["api_base"] 123 | self.api_type = state["api_type"] 124 | self.api_version = state["api_version"] 125 | openai.api_base = self.api_base 126 | openai.api_type = self.api_type 127 | openai.api_version = self.api_version 128 | 129 | def _encode( 130 | self, documents: List[Union[str, Doc]], fit_model: bool 131 | ) -> Generator[List[List[float]], None, None]: 132 | for documents_batch in util.batch(documents, self.batch_size): 133 | documents_batch = [doc.replace("\n", " ") for doc in documents_batch] 134 | try: 135 | if self.use_azure: 136 | embeddings = [] 137 | for azure_batch in util.batch(documents_batch, 16): 138 | # azure only allows up to 16 documents per request 139 | count = 0 140 | while True and count < 60: 141 | try: 142 | count += 1 143 | response = openai.Embedding.create( 144 | input=azure_batch, engine=self.model_name 145 | ) 146 | break 147 | except openai.error.RateLimitError as e: 148 | if count >= 60: 149 | raise e 150 | if count == 1: 151 | print( 152 | "Rate limit exceeded. Waiting 10 seconds...", 153 | flush=True, 154 | ) 155 | time.sleep(10.05) 156 | else: 157 | time.sleep(1) 158 | embeddings += [entry["embedding"] for entry in response["data"]] 159 | else: 160 | response = openai.Embedding.create( 161 | input=documents_batch, engine=self.model_name 162 | ) 163 | embeddings = [entry["embedding"] for entry in response["data"]] 164 | yield embeddings 165 | except openai_error.AuthenticationError: 166 | raise Exception( 167 | "OpenAI API key is invalid. Please provide a valid API key in the constructor of OpenAISentenceEmbedder." 
168 | ) 169 | 170 | 171 | class CohereSentenceEmbedder(SentenceEmbedder): 172 | def __init__(self, cohere_api_key: str, batch_size: int = 128): 173 | super().__init__(batch_size) 174 | self.cohere_api_key = cohere_api_key 175 | self.model = cohere.Client(self.cohere_api_key) 176 | 177 | def __getstate__(self): 178 | state = self.__dict__.copy() 179 | # Don't pickle 'model' 180 | del state["model"] 181 | return state 182 | 183 | def __setstate__(self, state): 184 | self.__dict__.update(state) 185 | # Restore 'model' after unpickling 186 | self.model = cohere.Client(self.cohere_api_key) 187 | 188 | def _encode( 189 | self, documents: List[Union[str, Doc]], fit_model: bool 190 | ) -> Generator[List[List[float]], None, None]: 191 | for documents_batch in util.batch(documents, self.batch_size): 192 | embeddings = self.model.embed(documents_batch).embeddings 193 | yield embeddings 194 | -------------------------------------------------------------------------------- /embedders/classification/count_based.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union, Generator 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | from embedders.classification import SentenceEmbedder 5 | from embedders import util 6 | 7 | 8 | class CountSentenceEmbedder(SentenceEmbedder): 9 | def __init__(self, batch_size: int, min_df: float, **kwargs): 10 | super().__init__(batch_size) 11 | 12 | def _encode( 13 | self, documents: List[str], fit_model: bool 14 | ) -> Generator[List[List[Union[float, int]]], None, None]: 15 | if fit_model: 16 | self.model.fit(documents) 17 | 18 | for documents_batch in util.batch(documents, self.batch_size): 19 | documents_batch_embedded = [] 20 | for doc in documents_batch: 21 | documents_batch_embedded.append( 22 | self.model.transform([doc]).toarray().tolist()[0] 23 | ) 24 | yield documents_batch_embedded 25 | 26 | 27 | class BagOfCharsSentenceEmbedder(CountSentenceEmbedder): 28 | """Embeds documents using plain Bag of Characters approach. 29 | 30 | Args: 31 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 32 | min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. 33 | """ 34 | 35 | def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): 36 | super().__init__(batch_size, min_df) 37 | self.model = CountVectorizer(analyzer="char", min_df=min_df, **kwargs) 38 | 39 | 40 | class BagOfWordsSentenceEmbedder(CountSentenceEmbedder): 41 | """Embeds documents using plain Bag of Words approach. 42 | 43 | Args: 44 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 45 | min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. 
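
    Example (illustrative sketch):
        >>> from embedders.classification.count_based import BagOfWordsSentenceEmbedder
        >>> corpus = ["I went to Cologne in 2009", "My favorite number is 41"]
        >>> embedder = BagOfWordsSentenceEmbedder(min_df=0.1)
        >>> embeddings = embedder.fit_transform(corpus)  # one count vector per document
        >>> # batch-wise processing is also possible:
        >>> for batch in embedder.fit_transform(corpus, as_generator=True):
        ...     pass  # each batch holds up to `batch_size` embedding vectors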
46 | """ 47 | 48 | def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): 49 | super().__init__(batch_size, min_df) 50 | self.model = CountVectorizer(min_df=min_df, **kwargs) 51 | 52 | 53 | class TfidfSentenceEmbedder(CountSentenceEmbedder): 54 | """Embeds documents using Term Frequency - Inverse Document Frequency (TFIDF) approach. 55 | 56 | Args: 57 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 58 | min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. 59 | """ 60 | 61 | def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): 62 | super().__init__(batch_size, min_df) 63 | self.model = TfidfVectorizer(min_df=min_df, **kwargs) 64 | -------------------------------------------------------------------------------- /embedders/classification/reduce.py: -------------------------------------------------------------------------------- 1 | from spacy.tokens.doc import Doc 2 | from typing import Union, List, Generator 3 | import numpy as np 4 | from embedders import PCAReducer, util 5 | 6 | 7 | class PCASentenceReducer(PCAReducer): 8 | def _transform( 9 | self, embeddings: List[List[Union[int, float]]] 10 | ) -> List[List[Union[float, int]]]: 11 | return self.reducer.transform(embeddings).tolist() 12 | 13 | def _reduce( 14 | self, 15 | documents: List[Union[str, Doc]], 16 | fit_model: bool, 17 | fit_after_n_batches: int, 18 | ) -> Generator[List[List[Union[float, int]]], None, None]: 19 | if fit_model: 20 | embeddings_training = [] 21 | num_batches = util.num_batches(documents, self.embedder.batch_size) 22 | fit_after_n_batches = min(num_batches, fit_after_n_batches) - 1 23 | for batch_idx, batch in enumerate( 24 | self.embedder.fit_transform(documents, as_generator=True) 25 | ): 26 | if batch_idx <= fit_after_n_batches: 27 | embeddings_training.append(batch) 28 | 29 | if batch_idx == fit_after_n_batches: 30 | embeddings_training_flattened = [] 31 | for batch_training in embeddings_training: 32 | embeddings_training_flattened.extend(batch_training) 33 | embeddings_training_flattened = np.array( 34 | embeddings_training_flattened 35 | ) 36 | if ( 37 | embeddings_training_flattened.shape[1] 38 | < self.reducer.n_components 39 | and self.autocorrect_n_components 40 | ): 41 | self.reducer.n_components = embeddings_training_flattened.shape[ 42 | 1 43 | ] 44 | self.reducer.fit(embeddings_training_flattened) 45 | 46 | for batch_training in embeddings_training: 47 | yield self._transform(batch_training) 48 | if batch_idx > fit_after_n_batches: 49 | yield self._transform(batch) 50 | else: 51 | embeddings = self.embedder.transform(documents) 52 | yield self._transform(embeddings) 53 | -------------------------------------------------------------------------------- /embedders/enums.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class WarningType(Enum): 5 | DOCUMENT_IS_SPLITTED = "DOCUMENT_IS_SPLITTED" 6 | TOKEN_MISMATCHING = "TOKEN_MISMATCHING" 7 | -------------------------------------------------------------------------------- /embedders/extraction/__init__.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from 
embedders import Embedder 3 | from spacy.tokens.doc import Doc 4 | from typing import Union 5 | 6 | 7 | class TokenEmbedder(Embedder): 8 | def __init__( 9 | self, language_code: str, precomputed_docs: bool = False, batch_size: int = 128 10 | ): 11 | super().__init__() 12 | self.preloaded = precomputed_docs 13 | if precomputed_docs: 14 | self.nlp = spacy.blank(language_code) 15 | else: 16 | self.nlp = spacy.load(language_code) 17 | self.batch_size = batch_size 18 | 19 | def _get_tokenized_document(self, document: Union[str, Doc]): 20 | if self.preloaded: 21 | return document 22 | else: 23 | return self.nlp(document) 24 | -------------------------------------------------------------------------------- /embedders/extraction/contextual.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Union, Iterator 2 | import torch 3 | import math 4 | import numpy as np 5 | import re 6 | from transformers import AutoTokenizer, AutoModel 7 | from collections import defaultdict 8 | from embedders import util 9 | from spacy.tokens.doc import Doc 10 | 11 | 12 | from embedders.enums import WarningType 13 | from embedders.extraction import TokenEmbedder 14 | 15 | 16 | class TransformerTokenEmbedder(TokenEmbedder): 17 | """Embeds documents using large, pre-trained transformers from https://huggingface.co 18 | 19 | Args: 20 | config_string (str): Name of the model listed on https://huggingface.co/models 21 | language_code (str): Name of the spaCy language model 22 | precomputed_docs (bool, optional): If you have a large text corpus, it might make sense to precompute the data and input tokenized spaCy documents. Defaults to False. 23 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 24 | """ 25 | 26 | _NL_TOKEN = "[NL]" 27 | 28 | def __init__( 29 | self, 30 | config_string: str, 31 | language_code: str, 32 | precomputed_docs: bool = False, 33 | batch_size: int = 128, 34 | ): 35 | super().__init__(language_code, precomputed_docs, batch_size) 36 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 37 | 38 | self.transformer_tokenizer = AutoTokenizer.from_pretrained(config_string) 39 | self.transformer_tokenizer.add_special_tokens( 40 | {"additional_special_tokens": [self._NL_TOKEN]} 41 | ) 42 | 43 | self.model = AutoModel.from_pretrained( 44 | config_string, output_hidden_states=True 45 | ).to(self.device) 46 | self.model.resize_token_embeddings(len(self.transformer_tokenizer)) 47 | 48 | def _encode( 49 | self, documents: Union[List[str], List[Doc]], fit_model: bool 50 | ) -> Iterator[List[List[List[float]]]]: 51 | for batch_number, documents_batch in enumerate( 52 | util.batch(documents, self.batch_size) 53 | ): 54 | """ 55 | Computation of embeddings for each spacy token of each document. For the 56 | embedding creation transformer models are used. Embeddings are calculated 57 | for the transformer tokens of the document, then these token embeddings are 58 | matched to the spacy tokens. 59 | 60 | Args: 61 | documents: list of strings or spacy documents 62 | fit_model: not used, required by base class 63 | Return: 64 | Token embeddings for each document 65 | """ 66 | 67 | documents_batch_embedded = [] 68 | for document_number, document in enumerate(documents_batch): 69 | doc = self._get_tokenized_document(document) 70 | 71 | # no spacy token. 72 | # set special token as text, so that an embedding 73 | # is created that can be processed by the PCA. 
74 | if len(doc) == 0: 75 | doc = self.nlp(self._NL_TOKEN) 76 | 77 | # spacy creates tokens which only contain whitespace characters. 78 | # the transformer tokenizer ignores these tokens. 79 | # in order to avoid problems while matching the tokens the text is 80 | # preprocessed. 81 | text, prep_offsets = self._preprocess_doc_text(doc) 82 | 83 | # transformer models have a maximum number of tokens which can be 84 | # processed at the same time. 85 | # in this case the text is splitted in mutliple subparts 86 | number_est_tokens = self._estimate_token_number(text) 87 | idx_document = batch_number * self.batch_size + document_number 88 | if self.model.config.max_position_embeddings < number_est_tokens: 89 | if WarningType.DOCUMENT_IS_SPLITTED.value in self._warnings: 90 | self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value].append( 91 | idx_document 92 | ) 93 | else: 94 | self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value] = [ 95 | idx_document 96 | ] 97 | 98 | transformer_embs = [] 99 | for doc_part, index_offset in self._split_document( 100 | text, number_est_tokens 101 | ): 102 | transformer_embs.extend( 103 | self._get_transformer_embeddings( 104 | doc_part, idx_document, index_offset 105 | ) 106 | ) 107 | else: 108 | transformer_embs = self._get_transformer_embeddings( 109 | text, idx_document 110 | ) 111 | 112 | document_embedded = self._match_transformer_embeddings_to_spacy_tokens( 113 | transformer_embs, doc, prep_offsets 114 | ) 115 | 116 | if len(document_embedded) != len(doc): 117 | idx_document = batch_number * self.batch_size + document_number 118 | if WarningType.TOKEN_MISMATCHING.value in self._warnings: 119 | self._warnings[WarningType.TOKEN_MISMATCHING.value].append( 120 | idx_document 121 | ) 122 | else: 123 | self._warnings[WarningType.TOKEN_MISMATCHING.value] = [ 124 | idx_document 125 | ] 126 | 127 | documents_batch_embedded.append(document_embedded) 128 | yield documents_batch_embedded 129 | 130 | def _preprocess_doc_text(self, doc: Doc) -> Tuple[str, np.ndarray]: 131 | """Replaces the text of tokens which only consist of whitespace characters 132 | with the special token [NL] (new line). 133 | The special token and the whitespace string can consist of different number of 134 | chars. To match the tokens later these differences are saved as offsets. 135 | 136 | Args: 137 | doc: spacy document 138 | Returns: 139 | Preprocessed text in which whitespace tokens are 140 | replaced by a special token, an array containing the indices of replaced 141 | strings and the resulting offset 142 | """ 143 | 144 | prep_text = "" 145 | idx_already_preprocessed = 0 146 | # pairs of the line number of the preprocessed text and the offset relative to 147 | # the original document, here, offset is the difference btw the preprocessed and 148 | # the original text. 
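        # For example (hypothetical values): if the first whitespace-only token is
        # 6 chars long and starts at original index 10, it is replaced by the 4-char
        # "[NL]", so (14, 2) is appended; from preprocessed index 14 onwards, an
        # offset of 2 must be added to map back to indices in the original text.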
149 | prep_offsets = [(0, 0)] 150 | 151 | for tkn in doc: 152 | if not re.sub(r"[\s]+", "", tkn.text): 153 | # indices of current token which will be replaced by the special token 154 | idx_start, idx_end = tkn.idx, tkn.idx + len(tkn) 155 | # append already processed text and the special token 156 | prep_text += doc.text[idx_already_preprocessed:idx_start] 157 | prep_text += self._NL_TOKEN 158 | 159 | additional_offset = len(tkn) - len(self._NL_TOKEN) 160 | prep_offsets.append( 161 | ( 162 | len(prep_text), # index to apply offset 163 | additional_offset, # offset to be applied 164 | ) 165 | ) 166 | idx_already_preprocessed = idx_end 167 | 168 | # add remaining text 169 | prep_text += doc.text[idx_already_preprocessed:] 170 | 171 | return prep_text, np.array(prep_offsets) 172 | 173 | def _match_transformer_embeddings_to_spacy_tokens( 174 | self, 175 | transformer_embeddings: List[List[Tuple[int, int, List[List[float]]]]], 176 | document_tokenized: Doc, 177 | prep_offsets: np.ndarray = None, 178 | ) -> List[List[float]]: 179 | """ 180 | Transformer and spacy tokens differ. Usual the transformer tokenizer splits 181 | splits the text into smaller subparts in comparison to the spacy tokenizer. 182 | To create embeddings for the spacy tokens the transformer embeddings must be 183 | matched. This is done by comparing the char spans of the tokens and matching the 184 | tokens which overlap. 185 | 186 | Args: 187 | transformer_embeddings: List of start and end indices for each transformer 188 | token and the corresponding embedding 189 | document_tokenized: spacy tokens 190 | prep_offsets: Indices and offsets to match the preprocessed text to the 191 | original document 192 | Returns: 193 | Embeddings for each spacy token in the tokenized document. 194 | """ 195 | 196 | embeddings = defaultdict(list) 197 | 198 | for index_start, index_end, transformer_emb in transformer_embeddings: 199 | 200 | if prep_offsets is not None: 201 | index_start = self._add_offset(index_start, prep_offsets) 202 | index_end = self._add_offset(index_end, prep_offsets) 203 | 204 | span = document_tokenized.char_span( 205 | index_start, index_end, alignment_mode="expand" 206 | ) 207 | if span is not None: 208 | # if a transformer token include multiple spacy tokens, the spacy 209 | # tokens get the same transformer embedding. 210 | for token in span: 211 | embeddings[token.i].extend(transformer_emb) 212 | for key, values in embeddings.items(): 213 | embeddings[key] = np.array(values).mean(0).tolist() 214 | return list(embeddings.values()) 215 | 216 | def _add_offset(self, idx: int, offsets: np.ndarray) -> int: 217 | """ 218 | Adds offset to index according to the offsets array. 219 | 220 | Args: 221 | idx: index to transform 222 | offsets: indices and the corresponding offsets 223 | Returns: 224 | Index customized according to the offset 225 | """ 226 | return idx + np.sum(offsets[np.where(offsets[:, 0] <= idx)][:, 1]) 227 | 228 | def _get_transformer_embeddings( 229 | self, 230 | document: str, 231 | idx_document: int, 232 | idx_offset: int = 0, 233 | ) -> List[List[Tuple[int, int, List[List[float]]]]]: 234 | """ 235 | Calculates embeddings for the given document using a transformer model. 236 | First, the corresponding transformer tokens are computed. The next steps 237 | computes the embeddings. With each embedding the indices of the according 238 | chars are returned. idx_offset is used to return the correct indices if the 239 | document has been split. 
240 | 241 | Args: 242 | document: plain document text 243 | idx_offset: offset if the document has been splitted 244 | Returns: 245 | Start and end index for each transformer token and the calculated 246 | embedding 247 | """ 248 | encoded = self.transformer_tokenizer(document, return_tensors="pt").to( 249 | self.device 250 | ) 251 | tokens = encoded.encodings[0] 252 | 253 | # fallback if the number of tokens is still too big 254 | if len(tokens) > self.model.config.max_position_embeddings: 255 | if WarningType.DOCUMENT_IS_SPLITTED.value in self._warnings: 256 | self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value].append( 257 | idx_document 258 | ) 259 | else: 260 | self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value] = [idx_document] 261 | 262 | token_embs = [] 263 | for doc_part, additional_idx_offset in self._split_document( 264 | document, len(tokens) 265 | ): 266 | token_embs.extend( 267 | self._get_transformer_embeddings( 268 | doc_part, idx_document, idx_offset + additional_idx_offset 269 | ) 270 | ) 271 | return token_embs 272 | 273 | with torch.no_grad(): 274 | output = self.model(**encoded) 275 | 276 | # Get all hidden states 277 | states = output.hidden_states 278 | # Stack and sum last four layers 279 | layers = [-4, -3, -2, -1] 280 | output = torch.stack([states[i] for i in layers]).sum(0).squeeze() 281 | 282 | token_embeddings = [] 283 | # 1 and -1 are [CLS] tokens, and other tokens can be ##subwords 284 | for word_idx in set(tokens.word_ids[1:-1]): 285 | index_begin, index_end = tokens.word_to_chars(word_idx) 286 | token_ids_word = np.where(np.array(encoded.word_ids()) == word_idx) 287 | # Only select the tokens that constitute the requested word 288 | word_tokens_output = output[token_ids_word] 289 | token_embeddings.append( 290 | [ 291 | index_begin + idx_offset, 292 | index_end + idx_offset, 293 | word_tokens_output.tolist(), 294 | ] 295 | ) 296 | return token_embeddings 297 | 298 | def _estimate_token_number(self, document: str) -> int: 299 | """ 300 | Estimates the number of tokens which are generated by the transformer model. 301 | It is based on the rule of thumb that per token 3 subtokens are created by 302 | the transformer tokenizer. Tokens are created by splitting at every 303 | special and whitespace character. 304 | Special Characters are handled seperately according to the assumption that each 305 | special character is treated as a token by the transformer tokenizer. 306 | 307 | Args: 308 | document: plain text document 309 | Returns: 310 | Estimation for the number of transformer tokens included in the document 311 | """ 312 | avg_subtokens_per_token = 3 313 | number_word_tokens = len(re.findall(r"\[NL\]|\w+", document)) 314 | number_special_characters = len(re.sub(r"[\w\s]+", "", document)) 315 | return avg_subtokens_per_token * number_word_tokens + number_special_characters 316 | 317 | def _split_document( 318 | self, document: str, estimated_tokens: int 319 | ) -> Iterator[Tuple[str, int]]: 320 | """ 321 | Splits the documens into subparts, according to the model's max length and the 322 | number of estimated tokens. 
323 | 324 | Args: 325 | document: plain text document 326 | estimated_tokens: estimation for the token number 327 | Returns: 328 | Yields subpart of the document, splitted depending on max model length and 329 | estimated number of tokens 330 | """ 331 | # the regular expression matches the special token [NL], any word consiting of 332 | # numbers and chars and single characters which are no whitespace or word 333 | # character 334 | token_spans = [ 335 | token.span() for token in re.finditer(r"\[NL\]|\w+|[^\w\s]+?", document) 336 | ] 337 | split_into = ( 338 | round(estimated_tokens / self.model.config.max_position_embeddings) + 1 339 | ) 340 | len_part = math.ceil(len(token_spans) / split_into) 341 | 342 | prev_split_idx = 0 343 | for i in range(split_into): 344 | current_split_idx = min( 345 | len(document), 346 | token_spans[min((i + 1) * len_part, len(token_spans) - 1)][1], 347 | ) 348 | yield document[prev_split_idx:current_split_idx], prev_split_idx 349 | prev_split_idx = current_split_idx 350 | -------------------------------------------------------------------------------- /embedders/extraction/count_based.py: -------------------------------------------------------------------------------- 1 | from typing import List, Generator, Union 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | from embedders import util 4 | from spacy.tokens.doc import Doc 5 | 6 | from embedders.extraction import TokenEmbedder 7 | 8 | 9 | class BagOfCharsTokenEmbedder(TokenEmbedder): 10 | """Embeds documents using plain Bag of Characters approach. 11 | 12 | Args: 13 | language_code (str): Name of the spaCy language model 14 | precomputed_docs (bool, optional): If you have a large text corpus, it might make sense to precompute the data and input tokenized spaCy documents. Defaults to False. 15 | batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 
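
    Example (illustrative sketch; assumes the spaCy model `en_core_web_sm` is installed):
        >>> from embedders.extraction.count_based import BagOfCharsTokenEmbedder
        >>> embedder = BagOfCharsTokenEmbedder("en_core_web_sm")
        >>> embeddings = embedder.fit_transform(["I went to Cologne in 2009"])
        >>> # one list per document, containing one character-count vector per spaCy token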
16 | """ 17 | 18 | def __init__( 19 | self, 20 | language_code: str, 21 | precomputed_docs: bool = False, 22 | batch_size: int = 128, 23 | **kwargs 24 | ): 25 | super().__init__(language_code, precomputed_docs, batch_size) 26 | self.model = CountVectorizer(analyzer="char", min_df=0.01, **kwargs) 27 | 28 | def _encode( 29 | self, documents: List[Union[str, Doc]], fit_model: bool 30 | ) -> Generator[List[List[List[int]]], None, None]: 31 | if fit_model: 32 | if self.preloaded: 33 | self.model.fit([doc.text for doc in documents]) 34 | else: 35 | self.model.fit(documents) 36 | 37 | for documents_batch in util.batch(documents, self.batch_size): 38 | documents_batch_embedded = [] 39 | for doc in documents_batch: 40 | documents_batch_embedded.append( 41 | self.model.transform( 42 | [tok.text for tok in self._get_tokenized_document(doc)] 43 | ) 44 | .toarray() 45 | .tolist() 46 | ) 47 | yield documents_batch_embedded 48 | -------------------------------------------------------------------------------- /embedders/extraction/reduce.py: -------------------------------------------------------------------------------- 1 | from typing import List, Generator, Union 2 | import numpy as np 3 | from embedders import PCAReducer, util 4 | 5 | 6 | class PCATokenReducer(PCAReducer): 7 | def __init__(self, embedder, **kwargs): 8 | super().__init__(embedder=embedder, **kwargs) 9 | self.nlp = embedder.nlp 10 | 11 | def _transform( 12 | self, embeddings: List[List[List[Union[int, float]]]] 13 | ) -> List[List[List[Union[float, int]]]]: 14 | batch_concatenated = np.concatenate(embeddings) 15 | start_idx = 0 16 | batch_unsqueezed = [] 17 | for length in [len(embedding) for embedding in embeddings]: 18 | end_idx = start_idx + length 19 | batch_reduced = self.reducer.transform( 20 | batch_concatenated[start_idx:end_idx] 21 | ) 22 | batch_unsqueezed.append(batch_reduced.tolist()) 23 | start_idx = end_idx 24 | return batch_unsqueezed 25 | 26 | def _reduce( 27 | self, documents, fit_model, fit_after_n_batches 28 | ) -> Generator[List[List[List[Union[float, int]]]], None, None]: 29 | if fit_model: 30 | embeddings_training = [] 31 | num_batches = util.num_batches(documents, self.embedder.batch_size) 32 | fit_after_n_batches = min(num_batches, fit_after_n_batches) - 1 33 | for batch_idx, batch in enumerate( 34 | self.embedder.fit_transform(documents, as_generator=True) 35 | ): 36 | if batch_idx <= fit_after_n_batches: 37 | embeddings_training.append(batch) 38 | 39 | if batch_idx == fit_after_n_batches: 40 | embeddings_training_flattened = [] 41 | for batch_training in embeddings_training: 42 | embeddings_training_flattened.extend( 43 | np.concatenate(batch_training).tolist() 44 | ) 45 | embeddings_training_flattened = np.array( 46 | embeddings_training_flattened 47 | ) 48 | if ( 49 | embeddings_training_flattened.shape[1] 50 | < self.reducer.n_components 51 | and self.autocorrect_n_components 52 | ): 53 | self.reducer.n_components = embeddings_training_flattened.shape[ 54 | 1 55 | ] 56 | self.reducer.fit(embeddings_training_flattened) 57 | 58 | for batch_training in embeddings_training: 59 | yield self._transform(batch_training) 60 | if batch_idx > fit_after_n_batches: 61 | yield self._transform(batch) 62 | else: 63 | embeddings = self.embedder.transform(documents) 64 | yield self._transform(embeddings) 65 | -------------------------------------------------------------------------------- /embedders/samples/__init__.py: -------------------------------------------------------------------------------- 
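
The PCATokenReducer in embedders/extraction/reduce.py above flattens the token embeddings of several documents into one matrix, fits the PCA on that matrix, and then re-splits the transformed rows per document. Below is a minimal sketch of that fit-then-re-split pattern, using assumed toy dimensions and random data rather than library code.

import numpy as np
from sklearn.decomposition import PCA

# token embeddings for two documents: 20 and 30 tokens with 768 dimensions each (toy data)
doc_token_embeddings = [np.random.rand(20, 768), np.random.rand(30, 768)]

reducer = PCA(n_components=8)
flat = np.concatenate(doc_token_embeddings)  # shape (50, 768)
reduced_flat = reducer.fit_transform(flat)   # shape (50, 8)

# re-split the reduced rows so every document keeps one vector per token
reduced, start = [], 0
for doc in doc_token_embeddings:
    reduced.append(reduced_flat[start : start + len(doc)].tolist())
    start += len(doc)
print([np.array(r).shape for r in reduced])  # [(20, 8), (30, 8)]
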
https://raw.githubusercontent.com/code-kern-ai/embedders/9bb17ba7f663be53b0db047b18b6dc017df7c757/embedders/samples/__init__.py -------------------------------------------------------------------------------- /embedders/samples/clickbait.py: -------------------------------------------------------------------------------- 1 | DATA = [ 2 | "UK guinea pig farm to close after owner's family grave robbed", 3 | "18 Sweet Pumpkin Treats You Won't Believe Are Healthy", 4 | 'A Guy Just Did The Most Epic "Cha Cha Slide" Dance Ever', 5 | "Premium gas discounted for a few hours", 6 | "Sanctions on US products introduced by Brazil", 7 | "IPhone sales exceed BlackBerry", 8 | "Administration Seeks to Regulate Derivatives", 9 | "21 Life-Changing Products That Can Actually Make Your Skin Better", 10 | "US raids Iran 'liaison office', Russia says it is unacceptable", 11 | "US House of Representatives rejects bail out bill in vote", 12 | "23 Ways To Give Your Heart To Your Valentine", 13 | "Signs You Grew Up In Southern California", 14 | "5 killed in return bus trip from marching band competition", 15 | "Here's Definitive Proof That Leonardo DiCaprio Is Immortal", 16 | "Signs of Possible Deal in Pakistan Turmoil", 17 | "14 killed in Russian bus-truck collision", 18 | "In Icy Kentucky, Thousands Are Still Without Power", 19 | "Beans Memes Is The Only Twitter Account That Actually Matters", 20 | "Couples Who Prove Opposites Attract", 21 | "Ball State Upsets Tennessee in First Round", 22 | "India and U.S.A. work toward nuclear fuel agreement", 23 | "16 Tweets That Sum Up The Lengths You Would Go To Avoid Other People", 24 | "Gas Is Up, but Drivers May Look the Other Way", 25 | "Which Taylor Swift Track Should Be Your Personal Theme Song", 26 | "Bruno Mars Might Headline Super Bowl 50", 27 | "US clinic plans first face transplant", 28 | "South Korea says North Korea will test more nuclear bombs", 29 | "National Hockey League news: February 28, 2008", 30 | "41 Victoria's Secret Models Show What They Look Like Without Makeup", 31 | "North Korean military fires artillery on populated South Korean island", 32 | "Pilot killed as Su-25 military jet explodes near Vladivostok", 33 | "Conservatives Map Strategies on Court Fight", 34 | "Aziz Ansari's Instagram Post About His Dad Will Make You Cry", 35 | "Who Is Your Dad Actually", 36 | "Climate Research That Might Not Help", 37 | "11 Steamy Lyrics That Will Make See You Selena Gomez In A New Light", 38 | "What Percent Vegan Are You", 39 | "13 Misogynistic Phrases That Need To Die", 40 | "491 Scoreless Minutes Come to an End", 41 | "Oil spewing from crack in seafloor of Gulf of Mexico was fifty feet from Deepwater Horizon well", 42 | "Saudis Delay Local Elections by 2 Years", 43 | "7 Excellent Deals You Can Get This Weekend", 44 | "U.N. Warns of Refugee Crisis in Gaza Strip", 45 | "Executives from IT industry focus on 10-year anniversary of Microsoft Research Asia", 46 | "Independent Member of Australian Parliament calls for better indigenous policy", 47 | "Ayesha Curry Has Sparked A Debate About How Women Dress", 48 | "FCC requires VoIP providers to have 911 service", 49 | "17 Images That Will Only Make Sense To People Obsessed With High Heels", 50 | "Australian rules football: 2010 Gippsland Football League round 1 - Wonthaggi v Leongatha", 51 | "Two Killed in Violence on Gaza Border", 52 | "Stimulus Tour Takes Obama to New Blue States", 53 | "12 Bizarre Christmas Traditions From Around The World", 54 | "9 Differences Between Hanging Out With Your New Friend Vs. 
Your Best Friend", 55 | "Priest Reportedly Suspended For Riding A Hoverboard During Mass", 56 | "15 Songs You Loved (But Forgot About) From 10 Years Ago", 57 | "People Are Using The Hashtag #BurritoSelfie And It Is As Glorious As You'd Imagine", 58 | "This Taco Recipe Will Sexually Awaken Your Taste Buds", 59 | "16 Times Chris Martin Was Really Just An Excited Puppy", 60 | "19 Gorgeous Finnish Baby Names That Will Make You Broody", 61 | "Mugabe spokeperson tells critics to 'go hang'", 62 | "Night Owls Become Early Risers", 63 | "I Tested Pinterest Mug Recipes To See If They Actually Taste Like Food", 64 | "Former U.S. President Clinton stumps for Obama, Franken in Minneapolis", 65 | "Rangers Honor Andy Bathgate and Harry Howell", 66 | "Which Lola From 'Kalyeserye' Are You Based On These Really Hard Questions", 67 | "West African cholera claims more than 500 lives, more deaths feared", 68 | "17 Of The Most Beautifully Illustrated Picture Books In 2015", 69 | "Rwandan genocide investigations to be completed by end of July", 70 | "Mandela discharged from hospital", 71 | "15 Insanely Adorable Pins You Never Knew You Needed", 72 | "Internet Companies and Ad Agencies Find Some Common Ground", 73 | "NBA star Gilbert Arenas pleads guilty to gun possession, could face six months in prison", 74 | "Clothing Makers Exceed Quarterly Expectations", 75 | "22 Mesmerizing, Mundane Photos Of A Day In The Life Of Darth Vader", 76 | "19 Texts That Are Way Too Real For Anyone Who's A Little Bit Greedy", 77 | "What Percentage Do You Have Of Winning The Royal Rumble", 78 | "Students Stand When Called Upon, and When Not", 79 | "Justices Retain Oversight by U.S. on Voting", 80 | "Canadian film academy explains lack of Genie nomination for Juno", 81 | "National Academy of Sciences recommends manned Hubble repair", 82 | "18 Things You Didn't Know About Cold Callers", 83 | "When You Miss Your Friend", 84 | "Robin Cook dead after collapsing", 85 | "12 hurt in San Luis de La Balsa tourist bus accident", 86 | "Wreckage of plane thought to be missing Air France flight found in Atlantic", 87 | 'Which Pink Lady From "Grease" Should Be Your BFF', 88 | "Attack on mosque kills 30 in Rawalpindi, Pakistan", 89 | "Russian Uranium Sale to U.S. Is Planned", 90 | "Billions Withdrawn Before Madoff Arrest", 91 | "Vestas occupation continues; left-wing political parties voice support", 92 | "Television appeal for 1984 murder in Bath, England", 93 | "Myanmar Dissident Testifies at Trial", 94 | "Independent presidential candidates debate this weekend", 95 | "Blake Lively And Ryan Reynolds Continue To Be Actual Relationship Goals", 96 | "F.B.I. Lab Houses Growing Database of DNA Profiles", 97 | "18 Differences Between Snow Days In Canada And America", 98 | "The 27 Most Annoying Things Every Bartender Has To Endure", 99 | "U.S. 
Tells Chrysler to Prepare for Bankruptcy Filing", 100 | "Thieves steal £40 million from London jeweller", 101 | "Obama on Spot Over a Benefit to Gay Couples", 102 | ] 103 | 104 | 105 | def get_sample_data(): 106 | return DATA 107 | -------------------------------------------------------------------------------- /embedders/util.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Generator, List 2 | import numpy as np 3 | 4 | 5 | def batch(documents: List[Any], batch_size: int) -> Generator[List[Any], None, None]: 6 | length = len(documents) 7 | for idx in range(0, length, batch_size): 8 | yield documents[idx : min(idx + batch_size, length)] 9 | 10 | 11 | def num_batches(documents: List[Any], batch_size: int) -> int: 12 | return int(np.ceil(len(documents) / batch_size)) 13 | -------------------------------------------------------------------------------- /publish: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | rm -rf dist/* 3 | python3 setup.py bdist_wheel --universal 4 | twine upload dist/* -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scikit-learn 3 | sentence-transformers 4 | spacy>=3.0.0 5 | torch>=1.6.0 6 | tqdm 7 | transformers>=4.6.0,<5.0.0 8 | openai 9 | cohere 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | from setuptools import setup, find_packages 5 | 6 | this_directory = os.path.abspath(os.path.dirname(__file__)) 7 | with open(os.path.join(this_directory, "README.md")) as file: 8 | long_description = file.read() 9 | 10 | setup( 11 | name="embedders", 12 | version="0.1.8", 13 | author="Johannes Hötter", 14 | author_email="johannes.hoetter@kern.ai", 15 | description="High-level API for creating sentence and token embeddings", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | url="https://github.com/code-kern-ai/embedders", 19 | keywords=["kern", "machine learning", "representation learning", "python"], 20 | classifiers=[ 21 | "Development Status :: 3 - Alpha", 22 | "Programming Language :: Python :: 3", 23 | "License :: OSI Approved :: Apache Software License", 24 | ], 25 | package_dir={"": "."}, 26 | packages=find_packages("."), 27 | install_requires=[ 28 | "numpy", 29 | "scikit-learn", 30 | "sentence-transformers", 31 | "spacy>=3.0.0", 32 | "torch>=1.6.0", 33 | "tqdm", 34 | "transformers>=4.6.0,<5.0.0", 35 | "openai", 36 | "cohere", 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /tutorials/Finding similar sentences within a text corpus.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5252aaca", 6 | "metadata": {}, 7 | "source": [ 8 | "# Finding similar sentences within a text corpus\n", 9 | "\n", 10 | "One great use case for embedding data is to use the resulting vector representations for similarity search (often referred to as \"neural search\").
In this very short example, we'll show you how to build a super simple sentence comparison by computing the cosine similarity of two embeddings.\n", 11 | "\n", 12 | "![A huge pile of embedded data points in vector space](https://miro.medium.com/max/2028/1*1LHBbqmPI0X4I3rio5ujWQ.png)\n", 13 | "\n", 14 | "This notebook is only meant as a tutorial; be aware that there are many fascinating neural search engines, such as [qdrant](https://qdrant.tech).\n", 15 | "\n", 16 | "---\n", 17 | "\n", 18 | "To get started, we just load two things: a function to load some sample data from the library, and a pre-trained sentence embedder based on the transformer architecture." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "5105bfa0", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from embedders.samples.clickbait import get_sample_data\n", 29 | "from embedders.classification.contextual import TransformerSentenceEmbedder" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "17cb52d5", 35 | "metadata": {}, 36 | "source": [ 37 | "The `clickbait` dataset is straightforward and simply consists of short headlines. Here's an example." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "3d2559b6", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "texts = get_sample_data()\n", 48 | "print(texts[0])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "id": "7551608e", 54 | "metadata": {}, 55 | "source": [ 56 | "Next, we just load the embedder via a Hugging Face model string. We make use of `distilbert-base-uncased`. You can input any other model from the Hub." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "id": "f8c5c7f1", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "embedder = TransformerSentenceEmbedder(\"distilbert-base-uncased\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "35567368", 72 | "metadata": {}, 73 | "source": [ 74 | "And now the magic happens: we encode the data. This is as easy as with your favorite sklearn objects - just call `fit_transform`." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "543f6cc3", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "embeddings = embedder.fit_transform(texts)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "7822796f", 90 | "metadata": {}, 91 | "source": [ 92 | "Now, to run a vanilla similarity search, we'll make use of the cosine similarity, which measures how similar two given vectors are." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "7ae1cc92", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import numpy as np\n", 103 | "\n", 104 | "def cosine_similarity(vector_1, vector_2):\n", 105 | " return np.dot(vector_1, vector_2)/(np.linalg.norm(vector_1)*np.linalg.norm(vector_2))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "d678a387", 111 | "metadata": {}, 112 | "source": [ 113 | "And finally, a simplistic nested loop to calculate pairwise similarities (skipping the comparison of a sentence with itself)."
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "84d28194", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from tqdm import tqdm\n", 124 | "\n", 125 | "highest_similarity = float(\"-inf\")\n", 126 | "vector_pair = None, None\n", 127 | "for vector_1_idx, vector_1 in tqdm(enumerate(embeddings), total=len(embeddings)):\n", 128 | " for vector_2_idx, vector_2 in enumerate(embeddings):\n", 129 | " if vector_1_idx != vector_2_idx:\n", 130 | " similarity = cosine_similarity(vector_1, vector_2)\n", 131 | " if similarity > highest_similarity:\n", 132 | " highest_similarity = similarity\n", 133 | " vector_pair = vector_1_idx, vector_2_idx" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "7d5e41ce", 139 | "metadata": {}, 140 | "source": [ 141 | "We can now take a look at the most similar pair in our text corpus." 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "a8bf8107", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "print(texts[vector_pair[0]], texts[vector_pair[1]])" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "cd968e84", 157 | "metadata": {}, 158 | "source": [ 159 | "Wow - isn't that amazing?! Embedding data is one of the most sophisticated and intelligent ways to enrich your records with valuable semantic metadata. The use cases are virtually endless. And `embedders` helps you to quickly generate embeddings for your dataset! 😋\n", 160 | "\n", 161 | "---\n", 162 | "\n", 163 | "If you have further questions, don't hesitate to contact us. If there is anything you want to have added to the library, open an [issue](https://github.com/code-kern-ai/embedders/issues). And please, don't forget to give us a ⭐" 164 | ] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3 (ipykernel)", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.9.12" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 5 188 | } 189 | --------------------------------------------------------------------------------
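
A closing note on the tutorial above: the nested loop performs quadratically many Python-level comparisons. The same most-similar-pair search can be expressed as a single matrix operation. This is a rough sketch, assuming that embeddings and texts are the variables produced in the notebook; it is not part of the tutorial itself.

import numpy as np

matrix = np.asarray(embeddings, dtype=float)
normed = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)  # unit-length rows
similarities = normed @ normed.T                                 # full cosine similarity matrix
np.fill_diagonal(similarities, float("-inf"))                    # ignore self-similarity
vector_pair = np.unravel_index(np.argmax(similarities), similarities.shape)
print(texts[vector_pair[0]], texts[vector_pair[1]])
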