├── .flake8 ├── .github └── workflows │ ├── style.yml │ └── unittest.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENCE ├── Makefile ├── README.md ├── datasets ├── data-quality.jsonl └── new-dataset.jsonl ├── docs ├── API │ ├── external.md │ ├── grab.md │ ├── model.md │ ├── multimodal.md │ ├── text.md │ ├── utils.md │ └── vision.md ├── applications.md ├── images │ ├── colorhistogram.png │ ├── columngrabber.png │ ├── contrastive-re-use.png │ ├── contrastive-same-weights.png │ ├── contrastive.png │ ├── difference-model.png │ ├── embed.png │ ├── feedforward.png │ ├── gradient.png │ ├── human-in-the-loop-1.png │ ├── human-in-the-loop-2.png │ ├── human-in-the-loop-3.png │ ├── human-in-the-loop-4.png │ ├── icon.png │ ├── imageloader.png │ ├── output.png │ ├── sense2vec.png │ ├── sentence-encoder.png │ ├── timm.png │ ├── x-finetuned-again.png │ ├── x-finetuned.png │ └── x-orig.png ├── index.md └── vegalite │ ├── lite_embed1.json │ └── lite_embed2.json ├── embetter ├── __init__.py ├── base.py ├── error.py ├── external │ ├── __init__.py │ ├── _cohere.py │ └── _openai.py ├── finetune │ ├── __init__.py │ ├── _constrastive_learn.py │ ├── _contrastive_tuner.py │ ├── _forward.py │ └── _sbert_learn.py ├── grab.py ├── model │ ├── __init__.py │ └── _diff.py ├── multi │ ├── __init__.py │ └── _clip.py ├── text │ ├── __init__.py │ ├── _bpemb.py │ ├── _keras.py │ ├── _lite.py │ ├── _model2vec.py │ ├── _s2v.py │ ├── _sbert.py │ ├── _spacy.py │ └── _word2vec.py ├── utils.py └── vision │ ├── __init__.py │ ├── _colorhist.py │ ├── _loader.py │ └── _torchvis.py ├── mkdocs.yml ├── setup.py └── tests ├── __init__.py ├── data ├── en.wiki.bpe.vs1000.d25.w2v.bin ├── en.wiki.bpe.vs1000.model └── thiscatdoesnotexist.jpeg ├── test_base.py ├── test_docs.py ├── test_text.py ├── test_utils.py └── test_vision.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 160 3 | ignore = E501, C901 4 | extend-ignore = E203, W503 -------------------------------------------------------------------------------- /.github/workflows/style.yml: -------------------------------------------------------------------------------- 1 | name: Style Checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.12"] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v1 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | cache: 'pip' 25 | - name: Install Testing Dependencies 26 | run: python -m pip install ruff 27 | - name: Ruff 28 | if: always() 29 | run: ruff check embetter tests setup.py 30 | -------------------------------------------------------------------------------- /.github/workflows/unittest.yml: -------------------------------------------------------------------------------- 1 | name: Code Checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | if: ${{ always() }} 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.9", "3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Install uv 22 | uses: astral-sh/setup-uv@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | run: uv python install ${{ matrix.python-version }} 25 | - name: Set up venv 26 | run: uv venv 27 | - name: Install Base 
Dependencies 28 | run: | 29 | uv pip install -e '.[dev]' 30 | uv pip install -e '.[sbert]' 31 | - name: Prep CI tests 32 | run: | 33 | mkdir -p ~/.cache/bpemb/en 34 | mv tests/data/en.wiki.bpe.vs1000.d25.w2v.bin ~/.cache/bpemb/en 35 | mv tests/data/en.wiki.bpe.vs1000.model ~/.cache/bpemb/en 36 | - name: Unittest 37 | run: uv run pytest -n auto -vv 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | *.ipynb 131 | .vscode 132 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.6.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: check-added-large-files 11 | - repo: https://github.com/astral-sh/ruff-pre-commit 12 | # Ruff version. 13 | rev: v0.4.3 14 | hooks: 15 | # Run the linter. 16 | - id: ruff 17 | types_or: [ python, pyi, jupyter ] 18 | args: [ --fix ] 19 | # Run the formatter. 20 | - id: ruff-format 21 | types_or: [ python, pyi, jupyter ] 22 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Vincent D. Warmerdam 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: docs 2 | 3 | ruff: 4 | python -m ruff check embetter tests setup.py --fix 5 | 6 | test: 7 | pytest -n auto -vv 8 | 9 | install: 10 | python -m pip install -e ".[dev]" 11 | 12 | pypi: 13 | python setup.py sdist 14 | python setup.py bdist_wheel --universal 15 | twine upload dist/* 16 | 17 | clean: 18 | rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache 19 | 20 | check: clean ruff test clean 21 | 22 | docs: 23 | cp README.md docs/index.md 24 | python -m mkdocs serve 25 | 26 | deploy-docs: 27 | cp README.md docs/index.md 28 | python -m mkdocs gh-deploy 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # embetter 3 | 4 | > "Just a bunch of useful embeddings for scikit-learn pipelines, to get started quickly." 5 | 6 | 7 | 8 |
9 | 10 | Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proofs of concept using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial), but it can also be used together with your favorite ANN solution like [lancedb](https://lancedb.github.io/lancedb/). 11 | 12 | ## Install 13 | 14 | You can install via pip. 15 | 16 | ``` 17 | python -m pip install embetter 18 | ``` 19 | 20 | Many of the embeddings are optional depending on your use-case, so if you 21 | want to cherry-pick and only download the tools that you need: 22 | 23 | ``` 24 | python -m pip install "embetter[text]" 25 | python -m pip install "embetter[spacy]" 26 | python -m pip install "embetter[sense2vec]" 27 | python -m pip install "embetter[gensim]" 28 | python -m pip install "embetter[bpemb]" 29 | python -m pip install "embetter[vision]" 30 | python -m pip install "embetter[all]" 31 | ``` 32 | 33 | ## API Design 34 | 35 | This is an overview of what's currently implemented. 36 | 37 | ```python 38 | # Helpers to grab text or image from pandas column. 39 | from embetter.grab import ColumnGrabber 40 | 41 | # Representations/Helpers for computer vision 42 | from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder 43 | 44 | # Representations for text 45 | from embetter.text import SentenceEncoder, MatryoshkaEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder, TextEncoder 46 | 47 | # Representations from multi-modal models 48 | from embetter.multi import ClipEncoder 49 | 50 | # Finetuning components 51 | from embetter.finetune import FeedForwardTuner, ContrastiveTuner, ContrastiveLearner, SbertLearner 52 | 53 | # External embedding providers, these typically need an API key 54 | from embetter.external import CohereEncoder, OpenAIEncoder 55 | ``` 56 | 57 | All of these components are scikit-learn compatible, which means that you 58 | can apply them as you would normally in a scikit-learn pipeline. Just be aware 59 | that these components are stateless. They won't require training, as these 60 | are all pretrained tools. 61 | 62 | ## Text Example 63 | 64 | To run this example, make sure that you `pip install 'embetter[sbert]'`. 65 | 66 | ```python 67 | import pandas as pd 68 | from sklearn.pipeline import make_pipeline 69 | from sklearn.linear_model import LogisticRegression 70 | 71 | from embetter.grab import ColumnGrabber 72 | from embetter.text import SentenceEncoder 73 | 74 | # This pipeline grabs the `text` column from a dataframe, 75 | # which then gets fed into Sentence-Transformers' all-MiniLM-L6-v2. 76 | text_emb_pipeline = make_pipeline( 77 | ColumnGrabber("text"), 78 | SentenceEncoder('all-MiniLM-L6-v2') 79 | ) 80 | 81 | # This pipeline can also be trained to make predictions, using 82 | # the embedded features.
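# (LogisticRegression is just one convenient choice of final estimator
# here; any scikit-learn classifier can be stacked on top of the
# embedding pipeline in the same way.)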
83 | text_clf_pipeline = make_pipeline( 84 | text_emb_pipeline, 85 | LogisticRegression() 86 | ) 87 | 88 | dataf = pd.DataFrame({ 89 | "text": ["positive sentiment", "super negative"], 90 | "label_col": ["pos", "neg"] 91 | }) 92 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 93 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 94 | ``` 95 | 96 | ## Image Example 97 | 98 | The goal of the API is to allow pipelines like this: 99 | 100 | ```python 101 | import pandas as pd 102 | from sklearn.pipeline import make_pipeline 103 | from sklearn.linear_model import LogisticRegression 104 | 105 | from embetter.grab import ColumnGrabber 106 | from embetter.vision import ImageLoader 107 | from embetter.multi import ClipEncoder 108 | 109 | # This pipeline grabs the `img_path` column from a dataframe, 110 | # turns the image paths into `PIL.Image` objects 111 | # and then feeds those into CLIP, which can also handle images. 112 | image_emb_pipeline = make_pipeline( 113 | ColumnGrabber("img_path"), 114 | ImageLoader(convert="RGB"), 115 | ClipEncoder() 116 | ) 117 | 118 | dataf = pd.DataFrame({ 119 | "img_path": ["tests/data/thiscatdoesnotexist.jpeg"] 120 | }) 121 | image_emb_pipeline.fit_transform(dataf) 122 | ``` 123 | 124 | ## Batched Learning 125 | 126 | All of the encoding tools you've seen here are also compatible 127 | with the [`partial_fit` mechanic](https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning) 128 | in scikit-learn. That means 129 | you can leverage [scikit-partial](https://github.com/koaning/scikit-partial) 130 | to build pipelines that can handle out-of-core datasets. 131 | 132 | -------------------------------------------------------------------------------- /datasets/data-quality.jsonl: -------------------------------------------------------------------------------- 1 | {"text":"By leveraging this diversity, the collected dataset and the collection system aim to achieve higher recognition accuracy.","cats":{"new-dataset":0,"data-quality":0}} 2 | {"text":"In this paper, we study linear regression applied to data structured on a manifold.","cats":{"new-dataset":0,"data-quality":0}} 3 | {"text":"We assume that the data manifold is smooth and is embedded in a Euclidean space, and our objective is to reveal the impact of the data manifold's extrinsic geometry on the regression.","cats":{"new-dataset":0,"data-quality":0}} 4 | {"text":"Large language models trained for safety and harmlessness remain susceptible to adversarial misuse, as evidenced by the prevalence of \"jailbreak\" attacks on early releases of ChatGPT that elicit undesired behavior.","cats":{"new-dataset":0,"data-quality":0}} 5 | {"text":"Our work opens up new possibilities for modeling very long sequences, e.g., treating a whole corpus or even the entire Internet as a sequence.","cats":{"new-dataset":0,"data-quality":0}} 6 | {"text":"However, we identify issues with the dataset quality and evaluation metric.","cats":{"new-dataset":0,"data-quality":1}} 7 | {"text":"We will release our annotation scheme, the corpus, and codes to the research community to alleviate the scarcity of labeled data in this domain.","cats":{"new-dataset":1,"data-quality":1}} 8 | {"text":"Extensive experiments are conducted to demonstrate the effectiveness of our proposed method.","cats":{"new-dataset":0,"data-quality":0}} 9 | {"text":"Compared to a variety of baselines, our method achieves superior results.","cats":{"new-dataset":0,"data-quality":0}} 10 |
{"text":"Previous segmentation methods for noisy label problems only utilize a single image while the potential of leveraging the correlation between images has been overlooked.","cats":{"new-dataset":0,"data-quality":1}} 11 | {"text":"Experiments with both synthetic and real-world label noise demonstrate that our method outperforms recent state-of-the-art robust segmentation approaches.","cats":{"new-dataset":0,"data-quality":1}} 12 | {"text":"We detail corpus statistics and demonstrate high inter-annotator agreement.","cats":{"new-dataset":0,"data-quality":0}} 13 | {"text":"Alongside the images, we provide ground-truth annotations for several learning tasks, including semantic segmentation, object detection, and counting.","cats":{"new-dataset":0,"data-quality":0}} 14 | {"text":"However, even manually labeled datasets contain errors, not to mention automatically labeled ones.","cats":{"data-quality":1}} 15 | {"text":"Label error is a ubiquitous problem in annotated data.","cats":{"data-quality":1}} 16 | {"text":"After demonstrating that our methodology empirically outperforms other algorithms for label error detection, we apply our approach to discover many label errors in the CelebA image tagging dataset.","cats":{"data-quality":1}} 17 | {"text":"These properties highlight a tradeoff between classification error probability and error-correction capabilities of label encodings.","cats":{"data-quality":0}} 18 | {"text":"In this work, we for the first time introduce a benchmark for label error detection methods on object detection datasets as well as a label error detection method and a number of baselines.","cats":{"data-quality":1}} 19 | {"text":"Label encodings found by RLEL result in lower or comparable errors to manually designed label encodings.","cats":{"data-quality":1}} 20 | {"text":"We also propose an improved self-labeling loss; it is robust to pseudo-labeling errors and enforces stronger fairness.","cats":{"data-quality":1}} 21 | {"text":"Inferencing unlabeled data from labeled data is an error-prone process.","cats":{"data-quality":1}} 22 | {"text":"However, creating such large keypoint labels is time-consuming and costly, and is often error-prone due to inconsistent labeling.","cats":{"data-quality":0}} 23 | {"text":"The losses are computed with respect to the predictions and the noisy labels including simulated label errors, aiming at detecting the latter.","cats":{"data-quality":1}} 24 | {"text":"PseudoAugments outperforms pseudo labeling by mitigating pseudo labeling errors and generating diverse fused training scenes.","cats":{"data-quality":1}} 25 | {"text":"Our model is also able to maintain high classification accuracy with very few labels, with only 7.79% error when only using 145 labels.","cats":{"data-quality":0}} 26 | {"text":"Detecting errors in KGs is challenging since the patterns of errors are unknown and diverse, while ground-truth labels are rare or even unavailable.","cats":{"data-quality":1}} 27 | {"text":"We analyze the factors affecting this approximation error and design a pseudo-label clustering generation method to reduce the approximation error.","cats":{"data-quality":1}} 28 | {"text":"To ameliorate the impact of label errors, we equipped our method with a novel negative label sampling strategy to strengthen the model robustness.","cats":{"data-quality":1}} 29 | {"text":"We propose an extension of the Confident Learning framework to this setting, as well as a label quality score that ranks examples with label errors much higher than those which are 
correctly labeled.","cats":{"data-quality":1}} 30 | {"text":"The later case can generate dense flow labels but the interpolated events are prone to errors.","cats":{"data-quality":0}} 31 | {"text":"Improper fingerprint localization and finger labeling errors lead to poor matching performance.","cats":{"data-quality":0}} 32 | {"text":"Our experiments show that our method is robust to linguistic labels with poor orthography and alignment errors.","cats":{"data-quality":1}} 33 | {"text":"We derive an upper bound for the generalization error that is linear in the clients' label noise level.","cats":{"data-quality":1}} 34 | {"text":"For example, for the IMDB text data with known labeling errors, a 14% boost is shown.","cats":{"data-quality":1}} 35 | {"text":"Large amounts of label error substantially degrades the quality of deep learning models.","cats":{"data-quality":1}} 36 | {"text":"We simulate four different types of randomly introduced label errors on train and test sets of well-labeled object detection datasets.","cats":{"data-quality":1}} 37 | {"text":"We prove that semi-supervised labels improve the downstream error bound whereas noisy labels have limited effects under such a paradigm.","cats":{"data-quality":1}} 38 | {"text":"This paper provides an exact characterization of the expected generalization error (gen-error) for semi-supervised learning (SSL) with pseudo-labeling via the Gibbs algorithm.","cats":{"data-quality":0}} 39 | {"text":"However, corresponding class labels are noisy when provided by error-prone annotators, e.g., crowd workers.","cats":{"data-quality":1}} 40 | {"text":"Most existing methods utilize the off-the-shelf pose or parsing networks as pseudo labels, which are prone to error.","cats":{"data-quality":0}} 41 | {"text":"The result is an SSL classification framework explicitly designed to overcome inevitable pseudo-label errors.","cats":{"data-quality":1}} 42 | {"text":"Here we consider the task of finding sentences that contain label errors in token classification datasets.","cats":{"data-quality":1}} 43 | {"text":"Scaling sequence length has become a critical demand in the era of large language models.","cats":{"data-quality":0}} 44 | {"text":"However, existing methods struggle with either computational complexity or model expressivity, rendering the maximum sequence length restricted.","cats":{"data-quality":0}} 45 | {"text":"In this work, we introduce LongNet, a Transformer variant that can scale sequence length to more than 1 billion tokens, without sacrificing the performance on shorter sequences.","cats":{"data-quality":0}} 46 | {"text":"Specifically, we propose dilated attention, which expands the attentive field exponentially as the distance grows.","cats":{"data-quality":0}} 47 | {"text":"Experiments results demonstrate that LongNet yields strong performance on both long-sequence modeling and general language tasks.","cats":{"data-quality":0}} 48 | {"text":"Large Language Models (LLMs) have demonstrated impressive planning abilities in single-agent embodied tasks across various domains.","cats":{"data-quality":0}} 49 | {"text":"However, their capacity for planning and communication in multi-agent cooperation remains unclear, even though these are crucial skills for intelligent embodied agents.","cats":{"data-quality":0}} 50 | {"text":"In this paper, we present a novel framework that utilizes LLMs for multi-agent cooperation and tests it in various embodied environments.","cats":{"data-quality":0}} 51 | {"text":"Our framework enables embodied agents to 
plan, communicate, and cooperate with other embodied agents or humans to accomplish long-horizon tasks efficiently.","cats":{"data-quality":0}} 52 | {"text":"We demonstrate that recent LLMs, such as GPT-4, can surpass strong planning-based methods and exhibit emergent effective communication using our framework without requiring fine-tuning or few-shot prompting.","cats":{"data-quality":0}} 53 | {"text":"We also discover that LLM-based agents that communicate in natural language can earn more trust and cooperate more effectively with humans.","cats":{"data-quality":0}} 54 | {"text":"For QE in particular, high-quality labeled data is often lacking due to the high-cost and effort associated with labeling such data.","cats":{"data-quality":0}} 55 | {"text":"With many possible classes to consider, data annotators are likely to make errors when labeling such data in practice.","cats":{"data-quality":1}} 56 | {"text":"However, it usually suffers from a lack of high-quality datasets due to high annotation cost, inter-observer variability, human annotator error, and errors in computer-generated labels.","cats":{"data-quality":0}} 57 | {"text":"For such bone structure analyses, deep learning technologies are promising but require high-quality labeled data for the learning, while the data labeling is costly.","cats":{"data-quality":0}} 58 | {"text":"However, agreement between annotators is often low, leading to inconsistent labels that hinder the reliability of models.","cats":{"data-quality":1}} 59 | {"text":"Our experiments show that this approach consistently improves inter-annotator agreement and annotation accuracy.","cats":{"data-quality":1}} 60 | {"text":"We advocate for the use of IAA in predicting the labeling quality of individual annotators, leading to cost and time efficiency in data production.","cats":{"data-quality":1}} 61 | {"text":"This paper presents a novel approach of leveraging Inter-Annotator Agreement (IAA), traditionally used for assessing labeling consistency, to optimize Data Management Operations (DMOps).","cats":{"data-quality":1}} 62 | {"text":"Our study illustrates that different labeling methodologies directly impact the annotations' quality, as well as the capabilities of a deep learning classifier trained with the data respectively.","cats":{"data-quality":1}} 63 | {"text":"However, such annotations may fail in practice because of the change in annotation requirements, application scenarios, and modeling goals, where label validation and relabeling by domain experts are required.","cats":{"data-quality":1}} 64 | {"text":"However, selecting training samples based on the degree of agreement between annotators introduces a bias in the training data and does not improve the results.","cats":{"data-quality":1}} 65 | {"text":"However, these annotations are inherently subjective and some of the instances are hard to classify, resulting in noisy annotations due to error or lack of agreement.","cats":{"data-quality":1}} 66 | {"text":"We propose and evaluate an additional application of our method leading to the detection of annotation errors.","cats":{"data-quality":1}} 67 | {"text":"However, arbitrating the final annotation is not always effective because new biases might be produced during the process, especially when there are significant variations among annotations.","cats":{"data-quality":1}} 68 | {"text":"A two-step human annotation and inter-annotator agreement study guarantee the high quality of the PcMSP corpus.","cats":{"data-quality":0}} 69 | {"text":"We observe a 
striking correlation between the model's and humans' annotation: Categories with consistent human annotations (>$0.9$ inter-rater reliability, IRR) also display higher human-model agreement (>$0.7$), while categories with less consistent human annotations ($0.7$-$0.8$ IRR) correspondingly demonstrate lower human-model agreement ($0.3$-$0.5$).","cats":{"data-quality":1}} 70 | {"text":"We propose two metrics to audit the noise of annotations.","cats":{"data-quality":1}} 71 | {"text":"Whereas such annotation is costly and hard to scale, significantly holding back the development of the research.","cats":{"data-quality":0}} 72 | {"text":"A key challenge in effectively combining partial annotation with self-training to reduce annotation cost is determining which sub-structures to select to label.","cats":{"data-quality":1}} 73 | {"text":"We hypothesize two failure modes of safety training: competing objectives and mismatched generalization.","cats":{"data-quality":0}} 74 | {"text":"Competing objectives arise when a model's capabilities and safety goals conflict, while mismatched generalization occurs when safety training fails to generalize to a domain for which capabilities exist.","cats":{"data-quality":0}} 75 | {"text":"We find that vulnerabilities persist despite the extensive red-teaming and safety-training efforts behind these models.","cats":{"data-quality":0}} 76 | {"text":"Specifically, we analyze the impact of the manifold's curvatures (or higher order nonlinearity in the parameterization when the curvatures are locally zero) on the uniqueness of the regression solution.","cats":{"data-quality":0}} 77 | {"text":"Our findings suggest that the corresponding linear regression does not have a unique solution when the embedded submanifold is flat in some dimensions.","cats":{"data-quality":0}} 78 | {"text":"Our findings thus reveal the role of data manifold geometry in ensuring the stability of regression models for out-of-distribution inferences.","cats":{"data-quality":0}} 79 | {"text":"To disentangle these effects, we propose an evaluation framework based on \"counterfactual\" task variants that deviate from the default assumptions underlying standard tasks.","cats":{"data-quality":0}} 80 | {"text":"Across a suite of 11 tasks, we observe nontrivial performance on the counterfactual variants, but nevertheless find that performance substantially and consistently degrades compared to the default conditions.","cats":{"data-quality":0}} 81 | {"text":"We also propose an accurate pseudo label generation method through prototype learning.","cats":{"data-quality":0}} 82 | {"text":"Specifically, we frame aggregation of annotations as posterior inference of so-called plausibilities, representing distributions over classes in a classification setting, subject to a hyper-parameter encoding annotator reliability.","cats":{"data-quality":1}} 83 | {"text":"Based on this model, we propose a metric for measuring annotation uncertainty and provide uncertainty-adjusted metrics for performance evaluation.","cats":{"data-quality":0}} 84 | {"text":"Identifying the samples with corrupted labels and preventing the model from learning them is a promising approach to address this challenge.","cats":{"data-quality":1}} 85 | {"text":"Furthermore, we detect real label errors a) on commonly used test datasets in object detection and b) on a proprietary dataset.","cats":{"data-quality":1}} 86 | {"text":"Large-scale datasets in the real world inevitably involve label noise.","cats":{"data-quality":0}} 87 | 
{"text":"This is partially due to the fact that obtaining a balanced, diverse, and perfectly labeled dataset is typically expensive, time-consuming, and error-prone.","cats":{"data-quality":0}} 88 | {"text":"We develop an efficient algorithm for detecting label errors and outlier data points based on the relational graph structure of the dataset.","cats":{"data-quality":1}} 89 | {"text":"By focusing on finding incorrect labels in the original training datasets, we can eliminate erroneous examples in their root.","cats":{"data-quality":1}} 90 | {"text":"Manually labelling data with high-quality labels is generally a time-consuming and challenging task and often this turns out to be the bottleneck in a machine learning project.","cats":{"data-quality":0}} 91 | {"text":"Here we consider algorithms for finding mislabeled examples in multi-label classification datasets.","cats":{"data-quality":1}} 92 | {"text":"Negative labels are those that a corresponding data item does not belong.","cats":{"data-quality":0}} 93 | {"text":"This issue is due to biased labeling preferences at multiple clients and is a typical setting of data heterogeneity.","cats":{"data-quality":0}} 94 | {"text":"However, noisy samples (i.e., with wrong labels) in the training set induce confusion and cause the network to learn the incorrect representation.","cats":{"data-quality":1}} 95 | {"text":"Mislabeled examples are a common issue in real-world data, particularly for tasks like token classification where many labels must be chosen on a fine-grained basis.","cats":{"data-quality":1}} 96 | {"text":"We also introduced robust loss to reduce the noise effects of inaccurate labels generated in semi-supervised learning.","cats":{"data-quality":1}} 97 | {"text":"The main anomaly was found by the autoencoder and automatically created labels and was also recorded in the log files.","cats":{"data-quality":1}} 98 | {"text":"About 0.2% of the images could not be assigned a label, while for 5.1% the reviewers were uncertain, or they assigned an invalid label.","cats":{"data-quality":1}} 99 | {"text":"We find that the above issues are caused by the training dataset's pose imbalance. 
","cats":{"data-quality":0}} 100 | {"text":"The labor-intensive annotation process of semantic segmentation datasets is often prone to errors, since humans struggle to label every pixel correctly.","cats":{"data-quality":1}} 101 | {"text":"We study algorithms to automatically detect such annotation errors, in particular methods to score label quality, such that the images with the lowest scores are least likely to be correctly labeled.","cats":{"data-quality":1}} 102 | {"text":"Widely applicable, our label quality scores rely on probabilistic predictions from a trained segmentation model -- any model architecture and training procedure can be utilized.","cats":{"data-quality":1}} 103 | {"text":"Here we study 7 different label quality scoring methods used in conjunction with a DeepLabV3+ or a FPN segmentation model to detect annotation errors in a version of the SYNTHIA dataset.","cats":{"data-quality":1}} 104 | {"text":"Precision-recall evaluations reveal a score -- the soft-minimum of the model-estimated likelihoods of each pixel's annotated class -- that is particularly effective to identify images that are mislabeled, across multiple types of annotation error.","cats":{"data-quality":1}} 105 | {"text":"In recent years, research on learning with noisy labels has focused on devising novel algorithms that can achieve robustness to noisy training labels while generalizing to clean data.","cats":{"data-quality":1}} 106 | {"text":"While some of these regularization strategies have been utilized in previous noisy label learning research, their full potential has not been thoroughly explored.","cats":{"data-quality":1}} 107 | {"text":"We also synthetically mislabel a proportion of the dataset by randomly corrupting the labels of a few samples, and show that sorting by curvature yields high AUROC values for identifying the mislabeled samples.","cats":{"data-quality":1}} 108 | {"text":"Further analysis shows that these gains come from an improved decision boundary after cleaning the label errors existed in the training data.","cats":{"data-quality":1}} 109 | {"text":"Nevertheless, few papers have tackled the data shift problem in labeled training sets, which occurs when there is a mismatch between the data distribution in the training set and the testing set.","cats":{"data-quality":1}} 110 | {"text":"In this work, we examine the problem for both labeled and unlabeled settings.","cats":{"data-quality":1}} 111 | {"text":"It is crucial to correctly predict areas that deviate from the background noise, in both the train and test sets of labels. 
","cats":{"data-quality":0}} 112 | {"text":"Data completeness is ensured through the label provided during training.","cats":{"data-quality":0}} 113 | {"text":"Trustworthy pseudo labels on unlabeled data are generated after uncertainty estimation.","cats":{"data-quality":0}} 114 | {"text":"When random label noise is added to a training dataset, the prediction error of a neural network on a label-noise-free test dataset initially improves during early training but eventually deteriorates, following a U-shaped dependence on training time.","cats":{"data-quality":0}} 115 | {"text":"In this paper, we try to deal with error accumulation in noisy label learning from both model and data perspectives.","cats":{"data-quality":1}} 116 | {"text":"In our analysis, we find that SoundDesc contains several duplicates that cause leakage of training data to the evaluation data.","cats":{"data-quality":1}} 117 | {"text":"However, in many situations, language can be ambiguous and ineffective in describing specific image edits.","cats":{"data-quality":0}} 118 | {"text":"We propose an automatic metric to test the prevalence of the opinions that a summary expresses, based on counting the number of reviews that are consistent with each statement in the summary, while discrediting trivial or redundant statements.","cats":{"data-quality":0}} 119 | {"text":"To formulate this opinion prevalence metric, we consider several existing methods to score the factual consistency of a summary statement with respect to each individual source review.","cats":{"data-quality":0}} 120 | {"text":"On a corpus of Amazon product reviews, we gather multiple human judgments of the opinion consistency, to determine which automatic metric best expresses consistency in product reviews.","cats":{"data-quality":1}} 121 | {"text":"The system utilizes a weakly supervised technique that employs a fine-grained annotation scheme to identify verbally formulated uncertainty at the sentence level in scientific texts.","cats":{"data-quality":1}} 122 | {"text":"Additionally, UnScientify provides interpretable results, aiding in the comprehension of identified instances of scientific uncertainty in text.","cats":{"data-quality":0}} 123 | {"text":"Recent work in Machine Learning and Computer Vision has highlighted the presence of various types of systematic flaws inside ground truth object recognition benchmark datasets.","cats":{"data-quality":1}} 124 | {"text":"The net consequence is that the current annotation process is largely under-specified, thus leaving too much freedom to the subjective judgment of annotators.","cats":{"data-quality":1}} 125 | {"text":"Motivated by the optimal strategy, we introduce double-score OOD methods that leverage uncertainty scores from two chosen OOD detectors: one focused on OOD/ID discrimination and the other on misclassification detection.","cats":{"data-quality":1}} 126 | {"text":"The optimal prediction strategy for out-of-distribution (OOD) setups is a fundamental question in machine learning.","cats":{"data-quality":0}} 127 | {"text":"In this paper, we address this question and present several contributions.","cats":{"data-quality":0}} 128 | {"text":"We propose three reject option models for OOD setups: the Cost-based model, the Bounded TPR-FPR model, and the Bounded Precision-Recall model.","cats":{"data-quality":0}} 129 | {"text":"These models extend the standard reject option models used in non-OOD setups and define the notion of an optimal OOD selective classifier.","cats":{"data-quality":0}} 130 | 
{"text":"We establish that all the proposed models, despite their different formulations, share a common class of optimal strategies. ","cats":{"data-quality":0}} 131 | {"text":"The experimental results consistently demonstrate the superior performance of this simple strategy compared to state-of-the-art methods.","cats":{"data-quality":0}} 132 | {"text":"Additionally, we propose novel evaluation metrics derived from the definition of the optimal strategy under the proposed OOD rejection models.","cats":{"data-quality":0}} 133 | {"text":"These new metrics provide a comprehensive and reliable assessment of OOD methods without the deficiencies observed in existing evaluation approaches.","cats":{"data-quality":0}} 134 | {"text":"This analysis helps us find a, to the best of our knowledge, novel failure model on the CIFAR100 dataset, that of duplicated images with different labels","cats":{"data-quality":1}} 135 | {"text":"Neural networks are overparametrized and easily overfit the datasets they train on.","cats":{"data-quality":0}} 136 | {"text":"In the extreme case, it is shown that they can memorize a training set with fully randomized labels.","cats":{"data-quality":0}} 137 | {"text":"We propose using the curvature of loss function around the training sample as a measure of its memorization, averaged over all training epochs.","cats":{"data-quality":0}} 138 | {"text":"We use this to study the generalization versus memorization properties of different samples in popular image datasets.","cats":{"data-quality":0}} 139 | {"text":"We visualize samples with the highest curvature of loss around them, and show that these visually correspond to long-tailed, mislabeled or conflicting samples. .","cats":{"data-quality":0}} 140 | {"text":"We also synthetically mislabel a proportion of the dataset by randomly corrupting the labels of a few samples, and show that sorting by curvature yields","cats":{"data-quality":0}} 141 | {"text":"Medical image classification is a challenging task due to the scarcity of labeled samples and class imbalance caused by the high variance in disease prevalence.","cats":{"data-quality":0}} 142 | {"text":"Semi-supervised learning (SSL) methods can mitigate these challenges by leveraging both labeled and unlabeled data.","cats":{"data-quality":0}} 143 | {"text":"However, SSL methods for medical image classification need to address two key challenges: (1) estimating reliable pseudo-labels for the images in the unlabeled dataset and (2) reducing biases caused by class imbalance.","cats":{"data-quality":0}} 144 | {"text":"In this paper, we propose a novel SSL approach, SPLAL, that effectively addresses these challenges.","cats":{"data-quality":0}} 145 | {"text":"SPLAL leverages class prototypes and a weighted combination of classifiers to predict reliable pseudo-labels over a subset of unlabeled images.","cats":{"data-quality":0}} 146 | {"text":"Additionally, we introduce alignment loss to mitigate model biases toward majority classes.","cats":{"data-quality":0}} 147 | {"text":"To evaluate the performance of our proposed approach, we conduct experiments on two publicly available medical image classification benchmark datasets: the skin lesion classification (ISIC 2018) and the blood cell classification dataset (BCCD).","cats":{"data-quality":0}} 148 | {"text":"The experimental results empirically demonstrate that our approach outperforms several state-of-the-art SSL methods over various evaluation metrics.","cats":{"data-quality":0}} 149 | {"text":"Specifically, our proposed 
approach achieves a significant improvement over the state-of-the-art approach on the ISIC 2018 dataset in both Accuracy and F1 score, with relative margins of 2.24\\% and 11.40\\%, respectively.","cats":{"data-quality":0}} 150 | {"text":"Finally, we conduct extensive ablation experiments to examine the contribution of different components of our approach, validating its effectiveness.","cats":{"data-quality":0}} 151 | {"text":"Textual noise, such as typos or abbreviations, is a well-known issue that penalizes vanilla Transformers for most downstream tasks","cats":{"data-quality":1}} 152 | {"text":"Previous works addressing the noise issue mainly rely on data augmentation strategies, showing improved robustness when dealing with corrupted samples that are similar to the ones used for training.","cats":{"data-quality":1}} 153 | {"text":"However, all these methods still suffer from the token distribution shift induced by typos","cats":{"data-quality":1}} 154 | {"text":"We show that this is also the case for sentence similarity, a fundamental task in multiple domains, e.g. matching, retrieval or paraphrasing.","cats":{"data-quality":0}} 155 | {"text":"Sentence similarity can be approached using cross-encoders, where the two sentences are concatenated in the input allowing the model to exploit the inter-relations between them.","cats":{"data-quality":0}} 156 | {"text":"Previous works addressing the noise issue mainly rely on data augmentation strategies, showing improved robustness when dealing wixtual noise by equipping cross-encoders with a novel LExical-aware Attention module (LEA) that incorporates lexical similarities between words in both sentences.","cats":{"data-quality":0}} 157 | {"text":"By using raw text similarities, our ae that the attention bias introduced by LEA helps cross-encoders to tackle complex scenarios with textual noise, specially in domains with short-text descriptions and limited context.","cats":{"data-quality":0}} 158 | {"text":"Experiments using three popular Transformer encoders in five e-commerce datasets for product matching show that LEA consistently boosts performance under the presence of noise, while remaining competitive on the original (clean) splits.","cats":{"data-quality":0}} 159 | {"text":"We also evaluate our approach in two datasets for textual entailment and paraphrasing showing that LEA is robust to typos in domains with longer sentences and more natural context.","cats":{"data-quality":0}} 160 | {"text":"Additionally, we thoroughly analyze several design choices in our approach, providing insights about the impact of the decisions made and fostering future research in cross-encoders dealing with typos.","cats":{"data-quality":0}} 161 | {"text":"For safety, AI systems in health undergo thorough evaluations before deployment, validating their predictions against a ground truth that is assumed certain.","cats":{"data-quality":0}} 162 | {"text":"However, this is actually not the case and the ground truth may be uncertain.","cats":{"data-quality":0}} 163 | {"text":"Unfortunately, this is largely ignored in standard evaluation of AI models but can have severe consequences such as overestimating the future performance.","cats":{"data-quality":0}} 164 | {"text":"To avoid this, we measure the effects of ground truth uncertainty, which we assume decomposes into two main components: annotation uncertainty which stems from the lack of reliable annotations, and inherent uncertainty due to limited observational information.","cats":{"data-quality":0}} 165 | 
{"text":"This ground truth uncertainty is ignored when estimating the ground truth by deterministically aggregating annotations, e.g., by majority voting or averaging.","cats":{"data-quality":0}} 166 | {"text":"In contrast, we propose a framework where aggregation is done using a statistical model. ","cats":{"data-quality":0}} 167 | {"text":"We present a case study applying our framework to skin condition classification fromtion (IRN) from previous work ignores ground truth uncertainty in evaluation.","cats":{"data-quality":0}} 168 | {"text":"Instead, we present two alternative statistical models: a probabilistic version of IRN and a Plackett-Luce-based model.","cats":{"data-quality":0}} 169 | {"text":"We find that a large portion of the dataset exhibits significant ground truth uncertainty and standard IRN-based evaluation severely over-estimates performance without providing uncertainty estimates.","cats":{"data-quality":0}} 170 | {"text":"To systematically combat confirmation bias for pseudo-labeling-based entity alignment, we propose a Unified Pseudo-Labeling framework for Entity Alignment (UPL-EA) that explicitly eliminates pseudo-labeling errors to boost the accuracy of entity alignment","cats":{"data-quality":1}} 171 | {"text":"The two components are respectively designed to eliminate Type I and Type II pseudo-labeling errors identified through our analyse.","cats":{"data-quality":0}} 172 | {"text":"The effectiveness of UPL-EA in eliminating pseudo-labeling errors is both theoretically supported and experimentally validated.","cats":{"data-quality":1}} 173 | {"text":"Entity alignment (EA) aims at identifying equivalent entity pairs across different knowledge graphs (KGs) that refer to the same real-world identity. .","cats":{"data-quality":0}} 174 | {"text":"UPL-EA consists of two complementary components: (1) The Optimal Transport (OT)-based pseudo-labeling uses discrete OT modeling as an effective means to enable more accurate determination of entity correspondences across two KGs and to mitigate the adverse impact of erroneous matches.","cats":{"data-quality":0}} 175 | {"text":"A simple but highly effective criterion is further devised to derive pseudo-labeled entity pairs that satisfy one-to-one correspondences at each iteration.","cats":{"data-quality":0}} 176 | {"text":"(2) The cross-iteration pseudo-label calibration operates across multiple consecutive iterations to further improve the pseudo-labeling precision rate by reducing the local pseudo-label selection variability with a theoretical guarantee.","cats":{"data-quality":0}} 177 | {"text":"The calibrated pseudo-labels are thereafter used to augment prior alignment seeds to reinforce subsequent model training fomentally validated.","cats":{"data-quality":0}} 178 | {"text":"The experimental results show that our approach achieves competitive performance with limited prior alignment seeds.","cats":{"data-quality":0}} 179 | {"text":"A novel annotation method was used to collect three separate annotations for each region of interest, and these annotations were performed in a fully transparent setting using a web-based annotation tool.","cats":{"data-quality":1}} 180 | {"text":"This paper presents the challenge report for the 2021 Kidney and Kidney Tumor Segmentation Challenge (KiTS21) held in conjunction with the 2021 international conference on Medical Image Computing and Computer Assisted Interventions (MICCAI).","cats":{"data-quality":0}} 181 | {"text":"KiTS21 is a sequel to its first edition in 2019, and it features a 
variety of innovations in how the challenge was designed, in addition to a larger dataset. ","cats":{"data-quality":0}} 182 | {"text":"Further, the KiTS21 test set was collected from an outside institution, challenging participants to develop methods that generalize well to new populations.","cats":{"data-quality":0}} 183 | {"text":"Nonetheless, the top-performing teams achieved a significant improvement over the state of the art set in 2019, and this performance is shown to inch ever closer to human-level performance.","cats":{"data-quality":0}} 184 | {"text":"An in-depth meta-analysis is presented describing which methods were used and how they faired on the leaderboard, as well as the characteristics of which cases generally saw good performance, and which did not.","cats":{"data-quality":0}} 185 | {"text":"Overall KiTS21 facilitated a significant advancement in the state of the art in kidney tumor segmentation, and provides useful insights that are applicable to the field of semantic segmentation as a whole.","cats":{"data-quality":0}} 186 | {"text":"Additionally, label noise is inevitable in large-scale annotations and hinders the applications of learning-based models.","cats":{"data-quality":1}} 187 | {"text":"To tackle such a critical yet thorny problem, this paper focuses on reducing noise based on some inherent properties of multi-label classification and long-tailed learning under noisy cases","cats":{"data-quality":1}} 188 | {"text":"In detail, we propose a Stitch-Up augmentation to synthesize a cleaner sample, which directly reduces multi-label noise by stitching up multiple noisy training samples","cats":{"data-quality":1}} 189 | {"text":"In real-world scenarios, collected and annotated data often exhibit the characteristics of multiple classes and long-tailed distribution. 
","cats":{"data-quality":0}} 190 | {"text":"Although many deep learning based methods have been proposed for handling long-tailed multi-label recognition or label noise respectively, learning with noisy labels in long-tailed multi-label visual data has not been well-studied because of the complexity of long-tailed distribution entangled with multi-label correlation.","cats":{"data-quality":0}} 191 | {"text":"To tackle such a critical yet thorny problem, this paper focuses on reducing noise based on some inherent properties of m by stitching up multiple noisy training samples.","cats":{"data-quality":0}} 192 | {"text":"Equipped with Stitch-Up, a Heterogeneous Co-Learning framework is further designed to leverage the inconsistency between long-tailed and balamarks, named VOC-MLT-Noise and COCO-MLT-Noise, respectively.","cats":{"data-quality":0}} 193 | {"text":"Most of the existing methods adopt a coarse-grained fixed label assignment strategy and suffer from the inconsistency between the classification score and localization accuracy.","cats":{"data-quality":1}} 194 | {"text":"Second, to further address the inconsistency between classification and localization, we propose a critical feature sampling (CFS) module, which performs localization refinement on the sampling location for classification task to extract critical features accurately","cats":{"data-quality":1}} 195 | {"text":"Arbitrary-oriented object detection is a relatively emerging but challenging task.","cats":{"data-quality":0}} 196 | {"text":"Although remarkable progress has been made, there still remain many unsolved issues due to the large diversity of patterns in orientation, scale, aspect ratio, and visual appearance of objects in aerial images. ","cats":{"data-quality":0}} 197 | {"text":"First, to align the metric inconsistency between sample selection and regression loss calculation caused by fixed IoU strategy, we introduce affine transformation to evaluate the quality of samples and propose a distance-based label assignment strategy.","cats":{"data-quality":0}} 198 | {"text":"The proposed metric-aligned selection (MAS) strategy can dynamically select samples according to the shape and rotation characteristic of objects.","cats":{"data-quality":0}} 199 | {"text":"Second, to further address the inconsistency between classification and localization, we propose a critical feature sampling (CFS) module, which performs localization refinementtics of proposals during training.","cats":{"data-quality":0}} 200 | {"text":"Extensive experiments are conducted on four challenging rotated object detection datasets DOTA, FAIR1M-1.0, HRSC2016, and UCAS-AOD.","cats":{"data-quality":0}} 201 | {"text":"The results show the state-of-the-art accuracy of the proposed detector.","cats":{"data-quality":0}} 202 | {"text":"However, results from even highly accurate methods require manual verification and correction","cats":{"data-quality":1}} 203 | {"text":"The reviewers corrected 62.8% of the labels and agreed with the model label in 31.9% of cases.","cats":{"data-quality":1}} 204 | {"text":"We learned that our automatic transcription is biased towards the most frequent codes, with a higher degree of misclassification for the lowest frequency codes","cats":{"data-quality":1}} 205 | {"text":"Machine learning methods have proven useful in transcribing historical data. 
.","cats":{"data-quality":0}} 206 | {"text":"Such manual review can be time-consuming and expensive, therefore the objective of this paper was to make it more efficient.","cats":{"data-quality":0}} 207 | {"text":"Previously, we used machine learning to transcribe 2.3 million handwritten occupation codes from the Norwegian 1950 census with high accuracy (97%).","cats":{"data-quality":0}} 208 | {"text":"We manually reviewed the 90,000 (3%) codes with the lowest model confidence.","cats":{"data-quality":0}} 209 | {"text":"We allocated those 90,000 codes to human reviewers, who used our annotation tool to review the codes.","cats":{"data-quality":0}} 210 | {"text":"To assess reviewer agreement, some codes were assigned to multiple reviewers.","cats":{"data-quality":0}} 211 | {"text":"We then analyzed the review results to understand the relationship between accuracy improvements and effort.","cats":{"data-quality":0}} 212 | {"text":"Additionally, we interviewed the reviewers to improve the workflow.","cats":{"data-quality":0}} 213 | {"text":"The reviewers corrected 62.8% of the labels and agreed with the model label in 31.9% of casescertain, or they assigned an invalid label.","cats":{"data-quality":0}} 214 | {"text":"9,000 images were independently reviewed by multiplds the most frequent codes, with a higher degree of misclassification for the lowest frequency codes.","cats":{"data-quality":0}} 215 | {"text":"Our interview findings show that the reviewers did internal quality control and found our custom tool well-suited.","cats":{"data-quality":0}} 216 | {"text":"So, only one reviewer is needed, but they shou","cats":{"data-quality":0}} 217 | {"text":" We advocate for the use of IAA in predicting the labeling quality of individual annotators, leading to cost and time efficiency in data production.","cats":{"data-quality":0}} 218 | {"text":"Additionally, our work highlights the IAA's broader application potential in data-driven research optimization and holds significant implications for large-scale data projects prioritizing efficiency, cost reduction, and high-quality data.","cats":{"data-quality":0}} 219 | {"text":"We present DiffInfinite, a hierarchical diffusion model that generates arbitrarily large histological images while preserving long-range correlation structural information.","cats":{"data-quality":0}} 220 | {"text":"Our approach first generates synthetic segmentation masks, subsequently used as conditions for the high-fidelity generative diffusion process.","cats":{"data-quality":0}} 221 | {"text":"The proposed sampling method can be scaled up to any desired image size while only requiring small patches for fast training.","cats":{"data-quality":0}} 222 | {"text":"Moreover, it can be parallelized more efficiently than previous large-content generation methods while avoiding tiling artefacts.","cats":{"data-quality":0}} 223 | {"text":"The training leverages classifier-free guidance to augment a small, sparsely annotated dataset with unlabelled data.","cats":{"data-quality":0}} 224 | {"text":"Our method alleviates unique challenges in histopathological imaging practice: large-scale information, costly manual annotation, and protective data handling.","cats":{"data-quality":0}} 225 | {"text":"The biological plausibility of DiffInfinite data is validated in a survey by ten experienced pathologists as well as a downstream segmentation task.","cats":{"data-quality":0}} 226 | {"text":"Furthermore, the model scores strongly on anti-copying metrics which is beneficial for the protection 
of patient data.","cats":{"data-quality":0}} 227 | {"text":"Understanding this, we, in this paper, first analyze this lack of granular annotations from available pre-annotated datasets to understand the practical inconsistencies and also perform a detailed survey to look into the human perception surrounding annotations.","cats":{"data-quality":1}} 228 | {"text":"Efficient human activity recognition (HAR) using sensor data needs a significant volume of annotated data.","cats":{"data-quality":0}} 229 | {"text":"The growing volume of unlabelled sensor data has challenged conventional practices for gathering HAR annotations with human-in-the-loop approaches, often leading to the collection of shallower annotations.","cats":{"data-quality":0}} 230 | {"text":"These shallower annotations ignore the fine-grained micro-activities that constitute any complex activities of daily living (ADL). ","cats":{"data-quality":0}} 231 | {"text":"Drawing motivations from these, we next develop the framework AmicroN that can automatically generate micro-activity annotations using locomotive signatures and the available coarse-grain macro-activity labels.","cats":{"data-quality":0}} 232 | {"text":"In the backend, AmicroN applies change-point detection followed by zero-shot learning with activity embeddings to identify the unseen micro-activities in an unsupervised manner.","cats":{"data-quality":0}} 233 | {"text":"Rigorous evaluation on publicly available datasets shows that AmicroN can accurately generate micro-activity annotations with a median F1-score of >0.75.","cats":{"data-quality":0}} 234 | {"text":"Additionally, we also show that AmicroN can be used in a plug-and-play manner with Large Language Models (LLMs) to obtain the micro-activity labels, thus making it more practical for realistic applications.","cats":{"data-quality":0}} 235 | {"text":"This paper presents a large publicly available multi-center lumbar spine magnetic resonance imaging (MRI) dataset with reference segmentations of vertebrae, intervertebral discs (IVDs), and spinal canal.","cats":{"data-quality":0}} 236 | {"text":"The dataset includes 447 sagittal T1 and T2 MRI series from 218 patients with a history of low back pain.","cats":{"data-quality":0}} 237 | {"text":"It was collected from four different hospitals and was divided into a training (179 patients) and validation (39 patients) set.","cats":{"data-quality":0}} 238 | {"text":"An iterative data annotation approach was used by training a segmentation algorithm on a small part of the dataset, enabling semi-automatic segmentation of the remaining images.","cats":{"data-quality":0}} 239 | {"text":"The algorithm provided an initial segmentation, which was subsequently reviewed, manually corrected, and added to the training data.","cats":{"data-quality":0}} 240 | {"text":"We provide reference performance values for this baseline algorithm and nnU-Net, which performed comparably.","cats":{"data-quality":0}} 241 | {"text":"We set up a continuous segmentation challenge to allow for a fair comparison of different segmentation algorithms.","cats":{"data-quality":0}} 242 | {"text":"This study may encourage wider collaboration in the field of spine segmentation, and improve the diagnostic value of lumbar spine MRI.","cats":{"data-quality":0}} 243 | {"text":"But meanwhile, the distributed and isolated nature of data isolation may be complicated by data quality, making it more vulnerable to noisy labels","cats":{"data-quality":1}} 244 | {"text":"Many efforts exist to defend against the negative 
impacts of noisy labels in centralized or federated settings","cats":{"data-quality":1}} 245 | {"text":"Also, we conduct comprehensive experiments to explore the characteristics of these data settings and unravel challenging scenarios on the federated noisy label learning, which may guide method development in the future.","cats":{"data-quality":0}} 246 | {"text":"We highlight the 20 basic settings for more than 5 datasets proposed in our benchmark and standardized simulation pipeline for federated noisy label learning.","cats":{"data-quality":1}} 247 | {"text":"Federated learning has gained popularity for distributed learning without aggregating sensitive data from clients. .","cats":{"data-quality":0}} 248 | {"text":"Many efforts exist to defend against the negative impacts of noisy labels in centralized or federated settings.","cats":{"data-quality":0}} 249 | {"text":"However, there is a lack of a benchis work, we serve the first standardized benchmark that can help researchers fully explore potential federated noisy settings.","cats":{"data-quality":0}} 250 | {"text":"We highlight the 20 basic settings f \\texttt{FedNoisy} is available at \\codeword{https://github.com/SMILELab-FL/FedNoisy}.","cats":{"data-quality":0}} 251 | {"text":"In this paper, we explore different ways of training a model for handwritten text recognition when multiple imperfect or noisy transcriptions are available","cats":{"data-quality":1}} 252 | {"text":"We consider various training configurations, such as selecting a single transcription, retaining all transcriptions, or computing an aggregated transcription from all available annotations.","cats":{"data-quality":0}} 253 | {"text":"In addition, we evaluate the impact of quality-based data selection, where samples with low agreement are removed from the training set.","cats":{"data-quality":0}} 254 | {"text":"Our experiments are carried out on municipal registers of the city of Belfort (France) written between 1790 and 1946.","cats":{"data-quality":0}} 255 | {"text":"% results The results show that computing a consensus transcription or training on multiple transcriptions are good alternatives.","cats":{"data-quality":0}} 256 | {"text":"However, selecting training samples based on the degree of agreement between annotators introduces a bias in the training data and does not improve the res","cats":{"data-quality":0}} 257 | {"text":"The aim of the experiment is to judge the final annotation quality when pre-annotation is used.","cats":{"data-quality":1}} 258 | {"text":"In addition, it evaluates the effect of automatic linguistically-based (rule-formulated) checks and another annotation on the same data available to the annotators, and their influence on annotation quality and efficiency.","cats":{"data-quality":1}} 259 | {"text":"This paper presents an analysis of annotation using an automatic pre-annotation for a mid-level annotation complexity task -- dependency syntax annotation.","cats":{"data-quality":0}} 260 | {"text":"It compares the annotation efforts made by annotators using a pre-annotated version (with a high-accuracy parser) and those made by fully manual annotation. 
","cats":{"data-quality":0}} 261 | {"text":"In addition, it evaluates the effect of automatic linguistically-based (rule-formulated) checkstic annotation which increases the consistency of the resulting annotation without reducing its quality.","cats":{"data-quality":0}} -------------------------------------------------------------------------------- /docs/API/external.md: -------------------------------------------------------------------------------- 1 | ## OpenAIEncoder 2 | 3 | ::: embetter.external.OpenAIEncoder 4 | options: 5 | members: false 6 | 7 | ## AzureOpenAIEncoder 8 | ::: embetter.external.AzureOpenAIEncoder 9 | options: 10 | members: false 11 | 12 | ## `CohereEncoder` 13 | 14 | ::: embetter.external.CohereEncoder 15 | options: 16 | members: false 17 | -------------------------------------------------------------------------------- /docs/API/grab.md: -------------------------------------------------------------------------------- 1 | # Grabbers 2 | 3 | ## ColumnGrabber 4 | 5 | ::: embetter.grab.ColumnGrabber 6 | 7 | ## KeyGrabber 8 | 9 | ::: embetter.grab.KeyGrabber 10 | -------------------------------------------------------------------------------- /docs/API/model.md: -------------------------------------------------------------------------------- 1 | ## DifferenceClassifier 2 | 3 | ::: embetter.model.DifferenceClassifier 4 | -------------------------------------------------------------------------------- /docs/API/multimodal.md: -------------------------------------------------------------------------------- 1 | ## ClipEncoder 2 | 3 | ::: embetter.multi.ClipEncoder 4 | options: 5 | members: false 6 | -------------------------------------------------------------------------------- /docs/API/text.md: -------------------------------------------------------------------------------- 1 | ## TextEncoder 2 | 3 | ::: embetter.text.TextEncoder 4 | options: 5 | members: false 6 | 7 | ## SentenceEncoder 8 | 9 | ::: embetter.text.SentenceEncoder 10 | options: 11 | members: false 12 | 13 | ## MatryoshkaEncoder 14 | 15 | ::: embetter.text.MatryoshkaEncoder 16 | options: 17 | members: false 18 | 19 | ## LiteDocEncoder 20 | 21 | ::: embetter.text.LiteTextEncoder 22 | options: 23 | members: false 24 | 25 | ## KerasNLPEncoder 26 | 27 | ::: embetter.text.KerasNLPEncoder 28 | options: 29 | members: false 30 | 31 | ## spaCyEncoder 32 | 33 | ::: embetter.text.spaCyEncoder 34 | options: 35 | members: false 36 | 37 | ## Sense2VecEncoder 38 | 39 | ::: embetter.text.Sense2VecEncoder 40 | options: 41 | members: false 42 | 43 | ## BytePairEncoder 44 | 45 | ::: embetter.text.BytePairEncoder 46 | options: 47 | members: false 48 | 49 | ## GensimEncoder 50 | 51 | ::: embetter.text.GensimEncoder 52 | options: 53 | members: false 54 | 55 | -------------------------------------------------------------------------------- /docs/API/utils.md: -------------------------------------------------------------------------------- 1 | # Utils 2 | 3 | ## cached 4 | 5 | ::: embetter.utils.cached 6 | 7 | ## batched 8 | 9 | ::: embetter.utils.batched 10 | 11 | ## calc_distances 12 | 13 | ::: embetter.utils.calc_distances 14 | -------------------------------------------------------------------------------- /docs/API/vision.md: -------------------------------------------------------------------------------- 1 | ## ImageLoader 2 | 3 | ::: embetter.vision.ImageLoader 4 | options: 5 | members: false 6 | 7 | ## ColorHistogramEncoder 8 | 9 | ::: embetter.vision.ColorHistogramEncoder 10 | options: 11 | members: false 12 | 13 | ## 
TimmEncoder 14 | 15 | ::: embetter.vision.TimmEncoder 16 | options: 17 | members: false 18 | -------------------------------------------------------------------------------- /docs/applications.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Techniques 3 | --- 4 | 5 | This document contains some tricks, hints and demos of applications that you might want to consider 6 | in combination with this library. 7 | 8 | ## Cache 9 | 10 | Calculating embeddings can be expensive, even costly when you're using external providers. 11 | This is why this library offers an integration with [diskcache](https://grantjenks.com/docs/diskcache/). 12 | That way, you can infer the embeddings once and store them to disk for later. 13 | 14 | Here's an example of how you might run that. 15 | 16 | ```python 17 | from embetter.text import SentenceEncoder 18 | from embetter.utils import cached 19 | 20 | encoder = cached("sentence-enc", SentenceEncoder('all-MiniLM-L6-v2')) 21 | 22 | examples = [f"this is a pretty long text, which is more expensive {i}" for i in range(10_000)] 23 | 24 | # This might be a bit slow ~17.2s on our machine 25 | encoder.transform(examples) 26 | 27 | # This should be quicker ~4.71s on our machine 28 | encoder.transform(examples) 29 | ``` 30 | 31 | Note that you're also able to fetch the precalculated embeddings directly via: 32 | 33 | ```python 34 | from diskcache import Cache 35 | 36 | # Make sure that you use the same name as in `cached` 37 | cache = Cache("sentence-enc") 38 | # Use a string as a key, if it's precalculated you'll get an array back. 39 | cache["this is a pretty long text, which is more expensive 0"] 40 | ``` 41 | 42 | Be mindful of what goes into the encoder that you choose. It's preferable to give it 43 | text as opposed to numpy arrays. Also note that the first time that you'll run this 44 | it will take more time due to the overhead of writing into the cache. 45 | 46 | ## Lite Embeddings 47 | 48 | There are a lot of options out there for pretrained text embeddings but there are also a few noteworthy lightweight techniques that allow you to train your own from scratch. One such technique is to use the [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) 49 | from scikit-learn followed by [TruncatedSVD](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html). The `TfidfVectorizer` even allows 50 | you to specify `analyzer="char"` with `ngram_range=(3, 4)` to encode subwords, which also adds robustness against spelling errors if that's a concern. 51 | 52 | The main thing that's cool about this approach is that the representations can still be very reasonable for a lot of applications _and_ train very quickly. Here's a quick demo: 53 | 54 | ```python 55 | import srsly 56 | from umap import UMAP 57 | from cluestar import plot_text 58 | from embetter.text import learn_lite_doc_embeddings 59 | 60 | # Train embeddings 61 | texts = [ex['text'] for ex in srsly.read_jsonl("datasets/new-dataset.jsonl")] 62 | enc = learn_lite_doc_embeddings(texts, dim=300) 63 | 64 | # Create a 2D UMAP representation 65 | X_orig = enc.transform(texts) # this takes ~56ms 66 | X = UMAP().fit_transform(X_orig) 67 | 68 | # Plot the UMAP representation with the text 69 | plot_text(X, texts) 70 | ``` 71 | 72 | !!! Note 73 | 74 | You can also store the trained embeddings as part of the training-call.
75 | 76 | ```python 77 | enc = learn_lite_doc_embeddings(texts, dim=300, path="stored/on/disk.emb") 78 | ``` 79 | 80 | 81 | 82 | 83 | 84 | 85 | Here's what this chart looks like. Note that you can click and drag to explore! 86 | 87 | 88 | 89 | Let's now consider what a similar chart might look like that uses [Sentence Transformers](https://sbert.net). 90 | 91 | ```python 92 | from embetter.text import SentenceEncoder 93 | 94 | sent_enc = SentenceEncoder() 95 | X_orig = sent_enc.transform(texts) # this takes ~13.5s 96 | X = UMAP().fit_transform(X_orig) 97 | plot_text(X, texts) 98 | ``` 99 | 100 | 101 | 102 | The charts differ, but if you squint you can spot a cluster on the right hand side here that 103 | corresponds with the cluster at the bottom of the previous chart. 104 | 105 | These "litetext" embeddings do overfit on the same words being used. But they are _much_ faster 106 | and still give a reasonable representation for a lot of use-cases. Also note that you don't have 107 | to use our utilities here, you can just create the same pipeline via: 108 | 109 | ```python 110 | from sklearn.decomposition import TruncatedSVD 111 | from sklearn.feature_extraction.text import TfidfVectorizer 112 | from sklearn.pipeline import make_pipeline 113 | 114 | enc = make_pipeline( 115 | TfidfVectorizer(), 116 | TruncatedSVD() 117 | ) 118 | ``` 119 | 120 | Our implementation does a few extra tricks internally to keep things lightweight, but it's really 121 | the same trick. 122 | 123 | ## Difference Models 124 | 125 | Embeddings can be very useful when you're dealing with a deduplication use-case. The thinking 126 | is that items that are close in embedded space might be great candidates to double-check. 127 | 128 | To help investigate this, this library offers a `DifferenceClassifier` utility. 129 | 130 | ![](images/difference-model.png) 131 | 132 | Here's how you might use it. 133 | 134 | ```python 135 | from embetter.model import DifferenceClassifier 136 | from embetter.text import SentenceEncoder 137 | 138 | mod = DifferenceClassifier(enc=SentenceEncoder()) 139 | 140 | # Suppose this is input data 141 | texts1 = ["hello", "firehydrant", "greetings"] 142 | texts2 = ["no", "yes", "greeting"] 143 | 144 | # You will need to have some definition of "similar" 145 | similar = [0, 0, 1] 146 | 147 | # Train a model to detect similarity 148 | mod.fit(X1=texts1, X2=texts2, y=similar) 149 | mod.predict(X1=texts1, X2=texts2) 150 | mod.predict_proba(X1=texts1, X2=texts2) 151 | 152 | # The classifier head is a scikit-learn model, which you could save 153 | # separately if you like. The model can be accessed via: 154 | mod.clf_head 155 | ``` 156 | 157 | The model really is just a light wrapper, but it might make it easier to bootstrap. 158 | 159 | ## Available `SentenceEncoder`s 160 | 161 | There are _many_ available models out there. Just have a look at [MTEB](https://huggingface.co/spaces/mteb/leaderboard). 162 | 163 | Because the `SentenceEncoder` in this library is just a wrapper around `sentence-transformers` you should also 164 | be able to load any model that the library can load. 165 | 166 | ```python 167 | # https://huggingface.co/thenlper/gte-small 168 | model = SentenceEncoder('thenlper/gte-small') 169 | model = SentenceEncoder('thenlper/gte-base') 170 | model = SentenceEncoder('thenlper/gte-large') 171 | ``` 172 | 173 | There are many more models that you can consider. Just be aware that [some models](https://huggingface.co/intfloat/e5-large-v2) expect a prefix to be included in the text that you're encoding.
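For instance, the linked E5 model was trained with `"query: "` and `"passage: "` prefixes, so you need to add them yourself before encoding. A minimal sketch of what that could look like; the prefix strings come from the E5 model card, the rest is plain Python:

```python
from embetter.text import SentenceEncoder

enc = SentenceEncoder('intfloat/e5-large-v2')

docs = ["first document", "second document"]

# e5-style models expect every input to carry a prefix;
# leaving it out tends to degrade the embedding quality.
X_passages = enc.transform([f"passage: {doc}" for doc in docs])
X_query = enc.transform(["query: first document"])
```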
174 | 175 | 176 | ## Speedup with Modal 177 | 178 | Embedding text can be slow, especially when you're running on a CPU. If you wish 179 | to speed up your embedding calculations you may enjoy using [modal](https://modal.com/). 180 | Modal allows you to add a GPU to a Python function simply by adding a decorator. 181 | 182 | Not every encoder in embetter will get a speedup by using a GPU. But we've done some 183 | benchmarks and noticed that 184 | `SentenceEncoder` as well as `ClipEncoder` should both benefit. These components will 185 | also automatically detect when the GPU is available. 186 | 187 | The code below gives an example. 188 | 189 | ```python 190 | import time 191 | import h5py 192 | import modal 193 | 194 | 195 | stub = modal.Stub("example-get-started") 196 | image = (modal.Image.debian_slim() 197 | .pip_install("simsity", "embetter[text]", "h5py") 198 | .run_commands("python -c 'from embetter.text import SentenceEncoder; SentenceEncoder()'")) 199 | 200 | 201 | # This is the function that actually runs the embedding, 202 | # notice that there's a GPU attached. 203 | @stub.function(image=image, gpu="any") 204 | def create(data): 205 | from embetter.text import SentenceEncoder 206 | return SentenceEncoder().transform(data) 207 | 208 | 209 | @stub.local_entrypoint() 210 | def main(): 211 | tic = time.time() 212 | 213 | # You'd need to write your own function to read in the texts 214 | data = read_text() 215 | 216 | # This runs our decorated function on external hardware 217 | X = create.call(data) 218 | 219 | # Next we save it to disk for re-use 220 | with h5py.File('embeddings.h5', 'w') as hf: 221 | hf.create_dataset("embeddings", data=X) 222 | toc = time.time() 223 | print(f"took {toc - tic}s to embed shape {X.shape}") 224 | ``` 225 | 226 | On our own benchmarks, we seem to get a 4-5x speedup with just a minor edit 227 | to the code. This can be extremely helpful when you're trying to embed data 228 | in bulk.
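Once the embeddings are on disk you can read them back later without re-running Modal. A small follow-up sketch, using the same filename and dataset name as the script above:

```python
import h5py

# Re-load the embeddings that `main()` wrote to disk earlier.
with h5py.File("embeddings.h5", "r") as hf:
    X = hf["embeddings"][:]  # a numpy array of shape (n_texts, n_dims)
```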
229 | -------------------------------------------------------------------------------- /docs/images/colorhistogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/colorhistogram.png -------------------------------------------------------------------------------- /docs/images/columngrabber.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/columngrabber.png -------------------------------------------------------------------------------- /docs/images/contrastive-re-use.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/contrastive-re-use.png -------------------------------------------------------------------------------- /docs/images/contrastive-same-weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/contrastive-same-weights.png -------------------------------------------------------------------------------- /docs/images/contrastive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/contrastive.png -------------------------------------------------------------------------------- /docs/images/difference-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/difference-model.png -------------------------------------------------------------------------------- /docs/images/embed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/embed.png -------------------------------------------------------------------------------- /docs/images/feedforward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/feedforward.png -------------------------------------------------------------------------------- /docs/images/gradient.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/gradient.png -------------------------------------------------------------------------------- /docs/images/human-in-the-loop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/human-in-the-loop-1.png -------------------------------------------------------------------------------- /docs/images/human-in-the-loop-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/human-in-the-loop-2.png 
-------------------------------------------------------------------------------- /docs/images/human-in-the-loop-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/human-in-the-loop-3.png -------------------------------------------------------------------------------- /docs/images/human-in-the-loop-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/human-in-the-loop-4.png -------------------------------------------------------------------------------- /docs/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/icon.png -------------------------------------------------------------------------------- /docs/images/imageloader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/imageloader.png -------------------------------------------------------------------------------- /docs/images/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/output.png -------------------------------------------------------------------------------- /docs/images/sense2vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/sense2vec.png -------------------------------------------------------------------------------- /docs/images/sentence-encoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/sentence-encoder.png -------------------------------------------------------------------------------- /docs/images/timm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/timm.png -------------------------------------------------------------------------------- /docs/images/x-finetuned-again.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/x-finetuned-again.png -------------------------------------------------------------------------------- /docs/images/x-finetuned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/x-finetuned.png -------------------------------------------------------------------------------- /docs/images/x-orig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/x-orig.png -------------------------------------------------------------------------------- /docs/index.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | # embetter 4 | 5 | > "Just a bunch of useful embeddings to get started quickly." 6 | 7 |
8 | 9 | Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proof of concepts using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial) but it can also be used together with your favorite ANN solution like [lancedb](https://lancedb.github.io/lancedb/). 10 | 11 | ## Install 12 | 13 | You can install via pip. 14 | 15 | ``` 16 | python -m pip install embetter 17 | ``` 18 | 19 | Many of the embeddings are optional depending on your use-case, so if you 20 | want to be picky and only download the tools that you need: 21 | 22 | ``` 23 | python -m pip install "embetter[text]" 24 | python -m pip install "embetter[sbert]" 25 | python -m pip install "embetter[spacy]" 26 | python -m pip install "embetter[sense2vec]" 27 | python -m pip install "embetter[bpemb]" 28 | python -m pip install "embetter[gensim]" 29 | python -m pip install "embetter[vision]" 30 | python -m pip install "embetter[all]" 31 | ``` 32 | 33 | ## API Design 34 | 35 | This is what's being implemented now. 36 | 37 | ```python 38 | # Helpers to grab text or image from pandas column. 39 | from embetter.grab import ColumnGrabber 40 | 41 | # Representations/Helpers for computer vision 42 | from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder 43 | 44 | # Representations for text 45 | from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder 46 | 47 | # Representations from multi-modal models 48 | from embetter.multi import ClipEncoder 49 | 50 | # Finetuning components 51 | from embetter.finetune import FeedForwardTuner, ContrastiveTuner, ContrastiveLearner, SbertLearner 52 | 53 | # External embedding providers, typically need an API key 54 | from embetter.external import CohereEncoder, OpenAIEncoder 55 | ``` 56 | 57 | All of these components are scikit-learn compatible, which means that you 58 | can apply them as you would normally in a scikit-learn pipeline. Just be aware 59 | that these components are stateless. They won't require training as these 60 | are all pretrained tools. 61 | 62 | ## Text Example 63 | 64 | To run this example, make sure that you `pip install 'embetter[sbert]'`. 65 | 66 | ```python 67 | import pandas as pd 68 | from sklearn.pipeline import make_pipeline 69 | from sklearn.linear_model import LogisticRegression 70 | 71 | from embetter.grab import ColumnGrabber 72 | from embetter.text import SentenceEncoder 73 | 74 | # This pipeline grabs the `text` column from a dataframe 75 | # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2. 76 | text_emb_pipeline = make_pipeline( 77 | ColumnGrabber("text"), 78 | SentenceEncoder('all-MiniLM-L6-v2') 79 | ) 80 | 81 | # This pipeline can also be trained to make predictions, using 82 | # the embedded features.
83 | text_clf_pipeline = make_pipeline( 84 | text_emb_pipeline, 85 | LogisticRegression() 86 | ) 87 | 88 | dataf = pd.DataFrame({ 89 | "text": ["positive sentiment", "super negative"], 90 | "label_col": ["pos", "neg"] 91 | }) 92 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 93 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 94 | ``` 95 | 96 | ## Image Example 97 | 98 | The goal of the API is to allow pipelines like this: 99 | 100 | ```python 101 | import pandas as pd 102 | from sklearn.pipeline import make_pipeline 103 | from sklearn.linear_model import LogisticRegression 104 | 105 | from embetter.grab import ColumnGrabber 106 | from embetter.vision import ImageLoader 107 | from embetter.multi import ClipEncoder 108 | 109 | # This pipeline grabs the `img_path` column from a dataframe 110 | # then it grabs the image paths and turns them into `PIL.Image` objects 111 | # which then get fed into CLIP which can also handle images. 112 | image_emb_pipeline = make_pipeline( 113 | ColumnGrabber("img_path"), 114 | ImageLoader(convert="RGB"), 115 | ClipEncoder() 116 | ) 117 | 118 | dataf = pd.DataFrame({ 119 | "img_path": ["tests/data/thiscatdoesnotexist.jpeg"] 120 | }) 121 | image_emb_pipeline.fit_transform(dataf) 122 | ``` 123 | 124 | ## Batched Learning 125 | 126 | All of the encoding tools you've seen here are also compatible 127 | with the [`partial_fit` mechanic](https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning) 128 | in scikit-learn. That means 129 | you can leverage [scikit-partial](https://github.com/koaning/scikit-partial) 130 | to build pipelines that can handle out-of-core datasets. 131 | -------------------------------------------------------------------------------- /embetter/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from importlib import metadata 3 | except ImportError: # for Python<3.8 4 | import importlib_metadata as metadata 5 | 6 | 7 | __title__ = __name__ 8 | __version__ = metadata.version(__title__) 9 | -------------------------------------------------------------------------------- /embetter/base.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | 3 | 4 | class EmbetterBase(BaseEstimator, TransformerMixin): 5 | """Base class for feature transformers in this library""" 6 | 7 | def fit(self, X, y=None): 8 | """No-op.""" 9 | return self 10 | 11 | def partial_fit(self, X, y=None): 12 | """No-op.""" 13 | return self 14 | -------------------------------------------------------------------------------- /embetter/error.py: -------------------------------------------------------------------------------- 1 | class NotInstalled: 2 | """ 3 | This object is used for optional dependencies. If a backend is not installed we 4 | replace the transformer/language with this object. This allows us to give a friendly 5 | message to the user that they need to install extra dependencies as well as a link 6 | to our documentation page. 
7 | """ 8 | 9 | def __init__(self, tool, dep): 10 | self.tool = tool 11 | self.dep = dep 12 | 13 | msg = f"In order to use {self.tool} you'll need to install via;\n\n" 14 | msg += f"pip install embetter[{self.dep}]\n\n" 15 | self.msg = msg 16 | 17 | def __getattr__(self, *args, **kwargs): 18 | raise ModuleNotFoundError(self.msg) 19 | 20 | def __call__(self, *args, **kwargs): 21 | raise ModuleNotFoundError(self.msg) 22 | -------------------------------------------------------------------------------- /embetter/external/__init__.py: -------------------------------------------------------------------------------- 1 | from embetter.error import NotInstalled 2 | 3 | try: 4 | from ._openai import OpenAIEncoder 5 | except ModuleNotFoundError: 6 | OpenAIEncoder = NotInstalled("OpenAIEncoder", "openai") 7 | 8 | try: 9 | from ._openai import AzureOpenAIEncoder 10 | except ModuleNotFoundError: 11 | AzureOpenAIEncoder = NotInstalled("AzureOpenAIEncoder", "openai") 12 | 13 | try: 14 | from ._cohere import CohereEncoder 15 | except ModuleNotFoundError: 16 | CohereEncoder = NotInstalled("CohereEncoder", "cohere") 17 | 18 | 19 | __all__ = ["CohereEncoder", "OpenAIEncoder", "AzureOpenAIEncoder"] 20 | -------------------------------------------------------------------------------- /embetter/external/_cohere.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from itertools import islice 4 | 5 | from embetter.base import EmbetterBase 6 | 7 | 8 | def _batch(iterable, n=1): 9 | it = iter(iterable) 10 | while batch := list(islice(it, n)): 11 | yield batch 12 | 13 | 14 | class CohereEncoder(EmbetterBase): 15 | """ 16 | Encoder that can numerically encode sentences. 17 | 18 | Note that this is an **external** embedding provider. If their API breaks, so will this component. 19 | 20 | This encoder will require the `COHERE_KEY` environment variable to be set. 21 | If you have it defined in your `.env` file, you can use python-dotenv to load it. 22 | 23 | You also need to install the `cohere` library beforehand. 24 | 25 | ``` 26 | python -m pip install cohere 27 | ``` 28 | 29 | Arguments: 30 | model: name of model, can be "small" or "large" 31 | batch_size: Batch size to send to Cohere. 32 | 33 | **Usage**: 34 | 35 | ```python 36 | import pandas as pd 37 | from sklearn.pipeline import make_pipeline 38 | from sklearn.linear_model import LogisticRegression 39 | 40 | from embetter.grab import ColumnGrabber 41 | from embetter.external import CohereEncoder 42 | from dotenv import load_dotenv 43 | 44 | load_dotenv() # take environment variables from .env. 45 | 46 | # Let's suppose this is the input dataframe 47 | dataf = pd.DataFrame({ 48 | "text": ["positive sentiment", "super negative"], 49 | "label_col": ["pos", "neg"] 50 | }) 51 | 52 | # This pipeline grabs the `text` column from a dataframe 53 | # which then get fed into Cohere's endpoint 54 | text_emb_pipeline = make_pipeline( 55 | ColumnGrabber("text"), 56 | CohereEncoder(model="large") 57 | ) 58 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 59 | 60 | # This pipeline can also be trained to make predictions, using 61 | # the embedded features. 
62 | text_clf_pipeline = make_pipeline( 63 | text_emb_pipeline, 64 | LogisticRegression() 65 | ) 66 | 67 | # Prediction example 68 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 69 | ``` 70 | """ 71 | 72 | def __init__(self, model="large", batch_size=10): 73 | from cohere import Client 74 | 75 | self.client = Client(os.getenv("COHERE_KEY")) 76 | self.model = model 77 | self.batch_size = batch_size 78 | 79 | def transform(self, X, y=None): 80 | """Transforms the text into a numeric representation.""" 81 | result = [] 82 | for b in _batch(X, self.batch_size): 83 | response = self.client.embed(b) 84 | result.extend(response.embeddings) 85 | return np.array(result) 86 | -------------------------------------------------------------------------------- /embetter/external/_openai.py: -------------------------------------------------------------------------------- 1 | from itertools import islice 2 | 3 | import numpy as np 4 | from openai import AzureOpenAI, OpenAI 5 | 6 | from embetter.base import EmbetterBase 7 | 8 | 9 | def _batch(iterable, n=1): 10 | it = iter(iterable) 11 | while batch := list(islice(it, n)): 12 | yield batch 13 | 14 | 15 | class OpenAIEncoder(EmbetterBase): 16 | """ 17 | Encoder that can numerically encode sentences. 18 | 19 | Note that this is an **external** embedding provider. If their API breaks, so will this component. 20 | 21 | 22 | This encoder will require the `OPENAI_API_KEY` environment variable to be set (optionally also `OPENAI_ORG_ID` and `OPENAI_PROJECT_ID`). 23 | If you have it defined in your `.env` file, you can use python-dotenv to load it. 24 | 25 | You also need to install the `openai` library beforehand. 26 | 27 | ``` 28 | python -m pip install openai 29 | ``` 30 | 31 | Arguments: 32 | model: name of the OpenAI embedding model to use, e.g. "text-embedding-ada-002" 33 | batch_size: Batch size to send to OpenAI. 34 | 35 | **Usage**: 36 | 37 | ```python 38 | import pandas as pd 39 | from sklearn.pipeline import make_pipeline 40 | from sklearn.linear_model import LogisticRegression 41 | 42 | from embetter.grab import ColumnGrabber 43 | from embetter.external import OpenAIEncoder 44 | from dotenv import load_dotenv 45 | 46 | load_dotenv() # take environment variables from .env. 47 | 48 | # Let's suppose this is the input dataframe 49 | dataf = pd.DataFrame({ 50 | "text": ["positive sentiment", "super negative"], 51 | "label_col": ["pos", "neg"] 52 | }) 53 | 54 | # This pipeline grabs the `text` column from a dataframe 55 | # which then get fed into OpenAI's endpoint 56 | text_emb_pipeline = make_pipeline( 57 | ColumnGrabber("text"), 58 | OpenAIEncoder() 59 | ) 60 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 61 | 62 | # This pipeline can also be trained to make predictions, using 63 | # the embedded features. 64 | text_clf_pipeline = make_pipeline( 65 | text_emb_pipeline, 66 | LogisticRegression() 67 | ) 68 | 69 | # Prediction example 70 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 71 | ``` 72 | """ 73 | 74 | def __init__(self, model="text-embedding-ada-002", batch_size=25): 75 | # Credentials must be configured before this client is created,
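# typically via the OPENAI_API_KEY (and optionally OPENAI_ORG_ID / OPENAI_PROJECT_ID) environment variables.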
76 | self.client = OpenAI() 77 | self.model = model 78 | self.batch_size = batch_size 79 | 80 | def transform(self, X, y=None): 81 | """Transforms the text into a numeric representation.""" 82 | result = [] 83 | for b in _batch(X, self.batch_size): 84 | resp = self.client.embeddings.create(input=b, model=self.model) # fmt: off 85 | result.extend([_.embedding for _ in resp.data]) 86 | return np.array(result) 87 | 88 | 89 | class AzureOpenAIEncoder(OpenAIEncoder): 90 | """ 91 | Encoder that can numerically encode sentences. 92 | 93 | Note that this is an *external* embedding provider. If their API breaks, so will this component. 94 | 95 | To use this encoder you must provide credentials. Please provide one of the `api_key`, `azure_ad_token`, `azure_ad_token_provider` arguments, or the `AZURE_OPENAI_API_KEY` or `AZURE_OPENAI_AD_TOKEN` environment variables. 96 | You must provide one of the `base_url` or `azure_endpoint` arguments, or the `AZURE_OPENAI_ENDPOINT` environment variable. 97 | Furthermore you must provide either the `api_version` argument or the `OPENAI_API_VERSION` environment variable. 98 | 99 | If you have your environment variables defined in your `.env` file, you can use python-dotenv to load it. 100 | 101 | You also need to install the `openai` library beforehand. 102 | 103 | ``` 104 | python -m pip install openai 105 | ``` 106 | 107 | Arguments: 108 | model: name of model. 109 | batch_size: Batch size to send to AzureOpenAI. 110 | 111 | *Usage*: 112 | 113 | ```python 114 | import pandas as pd 115 | from sklearn.pipeline import make_pipeline 116 | from sklearn.linear_model import LogisticRegression 117 | 118 | from embetter.grab import ColumnGrabber 119 | from embetter.external import AzureOpenAIEncoder 120 | from dotenv import load_dotenv 121 | 122 | load_dotenv() # take environment variables from .env. 123 | 124 | # Let's suppose this is the input dataframe 125 | dataf = pd.DataFrame({ 126 | "text": ["positive sentiment", "super negative"], 127 | "label_col": ["pos", "neg"] 128 | }) 129 | 130 | # This pipeline grabs the `text` column from a dataframe 131 | # which then get fed into Azure OpenAI's endpoint 132 | text_emb_pipeline = make_pipeline( 133 | ColumnGrabber("text"), 134 | AzureOpenAIEncoder() 135 | ) 136 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 137 | 138 | # This pipeline can also be trained to make predictions, using 139 | # the embedded features.
140 | text_clf_pipeline = make_pipeline( 141 | text_emb_pipeline, 142 | LogisticRegression() 143 | ) 144 | 145 | # Prediction example 146 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 147 | ``` 148 | """ 149 | 150 | def __init__(self, model="text-embedding-ada-002", batch_size=25, **kwargs): 151 | self.model = model 152 | self.batch_size = batch_size 153 | self.client = AzureOpenAI(**kwargs) 154 | -------------------------------------------------------------------------------- /embetter/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | from embetter.finetune._forward import FeedForwardTuner 2 | from embetter.finetune._contrastive_tuner import ContrastiveTuner 3 | from embetter.finetune._constrastive_learn import ContrastiveLearner 4 | from embetter.finetune._sbert_learn import SbertLearner 5 | 6 | 7 | __all__ = ["FeedForwardTuner", "ContrastiveTuner", "SbertLearner", "ContrastiveLearner"] 8 | -------------------------------------------------------------------------------- /embetter/finetune/_constrastive_learn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from torch.nn import CosineSimilarity 5 | from torch import nn 6 | 7 | 8 | class ContrastiveNetwork(nn.Module): 9 | def __init__(self, shape_in, hidden_dim): 10 | super(ContrastiveNetwork, self).__init__() 11 | self.embed1 = nn.Linear(shape_in, hidden_dim) 12 | self.embed2 = nn.Linear(hidden_dim, hidden_dim) 13 | self.act = nn.ReLU() 14 | self.cos = nn.CosineSimilarity() 15 | 16 | def forward(self, input1, input2): 17 | """Feed forward.""" 18 | emb_1 = self.embed2(self.act(self.embed1(input1))) 19 | emb_2 = self.embed2(self.act(self.embed1(input2))) 20 | return self.cos(emb_1, emb_2) 21 | 22 | def embed(self, X): 23 | return self.embed2(self.act(self.embed1(X))) 24 | 25 | 26 | class ContrastiveLearner: 27 | """ 28 | A learner model that can finetune on pairs of data on top of numeric embeddings. 29 | 30 | It's similar to the scikit-learn models that you're used to, but it accepts 31 | two inputs `X1` and `X2` and tries to predict if they are similar.
32 | 33 | Arguments: 34 | shape_out: the dimension of the finetuned embedding 35 | batch_size: the batch size during training 36 | epochs: the number of epochs to use while training 37 | learning_rate: the learning rate of the contrastive network 38 | 39 | Usage: 40 | 41 | ```python 42 | from embetter.text import SentenceEncoder 43 | from embetter.finetune import ContrastiveLearner 44 | import random 45 | 46 | sent_enc = SentenceEncoder('all-MiniLM-L6-v2') 47 | learner = ContrastiveLearner(shape_out=300) 48 | 49 | def sample_generator(examples, n_neg=3): 50 | # A generator that assumes examples to be a dictionary of the shape 51 | # {"text": "some text", "cats": {"label_a": True, "label_b": False}} 52 | # this is typically a function that's very custom to your use-case though 53 | labels = set() 54 | for ex in examples: 55 | for cat in ex['cats'].keys(): 56 | if cat not in labels: 57 | labels = labels.union([cat]) 58 | for label in labels: 59 | pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1] 60 | neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0] 61 | for ex in pos_examples: 62 | sample = random.choice(pos_examples) 63 | yield (ex['text'], sample['text'], 1.0) 64 | for n in range(n_neg): 65 | sample = random.choice(neg_examples) 66 | yield (ex['text'], sample['text'], 0.0) 67 | 68 | learn_examples = sample_generator(examples, n_neg=3) 69 | texts1, texts2, y = zip(*learn_examples) 70 | 71 | # This learner operates on numeric embeddings, so encode the texts first 72 | X1, X2 = sent_enc.transform(list(texts1)), sent_enc.transform(list(texts2)) 73 | learner.fit(X1, X2, y) 74 | # You now have an updated model that can create more "finetuned" embeddings 75 | learner.transform(X1) 76 | learner.transform(X2) 77 | ``` 78 | 79 | After the learner is done training it can be used inside of a scikit-learn pipeline as you normally would.
80 | """ 81 | 82 | def __init__( 83 | self, 84 | shape_out: int = 300, 85 | batch_size: int = 16, 86 | epochs: int = 1, 87 | learning_rate=2e-05, 88 | ): 89 | self.learning_rate = learning_rate 90 | self.network_ = None 91 | self.batch_size = batch_size 92 | self.epochs = epochs 93 | self.shape_out = shape_out 94 | 95 | def fit(self, X1, X2, y): 96 | """Finetune an Sbert model based on similarities between two sets of texts.""" 97 | self.network_ = ContrastiveNetwork( 98 | shape_in=X1.shape[1], hidden_dim=self.shape_out 99 | ) 100 | criterion = nn.MSELoss() 101 | optimizer = torch.optim.Adam(self.network_.parameters(), lr=self.learning_rate) 102 | 103 | X1_torch = torch.from_numpy(X1).detach().float() 104 | X2_torch = torch.from_numpy(X2).detach().float() 105 | y_torch = torch.from_numpy(np.array(y)).detach().float() 106 | 107 | dataset = torch.utils.data.TensorDataset(X1_torch, X2_torch, y_torch) 108 | dataloader = torch.utils.data.DataLoader( 109 | dataset, batch_size=self.batch_size, shuffle=True 110 | ) 111 | 112 | for _ in range(self.epochs): # loop over the dataset multiple times 113 | for batch_X1, batch_X2, batch_y in dataloader: 114 | # zero the parameter gradients 115 | optimizer.zero_grad() 116 | 117 | # forward + backward + optimize 118 | cos_sim = self.network_(batch_X1, batch_X2) 119 | loss = criterion(cos_sim, batch_y) 120 | loss.backward() 121 | optimizer.step() 122 | return self 123 | 124 | def transform(self, X, y=None): 125 | """Encode a single batch of inputs.""" 126 | X_torch = torch.from_numpy(X).detach().float() 127 | return self.network_.embed(X_torch).detach().numpy() 128 | 129 | def predict(self, X1, X2): 130 | """Predicts the cosine similarity.""" 131 | emb1 = self.transform(X1) 132 | emb2 = self.transform(X2) 133 | return np.array(CosineSimilarity()(emb1, emb2)) 134 | 135 | def to_disk(self, path): 136 | """Save the finetuned Sbert model.""" 137 | self.sent_tfm.save(path=path) 138 | -------------------------------------------------------------------------------- /embetter/finetune/_contrastive_tuner.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | import random 3 | from collections import defaultdict 4 | from itertools import chain, groupby 5 | 6 | import numpy as np 7 | import torch 8 | from dataclasses import dataclass 9 | 10 | from ._constrastive_learn import ContrastiveLearner 11 | 12 | 13 | @dataclass 14 | class Example: 15 | """Internal example class.""" 16 | 17 | i1: int 18 | i2: int 19 | label: float 20 | 21 | 22 | def generate_pairs_batch(labels, n_neg=3): 23 | """ 24 | Copied with permission from Peter Baumgartners implementation 25 | https://github.com/pmbaumgartner/setfit 26 | """ 27 | # 7x faster than original implementation on small data, 28 | # 14x faster on 10000 examples 29 | pairs = [] 30 | lookup = defaultdict(list) 31 | single_example = {} 32 | indices = np.arange(len(labels)) 33 | for label, grouper in groupby( 34 | ((s, lab) for s, lab in zip(indices, labels)), key=lambda x: x[1] 35 | ): 36 | lookup[label].extend(list(i[0] for i in grouper)) 37 | single_example[label] = len(lookup[label]) == 1 38 | neg_lookup = {} 39 | for current_label in lookup: 40 | negative_options = list( 41 | chain.from_iterable( 42 | [indices for label, indices in lookup.items() if label != current_label] 43 | ) 44 | ) 45 | neg_lookup[current_label] = negative_options 46 | 47 | for current_idx, current_label in zip(indices, labels): 48 | positive_pair = 
random.choice(lookup[current_label]) 49 | if not single_example[current_label]: 50 | # choosing itself as a matched pair seems wrong, 51 | # but we need to account for the case of 1 positive example 52 | # so as long as there's not a single positive example, 53 | # we'll reselect the other item in the pair until it's different 54 | while positive_pair == current_idx: 55 | positive_pair = random.choice(lookup[current_label]) 56 | pairs.append(Example(current_idx, positive_pair, 1)) 57 | for i in range(n_neg): 58 | negative_pair = random.choice(neg_lookup[current_label]) 59 | pairs.append(Example(current_idx, negative_pair, 0)) 60 | 61 | return pairs 62 | 63 | 64 | class ContrastiveTuner(BaseEstimator, TransformerMixin): 65 | """ 66 | Run a contrastive network to finetune the embeddings towards a class. 67 | 68 | Arguments: 69 | hidden_dim: the dimension of the new learned representation 70 | n_neg: number of negative example pairs to sample per positive item 71 | epochs: number of epochs to use for training 72 | learning_rate: learning rate of the contrastive network 73 | """ 74 | 75 | def __init__(self, hidden_dim=50, n_neg=3, epochs=20, learning_rate=0.001) -> None: 76 | self.learner = ContrastiveLearner( 77 | shape_out=hidden_dim, 78 | batch_size=256, 79 | learning_rate=learning_rate, 80 | epochs=epochs, 81 | ) 82 | self.n_neg = n_neg 83 | self.hidden_dim = hidden_dim 84 | self.epochs = epochs 85 | self.learning_rate = learning_rate 86 | 87 | def fit(self, X, y): 88 | """Fits the finetuner.""" 89 | return self.partial_fit(X, y, classes=np.unique(y)) 90 | 91 | def generate_batch(self, X_torch, y): 92 | """Generate a batch of pytorch pairs used for finetuning""" 93 | pairs = generate_pairs_batch(y, n_neg=self.n_neg) 94 | X1 = torch.zeros(len(pairs), X_torch.shape[1]) 95 | X2 = torch.zeros(len(pairs), X_torch.shape[1]) 96 | labels = torch.tensor([ex.label for ex in pairs], dtype=torch.long) 97 | for i, pair in enumerate(pairs): 98 | X1[i] = X_torch[pair.i1] 99 | X2[i] = X_torch[pair.i2] 100 | return X1, X2, labels 101 | 102 | def partial_fit(self, X, y, classes=None): 103 | """Fits the finetuner using the partial_fit API.""" 104 | if not hasattr(self, "_classes"): 105 | if classes is None: 106 | raise ValueError("`classes` must be provided for partial_fit") 107 | self._classes = classes 108 | 109 | X_torch = torch.from_numpy(X).detach().float() 110 | 111 | X1, X2, out = self.generate_batch(X_torch, y=y) 112 | # TODO: change this, we should just generate numpy internally not cast all over 113 | self.learner.fit(np.array(X1), np.array(X2), np.array(out)) 114 | 115 | return self 116 | 117 | def transform(self, X, y=None): 118 | """Transforms the data according to the sklearn api by using the hidden layer.""" 119 | return self.learner.transform(X) 120 | -------------------------------------------------------------------------------- /embetter/finetune/_forward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.preprocessing import LabelEncoder 6 | 7 | 8 | class FeedForwardModel(nn.Module): 9 | """ 10 | The internal model for the FeedForwardTuner 11 | """ 12 | 13 | def __init__(self, input_dim, hidden_dim, output_dim): 14 | super(FeedForwardModel, self).__init__() 15 | self.hidden = nn.Linear(input_dim, hidden_dim) 16 | self.linear = nn.Linear(hidden_dim, output_dim) 17 | self.sigmoid = nn.Sigmoid() 18 | 19 | def 
forward(self, x): 20 | """Runs the forward pass""" 21 | return self.sigmoid(self.linear(self.embed(x))) 22 | 23 | def embed(self, x): 24 | """Runs the embedding pass""" 25 | return self.sigmoid(self.hidden(x)) 26 | 27 | 28 | class FeedForwardTuner(BaseEstimator, TransformerMixin): 29 | """ 30 | Create a feed forward model to finetune the embeddings towards a class. 31 | 32 | Arguments: 33 | hidden_dim: The size of the hidden layer 34 | n_epochs: The number of epochs to run the optimiser for 35 | learning_rate: The learning rate of the feed forward model 36 | """ 37 | 38 | def __init__( 39 | self, hidden_dim=50, n_epochs=500, learning_rate=0.01, batch_size=32 40 | ) -> None: 41 | self.hidden_dim = hidden_dim 42 | self.n_epochs = n_epochs 43 | self.learning_rate = learning_rate 44 | self.batch_size = batch_size 45 | self.label_enc = LabelEncoder() 46 | 47 | def fit(self, X, y): 48 | """Fits the finetuner.""" 49 | return self.partial_fit(X, y, classes=np.unique(y)) 50 | 51 | def partial_fit(self, X, y, classes=None): 52 | """Fits the finetuner using the partial_fit API.""" 53 | if not hasattr(self, "_classes"): 54 | if classes is None: 55 | raise ValueError("`classes` must be provided for partial_fit") 56 | self._classes = classes 57 | self.label_enc.fit(classes) 58 | assert (self._classes == self.label_enc.classes_).all() 59 | # Create a model if it does not exist yet. 60 | if not hasattr(self, "_model"): 61 | self._model = FeedForwardModel( 62 | X.shape[1], self.hidden_dim, len(self._classes) 63 | ) 64 | self._optimizer = torch.optim.Adam( 65 | self._model.parameters(), lr=self.learning_rate 66 | ) 67 | self._criterion = nn.CrossEntropyLoss() 68 | 69 | torch_X = torch.from_numpy(X).detach().float() 70 | torch_y = torch.from_numpy(self.label_enc.transform(y)).detach() 71 | 72 | dataset = torch.utils.data.TensorDataset(torch_X, torch_y) 73 | dataloader = torch.utils.data.DataLoader( 74 | dataset, batch_size=self.batch_size, shuffle=True 75 | ) 76 | 77 | for _ in range(self.n_epochs): 78 | for batch_X, batch_y in dataloader: 79 | self._optimizer.zero_grad() 80 | out = self._model(batch_X) 81 | loss = self._criterion(out, batch_y) 82 | loss.backward() 83 | self._optimizer.step() 84 | 85 | return self 86 | 87 | def transform(self, X, y=None): 88 | """Transforms the data according to the sklearn api by using the hidden layer.""" 89 | Xt = torch.from_numpy(X).float().detach() 90 | return self._model.embed(Xt).detach().numpy() 91 | -------------------------------------------------------------------------------- /embetter/finetune/_sbert_learn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sentence_transformers import SentenceTransformer, InputExample, losses 4 | from torch.utils.data import DataLoader 5 | from torch.nn import CosineSimilarity 6 | 7 | 8 | class SbertLearner: 9 | """ 10 | A learner model that can finetune on pairs of data that leverages SBERT under the hood. 11 | 12 | It's similar to the scikit-learn models that you're used to, but it accepts 13 | two inputs `X1` and `X2` and tries to predict if they are similar. 
14 | 15 | Arguments: 16 | sent_tfm: an instance of a `SentenceTransformer` that you'd like to finetune 17 | batch_size: the batch size during training 18 | epochs: the number of epochs to use while training 19 | warmup_steps: the number of warmup steps before training 20 | 21 | Usage: 22 | 23 | ```python 24 | from sentence_transformers import SentenceTransformer 25 | from embetter.finetune import SbertLearner 26 | import random 27 | 28 | sent_tfm = SentenceTransformer('all-MiniLM-L6-v2') 29 | learner = SbertLearner(sent_tfm) 30 | 31 | def sample_generator(examples, n_neg=3): 32 | # A generator that assumes examples to be a dictionary of the shape 33 | # {"text": "some text", "cats": {"label_a": True, "label_b": False}} 34 | # this is typically a function that's very custom to your use-case though 35 | labels = set() 36 | for ex in examples: 37 | for cat in ex['cats'].keys(): 38 | if cat not in labels: 39 | labels = labels.union([cat]) 40 | for label in labels: 41 | pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1] 42 | neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0] 43 | for ex in pos_examples: 44 | sample = random.choice(pos_examples) 45 | yield (ex['text'], sample['text'], 1.0) 46 | for n in range(n_neg): 47 | sample = random.choice(neg_examples) 48 | yield (ex['text'], sample['text'], 0.0) 49 | 50 | learn_examples = sample_generator(examples, n_neg=3) 51 | X1, X2, y = zip(*learn_examples) 52 | 53 | # Learn a new representation 54 | learner.fit(X1, X2, y) 55 | 56 | # You now have an updated model that can create more "finetuned" embeddings 57 | learner.transform(X1) 58 | learner.transform(X2) 59 | ``` 60 | 61 | After the learner is done training it can be used inside of a scikit-learn pipeline as you normally would. 62 | """ 63 | 64 | def __init__( 65 | self, 66 | sent_tfm: SentenceTransformer, 67 | batch_size: int = 16, 68 | epochs: int = 1, 69 | warmup_steps: int = 100, 70 | ): 71 | self.sent_tfm = sent_tfm 72 | self.batch_size = batch_size 73 | self.epochs = epochs 74 | self.warmup_steps = warmup_steps 75 | 76 | def fit(self, X1, X2, y): 77 | """Finetune an Sbert model based on similarities between two sets of texts.""" 78 | train_examples = [ 79 | InputExample(texts=[x1, x2], label=float(lab)) 80 | for x1, x2, lab in zip(X1, X2, y) 81 | ] 82 | data_loader = DataLoader(train_examples, shuffle=True, batch_size=self.batch_size) 83 | train_loss = losses.CosineSimilarityLoss(self.sent_tfm) 84 | self.sent_tfm.fit( 85 | train_objectives=[(data_loader, train_loss)], 86 | epochs=self.epochs, 87 | warmup_steps=self.warmup_steps, 88 | ) 89 | return self 90 | 91 | def transform(self, X, y=None): 92 | """Encode a single batch of Sbert inputs (usually texts).""" 93 | return self.sent_tfm.encode(X) 94 | 95 | def predict(self, X1, X2): 96 | """Predicts the cosine similarity.""" 97 | # encode directly to torch tensors so that `CosineSimilarity` can consume them 98 | emb1 = self.sent_tfm.encode(X1, convert_to_tensor=True) 99 | emb2 = self.sent_tfm.encode(X2, convert_to_tensor=True) 100 | return np.asarray(CosineSimilarity(dim=1)(emb1, emb2).cpu()) 101 | 102 | def to_disk(self, path): 103 | """Save the finetuned Sbert model.""" 104 | self.sent_tfm.save(path=path) 105 | -------------------------------------------------------------------------------- /embetter/grab.py: -------------------------------------------------------------------------------- 1 | from embetter.base import EmbetterBase 2 | 3 | 4 | class ColumnGrabber(EmbetterBase): 5 | """ 6 | Component that can grab a pandas column as a list.
7 | 8 | ![](https://raw.githubusercontent.com/koaning/embetter/main/docs/images/columngrabber.png) 9 | 10 | This can be useful when dealing with text encoders as these 11 | sometimes cannot deal with pandas columns. 12 | 13 | Arguments: 14 | colname: the column name to grab from a dataframe 15 | 16 | **Usage** 17 | 18 | In essence, the `ColumnGrabber` really just selects a single column. 19 | 20 | ```python 21 | import pandas as pd 22 | from embetter.grab import ColumnGrabber 23 | 24 | # Let's say we start with a csv file with filepaths 25 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]} 26 | df = pd.DataFrame(data) 27 | 28 | # You can use the component in stand-alone fashion 29 | ColumnGrabber("filepaths").fit_transform(df) 30 | ``` 31 | 32 | But the most common way to use the `ColumnGrabber` is as part of a pipeline. 33 | 34 | ```python 35 | import pandas as pd 36 | from sklearn.pipeline import make_pipeline 37 | 38 | from embetter.grab import ColumnGrabber 39 | from embetter.vision import ImageLoader, ColorHistogramEncoder 40 | 41 | # Let's say we start with a csv file with filepaths 42 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]} 43 | df = pd.DataFrame(data) 44 | 45 | # You can use the component in stand-alone fashion 46 | ColumnGrabber("filepaths").fit_transform(df) 47 | 48 | # But let's build a pipeline that grabs the column, turns it 49 | # into an image and embeds it. 50 | pipe = make_pipeline( 51 | ColumnGrabber("filepaths"), 52 | ImageLoader(), 53 | ColorHistogramEncoder() 54 | ) 55 | 56 | pipe.fit_transform(df) 57 | ``` 58 | """ 59 | 60 | def __init__(self, colname: str) -> None: 61 | self.colname = colname 62 | 63 | def transform(self, X, y=None): 64 | """ 65 | Takes a column from pandas and returns it as a list. 66 | """ 67 | return [x for x in X[self.colname]] 68 | 69 | 70 | class KeyGrabber: 71 | """ 72 | Effectively the same thing as the ColumnGrabber, except this is 73 | meant to work on generators of dictionaries instead of dataframes. 74 | """ 75 | 76 | def __init__(self, colname: str) -> None: 77 | self.colname = colname 78 | 79 | def transform(self, X, y=None): 80 | """ 81 | Takes a key from a dictionary (or a stream of dictionaries) and returns the values as a list. 82 | """ 83 | if isinstance(X, dict): 84 | return X[self.colname] 85 | return [x[self.colname] for x in X] 86 | -------------------------------------------------------------------------------- /embetter/model/__init__.py: -------------------------------------------------------------------------------- 1 | from ._diff import DifferenceClassifier 2 | 3 | __all__ = ["DifferenceClassifier"] 4 | -------------------------------------------------------------------------------- /embetter/model/_diff.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn.base import TransformerMixin, ClassifierMixin 5 | 6 | 7 | class DifferenceClassifier: 8 | """ 9 | Classifier for similarity using encoders under the hood. 10 | 11 | It's similar to the scikit-learn models that you're used to, but it accepts 12 | two inputs `X1` and `X2` and tries to predict if they are similar. Effectively 13 | it's just a classifier on top of the absolute difference `abs(X1 - X2)`.
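Because the features are absolute element-wise differences, the classifier is symmetric in its inputs: swapping `X1` and `X2` yields the same prediction.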
14 | 15 | Arguments: 16 | enc: scikit-learn compatible encoder of the input data 17 | clf_head: the classifier to apply at the end 18 | 19 | Usage: 20 | 21 | ```python 22 | from embetter.model import DifferenceClassifier 23 | from embetter.text import SentenceEncoder 24 | 25 | mod = DifferenceClassifier(enc=SentenceEncoder()) 26 | 27 | # Suppose this is input data 28 | texts1 = ["hello", "firehydrant", "greetings"] 29 | texts2 = ["no", "yes", "greeting"] 30 | 31 | # You will need to have some definition of "similar" 32 | similar = [0, 0, 1] 33 | 34 | # Train a model to detect similarity 35 | mod.fit(X1=texts1, X2=texts2, y=similar) 36 | mod.predict(X1=texts1, X2=texts2) 37 | 38 | # The classifier head is a scikit-learn model, which you could save 39 | # separately if you like. The model can be accessed via: 40 | mod.clf_head 41 | ``` 42 | """ 43 | 44 | def __init__(self, enc: TransformerMixin, clf_head: ClassifierMixin = None): 45 | self.enc = enc 46 | self.clf_head = ( 47 | LogisticRegression(class_weight="balanced") if not clf_head else clf_head 48 | ) 49 | 50 | def _calc_feats(self, X1, X2): 51 | enc1 = self.enc.transform(X1) 52 | enc2 = self.enc.transform(X2) 53 | return np.abs(enc1 - enc2) 54 | 55 | def fit(self, X1, X2, y): 56 | self.clf_head.fit(self._calc_feats(X1, X2), y) 57 | return self 58 | 59 | def predict(self, X1, X2): 60 | return self.clf_head.predict(self._calc_feats(X1, X2)) 61 | 62 | def predict_proba(self, X1, X2): 63 | return self.clf_head.predict_proba(self._calc_feats(X1, X2)) 64 | -------------------------------------------------------------------------------- /embetter/multi/__init__.py: -------------------------------------------------------------------------------- 1 | from embetter.error import NotInstalled 2 | 3 | try: 4 | from embetter.multi._clip import ClipEncoder 5 | except ModuleNotFoundError: 6 | ClipEncoder = NotInstalled("ClipEncoder", "sbert") 7 | -------------------------------------------------------------------------------- /embetter/multi/_clip.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import torch 3 | from torch.nn import Linear 4 | from torch.quantization import quantize_dynamic 5 | from sentence_transformers import SentenceTransformer as SBERT 6 | 7 | from embetter.base import EmbetterBase 8 | 9 | 10 | class ClipEncoder(EmbetterBase): 11 | """ 12 | CLIP model that can encode both text and images.
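A minimal usage sketch (assuming the `sbert` extra is installed and re-using the test image that ships with this repository):

```python
import pandas as pd
from sklearn.pipeline import make_pipeline

from embetter.grab import ColumnGrabber
from embetter.vision import ImageLoader
from embetter.multi import ClipEncoder

# Start from a dataframe that holds image filepaths
df = pd.DataFrame({"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]})

# Grab the column, load the images as PIL objects and embed them with CLIP
pipe = make_pipeline(
    ColumnGrabber("filepaths"),
    ImageLoader(),
    ClipEncoder()
)
pipe.fit_transform(df)

# The same encoder also embeds raw text into the shared vector space
ClipEncoder().transform(["a photo of a cat"])
```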
13 | 14 | Under the hood it just wraps around the implementation of [sentence-transformers](https://sbert.net/docs/pretrained_models.html?highlight=clip) 15 | 16 | Arguments: 17 | name: name of model, see available options 18 | device: manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available 19 | quantize: turns on quantization 20 | num_threads: number of threads for pytorch to use, only affects when device=cpu 21 | 22 | The following model names should be supported: 23 | 24 | - `clip-ViT-B-32` 25 | - `clip-ViT-B-16` 26 | - `clip-ViT-L-14` 27 | - `clip-ViT-B-32-multilingual-v1` 28 | """ 29 | 30 | def __init__( 31 | self, name="clip-ViT-B-32", device=None, quantize=False, num_threads=None 32 | ): 33 | if not device: 34 | if torch.cuda.is_available(): 35 | device = torch.device("cuda") 36 | elif torch.backends.mps.is_available(): 37 | device = torch.device("mps") 38 | else: 39 | device = torch.device("cpu") 40 | self.name = name 41 | self.device = device 42 | self.tfm = SBERT(name, device=self.device) 43 | self.num_threads = num_threads 44 | self.quantize = quantize 45 | if quantize: 46 | self.tfm = quantize_dynamic(self.tfm, {Linear}) 47 | if num_threads: 48 | if self.device.type == "cpu": 49 | torch.set_num_threads(num_threads) 50 | 51 | def transform(self, X, y=None): 52 | """Transforms the text or images into a numeric representation.""" 53 | # Convert pd.Series objects to encode compatible 54 | if isinstance(X, pd.Series): 55 | X = X.to_numpy() 56 | 57 | return self.tfm.encode(X) 58 | -------------------------------------------------------------------------------- /embetter/text/__init__.py: -------------------------------------------------------------------------------- 1 | from embetter.error import NotInstalled 2 | from embetter.text._model2vec import TextEncoder 3 | 4 | try: 5 | from embetter.text._sbert import SentenceEncoder, MatrouskaEncoder, MatryoshkaEncoder 6 | except ModuleNotFoundError: 7 | SentenceEncoder = NotInstalled("SentenceEncoder", "sbert") 8 | MatrouskaEncoder = NotInstalled("MatrouskaEncoder", "sbert") 9 | MatryoshkaEncoder = NotInstalled("MatryoshkaEncoder", "sbert") 10 | 11 | try: 12 | from embetter.text._s2v import Sense2VecEncoder 13 | except ModuleNotFoundError: 14 | Sense2VecEncoder = NotInstalled("Sense2VecEncoder", "sense2vec") 15 | 16 | try: 17 | from embetter.text._bpemb import BytePairEncoder 18 | except ModuleNotFoundError: 19 | BytePairEncoder = NotInstalled("BytePairEncoder", "bpemb") 20 | 21 | try: 22 | from embetter.text._spacy import spaCyEncoder 23 | except ModuleNotFoundError: 24 | spaCyEncoder = NotInstalled("spaCyEncoder", "spacy") 25 | 26 | try: 27 | from embetter.text._word2vec import GensimEncoder 28 | except ModuleNotFoundError: 29 | GensimEncoder = NotInstalled("GensimEncoder", "gensim") 30 | 31 | try: 32 | from embetter.text._keras import KerasNLPEncoder 33 | except (ImportError, ModuleNotFoundError): 34 | KerasNLPEncoder = NotInstalled("KerasNLPEncoder", "keras_nlp") 35 | 36 | 37 | from embetter.text._lite import LiteTextEncoder, learn_lite_text_embeddings 38 | 39 | 40 | __all__ = [ 41 | "TextEncoder", 42 | "SentenceEncoder", 43 | "MatrouskaEncoder", 44 | "MatryoshkaEncoder", 45 | "Sense2VecEncoder", 46 | "BytePairEncoder", 47 | "spaCyEncoder", 48 | "GensimEncoder", 49 | "KerasNLPEncoder", 50 | "LiteTextEncoder", 51 | "learn_lite_text_embeddings", 52 | ] 53 | -------------------------------------------------------------------------------- /embetter/text/_bpemb.py:
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pathlib import Path 3 | 4 | from bpemb import BPEmb 5 | 6 | from embetter.base import EmbetterBase 7 | 8 | 9 | class BytePairEncoder(EmbetterBase): 10 | """ 11 | This encoder loads token-free pre-trained subword embeddings. Originally created by 12 | Benjamin Heinzerling and Michael Strube. 13 | 14 | These vectors will be auto-downloaded by the [BPEmb package](https://nlp.h-its.org/bpemb/). 15 | You can also specify "multi" to download multi language embeddings. A full list of available 16 | languages can be found [here](https://nlp.h-its.org/bpemb). The article that 17 | belongs to this work can be found [here](http://www.lrec-conf.org/proceedings/lrec2018/pdf/1049.pdf). 18 | The available vocabulary sizes and dimensionalities can be verified 19 | on the project website. See [here](https://nlp.h-its.org/bpemb/en/) for an 20 | example link in English. Please credit the original authors if you use their work. 21 | 22 | Arguments: 23 | lang: name of the model to load 24 | vs: vocabulary size of the byte pair model 25 | dim: the embedding dimensionality 26 | agg: the aggregation method to reduce many subword vectors into a single one, can be "max", "mean" or "both" 27 | cache_dir: The folder in which downloaded BPEmb files will be cached, can be overwritten with a custom folder. 28 | 29 | **Usage** 30 | 31 | ```python 32 | import pandas as pd 33 | from sklearn.pipeline import make_pipeline 34 | from sklearn.linear_model import LogisticRegression 35 | 36 | from embetter.grab import ColumnGrabber 37 | from embetter.text import BytePairEncoder 38 | 39 | # Let's suppose this is the input dataframe 40 | dataf = pd.DataFrame({ 41 | "text": ["positive sentiment", "super negative"], 42 | "label_col": ["pos", "neg"] 43 | }) 44 | 45 | # This pipeline grabs the `text` column from a dataframe 46 | # which then gets fed into a small English model 47 | text_emb_pipeline = make_pipeline( 48 | ColumnGrabber("text"), 49 | BytePairEncoder(lang="en") 50 | ) 51 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 52 | 53 | # This pipeline can also be trained to make predictions, using 54 | # the embedded features. 55 | text_clf_pipeline = make_pipeline( 56 | text_emb_pipeline, 57 | LogisticRegression() 58 | ) 59 | 60 | # Prediction example 61 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 62 | ``` 63 | """ 64 | 65 | def __init__( 66 | self, 67 | lang: str, 68 | vs: int = 1000, 69 | dim: int = 25, 70 | agg: str = "mean", 71 | cache_dir: Path = None, 72 | ): 73 | self.lang = lang 74 | self.vs = vs 75 | self.dim = dim 76 | self.cache_dir = cache_dir 77 | self.agg = agg 78 | if not cache_dir: 79 | cache_dir = Path.home() / Path(".cache/bpemb") 80 | self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir) 81 | 82 | def fit(self, X, y=None): 83 | """No-op. Merely checks for object inputs per sklearn standard.""" 84 | # Scikit-learn also expects this in the `.fit()` command. 85 | self._check_inputs(X) 86 | return self 87 | 88 | def _check_inputs(self, X): 89 | options = ["mean", "max", "both"] 90 | if self.agg not in options: 91 | raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")
Got {self.agg}.") 92 | 93 | def transform(self, X, y=None): 94 | """Transforms the phrase text into a numeric representation.""" 95 | self._check_inputs(X) 96 | if self.agg == "mean": 97 | return np.array([self.module.embed(x).mean(axis=0) for x in X]) 98 | if self.agg == "max": 99 | return np.array([self.module.embed(x).max(axis=0) for x in X]) 100 | if self.agg == "both": 101 | mean_arr = np.array([self.module.embed(x).max(axis=0) for x in X]) 102 | max_arr = np.array([self.module.embed(x).max(axis=0) for x in X]) 103 | return np.concatenate([mean_arr, max_arr], axis=1) 104 | -------------------------------------------------------------------------------- /embetter/text/_keras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import keras_nlp 4 | from embetter.base import EmbetterBase 5 | 6 | 7 | class KerasNLPEncoder(EmbetterBase): 8 | """ 9 | Encoder that can numerically encode sentences. 10 | 11 | Arguments: 12 | name: name of model, see available options 13 | device: manually override cpu/gpu device, tries to grab gpu automatically when available 14 | quantize: turns on quantization 15 | num_threads: number of treads for pytorch to use, only affects when device=cpu 16 | 17 | The pre-trained model names that you could use can be found [here](https://keras.io/api/keras_nlp/models/). 18 | 19 | **Usage**: 20 | 21 | You can leverage the multiple backends from keras-core by setting the `KERAS_BACKEND` environment variable. 22 | 23 | ```python 24 | import os 25 | # Pick the right setting 26 | os.environ["KERAS_BACKEND"] = "jax" 27 | os.environ["KERAS_BACKEND"] = "torch" 28 | os.environ["KERAS_BACKEND"] = "tensorflow" 29 | ``` 30 | 31 | Once this is set, the following code will automatically use the right backend. 32 | 33 | ```python 34 | import pandas as pd 35 | from sklearn.pipeline import make_pipeline 36 | from sklearn.linear_model import LogisticRegression 37 | 38 | from embetter.grab import ColumnGrabber 39 | from embetter.text import SentenceEncoder 40 | 41 | # Let's suppose this is the input dataframe 42 | dataf = pd.DataFrame({ 43 | "text": ["positive sentiment", "super negative"], 44 | "label_col": ["pos", "neg"] 45 | }) 46 | 47 | # This pipeline grabs the `text` column from a dataframe 48 | # which then get fed into Sentence-Transformers' all-MiniLM-L6-v2. 49 | text_emb_pipeline = make_pipeline( 50 | ColumnGrabber("text"), 51 | KerasNLPEncoder() 52 | ) 53 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 54 | 55 | # This pipeline can also be trained to make predictions, using 56 | # the embedded features. 57 | text_clf_pipeline = make_pipeline( 58 | text_emb_pipeline, 59 | LogisticRegression() 60 | ) 61 | 62 | # Prediction example 63 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 64 | ``` 65 | """ 66 | 67 | def __init__(self, name="bert_tiny_en_uncased"): 68 | self.name = name 69 | self.backbone = keras_nlp.models.BertBackbone.from_preset(name) 70 | self.preprocessor = keras_nlp.models.BertPreprocessor.from_preset(name) 71 | 72 | def transform(self, X, y=None): 73 | """Transforms the text into a numeric representation.""" 74 | if isinstance(X, pd.Series): 75 | X = X.to_numpy() 76 | out = self.backbone(self.preprocessor(X))["pooled_output"] 77 | 78 | # Depending on the backend, return numpy by calling right methods. 
79 | if keras_nlp.src.backend.config.backend() == "torch": 80 | return out.detach().numpy() 81 | else: 82 | return np.asarray(out) 83 | -------------------------------------------------------------------------------- /embetter/text/_lite.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import make_pipeline 2 | from sklearn.decomposition import TruncatedSVD 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | 5 | import itertools as it 6 | from skops.io import dump, load 7 | 8 | 9 | def learn_lite_text_embeddings(text_stream, dim=300, lite=True, path=None, **kwargs): 10 | """ 11 | Function that can train a TF/IDF model followed by SVD to generate dense text representations. 12 | 13 | Arguments: 14 | text_stream: stream of texts to learn the embeddings from 15 | dim: the dimensionality of the dense output vectors 16 | lite: store the learned weights as float16 to keep the model on disk small 17 | path: optional path where the fitted pipeline is saved, via `skops` 18 | 19 | **Usage**: 20 | 21 | Any remaining keyword arguments are passed to the underlying `TfidfVectorizer`. 22 | ```python 23 | from embetter.text import learn_lite_text_embeddings 24 | 25 | # Save a variable that contains the scikit-learn pipeline, but also store on disk. 26 | enc = learn_lite_text_embeddings(generator_of_strings, path="folder/embeddings.skops") 27 | ``` 28 | """ 29 | # Make two streams, keep memory footprint low 30 | stream1, stream2 = it.tee(text_stream) 31 | 32 | # Tf/Idf vectorizer can accept generators! 33 | tfidf = TfidfVectorizer(**kwargs).fit(stream1) 34 | X = tfidf.transform(stream2) 35 | if lite: 36 | # This makes a pretty big difference 37 | tfidf.idf_ = tfidf.idf_.astype("float16") 38 | 39 | # Turn the sparse representation into dense float vectors 40 | svd = TruncatedSVD(n_components=dim).fit(X) 41 | 42 | # This makes it much more lightweight to save 43 | if lite: 44 | svd.components_ = svd.components_.astype("float16") 45 | pipe = make_pipeline(tfidf, svd) 46 | if path: 47 | # Store the fitted pipeline on disk 48 | dump(pipe, path) 49 | return pipe 50 | 51 | 52 | def LiteTextEncoder(path): 53 | """ 54 | Function that looks like a class so that it fits the API. 55 | 56 | Arguments: 57 | path: path where model is saved 58 | 59 | This function can be used to load a model that's saved with `learn_lite_text_embeddings`. 60 | 61 | **Usage**: 62 | 63 | The loaded object is a scikit-learn pipeline, so it can be used like any other transformer. 64 | 65 | ```python 66 | from embetter.text import learn_lite_text_embeddings, LiteTextEncoder 67 | 68 | learn_lite_text_embeddings(generator_of_strings, path="folder/embeddings.skops") 69 | 70 | enc = LiteTextEncoder(path="folder/embeddings.skops") 71 | enc.transform(["encode this example", "and this one"]) 72 | ``` 73 | """ 74 | return load(path, trusted=True) 75 | -------------------------------------------------------------------------------- /embetter/text/_model2vec.py: -------------------------------------------------------------------------------- 1 | 2 | from model2vec import StaticModel 3 | 4 | from embetter.base import EmbetterBase 5 | 6 | 7 | class TextEncoder(EmbetterBase): 8 | """ 9 | Encoder that can numerically encode text using a model from the model2vec library. 10 | 11 | The main benefit of this encoder is that it uses distilled word embeddings, which means that they are super *fast*.
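Roughly speaking there is no deep network to run at inference time: encoding boils down to looking up static token vectors and pooling them.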
12 | 13 | Arguments: 14 | model: name of the model to load, can also be a model2vec `StaticModel` object directly 15 | 16 | The following model names should be supported: 17 | 18 | - `minishlab/potion-base-32M` 19 | - `minishlab/potion-base-8M` 20 | - `minishlab/potion-base-4M` 21 | - `minishlab/potion-base-2M` 22 | - `minishlab/potion-retrieval-32M` 23 | - `minishlab/M2V_multilingual_output` 24 | 25 | You can find more options, and more information, on the [GitHub repository](https://github.com/MinishLab/model2vec?tab=readme-ov-file#model-list). 26 | 27 | **Usage**: 28 | 29 | ```python 30 | import pandas as pd 31 | from sklearn.pipeline import make_pipeline 32 | from sklearn.linear_model import LogisticRegression 33 | 34 | from embetter.grab import ColumnGrabber 35 | from embetter.text import TextEncoder 36 | 37 | # Let's suppose this is the input dataframe 38 | dataf = pd.DataFrame({ 39 | "text": ["positive sentiment", "super negative"], 40 | "label_col": ["pos", "neg"] 41 | }) 42 | 43 | # This pipeline grabs the `text` column from a dataframe 44 | # which then gets fed into model2vec's potion-base-8M model. 45 | text_emb_pipeline = make_pipeline( 46 | ColumnGrabber("text"), 47 | TextEncoder() 48 | ) 49 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 50 | 51 | # This pipeline can also be trained to make predictions, using 52 | # the embedded features. 53 | text_clf_pipeline = make_pipeline( 54 | text_emb_pipeline, 55 | LogisticRegression() 56 | ) 57 | 58 | # Prediction example 59 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 60 | ``` 61 | """ 62 | 63 | def __init__( 64 | self, model="minishlab/potion-base-8M" 65 | ): 66 | if isinstance(model, str): 67 | self.model = StaticModel.from_pretrained(model) 68 | else: 69 | assert isinstance(model, StaticModel), "model must be a string or a StaticModel from model2vec" 70 | self.model = model 71 | 72 | def transform(self, X, y=None): 73 | """Transforms the text into a numeric representation.""" 74 | return self.model.encode(X) 75 | 76 | 77 | -------------------------------------------------------------------------------- /embetter/text/_s2v.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sense2vec import Sense2Vec 3 | 4 | from embetter.base import BaseEstimator 5 | 6 | 7 | class Sense2VecEncoder(BaseEstimator): 8 | """ 9 | Create a [Sense2Vec encoder](https://github.com/explosion/sense2vec), meant to 10 | help when encoding phrases as opposed to sentences. 11 | 12 | Arguments: 13 | path: path to downloaded model 14 | 15 | **Usage** 16 | 17 | ```python 18 | import pandas as pd 19 | from sklearn.pipeline import make_pipeline 20 | from sklearn.linear_model import LogisticRegression 21 | 22 | from embetter.grab import ColumnGrabber 23 | from embetter.text import Sense2VecEncoder 24 | 25 | # Let's suppose this is the input dataframe 26 | dataf = pd.DataFrame({ 27 | "text": ["positive sentiment", "super negative"], 28 | "label_col": ["pos", "neg"] 29 | }) 30 | 31 | # This pipeline grabs the `text` column from a dataframe 32 | # which is then passed to the sense2vec model.
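# (the path should point at an unpacked sense2vec archive, for example the s2v_reddit_2015_md vectors from the sense2vec releases)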
33 | text_emb_pipeline = make_pipeline( 34 | ColumnGrabber("text"), 35 | Sense2VecEncoder("path/to/s2v") 36 | ) 37 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 38 | ``` 39 | """ 40 | 41 | def __init__(self, path: str): 42 | self.path = path 43 | self.s2v = Sense2Vec().from_disk(self.path) 44 | self.shape = self.s2v["duck|NOUN"].shape 45 | 46 | def _to_vector(self, text): 47 | sense = self.s2v.get_best_sense(text) 48 | if not sense: 49 | return np.zeros(shape=self.shape) 50 | return self.s2v[sense] 51 | 52 | def transform(self, X, y=None): 53 | """Transforms the phrase text into a numeric representation.""" 54 | return np.array([self._to_vector(x) for x in X]) 55 | -------------------------------------------------------------------------------- /embetter/text/_sbert.py: -------------------------------------------------------------------------------- 1 | from warnings import warn 2 | 3 | import pandas as pd 4 | import torch 5 | from torch.nn import Linear 6 | from torch.quantization import quantize_dynamic 7 | from sentence_transformers import SentenceTransformer as SBERT 8 | 9 | from embetter.base import EmbetterBase 10 | 11 | 12 | class SentenceEncoder(EmbetterBase): 13 | """ 14 | Encoder that can numerically encode sentences. 15 | 16 | Arguments: 17 | name: name of model, see available options 18 | device: manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available 19 | quantize: turns on quantization 20 | num_threads: number of threads for pytorch to use, only affects when device=cpu 21 | 22 | The following model names should be supported: 23 | 24 | - `all-mpnet-base-v2` 25 | - `multi-qa-mpnet-base-dot-v1` 26 | - `all-distilroberta-v1` 27 | - `all-MiniLM-L12-v2` 28 | - `multi-qa-distilbert-cos-v1` 29 | - `all-MiniLM-L6-v2` 30 | - `multi-qa-MiniLM-L6-cos-v1` 31 | - `paraphrase-multilingual-mpnet-base-v2` 32 | - `paraphrase-albert-small-v2` 33 | - `paraphrase-multilingual-MiniLM-L12-v2` 34 | - `paraphrase-MiniLM-L3-v2` 35 | - `distiluse-base-multilingual-cased-v1` 36 | - `distiluse-base-multilingual-cased-v2` 37 | 38 | You can find more options, and more information, on the [sentence-transformers docs page](https://www.sbert.net/docs/pretrained_models.html#model-overview). 39 | 40 | **Usage**: 41 | 42 | ```python 43 | import pandas as pd 44 | from sklearn.pipeline import make_pipeline 45 | from sklearn.linear_model import LogisticRegression 46 | 47 | from embetter.grab import ColumnGrabber 48 | from embetter.text import SentenceEncoder 49 | 50 | # Let's suppose this is the input dataframe 51 | dataf = pd.DataFrame({ 52 | "text": ["positive sentiment", "super negative"], 53 | "label_col": ["pos", "neg"] 54 | }) 55 | 56 | # This pipeline grabs the `text` column from a dataframe 57 | # which then gets fed into Sentence-Transformers' all-MiniLM-L6-v2. 58 | text_emb_pipeline = make_pipeline( 59 | ColumnGrabber("text"), 60 | SentenceEncoder('all-MiniLM-L6-v2') 61 | ) 62 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 63 | 64 | # This pipeline can also be trained to make predictions, using 65 | # the embedded features.
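# The embeddings come back as a plain numpy array, so any scikit-learn estimator can be stacked on top.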
66 | text_clf_pipeline = make_pipeline( 67 | text_emb_pipeline, 68 | LogisticRegression() 69 | ) 70 | 71 | # Prediction example 72 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 73 | ``` 74 | """ 75 | 76 | def __init__( 77 | self, name="all-MiniLM-L6-v2", device=None, quantize=False, num_threads=None 78 | ): 79 | if not device: 80 | if torch.cuda.is_available(): 81 | device = torch.device("cuda") 82 | elif torch.backends.mps.is_available(): 83 | device = torch.device("mps") 84 | else: 85 | device = torch.device("cpu") 86 | self.name = name 87 | self.device = device 88 | self.tfm = SBERT(name, device=self.device) 89 | self.num_threads = num_threads 90 | self.quantize = quantize 91 | if quantize: 92 | self.tfm = quantize_dynamic(self.tfm, {Linear}) 93 | if num_threads: 94 | if self.device.type == "cpu": 95 | torch.set_num_threads(num_threads) 96 | 97 | def transform(self, X, y=None): 98 | """Transforms the text into a numeric representation.""" 99 | # Convert pd.Series objects to encode compatible 100 | if isinstance(X, pd.Series): 101 | X = X.to_numpy() 102 | 103 | return self.tfm.encode(X) 104 | 105 | 106 | def MatrouskaEncoder(name="tomaarsen/mpnet-base-nli-matryoshka", **kwargs): 107 | warn( 108 | "Please use `MatryoshkaEncoder` instead of `MatrouskaEncoder`. " 109 | "We will use correct spelling going forward and `MatrouskaEncoder` will be deprecated.", 110 | DeprecationWarning, 111 | ) 112 | return MatryoshkaEncoder(name=name, **kwargs) 113 | 114 | 115 | def MatryoshkaEncoder(name="tomaarsen/mpnet-base-nli-matryoshka", **kwargs): 116 | """ 117 | Encoder that can numerically encode sentences. 118 | 119 | This function, which looks like a class, offers a shorthand way to fetch pretrained 120 | [Matryoshka embeddings](https://www.sbert.net/examples/training/matryoshka/README.html). 121 | Under the hood it just returns a `SentenceEncoder` object, but the default name points 122 | to a pretrained Matryoshka model. 123 | 124 | These embeddings are more flexible in the sense that you can more easily reduce the 125 | dimensions without losing as much information. The aforementioned docs give more details. 126 | 127 | **Usage**: 128 | 129 | ```python 130 | import pandas as pd 131 | from sklearn.pipeline import make_pipeline 132 | from sklearn.linear_model import LogisticRegression 133 | 134 | from embetter.grab import ColumnGrabber 135 | from embetter.text import MatryoshkaEncoder 136 | 137 | # Let's suppose this is the input dataframe 138 | dataf = pd.DataFrame({ 139 | "text": ["positive sentiment", "super negative"], 140 | "label_col": ["pos", "neg"] 141 | }) 142 | 143 | # This pipeline grabs the `text` column from a dataframe 144 | # which then gets fed into the pretrained Matryoshka model. 145 | text_emb_pipeline = make_pipeline( 146 | ColumnGrabber("text"), 147 | MatryoshkaEncoder() 148 | ) 149 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 150 | 151 | # This pipeline can also be trained to make predictions, using 152 | # the embedded features.
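# Matryoshka embeddings front-load the information, so you can also slice the output, e.g. X[:, :256], for cheaper vectors that keep most of the signal.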
153 | text_clf_pipeline = make_pipeline( 154 | text_emb_pipeline, 155 | LogisticRegression() 156 | ) 157 | 158 | # Prediction example 159 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 160 | ``` 161 | """ 162 | return SentenceEncoder(name=name, **kwargs) 163 | -------------------------------------------------------------------------------- /embetter/text/_spacy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Union 3 | 4 | import spacy 5 | from spacy.language import Language 6 | 7 | from embetter.base import EmbetterBase 8 | 9 | 10 | class spaCyEncoder(EmbetterBase): 11 | """ 12 | **Usage** 13 | 14 | ```python 15 | import pandas as pd 16 | from sklearn.pipeline import make_pipeline 17 | from sklearn.linear_model import LogisticRegression 18 | 19 | from embetter.grab import ColumnGrabber 20 | from embetter.text import spaCyEncoder 21 | 22 | # Let's suppose this is the input dataframe 23 | dataf = pd.DataFrame({ 24 | "text": ["positive sentiment", "super negative"], 25 | "label_col": ["pos", "neg"] 26 | }) 27 | 28 | # This pipeline grabs the `text` column from a dataframe 29 | # which is then passed to the medium spaCy model. 30 | text_emb_pipeline = make_pipeline( 31 | ColumnGrabber("text"), 32 | spaCyEncoder("en_core_web_md") 33 | ) 34 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 35 | 36 | # This pipeline can also be trained to make predictions, using 37 | # the embedded features. 38 | text_clf_pipeline = make_pipeline( 39 | text_emb_pipeline, 40 | LogisticRegression() 41 | ) 42 | 43 | # Prediction example 44 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 45 | ``` 46 | """ 47 | 48 | def __init__(self, nlp: Union[str, Language], agg: str = "base"): 49 | if isinstance(nlp, str): 50 | self.nlp = spacy.load(nlp, disable=["ner", "tagger", "parser"]) 51 | elif isinstance(nlp, Language): 52 | self.nlp = nlp 53 | else: 54 | raise ValueError("`nlp` must be `str` or spaCy-language object.") 55 | self.agg = agg 56 | 57 | def fit(self, X, y=None): 58 | """No-op. Merely checks for object inputs per sklearn standard.""" 59 | # Scikit-learn also expects this in the `.fit()` command. 60 | self._check_inputs(X) 61 | return self 62 | 63 | def _check_inputs(self, X): 64 | options = ["mean", "max", "both", "base"] 65 | if self.agg not in options: 66 | raise ValueError(f"The `agg` value must be in {options}. 
Got {self.agg}.") 67 | 68 | def transform(self, X, y=None): 69 | """Transforms the phrase text into a numeric representation.""" 70 | self._check_inputs(X) 71 | docs = self.nlp.pipe(X) 72 | if self.agg == "base": 73 | return np.array([d.vector for d in docs]) 74 | token_vectors = [np.array([tok.vector for tok in doc]) for doc in docs] 75 | if self.agg == "mean": 76 | return np.array([v.mean(axis=0) for v in token_vectors]) 77 | if self.agg == "max": 78 | return np.array([v.max(axis=0) for v in token_vectors]) 79 | if self.agg == "both": 80 | mean_arr = np.array([v.mean(axis=0) for v in token_vectors]) 81 | max_arr = np.array([v.max(axis=0) for v in token_vectors]) 82 | return np.concatenate([mean_arr, max_arr], axis=1) 83 | -------------------------------------------------------------------------------- /embetter/text/_word2vec.py: -------------------------------------------------------------------------------- 1 | from typing import List, Literal, Union 2 | 3 | import numpy as np 4 | from gensim import downloader 5 | from gensim.models import KeyedVectors, Word2Vec 6 | from gensim.utils import SaveLoad, tokenize 7 | 8 | from embetter.base import EmbetterBase 9 | 10 | 11 | class GensimEncoder(EmbetterBase): 12 | """ 13 | Encodes text using a static word embedding model. The component uses gensim's default tokenizer. 14 | 15 | Arguments: 16 | model: Model name, path to model on disk, Word2Vec instance or KeyedVectors instance. 17 | agg: Way to aggregate the word embeddings in a document. Can either take the maximum, mean or both of them concatenated. 18 | deacc: Specifies whether accents should be removed when tokenizing the text. 19 | lowercase: Specifies whether the text should be lowercased during tokenization. 20 | 21 | Currently the following models are supported by default: 22 | - `conceptnet-numberbatch-17-06-300` 23 | - `word2vec-ruscorpora-300` 24 | - `word2vec-google-news-300` 25 | - `glove-wiki-gigaword-50` 26 | - `glove-wiki-gigaword-100` 27 | - `glove-wiki-gigaword-200` 28 | - `glove-wiki-gigaword-300` 29 | - `glove-twitter-25` 30 | - `glove-twitter-50` 31 | - `glove-twitter-100` 32 | - `glove-twitter-200` 33 | 34 | **Usage** 35 | 36 | ```python 37 | import pandas as pd 38 | from sklearn.pipeline import make_pipeline 39 | from sklearn.linear_model import LogisticRegression 40 | 41 | from embetter.grab import ColumnGrabber 42 | from embetter.text import Word2VecEncoder 43 | 44 | # Let's suppose this is the input dataframe 45 | dataf = pd.DataFrame({ 46 | "text": ["positive sentiment", "super negative"], 47 | "label_col": ["pos", "neg"] 48 | }) 49 | 50 | # This pipeline grabs the `text` column from a dataframe 51 | # which is then passed to a Word2Vec model. 52 | text_emb_pipeline = make_pipeline( 53 | ColumnGrabber("text"), 54 | Word2VecEncoder("glove-wiki-gigaword-50") 55 | ) 56 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col']) 57 | 58 | # This pipeline can also be trained to make predictions, using 59 | # the embedded features. 
60 | text_clf_pipeline = make_pipeline( 61 | text_emb_pipeline, 62 | LogisticRegression() 63 | ) 64 | 65 | # Prediction example 66 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf) 67 | ``` 68 | """ 69 | 70 | def __init__( 71 | self, 72 | model: Union[str, Word2Vec, KeyedVectors] = "word2vec-google-news-300", 73 | agg: Literal["mean", "max", "both"] = "mean", 74 | deacc: bool = False, 75 | lowercase: bool = False, 76 | ): 77 | self.model = model 78 | if isinstance(model, str): 79 | if model in downloader.info()["models"]: 80 | self.keyed_vectors: KeyedVectors = downloader.load(model)  # type: ignore 81 | else: 82 | loaded_object = SaveLoad().load(self.model) 83 | if isinstance(loaded_object, Word2Vec): 84 | self.keyed_vectors = loaded_object.wv 85 | elif isinstance(loaded_object, KeyedVectors): 86 | self.keyed_vectors = loaded_object 87 | else: 88 | raise TypeError( 89 | "Object loaded from disk is not Word2Vec nor a KeyedVectors instance." 90 | ) 91 | elif isinstance(model, Word2Vec): 92 | self.keyed_vectors: KeyedVectors = model.wv 93 | elif isinstance(model, KeyedVectors): 94 | self.keyed_vectors: KeyedVectors = model 95 | else: 96 | raise TypeError( 97 | f"You should pass a model name, keyed vectors or a Word2Vec model to GensimEncoder, not {type(model)}" 98 | ) 99 | self.agg = agg 100 | self.deacc = deacc 101 | self.lowercase = lowercase 102 | self.n_features_out = ( 103 | self.keyed_vectors.vector_size 104 | if self.agg != "both" 105 | else self.keyed_vectors.vector_size * 2 106 | ) 107 | 108 | def fit(self, X, y=None): 109 | """No-op. Merely checks for object inputs per sklearn standard.""" 110 | # Scikit-learn also expects this in the `.fit()` command. 111 | self._check_inputs(X) 112 | return self 113 | 114 | def _check_inputs(self, X): 115 | options = ["mean", "max", "both"] 116 | if self.agg not in options: 117 | raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")
Got {self.agg}.") 118 | 119 | def _tokenize(self, X) -> List[List[int]]: 120 | token_indices = [] 121 | for text in X: 122 | tokens = tokenize(text, deacc=self.deacc, lowercase=self.lowercase) 123 | indices = [] 124 | for token in tokens: 125 | index = self.keyed_vectors.get_index(token, default=-1) 126 | if index != -1: 127 | indices.append(index) 128 | token_indices.append(indices) 129 | return token_indices 130 | 131 | def transform(self, X, y=None): 132 | """Transforms the phrase text into a numeric representation using word embeddings.""" 133 | self._check_inputs(X) 134 | tokens = self._tokenize(X) 135 | embeddings = np.empty((len(X), self.n_features_out)) 136 | for i_doc, token_indices in enumerate(tokens): 137 | if not len(token_indices): 138 | embeddings[i_doc, :] = np.nan 139 | doc_vectors = self.keyed_vectors.vectors[token_indices] 140 | if self.agg == "mean": 141 | embeddings[i_doc, :] = np.mean(doc_vectors, axis=0) 142 | elif self.agg == "max": 143 | embeddings[i_doc, :] = np.max(doc_vectors, axis=0) 144 | elif self.agg == "both": 145 | mean_vector = np.mean(doc_vectors, axis=0) 146 | max_vector = np.max(doc_vectors, axis=0) 147 | embeddings[i_doc, :] = np.concatenate((mean_vector, max_vector)) 148 | return embeddings 149 | -------------------------------------------------------------------------------- /embetter/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import islice 2 | from typing import Callable, Iterable 3 | 4 | import numpy as np 5 | from diskcache import Cache 6 | from sklearn.base import BaseEstimator 7 | from sklearn.metrics import pairwise_distances 8 | 9 | 10 | def cached(name: str, pipeline: BaseEstimator): 11 | """ 12 | Uses a [diskcache](https://grantjenks.com/docs/diskcache/tutorial.html) in 13 | an attempt to fetch precalculated embeddings from disk instead of inferring them. 14 | This can save on compute, but also cloud credits, depending on the backend 15 | that you're using to generate embeddings. 16 | 17 | Be mindful of what does in to the encoder that you choose. It's preferable to give it 18 | text as opposed to numpy arrays. Also note that the first time that you'll run this 19 | it will take more time due to the overhead of writing into the cache. 20 | 21 | Arguments: 22 | name: the name of the local folder to represent the disk cache 23 | pipeline: the pipeline that you want to cache 24 | 25 | Usage: 26 | ```python 27 | from embetter.text import SentenceEncoder 28 | from embetter.utils import cached 29 | 30 | encoder = cached("sentence-enc", SentenceEncoder('all-MiniLM-L6-v2')) 31 | 32 | examples = [f"this is a pretty long text, which is more expensive {i}" for i in range(10_000)] 33 | 34 | # This might be a bit slow ~17.2s on our machine 35 | encoder.transform(examples) 36 | 37 | # This should be quicker ~4.71s on our machine 38 | encoder.transform(examples) 39 | ``` 40 | 41 | Note that you're also able to fetch the precalculated embeddings directly via: 42 | 43 | ```python 44 | from diskcache import Cache 45 | 46 | # Make sure that you use the same name as in `cached` 47 | cache = Cache("sentence-enc") 48 | # Use a string as a key, if it's precalculated you'll get an array back. 
49 | cache["this is a pretty long text, which is more expensive 0"] 50 | ``` 51 | """ 52 | cache = Cache(name) 53 | 54 | def run_cached(method: Callable): 55 | def wrapped(X, y=None): 56 | results = {i: cache[x] if x in cache else "TODO" for i, x in enumerate(X)} 57 | text_todo = [X[i] for i, x in results.items() if str(x) == "TODO"] 58 | i_todo = [i for i, x in results.items() if str(x) == "TODO"] 59 | out = method(text_todo) 60 | with Cache(cache.directory) as reference: 61 | for i, text, x_tfm in zip(i_todo, text_todo, out): 62 | results[i] = x_tfm 63 | reference.set(text, x_tfm) 64 | return np.array([arr for i, arr in results.items()]) 65 | 66 | return wrapped 67 | 68 | pipeline.transform = run_cached(pipeline.transform) 69 | 70 | return pipeline 71 | 72 | 73 | def batched(iterable: Iterable, n: int = 64): 74 | """ 75 | Takes an iterable and turns it into a batched iterable. 76 | 77 | Arguments: 78 | iterable: the input stream 79 | n: the batch size 80 | """ 81 | if n < 1: 82 | raise ValueError("n must be at least one") 83 | it = iter(iterable) 84 | for batch in tuple(islice(it, n)): 85 | yield batch 86 | 87 | 88 | def calc_distances( 89 | inputs, 90 | anchors, 91 | pipeline, 92 | anchor_pipeline=None, 93 | metric="cosine", 94 | aggregate=np.max, 95 | n_jobs=None, 96 | ): 97 | """ 98 | Shortcut to compare a sequence of inputs to a set of anchors. 99 | 100 | The available metrics are: `cityblock`,`cosine`,`euclidean`,`haversine`,`l1`,`l2`,`manhattan` and `nan_euclidean`. 101 | 102 | You can read a verbose description of the metrics [here](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html#sklearn.metrics.pairwise.distance_metrics). 103 | 104 | Arguments: 105 | inputs: sequence of inputs to calculate scores for 106 | anchors: set/list of anchors to compare against 107 | pipeline: the pipeline to use to calculate the embeddings 108 | anchor_pipeline: the pipeline to apply to the anchors, meant to be used if the anchors should use a different pipeline 109 | metric: the distance metric to use 110 | aggregate: you'll want to aggregate the distances to the different anchors down to a single metric, numpy functions that offer axis=1, like `np.max` and `np.mean`, can be used 111 | n_jobs: set to -1 to use all cores for calculation 112 | """ 113 | X_input = pipeline.transform(inputs) 114 | if anchor_pipeline: 115 | X_anchors = anchor_pipeline.transform(anchors) 116 | else: 117 | X_anchors = pipeline.transform(anchors) 118 | 119 | X_dist = pairwise_distances(X_input, X_anchors, metric=metric, n_jobs=n_jobs) 120 | return aggregate(X_dist, axis=1) 121 | -------------------------------------------------------------------------------- /embetter/vision/__init__.py: -------------------------------------------------------------------------------- 1 | from embetter.error import NotInstalled 2 | from embetter.vision._colorhist import ColorHistogramEncoder 3 | from embetter.vision._loader import ImageLoader 4 | 5 | try: 6 | from embetter.vision._torchvis import TimmEncoder 7 | except ModuleNotFoundError: 8 | TimmEncoder = NotInstalled("TimmEncoder", "vision") 9 | 10 | 11 | __all__ = ["ImageLoader", "ColorHistogramEncoder", "TimmEncoder"] 12 | -------------------------------------------------------------------------------- /embetter/vision/_colorhist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from embetter.base import EmbetterBase 4 | 5 | 6 | class ColorHistogramEncoder(EmbetterBase): 7 | 
""" 8 | Encoder that generates an embedding based on the color histogram of the image. 9 | 10 | Arguments: 11 | n_buckets: number of buckets per color 12 | 13 | **Usage**: 14 | 15 | ```python 16 | import pandas as pd 17 | from sklearn.pipeline import make_pipeline 18 | 19 | from embetter.grab import ColumnGrabber 20 | from embetter.vision import ImageLoader, ColorHistogramEncoder 21 | 22 | # Let's say we start we start with a csv file with filepaths 23 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]} 24 | df = pd.DataFrame(data) 25 | 26 | # Let's build a pipeline that grabs the column, turns it 27 | # into an image and embeds it. 28 | pipe = make_pipeline( 29 | ColumnGrabber("filepaths"), 30 | ImageLoader(), 31 | ColorHistogramEncoder() 32 | ) 33 | 34 | # This pipeline can now encode each image in the dataframe 35 | pipe.fit_transform(df) 36 | ``` 37 | """ 38 | 39 | def __init__(self, n_buckets=256): 40 | self.n_buckets = n_buckets 41 | 42 | def transform(self, X, y=None): 43 | """ 44 | Takes a sequence of `PIL.Image` and returns a numpy array representing 45 | a color histogram for each. 46 | """ 47 | output = np.zeros((len(X), self.n_buckets * 3)) 48 | for i, x in enumerate(X): 49 | arr = np.array(x) 50 | output[i, :] = np.concatenate( 51 | [ 52 | np.histogram( 53 | arr[:, :, 0].flatten(), 54 | bins=np.linspace(0, 255, self.n_buckets + 1), 55 | )[0], 56 | np.histogram( 57 | arr[:, :, 1].flatten(), 58 | bins=np.linspace(0, 255, self.n_buckets + 1), 59 | )[0], 60 | np.histogram( 61 | arr[:, :, 2].flatten(), 62 | bins=np.linspace(0, 255, self.n_buckets + 1), 63 | )[0], 64 | ] 65 | ) 66 | return output 67 | -------------------------------------------------------------------------------- /embetter/vision/_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | from embetter.base import EmbetterBase 5 | 6 | 7 | class ImageLoader(EmbetterBase): 8 | """ 9 | Component that can turn filepaths into a list of PIL.Image objects. 10 | 11 | Arguments: 12 | convert: Color [conversion setting](https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.convert) from the Python image library. 13 | out: What kind of image output format to expect. 14 | 15 | **Usage** 16 | 17 | You can use the `ImageLoader` in standalone fashion. 18 | 19 | ```python 20 | from embetter.vision import ImageLoader 21 | 22 | filepath = "tests/data/thiscatdoesnotexist.jpeg" 23 | ImageLoader(convert="RGB").fit_transform([filepath]) 24 | ``` 25 | 26 | But it's more common to see it part of a pipeline. 27 | 28 | ```python 29 | import pandas as pd 30 | from sklearn.pipeline import make_pipeline 31 | 32 | from embetter.grab import ColumnGrabber 33 | from embetter.vision import ImageLoader, ColorHistogramEncoder 34 | 35 | # Let's say we start we start with a csv file with filepaths 36 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]} 37 | df = pd.DataFrame(data) 38 | 39 | # Let's build a pipeline that grabs the column, turns it 40 | # into an image and embeds it. 
41 | pipe = make_pipeline( 42 | ColumnGrabber("filepaths"), 43 | ImageLoader(), 44 | ColorHistogramEncoder() 45 | ) 46 | 47 | pipe.fit_transform(df) 48 | ``` 49 | 50 | """ 51 | 52 | def __init__(self, convert: str = "RGB", out: str = "pil") -> None: 53 | self.convert = convert 54 | self.out = out 55 | 56 | def fit(self, X, y=None): 57 | """ 58 | No actual "fitting" happens in this method, but it does check the input arguments 59 | per sklearn convention. 60 | """ 61 | if self.out not in ["pil", "numpy"]: 62 | raise ValueError( 63 | f"Output format parameter out={self.out} must be either pil/numpy." 64 | ) 65 | return self 66 | 67 | def transform(self, X, y=None): 68 | """ 69 | Turn file paths into PIL images or numpy arrays containing pixel values. 70 | """ 71 | if self.out == "pil": 72 | return [Image.open(x).convert(self.convert) for x in X] 73 | if self.out == "numpy": 74 | return np.array([np.array(Image.open(x).convert(self.convert)) for x in X]) 75 | -------------------------------------------------------------------------------- /embetter/vision/_torchvis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import timm 3 | from timm.data import resolve_data_config 4 | from timm.data.transforms_factory import create_transform 5 | 6 | from embetter.base import EmbetterBase 7 | 8 | 9 | class TimmEncoder(EmbetterBase): 10 | """ 11 | Use a pretrained vision model to generate embeddings. Embeddings 12 | are provided via the lovely `timm` library. 13 | 14 | You can find a list of available models [here](https://rwightman.github.io/pytorch-image-models/models/). 15 | 16 | Arguments: 17 | name: name of the model to use 18 | encode_predictions: output the predictions instead of the pooled embedding layer before 19 | 20 | **Usage**: 21 | 22 | ```python 23 | import pandas as pd 24 | from sklearn.pipeline import make_pipeline 25 | 26 | from embetter.grab import ColumnGrabber 27 | from embetter.vision import ImageLoader, TimmEncoder 28 | 29 | # Let's say we start with a csv file with filepaths 30 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]} 31 | df = pd.DataFrame(data) 32 | 33 | # Let's build a pipeline that grabs the column, turns it 34 | # into an image and embeds it. 35 | pipe = make_pipeline( 36 | ColumnGrabber("filepaths"), 37 | ImageLoader(), 38 | TimmEncoder(name="mobilenetv3_large_100") 39 | ) 40 | 41 | # This pipeline can now encode each image in the dataframe 42 | pipe.fit_transform(df) 43 | ``` 44 | """ 45 | 46 | def __init__(self, name="mobilenetv3_large_100", encode_predictions=False): 47 | self.name = name 48 | self.encode_predictions = encode_predictions 49 | self.model = timm.create_model(name, pretrained=True, num_classes=0) 50 | if self.encode_predictions: 51 | self.model = timm.create_model(name, pretrained=True) 52 | self.config = resolve_data_config({}, model=self.model) 53 | self.transform_img = create_transform(**self.config) 54 | 55 | def transform(self, X, y=None): 56 | """ 57 | Transforms grabbed images into numeric representations.
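Each image is first resized and normalised by the preprocessing transform that `timm` resolves for the chosen model.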
58 | """ 59 | batch = [self.transform_img(x).unsqueeze(0) for x in X] 60 | return np.array([self.model(x).squeeze(0).detach().numpy() for x in batch]) 61 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Embetter Docs 2 | site_description: 'Scikit-Learn compatible embeddings' 3 | site_copy: Embetter offers embeddings for rapid-prototyping and finetuning in scikit-learn. 4 | repo_url: https://github.com/koaning/embetter 5 | nav: 6 | - Home: index.md 7 | - Techniques: applications.md 8 | - API: 9 | - Text: API/text.md 10 | - Vision: API/vision.md 11 | - MultiModal: API/multimodal.md 12 | - External: API/external.md 13 | - Finetuners: API/finetune.md 14 | - Model: API/model.md 15 | plugins: 16 | - mkdocstrings: 17 | handlers: 18 | python: 19 | options: 20 | annotations_path: brief 21 | show_root_heading: false 22 | show_root_toc_entry: false 23 | show_symbol_type_heading: true 24 | theme: 25 | name: material 26 | font: 27 | text: Inter 28 | code: Jetbrains Mono 29 | logo: images/icon.png 30 | palette: 31 | primary: white 32 | features: 33 | - toc.integrate 34 | - navigation.tabs 35 | - navigation.tabs.sticky 36 | - navigation.sections 37 | - navigation.expand 38 | - navigation.path 39 | - navigation.indexes 40 | - toc.follow 41 | - content.code.copy 42 | - content.code.select 43 | - content.code.annotate 44 | markdown_extensions: 45 | - pymdownx.highlight: 46 | use_pygments: true 47 | - pymdownx.superfences 48 | - attr_list 49 | - md_in_html 50 | - admonition 51 | 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from setuptools import find_packages, setup 4 | 5 | base_packages = [ 6 | "scikit-learn>=1.0.0", 7 | "pandas>=1.0.0", 8 | "diskcache>=5.6.1", 9 | "skops>=0.8.0", 10 | "model2vec" 11 | ] 12 | 13 | sbert_pkgs = ["sentence-transformers>=2.2.2"] 14 | sense2vec_pkgs = ["sense2vec==2.0.0"] 15 | bpemb_packages = ["bpemb>=0.3.3"] 16 | spacy_packages = ["spacy>=3.5.0"] 17 | gensim_packages = ["gensim>=4.3.1", "scipy<1.13.0"] 18 | 19 | text_packages = sense2vec_pkgs + bpemb_packages + gensim_packages 20 | 21 | vision_packages = ["timm>=0.6.7"] 22 | 23 | pytorch_packages = ["torch>=1.12.0"] 24 | 25 | openai_packages = ["openai>=1.59.8"] 26 | 27 | cohere_packages = ["cohere>=4.11.2"] 28 | 29 | 30 | docs_packages = [ 31 | "mkdocs-material==9.6.9", 32 | "mkdocstrings==0.29.0", 33 | "mkdocstrings-python==1.16.0", 34 | "mktestdocs==0.2.4", 35 | ] 36 | 37 | test_packages = [ 38 | "interrogate>=1.5.0", 39 | "pytest>=4.0.2", 40 | "ruff", 41 | "pre-commit>=2.2.0", 42 | "mktestdocs==0.2.4", 43 | "datasets==2.8.0", 44 | "matplotlib==3.4.3", 45 | "pytest-xdist", 46 | ] 47 | 48 | all_packages = base_packages + text_packages + vision_packages + openai_packages 49 | dev_packages = all_packages + docs_packages + test_packages 50 | 51 | 52 | setup( 53 | name="embetter", 54 | version="0.7.0", 55 | author="Vincent D. 
Warmerdam", 56 | packages=find_packages(exclude=["notebooks", "docs", "datasets"]), 57 | description="Just a bunch of useful embeddings to get started quickly.", 58 | long_description=pathlib.Path("README.md").read_text(), 59 | long_description_content_type="text/markdown", 60 | license_files=("LICENCE",), 61 | url="https://koaning.github.io/embetter/", 62 | project_urls={ 63 | "Documentation": "https://koaning.github.io/embetter/", 64 | "Source Code": "https://github.com/koaning/embetter/", 65 | "Issue Tracker": "https://github.com/koaning/embetter/issues", 66 | }, 67 | install_requires=base_packages, 68 | extras_require={ 69 | "gensim": gensim_packages + base_packages, 70 | "sense2vec": sense2vec_pkgs + base_packages, 71 | "sbert": sbert_pkgs + base_packages, 72 | "spacy": spacy_packages + base_packages, 73 | "bpemb": bpemb_packages + base_packages, 74 | "text": text_packages + base_packages, 75 | "vision": vision_packages + base_packages, 76 | "pytorch": pytorch_packages + base_packages, 77 | "openai": openai_packages + base_packages, 78 | "cohere": cohere_packages + base_packages, 79 | "all": all_packages, 80 | "docs": docs_packages, 81 | "dev": dev_packages, 82 | }, 83 | classifiers=[ 84 | "Intended Audience :: Science/Research", 85 | "Programming Language :: Python :: 3", 86 | "License :: OSI Approved :: MIT License", 87 | "Topic :: Scientific/Engineering", 88 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 89 | ], 90 | ) 91 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/en.wiki.bpe.vs1000.d25.w2v.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/tests/data/en.wiki.bpe.vs1000.d25.w2v.bin -------------------------------------------------------------------------------- /tests/data/en.wiki.bpe.vs1000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/tests/data/en.wiki.bpe.vs1000.model -------------------------------------------------------------------------------- /tests/data/thiscatdoesnotexist.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/tests/data/thiscatdoesnotexist.jpeg -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from embetter.grab import ColumnGrabber 3 | 4 | 5 | def test_grab_column(): 6 | """Ensure that we can grab a text column.""" 7 | data = [{"text": "hi", "foo": 1}, {"text": "yes", "foo": 2}] 8 | dataframe = pd.DataFrame(data) 9 | out = ColumnGrabber("text").fit_transform(dataframe) 10 | assert out == ["hi", "yes"] 11 | -------------------------------------------------------------------------------- /tests/test_docs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from mktestdocs import check_md_file, check_docstring 3 |
from embetter.vision import ColorHistogramEncoder, TimmEncoder, ImageLoader 4 | from embetter.text import SentenceEncoder, BytePairEncoder 5 | from embetter.grab import ColumnGrabber 6 | from embetter.model import DifferenceClassifier 7 | 8 | 9 | def test_readme(): 10 | """Readme needs to be accurate""" 11 | check_md_file(fpath="README.md") 12 | 13 | 14 | # def test_finetune_docs(): 15 | # """Docs need to be accurate""" 16 | # check_md_file(fpath="docs/finetuners.md", memory=True) 17 | 18 | 19 | # I'm not testing spaCy, sense2vec because those docs would require 20 | # us to download `en_core_web_md` on every CI. Which is too heavy. 21 | objects = [ 22 | ColumnGrabber, 23 | SentenceEncoder, 24 | ColorHistogramEncoder, 25 | TimmEncoder, 26 | ImageLoader, 27 | BytePairEncoder, 28 | DifferenceClassifier, 29 | ] 30 | 31 | 32 | @pytest.mark.parametrize("func", objects, ids=lambda d: d.__name__) 33 | def test_docstring(func): 34 | """Check the docstrings of the components""" 35 | check_docstring(obj=func) 36 | -------------------------------------------------------------------------------- /tests/test_text.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import numpy as np 4 | import pytest 5 | from gensim.models import Word2Vec 6 | from gensim.utils import tokenize 7 | from spacy.language import Language 8 | from spacy.vocab import Vocab 9 | 10 | from embetter.text import ( 11 | SentenceEncoder, 12 | GensimEncoder, 13 | spaCyEncoder, 14 | MatryoshkaEncoder, 15 | TextEncoder, 16 | ) 17 | from embetter.utils import cached 18 | 19 | test_sentences = [ 20 | "This is a test sentence!", 21 | "And this is another one", 22 | "\rUnicode stuff: ♣️,♦️,❤️,♠️\n", 23 | ] 24 | 25 | 26 | @pytest.mark.parametrize("setting", ["max", "mean", "both"]) 27 | def test_word2vec(setting): 28 | """Check if one can train and use a very simple word embedding model.""" 29 | vector_size = 25 30 | sentences = [list(tokenize(sent)) for sent in test_sentences] 31 | model = Word2Vec( 32 | sentences=sentences, vector_size=vector_size, window=3, min_count=1 33 | ) 34 | encoder = GensimEncoder(model, agg=setting) 35 | output = encoder.fit_transform(test_sentences) 36 | assert isinstance(output, np.ndarray) 37 | out_dim = vector_size if setting != "both" else vector_size * 2 38 | assert output.shape == (len(test_sentences), out_dim) 39 | # This tests whether it can load the model from disk 40 | with tempfile.NamedTemporaryFile() as fp: 41 | model.save(fp) 42 | encoder = GensimEncoder(fp.name, agg=setting) 43 | encoder.transform(test_sentences) 44 | assert repr(encoder) 45 | 46 | 47 | @pytest.mark.parametrize("encoder", [MatryoshkaEncoder, SentenceEncoder]) 48 | def test_basic_sentence_encoder(encoder): 49 | """Check correct dimensions and repr for SentenceEncoder.""" 50 | enc = encoder() 51 | # Embedding dim of underlying model 52 | output_dim = enc.tfm._modules["1"].word_embedding_dimension 53 | output = enc.fit_transform(test_sentences) 54 | assert isinstance(output, np.ndarray) 55 | assert output.shape == (len(test_sentences), output_dim) 56 | # scikit-learn configures repr dynamically from defined attributes. 57 | # To test correct implementation we should test if calling repr breaks. 
58 |     assert repr(enc)
59 | 
60 | 
61 | def test_basic_text_encoder():
62 |     """Check correct dimensions and repr for TextEncoder."""
63 |     enc = TextEncoder()
64 |     output = enc.fit_transform(test_sentences)
65 |     assert isinstance(output, np.ndarray)
66 |     assert repr(enc)
67 | 
68 | 
69 | @pytest.fixture()
70 | def nlp():
71 |     """Just a fixture with a lightweight spaCy lang"""
72 |     vector_data = {
73 |         "red": np.array([1.0, 0.0]),
74 |         "green": np.array([0.5, 0.5]),
75 |         "blue": np.array([0.0, 1.0]),
76 |         "purple": np.array([0.0, 1.0]),
77 |     }
78 | 
79 |     vocab = Vocab(strings=list(vector_data.keys()))
80 |     for word, vector in vector_data.items():
81 |         vocab.set_vector(word, vector)
82 |     return Language(vocab=vocab)
83 | 
84 | 
85 | @pytest.mark.parametrize("setting", ["max", "mean", "both"])
86 | def test_basic_spacy(setting, nlp):
87 |     """Check correct dimensions and repr for spaCyEncoder."""
88 |     encoder = spaCyEncoder(nlp, agg=setting)
89 |     # Embedding dim of underlying model
90 |     output = encoder.fit_transform(test_sentences)
91 |     assert isinstance(output, np.ndarray)
92 |     assert output.shape == (len(test_sentences), 4 if setting == "both" else 2)
93 |     # scikit-learn configures repr dynamically from defined attributes.
94 |     # To test correct implementation we should test if calling repr breaks.
95 |     assert repr(encoder)
96 | 
97 | 
98 | def test_basic_spacy_cached(nlp, tmpdir):
99 |     """Just an e2e test for the cache."""
100 |     encoder = spaCyEncoder(nlp)
101 |     output_before = encoder.transform(test_sentences)
102 | 
103 |     # Now we cache it
104 |     encoder = cached(tmpdir, encoder)
105 |     output_during = encoder.transform(test_sentences)
106 | 
107 |     encoder = cached(tmpdir, encoder)
108 |     output_after = encoder.transform(test_sentences)
109 |     assert (output_before == output_during).all()
110 |     assert (output_during == output_after).all()
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | from embetter.utils import calc_distances
4 | from embetter.text import SentenceEncoder
5 | 
6 | 
7 | def test_calc_distances():
8 |     """Make sure that the aggregation works as expected"""
9 |     text_in = ["hi there", "no", "what is this then"]
10 | 
11 |     dists1 = calc_distances(
12 |         text_in, ["greetings", "something else"], SentenceEncoder(), aggregate=np.min
13 |     )
14 |     dists2 = calc_distances(
15 |         text_in,
16 |         ["greetings", "something unrelated"],
17 |         SentenceEncoder(),
18 |         aggregate=np.min,
19 |     )
20 |     assert np.isclose(dists1.min(), dists2.min())
--------------------------------------------------------------------------------
/tests/test_vision.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from embetter.vision import ImageLoader, ColorHistogramEncoder, TimmEncoder
3 | 
4 | 
5 | @pytest.mark.parametrize("n_buckets", [5, 10, 25, 128])
6 | def test_color_hist_resize(n_buckets):
7 |     """Make sure we can resize and that the output shape fits."""
8 |     X = ImageLoader().fit_transform(["tests/data/thiscatdoesnotexist.jpeg"])
9 |     shape_out = ColorHistogramEncoder(n_buckets=n_buckets).fit_transform(X).shape
10 |     shape_exp = (1, n_buckets * 3)
11 |     assert shape_exp == shape_out
12 | 
13 | 
14 | @pytest.mark.parametrize("encode_predictions,size", [(True, 1000), (False, 1280)])
15 | def test_basic_timm(encode_predictions, size):
16 |     """Super basic check for torch image model."""
17 |     model = TimmEncoder("mobilenetv2_120d", encode_predictions=encode_predictions)
18 |     X = ImageLoader().fit_transform(["tests/data/thiscatdoesnotexist.jpeg"])
19 |     out = model.fit_transform(X)
20 |     assert out.shape == (1, size)
--------------------------------------------------------------------------------