├── .flake8
├── .github
│   └── workflows
│       ├── style.yml
│       └── unittest.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENCE
├── Makefile
├── README.md
├── datasets
│   ├── data-quality.jsonl
│   └── new-dataset.jsonl
├── docs
│   ├── API
│   │   ├── external.md
│   │   ├── grab.md
│   │   ├── model.md
│   │   ├── multimodal.md
│   │   ├── text.md
│   │   ├── utils.md
│   │   └── vision.md
│   ├── applications.md
│   ├── images
│   │   ├── colorhistogram.png
│   │   ├── columngrabber.png
│   │   ├── contrastive-re-use.png
│   │   ├── contrastive-same-weights.png
│   │   ├── contrastive.png
│   │   ├── difference-model.png
│   │   ├── embed.png
│   │   ├── feedforward.png
│   │   ├── gradient.png
│   │   ├── human-in-the-loop-1.png
│   │   ├── human-in-the-loop-2.png
│   │   ├── human-in-the-loop-3.png
│   │   ├── human-in-the-loop-4.png
│   │   ├── icon.png
│   │   ├── imageloader.png
│   │   ├── output.png
│   │   ├── sense2vec.png
│   │   ├── sentence-encoder.png
│   │   ├── timm.png
│   │   ├── x-finetuned-again.png
│   │   ├── x-finetuned.png
│   │   └── x-orig.png
│   ├── index.md
│   └── vegalite
│       ├── lite_embed1.json
│       └── lite_embed2.json
├── embetter
│   ├── __init__.py
│   ├── base.py
│   ├── error.py
│   ├── external
│   │   ├── __init__.py
│   │   ├── _cohere.py
│   │   └── _openai.py
│   ├── finetune
│   │   ├── __init__.py
│   │   ├── _constrastive_learn.py
│   │   ├── _contrastive_tuner.py
│   │   ├── _forward.py
│   │   └── _sbert_learn.py
│   ├── grab.py
│   ├── model
│   │   ├── __init__.py
│   │   └── _diff.py
│   ├── multi
│   │   ├── __init__.py
│   │   └── _clip.py
│   ├── text
│   │   ├── __init__.py
│   │   ├── _bpemb.py
│   │   ├── _keras.py
│   │   ├── _lite.py
│   │   ├── _model2vec.py
│   │   ├── _s2v.py
│   │   ├── _sbert.py
│   │   ├── _spacy.py
│   │   └── _word2vec.py
│   ├── utils.py
│   └── vision
│       ├── __init__.py
│       ├── _colorhist.py
│       ├── _loader.py
│       └── _torchvis.py
├── mkdocs.yml
├── setup.py
└── tests
    ├── __init__.py
    ├── data
    │   ├── en.wiki.bpe.vs1000.d25.w2v.bin
    │   ├── en.wiki.bpe.vs1000.model
    │   └── thiscatdoesnotexist.jpeg
    ├── test_base.py
    ├── test_docs.py
    ├── test_text.py
    ├── test_utils.py
    └── test_vision.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 160
3 | ignore = E501, C901
4 | extend-ignore = E203, W503
--------------------------------------------------------------------------------
/.github/workflows/style.yml:
--------------------------------------------------------------------------------
1 | name: Style Checks
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - main
10 |
11 | jobs:
12 | build:
13 | runs-on: ubuntu-latest
14 | strategy:
15 | matrix:
16 | python-version: ["3.12"]
17 |
18 | steps:
19 |       - uses: actions/checkout@v4
20 |       - name: Set up Python ${{ matrix.python-version }}
21 |         uses: actions/setup-python@v4
22 |         with:
23 |           python-version: ${{ matrix.python-version }}
25 | - name: Install Testing Dependencies
26 | run: python -m pip install ruff
27 | - name: Ruff
28 | if: always()
29 | run: ruff check embetter tests setup.py
30 |
--------------------------------------------------------------------------------
/.github/workflows/unittest.yml:
--------------------------------------------------------------------------------
1 | name: Code Checks
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | branches:
9 | - main
10 |
11 | jobs:
12 | build:
13 | if: ${{ always() }}
14 | runs-on: ubuntu-latest
15 | strategy:
16 | matrix:
17 | python-version: ["3.9", "3.11"]
18 |
19 | steps:
20 |       - uses: actions/checkout@v4
21 | - name: Install uv
22 | uses: astral-sh/setup-uv@v2
23 | - name: Set up Python ${{ matrix.python-version }}
24 | run: uv python install ${{ matrix.python-version }}
25 | - name: Set up venv
26 | run: uv venv
27 | - name: Install Base Dependencies
28 | run: |
29 | uv pip install -e '.[dev]'
30 | uv pip install -e '.[sbert]'
31 | - name: Prep CI tests
32 | run: |
33 | mkdir -p ~/.cache/bpemb/en
34 | mv tests/data/en.wiki.bpe.vs1000.d25.w2v.bin ~/.cache/bpemb/en
35 | mv tests/data/en.wiki.bpe.vs1000.model ~/.cache/bpemb/en
36 | - name: Unittest
37 | run: uv run pytest -n auto -vv
38 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | *.ipynb
131 | .vscode
132 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v4.6.0
6 | hooks:
7 | - id: trailing-whitespace
8 | - id: end-of-file-fixer
9 | - id: check-yaml
10 | - id: check-added-large-files
11 | - repo: https://github.com/astral-sh/ruff-pre-commit
12 | # Ruff version.
13 | rev: v0.4.3
14 | hooks:
15 | # Run the linter.
16 | - id: ruff
17 | types_or: [ python, pyi, jupyter ]
18 | args: [ --fix ]
19 | # Run the formatter.
20 | - id: ruff-format
21 | types_or: [ python, pyi, jupyter ]
22 |
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Vincent D. Warmerdam
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: docs
2 |
3 | ruff:
4 | python -m ruff check embetter tests setup.py --fix
5 |
6 | test:
7 | pytest -n auto -vv
8 |
9 | install:
10 | python -m pip install -e ".[dev]"
11 |
12 | pypi:
13 | python setup.py sdist
14 | python setup.py bdist_wheel --universal
15 | twine upload dist/*
16 |
17 | clean:
18 | rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache
19 |
20 | check: clean ruff test clean
21 |
22 | docs:
23 | cp README.md docs/index.md
24 | python -m mkdocs serve
25 |
26 | deploy-docs:
27 | cp README.md docs/index.md
28 | python -m mkdocs gh-deploy
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # embetter
3 |
4 | > "Just a bunch of useful embeddings for scikit-learn pipelines, to get started quickly."
5 |
6 |
7 |
8 |
9 |
10 | Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proofs of concept using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial), and it can be used together with your favorite ANN solution, like [lancedb](https://lancedb.github.io/lancedb/).
11 |
12 | ## Install
13 |
14 | You can install via pip.
15 |
16 | ```
17 | python -m pip install embetter
18 | ```
19 |
20 | Many of the embeddings are optional, depending on your use case. If you only
21 | want to install the tools that you need, pick from these extras:
22 |
23 | ```
24 | python -m pip install "embetter[text]"
25 | python -m pip install "embetter[spacy]"
26 | python -m pip install "embetter[sense2vec]"
27 | python -m pip install "embetter[gensim]"
28 | python -m pip install "embetter[bpemb]"
29 | python -m pip install "embetter[vision]"
30 | python -m pip install "embetter[all]"
31 | ```
32 |
33 | ## API Design
34 |
35 | Here's an overview of the components that are currently implemented.
36 |
37 | ```python
38 | # Helpers to grab text or images from a pandas column.
39 | from embetter.grab import ColumnGrabber
40 |
41 | # Representations/Helpers for computer vision
42 | from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder
43 |
44 | # Representations for text
45 | from embetter.text import SentenceEncoder, MatryoshkaEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder, TextEncoder
46 |
47 | # Representations from multi-modal models
48 | from embetter.multi import ClipEncoder
49 |
50 | # Finetuning components
51 | from embetter.finetune import FeedForwardTuner, ContrastiveTuner, ContrastiveLearner, SbertLearner
52 |
53 | # External embedding providers, which typically need an API key
54 | from embetter.external import CohereEncoder, OpenAIEncoder
55 | ```
56 |
57 | All of these components are scikit-learn compatible, which means that you
58 | can apply them as you would normally in a scikit-learn pipeline. Just be
59 | aware that these components are stateless: they don't require training,
60 | because they are all pretrained tools.
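
Because the components are stateless, there is nothing to learn during `fit` and you can call `transform` directly. Here's a minimal sketch of that idea (assuming `embetter[sbert]` is installed; the model name and inputs are just examples):

```python
from embetter.text import SentenceEncoder

# Nothing is trained here: the encoder simply wraps a pretrained model.
encoder = SentenceEncoder('all-MiniLM-L6-v2')
X = encoder.transform(["hello there", "general kenobi"])
# X is a numpy array with one embedding row per input text,
# e.g. shape (2, 384) for this particular model.
```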
61 |
62 | ## Text Example
63 |
64 | To run this example, make sure that you `pip install 'embetter[sbert]'`.
65 |
66 | ```python
67 | import pandas as pd
68 | from sklearn.pipeline import make_pipeline
69 | from sklearn.linear_model import LogisticRegression
70 |
71 | from embetter.grab import ColumnGrabber
72 | from embetter.text import SentenceEncoder
73 |
74 | # This pipeline grabs the `text` column from a dataframe,
75 | # which is then fed into Sentence-Transformers' all-MiniLM-L6-v2.
76 | text_emb_pipeline = make_pipeline(
77 | ColumnGrabber("text"),
78 | SentenceEncoder('all-MiniLM-L6-v2')
79 | )
80 |
81 | # This pipeline can also be trained to make predictions, using
82 | # the embedded features.
83 | text_clf_pipeline = make_pipeline(
84 | text_emb_pipeline,
85 | LogisticRegression()
86 | )
87 |
88 | dataf = pd.DataFrame({
89 | "text": ["positive sentiment", "super negative"],
90 | "label_col": ["pos", "neg"]
91 | })
92 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
93 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
94 | ```
95 |
96 | ## Image Example
97 |
98 | The goal of the API is to allow pipelines like this:
99 |
100 | ```python
101 | import pandas as pd
102 | from sklearn.pipeline import make_pipeline
103 | from sklearn.linear_model import LogisticRegression
104 |
105 | from embetter.grab import ColumnGrabber
106 | from embetter.vision import ImageLoader
107 | from embetter.multi import ClipEncoder
108 |
109 | # This pipeline grabs the `img_path` column from a dataframe,
110 | # loads those image paths into `PIL.Image` objects,
111 | # and feeds them into CLIP, which can also handle images.
112 | image_emb_pipeline = make_pipeline(
113 | ColumnGrabber("img_path"),
114 | ImageLoader(convert="RGB"),
115 | ClipEncoder()
116 | )
117 |
118 | dataf = pd.DataFrame({
119 | "img_path": ["tests/data/thiscatdoesnotexist.jpeg"]
120 | })
121 | image_emb_pipeline.fit_transform(dataf)
122 | ```
123 |
124 | ## Batched Learning
125 |
126 | All of the encoding tools you've seen here are also compatible
127 | with the [`partial_fit` mechanic](https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning)
128 | in scikit-learn. That means
129 | you can leverage [scikit-partial](https://github.com/koaning/scikit-partial)
130 | to build pipelines that can handle out-of-core datasets.
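
As a minimal sketch of the idea (using plain scikit-learn rather than scikit-partial; the CSV path, column names, and class labels below are hypothetical):

```python
import pandas as pd
from sklearn.linear_model import SGDClassifier

from embetter.grab import ColumnGrabber
from embetter.text import SentenceEncoder

grabber = ColumnGrabber("text")
encoder = SentenceEncoder('all-MiniLM-L6-v2')
clf = SGDClassifier(loss="log_loss")

# Stream the dataset in chunks; the encoder is stateless, so only the
# classifier accumulates state through `partial_fit`.
for chunk in pd.read_csv("large-dataset.csv", chunksize=1000):
    X = encoder.transform(grabber.transform(chunk))
    clf.partial_fit(X, chunk["label"], classes=["pos", "neg"])
```

With scikit-partial, the same components can be wrapped into a single pipeline object that exposes `partial_fit` end to end.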
131 |
132 |
--------------------------------------------------------------------------------
/datasets/data-quality.jsonl:
--------------------------------------------------------------------------------
1 | {"text":"By leveraging this diversity, the collected dataset and the collection system aim to achieve higher recognition accuracy.","cats":{"new-dataset":0,"data-quality":0}}
2 | {"text":"In this paper, we study linear regression applied to data structured on a manifold.","cats":{"new-dataset":0,"data-quality":0}}
3 | {"text":"We assume that the data manifold is smooth and is embedded in a Euclidean space, and our objective is to reveal the impact of the data manifold's extrinsic geometry on the regression.","cats":{"new-dataset":0,"data-quality":0}}
4 | {"text":"Large language models trained for safety and harmlessness remain susceptible to adversarial misuse, as evidenced by the prevalence of \"jailbreak\" attacks on early releases of ChatGPT that elicit undesired behavior.","cats":{"new-dataset":0,"data-quality":0}}
5 | {"text":"Our work opens up new possibilities for modeling very long sequences, e.g., treating a whole corpus or even the entire Internet as a sequence.","cats":{"new-dataset":0,"data-quality":0}}
6 | {"text":"However, we identify issues with the dataset quality and evaluation metric.","cats":{"new-dataset":0,"data-quality":1}}
7 | {"text":"We will release our annotation scheme, the corpus, and codes to the research community to alleviate the scarcity of labeled data in this domain.","cats":{"new-dataset":1,"data-quality":1}}
8 | {"text":"Extensive experiments are conducted to demonstrate the effectiveness of our proposed method.","cats":{"new-dataset":0,"data-quality":0}}
9 | {"text":"Compared to a variety of baselines, our method achieves superior results.","cats":{"new-dataset":0,"data-quality":0}}
10 | {"text":"Previous segmentation methods for noisy label problems only utilize a single image while the potential of leveraging the correlation between images has been overlooked.","cats":{"new-dataset":0,"data-quality":1}}
11 | {"text":"Experiments with both synthetic and real-world label noise demonstrate that our method outperforms recent state-of-the-art robust segmentation approaches.","cats":{"new-dataset":0,"data-quality":1}}
12 | {"text":"We detail corpus statistics and demonstrate high inter-annotator agreement.","cats":{"new-dataset":0,"data-quality":0}}
13 | {"text":"Alongside the images, we provide ground-truth annotations for several learning tasks, including semantic segmentation, object detection, and counting.","cats":{"new-dataset":0,"data-quality":0}}
14 | {"text":"However, even manually labeled datasets contain errors, not to mention automatically labeled ones.","cats":{"data-quality":1}}
15 | {"text":"Label error is a ubiquitous problem in annotated data.","cats":{"data-quality":1}}
16 | {"text":"After demonstrating that our methodology empirically outperforms other algorithms for label error detection, we apply our approach to discover many label errors in the CelebA image tagging dataset.","cats":{"data-quality":1}}
17 | {"text":"These properties highlight a tradeoff between classification error probability and error-correction capabilities of label encodings.","cats":{"data-quality":0}}
18 | {"text":"In this work, we for the first time introduce a benchmark for label error detection methods on object detection datasets as well as a label error detection method and a number of baselines.","cats":{"data-quality":1}}
19 | {"text":"Label encodings found by RLEL result in lower or comparable errors to manually designed label encodings.","cats":{"data-quality":1}}
20 | {"text":"We also propose an improved self-labeling loss; it is robust to pseudo-labeling errors and enforces stronger fairness.","cats":{"data-quality":1}}
21 | {"text":"Inferencing unlabeled data from labeled data is an error-prone process.","cats":{"data-quality":1}}
22 | {"text":"However, creating such large keypoint labels is time-consuming and costly, and is often error-prone due to inconsistent labeling.","cats":{"data-quality":0}}
23 | {"text":"The losses are computed with respect to the predictions and the noisy labels including simulated label errors, aiming at detecting the latter.","cats":{"data-quality":1}}
24 | {"text":"PseudoAugments outperforms pseudo labeling by mitigating pseudo labeling errors and generating diverse fused training scenes.","cats":{"data-quality":1}}
25 | {"text":"Our model is also able to maintain high classification accuracy with very few labels, with only 7.79% error when only using 145 labels.","cats":{"data-quality":0}}
26 | {"text":"Detecting errors in KGs is challenging since the patterns of errors are unknown and diverse, while ground-truth labels are rare or even unavailable.","cats":{"data-quality":1}}
27 | {"text":"We analyze the factors affecting this approximation error and design a pseudo-label clustering generation method to reduce the approximation error.","cats":{"data-quality":1}}
28 | {"text":"To ameliorate the impact of label errors, we equipped our method with a novel negative label sampling strategy to strengthen the model robustness.","cats":{"data-quality":1}}
29 | {"text":"We propose an extension of the Confident Learning framework to this setting, as well as a label quality score that ranks examples with label errors much higher than those which are correctly labeled.","cats":{"data-quality":1}}
30 | {"text":"The later case can generate dense flow labels but the interpolated events are prone to errors.","cats":{"data-quality":0}}
31 | {"text":"Improper fingerprint localization and finger labeling errors lead to poor matching performance.","cats":{"data-quality":0}}
32 | {"text":"Our experiments show that our method is robust to linguistic labels with poor orthography and alignment errors.","cats":{"data-quality":1}}
33 | {"text":"We derive an upper bound for the generalization error that is linear in the clients' label noise level.","cats":{"data-quality":1}}
34 | {"text":"For example, for the IMDB text data with known labeling errors, a 14% boost is shown.","cats":{"data-quality":1}}
35 | {"text":"Large amounts of label error substantially degrades the quality of deep learning models.","cats":{"data-quality":1}}
36 | {"text":"We simulate four different types of randomly introduced label errors on train and test sets of well-labeled object detection datasets.","cats":{"data-quality":1}}
37 | {"text":"We prove that semi-supervised labels improve the downstream error bound whereas noisy labels have limited effects under such a paradigm.","cats":{"data-quality":1}}
38 | {"text":"This paper provides an exact characterization of the expected generalization error (gen-error) for semi-supervised learning (SSL) with pseudo-labeling via the Gibbs algorithm.","cats":{"data-quality":0}}
39 | {"text":"However, corresponding class labels are noisy when provided by error-prone annotators, e.g., crowd workers.","cats":{"data-quality":1}}
40 | {"text":"Most existing methods utilize the off-the-shelf pose or parsing networks as pseudo labels, which are prone to error.","cats":{"data-quality":0}}
41 | {"text":"The result is an SSL classification framework explicitly designed to overcome inevitable pseudo-label errors.","cats":{"data-quality":1}}
42 | {"text":"Here we consider the task of finding sentences that contain label errors in token classification datasets.","cats":{"data-quality":1}}
43 | {"text":"Scaling sequence length has become a critical demand in the era of large language models.","cats":{"data-quality":0}}
44 | {"text":"However, existing methods struggle with either computational complexity or model expressivity, rendering the maximum sequence length restricted.","cats":{"data-quality":0}}
45 | {"text":"In this work, we introduce LongNet, a Transformer variant that can scale sequence length to more than 1 billion tokens, without sacrificing the performance on shorter sequences.","cats":{"data-quality":0}}
46 | {"text":"Specifically, we propose dilated attention, which expands the attentive field exponentially as the distance grows.","cats":{"data-quality":0}}
47 | {"text":"Experiments results demonstrate that LongNet yields strong performance on both long-sequence modeling and general language tasks.","cats":{"data-quality":0}}
48 | {"text":"Large Language Models (LLMs) have demonstrated impressive planning abilities in single-agent embodied tasks across various domains.","cats":{"data-quality":0}}
49 | {"text":"However, their capacity for planning and communication in multi-agent cooperation remains unclear, even though these are crucial skills for intelligent embodied agents.","cats":{"data-quality":0}}
50 | {"text":"In this paper, we present a novel framework that utilizes LLMs for multi-agent cooperation and tests it in various embodied environments.","cats":{"data-quality":0}}
51 | {"text":"Our framework enables embodied agents to plan, communicate, and cooperate with other embodied agents or humans to accomplish long-horizon tasks efficiently.","cats":{"data-quality":0}}
52 | {"text":"We demonstrate that recent LLMs, such as GPT-4, can surpass strong planning-based methods and exhibit emergent effective communication using our framework without requiring fine-tuning or few-shot prompting.","cats":{"data-quality":0}}
53 | {"text":"We also discover that LLM-based agents that communicate in natural language can earn more trust and cooperate more effectively with humans.","cats":{"data-quality":0}}
54 | {"text":"For QE in particular, high-quality labeled data is often lacking due to the high-cost and effort associated with labeling such data.","cats":{"data-quality":0}}
55 | {"text":"With many possible classes to consider, data annotators are likely to make errors when labeling such data in practice.","cats":{"data-quality":1}}
56 | {"text":"However, it usually suffers from a lack of high-quality datasets due to high annotation cost, inter-observer variability, human annotator error, and errors in computer-generated labels.","cats":{"data-quality":0}}
57 | {"text":"For such bone structure analyses, deep learning technologies are promising but require high-quality labeled data for the learning, while the data labeling is costly.","cats":{"data-quality":0}}
58 | {"text":"However, agreement between annotators is often low, leading to inconsistent labels that hinder the reliability of models.","cats":{"data-quality":1}}
59 | {"text":"Our experiments show that this approach consistently improves inter-annotator agreement and annotation accuracy.","cats":{"data-quality":1}}
60 | {"text":"We advocate for the use of IAA in predicting the labeling quality of individual annotators, leading to cost and time efficiency in data production.","cats":{"data-quality":1}}
61 | {"text":"This paper presents a novel approach of leveraging Inter-Annotator Agreement (IAA), traditionally used for assessing labeling consistency, to optimize Data Management Operations (DMOps).","cats":{"data-quality":1}}
62 | {"text":"Our study illustrates that different labeling methodologies directly impact the annotations' quality, as well as the capabilities of a deep learning classifier trained with the data respectively.","cats":{"data-quality":1}}
63 | {"text":"However, such annotations may fail in practice because of the change in annotation requirements, application scenarios, and modeling goals, where label validation and relabeling by domain experts are required.","cats":{"data-quality":1}}
64 | {"text":"However, selecting training samples based on the degree of agreement between annotators introduces a bias in the training data and does not improve the results.","cats":{"data-quality":1}}
65 | {"text":"However, these annotations are inherently subjective and some of the instances are hard to classify, resulting in noisy annotations due to error or lack of agreement.","cats":{"data-quality":1}}
66 | {"text":"We propose and evaluate an additional application of our method leading to the detection of annotation errors.","cats":{"data-quality":1}}
67 | {"text":"However, arbitrating the final annotation is not always effective because new biases might be produced during the process, especially when there are significant variations among annotations.","cats":{"data-quality":1}}
68 | {"text":"A two-step human annotation and inter-annotator agreement study guarantee the high quality of the PcMSP corpus.","cats":{"data-quality":0}}
69 | {"text":"We observe a striking correlation between the model's and humans' annotation: Categories with consistent human annotations (>$0.9$ inter-rater reliability, IRR) also display higher human-model agreement (>$0.7$), while categories with less consistent human annotations ($0.7$-$0.8$ IRR) correspondingly demonstrate lower human-model agreement ($0.3$-$0.5$).","cats":{"data-quality":1}}
70 | {"text":"We propose two metrics to audit the noise of annotations.","cats":{"data-quality":1}}
71 | {"text":"Whereas such annotation is costly and hard to scale, significantly holding back the development of the research.","cats":{"data-quality":0}}
72 | {"text":"A key challenge in effectively combining partial annotation with self-training to reduce annotation cost is determining which sub-structures to select to label.","cats":{"data-quality":1}}
73 | {"text":"We hypothesize two failure modes of safety training: competing objectives and mismatched generalization.","cats":{"data-quality":0}}
74 | {"text":"Competing objectives arise when a model's capabilities and safety goals conflict, while mismatched generalization occurs when safety training fails to generalize to a domain for which capabilities exist.","cats":{"data-quality":0}}
75 | {"text":"We find that vulnerabilities persist despite the extensive red-teaming and safety-training efforts behind these models.","cats":{"data-quality":0}}
76 | {"text":"Specifically, we analyze the impact of the manifold's curvatures (or higher order nonlinearity in the parameterization when the curvatures are locally zero) on the uniqueness of the regression solution.","cats":{"data-quality":0}}
77 | {"text":"Our findings suggest that the corresponding linear regression does not have a unique solution when the embedded submanifold is flat in some dimensions.","cats":{"data-quality":0}}
78 | {"text":"Our findings thus reveal the role of data manifold geometry in ensuring the stability of regression models for out-of-distribution inferences.","cats":{"data-quality":0}}
79 | {"text":"To disentangle these effects, we propose an evaluation framework based on \"counterfactual\" task variants that deviate from the default assumptions underlying standard tasks.","cats":{"data-quality":0}}
80 | {"text":"Across a suite of 11 tasks, we observe nontrivial performance on the counterfactual variants, but nevertheless find that performance substantially and consistently degrades compared to the default conditions.","cats":{"data-quality":0}}
81 | {"text":"We also propose an accurate pseudo label generation method through prototype learning.","cats":{"data-quality":0}}
82 | {"text":"Specifically, we frame aggregation of annotations as posterior inference of so-called plausibilities, representing distributions over classes in a classification setting, subject to a hyper-parameter encoding annotator reliability.","cats":{"data-quality":1}}
83 | {"text":"Based on this model, we propose a metric for measuring annotation uncertainty and provide uncertainty-adjusted metrics for performance evaluation.","cats":{"data-quality":0}}
84 | {"text":"Identifying the samples with corrupted labels and preventing the model from learning them is a promising approach to address this challenge.","cats":{"data-quality":1}}
85 | {"text":"Furthermore, we detect real label errors a) on commonly used test datasets in object detection and b) on a proprietary dataset.","cats":{"data-quality":1}}
86 | {"text":"Large-scale datasets in the real world inevitably involve label noise.","cats":{"data-quality":0}}
87 | {"text":"This is partially due to the fact that obtaining a balanced, diverse, and perfectly labeled dataset is typically expensive, time-consuming, and error-prone.","cats":{"data-quality":0}}
88 | {"text":"We develop an efficient algorithm for detecting label errors and outlier data points based on the relational graph structure of the dataset.","cats":{"data-quality":1}}
89 | {"text":"By focusing on finding incorrect labels in the original training datasets, we can eliminate erroneous examples in their root.","cats":{"data-quality":1}}
90 | {"text":"Manually labelling data with high-quality labels is generally a time-consuming and challenging task and often this turns out to be the bottleneck in a machine learning project.","cats":{"data-quality":0}}
91 | {"text":"Here we consider algorithms for finding mislabeled examples in multi-label classification datasets.","cats":{"data-quality":1}}
92 | {"text":"Negative labels are those that a corresponding data item does not belong.","cats":{"data-quality":0}}
93 | {"text":"This issue is due to biased labeling preferences at multiple clients and is a typical setting of data heterogeneity.","cats":{"data-quality":0}}
94 | {"text":"However, noisy samples (i.e., with wrong labels) in the training set induce confusion and cause the network to learn the incorrect representation.","cats":{"data-quality":1}}
95 | {"text":"Mislabeled examples are a common issue in real-world data, particularly for tasks like token classification where many labels must be chosen on a fine-grained basis.","cats":{"data-quality":1}}
96 | {"text":"We also introduced robust loss to reduce the noise effects of inaccurate labels generated in semi-supervised learning.","cats":{"data-quality":1}}
97 | {"text":"The main anomaly was found by the autoencoder and automatically created labels and was also recorded in the log files.","cats":{"data-quality":1}}
98 | {"text":"About 0.2% of the images could not be assigned a label, while for 5.1% the reviewers were uncertain, or they assigned an invalid label.","cats":{"data-quality":1}}
99 | {"text":"We find that the above issues are caused by the training dataset's pose imbalance. ","cats":{"data-quality":0}}
100 | {"text":"The labor-intensive annotation process of semantic segmentation datasets is often prone to errors, since humans struggle to label every pixel correctly.","cats":{"data-quality":1}}
101 | {"text":"We study algorithms to automatically detect such annotation errors, in particular methods to score label quality, such that the images with the lowest scores are least likely to be correctly labeled.","cats":{"data-quality":1}}
102 | {"text":"Widely applicable, our label quality scores rely on probabilistic predictions from a trained segmentation model -- any model architecture and training procedure can be utilized.","cats":{"data-quality":1}}
103 | {"text":"Here we study 7 different label quality scoring methods used in conjunction with a DeepLabV3+ or a FPN segmentation model to detect annotation errors in a version of the SYNTHIA dataset.","cats":{"data-quality":1}}
104 | {"text":"Precision-recall evaluations reveal a score -- the soft-minimum of the model-estimated likelihoods of each pixel's annotated class -- that is particularly effective to identify images that are mislabeled, across multiple types of annotation error.","cats":{"data-quality":1}}
105 | {"text":"In recent years, research on learning with noisy labels has focused on devising novel algorithms that can achieve robustness to noisy training labels while generalizing to clean data.","cats":{"data-quality":1}}
106 | {"text":"While some of these regularization strategies have been utilized in previous noisy label learning research, their full potential has not been thoroughly explored.","cats":{"data-quality":1}}
107 | {"text":"We also synthetically mislabel a proportion of the dataset by randomly corrupting the labels of a few samples, and show that sorting by curvature yields high AUROC values for identifying the mislabeled samples.","cats":{"data-quality":1}}
108 | {"text":"Further analysis shows that these gains come from an improved decision boundary after cleaning the label errors existed in the training data.","cats":{"data-quality":1}}
109 | {"text":"Nevertheless, few papers have tackled the data shift problem in labeled training sets, which occurs when there is a mismatch between the data distribution in the training set and the testing set.","cats":{"data-quality":1}}
110 | {"text":"In this work, we examine the problem for both labeled and unlabeled settings.","cats":{"data-quality":1}}
111 | {"text":"It is crucial to correctly predict areas that deviate from the background noise, in both the train and test sets of labels. ","cats":{"data-quality":0}}
112 | {"text":"Data completeness is ensured through the label provided during training.","cats":{"data-quality":0}}
113 | {"text":"Trustworthy pseudo labels on unlabeled data are generated after uncertainty estimation.","cats":{"data-quality":0}}
114 | {"text":"When random label noise is added to a training dataset, the prediction error of a neural network on a label-noise-free test dataset initially improves during early training but eventually deteriorates, following a U-shaped dependence on training time.","cats":{"data-quality":0}}
115 | {"text":"In this paper, we try to deal with error accumulation in noisy label learning from both model and data perspectives.","cats":{"data-quality":1}}
116 | {"text":"In our analysis, we find that SoundDesc contains several duplicates that cause leakage of training data to the evaluation data.","cats":{"data-quality":1}}
117 | {"text":"However, in many situations, language can be ambiguous and ineffective in describing specific image edits.","cats":{"data-quality":0}}
118 | {"text":"We propose an automatic metric to test the prevalence of the opinions that a summary expresses, based on counting the number of reviews that are consistent with each statement in the summary, while discrediting trivial or redundant statements.","cats":{"data-quality":0}}
119 | {"text":"To formulate this opinion prevalence metric, we consider several existing methods to score the factual consistency of a summary statement with respect to each individual source review.","cats":{"data-quality":0}}
120 | {"text":"On a corpus of Amazon product reviews, we gather multiple human judgments of the opinion consistency, to determine which automatic metric best expresses consistency in product reviews.","cats":{"data-quality":1}}
121 | {"text":"The system utilizes a weakly supervised technique that employs a fine-grained annotation scheme to identify verbally formulated uncertainty at the sentence level in scientific texts.","cats":{"data-quality":1}}
122 | {"text":"Additionally, UnScientify provides interpretable results, aiding in the comprehension of identified instances of scientific uncertainty in text.","cats":{"data-quality":0}}
123 | {"text":"Recent work in Machine Learning and Computer Vision has highlighted the presence of various types of systematic flaws inside ground truth object recognition benchmark datasets.","cats":{"data-quality":1}}
124 | {"text":"The net consequence is that the current annotation process is largely under-specified, thus leaving too much freedom to the subjective judgment of annotators.","cats":{"data-quality":1}}
125 | {"text":"Motivated by the optimal strategy, we introduce double-score OOD methods that leverage uncertainty scores from two chosen OOD detectors: one focused on OOD/ID discrimination and the other on misclassification detection.","cats":{"data-quality":1}}
126 | {"text":"The optimal prediction strategy for out-of-distribution (OOD) setups is a fundamental question in machine learning.","cats":{"data-quality":0}}
127 | {"text":"In this paper, we address this question and present several contributions.","cats":{"data-quality":0}}
128 | {"text":"We propose three reject option models for OOD setups: the Cost-based model, the Bounded TPR-FPR model, and the Bounded Precision-Recall model.","cats":{"data-quality":0}}
129 | {"text":"These models extend the standard reject option models used in non-OOD setups and define the notion of an optimal OOD selective classifier.","cats":{"data-quality":0}}
130 | {"text":"We establish that all the proposed models, despite their different formulations, share a common class of optimal strategies. ","cats":{"data-quality":0}}
131 | {"text":"The experimental results consistently demonstrate the superior performance of this simple strategy compared to state-of-the-art methods.","cats":{"data-quality":0}}
132 | {"text":"Additionally, we propose novel evaluation metrics derived from the definition of the optimal strategy under the proposed OOD rejection models.","cats":{"data-quality":0}}
133 | {"text":"These new metrics provide a comprehensive and reliable assessment of OOD methods without the deficiencies observed in existing evaluation approaches.","cats":{"data-quality":0}}
134 | {"text":"This analysis helps us find a, to the best of our knowledge, novel failure model on the CIFAR100 dataset, that of duplicated images with different labels","cats":{"data-quality":1}}
135 | {"text":"Neural networks are overparametrized and easily overfit the datasets they train on.","cats":{"data-quality":0}}
136 | {"text":"In the extreme case, it is shown that they can memorize a training set with fully randomized labels.","cats":{"data-quality":0}}
137 | {"text":"We propose using the curvature of loss function around the training sample as a measure of its memorization, averaged over all training epochs.","cats":{"data-quality":0}}
138 | {"text":"We use this to study the generalization versus memorization properties of different samples in popular image datasets.","cats":{"data-quality":0}}
139 | {"text":"We visualize samples with the highest curvature of loss around them, and show that these visually correspond to long-tailed, mislabeled or conflicting samples. .","cats":{"data-quality":0}}
140 | {"text":"We also synthetically mislabel a proportion of the dataset by randomly corrupting the labels of a few samples, and show that sorting by curvature yields","cats":{"data-quality":0}}
141 | {"text":"Medical image classification is a challenging task due to the scarcity of labeled samples and class imbalance caused by the high variance in disease prevalence.","cats":{"data-quality":0}}
142 | {"text":"Semi-supervised learning (SSL) methods can mitigate these challenges by leveraging both labeled and unlabeled data.","cats":{"data-quality":0}}
143 | {"text":"However, SSL methods for medical image classification need to address two key challenges: (1) estimating reliable pseudo-labels for the images in the unlabeled dataset and (2) reducing biases caused by class imbalance.","cats":{"data-quality":0}}
144 | {"text":"In this paper, we propose a novel SSL approach, SPLAL, that effectively addresses these challenges.","cats":{"data-quality":0}}
145 | {"text":"SPLAL leverages class prototypes and a weighted combination of classifiers to predict reliable pseudo-labels over a subset of unlabeled images.","cats":{"data-quality":0}}
146 | {"text":"Additionally, we introduce alignment loss to mitigate model biases toward majority classes.","cats":{"data-quality":0}}
147 | {"text":"To evaluate the performance of our proposed approach, we conduct experiments on two publicly available medical image classification benchmark datasets: the skin lesion classification (ISIC 2018) and the blood cell classification dataset (BCCD).","cats":{"data-quality":0}}
148 | {"text":"The experimental results empirically demonstrate that our approach outperforms several state-of-the-art SSL methods over various evaluation metrics.","cats":{"data-quality":0}}
149 | {"text":"Specifically, our proposed approach achieves a significant improvement over the state-of-the-art approach on the ISIC 2018 dataset in both Accuracy and F1 score, with relative margins of 2.24\\% and 11.40\\%, respectively.","cats":{"data-quality":0}}
150 | {"text":"Finally, we conduct extensive ablation experiments to examine the contribution of different components of our approach, validating its effectiveness.","cats":{"data-quality":0}}
151 | {"text":"Textual noise, such as typos or abbreviations, is a well-known issue that penalizes vanilla Transformers for most downstream tasks","cats":{"data-quality":1}}
152 | {"text":"Previous works addressing the noise issue mainly rely on data augmentation strategies, showing improved robustness when dealing with corrupted samples that are similar to the ones used for training.","cats":{"data-quality":1}}
153 | {"text":"However, all these methods still suffer from the token distribution shift induced by typos","cats":{"data-quality":1}}
154 | {"text":"We show that this is also the case for sentence similarity, a fundamental task in multiple domains, e.g. matching, retrieval or paraphrasing.","cats":{"data-quality":0}}
155 | {"text":"Sentence similarity can be approached using cross-encoders, where the two sentences are concatenated in the input allowing the model to exploit the inter-relations between them.","cats":{"data-quality":0}}
156 | {"text":"Previous works addressing the noise issue mainly rely on data augmentation strategies, showing improved robustness when dealing wixtual noise by equipping cross-encoders with a novel LExical-aware Attention module (LEA) that incorporates lexical similarities between words in both sentences.","cats":{"data-quality":0}}
157 | {"text":"By using raw text similarities, our ae that the attention bias introduced by LEA helps cross-encoders to tackle complex scenarios with textual noise, specially in domains with short-text descriptions and limited context.","cats":{"data-quality":0}}
158 | {"text":"Experiments using three popular Transformer encoders in five e-commerce datasets for product matching show that LEA consistently boosts performance under the presence of noise, while remaining competitive on the original (clean) splits.","cats":{"data-quality":0}}
159 | {"text":"We also evaluate our approach in two datasets for textual entailment and paraphrasing showing that LEA is robust to typos in domains with longer sentences and more natural context.","cats":{"data-quality":0}}
160 | {"text":"Additionally, we thoroughly analyze several design choices in our approach, providing insights about the impact of the decisions made and fostering future research in cross-encoders dealing with typos.","cats":{"data-quality":0}}
161 | {"text":"For safety, AI systems in health undergo thorough evaluations before deployment, validating their predictions against a ground truth that is assumed certain.","cats":{"data-quality":0}}
162 | {"text":"However, this is actually not the case and the ground truth may be uncertain.","cats":{"data-quality":0}}
163 | {"text":"Unfortunately, this is largely ignored in standard evaluation of AI models but can have severe consequences such as overestimating the future performance.","cats":{"data-quality":0}}
164 | {"text":"To avoid this, we measure the effects of ground truth uncertainty, which we assume decomposes into two main components: annotation uncertainty which stems from the lack of reliable annotations, and inherent uncertainty due to limited observational information.","cats":{"data-quality":0}}
165 | {"text":"This ground truth uncertainty is ignored when estimating the ground truth by deterministically aggregating annotations, e.g., by majority voting or averaging.","cats":{"data-quality":0}}
166 | {"text":"In contrast, we propose a framework where aggregation is done using a statistical model. ","cats":{"data-quality":0}}
167 | {"text":"We present a case study applying our framework to skin condition classification fromtion (IRN) from previous work ignores ground truth uncertainty in evaluation.","cats":{"data-quality":0}}
168 | {"text":"Instead, we present two alternative statistical models: a probabilistic version of IRN and a Plackett-Luce-based model.","cats":{"data-quality":0}}
169 | {"text":"We find that a large portion of the dataset exhibits significant ground truth uncertainty and standard IRN-based evaluation severely over-estimates performance without providing uncertainty estimates.","cats":{"data-quality":0}}
170 | {"text":"To systematically combat confirmation bias for pseudo-labeling-based entity alignment, we propose a Unified Pseudo-Labeling framework for Entity Alignment (UPL-EA) that explicitly eliminates pseudo-labeling errors to boost the accuracy of entity alignment","cats":{"data-quality":1}}
171 | {"text":"The two components are respectively designed to eliminate Type I and Type II pseudo-labeling errors identified through our analyse.","cats":{"data-quality":0}}
172 | {"text":"The effectiveness of UPL-EA in eliminating pseudo-labeling errors is both theoretically supported and experimentally validated.","cats":{"data-quality":1}}
173 | {"text":"Entity alignment (EA) aims at identifying equivalent entity pairs across different knowledge graphs (KGs) that refer to the same real-world identity. .","cats":{"data-quality":0}}
174 | {"text":"UPL-EA consists of two complementary components: (1) The Optimal Transport (OT)-based pseudo-labeling uses discrete OT modeling as an effective means to enable more accurate determination of entity correspondences across two KGs and to mitigate the adverse impact of erroneous matches.","cats":{"data-quality":0}}
175 | {"text":"A simple but highly effective criterion is further devised to derive pseudo-labeled entity pairs that satisfy one-to-one correspondences at each iteration.","cats":{"data-quality":0}}
176 | {"text":"(2) The cross-iteration pseudo-label calibration operates across multiple consecutive iterations to further improve the pseudo-labeling precision rate by reducing the local pseudo-label selection variability with a theoretical guarantee.","cats":{"data-quality":0}}
177 | {"text":"The calibrated pseudo-labels are thereafter used to augment prior alignment seeds to reinforce subsequent model training fomentally validated.","cats":{"data-quality":0}}
178 | {"text":"The experimental results show that our approach achieves competitive performance with limited prior alignment seeds.","cats":{"data-quality":0}}
179 | {"text":"A novel annotation method was used to collect three separate annotations for each region of interest, and these annotations were performed in a fully transparent setting using a web-based annotation tool.","cats":{"data-quality":1}}
180 | {"text":"This paper presents the challenge report for the 2021 Kidney and Kidney Tumor Segmentation Challenge (KiTS21) held in conjunction with the 2021 international conference on Medical Image Computing and Computer Assisted Interventions (MICCAI).","cats":{"data-quality":0}}
181 | {"text":"KiTS21 is a sequel to its first edition in 2019, and it features a variety of innovations in how the challenge was designed, in addition to a larger dataset. ","cats":{"data-quality":0}}
182 | {"text":"Further, the KiTS21 test set was collected from an outside institution, challenging participants to develop methods that generalize well to new populations.","cats":{"data-quality":0}}
183 | {"text":"Nonetheless, the top-performing teams achieved a significant improvement over the state of the art set in 2019, and this performance is shown to inch ever closer to human-level performance.","cats":{"data-quality":0}}
184 | {"text":"An in-depth meta-analysis is presented describing which methods were used and how they faired on the leaderboard, as well as the characteristics of which cases generally saw good performance, and which did not.","cats":{"data-quality":0}}
185 | {"text":"Overall KiTS21 facilitated a significant advancement in the state of the art in kidney tumor segmentation, and provides useful insights that are applicable to the field of semantic segmentation as a whole.","cats":{"data-quality":0}}
186 | {"text":"Additionally, label noise is inevitable in large-scale annotations and hinders the applications of learning-based models.","cats":{"data-quality":1}}
187 | {"text":"To tackle such a critical yet thorny problem, this paper focuses on reducing noise based on some inherent properties of multi-label classification and long-tailed learning under noisy cases","cats":{"data-quality":1}}
188 | {"text":"In detail, we propose a Stitch-Up augmentation to synthesize a cleaner sample, which directly reduces multi-label noise by stitching up multiple noisy training samples","cats":{"data-quality":1}}
189 | {"text":"In real-world scenarios, collected and annotated data often exhibit the characteristics of multiple classes and long-tailed distribution. ","cats":{"data-quality":0}}
190 | {"text":"Although many deep learning based methods have been proposed for handling long-tailed multi-label recognition or label noise respectively, learning with noisy labels in long-tailed multi-label visual data has not been well-studied because of the complexity of long-tailed distribution entangled with multi-label correlation.","cats":{"data-quality":0}}
191 | {"text":"To tackle such a critical yet thorny problem, this paper focuses on reducing noise based on some inherent properties of m by stitching up multiple noisy training samples.","cats":{"data-quality":0}}
192 | {"text":"Equipped with Stitch-Up, a Heterogeneous Co-Learning framework is further designed to leverage the inconsistency between long-tailed and balamarks, named VOC-MLT-Noise and COCO-MLT-Noise, respectively.","cats":{"data-quality":0}}
193 | {"text":"Most of the existing methods adopt a coarse-grained fixed label assignment strategy and suffer from the inconsistency between the classification score and localization accuracy.","cats":{"data-quality":1}}
194 | {"text":"Second, to further address the inconsistency between classification and localization, we propose a critical feature sampling (CFS) module, which performs localization refinement on the sampling location for classification task to extract critical features accurately","cats":{"data-quality":1}}
195 | {"text":"Arbitrary-oriented object detection is a relatively emerging but challenging task.","cats":{"data-quality":0}}
196 | {"text":"Although remarkable progress has been made, there still remain many unsolved issues due to the large diversity of patterns in orientation, scale, aspect ratio, and visual appearance of objects in aerial images. ","cats":{"data-quality":0}}
197 | {"text":"First, to align the metric inconsistency between sample selection and regression loss calculation caused by fixed IoU strategy, we introduce affine transformation to evaluate the quality of samples and propose a distance-based label assignment strategy.","cats":{"data-quality":0}}
198 | {"text":"The proposed metric-aligned selection (MAS) strategy can dynamically select samples according to the shape and rotation characteristic of objects.","cats":{"data-quality":0}}
199 | {"text":"Second, to further address the inconsistency between classification and localization, we propose a critical feature sampling (CFS) module, which performs localization refinementtics of proposals during training.","cats":{"data-quality":0}}
200 | {"text":"Extensive experiments are conducted on four challenging rotated object detection datasets DOTA, FAIR1M-1.0, HRSC2016, and UCAS-AOD.","cats":{"data-quality":0}}
201 | {"text":"The results show the state-of-the-art accuracy of the proposed detector.","cats":{"data-quality":0}}
202 | {"text":"However, results from even highly accurate methods require manual verification and correction","cats":{"data-quality":1}}
203 | {"text":"The reviewers corrected 62.8% of the labels and agreed with the model label in 31.9% of cases.","cats":{"data-quality":1}}
204 | {"text":"We learned that our automatic transcription is biased towards the most frequent codes, with a higher degree of misclassification for the lowest frequency codes","cats":{"data-quality":1}}
205 | {"text":"Machine learning methods have proven useful in transcribing historical data. .","cats":{"data-quality":0}}
206 | {"text":"Such manual review can be time-consuming and expensive, therefore the objective of this paper was to make it more efficient.","cats":{"data-quality":0}}
207 | {"text":"Previously, we used machine learning to transcribe 2.3 million handwritten occupation codes from the Norwegian 1950 census with high accuracy (97%).","cats":{"data-quality":0}}
208 | {"text":"We manually reviewed the 90,000 (3%) codes with the lowest model confidence.","cats":{"data-quality":0}}
209 | {"text":"We allocated those 90,000 codes to human reviewers, who used our annotation tool to review the codes.","cats":{"data-quality":0}}
210 | {"text":"To assess reviewer agreement, some codes were assigned to multiple reviewers.","cats":{"data-quality":0}}
211 | {"text":"We then analyzed the review results to understand the relationship between accuracy improvements and effort.","cats":{"data-quality":0}}
212 | {"text":"Additionally, we interviewed the reviewers to improve the workflow.","cats":{"data-quality":0}}
213 | {"text":"The reviewers corrected 62.8% of the labels and agreed with the model label in 31.9% of casescertain, or they assigned an invalid label.","cats":{"data-quality":0}}
214 | {"text":"9,000 images were independently reviewed by multiplds the most frequent codes, with a higher degree of misclassification for the lowest frequency codes.","cats":{"data-quality":0}}
215 | {"text":"Our interview findings show that the reviewers did internal quality control and found our custom tool well-suited.","cats":{"data-quality":0}}
216 | {"text":"So, only one reviewer is needed, but they shou","cats":{"data-quality":0}}
217 | {"text":" We advocate for the use of IAA in predicting the labeling quality of individual annotators, leading to cost and time efficiency in data production.","cats":{"data-quality":0}}
218 | {"text":"Additionally, our work highlights the IAA's broader application potential in data-driven research optimization and holds significant implications for large-scale data projects prioritizing efficiency, cost reduction, and high-quality data.","cats":{"data-quality":0}}
219 | {"text":"We present DiffInfinite, a hierarchical diffusion model that generates arbitrarily large histological images while preserving long-range correlation structural information.","cats":{"data-quality":0}}
220 | {"text":"Our approach first generates synthetic segmentation masks, subsequently used as conditions for the high-fidelity generative diffusion process.","cats":{"data-quality":0}}
221 | {"text":"The proposed sampling method can be scaled up to any desired image size while only requiring small patches for fast training.","cats":{"data-quality":0}}
222 | {"text":"Moreover, it can be parallelized more efficiently than previous large-content generation methods while avoiding tiling artefacts.","cats":{"data-quality":0}}
223 | {"text":"The training leverages classifier-free guidance to augment a small, sparsely annotated dataset with unlabelled data.","cats":{"data-quality":0}}
224 | {"text":"Our method alleviates unique challenges in histopathological imaging practice: large-scale information, costly manual annotation, and protective data handling.","cats":{"data-quality":0}}
225 | {"text":"The biological plausibility of DiffInfinite data is validated in a survey by ten experienced pathologists as well as a downstream segmentation task.","cats":{"data-quality":0}}
226 | {"text":"Furthermore, the model scores strongly on anti-copying metrics which is beneficial for the protection of patient data.","cats":{"data-quality":0}}
227 | {"text":"Understanding this, we, in this paper, first analyze this lack of granular annotations from available pre-annotated datasets to understand the practical inconsistencies and also perform a detailed survey to look into the human perception surrounding annotations.","cats":{"data-quality":1}}
228 | {"text":"Efficient human activity recognition (HAR) using sensor data needs a significant volume of annotated data.","cats":{"data-quality":0}}
229 | {"text":"The growing volume of unlabelled sensor data has challenged conventional practices for gathering HAR annotations with human-in-the-loop approaches, often leading to the collection of shallower annotations.","cats":{"data-quality":0}}
230 | {"text":"These shallower annotations ignore the fine-grained micro-activities that constitute any complex activities of daily living (ADL). ","cats":{"data-quality":0}}
231 | {"text":"Drawing motivations from these, we next develop the framework AmicroN that can automatically generate micro-activity annotations using locomotive signatures and the available coarse-grain macro-activity labels.","cats":{"data-quality":0}}
232 | {"text":"In the backend, AmicroN applies change-point detection followed by zero-shot learning with activity embeddings to identify the unseen micro-activities in an unsupervised manner.","cats":{"data-quality":0}}
233 | {"text":"Rigorous evaluation on publicly available datasets shows that AmicroN can accurately generate micro-activity annotations with a median F1-score of >0.75.","cats":{"data-quality":0}}
234 | {"text":"Additionally, we also show that AmicroN can be used in a plug-and-play manner with Large Language Models (LLMs) to obtain the micro-activity labels, thus making it more practical for realistic applications.","cats":{"data-quality":0}}
235 | {"text":"This paper presents a large publicly available multi-center lumbar spine magnetic resonance imaging (MRI) dataset with reference segmentations of vertebrae, intervertebral discs (IVDs), and spinal canal.","cats":{"data-quality":0}}
236 | {"text":"The dataset includes 447 sagittal T1 and T2 MRI series from 218 patients with a history of low back pain.","cats":{"data-quality":0}}
237 | {"text":"It was collected from four different hospitals and was divided into a training (179 patients) and validation (39 patients) set.","cats":{"data-quality":0}}
238 | {"text":"An iterative data annotation approach was used by training a segmentation algorithm on a small part of the dataset, enabling semi-automatic segmentation of the remaining images.","cats":{"data-quality":0}}
239 | {"text":"The algorithm provided an initial segmentation, which was subsequently reviewed, manually corrected, and added to the training data.","cats":{"data-quality":0}}
240 | {"text":"We provide reference performance values for this baseline algorithm and nnU-Net, which performed comparably.","cats":{"data-quality":0}}
241 | {"text":"We set up a continuous segmentation challenge to allow for a fair comparison of different segmentation algorithms.","cats":{"data-quality":0}}
242 | {"text":"This study may encourage wider collaboration in the field of spine segmentation, and improve the diagnostic value of lumbar spine MRI.","cats":{"data-quality":0}}
243 | {"text":"But meanwhile, the distributed and isolated nature of data isolation may be complicated by data quality, making it more vulnerable to noisy labels","cats":{"data-quality":1}}
244 | {"text":"Many efforts exist to defend against the negative impacts of noisy labels in centralized or federated settings","cats":{"data-quality":1}}
245 | {"text":"Also, we conduct comprehensive experiments to explore the characteristics of these data settings and unravel challenging scenarios on the federated noisy label learning, which may guide method development in the future.","cats":{"data-quality":0}}
246 | {"text":"We highlight the 20 basic settings for more than 5 datasets proposed in our benchmark and standardized simulation pipeline for federated noisy label learning.","cats":{"data-quality":1}}
247 | {"text":"Federated learning has gained popularity for distributed learning without aggregating sensitive data from clients. .","cats":{"data-quality":0}}
248 | {"text":"Many efforts exist to defend against the negative impacts of noisy labels in centralized or federated settings.","cats":{"data-quality":0}}
249 | {"text":"However, there is a lack of a benchis work, we serve the first standardized benchmark that can help researchers fully explore potential federated noisy settings.","cats":{"data-quality":0}}
250 | {"text":"We highlight the 20 basic settings f \\texttt{FedNoisy} is available at \\codeword{https://github.com/SMILELab-FL/FedNoisy}.","cats":{"data-quality":0}}
251 | {"text":"In this paper, we explore different ways of training a model for handwritten text recognition when multiple imperfect or noisy transcriptions are available","cats":{"data-quality":1}}
252 | {"text":"We consider various training configurations, such as selecting a single transcription, retaining all transcriptions, or computing an aggregated transcription from all available annotations.","cats":{"data-quality":0}}
253 | {"text":"In addition, we evaluate the impact of quality-based data selection, where samples with low agreement are removed from the training set.","cats":{"data-quality":0}}
254 | {"text":"Our experiments are carried out on municipal registers of the city of Belfort (France) written between 1790 and 1946.","cats":{"data-quality":0}}
255 | {"text":"% results The results show that computing a consensus transcription or training on multiple transcriptions are good alternatives.","cats":{"data-quality":0}}
256 | {"text":"However, selecting training samples based on the degree of agreement between annotators introduces a bias in the training data and does not improve the res","cats":{"data-quality":0}}
257 | {"text":"The aim of the experiment is to judge the final annotation quality when pre-annotation is used.","cats":{"data-quality":1}}
258 | {"text":"In addition, it evaluates the effect of automatic linguistically-based (rule-formulated) checks and another annotation on the same data available to the annotators, and their influence on annotation quality and efficiency.","cats":{"data-quality":1}}
259 | {"text":"This paper presents an analysis of annotation using an automatic pre-annotation for a mid-level annotation complexity task -- dependency syntax annotation.","cats":{"data-quality":0}}
260 | {"text":"It compares the annotation efforts made by annotators using a pre-annotated version (with a high-accuracy parser) and those made by fully manual annotation. ","cats":{"data-quality":0}}
261 | {"text":"In addition, it evaluates the effect of automatic linguistically-based (rule-formulated) checkstic annotation which increases the consistency of the resulting annotation without reducing its quality.","cats":{"data-quality":0}}
--------------------------------------------------------------------------------
/docs/API/external.md:
--------------------------------------------------------------------------------
1 | ## OpenAIEncoder
2 |
3 | ::: embetter.external.OpenAIEncoder
4 | options:
5 | members: false
6 |
7 | ## AzureOpenAIEncoder
8 |
8 | ::: embetter.external.AzureOpenAIEncoder
9 | options:
10 | members: false
11 |
12 | ## CohereEncoder
13 |
14 | ::: embetter.external.CohereEncoder
15 | options:
16 | members: false
17 |
--------------------------------------------------------------------------------
/docs/API/grab.md:
--------------------------------------------------------------------------------
1 | # Grabbers
2 |
3 | ## ColumnGrabber
4 |
5 | ::: embetter.grab.ColumnGrabber
6 |
7 | ## KeyGrabber
8 |
9 | ::: embetter.grab.KeyGrabber
10 |
--------------------------------------------------------------------------------
/docs/API/model.md:
--------------------------------------------------------------------------------
1 | ## DifferenceClassifier
2 |
3 | ::: embetter.model.DifferenceClassifier
4 |
--------------------------------------------------------------------------------
/docs/API/multimodal.md:
--------------------------------------------------------------------------------
1 | ## ClipEncoder
2 |
3 | ::: embetter.multi.ClipEncoder
4 | options:
5 | members: false
6 |
--------------------------------------------------------------------------------
/docs/API/text.md:
--------------------------------------------------------------------------------
1 | ## TextEncoder
2 |
3 | ::: embetter.text.TextEncoder
4 | options:
5 | members: false
6 |
7 | ## SentenceEncoder
8 |
9 | ::: embetter.text.SentenceEncoder
10 | options:
11 | members: false
12 |
13 | ## MatryoshkaEncoder
14 |
15 | ::: embetter.text.MatryoshkaEncoder
16 | options:
17 | members: false
18 |
19 | ## LiteTextEncoder
20 |
21 | ::: embetter.text.LiteTextEncoder
22 | options:
23 | members: false
24 |
25 | ## KerasNLPEncoder
26 |
27 | ::: embetter.text.KerasNLPEncoder
28 | options:
29 | members: false
30 |
31 | ## spaCyEncoder
32 |
33 | ::: embetter.text.spaCyEncoder
34 | options:
35 | members: false
36 |
37 | ## Sense2VecEncoder
38 |
39 | ::: embetter.text.Sense2VecEncoder
40 | options:
41 | members: false
42 |
43 | ## BytePairEncoder
44 |
45 | ::: embetter.text.BytePairEncoder
46 | options:
47 | members: false
48 |
49 | ## GensimEncoder
50 |
51 | ::: embetter.text.GensimEncoder
52 | options:
53 | members: false
54 |
55 |
--------------------------------------------------------------------------------
/docs/API/utils.md:
--------------------------------------------------------------------------------
1 | # Utils
2 |
3 | ## cached
4 |
5 | ::: embetter.utils.cached
6 |
7 | ## batched
8 |
9 | ::: embetter.utils.batched
10 |
11 | ## calc_distances
12 |
13 | ::: embetter.utils.calc_distances
14 |
--------------------------------------------------------------------------------
/docs/API/vision.md:
--------------------------------------------------------------------------------
1 | ## ImageLoader
2 |
3 | ::: embetter.vision.ImageLoader
4 | options:
5 | members: false
6 |
7 | ## ColorHistogramEncoder
8 |
9 | ::: embetter.vision.ColorHistogramEncoder
10 | options:
11 | members: false
12 |
13 | ## TimmEncoder
14 |
15 | ::: embetter.vision.TimmEncoder
16 | options:
17 | members: false
18 |
--------------------------------------------------------------------------------
/docs/applications.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Techniques
3 | ---
4 |
5 | This document contains some tricks, hints and demos of applications that you might want to consider
6 | in combination with this library.
7 |
8 | ## Cache
9 |
10 | Calculating embeddings can be slow, and even costly when you're using external providers.
11 | This is why this library offers an integration with [diskcache](https://grantjenks.com/docs/diskcache/).
12 | That way, you can infer the embeddings once and store them to disk for later.
13 |
14 | Here's an example of how you might run that.
15 |
16 | ```python
17 | from embetter.text import SentenceEncoder
18 | from embetter.utils import cached
19 |
20 | encoder = cached("sentence-enc", SentenceEncoder('all-MiniLM-L6-v2'))
21 |
22 | examples = [f"this is a pretty long text, which is more expensive {i}" for i in range(10_000)]
23 |
24 | # This might be a bit slow ~17.2s on our machine
25 | encoder.transform(examples)
26 |
27 | # This should be quicker ~4.71s on our machine
28 | encoder.transform(examples)
29 | ```
30 |
31 | Note that you're also able to fetch the precalculated embeddings directly via:
32 |
33 | ```python
34 | from diskcache import Cache
35 |
36 | # Make sure that you use the same name as in `cached`
37 | cache = Cache("sentence-enc")
38 | # Use a string as a key, if it's precalculated you'll get an array back.
39 | cache["this is a pretty long text, which is more expensive 0"]
40 | ```
41 |
42 | Be mindful of what goes into the encoder that you choose; it's preferable to give it
43 | text as opposed to numpy arrays, since the inputs double as cache keys. Also note that
44 | the first time you run this it will take more time due to the overhead of writing into the cache.
45 |
46 | ## Lite Embeddings
47 |
48 | There are a lot of options out there for pretrained text embeddings but there are also a few noteworthy lightweight techniques that allow you to train your own from scratch. One such technique is to use the [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
49 | from scikit-learn followed by [TruncatedSVD](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html). The `TfidfVectorizer` even allows
50 | you to specify `analyzer="char"` with `ngram_range=(3, 4)` to encode subwords, which also adds robustness against spelling errors if that's a concern.
51 |
52 | The main thing that's cool about this approach is that the representations can still be very reasonable for a lot of applications _and_ train very quickly. Here's a quick demo:
53 |
54 | ```python
55 | import srsly
56 | from umap import UMAP
57 | from cluestar import plot_text
58 | from embetter.text import learn_lite_doc_embeddings
59 |
60 | # Train embeddings
61 | texts = [ex['text'] for ex in srsly.read_jsonl("datasets/new-dataset.jsonl")]
62 | enc = learn_lite_doc_embeddings(texts, dim=300)
63 |
64 | # Create a 2D UMAP representation
65 | X_orig = enc.transform(texts) # this takes ~56ms
66 | X = UMAP().fit_transform(X_orig)
67 |
68 | # Plot the UMAP representation with the text
69 | plot_text(X, texts)
70 | ```
71 |
72 | !!! Note
73 |
74 | You can also store the trained embeddings as part of the training-call.
75 |
76 | ```python
77 | enc = learn_lite_doc_embeddings(texts, dim=300, path="stored/on/disk.emb")
78 | ```
79 |
80 |
81 |
82 |
83 |
84 |
85 | Here's what this chart looks like. Note that you can click and drag to explore!
86 |
87 |
88 |
89 | Let's now consider what a similar chart might look like that uses [Sentence Transformers](https://sbert.net).
90 |
91 | ```python
92 | from embetter.text import SentenceEncoder
93 |
94 | sent_enc = SentenceEncoder()
95 | X_orig = sent_enc.transform(texts) # this takes ~13.5s
96 | X = UMAP().fit_transform(X_orig)
97 | plot_text(X, texts)
98 | ```
99 |
100 |
101 |
102 | The charts differ, but if you squint you can spot a cluster on the right hand side here that
103 | corresponds with the cluster at the bottom of the previous chart.
104 |
105 | These "litetext" embeddings do tend to overfit on the exact words being used. But they are _much_ faster
106 | and still give a reasonable representation for a lot of use-cases. Also note that you don't have
107 | to use our utilities here; you can just create the same pipeline via:
108 |
109 | ```python
110 | from sklearn.decomposition import TruncatedSVD
111 | from sklearn.feature_extraction.text import TfidfVectorizer
112 | from sklearn.pipeline import make_pipeline
113 |
114 | enc = make_pipeline(
115 |     TfidfVectorizer(analyzer="char", ngram_range=(3, 4)),
116 |     TruncatedSVD(n_components=300)
117 | )
118 | ```
119 |
120 | Our implementation does a few extra tricks internally to keep things lightweight, but it's really
121 | the same idea.
122 |
123 | ## Difference Models
124 |
125 | Embeddings can be very useful when you're dealing with a deduplication use-case. The thinking
126 | is that items that are close in embedded space might be great candidates to double-check.
127 |
128 | To help investigate this, this library offers a `DifferenceModel` utility.
129 |
130 | 
131 |
132 | Here's how you might use it.
133 |
134 | ```python
135 | from embetter.model import DifferenceClassifier
136 | from embetter.text import SentenceEncoder
137 |
138 | mod = DifferenceClassifier(enc=SentenceEncoder())
139 |
140 | # Suppose this is input data
141 | texts1 = ["hello", "firehydrant", "greetings"]
142 | texts2 = ["no", "yes", "greeting"]
143 |
144 | # You will need to have some definition of "similar"
145 | similar = [0, 0, 1]
146 |
147 | # Train a model to detect similarity
148 | mod.fit(X1=texts1, X2=texts2, y=similar)
149 | mod.predict(X1=texts1, X2=texts2)
150 | mod.predict_proba(X1=texts1, X2=texts2)
151 |
152 | # The classifier head is a scikit-learn model, which you could save
153 | # separately if you like. The model can be accessed via:
154 | mod.clf_head
155 | ```
156 |
157 | The model really is just a light wrapper, but it might make it easier to bootstrap.
158 |
159 | ## Available `SentenceEncoder`s
160 |
161 | There are _many_ available models out there. Just have a look at [MTEB](https://huggingface.co/spaces/mteb/leaderboard).
162 |
163 | Because the `SentenceEncoder` in this library is just a wrapper around `sentence-transformers`, you should also
164 | be able to load any model that `sentence-transformers` can load.
165 |
166 | ```python
167 | # https://huggingface.co/thenlper/gte-small
168 | model = SentenceEncoder('thenlper/gte-small')
169 | model = SentenceEncoder('thenlper/gte-base')
170 | model = SentenceEncoder('thenlper/gte-large')
171 | ```
172 |
173 | There are many more models that you can consider. Just be aware that [some models](https://huggingface.co/intfloat/e5-large-v2) expect a prefix to be included in the text that you're encoding.
174 |
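175 | Here's a minimal sketch of what that looks like, assuming an e5-style model; the `"query: "`
176 | prefix follows that model's card and is not something embetter adds for you:
177 |
178 | ```python
179 | from embetter.text import SentenceEncoder
180 |
181 | # https://huggingface.co/intfloat/e5-large-v2 expects a "query: "/"passage: " prefix
182 | model = SentenceEncoder('intfloat/e5-large-v2')
183 |
184 | texts = ["how do I cache embeddings?", "what is a sentence encoder?"]
185 | X = model.transform([f"query: {text}" for text in texts])
186 | ```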
175 |
176 | ## Speedup with Modal
177 |
178 | Embedding text can be slow, especially when you're running on a CPU. If you wish
179 | to speed up your embedding calculations you may enjoy using [modal](https://modal.com/).
180 | Modal allows you to add a GPU to a Python function simply by adding a decorator.
181 |
182 | Not every encoder in embetter will get a speedup by using a GPU. But we've done some
183 | benchmarks and noticed that `SentenceEncoder` as well as `ClipEncoder` should both
184 | benefit. These components will also automatically detect when a GPU is available.
186 |
187 | The code below gives an example.
188 |
189 | ```python
190 | import time
191 | import h5py
192 | import modal
193 |
194 |
195 | stub = modal.Stub("example-get-started")
196 | image = (modal.Image.debian_slim()
197 | .pip_install("simsity", "embetter[text]", "h5py")
198 | .run_commands("python -c 'from embetter.text import SentenceEncoder; SentenceEncoder()'"))
199 |
200 |
201 | # This is the function that actually runs the embedding,
202 | # notice that there's a GPU attached.
203 | @stub.function(image=image, gpu="any")
204 | def create(data):
205 | from embetter.text import SentenceEncoder
206 | return SentenceEncoder().transform(data)
207 |
208 |
209 | @stub.local_entrypoint()
210 | def main():
211 | tic = time.time()
212 |
213 | # You'd need to write your own function to read in the texts
214 | data = read_text()
215 |
216 | # This runs our decorated function on external hardware
217 | X = create.call(data)
218 |
219 | # Next we save it to disk for re-use
220 | with h5py.File('embeddings.h5', 'w') as hf:
221 | hf.create_dataset("embeddings", data=X)
222 | toc = time.time()
223 | print(f"took {toc - tic}s to embed shape {X.shape}")
224 | ```
225 |
226 | On our own benchmarks, we seem to get a 4-5x speedup with just a minor edit
227 | to the code. This can be extremely helpful when you're trying to embed data
228 | in bulk.
229 |
--------------------------------------------------------------------------------
/docs/images/colorhistogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/colorhistogram.png
--------------------------------------------------------------------------------
/docs/images/columngrabber.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/columngrabber.png
--------------------------------------------------------------------------------
/docs/images/contrastive-re-use.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/contrastive-re-use.png
--------------------------------------------------------------------------------
/docs/images/contrastive-same-weights.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/contrastive-same-weights.png
--------------------------------------------------------------------------------
/docs/images/contrastive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/contrastive.png
--------------------------------------------------------------------------------
/docs/images/difference-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/difference-model.png
--------------------------------------------------------------------------------
/docs/images/embed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/embed.png
--------------------------------------------------------------------------------
/docs/images/feedforward.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/feedforward.png
--------------------------------------------------------------------------------
/docs/images/gradient.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/gradient.png
--------------------------------------------------------------------------------
/docs/images/human-in-the-loop-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/human-in-the-loop-1.png
--------------------------------------------------------------------------------
/docs/images/human-in-the-loop-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/human-in-the-loop-2.png
--------------------------------------------------------------------------------
/docs/images/human-in-the-loop-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/human-in-the-loop-3.png
--------------------------------------------------------------------------------
/docs/images/human-in-the-loop-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/human-in-the-loop-4.png
--------------------------------------------------------------------------------
/docs/images/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/icon.png
--------------------------------------------------------------------------------
/docs/images/imageloader.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/imageloader.png
--------------------------------------------------------------------------------
/docs/images/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/output.png
--------------------------------------------------------------------------------
/docs/images/sense2vec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/sense2vec.png
--------------------------------------------------------------------------------
/docs/images/sentence-encoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/sentence-encoder.png
--------------------------------------------------------------------------------
/docs/images/timm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/timm.png
--------------------------------------------------------------------------------
/docs/images/x-finetuned-again.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/x-finetuned-again.png
--------------------------------------------------------------------------------
/docs/images/x-finetuned.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/x-finetuned.png
--------------------------------------------------------------------------------
/docs/images/x-orig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/docs/images/x-orig.png
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # embetter
4 |
5 | > "Just a bunch of useful embeddings to get started quickly."
6 |
7 |
8 |
9 | Embetter implements scikit-learn compatible embeddings for computer vision and text. It should make it very easy to quickly build proof of concepts using scikit-learn pipelines and, in particular, should help with [bulk labelling](https://www.youtube.com/watch?v=gDk7_f3ovIk). It's also meant to play nice with [bulk](https://github.com/koaning/bulk) and [scikit-partial](https://github.com/koaning/scikit-partial), but it can also be used together with your favorite ANN solution like [lancedb](https://lancedb.github.io/lancedb/).
10 |
11 | ## Install
12 |
13 | You can install via pip.
14 |
15 | ```
16 | python -m pip install embetter
17 | ```
18 |
19 | Many of the embeddings are optional depending on your use-case, so if you
20 | prefer, you can install only the tools that you need:
21 |
22 | ```
23 | python -m pip install "embetter[text]"
24 | python -m pip install "embetter[sbert]"
25 | python -m pip install "embetter[spacy]"
26 | python -m pip install "embetter[sense2vec]"
27 | python -m pip install "embetter[bpemb]"
28 | python -m pip install "embetter[gensim]"
29 | python -m pip install "embetter[vision]"
30 | python -m pip install "embetter[all]"
31 | ```
32 |
33 | ## API Design
34 |
35 | This is an overview of what's currently implemented.
36 |
37 | ```python
38 | # Helpers to grab text or image from pandas column.
39 | from embetter.grab import ColumnGrabber
40 |
41 | # Representations/Helpers for computer vision
42 | from embetter.vision import ImageLoader, TimmEncoder, ColorHistogramEncoder
43 |
44 | # Representations for text
45 | from embetter.text import SentenceEncoder, Sense2VecEncoder, BytePairEncoder, spaCyEncoder, GensimEncoder
46 |
47 | # Representations from multi-modal models
48 | from embetter.multi import ClipEncoder
49 |
50 | # Finetuning components
51 | from embetter.finetune import FeedForwardTuner, ContrastiveTuner, ContrastiveLearner, SbertLearner
52 |
53 | # External embedding providers, typically needs an API key
54 | from embetter.external import CohereEncoder, OpenAIEncoder
55 | ```
56 |
57 | All of these components are scikit-learn compatible, which means that you
58 | can apply them as you would normally in a scikit-learn pipeline. Just be aware
59 | that these components are stateless. They won't require training as these
60 | are all pretrained tools.
61 |
62 | ## Text Example
63 |
64 | To run this example, make sure that you `pip install 'embetter[sbert]'`.
65 |
66 | ```python
67 | import pandas as pd
68 | from sklearn.pipeline import make_pipeline
69 | from sklearn.linear_model import LogisticRegression
70 |
71 | from embetter.grab import ColumnGrabber
72 | from embetter.text import SentenceEncoder
73 |
74 | # This pipeline grabs the `text` column from a dataframe
75 | # which then gets fed into Sentence-Transformers' all-MiniLM-L6-v2.
76 | text_emb_pipeline = make_pipeline(
77 | ColumnGrabber("text"),
78 | SentenceEncoder('all-MiniLM-L6-v2')
79 | )
80 |
81 | # This pipeline can also be trained to make predictions, using
82 | # the embedded features.
83 | text_clf_pipeline = make_pipeline(
84 | text_emb_pipeline,
85 | LogisticRegression()
86 | )
87 |
88 | dataf = pd.DataFrame({
89 | "text": ["positive sentiment", "super negative"],
90 | "label_col": ["pos", "neg"]
91 | })
92 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
93 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
94 | ```
95 |
96 | ## Image Example
97 |
98 | The goal of the API is to allow pipelines like this:
99 |
100 | ```python
101 | import pandas as pd
102 | from sklearn.pipeline import make_pipeline
103 | from sklearn.linear_model import LogisticRegression
104 |
105 | from embetter.grab import ColumnGrabber
106 | from embetter.vision import ImageLoader
107 | from embetter.multi import ClipEncoder
108 |
109 | # This pipeline grabs the `img_path` column from a dataframe
110 | # then it grabs the image paths and turns them into `PIL.Image` objects
111 | # which then get fed into CLIP which can also handle images.
112 | image_emb_pipeline = make_pipeline(
113 | ColumnGrabber("img_path"),
114 | ImageLoader(convert="RGB"),
115 | ClipEncoder()
116 | )
117 |
118 | dataf = pd.DataFrame({
119 | "img_path": ["tests/data/thiscatdoesnotexist.jpeg"]
120 | })
121 | image_emb_pipeline.fit_transform(dataf)
122 | ```
123 |
124 | ## Batched Learning
125 |
126 | All of the encoding tools you've seen here are also compatible
127 | with the [`partial_fit` mechanic](https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning)
128 | in scikit-learn. That means
129 | you can leverage [scikit-partial](https://github.com/koaning/scikit-partial)
130 | to build pipelines that can handle out-of-core datasets.
131 |
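132 | Below is a minimal sketch of what that could look like; the `batches` generator is a
133 | hypothetical stand-in for however you stream chunks of data from disk:
134 |
135 | ```python
136 | from sklearn.linear_model import SGDClassifier
137 |
138 | from embetter.text import SentenceEncoder
139 |
140 | enc = SentenceEncoder('all-MiniLM-L6-v2')
141 | clf = SGDClassifier(loss="log_loss")
142 |
143 | # `batches` yields (texts, labels) chunks that fit in memory
144 | for texts, labels in batches:
145 |     X = enc.transform(texts)
146 |     clf.partial_fit(X, labels, classes=["pos", "neg"])
147 | ```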
--------------------------------------------------------------------------------
/embetter/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | from importlib import metadata
3 | except ImportError: # for Python<3.8
4 | import importlib_metadata as metadata
5 |
6 |
7 | __title__ = __name__
8 | __version__ = metadata.version(__title__)
9 |
--------------------------------------------------------------------------------
/embetter/base.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, TransformerMixin
2 |
3 |
4 | class EmbetterBase(BaseEstimator, TransformerMixin):
5 | """Base class for feature transformers in this library"""
6 |
7 | def fit(self, X, y=None):
8 | """No-op."""
9 | return self
10 |
11 | def partial_fit(self, X, y=None):
12 | """No-op."""
13 | return self
14 |
--------------------------------------------------------------------------------
/embetter/error.py:
--------------------------------------------------------------------------------
1 | class NotInstalled:
2 | """
3 | This object is used for optional dependencies. If a backend is not installed we
4 | replace the transformer/language with this object. This allows us to give a friendly
5 | message to the user that they need to install extra dependencies as well as a link
6 | to our documentation page.
7 | """
8 |
9 | def __init__(self, tool, dep):
10 | self.tool = tool
11 | self.dep = dep
12 |
13 |         msg = f"In order to use {self.tool} you'll need to install it via:\n\n"
14 | msg += f"pip install embetter[{self.dep}]\n\n"
15 | self.msg = msg
16 |
17 | def __getattr__(self, *args, **kwargs):
18 | raise ModuleNotFoundError(self.msg)
19 |
20 | def __call__(self, *args, **kwargs):
21 | raise ModuleNotFoundError(self.msg)
22 |
--------------------------------------------------------------------------------
/embetter/external/__init__.py:
--------------------------------------------------------------------------------
1 | from embetter.error import NotInstalled
2 |
3 | try:
4 | from ._openai import OpenAIEncoder
5 | except ModuleNotFoundError:
6 | OpenAIEncoder = NotInstalled("OpenAIEncoder", "openai")
7 |
8 | try:
9 | from ._openai import AzureOpenAIEncoder
10 | except ModuleNotFoundError:
11 | AzureOpenAIEncoder = NotInstalled("AzureOpenAIEncoder", "openai")
12 |
13 | try:
14 | from ._cohere import CohereEncoder
15 | except ModuleNotFoundError:
16 | CohereEncoder = NotInstalled("CohereEncoder", "cohere")
17 |
18 |
19 | __all__ = ["CohereEncoder", "OpenAIEncoder", "AzureOpenAIEncoder"]
20 |
--------------------------------------------------------------------------------
/embetter/external/_cohere.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from itertools import islice
4 |
5 | from embetter.base import EmbetterBase
6 |
7 |
8 | def _batch(iterable, n=1):
9 | it = iter(iterable)
10 | while batch := list(islice(it, n)):
11 | yield batch
12 |
13 |
14 | class CohereEncoder(EmbetterBase):
15 | """
16 | Encoder that can numerically encode sentences.
17 |
18 | Note that this is an **external** embedding provider. If their API breaks, so will this component.
19 |
20 | This encoder will require the `COHERE_KEY` environment variable to be set.
21 | If you have it defined in your `.env` file, you can use python-dotenv to load it.
22 |
23 | You also need to install the `cohere` library beforehand.
24 |
25 | ```
26 | python -m pip install cohere
27 | ```
28 |
29 | Arguments:
30 | model: name of model, can be "small" or "large"
31 | batch_size: Batch size to send to Cohere.
32 |
33 | **Usage**:
34 |
35 | ```python
36 | import pandas as pd
37 | from sklearn.pipeline import make_pipeline
38 | from sklearn.linear_model import LogisticRegression
39 |
40 | from embetter.grab import ColumnGrabber
41 | from embetter.external import CohereEncoder
42 | from dotenv import load_dotenv
43 |
44 | load_dotenv() # take environment variables from .env.
45 |
46 | # Let's suppose this is the input dataframe
47 | dataf = pd.DataFrame({
48 | "text": ["positive sentiment", "super negative"],
49 | "label_col": ["pos", "neg"]
50 | })
51 |
52 | # This pipeline grabs the `text` column from a dataframe
53 |     # which then gets fed into Cohere's endpoint
54 | text_emb_pipeline = make_pipeline(
55 | ColumnGrabber("text"),
56 | CohereEncoder(model="large")
57 | )
58 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
59 |
60 | # This pipeline can also be trained to make predictions, using
61 | # the embedded features.
62 | text_clf_pipeline = make_pipeline(
63 | text_emb_pipeline,
64 | LogisticRegression()
65 | )
66 |
67 | # Prediction example
68 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
69 | ```
70 | """
71 |
72 | def __init__(self, model="large", batch_size=10):
73 | from cohere import Client
74 |
75 | self.client = Client(os.getenv("COHERE_KEY"))
76 | self.model = model
77 | self.batch_size = batch_size
78 |
79 | def transform(self, X, y=None):
80 | """Transforms the text into a numeric representation."""
81 | result = []
82 | for b in _batch(X, self.batch_size):
83 | response = self.client.embed(b)
84 | result.extend(response.embeddings)
85 | return np.array(result)
86 |
--------------------------------------------------------------------------------
/embetter/external/_openai.py:
--------------------------------------------------------------------------------
1 | from itertools import islice
2 |
3 | import numpy as np
4 | from openai import AzureOpenAI, OpenAI
5 |
6 | from embetter.base import EmbetterBase
7 |
8 |
9 | def _batch(iterable, n=1):
10 | it = iter(iterable)
11 | while batch := list(islice(it, n)):
12 | yield batch
13 |
14 |
15 | class OpenAIEncoder(EmbetterBase):
16 | """
17 | Encoder that can numerically encode sentences.
18 |
19 | Note that this is an **external** embedding provider. If their API breaks, so will this component.
21 |
22 | This encoder will require the `OPENAI_API_KEY` (optionally `OPENAI_ORG_ID` and `OPENAI_PROJECT_ID`) environment variable to be set.
23 | If you have it defined in your `.env` file, you can use python-dotenv to load it.
24 |
25 | You also need to install the `openai` library beforehand.
26 |
27 | ```
28 | python -m pip install openai
29 | ```
30 |
31 | Arguments:
32 |         model: name of the OpenAI embedding model, e.g. "text-embedding-ada-002"
33 | batch_size: Batch size to send to OpenAI.
34 |
35 | **Usage**:
36 |
37 | ```python
38 | import pandas as pd
39 | from sklearn.pipeline import make_pipeline
40 | from sklearn.linear_model import LogisticRegression
41 |
42 | from embetter.grab import ColumnGrabber
43 | from embetter.external import OpenAIEncoder
44 | from dotenv import load_dotenv
45 |
46 | load_dotenv() # take environment variables from .env.
47 |
48 | # Let's suppose this is the input dataframe
49 | dataf = pd.DataFrame({
50 | "text": ["positive sentiment", "super negative"],
51 | "label_col": ["pos", "neg"]
52 | })
53 |
54 | # This pipeline grabs the `text` column from a dataframe
55 |     # which then gets fed into OpenAI's endpoint
56 | text_emb_pipeline = make_pipeline(
57 | ColumnGrabber("text"),
58 | OpenAIEncoder()
59 | )
60 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
61 |
62 | # This pipeline can also be trained to make predictions, using
63 | # the embedded features.
64 | text_clf_pipeline = make_pipeline(
65 | text_emb_pipeline,
66 | LogisticRegression()
67 | )
68 |
69 | # Prediction example
70 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
71 | ```
72 | """
73 |
74 | def __init__(self, model="text-embedding-ada-002", batch_size=25):
75 |         # The OpenAI client reads OPENAI_API_KEY (and related variables) from the environment.
76 | self.client = OpenAI()
77 | self.model = model
78 | self.batch_size = batch_size
79 |
80 | def transform(self, X, y=None):
81 | """Transforms the text into a numeric representation."""
82 | result = []
83 | for b in _batch(X, self.batch_size):
84 | resp = self.client.embeddings.create(input=b, model=self.model) # fmt: off
85 | result.extend([_.embedding for _ in resp.data])
86 | return np.array(result)
87 |
88 |
89 | class AzureOpenAIEncoder(OpenAIEncoder):
90 | """
91 | Encoder that can numerically encode sentences.
92 |
93 | Note that this is an *external* embedding provider. If their API breaks, so will this component.
94 |
95 |     To use this encoder you must provide credentials. Please provide one of the `api_key`, `azure_ad_token`, `azure_ad_token_provider` arguments, or set the `AZURE_OPENAI_API_KEY` or `AZURE_OPENAI_AD_TOKEN` environment variable.
96 | You must provide one of the `base_url` or `azure_endpoint` arguments, or the `AZURE_OPENAI_ENDPOINT` environment variable.
97 | Furthermore you must provide either the `api_version` argument or the `OPENAI_API_VERSION` environment variable.
98 |
99 |     If you have your environment variables defined in your `.env` file, you can use python-dotenv to load them.
100 |
101 | You also need to install the `openai` library beforehand.
102 |
103 | ```
104 | python -m pip install openai
105 | ```
106 |
107 | Arguments:
108 | model: name of model.
109 | batch_size: Batch size to send to AzureOpenAI.
110 |
111 | *Usage*:
112 |
113 | ```python
114 | import pandas as pd
115 | from sklearn.pipeline import make_pipeline
116 | from sklearn.linear_model import LogisticRegression
117 |
118 | from embetter.grab import ColumnGrabber
119 | from embetter.external import AzureOpenAIEncoder
120 | from dotenv import load_dotenv
121 |
122 | load_dotenv() # take environment variables from .env.
123 |
124 | # Let's suppose this is the input dataframe
125 | dataf = pd.DataFrame({
126 | "text": ["positive sentiment", "super negative"],
127 | "label_col": ["pos", "neg"]
128 | })
129 |
130 | # This pipeline grabs the `text` column from a dataframe
131 |     # which then gets fed into Azure OpenAI's endpoint
132 | text_emb_pipeline = make_pipeline(
133 | ColumnGrabber("text"),
134 | AzureOpenAIEncoder()
135 | )
136 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
137 |
138 | # This pipeline can also be trained to make predictions, using
139 | # the embedded features.
140 | text_clf_pipeline = make_pipeline(
141 | text_emb_pipeline,
142 | LogisticRegression()
143 | )
144 |
145 | # Prediction example
146 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
147 | ```
148 | """
149 |
150 |     def __init__(self, model="text-embedding-ada-002", batch_size=25, **kwargs):
151 | self.model = model
152 | self.batch_size = batch_size
153 | self.client = AzureOpenAI(**kwargs)
154 |
--------------------------------------------------------------------------------
/embetter/finetune/__init__.py:
--------------------------------------------------------------------------------
1 | from embetter.finetune._forward import FeedForwardTuner
2 | from embetter.finetune._contrastive_tuner import ContrastiveTuner
3 | from embetter.finetune._constrastive_learn import ContrastiveLearner
4 | from embetter.finetune._sbert_learn import SbertLearner
5 |
6 |
7 | __all__ = ["FeedForwardTuner", "ContrastiveTuner", "SbertLearner", "ContrastiveLearner"]
8 |
--------------------------------------------------------------------------------
/embetter/finetune/_constrastive_learn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | from torch.nn import CosineSimilarity
5 | from torch import nn
6 |
7 |
8 | class ContrastiveNetwork(nn.Module):
9 | def __init__(self, shape_in, hidden_dim):
10 | super(ContrastiveNetwork, self).__init__()
11 | self.embed1 = nn.Linear(shape_in, hidden_dim)
12 | self.embed2 = nn.Linear(hidden_dim, hidden_dim)
13 | self.act = nn.ReLU()
14 | self.cos = nn.CosineSimilarity()
15 |
16 | def forward(self, input1, input2):
17 | """Feed forward."""
18 | emb_1 = self.embed2(self.act(self.embed1(input1)))
19 | emb_2 = self.embed2(self.act(self.embed1(input2)))
20 | return self.cos(emb_1, emb_2)
21 |
22 | def embed(self, X):
23 | return self.embed2(self.act(self.embed1(X)))
24 |
25 |
26 | class ContrastiveLearner:
27 | """
28 | A learner model that can finetune on pairs of data on top of numeric embeddings.
29 |
30 | It's similar to the scikit-learn models that you're used to, but it accepts
31 | two inputs `X1` and `X2` and tries to predict if they are similar.
32 |
33 | Arguments:
34 |         shape_out: the dimension of the learned representation
35 |         batch_size: the batch size during training
36 |         epochs: the number of epochs to use while training
37 |         learning_rate: the learning rate used by the optimizer
38 |
39 | Usage:
40 |
41 | ```python
42 | from sentence_transformers import SentenceTransformer
43 | from embetter.finetune import ContrastiveLearner
44 | import random
45 |
46 | sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
47 |     learner = ContrastiveLearner(shape_out=300)
48 |
49 | def sample_generator(examples, n_neg=3):
50 | # A generator that assumes examples to be a dictionary of the shape
51 | # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
52 | # this is typically a function that's very custom to your use-case though
53 | labels = set()
54 | for ex in examples:
55 | for cat in ex['cats'].keys():
56 | if cat not in labels:
57 | labels = labels.union([cat])
58 | for label in labels:
59 | pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
60 | neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
61 | for ex in pos_examples:
62 | sample = random.choice(pos_examples)
63 | yield (ex['text'], sample['text'], 1.0)
64 | for n in range(n_neg):
65 | sample = random.choice(neg_examples)
66 | yield (ex['text'], sample['text'], 0.0)
67 |
68 |     learn_examples = sample_generator(examples, n_neg=3)
69 |     texts1, texts2, y = zip(*learn_examples)
70 |
71 |     # This learner operates on numeric embeddings, so encode the texts first
72 |     X1 = sent_tfm.encode(texts1)
73 |     X2 = sent_tfm.encode(texts2)
74 |
75 |     # Learn a new representation
76 |     learner.fit(X1, X2, y)
77 |
78 |     # You now have an updated model that can create more "finetuned" embeddings
79 |     learner.transform(X1)
80 |     learner.transform(X2)
77 | ```
78 |
79 |     After the learner is done training, it can be used inside of a scikit-learn pipeline as you normally would.
80 | """
81 |
82 | def __init__(
83 | self,
84 | shape_out: int = 300,
85 | batch_size: int = 16,
86 | epochs: int = 1,
87 | learning_rate=2e-05,
88 | ):
89 | self.learning_rate = learning_rate
90 | self.network_ = None
91 | self.batch_size = batch_size
92 | self.epochs = epochs
93 | self.shape_out = shape_out
94 |
95 | def fit(self, X1, X2, y):
96 |         """Fit the contrastive network on similarities between two sets of embeddings."""
97 | self.network_ = ContrastiveNetwork(
98 | shape_in=X1.shape[1], hidden_dim=self.shape_out
99 | )
100 | criterion = nn.MSELoss()
101 | optimizer = torch.optim.Adam(self.network_.parameters(), lr=self.learning_rate)
102 |
103 | X1_torch = torch.from_numpy(X1).detach().float()
104 | X2_torch = torch.from_numpy(X2).detach().float()
105 | y_torch = torch.from_numpy(np.array(y)).detach().float()
106 |
107 | dataset = torch.utils.data.TensorDataset(X1_torch, X2_torch, y_torch)
108 | dataloader = torch.utils.data.DataLoader(
109 | dataset, batch_size=self.batch_size, shuffle=True
110 | )
111 |
112 | for _ in range(self.epochs): # loop over the dataset multiple times
113 | for batch_X1, batch_X2, batch_y in dataloader:
114 | # zero the parameter gradients
115 | optimizer.zero_grad()
116 |
117 | # forward + backward + optimize
118 | cos_sim = self.network_(batch_X1, batch_X2)
119 | loss = criterion(cos_sim, batch_y)
120 | loss.backward()
121 | optimizer.step()
122 | return self
123 |
124 | def transform(self, X, y=None):
125 | """Encode a single batch of inputs."""
126 | X_torch = torch.from_numpy(X).detach().float()
127 | return self.network_.embed(X_torch).detach().numpy()
128 |
129 | def predict(self, X1, X2):
130 | """Predicts the cosine similarity."""
131 |         emb1 = torch.from_numpy(self.transform(X1)).float()
132 |         emb2 = torch.from_numpy(self.transform(X2)).float()
133 |         return CosineSimilarity(dim=1)(emb1, emb2).detach().numpy()
134 |
135 |     def to_disk(self, path):
136 |         """Save the finetuned contrastive network."""
137 |         torch.save(self.network_.state_dict(), path)
138 |
--------------------------------------------------------------------------------
/embetter/finetune/_contrastive_tuner.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import BaseEstimator, TransformerMixin
2 | import random
3 | from collections import defaultdict
4 | from itertools import chain, groupby
5 |
6 | import numpy as np
7 | import torch
8 | from dataclasses import dataclass
9 |
10 | from ._constrastive_learn import ContrastiveLearner
11 |
12 |
13 | @dataclass
14 | class Example:
15 | """Internal example class."""
16 |
17 | i1: int
18 | i2: int
19 | label: float
20 |
21 |
22 | def generate_pairs_batch(labels, n_neg=3):
23 | """
24 |     Copied with permission from Peter Baumgartner's implementation
25 | https://github.com/pmbaumgartner/setfit
26 | """
27 | # 7x faster than original implementation on small data,
28 | # 14x faster on 10000 examples
29 | pairs = []
30 | lookup = defaultdict(list)
31 | single_example = {}
32 | indices = np.arange(len(labels))
33 | for label, grouper in groupby(
34 | ((s, lab) for s, lab in zip(indices, labels)), key=lambda x: x[1]
35 | ):
36 | lookup[label].extend(list(i[0] for i in grouper))
37 | single_example[label] = len(lookup[label]) == 1
38 | neg_lookup = {}
39 | for current_label in lookup:
40 | negative_options = list(
41 | chain.from_iterable(
42 | [indices for label, indices in lookup.items() if label != current_label]
43 | )
44 | )
45 | neg_lookup[current_label] = negative_options
46 |
47 | for current_idx, current_label in zip(indices, labels):
48 | positive_pair = random.choice(lookup[current_label])
49 | if not single_example[current_label]:
50 | # choosing itself as a matched pair seems wrong,
51 | # but we need to account for the case of 1 positive example
52 | # so as long as there's not a single positive example,
53 | # we'll reselect the other item in the pair until it's different
54 | while positive_pair == current_idx:
55 | positive_pair = random.choice(lookup[current_label])
56 | pairs.append(Example(current_idx, positive_pair, 1))
57 | for i in range(n_neg):
58 | negative_pair = random.choice(neg_lookup[current_label])
59 | pairs.append(Example(current_idx, negative_pair, 0))
60 |
61 | return pairs
62 |
63 |
64 | class ContrastiveTuner(BaseEstimator, TransformerMixin):
65 | """
66 | Run a contrastive network to finetune the embeddings towards a class.
67 |
68 | Arguments:
69 | hidden_dim: the dimension of the new learned representation
70 | n_neg: number of negative example pairs to sample per positive item
71 |         epochs: number of epochs to use for training
72 | learning_rate: learning rate of the contrastive network
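73 |
74 |     Usage (a minimal sketch; `X` is assumed to be a numpy float array of
75 |     embeddings and `y` an array of class labels):
76 |
77 |     ```python
78 |     from embetter.finetune import ContrastiveTuner
79 |
80 |     tuner = ContrastiveTuner(hidden_dim=50)
81 |     tuner.fit(X, y)
82 |     X_new = tuner.transform(X)
83 |     ```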
73 | """
74 |
75 | def __init__(self, hidden_dim=50, n_neg=3, epochs=20, learning_rate=0.001) -> None:
76 | self.learner = ContrastiveLearner(
77 | shape_out=hidden_dim,
78 | batch_size=256,
79 | learning_rate=learning_rate,
80 | epochs=epochs,
81 | )
82 | self.n_neg = n_neg
83 | self.hidden_dim = hidden_dim
84 | self.epochs = epochs
85 | self.learning_rate = learning_rate
86 |
87 | def fit(self, X, y):
88 | """Fits the finetuner."""
89 | return self.partial_fit(X, y, classes=np.unique(y))
90 |
91 | def generate_batch(self, X_torch, y):
92 | """Generate a batch of pytorch pairs used for finetuning"""
93 | pairs = generate_pairs_batch(y, n_neg=self.n_neg)
94 | X1 = torch.zeros(len(pairs), X_torch.shape[1])
95 | X2 = torch.zeros(len(pairs), X_torch.shape[1])
96 | labels = torch.tensor([ex.label for ex in pairs], dtype=torch.long)
97 | for i, pair in enumerate(pairs):
98 | X1[i] = X_torch[pair.i1]
99 | X2[i] = X_torch[pair.i2]
100 | return X1, X2, labels
101 |
102 | def partial_fit(self, X, y, classes=None):
103 | """Fits the finetuner using the partial_fit API."""
104 | if not hasattr(self, "_classes"):
105 | if classes is None:
106 | raise ValueError("`classes` must be provided for partial_fit")
107 | self._classes = classes
108 |
109 | X_torch = torch.from_numpy(X).detach().float()
110 |
111 | X1, X2, out = self.generate_batch(X_torch, y=y)
112 | # TODO: change this, we should just generate numpy internally not cast all over
113 | self.learner.fit(np.array(X1), np.array(X2), np.array(out))
114 |
115 | return self
116 |
117 | def transform(self, X, y=None):
118 | """Transforms the data according to the sklearn api by using the hidden layer."""
119 | return self.learner.transform(X)
120 |
--------------------------------------------------------------------------------
/embetter/finetune/_forward.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from sklearn.base import BaseEstimator, TransformerMixin
5 | from sklearn.preprocessing import LabelEncoder
6 |
7 |
8 | class FeedForwardModel(nn.Module):
9 | """
10 | The internal model for the FeedForwardTuner
11 | """
12 |
13 | def __init__(self, input_dim, hidden_dim, output_dim):
14 | super(FeedForwardModel, self).__init__()
15 | self.hidden = nn.Linear(input_dim, hidden_dim)
16 | self.linear = nn.Linear(hidden_dim, output_dim)
17 | self.sigmoid = nn.Sigmoid()
18 |
19 | def forward(self, x):
20 | """Runs the forward pass"""
21 | return self.sigmoid(self.linear(self.embed(x)))
22 |
23 | def embed(self, x):
24 | """Runs the embedding pass"""
25 | return self.sigmoid(self.hidden(x))
26 |
27 |
28 | class FeedForwardTuner(BaseEstimator, TransformerMixin):
29 | """
30 | Create a feed forward model to finetune the embeddings towards a class.
31 |
32 | Arguments:
33 | hidden_dim: The size of the hidden layer
34 | n_epochs: The number of epochs to run the optimiser for
35 |         learning_rate: The learning rate of the feed forward model
36 |         batch_size: The batch size used during training
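37 |
38 |     Usage (a minimal sketch; `X` is assumed to be a numpy float array of
39 |     embeddings and `y` an array of class labels):
40 |
41 |     ```python
42 |     from embetter.finetune import FeedForwardTuner
43 |
44 |     tuner = FeedForwardTuner(hidden_dim=50)
45 |     tuner.fit(X, y)
46 |     X_new = tuner.transform(X)
47 |     ```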
36 | """
37 |
38 | def __init__(
39 | self, hidden_dim=50, n_epochs=500, learning_rate=0.01, batch_size=32
40 | ) -> None:
41 | self.hidden_dim = hidden_dim
42 | self.n_epochs = n_epochs
43 | self.learning_rate = learning_rate
44 | self.batch_size = batch_size
45 | self.label_enc = LabelEncoder()
46 |
47 | def fit(self, X, y):
48 | """Fits the finetuner."""
49 | return self.partial_fit(X, y, classes=np.unique(y))
50 |
51 | def partial_fit(self, X, y, classes=None):
52 | """Fits the finetuner using the partial_fit API."""
53 | if not hasattr(self, "_classes"):
54 | if classes is None:
55 | raise ValueError("`classes` must be provided for partial_fit")
56 | self._classes = classes
57 | self.label_enc.fit(classes)
58 | assert (self._classes == self.label_enc.classes_).all()
59 | # Create a model if it does not exist yet.
60 | if not hasattr(self, "_model"):
61 | self._model = FeedForwardModel(
62 | X.shape[1], self.hidden_dim, len(self._classes)
63 | )
64 | self._optimizer = torch.optim.Adam(
65 | self._model.parameters(), lr=self.learning_rate
66 | )
67 | self._criterion = nn.CrossEntropyLoss()
68 |
69 | torch_X = torch.from_numpy(X).detach().float()
70 | torch_y = torch.from_numpy(self.label_enc.transform(y)).detach()
71 |
72 | dataset = torch.utils.data.TensorDataset(torch_X, torch_y)
73 | dataloader = torch.utils.data.DataLoader(
74 | dataset, batch_size=self.batch_size, shuffle=True
75 | )
76 |
77 | for _ in range(self.n_epochs):
78 | for batch_X, batch_y in dataloader:
79 | self._optimizer.zero_grad()
80 | out = self._model(batch_X)
81 | loss = self._criterion(out, batch_y)
82 | loss.backward()
83 | self._optimizer.step()
84 |
85 | return self
86 |
87 | def transform(self, X, y=None):
88 | """Transforms the data according to the sklearn api by using the hidden layer."""
89 | Xt = torch.from_numpy(X).float().detach()
90 | return self._model.embed(Xt).detach().numpy()
91 |
--------------------------------------------------------------------------------
/embetter/finetune/_sbert_learn.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from sentence_transformers import SentenceTransformer, InputExample, losses
4 | from torch.utils.data import DataLoader
5 | from torch.nn import CosineSimilarity
6 |
7 |
8 | class SbertLearner:
9 | """
10 | A learner model that can finetune on pairs of data that leverages SBERT under the hood.
11 |
12 | It's similar to the scikit-learn models that you're used to, but it accepts
13 | two inputs `X1` and `X2` and tries to predict if they are similar.
14 |
15 | Arguments:
16 | sent_tfm: an instance of a `SentenceTransformer` that you'd like to finetune
17 | batch_size: the batch size during training
18 | epochs: the number of epochs to use while training
19 | warmup_steps: the number of warmup steps before training
20 |
21 | Usage:
22 |
23 | ```python
24 | from sentence_transformers import SentenceTransformer
25 | from embetter.finetune import SbertLearner
26 | import random
27 |
28 | sent_tfm = SentenceTransformer('all-MiniLM-L6-v2')
29 | learner = SbertLearner(sent_tfm)
30 |
31 | def sample_generator(examples, n_neg=3):
32 | # A generator that assumes examples to be a dictionary of the shape
33 | # {"text": "some text", "cats": {"label_a": True, "label_b": False}}
34 | # this is typically a function that's very custom to your use-case though
35 | labels = set()
36 | for ex in examples:
37 | for cat in ex['cats'].keys():
38 | if cat not in labels:
39 | labels = labels.union([cat])
40 | for label in labels:
41 | pos_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 1]
42 | neg_examples = [ex for ex in examples if label in ex['cats'] and ex['cats'][label] == 0]
43 | for ex in pos_examples:
44 | sample = random.choice(pos_examples)
45 | yield (ex['text'], sample['text'], 1.0)
46 | for n in range(n_neg):
47 | sample = random.choice(neg_examples)
48 | yield (ex['text'], sample['text'], 0.0)
49 |
50 | learn_examples = sample_generator(examples, n_neg=3)
51 | X1, X2, y = zip(*learn_examples)
52 |
53 | # Learn a new representation
54 | learner.fit(X1, X2, y)
55 |
56 | # You now have an updated model that can create more "finetuned" embeddings
57 | learner.transform(X1)
58 | learner.transform(X2)
59 | ```
60 |
61 |     After the learner is done training, it can be used inside of a scikit-learn pipeline as you normally would.
62 | """
63 |
64 | def __init__(
65 | self,
66 | sent_tfm: SentenceTransformer,
67 | batch_size: int = 16,
68 | epochs: int = 1,
69 | warmup_steps: int = 100,
70 | ):
71 | self.sent_tfm = sent_tfm
72 | self.batch_size = batch_size
73 | self.epochs = epochs
74 | self.warmup_steps = warmup_steps
75 |
76 | def fit(self, X1, X2, y):
77 | """Finetune an Sbert model based on similarities between two sets of texts."""
78 | train_examples = [
79 | InputExample(texts=[x1, x2], label=float(lab))
80 | for x1, x2, lab in zip(X1, X2, y)
81 | ]
82 |         data_loader = DataLoader(train_examples, shuffle=True, batch_size=self.batch_size)
83 | train_loss = losses.CosineSimilarityLoss(self.sent_tfm)
84 | self.sent_tfm.fit(
85 | train_objectives=[(data_loader, train_loss)],
86 | epochs=self.epochs,
87 | warmup_steps=self.warmup_steps,
88 | )
89 | return self
90 |
91 | def transform(self, X, y=None):
92 | """Encode a single batch of Sbert inputs (usually texts)."""
93 | return self.sent_tfm.encode(X)
94 |
95 | def predict(self, X1, X2):
96 | """Predicts the cosine similarity."""
97 |         emb1 = torch.from_numpy(self.transform(X1))
98 |         emb2 = torch.from_numpy(self.transform(X2))
99 | return np.array(CosineSimilarity(dim=1)(emb1, emb2))
100 |
101 | def to_disk(self, path):
102 | """Save the finetuned Sbert model."""
103 | self.sent_tfm.save(path=path)
104 |
--------------------------------------------------------------------------------
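As a follow-up to the docstring example: a minimal sketch of `predict`, which (given the torch conversion above) returns one cosine similarity per pair of texts. The toy labels are illustrative only.

```python
from sentence_transformers import SentenceTransformer
from embetter.finetune import SbertLearner  # assumed re-export

learner = SbertLearner(SentenceTransformer("all-MiniLM-L6-v2"))
learner.fit(X1=["hello", "hi"], X2=["greetings", "firetruck"], y=[1, 0])

# One similarity score per (X1, X2) pair, roughly in the [-1, 1] range.
learner.predict(X1=["hello there"], X2=["general kenobi"])
```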
/embetter/grab.py:
--------------------------------------------------------------------------------
1 | from embetter.base import EmbetterBase
2 |
3 |
4 | class ColumnGrabber(EmbetterBase):
5 | """
6 | Component that can grab a pandas column as a list.
7 |
8 | 
9 |
10 | This can be useful when dealing with text encoders as these
11 | sometimes cannot deal with pandas columns.
12 |
13 | Arguments:
14 | colname: the column name to grab from a dataframe
15 |
16 | **Usage**
17 |
18 |     In essence, the `ColumnGrabber` really just selects a single column.
19 |
20 | ```python
21 | import pandas as pd
22 | from embetter.grab import ColumnGrabber
23 |
24 |     # Let's say we start with a csv file with filepaths
25 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]}
26 | df = pd.DataFrame(data)
27 |
28 | # You can use the component in stand-alone fashion
29 | ColumnGrabber("filepaths").fit_transform(df)
30 | ```
31 |
32 | But the most common way to use the `ColumnGrabber` is part of a pipeline.
33 |
34 | ```python
35 | import pandas as pd
36 | from sklearn.pipeline import make_pipeline
37 |
38 | from embetter.grab import ColumnGrabber
39 | from embetter.vision import ImageLoader, ColorHistogramEncoder
40 |
41 |     # Let's say we start with a csv file with filepaths
42 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]}
43 | df = pd.DataFrame(data)
44 |
45 | # You can use the component in stand-alone fashion
46 | ColumnGrabber("filepaths").fit_transform(df)
47 |
48 | # But let's build a pipeline that grabs the column, turns it
49 | # into an image and embeds it.
50 | pipe = make_pipeline(
51 | ColumnGrabber("filepaths"),
52 | ImageLoader(),
53 | ColorHistogramEncoder()
54 | )
55 |
56 | pipe.fit_transform(df)
57 | ```
58 | """
59 |
60 | def __init__(self, colname: str) -> None:
61 | self.colname = colname
62 |
63 | def transform(self, X, y=None):
64 | """
65 | Takes a column from pandas and returns it as a list.
66 | """
67 | return [x for x in X[self.colname]]
68 |
69 |
70 | class KeyGrabber:
71 | """
72 | Effectively the same thing as the ColumnGrabber, except this is
73 | meant to work on generators of dictionaries instead of dataframes.
74 | """
75 |
76 | def __init__(self, colname: str) -> None:
77 | self.colname = colname
78 |
79 | def transform(self, X, y=None):
80 | """
81 |         Takes a key from a dictionary, or a sequence of dictionaries, and returns the values as a list.
82 | """
83 | if isinstance(X, dict):
84 | return X[self.colname]
85 | return [x[self.colname] for x in X]
86 |
--------------------------------------------------------------------------------
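The `KeyGrabber` above has no docstring example, so here is a short sketch of it on a list of dictionaries; it mirrors `ColumnGrabber`, but for dictionaries instead of dataframes.

```python
from embetter.grab import KeyGrabber

rows = [{"text": "hi", "foo": 1}, {"text": "yes", "foo": 2}]
KeyGrabber("text").transform(rows)  # ['hi', 'yes']
```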
/embetter/model/__init__.py:
--------------------------------------------------------------------------------
1 | from ._diff import DifferenceClassifier
2 |
3 | __all__ = ["DifferenceClassifier"]
4 |
--------------------------------------------------------------------------------
/embetter/model/_diff.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from sklearn.linear_model import LogisticRegression
4 | from sklearn.base import TransformerMixin, ClassifierMixin
5 |
6 |
7 | class DifferenceClassifier:
8 | """
9 | Classifier for similarity using encoders under the hood.
10 |
11 | It's similar to the scikit-learn models that you're used to, but it accepts
12 | two inputs `X1` and `X2` and tries to predict if they are similar. Effectively
13 |     it's just a classifier on top of `abs(X1 - X2)`.
14 |
15 | Arguments:
16 |         enc: scikit-learn compatible encoder of the input data
17 | clf_head: the classifier to apply at the end
18 |
19 | Usage:
20 |
21 | ```python
22 | from embetter.model import DifferenceClassifier
23 | from embetter.text import SentenceEncoder
24 |
25 | mod = DifferenceClassifier(enc=SentenceEncoder())
26 |
27 | # Suppose this is input data
28 | texts1 = ["hello", "firehydrant", "greetings"]
29 | texts2 = ["no", "yes", "greeting"]
30 |
31 | # You will need to have some definition of "similar"
32 | similar = [0, 0, 1]
33 |
34 | # Train a model to detect similarity
35 | mod.fit(X1=texts1, X2=texts2, y=similar)
36 | mod.predict(X1=texts1, X2=texts2)
37 |
38 | # The classifier head is a scikit-learn model, which you could save
39 |     # separately if you like. The model can be accessed via:
40 | mod.clf_head
41 | ```
42 | """
43 |
44 | def __init__(self, enc: TransformerMixin, clf_head: ClassifierMixin = None):
45 | self.enc = enc
46 | self.clf_head = (
47 | LogisticRegression(class_weight="balanced") if not clf_head else clf_head
48 | )
49 |
50 | def _calc_feats(self, X1, X2):
51 | enc1 = self.enc.transform(X1)
52 | enc2 = self.enc.transform(X2)
53 | return np.abs(enc1 - enc2)
54 |
55 | def fit(self, X1, X2, y):
56 | self.clf_head.fit(self._calc_feats(X1, X2), y)
57 | return self
58 |
59 | def predict(self, X1, X2):
60 | return self.clf_head.predict(self._calc_feats(X1, X2))
61 |
62 | def predict_proba(self, X1, X2):
63 | return self.clf_head.predict_proba(self._calc_feats(X1, X2))
64 |
--------------------------------------------------------------------------------
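Building on the docstring example: a small sketch of `predict_proba`, which simply forwards to the classifier head and therefore returns one row of class probabilities per pair.

```python
from embetter.model import DifferenceClassifier
from embetter.text import TextEncoder

mod = DifferenceClassifier(enc=TextEncoder())
mod.fit(X1=["hello", "firehydrant"], X2=["no", "greetings"], y=[0, 1])

# The second column holds the probability that a pair is "similar".
mod.predict_proba(X1=["hello"], X2=["howdy"])[:, 1]
```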
/embetter/multi/__init__.py:
--------------------------------------------------------------------------------
1 | from embetter.error import NotInstalled
2 |
3 | try:
4 | from embetter.multi._clip import ClipEncoder
5 | except ModuleNotFoundError:
6 |     ClipEncoder = NotInstalled("ClipEncoder", "sbert")
7 |
--------------------------------------------------------------------------------
/embetter/multi/_clip.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import torch
3 | from torch.nn import Linear
4 | from torch.quantization import quantize_dynamic
5 | from sentence_transformers import SentenceTransformer as SBERT
6 |
7 | from embetter.base import EmbetterBase
8 |
9 |
10 | class ClipEncoder(EmbetterBase):
11 | """
12 |     CLIP model that can encode text and images.
13 |
14 | Under the hood it just wraps around the implementation of [sentence-transformers](https://sbert.net/docs/pretrained_models.html?highlight=clip)
15 |
16 | Arguments:
17 | name: name of model, see available options
18 | device: manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available
19 | quantize: turns on quantization
20 |         num_threads: number of threads for pytorch to use, only has an effect when device=cpu
21 |
22 | The following model names should be supported:
23 |
24 | - `clip-ViT-B-32`
25 | - `clip-ViT-B-16`
26 |     - `clip-ViT-L-14`
27 | - `clip-ViT-B-32-multilingual-v1`
28 | """
29 |
30 | def __init__(
31 | self, name="clip-ViT-B-32", device=None, quantize=False, num_threads=None
32 | ):
33 | if not device:
34 | if torch.cuda.is_available():
35 | device = torch.device("cuda")
36 | elif torch.backends.mps.is_available():
37 | device = torch.device("mps")
38 | else:
39 | device = torch.device("cpu")
40 | self.name = name
41 | self.device = device
42 | self.tfm = SBERT(name, device=self.device)
43 | self.num_threads = num_threads
44 | self.quantize = quantize
45 | if quantize:
46 | self.tfm = quantize_dynamic(self.tfm, {Linear})
47 | if num_threads:
48 | if self.device.type == "cpu":
49 | torch.set_num_threads(num_threads)
50 |
51 | def transform(self, X, y=None):
52 | """Transforms the text into a numeric representation."""
53 |         # Convert pd.Series objects into a format that `encode` can handle
54 | if isinstance(X, pd.Series):
55 | X = X.to_numpy()
56 |
57 | return self.tfm.encode(X)
58 |
--------------------------------------------------------------------------------
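A hedged sketch of what makes `ClipEncoder` multi-modal: the same encoder can embed texts and `PIL.Image` objects into one space, assuming the underlying sentence-transformers CLIP model accepts PIL images in `encode` (which the sbert docs suggest).

```python
import pandas as pd
from sklearn.pipeline import make_pipeline

from embetter.grab import ColumnGrabber
from embetter.vision import ImageLoader
from embetter.multi import ClipEncoder

df = pd.DataFrame({"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]})

enc = ClipEncoder()
image_pipe = make_pipeline(ColumnGrabber("filepaths"), ImageLoader(), enc)

X_img = image_pipe.fit_transform(df)          # image embeddings
X_txt = enc.transform(["a photo of a cat"])   # text embeddings, same space
```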
/embetter/text/__init__.py:
--------------------------------------------------------------------------------
1 | from embetter.error import NotInstalled
2 | from embetter.text._model2vec import TextEncoder
3 |
4 | try:
5 | from embetter.text._sbert import SentenceEncoder, MatrouskaEncoder, MatryoshkaEncoder
6 | except ModuleNotFoundError:
7 | SentenceEncoder = NotInstalled("SentenceEncoder", "sbert")
8 | MatrouskaEncoder = NotInstalled("MatrouskaEncoder", "sbert")
9 | MatryoshkaEncoder = NotInstalled("MatryoshkaEncoder", "sbert")
10 |
11 | try:
12 | from embetter.text._s2v import Sense2VecEncoder
13 | except ModuleNotFoundError:
14 | Sense2VecEncoder = NotInstalled("Sense2VecEncoder", "sense2vec")
15 |
16 | try:
17 | from embetter.text._bpemb import BytePairEncoder
18 | except ModuleNotFoundError:
19 | BytePairEncoder = NotInstalled("BytePairEncoder", "bpemb")
20 |
21 | try:
22 | from embetter.text._spacy import spaCyEncoder
23 | except ModuleNotFoundError:
24 | spaCyEncoder = NotInstalled("spaCyEncoder", "spacy")
25 |
26 | try:
27 | from embetter.text._word2vec import GensimEncoder
28 | except ModuleNotFoundError:
29 | GensimEncoder = NotInstalled("GensimEncoder", "gensim")
30 |
31 | try:
32 | from embetter.text._keras import KerasNLPEncoder
33 | except (ImportError, ModuleNotFoundError):
34 | KerasNLPEncoder = NotInstalled("KerasNLPEncoder", "keras_nlp")
35 |
36 |
37 | from embetter.text._lite import LiteTextEncoder, learn_lite_text_embeddings
38 |
39 |
40 | __all__ = [
41 | "TextEncoder",
42 | "SentenceEncoder",
43 | "MatrouskaEncoder",
44 | "MatryoshkaEncoder",
45 | "Sense2VecEncoder",
46 | "BytePairEncoder",
47 | "spaCyEncoder",
48 | "GensimEncoder",
49 | "KerasNLPEncoder",
50 | "LiteTextEncoder",
51 | "learn_lite_text_embeddings",
52 | ]
53 |
--------------------------------------------------------------------------------
/embetter/text/_bpemb.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from pathlib import Path
3 |
4 | from bpemb import BPEmb
5 |
6 | from embetter.base import EmbetterBase
7 |
8 |
9 | class BytePairEncoder(EmbetterBase):
10 | """
11 |     This encoder wraps token-free pre-trained subword embeddings, originally created by
12 | Benjamin Heinzerling and Michael Strube.
13 |
14 |     These vectors are auto-downloaded by the [BPEmb package](https://nlp.h-its.org/bpemb/).
15 | You can also specify "multi" to download multi language embeddings. A full list of available
16 | languages can be found [here](https://nlp.h-its.org/bpemb). The article that
17 | belongs to this work can be found [here](http://www.lrec-conf.org/proceedings/lrec2018/pdf/1049.pdf)
18 |     The available vocabulary sizes and dimensionalities can be verified
19 | on the project website. See [here](https://nlp.h-its.org/bpemb/en/) for an
20 | example link in English. Please credit the original authors if you use their work.
21 |
22 | Arguments:
23 | lang: name of the model to load
24 | vs: vocabulary size of the byte pair model
25 | dim: the embedding dimensionality
26 | agg: the aggregation method to reduce many subword vectors into a single one, can be "max", "mean" or "both"
27 | cache_dir: The folder in which downloaded BPEmb files will be cached, can overwrite to custom folder.
28 |
29 | **Usage**
30 |
31 | ```python
32 | import pandas as pd
33 | from sklearn.pipeline import make_pipeline
34 | from sklearn.linear_model import LogisticRegression
35 |
36 | from embetter.grab import ColumnGrabber
37 | from embetter.text import BytePairEncoder
38 |
39 | # Let's suppose this is the input dataframe
40 | dataf = pd.DataFrame({
41 | "text": ["positive sentiment", "super negative"],
42 | "label_col": ["pos", "neg"]
43 | })
44 |
45 | # This pipeline grabs the `text` column from a dataframe
46 | # which then get fed into a small English model
47 | text_emb_pipeline = make_pipeline(
48 | ColumnGrabber("text"),
49 | BytePairEncoder(lang="en")
50 | )
51 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
52 |
53 | # This pipeline can also be trained to make predictions, using
54 | # the embedded features.
55 | text_clf_pipeline = make_pipeline(
56 | text_emb_pipeline,
57 | LogisticRegression()
58 | )
59 |
60 | # Prediction example
61 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
62 | ```
63 | """
64 |
65 | def __init__(
66 | self,
67 | lang: str,
68 | vs: int = 1000,
69 | dim: int = 25,
70 | agg: str = "mean",
71 | cache_dir: Path = None,
72 | ):
73 | self.lang = lang
74 | self.vs = vs
75 | self.dim = dim
76 | self.cache_dir = cache_dir
77 | self.agg = agg
78 | if not cache_dir:
79 | cache_dir = Path.home() / Path(".cache/bpemb")
80 | self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir)
81 |
82 | def fit(self, X, y=None):
83 | """No-op. Merely checks for object inputs per sklearn standard."""
84 | # Scikit-learn also expects this in the `.fit()` command.
85 | self._check_inputs(X)
86 | return self
87 |
88 | def _check_inputs(self, X):
89 | options = ["mean", "max", "both"]
90 | if self.agg not in options:
91 | raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")
92 |
93 | def transform(self, X, y=None):
94 | """Transforms the phrase text into a numeric representation."""
95 | self._check_inputs(X)
96 | if self.agg == "mean":
97 | return np.array([self.module.embed(x).mean(axis=0) for x in X])
98 | if self.agg == "max":
99 | return np.array([self.module.embed(x).max(axis=0) for x in X])
100 | if self.agg == "both":
101 |             mean_arr = np.array([self.module.embed(x).mean(axis=0) for x in X])
102 | max_arr = np.array([self.module.embed(x).max(axis=0) for x in X])
103 | return np.concatenate([mean_arr, max_arr], axis=1)
104 |
--------------------------------------------------------------------------------
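A short sketch of the `agg` options documented above (with the mean/max mixup fixed): "both" concatenates the mean- and max-pooled subword vectors, which doubles the output dimensionality.

```python
from embetter.text import BytePairEncoder

enc_mean = BytePairEncoder(lang="en", vs=1000, dim=25, agg="mean")
enc_both = BytePairEncoder(lang="en", vs=1000, dim=25, agg="both")

enc_mean.fit_transform(["hello world"]).shape  # (1, 25)
enc_both.fit_transform(["hello world"]).shape  # (1, 50)
```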
/embetter/text/_keras.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import keras_nlp
4 | from embetter.base import EmbetterBase
5 |
6 |
7 | class KerasNLPEncoder(EmbetterBase):
8 | """
9 | Encoder that can numerically encode sentences.
10 |
11 |     Arguments:
12 |         name: name of the model preset, see available options
13 |
14 |     Under the hood this wraps a keras-nlp BERT backbone together with its
15 |     matching preprocessor and returns the pooled output for each text.
16 |
17 | The pre-trained model names that you could use can be found [here](https://keras.io/api/keras_nlp/models/).
18 |
19 | **Usage**:
20 |
21 | You can leverage the multiple backends from keras-core by setting the `KERAS_BACKEND` environment variable.
22 |
23 | ```python
24 | import os
25 | # Pick the right setting
26 | os.environ["KERAS_BACKEND"] = "jax"
27 | os.environ["KERAS_BACKEND"] = "torch"
28 | os.environ["KERAS_BACKEND"] = "tensorflow"
29 | ```
30 |
31 | Once this is set, the following code will automatically use the right backend.
32 |
33 | ```python
34 | import pandas as pd
35 | from sklearn.pipeline import make_pipeline
36 | from sklearn.linear_model import LogisticRegression
37 |
38 | from embetter.grab import ColumnGrabber
39 |     from embetter.text import KerasNLPEncoder
40 |
41 | # Let's suppose this is the input dataframe
42 | dataf = pd.DataFrame({
43 | "text": ["positive sentiment", "super negative"],
44 | "label_col": ["pos", "neg"]
45 | })
46 |
47 | # This pipeline grabs the `text` column from a dataframe
48 |     # which then gets fed into a keras-nlp BERT backbone.
49 | text_emb_pipeline = make_pipeline(
50 | ColumnGrabber("text"),
51 | KerasNLPEncoder()
52 | )
53 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
54 |
55 | # This pipeline can also be trained to make predictions, using
56 | # the embedded features.
57 | text_clf_pipeline = make_pipeline(
58 | text_emb_pipeline,
59 | LogisticRegression()
60 | )
61 |
62 | # Prediction example
63 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
64 | ```
65 | """
66 |
67 | def __init__(self, name="bert_tiny_en_uncased"):
68 | self.name = name
69 | self.backbone = keras_nlp.models.BertBackbone.from_preset(name)
70 | self.preprocessor = keras_nlp.models.BertPreprocessor.from_preset(name)
71 |
72 | def transform(self, X, y=None):
73 | """Transforms the text into a numeric representation."""
74 | if isinstance(X, pd.Series):
75 | X = X.to_numpy()
76 | out = self.backbone(self.preprocessor(X))["pooled_output"]
77 |
78 | # Depending on the backend, return numpy by calling right methods.
79 | if keras_nlp.src.backend.config.backend() == "torch":
80 | return out.detach().numpy()
81 | else:
82 | return np.asarray(out)
83 |
--------------------------------------------------------------------------------
/embetter/text/_lite.py:
--------------------------------------------------------------------------------
1 | from sklearn.pipeline import make_pipeline
2 | from sklearn.decomposition import TruncatedSVD
3 | from sklearn.feature_extraction.text import TfidfVectorizer
4 |
5 | import itertools as it
6 | from skops.io import dump, load
7 |
8 |
9 | def learn_lite_text_embeddings(text_stream, dim=300, lite=True, path=None, **kwargs):
10 | """
11 | Function that can train a TF/iDF model followed by SVD to generate dense text representations.
12 |
13 | Arguments:
14 | path: path where model is saved
15 |
16 | This function can be used to load a model that's saved with `featherbed_textrepr`.
17 |
18 | **Usage**:
19 |
20 | You can leverage the multiple backends from keras-core by setting the `KERAS_BACKEND` environment variable.
21 |
22 | ```python
23 | from embetter.text import learn_lite_text_embeddings
24 |
25 | # Save a variable that contains the scikit-learn pipeline, but also store on disk.
26 | enc = learn_lite_text_embeddings(generator_of_strings, path="folder/embeddings.skops")
27 | ```
28 | """
29 | # Make two streams, keep memory footprint low
30 | stream1, stream2 = it.tee(text_stream)
31 |
32 | # Tf/Idf vectorizer can accept generators!
33 | tfidf = TfidfVectorizer(**kwargs).fit(stream1)
34 | X = tfidf.transform(stream2)
35 | if lite:
36 | # This makes a pretty big difference
37 | tfidf.idf_ = tfidf.idf_.astype("float16")
38 |
39 |     # Turn the sparse counts into a dense representation
40 | svd = TruncatedSVD(n_components=dim, **kwargs).fit(X)
41 |
42 | # This makes it much more lightweight to save
43 | if lite:
44 | svd.components_ = svd.components_.astype("float16")
45 | pipe = make_pipeline(tfidf, svd)
46 | if path:
47 |         # Store the pipeline to disk
48 | dump(pipe, path)
49 | return pipe
50 |
51 |
52 | def LiteTextEncoder(path):
53 | """
54 | Function that looks like class so that it fits the API.
55 |
56 | Arguments:
57 | path: path where model is saved
58 |
59 | This function can be used to load a model that's saved with `featherbed_textrepr`.
60 |
61 | **Usage**:
62 |
63 | You can leverage the multiple backends from keras-core by setting the `KERAS_BACKEND` environment variable.
64 |
65 | ```python
66 | from embetter.text import learn_lite_text_embeddings, LiteTextEncoder
67 |
68 | learn_lite_text_embeddings(generator_of_strings, path="folder/embeddings.skops")
69 |
70 | enc = LiteTextEncoder(path="folder/embeddings.skops")
71 | enc.transform(["encode this examples", "and this one"])
72 | ```
73 | """
74 | return load(path, trusted=True)
75 |
--------------------------------------------------------------------------------
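An end-to-end sketch tying the two functions above together. The `texts.txt` corpus file is hypothetical; any iterable of strings works, since the TF-IDF vectorizer accepts generators.

```python
from embetter.text import learn_lite_text_embeddings, LiteTextEncoder

# Hypothetical corpus file with one document per line.
with open("texts.txt") as f:
    learn_lite_text_embeddings((line.strip() for line in f), dim=100, path="embeddings.skops")

enc = LiteTextEncoder(path="embeddings.skops")
enc.transform(["a new document"])
```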
/embetter/text/_model2vec.py:
--------------------------------------------------------------------------------
1 |
2 | from model2vec import StaticModel
3 |
4 | from embetter.base import EmbetterBase
5 |
6 |
7 | class TextEncoder(EmbetterBase):
8 | """
9 | Encoder that can numerically encode text using a model from the model2vec library.
10 |
11 | The main benefit of this encoder is that it uses distilled word embeddings, which means that they are super *fast*.
12 |
13 | Arguments:
14 |         model: name of the model to use; you can also pass a model2vec StaticModel object directly
15 |
16 | The following model names should be supported:
17 |
18 | - `minishlab/potion-base-32M`
19 | - `minishlab/potion-base-8M`
20 | - `minishlab/potion-base-4M`
21 | - `minishlab/potion-base-2M`
22 | - `minishlab/potion-retrieval-32M`
23 | - `minishlab/M2V_multilingual_output`
24 |
25 |     You can find more options, and more information, on the [Github repository](https://github.com/MinishLab/model2vec?tab=readme-ov-file#model-list).
26 |
27 | **Usage**:
28 |
29 | ```python
30 | import pandas as pd
31 | from sklearn.pipeline import make_pipeline
32 | from sklearn.linear_model import LogisticRegression
33 |
34 | from embetter.grab import ColumnGrabber
35 | from embetter.text import TextEncoder
36 |
37 | # Let's suppose this is the input dataframe
38 | dataf = pd.DataFrame({
39 | "text": ["positive sentiment", "super negative"],
40 | "label_col": ["pos", "neg"]
41 | })
42 |
43 | # This pipeline grabs the `text` column from a dataframe
44 |     # which then gets fed into a model2vec static encoder.
45 | text_emb_pipeline = make_pipeline(
46 | ColumnGrabber("text"),
47 | TextEncoder()
48 | )
49 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
50 |
51 | # This pipeline can also be trained to make predictions, using
52 | # the embedded features.
53 | text_clf_pipeline = make_pipeline(
54 | text_emb_pipeline,
55 | LogisticRegression()
56 | )
57 |
58 | # Prediction example
59 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
60 | ```
61 | """
62 |
63 | def __init__(
64 | self, model="minishlab/potion-base-8M"
65 | ):
66 | if isinstance(model, str):
67 | self.model = StaticModel.from_pretrained(model)
68 | else:
69 | assert isinstance(model, StaticModel), "model must be a string or a StaticModel from model2vec"
70 | self.model = model
71 |
72 | def transform(self, X, y=None):
73 | """Transforms the text into a numeric representation."""
74 | return self.model.encode(X)
75 |
76 |
77 |
--------------------------------------------------------------------------------
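A sketch of the second construction path mentioned in the docstring above: passing a `model2vec` `StaticModel` object directly instead of a model name.

```python
from model2vec import StaticModel
from embetter.text import TextEncoder

static_model = StaticModel.from_pretrained("minishlab/potion-base-2M")
enc = TextEncoder(model=static_model)
enc.transform(["hello world"])
```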
/embetter/text/_s2v.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sense2vec import Sense2Vec
3 |
4 | from embetter.base import BaseEstimator
5 |
6 |
7 | class Sense2VecEncoder(BaseEstimator):
8 | """
9 | Create a [Sense2Vec encoder](https://github.com/explosion/sense2vec), meant to
10 | help when encoding phrases as opposed to sentences.
11 |
12 | Arguments:
13 | path: path to downloaded model
14 |
15 | **Usage**
16 |
17 | ```python
18 | import pandas as pd
19 | from sklearn.pipeline import make_pipeline
20 | from sklearn.linear_model import LogisticRegression
21 |
22 | from embetter.grab import ColumnGrabber
23 | from embetter.text import Sense2VecEncoder
24 |
25 | # Let's suppose this is the input dataframe
26 | dataf = pd.DataFrame({
27 | "text": ["positive sentiment", "super negative"],
28 | "label_col": ["pos", "neg"]
29 | })
30 |
31 | # This pipeline grabs the `text` column from a dataframe
32 | # which is then passed to the sense2vec model.
33 | text_emb_pipeline = make_pipeline(
34 | ColumnGrabber("text"),
35 | Sense2VecEncoder("path/to/s2v")
36 | )
37 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
38 | ```
39 | """
40 |
41 | def __init__(self, path: str):
42 | self.path = path
43 | self.s2v = Sense2Vec().from_disk(self.path)
44 | self.shape = self.s2v["duck|NOUN"].shape
45 |
46 | def _to_vector(self, text):
47 | sense = self.s2v.get_best_sense(text)
48 | if not sense:
49 | return np.zeros(shape=self.shape)
50 | return self.s2v[sense]
51 |
52 | def transform(self, X, y=None):
53 | """Transforms the phrase text into a numeric representation."""
54 | return np.array([self._to_vector(x) for x in X])
55 |
--------------------------------------------------------------------------------
/embetter/text/_sbert.py:
--------------------------------------------------------------------------------
1 | from warnings import warn
2 |
3 | import pandas as pd
4 | import torch
5 | from torch.nn import Linear
6 | from torch.quantization import quantize_dynamic
7 | from sentence_transformers import SentenceTransformer as SBERT
8 |
9 | from embetter.base import EmbetterBase
10 |
11 |
12 | class SentenceEncoder(EmbetterBase):
13 | """
14 | Encoder that can numerically encode sentences.
15 |
16 | Arguments:
17 | name: name of model, see available options
18 | device: manually override cpu/mps/gpu device, tries to grab gpu or mps automatically when available
19 | quantize: turns on quantization
20 |         num_threads: number of threads for pytorch to use, only has an effect when device=cpu
21 |
22 | The following model names should be supported:
23 |
24 | - `all-mpnet-base-v2`
25 | - `multi-qa-mpnet-base-dot-v1`
26 | - `all-distilroberta-v1`
27 | - `all-MiniLM-L12-v2`
28 | - `multi-qa-distilbert-cos-v1`
29 | - `all-MiniLM-L6-v2`
30 | - `multi-qa-MiniLM-L6-cos-v1`
31 | - `paraphrase-multilingual-mpnet-base-v2`
32 | - `paraphrase-albert-small-v2`
33 | - `paraphrase-multilingual-MiniLM-L12-v2`
34 | - `paraphrase-MiniLM-L3-v2`
35 | - `distiluse-base-multilingual-cased-v1`
36 | - `distiluse-base-multilingual-cased-v2`
37 |
38 |     You can find more options, and more information, on the [sentence-transformers docs page](https://www.sbert.net/docs/pretrained_models.html#model-overview).
39 |
40 | **Usage**:
41 |
42 | ```python
43 | import pandas as pd
44 | from sklearn.pipeline import make_pipeline
45 | from sklearn.linear_model import LogisticRegression
46 |
47 | from embetter.grab import ColumnGrabber
48 | from embetter.text import SentenceEncoder
49 |
50 | # Let's suppose this is the input dataframe
51 | dataf = pd.DataFrame({
52 | "text": ["positive sentiment", "super negative"],
53 | "label_col": ["pos", "neg"]
54 | })
55 |
56 | # This pipeline grabs the `text` column from a dataframe
57 |     # which then gets fed into Sentence-Transformers' all-MiniLM-L6-v2.
58 | text_emb_pipeline = make_pipeline(
59 | ColumnGrabber("text"),
60 | SentenceEncoder('all-MiniLM-L6-v2')
61 | )
62 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
63 |
64 | # This pipeline can also be trained to make predictions, using
65 | # the embedded features.
66 | text_clf_pipeline = make_pipeline(
67 | text_emb_pipeline,
68 | LogisticRegression()
69 | )
70 |
71 | # Prediction example
72 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
73 | ```
74 | """
75 |
76 | def __init__(
77 | self, name="all-MiniLM-L6-v2", device=None, quantize=False, num_threads=None
78 | ):
79 | if not device:
80 | if torch.cuda.is_available():
81 | device = torch.device("cuda")
82 | elif torch.backends.mps.is_available():
83 | device = torch.device("mps")
84 | else:
85 | device = torch.device("cpu")
86 | self.name = name
87 | self.device = device
88 | self.tfm = SBERT(name, device=self.device)
89 | self.num_threads = num_threads
90 | self.quantize = quantize
91 | if quantize:
92 | self.tfm = quantize_dynamic(self.tfm, {Linear})
93 | if num_threads:
94 | if self.device.type == "cpu":
95 | torch.set_num_threads(num_threads)
96 |
97 | def transform(self, X, y=None):
98 | """Transforms the text into a numeric representation."""
99 |         # Convert pd.Series objects into a format that `encode` can handle
100 | if isinstance(X, pd.Series):
101 | X = X.to_numpy()
102 |
103 | return self.tfm.encode(X)
104 |
105 |
106 | def MatrouskaEncoder(name="tomaarsen/mpnet-base-nli-matryoshka", **kwargs):
107 | warn(
108 | "Please use `MatryoshkaEncoder` instead of `MatrouskaEncoder."
109 | "We will use correct spelling going forward and `MatrouskaEncoder` will be deprecated.",
110 | DeprecationWarning,
111 | )
112 | return MatryoshkaEncoder(name="tomaarsen/mpnet-base-nli-matryoshka", **kwargs)
113 |
114 |
115 | def MatryoshkaEncoder(name="tomaarsen/mpnet-base-nli-matryoshka", **kwargs):
116 | """
117 | Encoder that can numerically encode sentences.
118 |
119 | This function, which looks like a class, offers a shorthand way to fetch pretrained
120 | [Matryoshka embeddings](https://www.sbert.net/examples/training/matryoshka/README.html).
121 | Under the hood it just returns a `SentenceEncoder` object, but the default name points
122 | to a pretrained Matryoshka model.
123 |
124 | These embeddings are more flexible in the sense that you can more easily reduce the
125 | dimensions without losing as much information. The aforementioned docs give more details
126 |
127 | **Usage**:
128 |
129 | ```python
130 | import pandas as pd
131 | from sklearn.pipeline import make_pipeline
132 | from sklearn.linear_model import LogisticRegression
133 |
134 | from embetter.grab import ColumnGrabber
135 |     from embetter.text import MatryoshkaEncoder
136 |
137 | # Let's suppose this is the input dataframe
138 | dataf = pd.DataFrame({
139 | "text": ["positive sentiment", "super negative"],
140 | "label_col": ["pos", "neg"]
141 | })
142 |
143 | # This pipeline grabs the `text` column from a dataframe
144 |     # which then gets fed into a pretrained Matryoshka model.
145 | text_emb_pipeline = make_pipeline(
146 | ColumnGrabber("text"),
147 | MatryoshkaEncoder()
148 | )
149 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
150 |
151 | # This pipeline can also be trained to make predictions, using
152 | # the embedded features.
153 | text_clf_pipeline = make_pipeline(
154 | text_emb_pipeline,
155 | LogisticRegression()
156 | )
157 |
158 | # Prediction example
159 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
160 | ```
161 | """
162 | return SentenceEncoder(name=name, **kwargs)
163 |
--------------------------------------------------------------------------------
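A sketch of the flexibility claim in the `MatryoshkaEncoder` docstring: because of how Matryoshka models are trained, you can truncate an embedding to its leading dimensions and keep a useful representation. The cutoff of 256 below is an arbitrary illustration.

```python
from embetter.text import MatryoshkaEncoder

emb = MatryoshkaEncoder().transform(["hello there"])

# Keep only the first 256 dimensions of each embedding.
emb_small = emb[:, :256]
```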
/embetter/text/_spacy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Union
3 |
4 | import spacy
5 | from spacy.language import Language
6 |
7 | from embetter.base import EmbetterBase
8 |
9 |
10 | class spaCyEncoder(EmbetterBase):
11 | """
12 | **Usage**
13 |
14 | ```python
15 | import pandas as pd
16 | from sklearn.pipeline import make_pipeline
17 | from sklearn.linear_model import LogisticRegression
18 |
19 | from embetter.grab import ColumnGrabber
20 | from embetter.text import spaCyEncoder
21 |
22 | # Let's suppose this is the input dataframe
23 | dataf = pd.DataFrame({
24 | "text": ["positive sentiment", "super negative"],
25 | "label_col": ["pos", "neg"]
26 | })
27 |
28 | # This pipeline grabs the `text` column from a dataframe
29 | # which is then passed to the medium spaCy model.
30 | text_emb_pipeline = make_pipeline(
31 | ColumnGrabber("text"),
32 | spaCyEncoder("en_core_web_md")
33 | )
34 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
35 |
36 | # This pipeline can also be trained to make predictions, using
37 | # the embedded features.
38 | text_clf_pipeline = make_pipeline(
39 | text_emb_pipeline,
40 | LogisticRegression()
41 | )
42 |
43 | # Prediction example
44 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
45 | ```
46 | """
47 |
48 | def __init__(self, nlp: Union[str, Language], agg: str = "base"):
49 | if isinstance(nlp, str):
50 | self.nlp = spacy.load(nlp, disable=["ner", "tagger", "parser"])
51 | elif isinstance(nlp, Language):
52 | self.nlp = nlp
53 | else:
54 | raise ValueError("`nlp` must be `str` or spaCy-language object.")
55 | self.agg = agg
56 |
57 | def fit(self, X, y=None):
58 | """No-op. Merely checks for object inputs per sklearn standard."""
59 | # Scikit-learn also expects this in the `.fit()` command.
60 | self._check_inputs(X)
61 | return self
62 |
63 | def _check_inputs(self, X):
64 | options = ["mean", "max", "both", "base"]
65 | if self.agg not in options:
66 | raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")
67 |
68 | def transform(self, X, y=None):
69 | """Transforms the phrase text into a numeric representation."""
70 | self._check_inputs(X)
71 | docs = self.nlp.pipe(X)
72 | if self.agg == "base":
73 | return np.array([d.vector for d in docs])
74 | token_vectors = [np.array([tok.vector for tok in doc]) for doc in docs]
75 | if self.agg == "mean":
76 | return np.array([v.mean(axis=0) for v in token_vectors])
77 | if self.agg == "max":
78 | return np.array([v.max(axis=0) for v in token_vectors])
79 | if self.agg == "both":
80 | mean_arr = np.array([v.mean(axis=0) for v in token_vectors])
81 | max_arr = np.array([v.max(axis=0) for v in token_vectors])
82 | return np.concatenate([mean_arr, max_arr], axis=1)
83 |
--------------------------------------------------------------------------------
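The `spaCyEncoder` docstring doesn't spell out its arguments, so as a note: `nlp` may be a model name or a loaded `Language` object, and `agg` selects how token vectors are pooled. A sketch, assuming `en_core_web_md` (300-dimensional vectors) is installed:

```python
from embetter.text import spaCyEncoder

enc_base = spaCyEncoder("en_core_web_md", agg="base")  # built-in document vector
enc_both = spaCyEncoder("en_core_web_md", agg="both")  # mean and max pooled, concatenated

enc_base.fit_transform(["hello world"]).shape  # (1, 300)
enc_both.fit_transform(["hello world"]).shape  # (1, 600)
```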
/embetter/text/_word2vec.py:
--------------------------------------------------------------------------------
1 | from typing import List, Literal, Union
2 |
3 | import numpy as np
4 | from gensim import downloader
5 | from gensim.models import KeyedVectors, Word2Vec
6 | from gensim.utils import SaveLoad, tokenize
7 |
8 | from embetter.base import EmbetterBase
9 |
10 |
11 | class GensimEncoder(EmbetterBase):
12 | """
13 | Encodes text using a static word embedding model. The component uses gensim's default tokenizer.
14 |
15 | Arguments:
16 | model: Model name, path to model on disk, Word2Vec instance or KeyedVectors instance.
17 | agg: Way to aggregate the word embeddings in a document. Can either take the maximum, mean or both of them concatenated.
18 | deacc: Specifies whether accents should be removed when tokenizing the text.
19 | lowercase: Specifies whether the text should be lowercased during tokenization.
20 |
21 | Currently the following models are supported by default:
22 | - `conceptnet-numberbatch-17-06-300`
23 | - `word2vec-ruscorpora-300`
24 | - `word2vec-google-news-300`
25 | - `glove-wiki-gigaword-50`
26 | - `glove-wiki-gigaword-100`
27 | - `glove-wiki-gigaword-200`
28 | - `glove-wiki-gigaword-300`
29 | - `glove-twitter-25`
30 | - `glove-twitter-50`
31 | - `glove-twitter-100`
32 | - `glove-twitter-200`
33 |
34 | **Usage**
35 |
36 | ```python
37 | import pandas as pd
38 | from sklearn.pipeline import make_pipeline
39 | from sklearn.linear_model import LogisticRegression
40 |
41 | from embetter.grab import ColumnGrabber
42 |     from embetter.text import GensimEncoder
43 |
44 | # Let's suppose this is the input dataframe
45 | dataf = pd.DataFrame({
46 | "text": ["positive sentiment", "super negative"],
47 | "label_col": ["pos", "neg"]
48 | })
49 |
50 | # This pipeline grabs the `text` column from a dataframe
51 | # which is then passed to a Word2Vec model.
52 | text_emb_pipeline = make_pipeline(
53 | ColumnGrabber("text"),
54 |         GensimEncoder("glove-wiki-gigaword-50")
55 | )
56 | X = text_emb_pipeline.fit_transform(dataf, dataf['label_col'])
57 |
58 | # This pipeline can also be trained to make predictions, using
59 | # the embedded features.
60 | text_clf_pipeline = make_pipeline(
61 | text_emb_pipeline,
62 | LogisticRegression()
63 | )
64 |
65 | # Prediction example
66 | text_clf_pipeline.fit(dataf, dataf['label_col']).predict(dataf)
67 | ```
68 | """
69 |
70 | def __init__(
71 | self,
72 | model: Union[str, Word2Vec, KeyedVectors] = "word2vec-google-news-300",
73 | agg: Literal["mean", "max", "both"] = "mean",
74 | deacc: bool = False,
75 | lowercase: bool = False,
76 | ):
77 | self.model = model
78 | if isinstance(model, str):
79 | if model in downloader.info()["models"]:
80 | self.keyed_vectors: KeyedVectors = downloader.load(model) # type: ignore
81 | else:
82 | loaded_object = SaveLoad().load(self.model)
83 | if isinstance(loaded_object, Word2Vec):
84 | self.keyed_vectors = loaded_object.wv
85 | elif isinstance(loaded_object, KeyedVectors):
86 | self.keyed_vectors = loaded_object
87 | else:
88 | raise TypeError(
89 | "Object loaded from disk is not Word2Vec nor a KeyedVectors instance."
90 | )
91 | elif isinstance(model, Word2Vec):
92 | self.keyed_vectors: KeyedVectors = model.wv
93 | elif isinstance(model, KeyedVectors):
94 | self.keyed_vectors: KeyedVectors = model
95 | else:
96 | raise TypeError(
97 | f"You should pass a model name, keyed vectors or a Word2Vec model to Word2VecEncoder, not {type(model)}"
98 | )
99 | self.agg = agg
100 | self.deacc = deacc
101 | self.lowercase = lowercase
102 | self.n_features_out = (
103 | self.keyed_vectors.vector_size
104 | if self.agg != "both"
105 | else self.keyed_vectors.vector_size * 2
106 | )
107 |
108 | def fit(self, X, y=None):
109 | """No-op. Merely checks for object inputs per sklearn standard."""
110 | # Scikit-learn also expects this in the `.fit()` command.
111 | self._check_inputs(X)
112 | return self
113 |
114 | def _check_inputs(self, X):
115 | options = ["mean", "max", "both"]
116 | if self.agg not in options:
117 | raise ValueError(f"The `agg` value must be in {options}. Got {self.agg}.")
118 |
119 | def _tokenize(self, X) -> List[List[int]]:
120 | token_indices = []
121 | for text in X:
122 | tokens = tokenize(text, deacc=self.deacc, lowercase=self.lowercase)
123 | indices = []
124 | for token in tokens:
125 | index = self.keyed_vectors.get_index(token, default=-1)
126 | if index != -1:
127 | indices.append(index)
128 | token_indices.append(indices)
129 | return token_indices
130 |
131 | def transform(self, X, y=None):
132 | """Transforms the phrase text into a numeric representation using word embeddings."""
133 | self._check_inputs(X)
134 | tokens = self._tokenize(X)
135 | embeddings = np.empty((len(X), self.n_features_out))
136 | for i_doc, token_indices in enumerate(tokens):
137 |             if not len(token_indices):
138 |                 embeddings[i_doc, :] = np.nan
139 |                 continue
140 |             doc_vectors = self.keyed_vectors.vectors[token_indices]
141 |             if self.agg == "mean":
142 |                 embeddings[i_doc, :] = np.mean(doc_vectors, axis=0)
143 |             elif self.agg == "max":
144 |                 embeddings[i_doc, :] = np.max(doc_vectors, axis=0)
145 |             elif self.agg == "both":
146 |                 mean_vector = np.mean(doc_vectors, axis=0)
147 |                 max_vector = np.max(doc_vectors, axis=0)
148 |                 embeddings[i_doc, :] = np.concatenate((mean_vector, max_vector))
149 |         return embeddings
150 |
--------------------------------------------------------------------------------
/embetter/utils.py:
--------------------------------------------------------------------------------
1 | from itertools import islice
2 | from typing import Callable, Iterable
3 |
4 | import numpy as np
5 | from diskcache import Cache
6 | from sklearn.base import BaseEstimator
7 | from sklearn.metrics import pairwise_distances
8 |
9 |
10 | def cached(name: str, pipeline: BaseEstimator):
11 | """
12 | Uses a [diskcache](https://grantjenks.com/docs/diskcache/tutorial.html) in
13 | an attempt to fetch precalculated embeddings from disk instead of inferring them.
14 | This can save on compute, but also cloud credits, depending on the backend
15 | that you're using to generate embeddings.
16 |
17 |     Be mindful of what goes into the encoder that you choose. It's preferable to give it
18 | text as opposed to numpy arrays. Also note that the first time that you'll run this
19 | it will take more time due to the overhead of writing into the cache.
20 |
21 | Arguments:
22 | name: the name of the local folder to represent the disk cache
23 | pipeline: the pipeline that you want to cache
24 |
25 | Usage:
26 | ```python
27 | from embetter.text import SentenceEncoder
28 | from embetter.utils import cached
29 |
30 | encoder = cached("sentence-enc", SentenceEncoder('all-MiniLM-L6-v2'))
31 |
32 | examples = [f"this is a pretty long text, which is more expensive {i}" for i in range(10_000)]
33 |
34 | # This might be a bit slow ~17.2s on our machine
35 | encoder.transform(examples)
36 |
37 | # This should be quicker ~4.71s on our machine
38 | encoder.transform(examples)
39 | ```
40 |
41 | Note that you're also able to fetch the precalculated embeddings directly via:
42 |
43 | ```python
44 | from diskcache import Cache
45 |
46 | # Make sure that you use the same name as in `cached`
47 | cache = Cache("sentence-enc")
48 | # Use a string as a key, if it's precalculated you'll get an array back.
49 | cache["this is a pretty long text, which is more expensive 0"]
50 | ```
51 | """
52 | cache = Cache(name)
53 |
54 | def run_cached(method: Callable):
55 | def wrapped(X, y=None):
56 | results = {i: cache[x] if x in cache else "TODO" for i, x in enumerate(X)}
57 | text_todo = [X[i] for i, x in results.items() if str(x) == "TODO"]
58 | i_todo = [i for i, x in results.items() if str(x) == "TODO"]
59 | out = method(text_todo)
60 | with Cache(cache.directory) as reference:
61 | for i, text, x_tfm in zip(i_todo, text_todo, out):
62 | results[i] = x_tfm
63 | reference.set(text, x_tfm)
64 | return np.array([arr for i, arr in results.items()])
65 |
66 | return wrapped
67 |
68 | pipeline.transform = run_cached(pipeline.transform)
69 |
70 | return pipeline
71 |
72 |
73 | def batched(iterable: Iterable, n: int = 64):
74 | """
75 | Takes an iterable and turns it into a batched iterable.
76 |
77 | Arguments:
78 | iterable: the input stream
79 | n: the batch size
80 | """
81 | if n < 1:
82 | raise ValueError("n must be at least one")
83 |     it = iter(iterable)
84 |     while batch := tuple(islice(it, n)):
85 |         yield batch
86 |
87 |
88 | def calc_distances(
89 | inputs,
90 | anchors,
91 | pipeline,
92 | anchor_pipeline=None,
93 | metric="cosine",
94 | aggregate=np.max,
95 | n_jobs=None,
96 | ):
97 | """
98 | Shortcut to compare a sequence of inputs to a set of anchors.
99 |
100 | The available metrics are: `cityblock`,`cosine`,`euclidean`,`haversine`,`l1`,`l2`,`manhattan` and `nan_euclidean`.
101 |
102 | You can read a verbose description of the metrics [here](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html#sklearn.metrics.pairwise.distance_metrics).
103 |
104 | Arguments:
105 | inputs: sequence of inputs to calculate scores for
106 | anchors: set/list of anchors to compare against
107 | pipeline: the pipeline to use to calculate the embeddings
108 | anchor_pipeline: the pipeline to apply to the anchors, meant to be used if the anchors should use a different pipeline
109 | metric: the distance metric to use
110 | aggregate: you'll want to aggregate the distances to the different anchors down to a single metric, numpy functions that offer axis=1, like `np.max` and `np.mean`, can be used
111 | n_jobs: set to -1 to use all cores for calculation
112 | """
113 | X_input = pipeline.transform(inputs)
114 | if anchor_pipeline:
115 | X_anchors = anchor_pipeline.transform(anchors)
116 | else:
117 | X_anchors = pipeline.transform(anchors)
118 |
119 | X_dist = pairwise_distances(X_input, X_anchors, metric=metric, n_jobs=n_jobs)
120 | return aggregate(X_dist, axis=1)
121 |
--------------------------------------------------------------------------------
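A hedged sketch of `calc_distances`, which has no docstring example: score inputs by their distance to a set of anchor phrases, using any embetter encoder as the pipeline. Using `np.min` as the aggregate makes the score the distance to the nearest anchor.

```python
import numpy as np

from embetter.text import TextEncoder
from embetter.utils import calc_distances

texts = ["i love this product", "the food was terrible"]
anchors = ["positive sentiment", "negative sentiment"]

dists = calc_distances(texts, anchors, pipeline=TextEncoder(), metric="cosine", aggregate=np.min)
```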
/embetter/vision/__init__.py:
--------------------------------------------------------------------------------
1 | from embetter.error import NotInstalled
2 | from embetter.vision._colorhist import ColorHistogramEncoder
3 | from embetter.vision._loader import ImageLoader
4 |
5 | try:
6 | from embetter.vision._torchvis import TimmEncoder
7 | except ModuleNotFoundError:
8 | TimmEncoder = NotInstalled("TimmEncoder", "vision")
9 |
10 |
11 | __all__ = ["ImageLoader", "ColorHistogramEncoder", "TimmEncoder"]
12 |
--------------------------------------------------------------------------------
/embetter/vision/_colorhist.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from embetter.base import EmbetterBase
4 |
5 |
6 | class ColorHistogramEncoder(EmbetterBase):
7 | """
8 | Encoder that generates an embedding based on the color histogram of the image.
9 |
10 | Arguments:
11 | n_buckets: number of buckets per color
12 |
13 | **Usage**:
14 |
15 | ```python
16 | import pandas as pd
17 | from sklearn.pipeline import make_pipeline
18 |
19 | from embetter.grab import ColumnGrabber
20 | from embetter.vision import ImageLoader, ColorHistogramEncoder
21 |
22 |     # Let's say we start with a csv file with filepaths
23 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]}
24 | df = pd.DataFrame(data)
25 |
26 | # Let's build a pipeline that grabs the column, turns it
27 | # into an image and embeds it.
28 | pipe = make_pipeline(
29 | ColumnGrabber("filepaths"),
30 | ImageLoader(),
31 | ColorHistogramEncoder()
32 | )
33 |
34 | # This pipeline can now encode each image in the dataframe
35 | pipe.fit_transform(df)
36 | ```
37 | """
38 |
39 | def __init__(self, n_buckets=256):
40 | self.n_buckets = n_buckets
41 |
42 | def transform(self, X, y=None):
43 | """
44 | Takes a sequence of `PIL.Image` and returns a numpy array representing
45 | a color histogram for each.
46 | """
47 | output = np.zeros((len(X), self.n_buckets * 3))
48 | for i, x in enumerate(X):
49 | arr = np.array(x)
50 | output[i, :] = np.concatenate(
51 | [
52 | np.histogram(
53 | arr[:, :, 0].flatten(),
54 | bins=np.linspace(0, 255, self.n_buckets + 1),
55 | )[0],
56 | np.histogram(
57 | arr[:, :, 1].flatten(),
58 | bins=np.linspace(0, 255, self.n_buckets + 1),
59 | )[0],
60 | np.histogram(
61 | arr[:, :, 2].flatten(),
62 | bins=np.linspace(0, 255, self.n_buckets + 1),
63 | )[0],
64 | ]
65 | )
66 | return output
67 |
--------------------------------------------------------------------------------
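A sketch of the output shape: the encoder produces `n_buckets` histogram bins per RGB channel, so the embedding width is `3 * n_buckets`.

```python
from embetter.vision import ImageLoader, ColorHistogramEncoder

imgs = ImageLoader().fit_transform(["tests/data/thiscatdoesnotexist.jpeg"])
ColorHistogramEncoder(n_buckets=64).fit_transform(imgs).shape  # (1, 192)
```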
/embetter/vision/_loader.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from PIL import Image
3 |
4 | from embetter.base import EmbetterBase
5 |
6 |
7 | class ImageLoader(EmbetterBase):
8 | """
9 | Component that can turn filepaths into a list of PIL.Image objects.
10 |
11 | Arguments:
12 | convert: Color [conversion setting](https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.convert) from the Python image library.
13 | out: What kind of image output format to expect.
14 |
15 | **Usage**
16 |
17 | You can use the `ImageLoader` in standalone fashion.
18 |
19 | ```python
20 | from embetter.vision import ImageLoader
21 |
22 | filepath = "tests/data/thiscatdoesnotexist.jpeg"
23 | ImageLoader(convert="RGB").fit_transform([filepath])
24 | ```
25 |
26 | But it's more common to see it part of a pipeline.
27 |
28 | ```python
29 | import pandas as pd
30 | from sklearn.pipeline import make_pipeline
31 |
32 | from embetter.grab import ColumnGrabber
33 | from embetter.vision import ImageLoader, ColorHistogramEncoder
34 |
35 |     # Let's say we start with a csv file with filepaths
36 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]}
37 | df = pd.DataFrame(data)
38 |
39 | # Let's build a pipeline that grabs the column, turns it
40 | # into an image and embeds it.
41 | pipe = make_pipeline(
42 | ColumnGrabber("filepaths"),
43 | ImageLoader(),
44 | ColorHistogramEncoder()
45 | )
46 |
47 | pipe.fit_transform(df)
48 | ```
49 |
50 | """
51 |
52 | def __init__(self, convert: str = "RGB", out: str = "pil") -> None:
53 | self.convert = convert
54 | self.out = out
55 |
56 | def fit(self, X, y=None):
57 | """
58 |         No actual "fitting" happens in this method, but it does check the input arguments
59 | per sklearn convention.
60 | """
61 | if self.out not in ["pil", "numpy"]:
62 | raise ValueError(
63 | f"Output format parameter out={self.out} must be either pil/numpy."
64 | )
65 | return self
66 |
67 | def transform(self, X, y=None):
68 | """
69 |         Turns file paths into `PIL.Image` objects or numpy arrays, depending on the `out` setting.
70 | """
71 | if self.out == "pil":
72 | return [Image.open(x).convert(self.convert) for x in X]
73 | if self.out == "numpy":
74 | return np.array([np.array(Image.open(x).convert(self.convert)) for x in X])
75 |
--------------------------------------------------------------------------------
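A sketch of the "numpy" output mode. Note that `np.array` over images of different sizes would produce a ragged object array, so this mode is most useful when all images share the same dimensions.

```python
from embetter.vision import ImageLoader

arr = ImageLoader(out="numpy").fit_transform(["tests/data/thiscatdoesnotexist.jpeg"])
arr.shape  # (1, height, width, 3)
```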
/embetter/vision/_torchvis.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import timm
3 | from timm.data import resolve_data_config
4 | from timm.data.transforms_factory import create_transform
5 |
6 | from embetter.base import EmbetterBase
7 |
8 |
9 | class TimmEncoder(EmbetterBase):
10 | """
11 |     Use a pretrained vision model to generate embeddings. Embeddings
12 |     are provided via the lovely `timm` library.
13 |
14 | You can find a list of available models [here](https://rwightman.github.io/pytorch-image-models/models/).
15 |
16 | Arguments:
17 | name: name of the model to use
18 | encode_predictions: output the predictions instead of the pooled embedding layer before
19 |
20 | **Usage**:
21 |
22 | ```python
23 | import pandas as pd
24 | from sklearn.pipeline import make_pipeline
25 |
26 | from embetter.grab import ColumnGrabber
27 | from embetter.vision import ImageLoader, TimmEncoder
28 |
29 |     # Let's say we start with a csv file with filepaths
30 | data = {"filepaths": ["tests/data/thiscatdoesnotexist.jpeg"]}
31 | df = pd.DataFrame(data)
32 |
33 | # Let's build a pipeline that grabs the column, turns it
34 | # into an image and embeds it.
35 | pipe = make_pipeline(
36 | ColumnGrabber("filepaths"),
37 | ImageLoader(),
38 | TimmEncoder(name="mobilenetv3_large_100")
39 | )
40 |
41 | # This pipeline can now encode each image in the dataframe
42 | pipe.fit_transform(df)
43 | ```
44 | """
45 |
46 | def __init__(self, name="mobilenetv3_large_100", encode_predictions=False):
47 | self.name = name
48 | self.encode_predictions = encode_predictions
49 | self.model = timm.create_model(name, pretrained=True, num_classes=0)
50 | if self.encode_predictions:
51 | self.model = timm.create_model(name, pretrained=True)
52 | self.config = resolve_data_config({}, model=self.model)
53 | self.transform_img = create_transform(**self.config)
54 |
55 | def transform(self, X, y=None):
56 | """
57 | Transforms grabbed images into numeric representations.
58 | """
59 | batch = [self.transform_img(x).unsqueeze(0) for x in X]
60 | return np.array([self.model(x).squeeze(0).detach().numpy() for x in batch])
61 |
--------------------------------------------------------------------------------
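A sketch of the `encode_predictions` flag: when turned on, the classification head is kept, so images are represented by class logits rather than pooled features. The exact output width depends on the chosen model.

```python
from embetter.vision import ImageLoader, TimmEncoder

imgs = ImageLoader().fit_transform(["tests/data/thiscatdoesnotexist.jpeg"])

feats = TimmEncoder("mobilenetv3_large_100").transform(imgs)
logits = TimmEncoder("mobilenetv3_large_100", encode_predictions=True).transform(imgs)
```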
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Embetter Docs
2 | site_description: 'Scikit-Learn compatible embeddings'
3 | site_copy: Embetter offers embeddings for rapid-prototyping and finetuning in scikit-learn.
4 | repo_url: https://github.com/koaning/embetter
5 | nav:
6 | - Home: index.md
7 | - Techniques: applications.md
8 | - API:
9 | - Text: API/text.md
10 | - Vision: API/vision.md
11 | - MultiModal: API/multimodal.md
12 | - External: API/external.md
13 | - Finetuners: API/finetune.md
14 | - Model: API/model.md
15 | plugins:
16 | - mkdocstrings:
17 | handlers:
18 | python:
19 | options:
20 | annotations_path: brief
21 | show_root_heading: false
22 | show_root_toc_entry: false
23 | show_symbol_type_heading: true
24 | theme:
25 | name: material
26 | font:
27 | text: Inter
28 | code: Jetbrains Mono
29 | logo: images/icon.png
30 | palette:
31 | primary: white
32 | features:
33 | - toc.integrate
34 | - navigation.tabs
35 | - navigation.tabs.sticky
36 | - navigation.sections
37 | - navigation.expand
38 | - navigation.path
39 | - navigation.indexes
40 | - toc.follow
41 | - content.code.copy
42 | - content.code.select
43 | - content.code.annotate
44 | markdown_extensions:
45 | - pymdownx.highlight:
46 | use_pygments: true
47 | - pymdownx.superfences
48 | - attr_list
49 | - md_in_html
50 | - admonition
51 |
52 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 |
3 | from setuptools import find_packages, setup
4 |
5 | base_packages = [
6 | "scikit-learn>=1.0.0",
7 | "pandas>=1.0.0",
8 | "diskcache>=5.6.1",
9 | "skops>=0.8.0",
10 | "model2vec"
11 | ]
12 |
13 | sbert_pkgs = ["sentence-transformers>=2.2.2"]
14 | sense2vec_pkgs = ["sense2vec==2.0.0"]
15 | bpemb_packages = ["bpemb>=0.3.3"]
16 | spacy_packages = ["spacy>=3.5.0"]
17 | gensim_packages = ["gensim>=4.3.1", "scipy<1.13.0"]
18 |
19 | text_packages = sense2vec_pkgs + bpemb_packages + gensim_packages
20 |
21 | vision_packages = ["timm>=0.6.7"]
22 |
23 | pytorch_packages = ["torch>=1.12.0"]
24 |
25 | openai_packages = ["openai>=1.59.8"]
26 |
27 | cohere_packages = ["cohere>=4.11.2"]
28 |
29 |
30 | docs_packages = [
31 | "mkdocs-material==9.6.9",
32 | "mkdocstrings==0.29.0",
33 | "mkdocstrings-python==1.16.0",
34 | "mktestdocs==0.2.4",
35 | ]
36 |
37 | test_packages = [
38 | "interrogate>=1.5.0",
39 | "pytest>=4.0.2",
40 | "ruff",
41 | "pre-commit>=2.2.0",
42 | "mktestdocs==0.2.4",
43 | "datasets==2.8.0",
44 | "matplotlib==3.4.3",
45 | "pytest-xdist",
46 | ]
47 |
48 | all_packages = base_packages + text_packages + vision_packages + openai_packages
49 | dev_packages = all_packages + docs_packages + test_packages
50 |
51 |
52 | setup(
53 | name="embetter",
54 | version="0.7.0",
55 | author="Vincent D. Warmerdam",
56 | packages=find_packages(exclude=["notebooks", "docs", "datasets"]),
57 | description="Just a bunch of useful embeddings to get started quickly.",
58 | long_description=pathlib.Path("README.md").read_text(),
59 | long_description_content_type="text/markdown",
60 | license_files=("LICENSE"),
61 | url="https://koaning.github.io/embetter/",
62 | project_urls={
63 | "Documentation": "https://koaning.github.io/embetter/",
64 | "Source Code": "https://github.com/koaning/embetter/",
65 | "Issue Tracker": "https://github.com/koaning/embetter/issues",
66 | },
67 | install_requires=base_packages,
68 | extras_require={
69 | "gensim": gensim_packages + base_packages,
70 | "sense2vec": sense2vec_pkgs + base_packages,
71 | "sbert": sbert_pkgs + base_packages,
72 | "spacy": spacy_packages + base_packages,
73 | "bpemb": bpemb_packages + base_packages,
74 | "text": text_packages + base_packages,
75 | "vision": vision_packages + base_packages,
76 | "pytorch": pytorch_packages + base_packages,
77 | "openai": openai_packages + base_packages,
78 | "cohere": cohere_packages + base_packages,
79 | "all": all_packages,
80 | "docs": docs_packages,
81 | "dev": dev_packages,
82 | },
83 | classifiers=[
84 | "Intended Audience :: Science/Research",
85 | "Programming Language :: Python :: 3",
86 | "License :: OSI Approved :: MIT License",
87 | "Topic :: Scientific/Engineering",
88 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
89 | ],
90 | )
91 |
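As a hedged aside on the extras declared above: each key of `extras_require` becomes an installable extra, so `pip install "embetter[text]"` pulls in the sense2vec, bpemb and gensim packages on top of the base dependencies, while `pip install "embetter[all]"` resolves to `all_packages`. A minimal sketch, assuming embetter is already installed, that lists those extras from the installed package metadata:

    from importlib.metadata import metadata

    # Each key of extras_require shows up as a "Provides-Extra" field.
    provides = metadata("embetter").get_all("Provides-Extra")
    print(provides)  # e.g. ['gensim', 'sense2vec', 'sbert', ..., 'dev']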
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data/en.wiki.bpe.vs1000.d25.w2v.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/tests/data/en.wiki.bpe.vs1000.d25.w2v.bin
--------------------------------------------------------------------------------
/tests/data/en.wiki.bpe.vs1000.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/tests/data/en.wiki.bpe.vs1000.model
--------------------------------------------------------------------------------
/tests/data/thiscatdoesnotexist.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/embetter/c27728bc3002263714bf3c9d1e688cc7bdce6f9c/tests/data/thiscatdoesnotexist.jpeg
--------------------------------------------------------------------------------
/tests/test_base.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from embetter.grab import ColumnGrabber
3 |
4 |
5 | def test_grab_column():
6 | """Ensure that we can grab a text column."""
7 | data = [{"text": "hi", "foo": 1}, {"text": "yes", "foo": 2}]
8 | dataframe = pd.DataFrame(data)
9 | out = ColumnGrabber("text").fit_transform(dataframe)
10 | assert out == ["hi", "yes"]
11 |
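The test above exercises ColumnGrabber in isolation; in practice the component is meant as the first step of a scikit-learn pipeline. A minimal sketch, assuming the sbert extra is installed (SentenceEncoder downloads a model on first use):

    import pandas as pd
    from sklearn.pipeline import make_pipeline
    from embetter.grab import ColumnGrabber
    from embetter.text import SentenceEncoder

    df = pd.DataFrame([{"text": "hi", "foo": 1}, {"text": "yes", "foo": 2}])
    # ColumnGrabber pulls the text column out as a list of strings,
    # SentenceEncoder then turns each string into a dense vector.
    pipe = make_pipeline(ColumnGrabber("text"), SentenceEncoder())
    X = pipe.fit_transform(df)  # shape: (2, embedding_dim)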
--------------------------------------------------------------------------------
/tests/test_docs.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from mktestdocs import check_md_file, check_docstring
3 | from embetter.vision import ColorHistogramEncoder, TimmEncoder, ImageLoader
4 | from embetter.text import SentenceEncoder, BytePairEncoder
5 | from embetter.grab import ColumnGrabber
6 | from embetter.model import DifferenceClassifier
7 |
8 |
9 | def test_readme():
10 | """Readme needs to be accurate"""
11 | check_md_file(fpath="README.md")
12 |
13 |
14 | # def test_finetune_docs():
15 | # """Docs need to be accurate"""
16 | # check_md_file(fpath="docs/finetuners.md", memory=True)
17 |
18 |
19 | # We don't test spaCy or sense2vec here because their docs would require
20 | # downloading `en_core_web_md` on every CI run, which is too heavy.
21 | objects = [
22 | ColumnGrabber,
23 | SentenceEncoder,
24 | ColorHistogramEncoder,
25 | TimmEncoder,
26 | ImageLoader,
27 | BytePairEncoder,
28 | DifferenceClassifier,
29 | ]
30 |
31 |
32 | @pytest.mark.parametrize("func", objects, ids=lambda d: d.__name__)
33 | def test_docstring(func):
34 | """Check the docstrings of the components"""
35 | check_docstring(obj=func)
36 |
--------------------------------------------------------------------------------
/tests/test_text.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 |
3 | import numpy as np
4 | import pytest
5 | from gensim.models import Word2Vec
6 | from gensim.utils import tokenize
7 | from spacy.language import Language
8 | from spacy.vocab import Vocab
9 |
10 | from embetter.text import (
11 | SentenceEncoder,
12 | GensimEncoder,
13 | spaCyEncoder,
14 | MatryoshkaEncoder,
15 | TextEncoder,
16 | )
17 | from embetter.utils import cached
18 |
19 | test_sentences = [
20 | "This is a test sentence!",
21 | "And this is another one",
22 | "\rUnicode stuff: ♣️,♦️,❤️,♠️\n",
23 | ]
24 |
25 |
26 | @pytest.mark.parametrize("setting", ["max", "mean", "both"])
27 | def test_word2vec(setting):
28 | """Check if one can train and use a very simple word embedding model."""
29 | vector_size = 25
30 | sentences = [list(tokenize(sent)) for sent in test_sentences]
31 | model = Word2Vec(
32 | sentences=sentences, vector_size=vector_size, window=3, min_count=1
33 | )
34 | encoder = GensimEncoder(model, agg=setting)
35 | output = encoder.fit_transform(test_sentences)
36 | assert isinstance(output, np.ndarray)
37 | out_dim = vector_size if setting != "both" else vector_size * 2
38 | assert output.shape == (len(test_sentences), out_dim)
39 | # This tests whether it can load the model from disk
40 | with tempfile.NamedTemporaryFile() as fp:
41 | model.save(fp)
42 | encoder = GensimEncoder(fp.name, agg=setting)
43 | encoder.transform(test_sentences)
44 | assert repr(encoder)
45 |
46 |
47 | @pytest.mark.parametrize("encoder", [MatryoshkaEncoder, SentenceEncoder])
48 | def test_basic_sentence_encoder(encoder):
49 | """Check correct dimensions and repr for SentenceEncoder."""
50 | enc = encoder()
51 | # Embedding dim of underlying model
52 | output_dim = enc.tfm._modules["1"].word_embedding_dimension
53 | output = enc.fit_transform(test_sentences)
54 | assert isinstance(output, np.ndarray)
55 | assert output.shape == (len(test_sentences), output_dim)
56 | # scikit-learn builds the repr dynamically from the defined attributes.
57 | # To verify the implementation we check that calling repr does not raise.
58 | assert repr(enc)
59 |
60 |
61 | def test_basic_text_encoder():
62 | """Check correct dimensions and repr for TextEncoder."""
63 | enc = TextEncoder()
64 | output = enc.fit_transform(test_sentences)
65 | assert isinstance(output, np.ndarray)
66 | assert repr(enc)
67 |
68 |
69 | @pytest.fixture()
70 | def nlp():
71 | """Just a fixture with a lightweight spaCy lang"""
72 | vector_data = {
73 | "red": np.array([1.0, 0.0]),
74 | "green": np.array([0.5, 0.5]),
75 | "blue": np.array([0.0, 1.0]),
76 | "purple": np.array([0.0, 1.0]),
77 | }
78 |
79 | vocab = Vocab(strings=list(vector_data.keys()))
80 | for word, vector in vector_data.items():
81 | vocab.set_vector(word, vector)
82 | return Language(vocab=vocab)
83 |
84 |
85 | @pytest.mark.parametrize("setting", ["max", "mean", "both"])
86 | def test_basic_spacy(setting, nlp):
87 | """Check correct dimensions and repr for spaCyEncoder."""
88 | encoder = spaCyEncoder(nlp, agg=setting)
89 | # Embedding dim of underlying model
90 | output = encoder.fit_transform(test_sentences)
91 | assert isinstance(output, np.ndarray)
92 | assert output.shape == (len(test_sentences), 4 if setting == "both" else 2)
93 | # scikit-learn builds the repr dynamically from the defined attributes.
94 | # To verify the implementation we check that calling repr does not raise.
95 | assert repr(encoder)
96 |
97 |
98 | def test_basic_spacy_cached(nlp, tmpdir):
99 | """Just an e2e test for the cache."""
100 | encoder = spaCyEncoder(nlp)
101 | output_before = encoder.transform(test_sentences)
102 |
103 | # Now we cache it
104 | encoder = cached(tmpdir, encoder)
105 | output_during = encoder.transform(test_sentences)
106 |
107 | encoder = cached(tmpdir, encoder)
108 | output_after = encoder.transform(test_sentences)
109 | assert (output_before == output_during).all()
110 | assert (output_during == output_after).all()
111 |
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from embetter.utils import calc_distances
4 | from embetter.text import SentenceEncoder
5 |
6 |
7 | def test_calc_distances():
8 | """Make sure that the aggregation works as expected"""
9 | text_in = ["hi there", "no", "what is this then"]
10 |
11 | dists1 = calc_distances(
12 | text_in, ["greetings", "something else"], SentenceEncoder(), aggregate=np.min
13 | )
14 | dists2 = calc_distances(
15 | text_in,
16 | ["greetings", "something unrelated"],
17 | SentenceEncoder(),
18 | aggregate=np.min,
19 | )
20 | assert np.isclose(dists1.min(), dists2.min())
21 |
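A hedged note on what calc_distances computes here: for every input text it measures the distance to each candidate text and aggregates the results, so with aggregate=np.min a low value means the text sits close to at least one candidate. A small sketch of that interpretation, under those same assumptions:

    import numpy as np
    from embetter.text import SentenceEncoder
    from embetter.utils import calc_distances

    texts = ["hi there", "no", "what is this then"]
    dists = calc_distances(texts, ["greetings"], SentenceEncoder(), aggregate=np.min)
    # Assuming a flat array with one aggregated distance per input text,
    # this sorts the inputs from semantically closest to "greetings" to furthest.
    print([texts[i] for i in dists.argsort()])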
--------------------------------------------------------------------------------
/tests/test_vision.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from embetter.vision import ImageLoader, ColorHistogramEncoder, TimmEncoder
3 |
4 |
5 | @pytest.mark.parametrize("n_buckets", [5, 10, 25, 128])
6 | def test_color_hist_resize(n_buckets):
7 | """Make sure we can resize and it fits"""
8 | X = ImageLoader().fit_transform(["tests/data/thiscatdoesnotexist.jpeg"])
9 | shape_out = ColorHistogramEncoder(n_buckets=n_buckets).fit_transform(X).shape
10 | shape_exp = (1, n_buckets * 3)
11 | assert shape_exp == shape_out
12 |
13 |
14 | @pytest.mark.parametrize("encode_predictions,size", [(True, 1000), (False, 1280)])
15 | def test_basic_timm(encode_predictions, size):
16 | """Super basic check for torch image model."""
17 | model = TimmEncoder("mobilenetv2_120d", encode_predictions=encode_predictions)
18 | X = ImageLoader().fit_transform(["tests/data/thiscatdoesnotexist.jpeg"])
19 | out = model.fit_transform(X)
20 | assert out.shape == (1, size)
21 |
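Tying the vision components above together: ImageLoader turns file paths into PIL images and ColorHistogramEncoder counts pixel intensities into n_buckets bins per RGB channel, which is why the first test expects n_buckets * 3 output columns. A minimal pipeline sketch under those same assumptions:

    from sklearn.pipeline import make_pipeline
    from embetter.vision import ImageLoader, ColorHistogramEncoder

    pipe = make_pipeline(ImageLoader(), ColorHistogramEncoder(n_buckets=25))
    X = pipe.fit_transform(["tests/data/thiscatdoesnotexist.jpeg"])
    assert X.shape == (1, 75)  # 25 buckets for each of the 3 color channels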
--------------------------------------------------------------------------------