├── tests ├── docs │ ├── __init__.py │ ├── test_configurations.py │ └── test_tutorials.py ├── text │ ├── __init__.py │ ├── modules │ │ ├── __init__.py │ │ ├── heads │ │ │ ├── __init__.py │ │ │ ├── classification │ │ │ │ ├── __init__.py │ │ │ │ ├── test_relation_classifier.py │ │ │ │ ├── test_document_classification.py │ │ │ │ └── test_text_classification.py │ │ │ ├── test_language_modelling.py │ │ │ └── test_task_head.py │ │ └── configuration │ │ │ └── test_component_configuration.py │ ├── test_text_cleaning.py │ ├── test_commons.py │ ├── test_cli.py │ ├── test_features_configuration.py │ ├── test_metrics.py │ ├── test_pipeline_save.py │ ├── test_pipeline_copy.py │ ├── test_pipeline_to_mlflow.py │ ├── test_pipeline_with_optional_inputs.py │ ├── test_pipeline_tokenizer.py │ ├── test_pipeline_datasets.py │ ├── test_pipeline_with_custom_head.py │ ├── test_model_predict.py │ ├── test_pipeline_predict.py │ ├── test_tokenizer.py │ ├── test_hpo.py │ ├── test_pretrained_word_vectors.py │ ├── test_pipeline_vocab.py │ ├── test_features_transformers.py │ └── test_trainer.py ├── __init__.py ├── resources │ └── data │ │ ├── test.xlsx │ │ ├── test.parquet │ │ ├── nested-list.jsonl │ │ ├── to-be-flattened.jsonl │ │ ├── dataset_source.csv │ │ ├── dataset_sequence.jsonl │ │ ├── dataset_sequence.json │ │ ├── emotions_with_transformers.txt │ │ └── dataset_source.jsonl ├── conftest.py └── text_classification_integration_test.py ├── src └── biome │ ├── text │ ├── modules │ │ ├── __init__.py │ │ ├── heads │ │ │ ├── classification │ │ │ │ ├── __init__.py │ │ │ │ └── record_classification.py │ │ │ └── __init__.py │ │ ├── encoders │ │ │ ├── __init__.py │ │ │ └── time_distributed_encoder.py │ │ └── configuration │ │ │ ├── __init__.py │ │ │ ├── allennlp_configuration.py │ │ │ └── defs.py │ ├── cli │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── evaluate.py │ │ ├── train.py │ │ └── serve.py │ ├── commons.py │ ├── mlflow_model.py │ ├── __init__.py │ ├── errors.py │ ├── metrics.py │ ├── backbone.py │ ├── text_cleaning.py │ └── vocabulary.py │ └── __init__.py ├── docs ├── docs │ ├── .vuepress │ │ ├── theme │ │ │ ├── index.js │ │ │ ├── enhanceApp.js │ │ │ ├── components │ │ │ │ ├── search.svg │ │ │ │ ├── search-orange.svg │ │ │ │ ├── Sidebar.vue │ │ │ │ ├── NavLink.vue │ │ │ │ ├── Versions.vue │ │ │ │ ├── Navbar.vue │ │ │ │ └── PageNav.vue │ │ │ ├── styles │ │ │ │ ├── fonts.styl │ │ │ │ ├── palette.styl │ │ │ │ ├── code-colors.styl │ │ │ │ └── index.styl │ │ │ └── layouts │ │ │ │ └── Layout.vue │ │ ├── public │ │ │ ├── favicon.ico │ │ │ └── assets │ │ │ │ ├── img │ │ │ │ ├── allennlp.png │ │ │ │ ├── hugging.png │ │ │ │ ├── recognai.png │ │ │ │ ├── bg.svg │ │ │ │ ├── biome-isotype.svg │ │ │ │ └── pytorch.svg │ │ │ │ └── fonts │ │ │ │ ├── justmeagaindownhere.woff │ │ │ │ ├── BasisGrotesquePro-Bold.woff │ │ │ │ ├── BasisGrotesquePro-Light.woff │ │ │ │ └── BasisGrotesquePro-Regular.woff │ │ └── config.js │ ├── documentation │ │ ├── tutorials │ │ │ └── img │ │ │ │ ├── analysis_df.png │ │ │ │ ├── hpo_tensorboard.png │ │ │ │ └── text_classifier_explore_screenshot.png │ │ ├── community │ │ │ ├── 2-get_help.md │ │ │ ├── 1-contributing.md │ │ │ └── 3-developer_guides.md │ │ ├── readme.md │ │ └── user-guides │ │ │ └── 1-nlp-tasks.md │ ├── icons │ │ ├── chev-left.svg │ │ ├── chev-right.svg │ │ └── blank.svg │ ├── api │ │ └── README.md │ └── README.md ├── biome_text_logo_for_readme.png ├── prepare_versioned_build.sh ├── package.json └── .templates │ └── config.mako ├── AUTHORS.rst ├── .github ├── ISSUE_TEMPLATE │ ├── question.md │ ├── 
feature_request.md │ └── bug_report.md └── workflows │ └── ci.yml ├── environment_dev.yml ├── .pre-commit-config.yaml ├── MANIFEST.in ├── CHANGELOG.rst ├── setup.cfg ├── Makefile ├── setup.py ├── .gitignore └── README.md /tests/docs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/text/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/biome/text/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/text/modules/heads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/text/modules/heads/classification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/biome/text/modules/heads/classification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig(level=logging.INFO) 4 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/index.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | extend: '@vuepress/theme-default', 3 | } 4 | -------------------------------------------------------------------------------- /src/biome/text/cli/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributors 3 | ============ 4 | 5 | * Francisco Aranda 6 | -------------------------------------------------------------------------------- /tests/resources/data/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/tests/resources/data/test.xlsx -------------------------------------------------------------------------------- /tests/resources/data/test.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/tests/resources/data/test.parquet -------------------------------------------------------------------------------- /docs/biome_text_logo_for_readme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/biome_text_logo_for_readme.png 
-------------------------------------------------------------------------------- /docs/docs/.vuepress/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/favicon.ico -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/allennlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/img/allennlp.png -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/hugging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/img/hugging.png -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/recognai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/img/recognai.png -------------------------------------------------------------------------------- /docs/docs/documentation/tutorials/img/analysis_df.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/documentation/tutorials/img/analysis_df.png -------------------------------------------------------------------------------- /docs/docs/icons/chev-left.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/docs/icons/chev-right.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/docs/documentation/tutorials/img/hpo_tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/documentation/tutorials/img/hpo_tensorboard.png -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/fonts/justmeagaindownhere.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/fonts/justmeagaindownhere.woff -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/enhanceApp.js: -------------------------------------------------------------------------------- 1 | import VClickOutside from 'v-click-outside' 2 | 3 | export default ({ Vue, options, router, siteData }) => { 4 | Vue.use(VClickOutside) 5 | } 6 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Bold.woff -------------------------------------------------------------------------------- 
/docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Light.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Light.woff -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Regular.woff -------------------------------------------------------------------------------- /docs/docs/documentation/community/2-get_help.md: -------------------------------------------------------------------------------- 1 | # Getting help 2 | 3 | The best way to get help is by creating an issue on [Github](https://github.com/recognai/biome-text/issues/new/choose) 4 | -------------------------------------------------------------------------------- /src/biome/__init__.py: -------------------------------------------------------------------------------- 1 | # https://packaging.python.org/guides/packaging-namespace-packages/#pkgutil-style-namespace-packages 2 | __path__ = __import__("pkgutil").extend_path(__path__, __name__) 3 | -------------------------------------------------------------------------------- /docs/docs/documentation/tutorials/img/text_classifier_explore_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/documentation/tutorials/img/text_classifier_explore_screenshot.png -------------------------------------------------------------------------------- /src/biome/text/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | from ..configuration import Seq2SeqEncoderConfiguration 2 | from .time_distributed_encoder import TimeDistributedEncoder 3 | 4 | Encoder = Seq2SeqEncoderConfiguration 5 | -------------------------------------------------------------------------------- /tests/text/test_text_cleaning.py: -------------------------------------------------------------------------------- 1 | from biome.text import text_cleaning 2 | 3 | 4 | def test_make_rule_callable(): 5 | clean_text = text_cleaning.strip_spaces(" This is a text \n\n") 6 | assert clean_text == "This is a text" 7 | -------------------------------------------------------------------------------- /docs/docs/api/README.md: -------------------------------------------------------------------------------- 1 | # biome.text API reference 2 | Here you can find the API reference of the `biome.text` library. 3 | 4 | Use the left-side bar to navigate through the library API or the search bar to find specific modules, classes and methods. 
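The API pages referenced above are generated from the package docstrings with pdoc3 (see the `build:api` script in `docs/package.json`). For a quick look at the same docstrings from a Python session instead of the rendered site, the standard `help()` machinery is enough — a minimal sketch, assuming `biome.text` is installed:

```python
from biome.text import Pipeline

# Prints the same docstrings that pdoc3 renders into these API pages.
help(Pipeline.from_config)
help(Pipeline.predict)
```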
5 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/search.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/search-orange.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/resources/data/nested-list.jsonl: -------------------------------------------------------------------------------- 1 | { "classification": [ { "origin": [ { "source": "WL", "key": "1038.4450287.WL" } ] }, { "origin": [ { "source": "SAP-BP", "key": "DZ_FFM.0022194281.SAP-BP" }, { "source": "DGHYP", "key": "531.9009058308.DGHYP" } ] } ]} 2 | -------------------------------------------------------------------------------- /docs/docs/icons/blank.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a question 4 | title: "[QUESTION]" 5 | labels: help wanted, question 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Check 11 | 12 | * [ ] I have tried to find a similar issue and have not found anything which solves my question. 13 | 14 | ## Description 15 | 16 | Describe what you'd like to do, problems you have encountered, unclear documentation sections, etc. 17 | -------------------------------------------------------------------------------- /tests/text/test_commons.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from biome.text.commons import ImmutableDict 4 | 5 | 6 | class TestImmutableDict: 7 | def test_cannot_mutate(self): 8 | dict = ImmutableDict(a=1, b="2", c=1000.00) 9 | 10 | with pytest.raises(TypeError): 11 | dict.f = "F" 12 | 13 | with pytest.raises(TypeError): 14 | dict.a = 100 15 | 16 | # TODO: Test a serialization/deserialization 17 | -------------------------------------------------------------------------------- /src/biome/text/modules/configuration/__init__.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | from .allennlp_configuration import BiMpmMatchingConfiguration 3 | from .allennlp_configuration import EmbeddingConfiguration 4 | from .allennlp_configuration import FeedForwardConfiguration 5 | from .allennlp_configuration import Seq2SeqEncoderConfiguration 6 | from .allennlp_configuration import Seq2VecEncoderConfiguration 7 | from .defs import ComponentConfiguration 8 | 9 | # fmt: on 10 | -------------------------------------------------------------------------------- /environment_dev.yml: -------------------------------------------------------------------------------- 1 | name: biometext 2 | 3 | channels: 4 | - conda-forge 5 | 6 | dependencies: 7 | - python~=3.7.0 8 | - pip>=20.3.0 9 | # for building the docs 10 | - nodejs==14.15.1 11 | - pip: 12 | # testing 13 | - pytest>=6.2.0 14 | - pytest-cov>=2.10.0 15 | - pytest-pylint>=0.14.0 16 | - pytest-notebook~=0.6.0 17 | - wandb>=0.10.12 18 | - xlrd~=1.2.0 19 | # documentation 20 | - pdoc3~=0.8.1 21 | # development 22 | - pre-commit~=2.9.0 23 | 
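As a quick sanity check that the development environment defined above resolved correctly, the sketch below prints the versions of the main development tools. It relies on `pkg_resources`, the same lookup the package itself uses in `src/biome/text/__init__.py`; the tool names mirror the pins above and may drift between releases:

```python
import pkg_resources

# Development tools pinned in environment_dev.yml
for dist_name in ("pytest", "pytest-cov", "pdoc3", "pre-commit"):
    try:
        print(dist_name, pkg_resources.get_distribution(dist_name).version)
    except pkg_resources.DistributionNotFound:
        print(f"{dist_name} is missing from the environment")
```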
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
3 |     rev: v2.3.0
4 |     hooks:
5 |       - id: check-yaml
6 |       - id: end-of-file-fixer
7 |       - id: trailing-whitespace
8 |   - repo: https://github.com/psf/black
9 |     rev: 20.8b1
10 |     hooks:
11 |       - id: black
12 |   - repo: https://github.com/pycqa/isort
13 |     rev: 5.6.4
14 |     hooks:
15 |       - id: isort
16 |
--------------------------------------------------------------------------------
/tests/text/test_cli.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from biome.text import Pipeline
4 | from biome.text.cli.serve import _serve
5 |
6 |
7 | @pytest.mark.skip("Please execute this test manually and check your localhost:9999")
8 | def test_serve():
9 |     """This test still needs to be automated!"""
10 |     pipeline = Pipeline.from_config(
11 |         {
12 |             "name": "serve_test",
13 |             "head": {"type": "TextClassification", "labels": ["a", "b"]},
14 |         }
15 |     )
16 |
17 |     _serve(pipeline)
18 |
--------------------------------------------------------------------------------
/src/biome/text/commons.py:
--------------------------------------------------------------------------------
1 | class ImmutableDict(dict):
2 |     """Immutable version of Python's dict type"""
3 |
4 |     def __hash__(self):
5 |         return id(self)
6 |
7 |     def _immutable(self, *args, **kws):
8 |         raise TypeError("object is immutable")
9 |
10 |     __setitem__ = _immutable
11 |     __delitem__ = _immutable
12 |     __setattr__ = _immutable
13 |
14 |     clear = _immutable
15 |     update = _immutable
16 |     setdefault = _immutable
17 |     pop = _immutable
18 |     popitem = _immutable
19 |
--------------------------------------------------------------------------------
/tests/text/test_features_configuration.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from allennlp.common import Params
3 | from allennlp.common.checks import ConfigurationError
4 |
5 | from biome.text.configuration import FeaturesConfiguration
6 |
7 |
8 | def test_non_configurable_features():
9 |     wrong_config = dict(ner=dict(embedding=15))
10 |     with pytest.raises(TypeError):
11 |         FeaturesConfiguration(**wrong_config)
12 |
13 |     with pytest.raises(ConfigurationError):
14 |         FeaturesConfiguration.from_params(Params(wrong_config))
15 |
--------------------------------------------------------------------------------
/tests/resources/data/to-be-flattened.jsonl:
--------------------------------------------------------------------------------
1 | { "a": "ajj lsd", "complexData": { "a": "a", "b":"b"}, "persons": [{"name": "Frank", "lastName": "Rubber"}, {"name": "Thomas", "lastName": "Sabo"}]}
2 | { "a": "ajj lsd", "complexData": { "a": "a", "b":"b"}, "persons": [{"name": "Anthony", "lastName": "Rubber"}]}
3 | { "a": "ajj lsd", "complexData": { "a": "a", "b":"b"}, "persons": [{"name": "Peter", "lastName": "Gabriel"}, {"name": "Thomas", "lastName": "Sabo"}]}
4 | { "a": "ajj lsd", "complexData": { "a": "a", "b":"b"}, "persons": [{"name": "Lucien", "lastName": "Pasteque"}, {"name": "Thomas", "lastName": "Bowler"}]}
5 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # https://docs.python.org/3.7/distutils/sourcedist.html#specifying-the-files-to-distribute
2 | include AUTHORS.rst
3 | include CHANGELOG.rst
4 | include docker-compose.yml
5 | include LICENSE.txt
6 | include Makefile
7 | include README.md
8 | include setup.cfg
9 |
10 | recursive-include docker *
11 | recursive-include docs *
12 | recursive-include src/biome *
13 | recursive-include tests *
14 |
15 | prune build
16 | prune docs/_build
17 | prune docs/_static
18 | prune docs/node_modules
19 | prune docs/api
20 | prune dist
21 | prune tests/**/htmlcov
22 | prune tests/mlruns
23 | prune tests/runs
24 | prune tests/output
25 |
26 | global-exclude *.pyc *.o
27 |
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 |
5 | Version 2.0
6 | ===========
7 |
8 | - Replaced `DataSource` with `Dataset`
9 | - Vocabulary creation now happens automatically when executing `Pipeline.train()`
10 | - Introduced the `TuneExperiment` class
11 | - Added the *transformers* feature
12 | - Moved the `Pipeline.explore()` command to its own module
13 | - `Pipeline.train()` modifies the pipeline in place instead of creating a copy for training
14 | - `TokenClassification` accepts entities
15 | - Added a `RelationClassification` head
16 | - A lot of minor and not-so-minor changes ...
17 |
18 | Version 1.0
19 | ===========
20 |
21 | - Introduced the *pipeline, backbone, head* concept
22 |
--------------------------------------------------------------------------------
/src/biome/text/cli/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import click
5 | from click import Group
6 |
7 | from .evaluate import evaluate
8 | from .serve import serve
9 | from .train import train
10 |
11 | SUPPORTED_COMMANDS = [train, evaluate, serve]
12 |
13 |
14 | def main():
15 |     _add_project_modules_to_sys_path()
16 |
17 |     commands = Group(no_args_is_help=True)
18 |     for command in SUPPORTED_COMMANDS:
19 |         commands.add_command(command, command.name)
20 |     click.CommandCollection(sources=[commands])()
21 |
22 |
23 | def _add_project_modules_to_sys_path():
24 |     """Adds the current working directory to ``sys.path`` so user-defined modules in the project location can be loaded"""
25 |     sys.path.append(os.getcwd())
26 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
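To make the Version 2.0 entries in the changelog above concrete (`Dataset` replacing `DataSource`, vocabulary creation handled automatically during training), here is a minimal end-to-end sketch. It mirrors the fixtures used in `tests/text/test_pipeline_copy.py`; the toy data, labels and output path are purely illustrative:

```python
from biome.text import Dataset
from biome.text import Pipeline
from biome.text import Trainer
from biome.text import TrainerConfiguration

# Toy data and pipeline, matching the shapes used in the test suite
dataset = Dataset.from_dict({"text": ["this is", "a test"], "label": ["a", "b"]})
pipeline = Pipeline.from_config(
    {
        "name": "changelog_quickstart",
        "head": {"type": "TextClassification", "labels": ["a", "b"]},
    }
)

trainer = Trainer(
    pipeline=pipeline,
    train_dataset=dataset,
    trainer_config=TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0),
)
# Vocabulary creation happens automatically before the training starts
trainer.fit("quickstart_output")
```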
21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ## To Reproduce 14 | Steps to reproduce the behavior: 15 | 1. Code snippet or gist. 16 | 2. Error message(s) if applicable 17 | 18 | ## Expected behavior 19 | A clear and concise description of what you expected to happen. 20 | 21 | ## Screenshots 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | ## OS environment 25 | - OS: [e.g. Linux / Windows / macOS] 26 | - biome.text Version [e.g. 1.0.0] 27 | 28 | ## Additional context 29 | Add any other context about the problem here. 30 | -------------------------------------------------------------------------------- /tests/text/test_metrics.py: -------------------------------------------------------------------------------- 1 | from allennlp.data import Vocabulary 2 | 3 | from biome.text.metrics import Metrics 4 | 5 | 6 | def test_metrics(): 7 | metrics = Metrics( 8 | accuracy={"type": "categorical_accuracy"}, 9 | f1={ 10 | "type": "span_f1", 11 | "vocabulary": Vocabulary.empty(), 12 | }, 13 | ) 14 | 15 | # Check that training and validation metrics are different instances 16 | assert ( 17 | metrics.get_dict()["accuracy"] 18 | is not metrics.get_dict(is_train=False)["accuracy"] 19 | ) 20 | # Check if we share the same vocab 21 | assert ( 22 | metrics.get_dict()["f1"]._label_vocabulary 23 | is metrics.get_dict(is_train=False)["f1"]._label_vocabulary 24 | ) 25 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/styles/fonts.styl: -------------------------------------------------------------------------------- 1 | @font-face 2 | font-family: 'Basis Grotesque Pro' 3 | font-style: normal 4 | font-weight: normal 5 | src: local('Basis Grotesque Pro'), url('/biome-text/master/assets/fonts/BasisGrotesquePro-Regular.woff') format('woff') 6 | 7 | 8 | @font-face 9 | font-family: 'Basis Grotesque Pro Bold' 10 | font-style: normal 11 | font-weight: normal 12 | src: local('Basis Grotesque Pro Bold'), url('/biome-text/master/assets/fonts/BasisGrotesquePro-Bold.woff') format('woff') 13 | 14 | 15 | @font-face 16 | font-family: 'Basis Grotesque Pro Light' 17 | font-style: normal 18 | font-weight: normal 19 | src: local('Basis Grotesque Pro Light'), url('/biome-text/master/assets/fonts/BasisGrotesquePro-Light.woff') format('woff') 20 | -------------------------------------------------------------------------------- /docs/docs/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | home: true 3 | navImage: /assets/img/biome.svg 4 | heroImage: /assets/img/biome-isotype.svg 5 | heroText: biome. 6 | heroSubText: text 7 | tagline: Practical NLP open source library 8 | actionText: Get Started 9 | actionLink: /documentation/ 10 | features: 11 | - title: Easy to use 12 | details: Create natural language processing custom models with powerful building blocks and simple workflows. 13 | - title: Powerful 14 | details: Benefit from the latest research and models in NLP powered by PyTorch, AllenNLP and Huggingface. 
15 | img1: /assets/img/pytorch.svg 16 | img2: /assets/img/allennlp.png 17 | img3: /assets/img/hugging.png 18 | - title: Industry-ready 19 | details: Easily package and serve your models in production. 20 | footer: Maintained by 21 | --- 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | # Options for py.test: 3 | # Specify command line options as you would do when invoking py.test directly. 4 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 5 | # in order to write a coverage file that can be read by Jenkins. 6 | testpaths = tests 7 | addopts = --color yes --cov biome --cov-report html --verbose 8 | 9 | [flake8] 10 | # Some sane defaults for the code style checker flake8 11 | exclude = 12 | .tox 13 | build 14 | dist 15 | .eggs 16 | docs/conf.py 17 | 18 | [pylint] 19 | max-line-length = 120 20 | disable = C0330,C0111,C0303,C0415,R0801 21 | skip = docs/*.py 22 | output-format = colorized 23 | generated-members = numpy.*,torch.* 24 | score = y 25 | reports = n 26 | 27 | [isort] 28 | profile = black 29 | force_single_line = True 30 | -------------------------------------------------------------------------------- /tests/text/modules/configuration/test_component_configuration.py: -------------------------------------------------------------------------------- 1 | from biome.text import helpers 2 | from biome.text.modules.heads.classification.text_classification import ( 3 | TextClassification, 4 | ) 5 | from biome.text.modules.heads.classification.text_classification import ( 6 | TextClassificationConfiguration, 7 | ) 8 | 9 | 10 | def test_component_spec_config_with_type(): 11 | head = TextClassificationConfiguration( 12 | pooler="boe", 13 | labels=[ 14 | "toxic", 15 | "severe_toxic", 16 | "obscene", 17 | "threat", 18 | "insult", 19 | "identity_hate", 20 | ], 21 | multilabel=True, 22 | ) 23 | 24 | assert "type" in head.config 25 | assert head.config["type"] == helpers.get_full_class_name(TextClassification) 26 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_save.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from numpy.testing import assert_allclose 3 | 4 | from biome.text import Pipeline 5 | 6 | 7 | @pytest.fixture 8 | def pipeline(): 9 | return Pipeline.from_config( 10 | { 11 | "name": "test_pipeline_copy", 12 | "head": {"type": "TextClassification", "labels": ["a", "b"]}, 13 | } 14 | ) 15 | 16 | 17 | def test_save(pipeline, tmp_path): 18 | pipeline.save(tmp_path) 19 | 20 | assert (tmp_path / "model.tar.gz").is_file() 21 | 22 | expected_prediction = pipeline.predict("test") 23 | prediction = Pipeline.from_pretrained(tmp_path / "model.tar.gz").predict("test") 24 | 25 | assert prediction["labels"] == expected_prediction["labels"] 26 | assert_allclose(prediction["probabilities"], expected_prediction["probabilities"]) 27 | -------------------------------------------------------------------------------- /src/biome/text/mlflow_model.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import pandas as pd 3 | 4 | 5 | class BiomeTextModel(mlflow.pyfunc.PythonModel): 6 | """A custom MLflow model with the 'python_function' flavor for biome.text pipelines. 7 | 8 | This class is used by the `Pipeline.to_mlflow()` method. 
9 | """ 10 | 11 | ARTIFACT_CONTEXT = "model" 12 | 13 | def __init__(self): 14 | self.pipeline = None 15 | 16 | def load_context(self, context): 17 | from biome.text import Pipeline 18 | 19 | self.pipeline = Pipeline.from_pretrained( 20 | context.artifacts[self.ARTIFACT_CONTEXT] 21 | ) 22 | 23 | def predict(self, context, dataframe: pd.DataFrame): 24 | batch = dataframe.to_dict(orient="records") 25 | predictions = self.pipeline.predict(batch=batch) 26 | 27 | return pd.DataFrame(predictions) 28 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/styles/palette.styl: -------------------------------------------------------------------------------- 1 | // font family 2 | @import url('https://fonts.googleapis.com/css2?family=Open+Sans&display=swap') 3 | @require "./fonts.styl" 4 | $primaryFontFamily = 'Basis Grotesque Pro' 5 | $secondaryFontFamily = 'Open Sans' 6 | $handMadeFontFamily = 'Just Me Again Down Here', cursive 7 | 8 | // colors 9 | $accentColor = #F38959 10 | $textColorLight = #686A6D 11 | $textColor = #4A4A4A 12 | $borderColor = #D8D8D8 13 | $codeBgColor = #4A4A4A 14 | $arrowBgColor = #ccc 15 | $badgeTipColor = #9013FE 16 | $badgeWarningColor = darken(#ffe564, 35%) 17 | $badgeErrorColor = #DA5961 18 | $sidebarBgColor = #F5F5F6 19 | $codePillColor = #F0E7FF 20 | $yellow = #F8D11C 21 | $green = #6ACE91 22 | $red = #FF1E5E 23 | 24 | // layout 25 | $navbarHeight = 3.6rem 26 | $sidebarWidth = 20rem 27 | $contentWidth = 740px 28 | $homePageWidth = 960px 29 | 30 | // responsive breakpoints 31 | $MQNarrow = 959px 32 | $MQMobile = 719px 33 | $MQMobileNarrow = 419px 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default dev check test ui build_ui docs build_docs dist 2 | default: help 3 | 4 | dev: ## install package in development mode 5 | @pip install --upgrade -e . 
6 | @pre-commit install 7 | 8 | check: ## applies a code pylint with autopep8 reformating 9 | @pre-commit run --all-files 10 | @pylint --exit-zero --rcfile=setup.cfg --unsafe-load-any-extension=y src 11 | 12 | test: ## launch package tests 13 | @python -m pytest 14 | @python -m pytest --doctest-modules src/biome/text 15 | 16 | docs: ## serve the documentation for development 17 | @cd docs && npm install && npm run dev:site 18 | 19 | build_docs: ## build the documentation 20 | @cd docs && npm install && npm run build:site 21 | 22 | dist: ## build a package distribution 23 | @python setup.py sdist bdist_wheel 24 | 25 | 26 | .PHONY: help 27 | help: 28 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 29 | -------------------------------------------------------------------------------- /src/biome/text/modules/encoders/time_distributed_encoder.py: -------------------------------------------------------------------------------- 1 | from allennlp.modules import Seq2SeqEncoder 2 | from allennlp.modules import TimeDistributed 3 | 4 | 5 | class TimeDistributedEncoder(Seq2SeqEncoder): 6 | """Wraps a Seq2SeqEncoder into a TimeDistributed module and implements the Seq2SeqEncoder API""" 7 | 8 | def __init__(self, encoder: Seq2SeqEncoder): 9 | super(TimeDistributedEncoder, self).__init__() 10 | 11 | self._input_dim = encoder.get_input_dim() 12 | self._output_dim = encoder.get_output_dim() 13 | self._is_bidirectional = ( 14 | hasattr(encoder, "is_bidirectional") and encoder.is_bidirectional() 15 | ) 16 | 17 | self._encoder = TimeDistributed(encoder) 18 | 19 | def forward(self, *input, **inputs): 20 | return self._encoder(*input, **inputs) 21 | 22 | def is_bidirectional(self) -> bool: 23 | return self._is_bidirectional 24 | 25 | def get_output_dim(self) -> int: 26 | return self._output_dim 27 | 28 | def get_input_dim(self): 29 | return self._input_dim 30 | -------------------------------------------------------------------------------- /docs/docs/documentation/readme.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | For the installation we recommend setting up a fresh [conda](https://docs.conda.io/en/latest/miniconda.html) environment: 4 | 5 | ```shell script 6 | conda create -n biome python~=3.7.0 pip>=20.3.0 7 | conda activate biome 8 | ``` 9 | 10 | Once the conda environment is activated, you can install the latest release or the development version via pip. 11 | 12 | ## Latest release (recommended) 13 | 14 | To install the latest release of *biome.text* type in: 15 | 16 | ````shell script 17 | pip install -U biome-text 18 | ```` 19 | 20 | After installing *biome.text*, the best way to test your installation is by running the *biome.text* cli command: 21 | 22 | ```shell script 23 | biome --help 24 | ``` 25 | 26 | ## Master branch 27 | 28 | The *master branch* contains the latest features, but is less well tested. 
29 | If you are looking for a specific feature that has not been released yet, you can install the package from our master branch with: 30 | 31 | ````shell script 32 | pip install -U git+https://github.com/recognai/biome-text.git 33 | ```` 34 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/styles/code-colors.styl: -------------------------------------------------------------------------------- 1 | :not(pre)>code[class*=language-], pre[class*=language-], div[class*="language-"], .theme-default-content pre, .theme-default-content pre[class*=language-] 2 | background #F5F5F6 3 | 4 | div[class*="language-"] .highlight-lines .highlighted 5 | background-color rgba(0, 0, 0, 0.08) 6 | 7 | .theme-default-content pre code, .theme-default-content pre[class*="language-"] code 8 | color #000000 !important 9 | 10 | .token.atrule, .token.builtin, .token.important, .token.keyword, .token.selector 11 | color #A9261B !important 12 | 13 | div[class*="language-"]::before 14 | color #999 !important 15 | 16 | .token.punctuation 17 | color #C58D09 !important 18 | 19 | .token.entity, .token.operator, .token.url 20 | color #004898 !important 21 | 22 | .token.boolean, .token.number, .token.function 23 | color #4C10BC !important 24 | 25 | .token.string, .token.char, .token.attr-value, .token.regex, .token.variable 26 | color #429E9E !important 27 | 28 | .token.property, .token.class-name, .token.constant, .token.symbol 29 | color #67BF89 !important 30 | -------------------------------------------------------------------------------- /src/biome/text/modules/configuration/allennlp_configuration.py: -------------------------------------------------------------------------------- 1 | from allennlp.modules import BiMpmMatching 2 | from allennlp.modules import Embedding 3 | from allennlp.modules import FeedForward 4 | from allennlp.modules import Seq2SeqEncoder 5 | from allennlp.modules import Seq2VecEncoder 6 | 7 | from .defs import ComponentConfiguration 8 | 9 | 10 | class Seq2VecEncoderConfiguration(ComponentConfiguration[Seq2VecEncoder]): 11 | """Layer spec for Seq2VecEncoder components""" 12 | 13 | pass 14 | 15 | 16 | class Seq2SeqEncoderConfiguration(ComponentConfiguration[Seq2SeqEncoder]): 17 | """Layer spec for Seq2SeqEncoder components""" 18 | 19 | pass 20 | 21 | 22 | class FeedForwardConfiguration(ComponentConfiguration[FeedForward]): 23 | """Layer spec for FeedForward components""" 24 | 25 | pass 26 | 27 | 28 | class BiMpmMatchingConfiguration(ComponentConfiguration[BiMpmMatching]): 29 | """Layer spec for BiMpmMatching components""" 30 | 31 | pass 32 | 33 | 34 | class EmbeddingConfiguration(ComponentConfiguration[Embedding]): 35 | """Layer spec for Embedding components""" 36 | 37 | pass 38 | -------------------------------------------------------------------------------- /src/biome/text/__init__.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | try: 4 | __version__ = pkg_resources.get_distribution("biome-text").version 5 | except pkg_resources.DistributionNotFound: 6 | # package is not installed 7 | pass 8 | 9 | import logging 10 | 11 | # configure basic 'biome.text' logging 12 | _handler = logging.StreamHandler() 13 | _handler.setFormatter( 14 | logging.Formatter("%(levelname)s:%(name)s: %(message)s") 15 | ) # "%(levelname)s: %(message)s")) 16 | _LOGGER = logging.getLogger(__name__) 17 | _LOGGER.addHandler(_handler) 18 | _LOGGER.setLevel("INFO") 19 | # configure 'allennlp' logging 
20 | _ALLENNLP_LOGGER = logging.getLogger("allennlp") 21 | _ALLENNLP_LOGGER.addHandler(_handler) 22 | _ALLENNLP_LOGGER.setLevel("WARNING") 23 | 24 | # TODO: Remove this hack when allennlp 1.8.0 is out 25 | import transformers 26 | 27 | transformers.__spec__ = "" 28 | 29 | from .configuration import PipelineConfiguration 30 | from .configuration import TrainerConfiguration 31 | from .configuration import VocabularyConfiguration 32 | from .dataset import Dataset 33 | from .pipeline import Pipeline 34 | from .trainer import Trainer 35 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | 7 | def pytest_configure(config): 8 | # It's really hard to do testing with wandb enabled ... 9 | os.environ["WANDB_MODE"] = "disabled" 10 | 11 | 12 | @pytest.fixture 13 | def resources_path() -> Path: 14 | return Path(__file__).parent / "resources" 15 | 16 | 17 | @pytest.fixture 18 | def resources_data_path(resources_path) -> Path: 19 | return resources_path / "data" 20 | 21 | 22 | @pytest.fixture 23 | def tutorials_path() -> Path: 24 | repo_root = Path(__file__).parent.parent 25 | return repo_root / "docs" / "docs" / "documentation" / "tutorials" 26 | 27 | 28 | @pytest.fixture 29 | def configurations_path() -> Path: 30 | repo_root = Path(__file__).parent.parent 31 | return ( 32 | repo_root 33 | / "docs" 34 | / "docs" 35 | / "documentation" 36 | / "user-guides" 37 | / "2-configuration.md" 38 | ) 39 | 40 | 41 | @pytest.fixture 42 | def change_to_tmp_working_dir(tmp_path) -> Path: 43 | cwd = os.getcwd() 44 | os.chdir(tmp_path) 45 | yield tmp_path 46 | os.chdir(cwd) 47 | -------------------------------------------------------------------------------- /tests/resources/data/dataset_source.csv: -------------------------------------------------------------------------------- 1 | age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y 2 | 44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,210,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0 3 | 53,technician,married,unknown,no,no,no,cellular,nov,fri,138,1,999,0,nonexistent,-0.1,93.2,-42,4.021,5195.8,0 4 | 28,management,single,university.degree,no,yes,no,cellular,jun,thu,339,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1 5 | 39,services,married,high.school,no,no,no,cellular,apr,fri,185,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0 6 | 55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,137,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1 7 | 30,management,divorced,basic.4y,no,yes,no,cellular,jul,tue,68,8,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,0 8 | 37,blue-collar,married,basic.4y,no,yes,no,cellular,may,thu,204,1,999,0,nonexistent,-1.8,92.893,-46.2,1.327,5099.1,0 9 | 39,blue-collar,divorced,basic.9y,no,yes,no,cellular,may,fri,191,1,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,0 10 | 36,admin.,married,university.degree,no,no,no,cellular,jun,mon,174,1,3,1,success,-2.9,92.963,-40.8,1.266,5076.2,1 11 | -------------------------------------------------------------------------------- /tests/resources/data/dataset_sequence.jsonl: -------------------------------------------------------------------------------- 1 | {"hypothesis": "Irmalotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt 
Irmalotte-S92@freemail.de 22.4.1992 01636496234", "premise": " DE Frau Dr. Iramlotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 17. Juli 1967 01636496234", "label": "duplicate"} 2 | {"hypothesis": "Irmalotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 22.4.1992 01636496234", "premise": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", "label": "not_duplicate"} 3 | {"hypothesis": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", "premise": "Frau Dr. Iramlotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 17. Juli 1967 01636496234", "label": "not_duplicate"} 4 | {"hypothesis": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", "premise": " DE Herr Karlheinz Hofamnn Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3. October 52 0152359493301", "label": "duplicate"} 5 | -------------------------------------------------------------------------------- /docs/prepare_versioned_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | print_help(){ 4 | echo "Usage: bash" "$0" 5 | echo "" 6 | echo " Small bash script to prepare the docs for a _versioned_ build." 7 | echo "" 8 | echo " The environment variable BIOME_TEXT_DOC_VERSION must be set!" 9 | echo " This env variable must match the release tag (e.g. v2.2.0 or v2.2.0rc1)." 10 | } 11 | 12 | if [ "$1" == "--help" ] || [ "$1" == "-h" ]; then 13 | print_help 14 | exit 0 15 | fi 16 | 17 | if [ -z "$BIOME_TEXT_DOC_VERSION" ]; then 18 | echo "ERROR: BIOME_TEXT_DOC_VERSION not set!" 19 | print_help 20 | exit 1 21 | fi 22 | 23 | 24 | echo " - Modifying font urls ..." 25 | 26 | if ! sed -i "s|/biome-text/master/|/biome-text/$BIOME_TEXT_DOC_VERSION/|g" ./docs/.vuepress/theme/styles/fonts.styl; then 27 | echo "ERROR: Could not modify 'fonts.styl'!" 28 | exit 1 29 | fi 30 | 31 | 32 | echo " - Modifying tutorials ..." 33 | 34 | modified=$(find ./docs/documentation/tutorials -maxdepth 1 -name "*.ipynb" \ 35 | -exec sed -i -e "s|pip install -U git+https://github.com/recognai/biome-text.git|pip install -U biome-text|g" \ 36 | -e "s|/biome-text/master/|/biome-text/$BIOME_TEXT_DOC_VERSION/|g" \ 37 | -e "s|/biome-text/blob/master/|/biome-text/blob/$BIOME_TEXT_DOC_VERSION/|g" {} \; \ 38 | -exec echo {} \; | wc -l) 39 | if [ "$modified" -eq 0 ]; then 40 | echo "ERROR: No tutorials modified!" 41 | exit 1 42 | fi 43 | 44 | 45 | echo " - Done!" 46 | 47 | exit 0 48 | -------------------------------------------------------------------------------- /src/biome/text/errors.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException 2 | 3 | 4 | class BaseError(Exception): 5 | """Base error. 
This class could include common error attributes or methods""" 6 | 7 | pass 8 | 9 | 10 | class ValidationError(BaseError): 11 | """Base error for data validation""" 12 | 13 | pass 14 | 15 | 16 | class WrongInputError(ValidationError): 17 | """Error related with input params""" 18 | 19 | def __init__(self, arg_name: str): 20 | super(WrongInputError, self).__init__() 21 | self.arg_name = arg_name 22 | 23 | def __str__(self) -> str: 24 | return f"Wrong model input '{self.arg_name}'" 25 | 26 | 27 | class ActionNotSupportedError(ValidationError): 28 | """Raised when an action is not supported for a given component state""" 29 | 30 | 31 | class EmptyVocabError(ValidationError): 32 | """Error related with using empty vocabs for a training""" 33 | 34 | pass 35 | 36 | 37 | class WrongValueError(ValidationError): 38 | """Wrong value error""" 39 | 40 | 41 | class http_error_handling: 42 | """Error handling for http error transcription""" 43 | 44 | def __enter__(self): 45 | pass 46 | 47 | def __exit__(self, exc_type, exc_val, exc_tb): 48 | if isinstance(exc_val, ValidationError): 49 | raise HTTPException(status_code=400, detail=str(exc_val)) 50 | if isinstance(exc_val, Exception): 51 | # Common http error handling 52 | raise HTTPException(status_code=500, detail=str(exc_val)) 53 | -------------------------------------------------------------------------------- /src/biome/text/modules/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification.doc_classification import DocumentClassification 2 | from .classification.doc_classification import DocumentClassificationConfiguration 3 | from .classification.record_classification import RecordClassification 4 | from .classification.record_classification import RecordClassificationConfiguration 5 | from .classification.record_pair_classification import RecordPairClassification 6 | from .classification.record_pair_classification import ( 7 | RecordPairClassificationConfiguration, 8 | ) 9 | from .classification.relation_classification import RelationClassification 10 | from .classification.relation_classification import RelationClassificationConfiguration 11 | from .classification.text_classification import TextClassification 12 | from .classification.text_classification import TextClassificationConfiguration 13 | from .language_modelling import LanguageModelling 14 | from .language_modelling import LanguageModellingConfiguration 15 | from .task_head import TaskHead 16 | from .task_head import TaskHeadConfiguration 17 | from .task_head import TaskName 18 | from .task_head import TaskPrediction 19 | from .token_classification import TokenClassification 20 | from .token_classification import TokenClassificationConfiguration 21 | 22 | for head in [ 23 | TextClassification, 24 | TokenClassification, 25 | DocumentClassification, 26 | RecordClassification, 27 | LanguageModelling, 28 | RecordPairClassification, 29 | RelationClassification, 30 | ]: 31 | head.register(overrides=True) 32 | -------------------------------------------------------------------------------- /tests/resources/data/dataset_sequence.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.1.0", 3 | "data": [ 4 | { 5 | "hypothesis": "Irmalotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 22.4.1992 01636496234", 6 | "premise": " DE Frau Dr. Iramlotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 17. 
Juli 1967 01636496234", 7 | "label": "duplicate" 8 | }, 9 | { 10 | "hypothesis": "Irmalotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 22.4.1992 01636496234", 11 | "premise": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", 12 | "label": "not_duplicate" 13 | }, 14 | { 15 | "hypothesis": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", 16 | "premise": "Frau Dr. Iramlotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 17. Juli 1967 01636496234", 17 | "label": "not_duplicate" 18 | }, 19 | { 20 | "hypothesis": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", 21 | "premise": " DE Herr Karlheinz Hofamnn Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3. October 52 0152359493301", 22 | "label": "duplicate" 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /docs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "docs", 3 | "version": "1.0.0", 4 | "description": "Documentation", 5 | "private": true, 6 | "env": { 7 | "apipath": "docs/api", 8 | "package": "biome.text", 9 | "templates_path": ".templates/" 10 | }, 11 | "scripts": { 12 | "clean:api": "rm -rf $npm_package_env_apipath/biome", 13 | "build:api": "pdoc -o $npm_package_env_apipath $npm_package_env_package --force --template-dir $npm_package_env_templates_path --html", 14 | "rename:index": "find $npm_package_env_apipath | renamer --find index.md --replace README.md", 15 | "rename:html": "find $npm_package_env_apipath | renamer --find .html --replace .md", 16 | "rename": "npm run rename:html && npm run rename:index", 17 | "build:tutorials": "find docs/documentation/tutorials -iname *.ipynb -maxdepth 1 -exec jupyter nbconvert --to markdown {} \\;", 18 | "build:docs": "npm run clean:api && npm run build:api && npm run build:tutorials && npm run rename", 19 | "build:site": "npm run build:docs && vuepress build docs", 20 | "dev:site": "npm run build:docs && vuepress dev docs", 21 | "docs:dev": "npm run dev:site", 22 | "docs:svgo": "vuepress svgo docs" 23 | }, 24 | "devDependencies": { 25 | "@goy/vuepress-plugin-svg-icons": "^4.1.0", 26 | "@vuepress/plugin-active-header-links": "^1.4.1", 27 | "@vuepress/plugin-back-to-top": "^1.4.1", 28 | "renamer": "^2.0.0", 29 | "vuepress": "^1.4.1" 30 | }, 31 | "dependencies": { 32 | "axios": ">=0.21.1", 33 | "v-click-outside": "^3.1.2", 34 | "vuepress-bar": "^0.3.0" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_copy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from numpy.testing import assert_allclose 3 | 4 | from biome.text import Dataset 5 | from biome.text import Pipeline 6 | from biome.text import Trainer 7 | from biome.text import TrainerConfiguration 8 | 9 | 10 | @pytest.fixture 11 | def pipeline(): 12 | return Pipeline.from_config( 13 | { 14 | "name": "test_pipeline_copy", 15 | "head": { 16 | "type": "TextClassification", 17 | "labels": ["a", "b"], 18 | }, 19 | } 20 | ) 21 | 22 | 23 | @pytest.fixture 24 | def dataset(): 25 | return Dataset.from_dict( 26 | { 27 | "text": ["this is", "a test"], 28 | "label": ["a", 
"b"], 29 | } 30 | ) 31 | 32 | 33 | def test_copy(pipeline): 34 | prediction = pipeline.predict("check this") 35 | pipeline_copy = pipeline.copy() 36 | prediction_copy = pipeline_copy.predict("check this") 37 | 38 | assert_allclose(prediction["probabilities"], prediction_copy["probabilities"]) 39 | 40 | 41 | def test_train_from_pretrained(pipeline, dataset, tmp_path): 42 | output_path = tmp_path / "test_train_from_pretrained_output" 43 | trainer_config = TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0) 44 | trainer = Trainer( 45 | pipeline=pipeline, train_dataset=dataset, trainer_config=trainer_config 46 | ) 47 | trainer.fit(output_path) 48 | 49 | prediction = pipeline.predict("a test") 50 | pipeline_loaded = Pipeline.from_pretrained(output_path / "model.tar.gz") 51 | prediction_loaded = pipeline_loaded.predict("a test") 52 | 53 | assert_allclose(prediction["probabilities"], prediction_loaded["probabilities"]) 54 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/Sidebar.vue: -------------------------------------------------------------------------------- 1 | 19 | 20 | 33 | 34 | 71 | -------------------------------------------------------------------------------- /src/biome/text/metrics.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Any 3 | from typing import Dict 4 | 5 | from allennlp.common import Params 6 | from allennlp.training.metrics import Metric 7 | 8 | 9 | class Metrics: 10 | """Stores two dictionaries of identical metrics, one for training and one for validation. 11 | 12 | Parameters 13 | ---------- 14 | **kwargs 15 | The key defines the name of the metric, the value must be a dictionary that can be used to instantiate a 16 | child class of `allennlp.training.metrics.Metric` via its `from_params` method. 17 | 18 | Examples 19 | -------- 20 | >>> from allennlp.training.metrics import Metric 21 | >>> metrics = Metrics(accuracy={"type": "categorical_accuracy"}, f1={"type": "fbeta"}) 22 | >>> for metric in metrics.get_dict(is_train=False).values(): 23 | ... 
assert isinstance(metric, Metric)
24 |     """
25 |
26 |     def __init__(self, **kwargs: Dict[str, Any]):
27 |         self.training_metrics = {}
28 |         self.validation_metrics = {}
29 |         for name, metric_kwargs in kwargs.items():
30 |             # We need special logic for the vocabulary: we do not want to deep-copy it,
31 |             # and it cannot be used in Params
32 |             vocab = metric_kwargs.pop("vocabulary", None)
33 |             self.training_metrics[name] = Metric.from_params(
34 |                 Params(copy.deepcopy(metric_kwargs)),
35 |                 **{} if vocab is None else {"vocabulary": vocab}
36 |             )
37 |             self.validation_metrics[name] = Metric.from_params(
38 |                 Params(metric_kwargs), **{} if vocab is None else {"vocabulary": vocab}
39 |             )
40 |
41 |     def get_dict(self, is_train: bool = True) -> Dict[str, Metric]:
42 |         if is_train:
43 |             return self.training_metrics
44 |         return self.validation_metrics
45 |
--------------------------------------------------------------------------------
/src/biome/text/cli/evaluate.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import click
4 |
5 | from biome.text import Pipeline
6 | from biome.text.cli.train import dataset_from_path
7 |
8 |
9 | @click.command()
10 | @click.argument(
11 |     "pipeline_path",
12 |     type=click.Path(exists=True),
13 | )
14 | @click.option(
15 |     "--output",
16 |     "-o",
17 |     type=click.Path(),
18 |     required=True,
19 |     help="Path to write the evaluation metrics to.",
20 | )
21 | @click.option(
22 |     "--dataset",
23 |     "-ds",
24 |     type=click.Path(exists=True),
25 |     required=True,
26 |     help="Path to the dataset.",
27 | )
28 | @click.option(
29 |     "--batch_size",
30 |     "-bs",
31 |     type=int,
32 |     default=16,
33 |     show_default=True,
34 |     help="Batch size during evaluation.",
35 | )
36 | @click.option(
37 |     "--lazy",
38 |     "-l",
39 |     type=bool,
40 |     default=False,
41 |     show_default=True,
42 |     help="If true, data is lazily loaded from disk; otherwise it is loaded into memory.",
43 | )
44 | @click.option(
45 |     "--prediction_output",
46 |     "-po",
47 |     type=click.Path(),
48 |     default=None,
49 |     help="Write batch predictions to this file.",
50 | )
51 | def evaluate(
52 |     pipeline_path: str,
53 |     output: str,
54 |     dataset: str,
55 |     batch_size: int = 16,
56 |     lazy: bool = False,
57 |     prediction_output: Optional[str] = None,
58 | ) -> None:
59 |     """Evaluate a pipeline on a given dataset.
60 |
61 |     PIPELINE_PATH is the path to a pretrained pipeline (model.tar.gz file).
62 | """ 63 | pipeline = Pipeline.from_pretrained(pipeline_path) 64 | dataset = dataset_from_path(dataset) 65 | 66 | pipeline.evaluate( 67 | dataset, 68 | batch_size=batch_size, 69 | lazy=lazy, 70 | predictions_output_file=prediction_output, 71 | metrics_output_file=output, 72 | ) 73 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_to_mlflow.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import mlflow 4 | import pandas as pd 5 | import pytest 6 | import yaml 7 | from numpy.testing import assert_allclose 8 | 9 | from biome.text import Pipeline 10 | from biome.text import __version__ 11 | 12 | 13 | @pytest.fixture 14 | def pipeline(): 15 | return Pipeline.from_config( 16 | { 17 | "name": "test_pipeline_copy", 18 | "head": {"type": "TextClassification", "labels": ["a", "b"]}, 19 | } 20 | ) 21 | 22 | 23 | def test_to_mlflow(pipeline, tmp_path): 24 | test_str_for_prediction = "test this prediction" 25 | expected_prediction = pipeline.predict(text=test_str_for_prediction) 26 | 27 | model_uri = pipeline.to_mlflow( 28 | tracking_uri=str(tmp_path / "to_mlflow_test"), experiment_id=0 29 | ) 30 | 31 | df = mlflow.search_runs(experiment_ids=["0"]) 32 | assert len(df) == 1 and df["tags.mlflow.runName"][0] == "log_biometext_model" 33 | 34 | # load MLFlow model and make predictions 35 | model = mlflow.pyfunc.load_model(model_uri=model_uri) 36 | prediction: pd.DataFrame = model.predict( 37 | pd.DataFrame([{"text": test_str_for_prediction}]) 38 | ) 39 | 40 | assert len(prediction) == 1 41 | assert expected_prediction["labels"] == prediction["labels"][0] 42 | assert_allclose( 43 | expected_prediction["probabilities"], prediction["probabilities"][0] 44 | ) 45 | with (Path(model_uri) / "conda.yaml").open() as file: 46 | conda_env = yaml.load(file) 47 | assert conda_env == { 48 | "name": "mlflow-dev", 49 | "channels": ["defaults", "conda-forge"], 50 | "dependencies": [ 51 | "python=3.7.9", 52 | "pip>=20.3.0", 53 | {"pip": ["mlflow", f"biome-text=={__version__}"]}, 54 | ], 55 | } 56 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_with_optional_inputs.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from typing import List 3 | from typing import Optional 4 | from typing import Union 5 | 6 | from astroid import Instance 7 | 8 | from biome.text import Pipeline 9 | from biome.text import PipelineConfiguration 10 | from biome.text.configuration import FeaturesConfiguration 11 | from biome.text.modules.heads import TaskHeadConfiguration 12 | from biome.text.modules.heads import TextClassification 13 | 14 | 15 | class MyCustomHead(TextClassification): 16 | """Just a head renaming the original TextClassification head""" 17 | 18 | def inputs(self) -> Optional[List[str]]: 19 | return ["text", "second_text"] 20 | 21 | def featurize( 22 | self, 23 | text: Any, 24 | second_text: Optional[Any] = None, 25 | label: Optional[Union[int, str, List[Union[int, str]]]] = None, 26 | ) -> Optional[Instance]: 27 | instance = self.backbone.featurizer( 28 | {"text": text, "text-2": second_text}, 29 | to_field=self.forward_arg_name, 30 | aggregate=True, 31 | exclude_record_keys=True, 32 | ) 33 | return self._add_label(instance, label, to_field=self.label_name) 34 | 35 | 36 | def test_check_pipeline_inputs_and_output(): 37 | config = PipelineConfiguration( 38 | "test-pipeline", 39 
| head=TaskHeadConfiguration( 40 | type=MyCustomHead, 41 | labels=[ 42 | "blue-collar", 43 | "technician", 44 | "management", 45 | "services", 46 | "retired", 47 | "admin.", 48 | ], 49 | ), 50 | features=FeaturesConfiguration(), 51 | ) 52 | 53 | pipeline = Pipeline.from_config(config) 54 | 55 | assert pipeline.inputs == ["text", "second_text"] 56 | assert pipeline.output == ["label"] 57 | -------------------------------------------------------------------------------- /docs/docs/documentation/community/1-contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We are open and very happy to receive contributions to make *biome.text* more useful for you and others. 4 | 5 | If you want to start contributing to *biome.text*, this is the right place to start. 6 | There are basically three ways you can contribute to *biome.text*: 7 | 8 | 1. report a bug 9 | 2. make a feature request 10 | 3. submit a pull request 11 | 12 | ## Report a bug 13 | 14 | To report a bug in the library or point out an error in the documentation, please open an [issue on GitHub](https://github.com/recognai/biome-text/issues/new/choose). 15 | 16 | ## Make a feature request 17 | 18 | If you are missing some feature in the library, please let us know in a [GitHub issue](https://github.com/recognai/biome-text/issues/new/choose). 19 | It is always helpful if you describe a concrete use case for the feature. 20 | 21 | ## Submit a pull request 22 | 23 | You can contribute to the code base via [Pull Requests (PRs)](https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/about-pull-requests). 24 | Here is a quick guide on how to [set up your system](./3-developer_guides.md#setting-up-for-development) for *biome.text* development. 25 | 26 | A PR should always reference an issue. 27 | So before starting to work on some bugfix or new feature, make sure to open a corresponding GitHub issue. 28 | If a corresponding issue already exists, please leave a quick comment that you are working on it. 29 | 30 | **For example**: you find an error in the documentation and open a new issue, #13, describing the error. 31 | You want to fix the error and create a new branch in your forked repo with a meaningful name, such as `documentation/#13`. 32 | You work on this branch, make the necessary changes, test them, push them and create a PR against our repo. 33 | This PR should include the text "Closes #13" at the end of its description.
34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import glob 4 | 5 | from setuptools import setup 6 | 7 | try: 8 | from setuptools import find_namespace_packages 9 | except ImportError as error: 10 | raise ImportError("Make sure you have setuptools >= 40.1.0 installed!") from error 11 | 12 | 13 | if __name__ == "__main__": 14 | setup( 15 | name="biome-text", 16 | use_scm_version=True, 17 | setup_requires=["setuptools_scm"], 18 | description="Biome-text is a light-weight open source Natural Language Processing toolbox" 19 | " built with AllenNLP", 20 | author="Recognai", 21 | author_email="francisco@recogn.ai", 22 | url="https://www.recogn.ai/", 23 | long_description=open("README.md").read(), 24 | long_description_content_type="text/markdown", 25 | packages=find_namespace_packages("src"), 26 | package_dir={"": "src"}, 27 | install_requires=[ 28 | "allennlp~=2.7.0", 29 | "beautifulsoup4~=4.9.0", 30 | "captum~=0.2.0", 31 | "click~=7.1.0", 32 | "datasets>=1.10.0,<1.12.0", 33 | "flatdict~=4.0.0", 34 | "lxml~=4.6.2", 35 | "mlflow>=1.13.1,<1.21.0", 36 | "numpy", 37 | "pandas", 38 | "pytorch-lightning~=1.4.0", 39 | "ray[tune]>=1.3.0,<1.7.0", 40 | "spacy>=2.3.0,<3.2.0", 41 | "torch", # the version is defined by allennlp 42 | "transformers", # the version is defined by allennlp 43 | "tqdm>=4.49.0", 44 | "fastapi~=0.63.0", # newer versions brings pydantic conflicts with spaCy 3.0.x 45 | "uvicorn>=0.13.0", 46 | "pyyaml", 47 | ], 48 | entry_points={"console_scripts": ["biome=biome.text.cli:main"]}, 49 | python_requires=">=3.6.1", # taken from AllenNLP 50 | zip_safe=False, 51 | ) 52 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/NavLink.vue: -------------------------------------------------------------------------------- 1 | 25 | 26 | 90 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from allennlp.data.token_indexers import PretrainedTransformerIndexer 3 | from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer 4 | 5 | from biome.text import Pipeline 6 | from biome.text.configuration import TokenizerConfiguration 7 | from biome.text.tokenizer import Tokenizer 8 | from biome.text.tokenizer import TransformersTokenizer 9 | 10 | 11 | @pytest.fixture 12 | def pipeline_dict(request) -> dict: 13 | """Pipeline config dict. 
You need to update the labels!""" 14 | pipeline_dict = { 15 | "name": "transformers_tokenizer_test", 16 | "features": { 17 | "transformers": {"model_name": "sshleifer/tiny-distilroberta-base"} 18 | }, 19 | "head": { 20 | "type": "TextClassification", 21 | "labels": ["a", "b"], 22 | }, 23 | } 24 | return pipeline_dict 25 | 26 | 27 | def test_pipeline_transformers_tokenizer(pipeline_dict): 28 | pipeline_dict["tokenizer"] = {"truncate_input": 1} 29 | pl = Pipeline.from_config(pipeline_dict) 30 | 31 | assert pl.config.tokenizer_config.transformers_kwargs == { 32 | "model_name": "sshleifer/tiny-distilroberta-base" 33 | } 34 | assert pl.config.features.transformers.mismatched is False 35 | assert ( 36 | type(pl.backbone.featurizer.indexer["transformers"]) 37 | is PretrainedTransformerIndexer 38 | ) 39 | assert type(pl.backbone.tokenizer) is TransformersTokenizer 40 | 41 | # test max_sequence_length, only , t, should survive 42 | assert ( 43 | len(pl.backbone.tokenizer.tokenize_text("this is a multi token text")[0]) == 3 44 | ) 45 | 46 | assert pl.predict("Test this!") 47 | 48 | 49 | def test_pipeline_default_tokenizer(pipeline_dict): 50 | pipeline_dict["features"].update({"word": {"embedding_dim": 2}}) 51 | pl = Pipeline.from_config(pipeline_dict) 52 | 53 | assert pl.config.tokenizer_config == TokenizerConfiguration() 54 | assert pl.config.features.transformers.mismatched is True 55 | assert ( 56 | type(pl.backbone.featurizer.indexer["transformers"]) 57 | is PretrainedTransformerMismatchedIndexer 58 | ) 59 | assert type(pl.backbone.tokenizer) is Tokenizer 60 | 61 | prediction = pl.predict("Test this!") 62 | -------------------------------------------------------------------------------- /tests/text/modules/heads/test_language_modelling.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import pytest 4 | 5 | from biome.text import Dataset 6 | from biome.text import Pipeline 7 | from biome.text import Trainer 8 | from biome.text import TrainerConfiguration 9 | 10 | 11 | @pytest.fixture 12 | def training_dataset() -> Dataset: 13 | """Creating the dataframe.""" 14 | data = { 15 | "text": [ 16 | "this is a text", 17 | "my name is dani", 18 | "this is a table", 19 | "my name is paco", 20 | ], 21 | } 22 | return Dataset.from_dict(data) 23 | 24 | 25 | @pytest.fixture 26 | def pipeline_dict() -> Dict: 27 | """Creating the pipeline dictionary""" 28 | 29 | pipeline_dict = { 30 | "name": "lm", 31 | "features": { 32 | "word": {"embedding_dim": 50, "lowercase_tokens": True, "trainable": True}, 33 | "char": { 34 | "embedding_dim": 50, 35 | "dropout": 0.1, 36 | "encoder": { 37 | "type": "gru", 38 | "hidden_size": 10, 39 | "num_layers": 1, 40 | "bidirectional": True, 41 | }, 42 | }, 43 | }, 44 | "encoder": { 45 | "type": "gru", 46 | "num_layers": 1, 47 | "hidden_size": 10, 48 | "bidirectional": True, 49 | }, 50 | "head": {"type": "LanguageModelling", "dropout": 0.1, "bidirectional": True}, 51 | } 52 | 53 | return pipeline_dict 54 | 55 | 56 | @pytest.fixture 57 | def trainer_config() -> TrainerConfiguration: 58 | return TrainerConfiguration( 59 | max_epochs=2, 60 | optimizer={"type": "adam", "amsgrad": True, "lr": 0.002}, 61 | gpus=0, 62 | ) 63 | 64 | 65 | def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path): 66 | """Testing the correct working of prediction, vocab creating and training""" 67 | 68 | pipeline = Pipeline.from_config(pipeline_dict) 69 | pipeline.predict(text="my name is juan") 70 | 71 | trainer = Trainer( 72 | 
pipeline=pipeline, 73 | train_dataset=training_dataset, 74 | valid_dataset=training_dataset, 75 | trainer_config=trainer_config, 76 | ) 77 | trainer.fit(tmp_path / "lm") 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .esrunner 2 | 3 | # docs 4 | docs/docs/api/biome/text/ 5 | docs/docs/documentation/tutorials/* 6 | !docs/docs/documentation/tutorials/*.ipynb 7 | !docs/docs/documentation/tutorials/img/ 8 | docs/site 9 | **/node_modules 10 | **/yarn.lock 11 | **/package-lock.json 12 | 13 | **/*.th 14 | **/*.tar.gz 15 | **/metrics*.json 16 | **/config.json 17 | **/events.out* 18 | **/vocabulary/*.txt 19 | 20 | **/webapp 21 | 22 | examples/**/experiment*/ 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # C extensions 30 | *.so 31 | 32 | # Distribution / packaging 33 | .Python 34 | env/ 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | dist/ 46 | var/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *,cover 70 | .hypothesis/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | local_settings.py 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # IPython Notebook 94 | .ipynb_checkpoints 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # celery beat schedule file 100 | celerybeat-schedule 101 | 102 | # dotenv 103 | .env 104 | 105 | # virtualenv 106 | .venv/ 107 | venv/ 108 | ENV/ 109 | .virtualenv/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | 118 | .idea 119 | 120 | .history 121 | 122 | .vscode 123 | **/dask-worker-space 124 | 125 | tools 126 | 127 | .generated* 128 | generated* 129 | 130 | *venv 131 | 132 | .DS_Store 133 | 134 | .dask 135 | 136 | **/mlruns/ 137 | **/runs/ 138 | **/.yalc/ 139 | 140 | #pylint 141 | .pylintrc 142 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_datasets.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pytest 5 | import torch 6 | 7 | from biome.text import Dataset 8 | from biome.text import Pipeline 9 | from biome.text import PipelineConfiguration 10 | from biome.text import Trainer 11 | from biome.text import TrainerConfiguration 12 | from biome.text.backbone import ModelBackbone 13 | from biome.text.modules.heads import TextClassification 14 | from biome.text.modules.heads import TextClassificationConfiguration 15 | 16 | 17 | class TestHead(TextClassification): 18 | def __init__(self, backbone: ModelBackbone): 19 | super(TestHead, self).__init__(backbone, labels=["test", "notest"]) 20 | 21 | 22 | @pytest.fixture 23 | def dataset(tmp_path) -> Dataset: 24 | data 
= { 25 | "text": ["A common text", "This is why you get", "Seriosly?, I'm not sure"], 26 | "label": ["one", "zero", "zero"], 27 | } 28 | ds = Dataset.from_dict(data) 29 | 30 | # we save and load it here to be able to lazily read from it 31 | ds_path = tmp_path / "test_pipeline_datasets" / "dataset" 32 | ds.save_to_disk(str(ds_path)) 33 | 34 | return Dataset.load_from_disk(str(ds_path)) 35 | 36 | 37 | @pytest.fixture 38 | def pipeline() -> Pipeline: 39 | config = PipelineConfiguration( 40 | name="test-classifier", 41 | head=TextClassificationConfiguration(labels=["one", "zero"]), 42 | ) 43 | return Pipeline.from_config(config) 44 | 45 | 46 | def test_training_from_pretrained_with_head_replace(pipeline, dataset, tmp_path): 47 | trainer_config = TrainerConfiguration( 48 | batch_size=2, 49 | max_epochs=5, 50 | gpus=0, 51 | ) 52 | 53 | trainer = Trainer(pipeline, train_dataset=dataset, trainer_config=trainer_config) 54 | trainer.fit(tmp_path / "output") 55 | 56 | pipeline.set_head(TestHead) 57 | pipeline.config.tokenizer_config.max_nr_of_sentences = 3 58 | copied = pipeline.copy() 59 | assert isinstance(copied.head, TestHead) 60 | assert copied.num_parameters == pipeline.num_parameters 61 | assert copied.num_trainable_parameters == pipeline.num_trainable_parameters 62 | copied_model_state = copied._model.state_dict() 63 | original_model_state = pipeline._model.state_dict() 64 | for key, value in copied_model_state.items(): 65 | if "backbone" in key: 66 | assert torch.all(torch.eq(value, original_model_state[key])) 67 | assert copied.backbone.featurizer.tokenizer.config.max_nr_of_sentences == 3 68 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_with_custom_head.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from tempfile import mkdtemp 4 | 5 | import pytest 6 | 7 | from biome.text import Dataset 8 | from biome.text import Pipeline 9 | from biome.text import PipelineConfiguration 10 | from biome.text.configuration import FeaturesConfiguration 11 | from biome.text.configuration import VocabularyConfiguration 12 | from biome.text.modules.heads import TaskHeadConfiguration 13 | from biome.text.modules.heads import TextClassification 14 | 15 | 16 | class MyCustomHead(TextClassification): 17 | """Just a head renaming the original TextClassification head""" 18 | 19 | pass 20 | 21 | 22 | @pytest.fixture 23 | def training_dataset() -> Dataset: 24 | """Creates the training dataset and gives the structure""" 25 | resources_path = ( 26 | Path(__file__).parent.parent.parent / "tests" / "resources" / "data" 27 | ) 28 | training_ds = Dataset.from_csv(paths=str(resources_path / "dataset_source.csv")) 29 | 30 | # Keeping just 'label' and text 'category' 31 | training_ds = training_ds.map( 32 | lambda x: {"label": x["job"], "text": x["education"] + " " + x["marital"]}, 33 | ) 34 | 35 | return training_ds 36 | 37 | 38 | def test_load_pipeline_with_custom_head(training_dataset, tmp_path): 39 | """Testing a model training inserting a class as custom heard""" 40 | 41 | # Pipeline configuration dict with custom head 42 | config = PipelineConfiguration( 43 | "test-pipeline", 44 | head=TaskHeadConfiguration( 45 | type=MyCustomHead, 46 | labels=[ 47 | "blue-collar", 48 | "technician", 49 | "management", 50 | "services", 51 | "retired", 52 | "admin.", 53 | ], 54 | ), 55 | features=FeaturesConfiguration(), 56 | ) 57 | 58 | # Asserting that pipeline.head is an instance of 
MyCustomHead 59 | pipeline = Pipeline.from_config(config) 60 | assert isinstance(pipeline.head, MyCustomHead) 61 | 62 | # Saving the pipeline to output 63 | output = tmp_path / "pipeline" 64 | pipeline.save(output) 65 | 66 | # Loading model from output 67 | trained_pl = Pipeline.from_pretrained(os.path.join(str(output), "model.tar.gz")) 68 | trained_pl.predict("Oh yeah") 69 | 70 | # Asserting that the pipeline head is recognized as `MyCustomHead` instance after loading from a model.tar.gz 71 | assert isinstance(trained_pl.head, MyCustomHead) 72 | -------------------------------------------------------------------------------- /src/biome/text/backbone.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from allennlp.data import TextFieldTensors 5 | from allennlp.data import Vocabulary 6 | from allennlp.modules import TextFieldEmbedder 7 | from allennlp.modules.seq2seq_encoders import PassThroughEncoder 8 | 9 | from .featurizer import InputFeaturizer 10 | from .modules.encoders import Encoder 11 | from .tokenizer import Tokenizer 12 | 13 | 14 | class ModelBackbone(torch.nn.Module): 15 | """The backbone of the model. 16 | 17 | It is composed of a tokenizer, featurizer and an encoder. 18 | This component of the model can be pretrained and used with different task heads. 19 | 20 | Attributes 21 | ---------- 22 | vocab 23 | The vocabulary of the pipeline 24 | featurizer 25 | Defines the input features of the tokens and indexes 26 | embedder 27 | The embedding layer 28 | encoder 29 | Outputs an encoded sequence of the tokens 30 | """ 31 | 32 | def __init__( 33 | self, 34 | vocab: Vocabulary, 35 | featurizer: InputFeaturizer, 36 | embedder: TextFieldEmbedder, 37 | encoder: Optional[Encoder] = None, 38 | ): 39 | super(ModelBackbone, self).__init__() 40 | 41 | self.vocab = vocab 42 | self.featurizer = featurizer 43 | self.embedder = embedder 44 | self.encoder = ( 45 | encoder.input_dim(self.embedder.get_output_dim()).compile() 46 | if encoder 47 | else PassThroughEncoder(self.embedder.get_output_dim()) 48 | ) 49 | 50 | @property 51 | def tokenizer(self) -> Tokenizer: 52 | return self.featurizer.tokenizer 53 | 54 | def forward( 55 | self, text: TextFieldTensors, mask: torch.Tensor, num_wrapping_dims: int = 0 56 | ) -> torch.Tensor: 57 | """Applies the embedding and encoding layer 58 | 59 | Parameters 60 | ---------- 61 | text 62 | Output of the `batch.as_tensor_dict()` method, basically the indices of the indexed tokens 63 | mask 64 | A mask indicating which one of the tokens are padding tokens 65 | num_wrapping_dims 66 | 0 if `text` is the output of a `TextField`, 1 if it is the output of a `ListField` 67 | 68 | Returns 69 | ------- 70 | tensor 71 | Encoded representation of the input 72 | """ 73 | embeddings = self.embedder(text, num_wrapping_dims=num_wrapping_dims) 74 | return self.encoder(embeddings, mask=mask) 75 | -------------------------------------------------------------------------------- /src/biome/text/cli/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import click 6 | 7 | from biome.text import Dataset 8 | from biome.text import Pipeline 9 | from biome.text import Trainer 10 | from biome.text import TrainerConfiguration 11 | from biome.text.helpers import yaml_to_dict 12 | 13 | 14 | @click.command() 15 | @click.argument( 16 | "pipeline_path", 17 | 
type=click.Path(exists=True), 18 | required=True, 19 | ) 20 | @click.option( 21 | "--output", 22 | "-o", 23 | type=click.Path(), 24 | required=True, 25 | help="Path of the training output.", 26 | ) 27 | @click.option( 28 | "--trainer_config", 29 | type=click.Path(exists=True), 30 | required=True, 31 | help="Path to the trainer configuration YAML file.", 32 | ) 33 | @click.option( 34 | "--train_data", 35 | type=click.Path(exists=True), 36 | required=True, 37 | help="Path to the training data.", 38 | ) 39 | @click.option( 40 | "--valid_data", 41 | type=click.Path(exists=True), 42 | required=False, 43 | help="Path to the validation data.", 44 | ) 45 | def train( 46 | pipeline_path: str, 47 | output: str, 48 | trainer_config: str, 49 | train_data: str, 50 | valid_data: Optional[str] = None, 51 | ) -> None: 52 | """Train a pipeline. 53 | 54 | PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file), 55 | or the path to a pipeline configuration (YAML file). 56 | """ 57 | _, extension = os.path.splitext(pipeline_path) 58 | extension = extension[1:].lower() 59 | pipeline = ( 60 | Pipeline.from_yaml(pipeline_path) 61 | if extension in ["yaml", "yml"] 62 | else Pipeline.from_pretrained(pipeline_path) 63 | ) 64 | 65 | datasets = { 66 | "train": dataset_from_path(train_data), 67 | "validation": dataset_from_path(valid_data) if valid_data else None, 68 | } 69 | 70 | trainer = Trainer( 71 | pipeline=pipeline, 72 | train_dataset=datasets["train"], 73 | valid_dataset=datasets["validation"], 74 | trainer_config=TrainerConfiguration(**yaml_to_dict(trainer_config)), 75 | ) 76 | trainer.fit(output_dir=output) 77 | 78 | 79 | def dataset_from_path(path: str) -> Dataset: 80 | file_extension = Path(path).suffix 81 | if file_extension in [".csv"]: 82 | return Dataset.from_csv(path) 83 | elif file_extension in [".json", ".jsonl"]: 84 | return Dataset.from_json(path) 85 | else: 86 | raise ValueError( 87 | f"Could not create a Dataset from '{path}'. 
" 88 | f"We only support following formats: [csv, json, jsonl]" 89 | ) 90 | -------------------------------------------------------------------------------- /tests/text/test_model_predict.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from biome.text import Pipeline 4 | from biome.text.configuration import PredictionConfiguration 5 | from biome.text.model import PipelineModel 6 | from biome.text.modules.heads.task_prediction import TaskPrediction 7 | 8 | 9 | @pytest.fixture 10 | def model() -> PipelineModel: 11 | pipeline = Pipeline.from_config( 12 | { 13 | "name": "test_predict", 14 | "head": {"type": "TextClassification", "labels": ["a"]}, 15 | } 16 | ) 17 | return pipeline._model 18 | 19 | 20 | def test_activate_eval_mode(model): 21 | model.train() 22 | model.predict([{"text": "test"}], PredictionConfiguration) 23 | assert model.training is False 24 | 25 | 26 | def test_forward_pass_error(model, monkeypatch, caplog): 27 | def mock_text_to_instance(**kwargs): 28 | return "mock instance" 29 | 30 | def mock_forward_on_instances(*args, **kwargs): 31 | raise Exception("mock Exception") 32 | 33 | monkeypatch.setattr(model, "text_to_instance", mock_text_to_instance) 34 | monkeypatch.setattr(model, "forward_on_instances", mock_forward_on_instances) 35 | 36 | predictions = model.predict( 37 | [{"text": "Some value that breaks the forward pass"}], PredictionConfiguration 38 | ) 39 | 40 | assert predictions == [model.head.empty_prediction] 41 | assert len(caplog.record_tuples) == 2 42 | assert caplog.record_tuples[0] == ("biome.text.model", 40, "mock Exception") 43 | assert caplog.record_tuples[1] == ( 44 | "biome.text.model", 45 | 30, 46 | "Failed to make a forward pass for '[{'text': 'Some value that breaks the forward pass'}]'", 47 | ) 48 | 49 | 50 | def test_return_type(model, monkeypatch): 51 | def mock_make_task_prediction(*args, **kwargs): 52 | return TaskPrediction() 53 | 54 | monkeypatch.setattr(model.head, "make_task_prediction", mock_make_task_prediction) 55 | 56 | predictions = model.predict( 57 | [{"text": "test"}, {"text": "test2"}], PredictionConfiguration() 58 | ) 59 | assert isinstance(predictions, list) 60 | assert all([isinstance(pred, TaskPrediction) for pred in predictions]) 61 | 62 | 63 | def test_text_to_instance(model, caplog): 64 | with pytest.raises(TypeError): 65 | model.text_to_instance(wrong_kwarg="wrong argument") 66 | 67 | with pytest.raises(TypeError): 68 | model.text_to_instance(label="missing required argument") 69 | 70 | model.text_to_instance(text="") 71 | assert caplog.record_tuples[0] == ( 72 | "biome.text.model", 73 | 30, 74 | "The provided input data contains empty strings/tokens: ", 75 | ) 76 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/config.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const glob = require("glob"); 3 | 4 | // The env variable is set in our GitHub Action CI when building the docs. 5 | // It must be the same as the release tag or 'master', that is e.g. "v2.0.0" or "v2.1.0rc1" or "master" 6 | const basePath = process.env.BIOME_TEXT_DOC_VERSION 7 | ? 
`/biome-text/${process.env.BIOME_TEXT_DOC_VERSION}/` 8 | : "/biome-text/master/" 9 | 10 | function getSidebarChildren(location, replacement) { 11 | if (!replacement) { 12 | replacement = location 13 | } 14 | return glob.sync( 15 | location + '/**/*.md').map( 16 | f => f.replace(replacement + '/','')).filter(s => s.toLowerCase().indexOf("readme.md") == -1 17 | ) 18 | } 19 | 20 | module.exports = { 21 | dest: 'site', 22 | title: 'biome.text', 23 | description: 'biome.text practical NLP open source library.', 24 | head: [ 25 | ['meta', { name: 'viewport', content: 'width=device-width, initial-scale=1.0' }], 26 | ['link', { rel: "shortcut icon", href: "/favicon.ico"}], 27 | ['meta', { property: 'og:image', content: 'https://www.recogn.ai/images/biome_og.png' }], 28 | ], 29 | base: basePath, 30 | plugins: [ 31 | '@goy/svg-icons', 32 | '@vuepress/back-to-top' 33 | ], 34 | themeConfig: { 35 | sidebarDepth: 1, 36 | displayAllHeaders: false, 37 | searchPlaceholder: 'Search', 38 | nav: [ 39 | { text: 'API', link: '/api/'}, 40 | { text: 'Documentation', link: '/documentation/'}, 41 | { text: 'Github', link: 'https://github.com/recognai/biome-text' }, 42 | { text: 'Recognai', link: 'https://recogn.ai' }, 43 | ], 44 | sidebar: { 45 | '/api/': [{ 46 | title: 'API', 47 | children: getSidebarChildren('docs/api'), 48 | collapsable: false, 49 | }], 50 | '/documentation/': [ 51 | { 52 | title: 'Get started', 53 | children: ['', 'basics.md'], 54 | collapsable: false 55 | }, 56 | { 57 | title: 'Tutorials', 58 | children:getSidebarChildren('docs/documentation/tutorials', 'docs/documentation'), 59 | collapsable: false 60 | }, 61 | { 62 | title: 'User Guides', 63 | children:getSidebarChildren('docs/documentation/user-guides', 'docs/documentation'), 64 | collapsable: false 65 | }, 66 | { 67 | title: 'Community', 68 | children:getSidebarChildren('docs/documentation/community', 'docs/documentation'), 69 | collapsable: false 70 | }] 71 | }, 72 | algolia: { 73 | apiKey: '4f8d6b27d633951bde8c33e391ea6a4d', 74 | indexName: 'recogn_biome-text' 75 | }, 76 | plugins: ['@vuepress/active-header-links'], 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_predict.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from biome.text import Pipeline 4 | from biome.text.modules.heads.task_prediction import TextClassificationPrediction 5 | 6 | 7 | @pytest.fixture 8 | def pipeline() -> Pipeline: 9 | return Pipeline.from_config( 10 | { 11 | "name": "test_predict", 12 | "head": {"type": "TextClassification", "labels": ["a"]}, 13 | } 14 | ) 15 | 16 | 17 | def test_return_empty_prediction_for_failed_prediction(pipeline): 18 | empty_prediction = {"labels": [], "probabilities": []} 19 | assert pipeline.predict("") == empty_prediction 20 | assert ( 21 | pipeline.predict(batch=[{"text": ""}, {"text": ""}]) == [empty_prediction] * 2 22 | ) 23 | 24 | 25 | def test_batch_parameter_gets_ignored(pipeline): 26 | prediction = pipeline.predict("testtt", batch=[{"text": "test"}], add_tokens=True) 27 | assert prediction["tokens"][0]["text"] == "testtt" 28 | 29 | prediction = pipeline.predict( 30 | text="testtt", batch=[{"text": "test"}], add_tokens=True 31 | ) 32 | assert prediction["tokens"][0]["text"] == "testtt" 33 | 34 | 35 | def test_map_args_kwargs_to_input(): 36 | class MockPipeline: 37 | def __init__(self, inputs): 38 | self._inputs = inputs 39 | 40 | @property 41 | def inputs(self): 42 | return self._inputs 43 
| 44 | assert Pipeline._map_args_kwargs_to_input(MockPipeline(["text"]), "test") == { 45 | "text": "test" 46 | } 47 | assert Pipeline._map_args_kwargs_to_input(MockPipeline(["text"]), text="test") == { 48 | "text": "test" 49 | } 50 | assert Pipeline._map_args_kwargs_to_input( 51 | MockPipeline(["text", "text2"]), "test", text2="test2" 52 | ) == {"text": "test", "text2": "test2"} 53 | 54 | 55 | def test_return_single_or_list(pipeline, monkeypatch): 56 | def mock_predict(batch, prediction_config): 57 | return [ 58 | TextClassificationPrediction(labels=["a"], probabilities=[1]) 59 | if i % 2 == 0 60 | else pipeline.head.empty_prediction 61 | for i, _ in enumerate(batch) 62 | ] 63 | 64 | monkeypatch.setattr(pipeline._model, "predict", mock_predict) 65 | 66 | assert isinstance(pipeline.predict("test"), dict) 67 | 68 | batch_prediction = pipeline.predict(batch=[{"text": "test"}]) 69 | assert isinstance(batch_prediction, list) and len(batch_prediction) == 1 70 | assert isinstance(batch_prediction[0], dict) 71 | 72 | batch_prediction = pipeline.predict( 73 | batch=[{"text": "test"}, {"text": "no instance for this input"}] 74 | ) 75 | assert isinstance(batch_prediction, list) and len(batch_prediction) == 2 76 | assert ( 77 | isinstance(batch_prediction[0], dict) 78 | and batch_prediction[1] == pipeline.head.empty_prediction.as_dict() 79 | ) 80 | -------------------------------------------------------------------------------- /tests/docs/test_configurations.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | from typing import Dict 4 | 5 | import pytorch_lightning 6 | import torch.nn as nn 7 | from allennlp.common import Params 8 | from allennlp.training.learning_rate_schedulers import LearningRateScheduler 9 | from allennlp.training.optimizers import Optimizer 10 | 11 | from biome.text import Dataset 12 | from biome.text import Pipeline 13 | from biome.text import Trainer 14 | from biome.text import TrainerConfiguration 15 | from biome.text import VocabularyConfiguration 16 | 17 | 18 | def _read_configs(configurations_path: Path, section: str) -> Dict[str, Any]: 19 | code_blocks = {} 20 | with configurations_path.open() as file: 21 | in_section = False 22 | in_new_config = False 23 | 24 | for line in file.readlines(): 25 | if line.startswith(f"## {section}"): 26 | in_section = True 27 | elif line.startswith("### ") and in_section: 28 | code_blocks[line.split(maxsplit=1)[1]] = "" 29 | elif line.startswith("```python") and in_section: 30 | in_new_config = True 31 | elif line.startswith("```") and in_new_config: 32 | in_new_config = False 33 | elif line.startswith("## ") and in_section: 34 | in_section = False 35 | 36 | elif in_section and in_new_config: 37 | key = list(code_blocks.keys())[-1] 38 | code_blocks[key] += line 39 | 40 | configurations = {} 41 | for name, code in code_blocks.items(): 42 | config = {} 43 | exec(code, globals(), config) 44 | configurations[name] = config[list(config.keys())[-1]] 45 | 46 | return configurations 47 | 48 | 49 | def test_pipeline_configs(configurations_path): 50 | configs = _read_configs(configurations_path, "Pipeline") 51 | for config_name, config in configs.items(): 52 | Pipeline.from_config(config) 53 | 54 | 55 | def test_trainer_configs(configurations_path): 56 | configs = _read_configs(configurations_path, "Trainer") 57 | pipeline = Pipeline.from_config( 58 | { 59 | "name": "test", 60 | "head": {"type": "TextClassification", "labels": ["pos", "neg"]}, 61 | } 62 | ) 
63 | dataset = Dataset.from_dict({"text": ["test"], "label": ["pos"]}) 64 | linear = nn.Linear(2, 2) 65 | for config_name, config in configs.items(): 66 | assert isinstance(config, TrainerConfiguration) 67 | 68 | trainer = Trainer( 69 | pipeline=pipeline, train_dataset=dataset, trainer_config=config 70 | ) 71 | assert isinstance(trainer.trainer, pytorch_lightning.Trainer) 72 | 73 | 74 | def test_vocab_configs(configurations_path): 75 | configs = _read_configs(configurations_path, "Vocabulary") 76 | for config_name, config in configs.items(): 77 | assert isinstance(config, VocabularyConfiguration) 78 | -------------------------------------------------------------------------------- /docs/.templates/config.mako: -------------------------------------------------------------------------------- 1 | <%! 2 | # Template configuration. Copy over in your template directory 3 | # (used with `--template-dir`) and adapt as necessary. 4 | # Note, defaults are loaded from this distribution file, so your 5 | # config.mako only needs to contain values you want overridden. 6 | # You can also run pdoc with `--config KEY=VALUE` to override 7 | # individual values. 8 | 9 | html_lang = 'en' 10 | show_inherited_members = False 11 | extract_module_toc_into_sidebar = True 12 | list_class_variables_in_index = True 13 | sort_identifiers = False 14 | show_type_annotations = True 15 | 16 | # The default docstring format 17 | docformat = 'numpy' 18 | 19 | # Show collapsed source code block next to each item. 20 | # Disabling this can improve rendering speed of large modules. 21 | show_source_code = False 22 | 23 | # If set, format links to objects in online source code repository 24 | # according to this template. Supported keywords for interpolation 25 | # are: commit, path, start_line, end_line. 26 | #git_link_template = 'https://github.com/USER/PROJECT/blob/{commit}/{path}#L{start_line}-L{end_line}' 27 | #git_link_template = 'https://gitlab.com/USER/PROJECT/blob/{commit}/{path}#L{start_line}-L{end_line}' 28 | #git_link_template = 'https://bitbucket.org/USER/PROJECT/src/{commit}/{path}#lines-{start_line}:{end_line}' 29 | #git_link_template = 'https://CGIT_HOSTNAME/PROJECT/tree/{path}?id={commit}#n{start-line}' 30 | git_link_template = None 31 | 32 | # A prefix to use for every HTML hyperlink in the generated documentation. 33 | # No prefix results in all links being relative. 34 | link_prefix = '' 35 | 36 | # Enable syntax highlighting for code/source blocks by including Highlight.js 37 | syntax_highlighting = True 38 | 39 | # Set the style keyword such as 'atom-one-light' or 'github-gist' 40 | # Options: https://github.com/highlightjs/highlight.js/tree/master/src/styles 41 | # Demo: https://highlightjs.org/static/demo/ 42 | hljs_style = 'github' 43 | 44 | # If set, insert Google Analytics tracking code. Value is GA 45 | # tracking id (UA-XXXXXX-Y). 46 | google_analytics = '' 47 | 48 | # If set, insert Google Custom Search search bar widget above the sidebar index. 49 | # The whitespace-separated tokens represent arbitrary extra queries (at least one 50 | # must match) passed to regular Google search. Example: 51 | #search_query = 'inurl:github.com/USER/PROJECT site:PROJECT.github.io site:PROJECT.website' 52 | search_query = '' 53 | 54 | # If set, render LaTeX math syntax within \(...\) (inline equations), 55 | # or within \[...\] or $$...$$ or `.. math::` (block equations) 56 | # as nicely-formatted math formulas using MathJax. 
57 | # Note: in Python docstrings, either all backslashes need to be escaped (\\) 58 | # or you need to use raw r-strings. 59 | latex_math = False 60 | %> 61 | -------------------------------------------------------------------------------- /tests/resources/data/emotions_with_transformers.txt: -------------------------------------------------------------------------------- 1 | i didnt feel humiliated;sadness 2 | i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake;sadness 3 | im grabbing a minute to post i feel greedy wrong;anger 4 | i am ever feeling nostalgic about the fireplace i will know that it is still on the property;love 5 | i am feeling grouchy;anger 6 | ive been feeling a little burdened lately wasnt sure why that was;sadness 7 | ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny;surprise 8 | i feel as confused about life as a teenager or as jaded as a year old man;fear 9 | i have been with petronas for years i feel that petronas has performed well and made a huge profit;joy 10 | i feel romantic too;love 11 | i feel like i have to make the suffering i m seeing mean something;sadness 12 | i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter;joy 13 | i think it s the easiest time of year to feel dissatisfied;anger 14 | i feel low energy i m just thirsty;sadness 15 | i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious;joy 16 | i do not feel reassured anxiety is on each side;joy 17 | i didnt really feel that embarrassed;sadness 18 | i feel pretty pathetic most of the time;sadness 19 | i started feeling sentimental about dolls i had as a child and so began a collection of vintage barbie dolls from the sixties;sadness 20 | i now feel compromised and skeptical of the value of every unit of work i put in;fear 21 | i feel irritated and rejected without anyone doing anything or saying anything;anger 22 | i am feeling completely overwhelmed i have two strategies that help me to feel grounded pour my heart out in my journal in the form of a letter to god and then end with a list of five things i am most grateful for;fear 23 | i have the feeling she was amused and delighted;joy 24 | i was able to help chai lifeline with your support and encouragement is a great feeling and i am so glad you were able to help me;joy 25 | i already feel like i fucked up though because i dont usually eat at all in the morning;anger 26 | i still love my so and wish the best for him i can no longer tolerate the effect that bm has on our lives and the fact that is has turned my so into a bitter angry person who is not always particularly kind to the people around him when he is feeling stressed;sadness 27 | i feel so inhibited in someone elses kitchen like im painting on someone elses picture;sadness 28 | i become overwhelmed and feel defeated;sadness 29 | i feel kinda appalled that she feels like she needs to explain in wide and lenghth her body measures etc pp;anger 30 | i feel more superior dead chicken or grieving child;joy 31 | i get giddy over feeling elegant in a perfectly fitted pencil skirt;joy 32 | i remember feeling acutely distressed for a few days;fear 33 | -------------------------------------------------------------------------------- 
/docs/docs/.vuepress/public/assets/img/bg.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Created with Sketch. 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/biome-isotype.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Biome text. 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [project logo and badges: CI, GitHub, Documentation, GitHub release]
25 | Natural Language Processing library built with AllenNLP
26 |
27 | 28 | ## Quick Links 29 | - [Documentation](https://recognai.github.io/biome-text/) 30 | 31 | 32 | ## Features 33 | * State-of-the-art and not so state-of-the-art models trained with **your own data** with simple workflows. 34 | 35 | * **Efficient data reading** for (large) datasets in multiple formats and sources (CSV, Parquet, JSON, etc.). 36 | 37 | * **Modular configuration and extensibility** of models, datasets and training runs programmatically or via config files. 38 | 39 | * Use via **`cli`** or as plain Python (e.g., inside a Jupyter Notebook) 40 | 41 | * **Compatible with AllenNLP** 42 | 43 | ## Installation 44 | 45 | For the installation we recommend setting up a fresh [conda](https://docs.conda.io/en/latest/miniconda.html) environment: 46 | 47 | ```shell script 48 | conda create -n biome python~=3.7.0 pip>=20.3.0 49 | conda activate biome 50 | ``` 51 | 52 | Once the conda environment is activated, you can install the latest release via pip: 53 | 54 | ````shell script 55 | pip install -U biome-text 56 | ```` 57 | 58 | After installing *biome.text*, the best way to test your installation is by running the *biome.text* cli command: 59 | 60 | ```shell script 61 | biome --help 62 | ``` 63 | 64 | ## Get started 65 | 66 | The best way to see how *biome.text* works is to go through our [first tutorial](https://recognai.github.io/biome-text/master/documentation/tutorials/1-Training_a_text_classifier.html). 67 | 68 | Please refer to our [documentation](https://recognai.github.io/biome-text) for more tutorials, detailed user guides and how you can [contribute](https://recognai.github.io/biome-text/master/documentation/community/1-contributing.html) to *biome.text*. 69 | 70 | ## Licensing 71 | 72 | The code in this project is licensed under Apache 2 license. 
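
## Quick example

A minimal end-to-end sketch, assembled from the library's own test suite; the inline dataset, labels and output directory below are illustrative only:

```python
from biome.text import Dataset, Pipeline, Trainer, TrainerConfiguration

# A tiny in-memory dataset; in practice you would use Dataset.from_csv or Dataset.from_json
dataset = Dataset.from_dict(
    {
        "text": ["this movie was great", "this movie was terrible"],
        "label": ["pos", "neg"],
    }
)

# A text classification pipeline with default features
pipeline = Pipeline.from_config(
    {
        "name": "quickstart",
        "head": {"type": "TextClassification", "labels": ["pos", "neg"]},
    }
)

# Train for one epoch on CPU and write the results (including model.tar.gz) to ./quickstart_output
trainer = Trainer(
    pipeline=pipeline,
    train_dataset=dataset,
    trainer_config=TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0),
)
trainer.fit("quickstart_output")

# Predict with the trained pipeline
print(pipeline.predict("a movie I really enjoyed"))
```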
73 | -------------------------------------------------------------------------------- /tests/text/modules/heads/classification/test_relation_classifier.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import pytest 4 | 5 | from biome.text import Dataset 6 | from biome.text import Pipeline 7 | from biome.text import Trainer 8 | from biome.text import TrainerConfiguration 9 | 10 | 11 | @pytest.fixture 12 | def training_dataset() -> Dataset: 13 | """Creating the dataframe.""" 14 | data = { 15 | "text": [ 16 | "The most common audits were about waste and recycling.", 17 | "The company fabricates plastic chairs.", 18 | ], 19 | "entities": [ 20 | [ 21 | {"start": 34, "end": 39, "label": "PN", "text": "waste"}, 22 | {"start": 16, "end": 22, "label": "QTY", "text": "audits"}, 23 | ], 24 | [ 25 | {"start": 4, "end": 11, "label": "OBJECT", "text": "company"}, 26 | {"start": 31, "end": 37, "label": "SUBJECT", "text": "chairs"}, 27 | ], 28 | ], 29 | "label": ["Message-Topic(e1,e2)", "Product-Producer(e2,e1)"], 30 | } 31 | 32 | return Dataset.from_dict(data) 33 | 34 | 35 | @pytest.fixture 36 | def pipeline_dict() -> Dict: 37 | """Creating the pipeline dictionary""" 38 | 39 | pipeline_dict = { 40 | "name": "biome-rele", 41 | "features": { 42 | "word": {"embedding_dim": 2}, 43 | "char": { 44 | "embedding_dim": 2, 45 | "dropout": 0.1, 46 | "encoder": { 47 | "type": "gru", 48 | "hidden_size": 2, 49 | }, 50 | }, 51 | }, 52 | "head": { 53 | "type": "RelationClassification", 54 | "labels": ["Message-Topic(e1,e2)", "Product-Producer(e2,e1)"], 55 | "entities_embedder": {"num_embeddings": 12, "embedding_dim": 50}, 56 | "feedforward": { 57 | "num_layers": 1, 58 | "hidden_dims": [4], 59 | "activations": ["relu"], 60 | "dropout": [0.1], 61 | }, 62 | }, 63 | } 64 | 65 | return pipeline_dict 66 | 67 | 68 | @pytest.fixture 69 | def trainer_config() -> TrainerConfiguration: 70 | return TrainerConfiguration( 71 | max_epochs=1, 72 | optimizer={"type": "adamw", "lr": 0.002}, 73 | gpus=0, 74 | ) 75 | 76 | 77 | def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path): 78 | """Testing a classifier made from scratch""" 79 | 80 | pipeline = Pipeline.from_config(pipeline_dict) 81 | pipeline.predict( 82 | text="The most common audits were about waste and recycling", 83 | entities=[ 84 | {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"}, 85 | {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"}, 86 | ], 87 | ) 88 | 89 | trainer = Trainer( 90 | pipeline=pipeline, 91 | train_dataset=training_dataset, 92 | valid_dataset=training_dataset, 93 | trainer_config=trainer_config, 94 | ) 95 | trainer.fit(tmp_path / "relation_classifier") 96 | 97 | # test loading 98 | Pipeline.from_pretrained(tmp_path / "relation_classifier" / "model.tar.gz") 99 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/Versions.vue: -------------------------------------------------------------------------------- 1 | 13 | 14 | 56 | 106 | -------------------------------------------------------------------------------- /src/biome/text/text_cleaning.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import re 3 | from typing import Callable 4 | from typing import Dict 5 | from typing import List 6 | 7 | from allennlp.common import Registrable 8 | from bs4 import BeautifulSoup 9 | 10 | 11 | class TextCleaning(Registrable): 12 | """Defines 
rules that can be applied to the text before it gets tokenized. 13 | 14 | Each rule is a simple python function that receives and returns a `str`. 15 | 16 | Parameters 17 | ---------- 18 | rules: `List[str]` 19 | A list of registered rule method names to be applied to text inputs 20 | """ 21 | 22 | default_implementation = "default" 23 | 24 | def __init__(self, rules: List[str] = None): 25 | self.rules = rules or [] 26 | for rule in self.rules: 27 | if rule not in TextCleaningRule.registered_rules(): 28 | raise AttributeError( 29 | f"No rule '{rule}' registered" 30 | f"Available rules are [{[k for k in TextCleaningRule.registered_rules().keys()]}]" 31 | ) 32 | 33 | def __call__(self, text: str) -> str: 34 | for rule in self.rules: 35 | text = TextCleaningRule.registered_rules()[rule](text) 36 | return text 37 | 38 | 39 | TextCleaning.register(TextCleaning.default_implementation)(TextCleaning) 40 | 41 | 42 | class TextCleaningRule: 43 | """Registers a function as a rule for the text cleaning implementation 44 | 45 | Use the decorator `@TextCleaningRule` for creating custom text cleaning and pre-processing rules. 46 | 47 | An example function to strip spaces would be: 48 | 49 | ```python 50 | @TextCleaningRule 51 | def strip_spaces(text: str) -> str: 52 | return text.strip() 53 | ``` 54 | 55 | You can query available rules via `TextCleaningRule.registered_rules()`. 56 | 57 | Parameters 58 | ---------- 59 | func: `Callable[[str]` 60 | The function to register 61 | """ 62 | 63 | __REGISTERED_RULES = {} 64 | 65 | def __init__(self, func: Callable[[str], str]): 66 | self.__callable__ = func 67 | self.__REGISTERED_RULES[func.__name__] = func 68 | 69 | @classmethod 70 | def registered_rules(cls) -> Dict[str, Callable[[str], str]]: 71 | """Registered rules dictionary""" 72 | return copy.deepcopy(cls.__REGISTERED_RULES) 73 | 74 | def __call__(self, *args, **kwargs) -> str: 75 | """Enables call single rule""" 76 | return self.__callable__(*args, **kwargs) 77 | 78 | 79 | @TextCleaningRule 80 | def strip_spaces(text: str) -> str: 81 | """Strips leading and trailing spaces/new lines""" 82 | return text.strip() 83 | 84 | 85 | @TextCleaningRule 86 | def rm_useless_spaces(text: str) -> str: 87 | """Removes multiple spaces in `str`""" 88 | return re.sub(" {2,}", " ", text) 89 | 90 | 91 | @TextCleaningRule 92 | def fix_html(text: str) -> str: 93 | """Replaces some special HTML characters: ` `, `
<br>`, etc.""" 94 | text = ( 95 | # non breakable space -> space 96 | text.replace("&nbsp;", " ") 97 | .replace("&#160;", " ") 98 | .replace("&#xa0;", " ") 99 | #
html single line breaks -> unicode line breaks 100 | .replace("<br>
", "\n") 101 | ) 102 | 103 | return text 104 | 105 | 106 | @TextCleaningRule 107 | def html_to_text(text: str) -> str: 108 | """Extracts text from an HTML document""" 109 | return BeautifulSoup(text, "lxml").get_text() 110 | -------------------------------------------------------------------------------- /tests/resources/data/dataset_source.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewerID": "A2HD75EMZR8QLN", "asin": "0700099867", "reviewerName": "123", "helpful": [8, 12], "reviewText": "Installing the game was a struggle (because of games for windows live bugs).Some championship races and cars can only be \"unlocked\" by buying them as an addon to the game. I paid nearly 30 dollars when the game was new. I don't like the idea that I have to keep paying to keep playing.I noticed no improvement in the physics or graphics compared to Dirt 2.I tossed it in the garbage and vowed never to buy another codemasters game. I'm really tired of arcade style rally/racing games anyway.I'll continue to get my fix from Richard Burns Rally, and you should to. :)http://www.amazon.com/Richard-Burns-Rally-PC/dp/B000C97156/ref=sr_1_1?ie=UTF8&qid;=1341886844&sr;=8-1&keywords;=richard+burns+rallyThank you for reading my review! If you enjoyed it, be sure to rate it as helpful.", "overall": 1.0, "summary": "Pay to unlock content? I don't think so.", "unixReviewTime": 1341792000, "reviewTime": "07 9, 2012"} 2 | {"reviewerID": "A3UR8NLLY1ZHCX", "asin": "0700099867", "reviewerName": "Alejandro Henao \"Electronic Junky\"", "helpful": [0, 0], "reviewText": "If you like rally cars get this game you will have fun.It is more oriented to "European market" since here in America there isn't a huge rally fan party. Music it is very European and even the voices from the game very "English" accent.The multiplayer isn't the best but it works just ok.", "overall": 4.0, "summary": "Good rally game", "unixReviewTime": 1372550400, "reviewTime": "06 30, 2013"} 3 | {"reviewerID": "A1INA0F5CWW3J4", "asin": "0700099867", "reviewerName": "Amazon Shopper \"Mr.Repsol\"", "helpful": [0, 0], "reviewText": "1st shipment received a book instead of the game.2nd shipment got a FAKE one. Game arrived with a wrong key inside on sealed box. I got in contact with codemasters and send them pictures of the DVD and the content. They said nothing they can do its a fake DVD.Returned it good bye.!", "overall": 1.0, "summary": "Wrong key", "unixReviewTime": 1403913600, "reviewTime": "06 28, 2014"} 4 | {"reviewerID": "A2HD75EMZR8QLN", "asin": "0700099867", "reviewerName": "123", "helpful": [8, 12], "reviewText": "Installing the game was a struggle (because of games for windows live bugs).Some championship races and cars can only be \"unlocked\" by buying them as an addon to the game. I paid nearly 30 dollars when the game was new. I don't like the idea that I have to keep paying to keep playing.I noticed no improvement in the physics or graphics compared to Dirt 2.I tossed it in the garbage and vowed never to buy another codemasters game. I'm really tired of arcade style rally/racing games anyway.I'll continue to get my fix from Richard Burns Rally, and you should to. :)http://www.amazon.com/Richard-Burns-Rally-PC/dp/B000C97156/ref=sr_1_1?ie=UTF8&qid;=1341886844&sr;=8-1&keywords;=richard+burns+rallyThank you for reading my review! If you enjoyed it, be sure to rate it as helpful.", "overall": 1.0, "summary": "Pay to unlock content? 
I don't think so.", "unixReviewTime": 1341792000, "reviewTime": "07 9, 2012"} 5 | {"reviewerID": "A3UR8NLLY1ZHCX", "asin": "0700099867", "reviewerName": "Alejandro Henao \"Electronic Junky\"", "helpful": [0, 0], "reviewText": "If you like rally cars get this game you will have fun.It is more oriented to "European market" since here in America there isn't a huge rally fan party. Music it is very European and even the voices from the game very "English" accent.The multiplayer isn't the best but it works just ok.", "overall": 4.0, "summary": "Good rally game", "unixReviewTime": 1372550400, "reviewTime": "06 30, 2013"} 6 | -------------------------------------------------------------------------------- /tests/text/modules/heads/classification/test_document_classification.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from numpy.testing import assert_allclose 3 | 4 | from biome.text import Pipeline 5 | from biome.text.modules.heads.task_prediction import Attribution 6 | from biome.text.modules.heads.task_prediction import DocumentClassificationPrediction 7 | 8 | 9 | @pytest.fixture 10 | def pipeline() -> Pipeline: 11 | labels = ["a", "b", "c", "d", "e", "f"] 12 | return Pipeline.from_config( 13 | { 14 | "name": "test_document_classification", 15 | "tokenizer": {"segment_sentences": False}, 16 | "head": { 17 | "type": "DocumentClassification", 18 | "labels": labels, 19 | "dropout": 0.1, 20 | }, 21 | } 22 | ) 23 | 24 | 25 | @pytest.mark.parametrize( 26 | "segment_sentences, input, output", 27 | [ 28 | (False, "one sentence. two sentence", (1, 5)), 29 | (True, "one sentence. two sentence", (2, 3)), 30 | (False, ["one sentence. two sentence", "test"], (2, 5)), 31 | (True, ["one sentence. two sentence", "test"], (3, 3)), 32 | (False, {"one": "one sentence. two sentence", "two": "test"}, (2, 5)), 33 | (True, {"one": "one sentence. 
two sentence", "two": "test"}, (3, 3)), 34 | ], 35 | ) 36 | def test_tokenization_of_different_input(segment_sentences, input, output): 37 | pipeline = Pipeline.from_config( 38 | { 39 | "name": "test_document_classification", 40 | "tokenizer": {"segment_sentences": segment_sentences}, 41 | "head": {"type": "DocumentClassification", "labels": "a"}, 42 | } 43 | ) 44 | instance = pipeline.head.featurize(input) 45 | tokens = pipeline.head._extract_tokens(instance) 46 | 47 | assert len(tokens) == output[0] 48 | assert len(tokens[0]) == output[1] 49 | 50 | 51 | def test_make_task_prediction(pipeline): 52 | instance = pipeline.head.featurize("test this sentence") 53 | forward_output = pipeline.model.forward_on_instances([instance]) 54 | 55 | prediction = pipeline.head._make_task_prediction(forward_output[0], None) 56 | 57 | assert isinstance(prediction, DocumentClassificationPrediction) 58 | assert isinstance(prediction.labels, list) and isinstance( 59 | prediction.probabilities, list 60 | ) 61 | assert len(prediction.labels) == len(prediction.probabilities) == 6 62 | # check descending order 63 | assert_allclose( 64 | sorted(prediction.probabilities, reverse=True), prediction.probabilities 65 | ) 66 | assert all([isinstance(label, str) for label in prediction.labels]) 67 | assert set(pipeline.head.labels) == set(prediction.labels) 68 | assert all([isinstance(prob, float) for prob in prediction.probabilities]) 69 | 70 | 71 | def test_compute_attributions(pipeline): 72 | instance = pipeline.head.featurize("test this sentence") 73 | pipeline.model.eval() 74 | forward_output = pipeline.model.forward_on_instances([instance]) 75 | 76 | attributions = pipeline.head._compute_attributions( 77 | forward_output[0], instance, n_steps=1 78 | ) 79 | 80 | assert isinstance(attributions, list) and isinstance(attributions[0], list) 81 | assert len(attributions) == 1 and len(attributions[0]) == 3 82 | assert all( 83 | [isinstance(attribution, Attribution) for attribution in attributions[0]] 84 | ) 85 | assert all([attr.field == "text" for attr in attributions[0]]) 86 | assert all([isinstance(attr.attribution, float) for attr in attributions[0]]) 87 | assert attributions[0][1].start == 5 and attributions[0][1].end == 9 88 | -------------------------------------------------------------------------------- /src/biome/text/modules/configuration/defs.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import inspect 3 | from typing import Any 4 | from typing import Dict 5 | from typing import Generic 6 | from typing import Optional 7 | from typing import Type 8 | from typing import TypeVar 9 | from typing import Union 10 | 11 | from allennlp.common import FromParams 12 | from allennlp.common import Params 13 | from allennlp.modules.bimpm_matching import BiMpmMatching 14 | from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper 15 | from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper 16 | 17 | from biome.text import helpers 18 | 19 | 20 | def _find_input_attribute(component: Type[Any]) -> str: 21 | """Find the properly input dimension attribute name for a given component""" 22 | input_dim_attribute = None 23 | if issubclass(component, (PytorchSeq2SeqWrapper, PytorchSeq2VecWrapper)): 24 | input_dim_attribute = "input_size" 25 | elif component is BiMpmMatching: 26 | input_dim_attribute = "hidden_dim" 27 | else: 28 | init_method_keys = inspect.signature(component.__init__).parameters.keys() 29 | for param_name in ["embedding_dim", 
"input_dim"]: 30 | if param_name in init_method_keys: 31 | input_dim_attribute = param_name 32 | break 33 | return input_dim_attribute 34 | 35 | 36 | T = TypeVar("T") 37 | 38 | 39 | class ComponentConfiguration(Generic[T], FromParams): 40 | """ 41 | The layer spec component allows create Pytorch modules lazily, 42 | and instantiate them inside a context (Model or other component) dimension layer chain. 43 | 44 | The layer spec wraps a component params and will generate an instance of type T once the input_dim is set. 45 | 46 | """ 47 | 48 | @classmethod 49 | def from_params(cls: Type[T], params: Params, **extras) -> T: 50 | return cls(**params.as_dict()) 51 | 52 | def __resolve_layer_class( 53 | self, type_name: Optional[Union[Type, str]] = None 54 | ) -> Type[T]: 55 | if isinstance(type_name, Type): 56 | return type_name 57 | 58 | layer_class = getattr(self.__class__, "__orig_bases__")[0].__args__[0] 59 | return layer_class.by_name(type_name) if type_name else layer_class 60 | 61 | def __init__(self, **config): 62 | self._layer_class = self.__resolve_layer_class(config.get("type")) 63 | config["type"] = helpers.get_full_class_name(self._layer_class) 64 | self._config = config or {} 65 | 66 | def input_dim(self, input_dim: int) -> "ComponentConfiguration": 67 | """Sets the input dimension attribute for this layer configuration""" 68 | self.__update_config_with_input_dim(input_dim) 69 | return self 70 | 71 | def __update_config_with_input_dim(self, input_dim: int): 72 | input_dim_attribute = _find_input_attribute(self._layer_class) 73 | 74 | if input_dim_attribute: 75 | self._config[input_dim_attribute] = input_dim 76 | 77 | @property 78 | def config(self) -> Dict[str, Any]: 79 | """Component read-only configuration""" 80 | return copy.deepcopy(self._config) 81 | 82 | def compile(self, **extras) -> T: 83 | """ 84 | Using the wrapped configuration and the input dimension, generates a 85 | instance of type T representing the layer configuration 86 | """ 87 | if not self.config: 88 | raise ValueError(f"No configuration found for {self}") 89 | 90 | config = self.config 91 | if "type" in config: 92 | config.pop("type") 93 | 94 | return self._layer_class.from_params(Params(config), **extras) 95 | -------------------------------------------------------------------------------- /tests/text/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from allennlp.data import Token as AllennlpToken 2 | from spacy.tokens.token import Token as SpacyToken 3 | 4 | from biome.text.configuration import TokenizerConfiguration 5 | from biome.text.tokenizer import Tokenizer 6 | 7 | html_text = """ 8 | 9 | 10 | 11 | 12 |

<h1>My First Heading</h1> 13 | <p>My first paragraph.</p> 14 | <p>My second paragraph.</p>

15 | 16 | 17 | """ 18 | 19 | 20 | def test_text_cleaning_with_sentence_segmentation(): 21 | tokenizer = Tokenizer( 22 | TokenizerConfiguration( 23 | text_cleaning={"rules": ["html_to_text", "strip_spaces"]}, 24 | segment_sentences=True, 25 | ) 26 | ) 27 | 28 | tokenized = tokenizer.tokenize_text(html_text) 29 | assert len(tokenized) == 2 30 | assert ( 31 | len(tokenized[0]) == 7 32 | ), "Expected [My, First, Heading, My, first, paragraph, .]" 33 | assert len(tokenized[1]) == 4, "Expected [My, second, paragraph, .]" 34 | 35 | 36 | def test_text_cleaning_with_sentence_segmentation_and_max_sequence(): 37 | tokenizer = Tokenizer( 38 | TokenizerConfiguration( 39 | truncate_sentence=8, 40 | text_cleaning={"rules": ["html_to_text", "strip_spaces"]}, 41 | segment_sentences=True, 42 | ) 43 | ) 44 | 45 | tokenized = tokenizer.tokenize_text(html_text) 46 | assert len(tokenized) == 2 47 | assert len(tokenized[0]) == 2, "Expected [My, First]" 48 | assert len(tokenized[1]) == 2, "Expected [My, second]" 49 | 50 | 51 | def test_document_cleaning(): 52 | tokenizer = Tokenizer( 53 | TokenizerConfiguration( 54 | text_cleaning={"rules": ["html_to_text", "strip_spaces"]}, 55 | segment_sentences=True, 56 | ) 57 | ) 58 | 59 | tokenized = tokenizer.tokenize_document([html_text]) 60 | assert len(tokenized) == 2 61 | assert ( 62 | len(tokenized[0]) == 7 63 | ), "Expected [My, First, Heading, My, first, paragraph, .]" 64 | assert len(tokenized[1]) == 4, "Expected [My, second, paragraph, .]" 65 | 66 | 67 | def test_using_spacy_tokens(): 68 | tokenizer = Tokenizer(TokenizerConfiguration(use_spacy_tokens=True)) 69 | tokenized = tokenizer.tokenize_text("This is a text") 70 | assert len(tokenized) == 1 71 | assert len(tokenized[0]) == 4 72 | assert all(map(lambda t: isinstance(t, SpacyToken), tokenized[0])) 73 | 74 | 75 | def test_using_allennlp_tokens(): 76 | tokenizer = Tokenizer(TokenizerConfiguration(use_spacy_tokens=False)) 77 | tokenized = tokenizer.tokenize_text("This is a text") 78 | assert len(tokenized) == 1 79 | assert len(tokenized[0]) == 4 80 | assert all(map(lambda t: isinstance(t, AllennlpToken), tokenized[0])) 81 | 82 | 83 | def test_set_sentence_segmentation_with_max_number_of_sentences(): 84 | tokenizer = Tokenizer(TokenizerConfiguration(max_nr_of_sentences=2)) 85 | tokenized = tokenizer.tokenize_document( 86 | [ 87 | "This is a sentence. This is another sentence.", 88 | "One more sentence here.", 89 | "Last sentence here.", 90 | ] 91 | ) 92 | assert len(tokenized) == 2 93 | 94 | 95 | def test_min_max_sentence_length(): 96 | tokenizer = Tokenizer( 97 | TokenizerConfiguration( 98 | segment_sentences=True, min_sentence_length=10, max_sentence_length=15 99 | ) 100 | ) 101 | tokenized = tokenizer.tokenize_text("short. A very long sentence. 
This is fine") 102 | 103 | assert len(tokenized) == 1 104 | assert len(tokenized[0]) == 3 105 | -------------------------------------------------------------------------------- /tests/text/test_hpo.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | from ray import tune 5 | 6 | from biome.text import Pipeline 7 | from biome.text import TrainerConfiguration 8 | from biome.text import VocabularyConfiguration 9 | from biome.text.dataset import Dataset 10 | from biome.text.hpo import TuneExperiment 11 | 12 | 13 | @pytest.fixture 14 | def dataset(): 15 | return Dataset.from_dict({"text": ["a", "b"], "label": ["a", "b"]}) 16 | 17 | 18 | @pytest.fixture 19 | def pipeline_config(): 20 | return { 21 | "name": "test_ray_tune_trainable", 22 | "features": { 23 | "word": {"embedding_dim": 2}, 24 | }, 25 | "head": {"type": "TextClassification", "labels": ["a", "b"]}, 26 | } 27 | 28 | 29 | @pytest.fixture 30 | def trainer_config() -> TrainerConfiguration: 31 | return TrainerConfiguration( 32 | max_epochs=1, 33 | batch_size=2, 34 | add_wandb_logger=False, 35 | ) 36 | 37 | 38 | def test_tune_exp_default_trainable(tmp_path, dataset, pipeline_config, trainer_config): 39 | pipeline_config["features"]["word"]["embedding_dim"] = tune.choice([2, 4]) 40 | trainer_config.optimizer["lr"] = tune.loguniform(0.001, 0.01) 41 | 42 | my_exp = TuneExperiment( 43 | pipeline_config=pipeline_config, 44 | trainer_config=trainer_config, 45 | train_dataset=dataset, 46 | valid_dataset=dataset, 47 | num_samples=1, 48 | local_dir=str(tmp_path), 49 | ) 50 | 51 | assert my_exp._name.startswith("HPO on") 52 | assert my_exp.name == my_exp._name 53 | assert my_exp._run_identifier == "_default_trainable" 54 | 55 | analysis = tune.run(my_exp) 56 | assert len(analysis.trials) == 1 57 | 58 | 59 | def test_tune_exp_save_dataset_and_vocab( 60 | dataset, pipeline_config, trainer_config, monkeypatch 61 | ): 62 | pl = Pipeline.from_config(pipeline_config) 63 | 64 | my_exp = TuneExperiment( 65 | pipeline_config=pipeline_config, 66 | trainer_config=trainer_config, 67 | train_dataset=dataset, 68 | valid_dataset=dataset, 69 | ) 70 | 71 | config = my_exp.config 72 | 73 | assert dataset[:] == Dataset.load_from_disk(config["train_dataset_path"])[:] 74 | assert dataset[:] == Dataset.load_from_disk(config["valid_dataset_path"])[:] 75 | 76 | 77 | def test_tune_exp_custom_trainable( 78 | dataset, 79 | pipeline_config, 80 | trainer_config, 81 | ): 82 | def my_trainable(config): 83 | pass 84 | 85 | my_exp = TuneExperiment( 86 | pipeline_config=pipeline_config, 87 | trainer_config=trainer_config, 88 | train_dataset=dataset, 89 | valid_dataset=dataset, 90 | name="custom trainable", 91 | trainable=my_trainable, 92 | ) 93 | 94 | assert my_exp.name == "custom trainable" 95 | assert my_exp.trainable == my_trainable 96 | assert my_exp._run_identifier == "my_trainable" 97 | 98 | 99 | def test_vocab_config(tmp_path, pipeline_config, trainer_config, dataset): 100 | vocab_config = VocabularyConfiguration(max_vocab_size=1) 101 | 102 | my_exp = TuneExperiment( 103 | pipeline_config=pipeline_config, 104 | trainer_config=trainer_config, 105 | train_dataset=dataset, 106 | valid_dataset=dataset, 107 | vocab_config=vocab_config, 108 | name="test_vocab_config", 109 | local_dir=str(tmp_path), 110 | ) 111 | 112 | analysis = tune.run(my_exp) 113 | pl = Pipeline.from_pretrained( 114 | Path(analysis.get_best_logdir("validation_loss", "min")) 115 | / "output" 116 | / "model.tar.gz" 117 | ) 118 | 
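# max_vocab_size=1 should keep a single word token, plus the default padding and OOV entries (1 + 2 = 3)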
119 | assert pl.vocab.get_vocab_size("word") == 3 120 | -------------------------------------------------------------------------------- /tests/text/modules/heads/classification/test_text_classification.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from allennlp.data import Batch 3 | from numpy.testing import assert_allclose 4 | 5 | from biome.text import Pipeline 6 | from biome.text.modules.heads.task_prediction import Attribution 7 | from biome.text.modules.heads.task_prediction import TextClassificationPrediction 8 | 9 | 10 | @pytest.fixture 11 | def pipeline() -> Pipeline: 12 | labels = ["a", "b", "c", "d", "e", "f"] 13 | return Pipeline.from_config( 14 | { 15 | "name": "test_text_classification", 16 | "head": {"type": "TextClassification", "labels": labels, "dropout": 0.1}, 17 | } 18 | ) 19 | 20 | 21 | def test_make_task_prediction(pipeline): 22 | instance = pipeline.head.featurize("test this sentence") 23 | forward_output = pipeline._model.forward_on_instances([instance]) 24 | 25 | prediction = pipeline.head._make_task_prediction(forward_output[0], None) 26 | 27 | assert isinstance(prediction, TextClassificationPrediction) 28 | assert isinstance(prediction.labels, list) and isinstance( 29 | prediction.probabilities, list 30 | ) 31 | assert len(prediction.labels) == len(prediction.probabilities) == 6 32 | # check descending order 33 | assert_allclose( 34 | sorted(prediction.probabilities, reverse=True), prediction.probabilities 35 | ) 36 | assert all([isinstance(label, str) for label in prediction.labels]) 37 | assert set(pipeline.head.labels) == set(prediction.labels) 38 | assert all([isinstance(prob, float) for prob in prediction.probabilities]) 39 | 40 | 41 | def test_compute_attributions(pipeline): 42 | instance = pipeline.head.featurize("test this sentence") 43 | pipeline.model.eval() 44 | forward_output = pipeline.model.forward_on_instances([instance]) 45 | 46 | attributions = pipeline.head._compute_attributions( 47 | forward_output[0], instance, n_steps=1 48 | ) 49 | 50 | assert all([isinstance(attribution, Attribution) for attribution in attributions]) 51 | assert len(attributions) == 3 52 | assert all([attr.field == "text" for attr in attributions]) 53 | assert all([isinstance(attr.attribution, float) for attr in attributions]) 54 | assert attributions[1].start == 5 and attributions[1].end == 9 55 | 56 | 57 | def test_metrics(pipeline): 58 | instance = pipeline.head.featurize(text="test this", label="a") 59 | batch = Batch([instance]) 60 | batch.index_instances(pipeline.vocab) 61 | 62 | pipeline.head.forward(**batch.as_tensor_dict()) 63 | # validation metric should have never been called 64 | assert pipeline.head._metrics.get_dict()["accuracy"].total_count == 1 65 | assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 0 66 | 67 | train_metrics = pipeline.head.get_metrics(reset=True) 68 | expected_metric_names = ( 69 | ["accuracy"] 70 | + [ 71 | f"{label}/{metric}" 72 | for label in ["micro", "macro"] 73 | for metric in ["precision", "recall", "fscore"] 74 | ] 75 | + [ 76 | f"_{metric}/{label}" 77 | for metric in ["precision", "recall", "fscore"] 78 | for label in ["a", "b", "c", "d", "e", "f"] 79 | ] 80 | ) 81 | assert all(name in train_metrics for name in expected_metric_names) 82 | 83 | pipeline.head.training = False 84 | pipeline.head.forward(**batch.as_tensor_dict()) 85 | # training metric should have never been called after its reset 86 | assert 
pipeline.head._metrics.get_dict()["accuracy"].total_count == 0 87 | assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 1 88 | 89 | valid_metrics = pipeline.head.get_metrics() 90 | assert all(name in valid_metrics for name in expected_metric_names) 91 | -------------------------------------------------------------------------------- /docs/docs/documentation/community/3-developer_guides.md: -------------------------------------------------------------------------------- 1 | # Developer guides 2 | 3 | ## Setting up for development 4 | To set up your system for *biome.text* development, you first of all have to [fork](https://guides.github.com/activities/forking/) 5 | our repository and clone your fork to your computer: 6 | 7 | ````shell script 8 | git clone https://github.com/[your-github-username]/biome-text.git 9 | cd biome-text 10 | ```` 11 | 12 | To keep your fork's master branch up to date with our repo you should add it as an [upstream remote branch](https://dev.to/louhayes3/git-add-an-upstream-to-a-forked-repo-1mik): 13 | 14 | ````shell script 15 | git remote add upstream https://github.com/recognai/biome-text.git 16 | ```` 17 | 18 | Now go ahead and create a new conda environment in which the development will take place and activate it: 19 | 20 | ````shell script 21 | conda env create -f environment_dev.yml 22 | conda activate biometext 23 | ```` 24 | 25 | Once you activated the conda environment, it is time to install *biome.text* in editable mode with all its development dependencies. 26 | The best way to do this is to take advantage of the make directive: 27 | 28 | ````shell script 29 | make dev 30 | ```` 31 | 32 | After installing *biome.text*, the best way to test your installation is by running the *biome.text* cli command: 33 | 34 | ```shell script 35 | biome --help 36 | ``` 37 | 38 | ### Running tests locally 39 | 40 | *Biome.text* uses [pytest](https://docs.pytest.org/en/latest/) for its unit and integration tests. 41 | If you are working on the code base we advise you to run our tests locally before submitting a Pull Request (see below) to make sure your changes did not break and existing functionality. 42 | To achieve this you can simply run: 43 | 44 | ````shell script 45 | make test 46 | ```` 47 | 48 | If you open a Pull Request, the test suite will be run automatically via a GitHub Action. 49 | 50 | ### Serving docs locally 51 | 52 | If you are working on the documentation and want to check out the results locally on your machine, you can simply run: 53 | 54 | ````shell script 55 | make docs 56 | ```` 57 | 58 | The docs will be built and deployed automatically via a GitHub Action when our master branch is updated. 59 | If for some reason you want to build them locally, you can do so with: 60 | 61 | ````shell script 62 | make build_docs 63 | ```` 64 | 65 | ## Make a release 66 | 67 | To make a release you have to follow 4 steps: 68 | 69 | 1. Run the `prepare_versioned_build.sh` script inside the `docs` folder and commit the changes to the master branch. 70 | The commit message should say something like: "v2.2.0 release". 71 | 72 | 2. Create a new [GitHub release](https://docs.github.com/en/free-pro-team@latest/github/administering-a-repository/managing-releases-in-a-repository#creating-a-release). 73 | 74 | The version tags should be `v1.1.0` or for release candidates `v1.1.0rc1`. 75 | Major and minor releases should always be made against the master branch, bugfix releases against the corresponding minor release tag. 
76 | 77 | After publishing the release, the CI is triggered and if everything goes well the release gets published on PyPi. 78 | The CI does: 79 | - run tests & build docs 80 | - build package 81 | - upload to testpypi 82 | - install from testpypi 83 | - upload to pypi 84 | 85 | 3. Revert the last commit in which you changed the docs, the commit message should read something like: 86 | "back to master release". 87 | 88 | 4. **Docs**: In order for the Algolia Search to work, you need to add the new version number of the docs to our 89 | algolia [config file](https://github.com/algolia/docsearch-configs/blob/master/configs/recogn_biome-text.json) and submit a PR. 90 | 91 | 92 | Under the hood the versioning of our package is managed by [`setuptools_scm`](https://github.com/pypa/setuptools_scm), 93 | that basically works with the git tags in a repo. 94 | -------------------------------------------------------------------------------- /tests/docs/test_tutorials.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | from pytest_notebook.nb_regression import NBRegressionFixture 5 | from pytest_notebook.notebook import dump_notebook 6 | from pytest_notebook.notebook import load_notebook 7 | 8 | pytestmark = pytest.mark.skip( 9 | reason="The pytest-notebook package is not actively maintained and " 10 | "the tutorial tests are quite heavy on resources. " 11 | "The idea is to run those tests locally and manually from time to time." 12 | "THESE TESTS ARE ALSO OUT OF DATE ... :/" 13 | ) 14 | 15 | 16 | def test_text_classifier_tutorial(tmp_path, tutorials_path): 17 | notebook_path = tutorials_path / "Training_a_text_classifier.ipynb" 18 | 19 | # adapt notebook to CI (make its execution quicker + comment lines) 20 | notebook = load_notebook(str(notebook_path)) 21 | for cell in notebook["cells"]: 22 | if cell["source"].startswith("!pip install"): 23 | cell["source"] = re.sub(r"!pip install", r"#!pip install", cell["source"]) 24 | if cell["source"].startswith("trainer_config ="): 25 | cell["source"] = re.sub( 26 | r"num_epochs=[0-9][0-9]?", r"num_epochs=1", cell["source"] 27 | ) 28 | if cell["source"].startswith("pl.train("): 29 | cell["source"] = re.sub( 30 | r"training=train_ds", r"training=valid_ds", cell["source"] 31 | ) 32 | # dump adapted notebook 33 | mod_notebook_path = tmp_path / notebook_path.name 34 | with mod_notebook_path.open("w") as file: 35 | file.write(str(dump_notebook(notebook))) 36 | 37 | # test adapted notebook 38 | fixture = NBRegressionFixture(exec_timeout=100) 39 | fixture.check(str(mod_notebook_path)) 40 | 41 | 42 | def test_slot_filling_tutorial(tmp_path, tutorials_path): 43 | notebook_path = tutorials_path / "Training_a_sequence_tagger_for_Slot_Filling.ipynb" 44 | 45 | # adapt notebook to CI (make its execution quicker + comment lines) 46 | notebook = load_notebook(str(notebook_path)) 47 | for cell in notebook["cells"]: 48 | if cell["source"].startswith("!pip install"): 49 | cell["source"] = re.sub(r"!pip install", r"#!pip install", cell["source"]) 50 | if cell["source"].startswith( 51 | "from biome.text.configuration import FeaturesConfiguration" 52 | ): 53 | cell["source"] = re.sub( 54 | r"https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip", 55 | r"https://biome-tutorials-data.s3-eu-west-1.amazonaws.com/token_classifier/wiki-news-300d-1M.head.vec", 56 | cell["source"], 57 | ) 58 | if cell["source"].startswith("trainer_config ="): 59 | cell["source"] = re.sub( 60 | 
r"TrainerConfiguration\(\)", 61 | r"TrainerConfiguration(num_epochs=1)", 62 | cell["source"], 63 | ) 64 | if cell["source"].startswith("pl.train("): 65 | cell["source"] = re.sub( 66 | r"pl.train", 67 | r"from biome.text.configuration import TrainerConfiguration\npl.train", 68 | cell["source"], 69 | ) 70 | cell["source"] = re.sub( 71 | r"training=train_ds", 72 | r"training=valid_ds", 73 | cell["source"], 74 | ) 75 | cell["source"] = re.sub( 76 | r"test=test_ds,", 77 | r"test=test_ds, trainer=TrainerConfiguration(num_epochs=1)", 78 | cell["source"], 79 | ) 80 | 81 | # dump adapted notebook 82 | mod_notebook_path = tmp_path / notebook_path.name 83 | with mod_notebook_path.open("w") as file: 84 | file.write(str(dump_notebook(notebook))) 85 | 86 | # test adapted notebook 87 | fixture = NBRegressionFixture(exec_timeout=200) 88 | fixture.check(str(mod_notebook_path)) 89 | -------------------------------------------------------------------------------- /tests/text/test_pretrained_word_vectors.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import torch 5 | from torch.testing import assert_allclose 6 | 7 | from biome.text import Dataset 8 | from biome.text import Pipeline 9 | from biome.text import Trainer 10 | from biome.text import TrainerConfiguration 11 | 12 | 13 | @pytest.fixture 14 | def pretrained_word_vectors(tmp_path) -> Path: 15 | file_path = tmp_path / "pretrained_word_vectors.txt" 16 | file_path.write_text("2 2\ntest 0.66 0.33\nthis 0.25 0.75") 17 | 18 | return file_path 19 | 20 | 21 | @pytest.fixture 22 | def dataset() -> Dataset: 23 | data = {"text": ["test"], "label": ["good"]} 24 | return Dataset.from_dict(data) 25 | 26 | 27 | @pytest.fixture 28 | def dataset2() -> Dataset: 29 | data = {"text": ["this"], "label": ["good"]} 30 | return Dataset.from_dict(data) 31 | 32 | 33 | @pytest.fixture 34 | def pipeline_config(pretrained_word_vectors) -> dict: 35 | config = { 36 | "name": "pretrained_word_vectors_test", 37 | "features": { 38 | "word": { 39 | "embedding_dim": 2, 40 | "weights_file": str(pretrained_word_vectors.absolute()), 41 | } 42 | }, 43 | "head": {"type": "TextClassification", "labels": ["good"]}, 44 | } 45 | return config 46 | 47 | 48 | def test_create_pipeline_with_weights_file(pipeline_config, dataset, tmp_path): 49 | pipeline = Pipeline.from_config(pipeline_config) 50 | 51 | output = tmp_path / "pretrained_word_vector_output" 52 | trainer = Trainer( 53 | pipeline=pipeline, 54 | train_dataset=dataset, 55 | trainer_config=TrainerConfiguration(max_epochs=1, gpus=0), 56 | ) 57 | trainer.fit(output) 58 | 59 | instance = pipeline.head.featurize("test") 60 | instance.index_fields(pipeline.vocab) 61 | 62 | assert_allclose( 63 | pipeline.backbone.embedder(instance.as_tensor_dict()["text"], 0), 64 | torch.tensor([[0.66, 0.33]]), 65 | ) 66 | 67 | # Loading a pretrained model without the weights file should work 68 | Path(pipeline_config["features"]["word"]["weights_file"]).unlink() 69 | assert isinstance(Pipeline.from_pretrained(str(output / "model.tar.gz")), Pipeline) 70 | 71 | 72 | def test_extending_vocab_with_weights_file( 73 | pipeline_config, dataset, dataset2, capsys, caplog 74 | ): 75 | pipeline = Pipeline.from_config(pipeline_config) 76 | # create vocab 77 | pipeline.create_vocab([dataset.to_instances(pipeline)]) 78 | 79 | # extending the vocab with the weights file available should apply the pretrained weights 80 | pipeline.create_vocab([dataset2.to_instances(pipeline)]) 81 | 82 | 
instance = pipeline.head.featurize("this") 83 | instance.index_fields(pipeline.vocab) 84 | 85 | assert_allclose( 86 | pipeline.backbone.embedder(instance.as_tensor_dict()["text"]), 87 | torch.tensor([[0.25, 0.75]]), 88 | ) 89 | 90 | # extending the vocab with the weights file deleted should trigger a warning 91 | Path(pipeline_config["features"]["word"]["weights_file"]).unlink() 92 | ds = Dataset.from_dict({"text": ["that"], "label": ["good"]}) 93 | pipeline.create_vocab([ds.to_instances(pipeline)]) 94 | 95 | assert caplog.record_tuples[-1][0] == "allennlp.modules.token_embedders.embedding" 96 | assert caplog.record_tuples[-1][1] == 30 97 | assert ( 98 | "Embedding at model_path, " 99 | "_head.backbone.embedder.token_embedder_word cannot locate the pretrained_file." 100 | in caplog.record_tuples[-1][2] 101 | ) 102 | 103 | 104 | def test_raise_filenotfound_error(pipeline_config, dataset): 105 | Path(pipeline_config["features"]["word"]["weights_file"]).unlink() 106 | pipeline = Pipeline.from_config(pipeline_config) 107 | 108 | with pytest.raises(FileNotFoundError): 109 | pipeline.create_vocab([dataset.to_instances(pipeline)]) 110 | -------------------------------------------------------------------------------- /tests/text/modules/heads/test_task_head.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from allennlp.data import Instance 3 | from allennlp.data.fields import ListField 4 | from allennlp.data.fields import TextField 5 | from spacy.tokenizer import Tokenizer 6 | from spacy.vocab import Vocab 7 | 8 | from biome.text.configuration import PredictionConfiguration 9 | from biome.text.helpers import spacy_to_allennlp_token 10 | from biome.text.modules.heads import TaskHead 11 | from biome.text.modules.heads import TaskPrediction 12 | from biome.text.modules.heads.task_prediction import Token 13 | 14 | 15 | @pytest.fixture 16 | def task_head() -> TaskHead: 17 | return TaskHead(backbone="mock_backbone") 18 | 19 | 20 | def test_prediction_not_implemented(task_head): 21 | with pytest.raises(NotImplementedError): 22 | task_head.make_task_prediction("mock", "mock", "mock") 23 | 24 | 25 | def test_attributions_not_implemented(task_head, monkeypatch): 26 | def mock_make_task_prediction(*args, **kwargs): 27 | return TaskPrediction() 28 | 29 | monkeypatch.setattr(task_head, "_make_task_prediction", mock_make_task_prediction) 30 | 31 | with pytest.raises(NotImplementedError): 32 | task_head.make_task_prediction( 33 | "mock", "mock", PredictionConfiguration(add_attributions=True) 34 | ) 35 | 36 | 37 | def test_make_task_prediction(monkeypatch, task_head): 38 | def mock_make_task_prediction(*args, **kwargs): 39 | return TaskPrediction() 40 | 41 | def mock_compute_attributions(*args, **kwargs): 42 | return kwargs 43 | 44 | def mock_extract_tokens(*args, **kwargs): 45 | return "tokens" 46 | 47 | monkeypatch.setattr(task_head, "_make_task_prediction", mock_make_task_prediction) 48 | monkeypatch.setattr(task_head, "_compute_attributions", mock_compute_attributions) 49 | monkeypatch.setattr(task_head, "_extract_tokens", mock_extract_tokens) 50 | 51 | prediction = task_head.make_task_prediction( 52 | "mock_forward_output", 53 | "mock_instance", 54 | PredictionConfiguration( 55 | add_tokens=True, 56 | add_attributions=True, 57 | attributions_kwargs={"test": "kwarg"}, 58 | ), 59 | ) 60 | 61 | assert isinstance(prediction, TaskPrediction) 62 | assert hasattr(prediction, "tokens") and hasattr(prediction, "attributions") 63 | assert prediction.tokens == 
"tokens" 64 | assert prediction.attributions == {"test": "kwarg"} 65 | 66 | 67 | @pytest.mark.parametrize("allennlp_tokens", [False, True]) 68 | def test_extract_tokens(task_head, allennlp_tokens): 69 | tokenizer = Tokenizer(Vocab()) 70 | input_tokens = list(tokenizer("test this sentence.")) 71 | if allennlp_tokens: 72 | input_tokens = [spacy_to_allennlp_token(tok) for tok in input_tokens] 73 | 74 | tf = TextField(input_tokens, None) 75 | instance = Instance({"test": tf}) 76 | 77 | tokens = task_head._extract_tokens(instance) 78 | 79 | assert all([isinstance(tok, Token) for tok in tokens]) 80 | assert all(itok.text == otok.text for itok, otok in zip(input_tokens, tokens)) 81 | assert all(itok.idx == otok.start for itok, otok in zip(input_tokens, tokens)) 82 | if allennlp_tokens: 83 | assert all(itok.idx_end == otok.end for itok, otok in zip(input_tokens, tokens)) 84 | else: 85 | assert all( 86 | itok.idx + len(itok.text) == otok.end 87 | for itok, otok in zip(input_tokens, tokens) 88 | ) 89 | assert all([tok.field == "test" for tok in tokens]) 90 | 91 | 92 | def test_extract_tokens_listfield(task_head): 93 | tokenizer = Tokenizer(Vocab()) 94 | input_tokens = list(tokenizer("test this sentence.")) 95 | 96 | tf = TextField(input_tokens, None) 97 | instance = Instance({"test": ListField([tf, tf])}) 98 | 99 | tokens = task_head._extract_tokens(instance) 100 | 101 | assert len(tokens) == 2 and len(tokens[0]) == 3 and len(tokens[1]) == 3 102 | assert all( 103 | [all([isinstance(tok, Token) for tok in tf_tokens] for tf_tokens in tokens)] 104 | ) 105 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/Navbar.vue: -------------------------------------------------------------------------------- 1 | 38 | 39 | 94 | 95 | 141 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_vocab.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from biome.text import Dataset 4 | from biome.text import Pipeline 5 | from biome.text import Trainer 6 | from biome.text import VocabularyConfiguration 7 | from biome.text.errors import EmptyVocabError 8 | from biome.text.features import CharFeatures 9 | from biome.text.features import TransformersFeatures 10 | from biome.text.features import WordFeatures 11 | 12 | 13 | @pytest.fixture 14 | def pipeline(): 15 | config = { 16 | "name": "vocab_test", 17 | "features": { 18 | "transformers": {"model_name": "sshleifer/tiny-distilbert-base-cased"}, 19 | "word": {"embedding_dim": 2}, 20 | "char": { 21 | "embedding_dim": 2, 22 | "dropout": 0.1, 23 | "encoder": { 24 | "type": "gru", 25 | "hidden_size": 2, 26 | "num_layers": 1, 27 | "bidirectional": False, 28 | }, 29 | }, 30 | }, 31 | "head": { 32 | "type": "TextClassification", 33 | "labels": ["good", "bad"], 34 | }, 35 | } 36 | 37 | return Pipeline.from_config(config) 38 | 39 | 40 | @pytest.fixture 41 | def train_dataset(): 42 | data = {"text": ["this is a test", "and another one"], "label": ["good", "bad"]} 43 | return Dataset.from_dict(data) 44 | 45 | 46 | @pytest.fixture 47 | def valid_dataset(): 48 | data = { 49 | "text": ["and what about the validation", "do not forget this one"], 50 | "label": ["bad", "good"], 51 | } 52 | return Dataset.from_dict(data) 53 | 54 | 55 | def test_default_vocab(pipeline, train_dataset, valid_dataset): 56 | # Transformer vocab is added on pipeline creation 57 | assert 
pipeline.vocab.get_vocab_size(TransformersFeatures.namespace) == 28996 58 | # While word and char vocab should be empty (except for the oov and padding token) 59 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 2 60 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 2 61 | 62 | # Training should build a default vocab with only the training dataset 63 | Trainer(pipeline, train_dataset=train_dataset) 64 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 9 65 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 12 66 | assert pipeline.vocab.get_vocab_size(TransformersFeatures.namespace) == 28996 67 | 68 | # Pretrained pipelines should extend the vocab by default 69 | Trainer(pipeline, train_dataset=valid_dataset) 70 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 16 71 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 19 72 | assert pipeline.vocab.get_vocab_size(TransformersFeatures.namespace) == 28996 73 | 74 | 75 | def test_specific_vocab_config(pipeline, train_dataset, valid_dataset): 76 | vocab_config = VocabularyConfiguration(include_valid_data=True) 77 | 78 | Trainer( 79 | pipeline, 80 | train_dataset=train_dataset, 81 | valid_dataset=valid_dataset, 82 | vocab_config=vocab_config, 83 | ) 84 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 16 85 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 19 86 | assert pipeline.vocab.get_vocab_size(TransformersFeatures.namespace) == 28996 87 | 88 | 89 | def test_not_touching_vocab(pipeline, train_dataset, valid_dataset): 90 | # vocab_config=None leaves the pipeline's vocab empty from an unpretrained pipeline 91 | with pytest.raises(EmptyVocabError): 92 | Trainer(pipeline, train_dataset=train_dataset, vocab_config=None) 93 | 94 | # vocab_config=None should not extend the vocab for a pretrained pipeline 95 | Trainer(pipeline, train_dataset=train_dataset) 96 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 9 97 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 12 98 | Trainer(pipeline, train_dataset=valid_dataset, vocab_config=None) 99 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 9 100 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 12 101 | -------------------------------------------------------------------------------- /docs/docs/documentation/user-guides/1-nlp-tasks.md: -------------------------------------------------------------------------------- 1 | # NLP Tasks 2 | 3 | In *biome.text* NLP tasks are defined via ``TaskHead`` classes. 4 | 5 | This section gives a summary of the library's main heads and tasks. 6 | 7 | ## TextClassification 8 | 9 | **Tutorials**: [Training a short text classifier of German business names](../tutorials/Training_a_text_classifier.md) 10 | 11 | **NLP tasks**: text classification, sentiment analysis, entity typing, relation classification. 12 | 13 | **Input**: `text`: a single field or a concatenation of input fields. 14 | 15 | **Output**: `label` by default, a probability distribution over labels except if `multilabel` is enabled for multi-label classification problems. 16 | 17 | **Main parameters**: 18 | 19 | `pooler`: a `Seq2VecEncoderConfiguration` to pool a sequence of encoded word/char vectors into a single vector representing the input text 20 | 21 | 22 | See [TextClassification API](../../api/biome/text/modules/heads/classification/text_classification.md#textclassification) for more details. 
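For orientation, a minimal pipeline configuration for this head might look like the sketch below; the pipeline name, labels, embedding size and GRU pooler are illustrative values, not recommended settings:

```python
from biome.text import Pipeline

# Sketch of a TextClassification pipeline; all names and sizes are illustrative.
pipeline = Pipeline.from_config(
    {
        "name": "my_text_classifier",
        "features": {"word": {"embedding_dim": 16}},
        "head": {
            "type": "TextClassification",
            "labels": ["positive", "negative"],
            "pooler": {
                "type": "gru",
                "num_layers": 1,
                "hidden_size": 16,
                "bidirectional": True,
            },
        },
    }
)

prediction = pipeline.predict(text="A text to classify")
```

The `pooler` entry corresponds to the `Seq2VecEncoderConfiguration` mentioned above.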
23 | 24 | ## RecordClassification 25 | 26 | **NLP tasks**: text classification, sentiment analysis, entity typing, relation classification and semi-structured data classification problems with product, customer data, etc. 27 | 28 | **Input**: `document`: a list of fields. 29 | 30 | **Output**: `labels` by default, a probability distribution over labels except if `multilabel` is enabled for multi-label classification problems. 31 | 32 | **Main parameters**: 33 | 34 | `record_keys`: field keys to be used as input features to the model, e.g., name, first_name, body, subject, etc. 35 | 36 | `tokens_pooler`: a `Seq2VecEncoderConfiguration` to pool a sequence of encoded word/char vectors **for each field** into a single vector representing the field. 37 | 38 | `fields_encoder`: a `Seq2SeqEncoderConfiguration` to encode a sequence of field vectors. 39 | 40 | `fields_pooler`: a `Seq2VecEncoderConfiguration` to pool a sequence of encoded field vectors into a single vector representing the whole document/record. 41 | 42 | See [RecordClassification API](../../api/biome/text/modules/heads/classification/record_classification.md#recordclassification) for more details. 43 | 44 | ## RecordPairClassification 45 | 46 | **NLP tasks**: Classify the relation between a pair of structured data. For example, do two sets of customer data belong to the same customer or not. 47 | 48 | **Input**: `record1`, `record2`. Two dictionaries that should share the same keys, preferably in the same order. 49 | 50 | **Output**: `labels`. By default, a probability distribution over labels except if `multilabel` is enabled for multi-label classification problems. 51 | 52 | **Main parameters**: 53 | 54 | `field_encoder`: A `Seq2VecEncoder` to encode and pool the single dictionary items of both inputs. It takes both, the key and the value, into account. 55 | 56 | `record_encoder`: A `Seq2SeqEncoder` to contextualize the encoded dictionary items within its record. 57 | 58 | `matcher_forward`: A `BiMPMMatching` layer for the (optionally only forward) record encoder layer. 59 | 60 | `aggregator`: A `Seq2VecEncoder` to pool the output of the matching layers. 61 | 62 | See the [RecordPairClassification API](../../api/biome/text/modules/heads/classification/record_pair_classification.md) for more details. 63 | 64 | ## TokenClassification 65 | 66 | **Tutorials**: [Training a sequence tagger for Slot Filling](../tutorials/Training_a_sequence_tagger_for_Slot_Filling.md) 67 | 68 | **NLP tasks**: NER, Slot filling, Part of speech tagging. 69 | 70 | **Input**: `text`: **pretokenized text** as a list of tokens. 71 | 72 | **Output**: `labels`: one label for each token according to the `label_encoding` scheme defined in the head (e.g., BIO). 73 | 74 | **Main parameters**: 75 | 76 | `feedforward`: feed-forward layer to be applied after token encoding. 77 | 78 | See [TokenClassification API](../../api/biome/text/modules/heads/token_classification.md#tokenclassification) for more details. 79 | 80 | ## LanguageModelling 81 | 82 | **NLP tasks**: Pre-training, word-level next token language model. 83 | 84 | **Input**: `text`: a single field or a concatenation of input fields. 85 | 86 | **Output**: contextualized word vectors. 87 | 88 | **Main parameters**: 89 | 90 | `dropout` to be applied after token encoding. 91 | 92 | 93 | See [LanguageModelling API](../../api/biome/text/modules/heads/language_modelling.md#languagemodelling) for more details. 
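Whichever head you choose, the surrounding workflow is the same: build a `Pipeline` from a configuration, pass it to a `Trainer` together with your datasets, and call `fit`. The following is a minimal sketch with a tiny in-memory dataset and toy hyperparameters, only meant to illustrate the call pattern:

```python
from biome.text import Dataset
from biome.text import Pipeline
from biome.text import Trainer
from biome.text import TrainerConfiguration

# Toy in-memory dataset, for illustration only
train_ds = Dataset.from_dict(
    {"text": ["good service", "terrible support"], "label": ["positive", "negative"]}
)

pipeline = Pipeline.from_config(
    {
        "name": "quickstart",
        "features": {"word": {"embedding_dim": 16}},
        "head": {"type": "TextClassification", "labels": ["positive", "negative"]},
    }
)

trainer = Trainer(
    pipeline=pipeline,
    train_dataset=train_ds,
    trainer_config=TrainerConfiguration(max_epochs=1, batch_size=2),
)
trainer.fit(output_dir="output")  # saves model.tar.gz into the output directory
```

From there, `Pipeline.from_pretrained("output/model.tar.gz")` reloads the trained model for inference.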
94 | -------------------------------------------------------------------------------- /src/biome/text/modules/heads/classification/record_classification.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from typing import List 3 | from typing import Optional 4 | from typing import Union 5 | 6 | import numpy 7 | from allennlp.data import Instance 8 | 9 | from biome.text.backbone import ModelBackbone 10 | from biome.text.modules.configuration import ComponentConfiguration 11 | from biome.text.modules.configuration import FeedForwardConfiguration 12 | from biome.text.modules.configuration import Seq2SeqEncoderConfiguration 13 | from biome.text.modules.configuration import Seq2VecEncoderConfiguration 14 | from biome.text.modules.heads import DocumentClassification 15 | from biome.text.modules.heads.task_prediction import RecordClassificationPrediction 16 | 17 | 18 | class RecordClassification(DocumentClassification): 19 | """ 20 | Task head for data record classification. 21 | Accepts a variable data inputs and apply featuring over defined record keys. 22 | 23 | This head applies a doc2vec architecture from a structured record data input 24 | 25 | Parameters 26 | ---------- 27 | backbone 28 | The backbone of your model. Must not be provided when initiating with `Pipeline.from_config`. 29 | labels 30 | A list of labels for your classification task. 31 | token_pooler 32 | The pooler at token level to provide one vector per record field. Default: `BagOfEmbeddingsEncoder`. 33 | fields_encoder 34 | An optional sequence to sequence encoder that contextualizes the record field representations. Default: None. 35 | fields_pooler 36 | The pooler at sentence level to provide a vector for the whole record. Default: `BagOfEmbeddingsEncoder`. 37 | feedforward 38 | An optional feedforward layer applied to the output of the fields pooler. Default: None. 39 | multilabel 40 | Is this a multi label classification task? Default: False 41 | label_weights 42 | A list of weights for each label. The weights must be in the same order as the `labels`. 43 | You can also provide a dictionary that maps the label to its weight. Default: None. 
44 | """ 45 | 46 | def __init__( 47 | self, 48 | backbone: ModelBackbone, 49 | labels: List[str], 50 | record_keys: List[str], 51 | token_pooler: Optional[Seq2VecEncoderConfiguration] = None, 52 | fields_encoder: Optional[Seq2SeqEncoderConfiguration] = None, 53 | fields_pooler: Optional[Seq2VecEncoderConfiguration] = None, 54 | feedforward: Optional[FeedForwardConfiguration] = None, 55 | multilabel: Optional[bool] = False, 56 | label_weights: Optional[Union[List[float], Dict[str, float]]] = None, 57 | ) -> None: 58 | 59 | super().__init__( 60 | backbone, 61 | labels=labels, 62 | token_pooler=token_pooler, 63 | sentence_encoder=fields_encoder, 64 | sentence_pooler=fields_pooler, 65 | feedforward=feedforward, 66 | multilabel=multilabel, 67 | label_weights=label_weights, 68 | ) 69 | 70 | self._empty_prediction = RecordClassificationPrediction( 71 | labels=[], probabilities=[] 72 | ) 73 | 74 | self._inputs = record_keys 75 | 76 | def inputs(self) -> Optional[List[str]]: 77 | """The inputs names are determined by configured record keys""" 78 | return self._inputs 79 | 80 | def featurize( 81 | self, label: Optional[Union[str, List[str]]] = None, **inputs 82 | ) -> Instance: 83 | 84 | record = {input_key: inputs[input_key] for input_key in self._inputs} 85 | instance = self.backbone.featurizer(record, to_field=self.forward_arg_name) 86 | 87 | return self._add_label(instance, label) 88 | 89 | def _make_task_prediction( 90 | self, 91 | single_forward_output: Dict[str, numpy.ndarray], 92 | instance: Instance, 93 | ) -> RecordClassificationPrediction: 94 | labels, probabilities = self._compute_labels_and_probabilities( 95 | single_forward_output 96 | ) 97 | 98 | return RecordClassificationPrediction( 99 | labels=labels, probabilities=probabilities 100 | ) 101 | 102 | 103 | class RecordClassificationConfiguration(ComponentConfiguration[RecordClassification]): 104 | """Lazy initialization for document classification head components""" 105 | 106 | pass 107 | -------------------------------------------------------------------------------- /tests/text_classification_integration_test.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import pytest 4 | from pytorch_lightning import seed_everything 5 | 6 | from biome.text import Dataset 7 | from biome.text import Pipeline 8 | from biome.text import Trainer 9 | from biome.text import VocabularyConfiguration 10 | from biome.text.configuration import CharFeatures 11 | from biome.text.configuration import TrainerConfiguration 12 | from biome.text.configuration import WordFeatures 13 | 14 | 15 | @pytest.fixture 16 | def train_valid_dataset(resources_data_path) -> Tuple[Dataset, Dataset]: 17 | """Returns both training and validation datasets""" 18 | 19 | training_ds = Dataset.from_csv( 20 | paths=str(resources_data_path / "business.cat.2k.train.csv") 21 | ) 22 | validation_ds = Dataset.from_csv( 23 | paths=str(resources_data_path / "business.cat.2k.valid.csv") 24 | ) 25 | 26 | return training_ds, validation_ds 27 | 28 | 29 | @pytest.fixture 30 | def pipeline_dict() -> dict: 31 | return { 32 | "name": "german_business_names", 33 | "features": { 34 | "word": {"embedding_dim": 16, "lowercase_tokens": True}, 35 | "char": { 36 | "embedding_dim": 16, 37 | "encoder": { 38 | "type": "gru", 39 | "num_layers": 1, 40 | "hidden_size": 32, 41 | "bidirectional": True, 42 | }, 43 | "dropout": 0.1, 44 | }, 45 | }, 46 | "head": { 47 | "type": "TextClassification", 48 | "labels": [ 49 | "Unternehmensberatungen", 
50 | "Friseure", 51 | "Tiefbau", 52 | "Dienstleistungen", 53 | "Gebrauchtwagen", 54 | "Restaurants", 55 | "Architekturbüros", 56 | "Elektriker", 57 | "Vereine", 58 | "Versicherungsvermittler", 59 | "Sanitärinstallationen", 60 | "Edv", 61 | "Maler", 62 | "Physiotherapie", 63 | "Werbeagenturen", 64 | "Apotheken", 65 | "Vermittlungen", 66 | "Hotels", 67 | "Autowerkstätten", 68 | "Elektrotechnik", 69 | "Allgemeinärzte", 70 | "Handelsvermittler Und -vertreter", 71 | ], 72 | "pooler": { 73 | "type": "gru", 74 | "num_layers": 1, 75 | "hidden_size": 16, 76 | "bidirectional": True, 77 | }, 78 | "feedforward": { 79 | "num_layers": 1, 80 | "hidden_dims": [16], 81 | "activations": ["relu"], 82 | "dropout": [0.1], 83 | }, 84 | }, 85 | } 86 | 87 | 88 | def test_text_classification(tmp_path, pipeline_dict, train_valid_dataset): 89 | """Apart from a well specified training, this also tests the vocab creation!""" 90 | seed_everything(43) 91 | 92 | pl = Pipeline.from_config(pipeline_dict) 93 | train_ds = train_valid_dataset[0] 94 | valid_ds = train_valid_dataset[1] 95 | 96 | vocab_config = VocabularyConfiguration(max_vocab_size={"word": 50}) 97 | trainer_config = TrainerConfiguration( 98 | batch_size=64, 99 | optimizer={"type": "adam", "lr": 0.01}, 100 | max_epochs=5, 101 | default_root_dir=str(tmp_path), 102 | gpus=0, # turn off gpus even if available 103 | ) 104 | 105 | trainer = Trainer( 106 | pipeline=pl, 107 | train_dataset=train_ds, 108 | valid_dataset=valid_ds, 109 | trainer_config=trainer_config, 110 | vocab_config=vocab_config, 111 | ) 112 | 113 | trainer.fit(tmp_path / "output") 114 | 115 | assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52 116 | assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83 117 | 118 | assert pl.num_trainable_parameters == 22070 119 | 120 | evaluation = trainer.test(valid_ds, batch_size=16) 121 | 122 | # Reminder: the value depends on the batch_size! 
123 | assert evaluation["test_loss"] == pytest.approx(0.7404146790504456, abs=0.003) 124 | 125 | Pipeline.from_pretrained(str(tmp_path / "output" / "model.tar.gz")) 126 | 127 | assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52 128 | assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83 129 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/layouts/Layout.vue: -------------------------------------------------------------------------------- 1 | 53 | 54 | 160 | 175 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/pytorch.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/styles/index.styl: -------------------------------------------------------------------------------- 1 | @require "./code-colors.styl" 2 | 3 | body 4 | font-family: $primaryFontFamily !important 5 | h1, h2, h3, h4 6 | font-family: $secondaryFontFamily 7 | h2 8 | border-bottom none 9 | dd, dt 10 | line-height 1.7em 11 | pre.title 12 | line-height: 0 !important 13 | padding: 0 !important 14 | margin: 0 !important 15 | background-color: transparent !important 16 | overflow: visible !important 17 | h2, h3 18 | margin-bottom: 0 !important 19 | .badge.green, .badge.tip 20 | background: $accentColor 21 | .custom-block.danger, .custom-block.tip, .custom-block.warning 22 | border-width 1px !important 23 | border-style solid !important 24 | background #FFFFFF !important 25 | color $textColor !important 26 | .custom-block.danger 27 | border-color $red 28 | .custom-block-title 29 | color $red 30 | .custom-block.tip 31 | border-color $green 32 | .custom-block-title 33 | color $green 34 | .custom-block.warning 35 | border-color $yellow 36 | .custom-block-title 37 | color $yellow 38 | .table-of-contents 39 | ul 40 | list-style none 41 | li 42 | padding-left 1em 43 | position relative 44 | li:before 45 | content "" 46 | height 4px 47 | width 4px 48 | border-radius 50% 49 | position absolute 50 | background $accentColor 51 | padding 1px 52 | margin-right 0.5em 53 | left 0 54 | top 12px 55 | .no-sidebar 56 | .navbar 57 | display: none 58 | .page p img:not(.icon) 59 | border 1px solid $borderColor 60 | .navbar .site-name span 61 | font-family 'Basis Grotesque Pro Light' !important 62 | font-weight lighter 63 | .sidebar 64 | background: $sidebarBgColor 65 | &__link 66 | display: block !important 67 | a 68 | display: block !important 69 | &__img 70 | width auto 71 | max-width 180px 72 | margin: 2em auto 1em auto 73 | display: block 74 | @media screen and (max-width: $MQNarrow) 75 | max-width 130px 76 | &-heading 77 | pointer-events: none 78 | font-family: $secondaryFontFamily 79 | &:after 80 | content: '' 81 | width: 35px 82 | height: 2px 83 | background: none 84 | display: block 85 | &.open 86 | color: $accentColor !important 87 | &:after 88 | background: $accentColor 89 | &-link 90 | border-left: none !important 91 | font-family: $secondaryFontFamily 92 | .theme-default-content code 93 | background-color: $codePillColor 94 | .go-to-top 95 | background: white 96 | height: 1.2rem !important 97 | width: 1.2rem !important 98 | border-radius: 13px 99 | padding: 0.8rem 100 | box-shadow: 0 2px 4px #929292 101 | .nav-links 102 | .nav-item:first-child 103 | text-transform: uppercase 104 | 
.nav-link 105 | font-weight: 600 !important 106 | font-family: $secondaryFontFamily 107 | .external__icon 108 | margin-left 0.3em 109 | .search-box input 110 | border-radius 0 !important 111 | .footer 112 | //position: absolute 113 | bottom: 1rem 114 | margin-top: 5rem 115 | text-align: center 116 | &__img 117 | max-width: 100px 118 | 119 | @media screen and (min-width: $MQNarrow) 120 | .navbar 121 | position: relative 122 | border-bottom: none 123 | // line-height: 4.4em !important 124 | margin-left: 320px 125 | .nav-links 126 | display flex 127 | width 100% 128 | align-items center 129 | margin-left 3em 130 | margin-right -1em 131 | .nav-item 132 | width: auto 133 | margin-left auto 134 | .nav-link.external 135 | font-size 12px 136 | .no-sidebar & 137 | max-width: 960px 138 | padding: 0 2.5rem 139 | margin: auto 140 | box-sizing: content-box; 141 | .links 142 | max-width: 100% 143 | margin-left: 0 144 | padding-left: 0 !important 145 | .home-link 146 | display: none 147 | .links 148 | position: relative !important 149 | right: auto !important 150 | max-width: 740px !important; 151 | margin: 1.4em auto 0.7em 4.5em; 152 | .page-nav 153 | margin-left: 4.5rem !important 154 | .sidebar 155 | top: 0 156 | z-index: 21 157 | overflow: visible 158 | border-right: none 159 | > ul 160 | overflow-y: auto 161 | max-height: calc(100% - 170px) 162 | > li:last-child 163 | margin-bottom 2em 164 | .sidebar-sub-headers .sidebar-sub-headers 165 | display none 166 | .active + .sidebar-sub-headers .sidebar-sub-headers 167 | display block 168 | .search-box input 169 | border-width: 2px !important 170 | border-radius: 2px !important 171 | color $accentColor !important 172 | min-width: 250px !important 173 | padding-left 0.5rem !important 174 | background-position calc(100% - 1rem) !important 175 | height 2.5rem !important 176 | border-color $sidebarBgColor !important 177 | &:focus 178 | border-color $accentColor !important 179 | .theme-default-content:not(.custom) 180 | max-width: 740px 181 | margin: 0 auto 182 | padding: 2rem 2.5rem 183 | margin-left: 4.5rem 184 | -------------------------------------------------------------------------------- /tests/text/test_features_transformers.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | from numpy.testing import assert_allclose 5 | 6 | from biome.text import Dataset 7 | from biome.text import Pipeline 8 | from biome.text import Trainer 9 | from biome.text import TrainerConfiguration 10 | from biome.text.features import TransformersFeatures 11 | 12 | 13 | @pytest.fixture 14 | def train_dataset() -> Dataset: 15 | """Creates the training dataset""" 16 | source = ( 17 | Path(__file__).parent.parent 18 | / "resources" 19 | / "data" 20 | / "emotions_with_transformers.txt" 21 | ) 22 | 23 | train_dataset = Dataset.from_csv( 24 | paths=str(source), delimiter=";", column_names=["text", "label"] 25 | ) 26 | return train_dataset 27 | 28 | 29 | @pytest.fixture 30 | def pipeline_dict() -> dict: 31 | """Creation of pipeline dictionary""" 32 | 33 | pipeline_dict = { 34 | "name": "emotions_with_transformers", 35 | "features": { 36 | "transformers": {"model_name": "sshleifer/tiny-distilbert-base-cased"} 37 | }, 38 | "head": { 39 | "type": "TextClassification", 40 | "labels": [ 41 | "anger", 42 | "fear", 43 | "joy", 44 | "love", 45 | "sadness", 46 | "surprise", 47 | ], 48 | "pooler": { 49 | "type": "bert_pooler", 50 | "pretrained_model": "sshleifer/tiny-distilbert-base-cased", 51 | 
"requires_grad": True, 52 | "dropout": 0.1, 53 | }, 54 | }, 55 | } 56 | 57 | return pipeline_dict 58 | 59 | 60 | @pytest.fixture 61 | def trainer_config(tmp_path) -> TrainerConfiguration: 62 | return TrainerConfiguration( 63 | batch_size=16, 64 | max_epochs=1, 65 | optimizer={ 66 | "type": "adam", 67 | "lr": 0.0001, 68 | }, 69 | gpus=0, 70 | default_root_dir=str(tmp_path), 71 | ) 72 | 73 | 74 | def test_pure_transformers(tmp_path, pipeline_dict, trainer_config, train_dataset): 75 | """Testing a Transformer training process and a model load""" 76 | 77 | pl = Pipeline.from_config(pipeline_dict) 78 | 79 | # Check a fixed vocabulary size for the model 80 | assert pl.backbone.vocab.get_vocab_size("transformers") == 28996 81 | 82 | pl.predict(text="test") 83 | 84 | output = tmp_path / "output" 85 | trainer = Trainer( 86 | pipeline=pl, train_dataset=train_dataset, trainer_config=trainer_config 87 | ) 88 | trainer.fit(output_dir=output) 89 | 90 | # Test vocabulary from a pretrained file 91 | pl = Pipeline.from_pretrained(str(output / "model.tar.gz")) 92 | 93 | # Check a fixed vocabulary size for the model after loading 94 | assert pl.backbone.vocab.get_vocab_size("transformers") == 28996 95 | 96 | 97 | def test_transformers_and_word(tmp_path, pipeline_dict, trainer_config, train_dataset): 98 | """Testing Transformer pipeline with an added word feature layer""" 99 | # Changing the pipeline to delete the BERT pooler and add a word feature 100 | del pipeline_dict["head"]["pooler"] 101 | pipeline_dict["features"].update( 102 | {"word": {"embedding_dim": 16, "lowercase_tokens": True}} 103 | ) 104 | 105 | pl = Pipeline.from_config(pipeline_dict) 106 | pl.predict(text="test") 107 | 108 | output = tmp_path / "output" 109 | trainer = Trainer( 110 | pipeline=pl, train_dataset=train_dataset, trainer_config=trainer_config 111 | ) 112 | trainer.fit(output_dir=output) 113 | 114 | # Check a fixed vocabulary size for the transformer and the word feature 115 | assert pl.backbone.vocab.get_vocab_size("transformers") == 28996 116 | assert pl.backbone.vocab.get_vocab_size("word") == 273 117 | 118 | # Test vocab from a pretrained file 119 | pl = Pipeline.from_pretrained(str(output / "model.tar.gz")) 120 | 121 | # Check a fixed vocabulary size for the transformer and the word feature after loading 122 | assert pl.backbone.vocab.get_vocab_size("transformers") == 28996 123 | assert pl.backbone.vocab.get_vocab_size("word") == 273 124 | 125 | 126 | def test_max_length_not_affecting_shorter_sequences(pipeline_dict): 127 | """Max length change should not affect at all previous shorter-length models""" 128 | 129 | pl = Pipeline.from_config(pipeline_dict) 130 | state_dict = pl._model.state_dict() # dict with the whole state of the module 131 | probs = pl.predict("Test this")["probabilities"] # probabilities of the test input 132 | 133 | pipeline_dict["features"]["transformers"]["max_length"] = 100 # changing max length 134 | pl = Pipeline.from_config(pipeline_dict) 135 | pl._model.load_state_dict(state_dict) # loading previous state from dict 136 | probs_max_length = pl.predict("Test this")["probabilities"] 137 | 138 | assert_allclose(probs, probs_max_length) 139 | 140 | 141 | def test_serialization(pipeline_dict): 142 | """Testing object saving. 
Model from the pipeline must be equal to the model from .json""" 143 | 144 | feature = TransformersFeatures(**pipeline_dict["features"]["transformers"]) 145 | assert feature == TransformersFeatures(**feature.to_json()) 146 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [master] 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | tests_docs: 12 | name: Run Tests & Build Docs 13 | runs-on: ubuntu-latest 14 | # make sure commands run in a bash shell 15 | defaults: 16 | run: 17 | shell: bash -l {0} 18 | steps: 19 | - name: Set BIOME_TEXT_DOC_VERSION for Release 🥦 20 | if: ${{ github.event_name == 'release' }} 21 | run: echo BIOME_TEXT_DOC_VERSION=${{ github.event.release.tag_name }} >> $GITHUB_ENV 22 | - name: Checkout Code 🛎 23 | uses: actions/checkout@v2 24 | - name: Setup Conda Env 🐍 25 | uses: conda-incubator/setup-miniconda@v2 26 | with: 27 | environment-file: environment_dev.yml 28 | activate-environment: biometext 29 | - name: Cache pip 👜 30 | uses: actions/cache@v2 31 | env: 32 | # Increase this value to reset cache if setup.py has not changed 33 | CACHE_NUMBER: 0 34 | with: 35 | path: ~/.cache/pip 36 | key: ${{ runner.os }}-pip-${{ env.CACHE_NUMBER }}-${{ hashFiles('setup.py') }} 37 | - name: Install Biome 🌿 38 | run: make dev 39 | - name: Linting 🍩 40 | # TODO: there is an issue with pylint and our CI, for now we only run our pre-commit hooks 41 | run: pre-commit run --all-files 42 | - name: Run Tests 📈 43 | run: make test 44 | - name: Build Docs 📘 45 | # build and zip the docs 46 | run: | 47 | make build_docs 48 | tar -czf docs_build_output.tar.gz docs/site 49 | - name: Upload Build Output 🍕 50 | if: ${{ github.event_name == 'push' || github.event_name == 'release' }} 51 | uses: actions/upload-artifact@v2 52 | with: 53 | name: docs_build_output 54 | path: docs_build_output.tar.gz 55 | 56 | deploy_docs: 57 | name: Deploy Docs 58 | runs-on: ubuntu-latest 59 | needs: tests_docs 60 | if: ${{ github.event_name == 'push' || github.event_name == 'release' }} 61 | env: 62 | BIOME_TEXT_DOC_VERSION: master 63 | # make sure commands run in a bash shell 64 | defaults: 65 | run: 66 | shell: bash -l {0} 67 | steps: 68 | - name: Set BIOME_TEXT_DOC_VERSION for Release 🥦 69 | if: ${{ github.event_name == 'release' }} 70 | run: echo BIOME_TEXT_DOC_VERSION=${{ github.event.release.tag_name }} >> $GITHUB_ENV 71 | - name: Checkout Code 🛎 72 | # Recommended and required by JamesIves/github-pages-deploy-action 73 | uses: actions/checkout@v2 74 | with: 75 | persist-credentials: false 76 | - name: Download Build Output 🧀 77 | uses: actions/download-artifact@v2 78 | with: 79 | name: docs_build_output 80 | - name: Extract Build Output 🍗 81 | run: tar -xzf docs_build_output.tar.gz 82 | - name: Deploy Docs 🚀 83 | uses: JamesIves/github-pages-deploy-action@3.7.1 84 | with: 85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 86 | BRANCH: gh-pages # The branch the action should deploy to. 87 | FOLDER: docs/site # The folder the action should deploy. 
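# TARGET_FOLDER below places the docs in a per-version subfolder of gh-pages (e.g. /master/ or /v2.2.0/)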
88 | TARGET_FOLDER: /${{ env.BIOME_TEXT_DOC_VERSION }}/ 89 | CLEAN: true # Automatically remove deleted files from the deploy branch 90 | - name: Checkout gh-pages for Release 🛎 91 | if: ${{ github.event_name == 'release' }} 92 | uses: actions/checkout@v2 93 | with: 94 | ref: gh-pages 95 | - name: Update Versions and Index for Release 🍗 96 | if: ${{ github.event_name == 'release' }} 97 | run: | 98 | sed -i 's/master/master\n${{ env.BIOME_TEXT_DOC_VERSION }}/' versions.txt 99 | sed -i 's/biome-text\/.*\//biome-text\/${{ env.BIOME_TEXT_DOC_VERSION }}\//' index.html 100 | git config user.name github-actions 101 | git config user.email github-actions@github.com 102 | git add versions.txt index.html 103 | git commit -m "Update versions.txt and index.html due to new release" 104 | git push 105 | 106 | deploy_release: 107 | name: Deploy Release 108 | runs-on: ubuntu-latest 109 | if: ${{ github.event_name == 'release' }} 110 | needs: tests_docs 111 | defaults: 112 | run: 113 | shell: bash -l {0} 114 | steps: 115 | - name: Checkout Code 🛎 116 | uses: actions/checkout@v2 117 | - name: Setup Conda Env 🐍 118 | uses: conda-incubator/setup-miniconda@v2 119 | with: 120 | environment-file: environment_dev.yml 121 | activate-environment: biome 122 | - name: Build Package 🍟 123 | run: make dist 124 | - name: Publish Package to TestPyPI 🥪 125 | uses: pypa/gh-action-pypi-publish@master 126 | with: 127 | user: __token__ 128 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 129 | repository_url: https://test.pypi.org/legacy/ 130 | - name: Test Installing 🍿 131 | run: pip install --index-url https://test.pypi.org/simple --no-deps biome-text==${GITHUB_REF#refs/*/v} 132 | - name: Publish Package to PyPI 🥩 133 | uses: pypa/gh-action-pypi-publish@master 134 | with: 135 | user: __token__ 136 | password: ${{ secrets.PYPI_API_TOKEN }} 137 | -------------------------------------------------------------------------------- /src/biome/text/vocabulary.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manages vocabulary tasks and fetches vocabulary information 3 | 4 | Provides utilities for getting information from a given vocabulary. 5 | 6 | Provides management actions such as extending the labels, setting new labels or creating an "empty" vocab. 
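A minimal usage sketch (illustrative only; it assumes an existing `allennlp.data.Vocabulary` instance named `vocab` whose label namespace is still empty):

    from biome.text import vocabulary

    vocabulary.extend_labels(vocab, ["negative", "positive"])
    vocabulary.get_labels(vocab)  # -> ["negative", "positive"]
    vocabulary.index_for_label(vocab, "positive")  # -> 1, given the fresh namespace assumed above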
7 | """ 8 | import logging 9 | from typing import Dict 10 | from typing import List 11 | 12 | from allennlp.data import Vocabulary 13 | from allennlp.data.vocabulary import DEFAULT_NON_PADDED_NAMESPACES 14 | 15 | from biome.text.features import TransformersFeatures 16 | from biome.text.features import WordFeatures 17 | 18 | LABELS_NAMESPACE = "gold_labels" 19 | 20 | _LOGGER = logging.getLogger(__name__) 21 | 22 | 23 | def get_labels(vocab: Vocabulary) -> List[str]: 24 | """Gets list of labels in the vocabulary 25 | 26 | Parameters 27 | ---------- 28 | vocab: `allennlp.data.Vocabulary` 29 | 30 | Returns 31 | ------- 32 | labels: `List[str]` 33 | A list of label strings 34 | """ 35 | return [k for k in vocab.get_token_to_index_vocabulary(namespace=LABELS_NAMESPACE)] 36 | 37 | 38 | def label_for_index(vocab: Vocabulary, idx: int) -> str: 39 | """Gets label string for a label `int` id 40 | 41 | Parameters 42 | ---------- 43 | vocab: `allennlp.data.Vocabulary` 44 | idx: `int 45 | the token index 46 | 47 | Returns 48 | ------- 49 | label: `str` 50 | The string for a label id 51 | """ 52 | return vocab.get_token_from_index(idx, namespace=LABELS_NAMESPACE) 53 | 54 | 55 | def index_for_label(vocab: Vocabulary, label: str) -> int: 56 | """Gets the label `int` id for label string 57 | 58 | Parameters 59 | ---------- 60 | vocab: `allennlp.data.Vocabulary`` 61 | label: `str` 62 | the label 63 | 64 | Returns 65 | ------- 66 | label_idx: `int` 67 | The label id for label string 68 | """ 69 | return vocab.get_token_index(label, namespace=LABELS_NAMESPACE) 70 | 71 | 72 | def get_index_to_labels_dictionary(vocab: Vocabulary) -> Dict[int, str]: 73 | """Gets a dictionary for turning label `int` ids into label strings 74 | 75 | Parameters 76 | ---------- 77 | vocab: `allennlp.data.Vocabulary` 78 | 79 | Returns 80 | ------- 81 | labels: `Dict[int, str]` 82 | A dictionary to get fetch label strings from ids 83 | """ 84 | return vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE) 85 | 86 | 87 | def words_vocab_size(vocab: Vocabulary) -> int: 88 | """Fetches the vocabulary size for the `words` namespace 89 | 90 | Parameters 91 | ---------- 92 | vocab: `allennlp.data.Vocabulary` 93 | 94 | Returns 95 | ------- 96 | size: `int` 97 | The vocabulary size for the words namespace 98 | """ 99 | return vocab.get_vocab_size(WordFeatures.namespace) 100 | 101 | 102 | def extend_labels(vocab: Vocabulary, labels: List[str]): 103 | """Adds a list of label strings to the vocabulary 104 | 105 | Use this to add new labels to your vocabulary (e.g., useful for reusing the weights of an existing classifier) 106 | 107 | Parameters 108 | ---------- 109 | vocab: `allennlp.data.Vocabulary` 110 | labels: `List[str]` 111 | A list of strings containing the labels to add to an existing vocabulary 112 | """ 113 | vocab.add_tokens_to_namespace(labels, namespace=LABELS_NAMESPACE) 114 | 115 | 116 | def set_labels(vocab: Vocabulary, new_labels: List[str]): 117 | """Resets the labels in the vocabulary with a given labels string list 118 | 119 | Parameters 120 | ---------- 121 | vocab: `allennlp.data.Vocabulary` 122 | new_labels: `List[str]` 123 | The label strings to add to the vocabulary 124 | """ 125 | for namespace_vocab in [ 126 | vocab.get_token_to_index_vocabulary(LABELS_NAMESPACE), 127 | vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE), 128 | ]: 129 | tokens = list(namespace_vocab.keys()) 130 | for token in tokens: 131 | del namespace_vocab[token] 132 | 133 | extend_labels(vocab, new_labels) 134 | 135 | 136 | def 
create_empty_vocabulary() -> Vocabulary: 137 | """Creates an empty Vocabulary with configured namespaces 138 | 139 | Returns 140 | ------- 141 | empty_vocab 142 | The transformers namespace is added to the `non_padded_namespaces`. 143 | """ 144 | # The following is a hack, because AllenNLP handles the Transformers vocab differently! 145 | # The transformer vocab has its own padding and oov token, so we add it to the non_padded_namespaces. 146 | # AllenNLP gives its "transformer vocab" by default the "tags" namespace, which is a non_padded_namespace ... 147 | # If we do not do this, then writing the vocab to a file and loading it will fail, since AllenNLP will 148 | # look for its default OOV token in the vocab unless it is flagged as a non_padded_namespace. 149 | # (see the doc string of `allennlp.data.token_indexers.PretrainedTransformerIndexer`) 150 | return Vocabulary( 151 | non_padded_namespaces=DEFAULT_NON_PADDED_NAMESPACES 152 | + (TransformersFeatures.namespace,) 153 | ) 154 | 155 | 156 | def is_empty(vocab: Vocabulary, namespaces: List[str]) -> bool: 157 | """Checks if at least one of the given namespaces has an empty vocab. 158 | 159 | Parameters 160 | ---------- 161 | vocab 162 | The vocabulary 163 | namespaces 164 | Namespaces to check in the vocabulary 165 | 166 | Returns 167 | ------- 168 | True if one or more namespaces have an empty vocab 169 | """ 170 | # If a namespace does not exist in the vocab, a default one is created on the fly with a padding and oov token 171 | # We must drop the padding and out of vocab (oov) tokens -> 2 tokens 172 | return any([vocab.get_vocab_size(namespace) < 3 for namespace in namespaces]) 173 | -------------------------------------------------------------------------------- /src/biome/text/cli/serve.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import List 3 | 4 | import click 5 | import uvicorn 6 | from allennlp.common.util import sanitize 7 | from click import Path 8 | from fastapi import FastAPI 9 | from fastapi import HTTPException 10 | from fastapi.exceptions import RequestValidationError 11 | from fastapi.responses import PlainTextResponse 12 | from pydantic import BaseConfig 13 | from pydantic import create_model 14 | from starlette.exceptions import HTTPException as StarletteHTTPException 15 | 16 | from biome.text import Pipeline 17 | 18 | 19 | @click.command() 20 | @click.argument("pipeline_path", type=Path(exists=True)) 21 | @click.option( 22 | "--port", 23 | "-p", 24 | type=int, 25 | default=9999, 26 | show_default=True, 27 | help="Port on which to serve the REST API.", 28 | ) 29 | @click.option( 30 | "--predictions_dir", 31 | "-pd", 32 | type=click.Path(), 33 | default=None, 34 | help="Path to log raw predictions from the service.", 35 | ) 36 | @click.option( 37 | "--host", 38 | type=str, 39 | default="0.0.0.0", 40 | help="Host of the underlying uvicorn server.", 41 | ) 42 | def serve(pipeline_path: str, port: int, predictions_dir: str, host: str) -> None: 43 | """Serves the pipeline predictions as a REST API 44 | 45 | PIPELINE_PATH is the path to a pretrained pipeline (model.tar.gz file).
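The service can also be started programmatically; a minimal sketch (the model path below is a placeholder, not a file shipped with this repository):

    from biome.text import Pipeline
    from biome.text.cli.serve import _serve

    _serve(Pipeline.from_pretrained("path/to/output/model.tar.gz"), port=9999)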
46 | """ 47 | pipeline = Pipeline.from_pretrained(pipeline_path) 48 | pipeline._model.eval() 49 | 50 | if predictions_dir: 51 | pipeline.init_prediction_logger(predictions_dir) 52 | 53 | return _serve(pipeline, port, host) 54 | 55 | 56 | def _serve(pipeline: Pipeline, port: int = 9999, host: str = "0.0.0.0"): 57 | """Serves an pipeline as rest api""" 58 | predict_parameters = inspect.signature(pipeline.predict).parameters 59 | model_parameters = { 60 | name: ( 61 | par.annotation, 62 | None, # We need a default value to allow for batch predictions! 63 | ) 64 | for name, par in predict_parameters.items() 65 | if par.default == inspect.Parameter.empty 66 | } 67 | optional_parameters = { 68 | name: (par.annotation, par.default) 69 | for name, par in predict_parameters.items() 70 | # The batch parameter needs an extra logic to allow for a proper BaseModel for it 71 | if par.default != inspect.Parameter.empty and name != "batch" 72 | } 73 | 74 | class Config(BaseConfig): 75 | extra = "forbid" 76 | 77 | ModelInput = create_model("ModelInput", **model_parameters, __config__=Config) 78 | PredictInput = create_model( 79 | "PredictInput", 80 | **model_parameters, 81 | batch=(List[ModelInput], None), 82 | **optional_parameters, 83 | __config__=Config, 84 | ) 85 | 86 | class http_error_handling: 87 | """Error handling for http error transcription""" 88 | 89 | def __enter__(self): 90 | pass 91 | 92 | def __exit__(self, exc_type, exc_val, exc_tb): 93 | if isinstance(exc_val, Exception): 94 | # Common http error handling 95 | raise HTTPException(status_code=500, detail=str(exc_val)) 96 | 97 | def make_app() -> FastAPI: 98 | app = FastAPI() 99 | 100 | error_msg = f"\nCheck the docs at '0.0.0.0:{port}/docs' for an example of a valid request body." 101 | 102 | @app.exception_handler(RequestValidationError) 103 | async def validation_exception_handler(request, exc): 104 | return PlainTextResponse(str(exc) + error_msg, status_code=400) 105 | 106 | @app.exception_handler(StarletteHTTPException) 107 | async def http_exception_handler(request, exc): 108 | if exc.status_code == 400: 109 | return PlainTextResponse( 110 | str(exc.detail) + error_msg, status_code=exc.status_code 111 | ) 112 | else: 113 | return PlainTextResponse(str(exc.detail), status_code=exc.status_code) 114 | 115 | @app.post("/predict", tags=["Pipeline"]) 116 | async def predict(predict_input: PredictInput): 117 | """Returns a prediction given some input data 118 | 119 | Parameters 120 | ---------- 121 | - **args/kwargs:** See the Example Value for the Request body below. 122 | If provided, the **batch** parameter will be ignored. 123 | - **batch:** A list of dictionaries that represents a batch of inputs. 124 | The dictionary keys must comply with the **args/kwargs**. 125 | Predicting batches should typically be faster than repeated calls with **args/kwargs**. 126 | - **add_tokens:** If true, adds a 'tokens' key in the prediction that contains the tokenized input. 127 | - **add_attributions:** If true, adds a 'attributions' key that contains attributions of the input to the prediction. 128 | - **attributions_kwargs:** This dict is directly passed on to the `TaskHead.compute_attributions()`. 129 | 130 | Returns 131 | ------- 132 | - **predictions:** A dictionary or a list of dictionaries containing the predictions and additional information. 
133 | """ 134 | with http_error_handling(): 135 | return sanitize( 136 | pipeline.predict(**predict_input.dict(skip_defaults=True)) 137 | ) 138 | 139 | @app.get("/config", tags=["Pipeline"]) 140 | async def config(): 141 | """The configuration of the pipeline""" 142 | with http_error_handling(): 143 | return pipeline.config.as_dict() 144 | 145 | @app.get("/_status", tags=["REST service"]) 146 | async def status(): 147 | with http_error_handling(): 148 | return {"ok": True} 149 | 150 | return app 151 | 152 | uvicorn.run(make_app(), host=host, port=port) 153 | -------------------------------------------------------------------------------- /tests/text/test_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pytest 5 | from pytorch_lightning.loggers import CSVLogger 6 | from pytorch_lightning.loggers import LoggerCollection 7 | from pytorch_lightning.loggers import MLFlowLogger 8 | from pytorch_lightning.loggers import TensorBoardLogger 9 | from pytorch_lightning.loggers import WandbLogger 10 | 11 | from biome.text import Dataset 12 | from biome.text import Pipeline 13 | from biome.text import Trainer 14 | from biome.text.configuration import TrainerConfiguration 15 | 16 | 17 | @pytest.fixture 18 | def dataset(resources_data_path) -> Dataset: 19 | return Dataset.from_csv( 20 | paths=str(resources_data_path / "business.cat.2k.valid.csv") 21 | ) 22 | 23 | 24 | @pytest.fixture 25 | def pipeline_dict() -> dict: 26 | return { 27 | "name": "german_business_names", 28 | "features": { 29 | "word": {"embedding_dim": 16, "lowercase_tokens": True}, 30 | }, 31 | "head": { 32 | "type": "TextClassification", 33 | "labels": [ 34 | "Unternehmensberatungen", 35 | "Friseure", 36 | "Tiefbau", 37 | "Dienstleistungen", 38 | "Gebrauchtwagen", 39 | "Restaurants", 40 | "Architekturbüros", 41 | "Elektriker", 42 | "Vereine", 43 | "Versicherungsvermittler", 44 | "Sanitärinstallationen", 45 | "Edv", 46 | "Maler", 47 | "Physiotherapie", 48 | "Werbeagenturen", 49 | "Apotheken", 50 | "Vermittlungen", 51 | "Hotels", 52 | "Autowerkstätten", 53 | "Elektrotechnik", 54 | "Allgemeinärzte", 55 | "Handelsvermittler Und -vertreter", 56 | ], 57 | }, 58 | } 59 | 60 | 61 | def test_default_root_dir(change_to_tmp_working_dir, pipeline_dict, dataset): 62 | pl = Pipeline.from_config(pipeline_dict) 63 | trainer = Trainer(pl, train_dataset=dataset) 64 | assert trainer.trainer.default_root_dir == str( 65 | change_to_tmp_working_dir / "training_logs" 66 | ) 67 | 68 | 69 | def test_deep_copy_of_trainer_config(pipeline_dict, dataset): 70 | pl = Pipeline.from_config(pipeline_dict) 71 | trainer_config = TrainerConfiguration() 72 | trainer = Trainer(pl, train_dataset=dataset, trainer_config=trainer_config) 73 | assert trainer_config is not trainer._trainer_config 74 | 75 | 76 | @pytest.mark.parametrize( 77 | "input_kwargs,expected_loggers", 78 | [ 79 | ({}, ["csv", "tensorboard", "wandb"]), 80 | ({"logger": False}, []), 81 | ( 82 | { 83 | "logger": MLFlowLogger( 84 | tracking_uri=os.path.join(tempfile.gettempdir(), "mlruns") 85 | ), 86 | "add_wandb_logger": False, 87 | }, 88 | ["csv", "tensorboard", "mlflow"], 89 | ), 90 | ( 91 | { 92 | "logger": [ 93 | MLFlowLogger( 94 | tracking_uri=os.path.join(tempfile.gettempdir(), "mlruns") 95 | ), 96 | CSVLogger(save_dir=tempfile.gettempdir()), 97 | ], 98 | "add_wandb_logger": False, 99 | "add_tensorboard_logger": False, 100 | }, 101 | ["csv", "mlflow"], 102 | ), 103 | ], 104 | ) 105 | def test_add_default_loggers( 
106 | input_kwargs, expected_loggers, pipeline_dict, dataset, tmp_path 107 | ): 108 | trainer_config = TrainerConfiguration( 109 | **input_kwargs, default_root_dir=str(tmp_path) 110 | ) 111 | trainer = Trainer( 112 | Pipeline.from_config(pipeline_dict), 113 | train_dataset=dataset, 114 | trainer_config=trainer_config, 115 | ) 116 | if input_kwargs.get("logger") is not False: 117 | assert isinstance(trainer.trainer.logger, LoggerCollection) 118 | assert len(trainer.trainer.logger.experiment) == len(expected_loggers) 119 | else: 120 | assert trainer._trainer_config.logger is False 121 | 122 | def loggers_include(logger_type) -> bool: 123 | return any( 124 | [ 125 | isinstance(logger, logger_type) 126 | for logger in trainer._trainer_config.logger 127 | ] 128 | ) 129 | 130 | for logger in expected_loggers: 131 | if logger == "csv": 132 | assert loggers_include(CSVLogger) 133 | if logger == "tensorboard": 134 | assert loggers_include(TensorBoardLogger) 135 | if logger == "wandb": 136 | assert loggers_include(WandbLogger) 137 | assert (tmp_path / "wandb").is_dir() 138 | if logger == "mlflow": 139 | assert loggers_include(MLFlowLogger) 140 | 141 | 142 | def test_pipeline_test(pipeline_dict, dataset, tmp_path): 143 | import json 144 | 145 | pl = Pipeline.from_config(pipeline_dict) 146 | trainer = Trainer(pl) 147 | first_metrics = trainer.test(dataset, output_dir=tmp_path, batch_size=16) 148 | assert "test_loss" in first_metrics 149 | 150 | assert (tmp_path / "metrics.json").is_file() 151 | with (tmp_path / "metrics.json").open() as file: 152 | assert "test_loss" in json.load(file) 153 | 154 | assert pl.evaluate(dataset)["test_loss"] == pytest.approx( 155 | first_metrics["test_loss"] 156 | ) 157 | 158 | 159 | def test_create_output_dir(pipeline_dict, dataset, tmp_path): 160 | config = TrainerConfiguration( 161 | logger=False, fast_dev_run=True, batch_size=1, max_epochs=1, gpus=0 162 | ) 163 | pipeline = Pipeline.from_config(pipeline_dict) 164 | trainer = Trainer(pipeline, train_dataset=dataset, trainer_config=config) 165 | 166 | output_dir = tmp_path / "test_this_non_existing_parent_dir" / "output" 167 | trainer.fit(output_dir=output_dir) 168 | 169 | assert output_dir.is_dir() 170 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/PageNav.vue: -------------------------------------------------------------------------------- --------------------------------------------------------------------------------