├── tests ├── docs │ ├── __init__.py │ ├── test_configurations.py │ └── test_tutorials.py ├── text │ ├── __init__.py │ ├── modules │ │ ├── __init__.py │ │ ├── heads │ │ │ ├── __init__.py │ │ │ ├── classification │ │ │ │ ├── __init__.py │ │ │ │ ├── test_relation_classifier.py │ │ │ │ ├── test_document_classification.py │ │ │ │ └── test_text_classification.py │ │ │ ├── test_language_modelling.py │ │ │ └── test_task_head.py │ │ └── configuration │ │ │ └── test_component_configuration.py │ ├── test_text_cleaning.py │ ├── test_commons.py │ ├── test_cli.py │ ├── test_features_configuration.py │ ├── test_metrics.py │ ├── test_pipeline_save.py │ ├── test_pipeline_copy.py │ ├── test_pipeline_to_mlflow.py │ ├── test_pipeline_with_optional_inputs.py │ ├── test_pipeline_tokenizer.py │ ├── test_pipeline_datasets.py │ ├── test_pipeline_with_custom_head.py │ ├── test_model_predict.py │ ├── test_pipeline_predict.py │ ├── test_tokenizer.py │ ├── test_hpo.py │ ├── test_pretrained_word_vectors.py │ ├── test_pipeline_vocab.py │ ├── test_features_transformers.py │ └── test_trainer.py ├── __init__.py ├── resources │ └── data │ │ ├── test.xlsx │ │ ├── test.parquet │ │ ├── nested-list.jsonl │ │ ├── to-be-flattened.jsonl │ │ ├── dataset_source.csv │ │ ├── dataset_sequence.jsonl │ │ ├── dataset_sequence.json │ │ ├── emotions_with_transformers.txt │ │ └── dataset_source.jsonl ├── conftest.py └── text_classification_integration_test.py ├── src └── biome │ ├── text │ ├── modules │ │ ├── __init__.py │ │ ├── heads │ │ │ ├── classification │ │ │ │ ├── __init__.py │ │ │ │ └── record_classification.py │ │ │ └── __init__.py │ │ ├── encoders │ │ │ ├── __init__.py │ │ │ └── time_distributed_encoder.py │ │ └── configuration │ │ │ ├── __init__.py │ │ │ ├── allennlp_configuration.py │ │ │ └── defs.py │ ├── cli │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── evaluate.py │ │ ├── train.py │ │ └── serve.py │ ├── commons.py │ ├── mlflow_model.py │ ├── __init__.py │ ├── errors.py │ ├── metrics.py │ ├── backbone.py │ ├── text_cleaning.py │ └── vocabulary.py │ └── __init__.py ├── docs ├── docs │ ├── .vuepress │ │ ├── theme │ │ │ ├── index.js │ │ │ ├── enhanceApp.js │ │ │ ├── components │ │ │ │ ├── search.svg │ │ │ │ ├── search-orange.svg │ │ │ │ ├── Sidebar.vue │ │ │ │ ├── NavLink.vue │ │ │ │ ├── Versions.vue │ │ │ │ ├── Navbar.vue │ │ │ │ └── PageNav.vue │ │ │ ├── styles │ │ │ │ ├── fonts.styl │ │ │ │ ├── palette.styl │ │ │ │ ├── code-colors.styl │ │ │ │ └── index.styl │ │ │ └── layouts │ │ │ │ └── Layout.vue │ │ ├── public │ │ │ ├── favicon.ico │ │ │ └── assets │ │ │ │ ├── img │ │ │ │ ├── allennlp.png │ │ │ │ ├── hugging.png │ │ │ │ ├── recognai.png │ │ │ │ ├── bg.svg │ │ │ │ ├── biome-isotype.svg │ │ │ │ └── pytorch.svg │ │ │ │ └── fonts │ │ │ │ ├── justmeagaindownhere.woff │ │ │ │ ├── BasisGrotesquePro-Bold.woff │ │ │ │ ├── BasisGrotesquePro-Light.woff │ │ │ │ └── BasisGrotesquePro-Regular.woff │ │ └── config.js │ ├── documentation │ │ ├── tutorials │ │ │ └── img │ │ │ │ ├── analysis_df.png │ │ │ │ ├── hpo_tensorboard.png │ │ │ │ └── text_classifier_explore_screenshot.png │ │ ├── community │ │ │ ├── 2-get_help.md │ │ │ ├── 1-contributing.md │ │ │ └── 3-developer_guides.md │ │ ├── readme.md │ │ └── user-guides │ │ │ └── 1-nlp-tasks.md │ ├── icons │ │ ├── chev-left.svg │ │ ├── chev-right.svg │ │ └── blank.svg │ ├── api │ │ └── README.md │ └── README.md ├── biome_text_logo_for_readme.png ├── prepare_versioned_build.sh ├── package.json └── .templates │ └── config.mako ├── AUTHORS.rst ├── .github ├── ISSUE_TEMPLATE │ ├── question.md │ ├── 
feature_request.md │ └── bug_report.md └── workflows │ └── ci.yml ├── environment_dev.yml ├── .pre-commit-config.yaml ├── MANIFEST.in ├── CHANGELOG.rst ├── setup.cfg ├── Makefile ├── setup.py ├── .gitignore └── README.md /tests/docs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/text/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/biome/text/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/text/modules/heads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/text/modules/heads/classification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/biome/text/modules/heads/classification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logging.basicConfig(level=logging.INFO) 4 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/index.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | extend: '@vuepress/theme-default', 3 | } 4 | -------------------------------------------------------------------------------- /src/biome/text/cli/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributors 3 | ============ 4 | 5 | * Francisco Aranda 6 | -------------------------------------------------------------------------------- /tests/resources/data/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/tests/resources/data/test.xlsx -------------------------------------------------------------------------------- /tests/resources/data/test.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/tests/resources/data/test.parquet -------------------------------------------------------------------------------- /docs/biome_text_logo_for_readme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/biome_text_logo_for_readme.png 
-------------------------------------------------------------------------------- /docs/docs/.vuepress/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/favicon.ico -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/allennlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/img/allennlp.png -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/hugging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/img/hugging.png -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/recognai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/img/recognai.png -------------------------------------------------------------------------------- /docs/docs/documentation/tutorials/img/analysis_df.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/documentation/tutorials/img/analysis_df.png -------------------------------------------------------------------------------- /docs/docs/icons/chev-left.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/docs/icons/chev-right.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/docs/documentation/tutorials/img/hpo_tensorboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/documentation/tutorials/img/hpo_tensorboard.png -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/fonts/justmeagaindownhere.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/fonts/justmeagaindownhere.woff -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/enhanceApp.js: -------------------------------------------------------------------------------- 1 | import VClickOutside from 'v-click-outside' 2 | 3 | export default ({ Vue, options, router, siteData }) => { 4 | Vue.use(VClickOutside) 5 | } 6 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Bold.woff -------------------------------------------------------------------------------- 
/docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Light.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Light.woff -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/.vuepress/public/assets/fonts/BasisGrotesquePro-Regular.woff -------------------------------------------------------------------------------- /docs/docs/documentation/community/2-get_help.md: -------------------------------------------------------------------------------- 1 | # Getting help 2 | 3 | The best way to get help is by creating an issue on [Github](https://github.com/recognai/biome-text/issues/new/choose) 4 | -------------------------------------------------------------------------------- /src/biome/__init__.py: -------------------------------------------------------------------------------- 1 | # https://packaging.python.org/guides/packaging-namespace-packages/#pkgutil-style-namespace-packages 2 | __path__ = __import__("pkgutil").extend_path(__path__, __name__) 3 | -------------------------------------------------------------------------------- /docs/docs/documentation/tutorials/img/text_classifier_explore_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argilla-io/biome-text/HEAD/docs/docs/documentation/tutorials/img/text_classifier_explore_screenshot.png -------------------------------------------------------------------------------- /src/biome/text/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | from ..configuration import Seq2SeqEncoderConfiguration 2 | from .time_distributed_encoder import TimeDistributedEncoder 3 | 4 | Encoder = Seq2SeqEncoderConfiguration 5 | -------------------------------------------------------------------------------- /tests/text/test_text_cleaning.py: -------------------------------------------------------------------------------- 1 | from biome.text import text_cleaning 2 | 3 | 4 | def test_make_rule_callable(): 5 | clean_text = text_cleaning.strip_spaces(" This is a text \n\n") 6 | assert clean_text == "This is a text" 7 | -------------------------------------------------------------------------------- /docs/docs/api/README.md: -------------------------------------------------------------------------------- 1 | # biome.text API reference 2 | Here you can find the API reference of the `biome.text` library. 3 | 4 | Use the left-side bar to navigate through the library API or the search bar to find specific modules, classes and methods. 
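The API pages referenced above are generated from the package docstrings with pdoc3 (see the `build:api` script in `docs/package.json`). For a quick look at the same docstrings from a Python session instead of the rendered site, the standard `help()` machinery is enough — a minimal sketch, assuming `biome.text` is installed:

```python
from biome.text import Pipeline

# Prints the same docstrings that pdoc3 renders into these API pages.
help(Pipeline.from_config)
help(Pipeline.predict)
```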
5 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/search.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/search-orange.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/resources/data/nested-list.jsonl: -------------------------------------------------------------------------------- 1 | { "classification": [ { "origin": [ { "source": "WL", "key": "1038.4450287.WL" } ] }, { "origin": [ { "source": "SAP-BP", "key": "DZ_FFM.0022194281.SAP-BP" }, { "source": "DGHYP", "key": "531.9009058308.DGHYP" } ] } ]} 2 | -------------------------------------------------------------------------------- /docs/docs/icons/blank.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a question 4 | title: "[QUESTION]" 5 | labels: help wanted, question 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Check 11 | 12 | * [ ] I have tried to find a similar issue and have not found anything which solves my question. 13 | 14 | ## Description 15 | 16 | Describe what you'd like to do, problems you have encountered, unclear documentation sections, etc. 17 | -------------------------------------------------------------------------------- /tests/text/test_commons.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from biome.text.commons import ImmutableDict 4 | 5 | 6 | class TestImmutableDict: 7 | def test_cannot_mutate(self): 8 | dict = ImmutableDict(a=1, b="2", c=1000.00) 9 | 10 | with pytest.raises(TypeError): 11 | dict.f = "F" 12 | 13 | with pytest.raises(TypeError): 14 | dict.a = 100 15 | 16 | # TODO: Test a serialization/deserialization 17 | -------------------------------------------------------------------------------- /src/biome/text/modules/configuration/__init__.py: -------------------------------------------------------------------------------- 1 | # fmt: off 2 | from .allennlp_configuration import BiMpmMatchingConfiguration 3 | from .allennlp_configuration import EmbeddingConfiguration 4 | from .allennlp_configuration import FeedForwardConfiguration 5 | from .allennlp_configuration import Seq2SeqEncoderConfiguration 6 | from .allennlp_configuration import Seq2VecEncoderConfiguration 7 | from .defs import ComponentConfiguration 8 | 9 | # fmt: on 10 | -------------------------------------------------------------------------------- /environment_dev.yml: -------------------------------------------------------------------------------- 1 | name: biometext 2 | 3 | channels: 4 | - conda-forge 5 | 6 | dependencies: 7 | - python~=3.7.0 8 | - pip>=20.3.0 9 | # for building the docs 10 | - nodejs==14.15.1 11 | - pip: 12 | # testing 13 | - pytest>=6.2.0 14 | - pytest-cov>=2.10.0 15 | - pytest-pylint>=0.14.0 16 | - pytest-notebook~=0.6.0 17 | - wandb>=0.10.12 18 | - xlrd~=1.2.0 19 | # documentation 20 | - pdoc3~=0.8.1 21 | # development 22 | - pre-commit~=2.9.0 23 | 
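As a quick sanity check that the development environment defined above resolved correctly, the sketch below prints the versions of the main development tools. It relies on `pkg_resources`, the same lookup the package itself uses in `src/biome/text/__init__.py`; the tool names mirror the pins above and may drift between releases:

```python
import pkg_resources

# Development tools pinned in environment_dev.yml
for dist_name in ("pytest", "pytest-cov", "pdoc3", "pre-commit"):
    try:
        print(dist_name, pkg_resources.get_distribution(dist_name).version)
    except pkg_resources.DistributionNotFound:
        print(f"{dist_name} is missing from the environment")
```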
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
3 |     rev: v2.3.0
4 |     hooks:
5 |       - id: check-yaml
6 |       - id: end-of-file-fixer
7 |       - id: trailing-whitespace
8 |   - repo: https://github.com/psf/black
9 |     rev: 20.8b1
10 |     hooks:
11 |       - id: black
12 |   - repo: https://github.com/pycqa/isort
13 |     rev: 5.6.4
14 |     hooks:
15 |       - id: isort
16 |
--------------------------------------------------------------------------------
/tests/text/test_cli.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from biome.text import Pipeline
4 | from biome.text.cli.serve import _serve
5 |
6 |
7 | @pytest.mark.skip("Please execute this test manually and check your localhost:9999")
8 | def test_serve():
9 |     """This test still needs to be automated!"""
10 |     pipeline = Pipeline.from_config(
11 |         {
12 |             "name": "serve_test",
13 |             "head": {"type": "TextClassification", "labels": ["a", "b"]},
14 |         }
15 |     )
16 |
17 |     _serve(pipeline)
18 |
--------------------------------------------------------------------------------
/src/biome/text/commons.py:
--------------------------------------------------------------------------------
1 | class ImmutableDict(dict):
2 |     """Immutable version of Python's dict type"""
3 |
4 |     def __hash__(self):
5 |         return id(self)
6 |
7 |     def _immutable(self, *args, **kws):
8 |         raise TypeError("object is immutable")
9 |
10 |     __setitem__ = _immutable
11 |     __delitem__ = _immutable
12 |     __setattr__ = _immutable
13 |
14 |     clear = _immutable
15 |     update = _immutable
16 |     setdefault = _immutable
17 |     pop = _immutable
18 |     popitem = _immutable
19 |
--------------------------------------------------------------------------------
/tests/text/test_features_configuration.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from allennlp.common import Params
3 | from allennlp.common.checks import ConfigurationError
4 |
5 | from biome.text.configuration import FeaturesConfiguration
6 |
7 |
8 | def test_non_configurable_features():
9 |     wrong_config = dict(ner=dict(embedding=15))
10 |     with pytest.raises(TypeError):
11 |         FeaturesConfiguration(**wrong_config)
12 |
13 |     with pytest.raises(ConfigurationError):
14 |         FeaturesConfiguration.from_params(Params(wrong_config))
15 |
--------------------------------------------------------------------------------
/tests/resources/data/to-be-flattened.jsonl:
--------------------------------------------------------------------------------
1 | { "a": "ajj lsd", "complexData": { "a": "a", "b":"b"}, "persons": [{"name": "Frank", "lastName": "Rubber"}, {"name": "Thomas", "lastName": "Sabo"}]}
2 | { "a": "ajj lsd", "complexData": { "a": "a", "b":"b"}, "persons": [{"name": "Anthony", "lastName": "Rubber"}]}
3 | { "a": "ajj lsd", "complexData": { "a": "a", "b":"b"}, "persons": [{"name": "Peter", "lastName": "Gabriel"}, {"name": "Thomas", "lastName": "Sabo"}]}
4 | { "a": "ajj lsd", "complexData": { "a": "a", "b":"b"}, "persons": [{"name": "Lucien", "lastName": "Pasteque"}, {"name": "Thomas", "lastName": "Bowler"}]}
5 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # https://docs.python.org/3.7/distutils/sourcedist.html#specifying-the-files-to-distribute
2 | include AUTHORS.rst
3 | include CHANGELOG.rst
4 | include docker-compose.yml
5 | include LICENSE.txt
6 | include Makefile
7 | include README.md
8 | include setup.cfg
9 |
10 | recursive-include docker *
11 | recursive-include docs *
12 | recursive-include src/biome *
13 | recursive-include tests *
14 |
15 | prune build
16 | prune docs/_build
17 | prune docs/_static
18 | prune docs/node_modules
19 | prune docs/api
20 | prune dist
21 | prune tests/**/htmlcov
22 | prune tests/mlruns
23 | prune tests/runs
24 | prune tests/output
25 |
26 | global-exclude *.pyc *.o
27 |
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 |
5 | Version 2.0
6 | ===========
7 |
8 | - Replaced `DataSource` with `Dataset`
9 | - Vocabulary creation now happens automatically when executing `Pipeline.train()`
10 | - Introduced the `TuneExperiment` class
11 | - Added the *transformers* feature
12 | - Moved the `Pipeline.explore()` command to its own module
13 | - `Pipeline.train()` modifies the pipeline in place instead of creating a copy for training
14 | - `TokenClassification` accepts entities
15 | - Added a `RelationClassification` head
16 | - A lot of minor and not-so-minor changes ...
17 |
18 | Version 1.0
19 | ===========
20 |
21 | - Introduced the *pipeline, backbone, head* concept
22 |
--------------------------------------------------------------------------------
/src/biome/text/cli/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import click
5 | from click import Group
6 |
7 | from .evaluate import evaluate
8 | from .serve import serve
9 | from .train import train
10 |
11 | SUPPORTED_COMMANDS = [train, evaluate, serve]
12 |
13 |
14 | def main():
15 |     _add_project_modules_to_sys_path()
16 |
17 |     commands = Group(no_args_is_help=True)
18 |     for command in SUPPORTED_COMMANDS:
19 |         commands.add_command(command, command.name)
20 |     click.CommandCollection(sources=[commands])()
21 |
22 |
23 | def _add_project_modules_to_sys_path():
24 |     """Adds the current working directory to ``sys.path`` so user-defined modules in the project location can be loaded"""
25 |     sys.path.append(os.getcwd())
26 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
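To make the Version 2.0 entries in the changelog above concrete (`Dataset` replacing `DataSource`, vocabulary creation handled automatically during training), here is a minimal end-to-end sketch. It mirrors the fixtures used in `tests/text/test_pipeline_copy.py`; the toy data, labels and output path are purely illustrative:

```python
from biome.text import Dataset
from biome.text import Pipeline
from biome.text import Trainer
from biome.text import TrainerConfiguration

# Toy data and pipeline, matching the shapes used in the test suite
dataset = Dataset.from_dict({"text": ["this is", "a test"], "label": ["a", "b"]})
pipeline = Pipeline.from_config(
    {
        "name": "changelog_quickstart",
        "head": {"type": "TextClassification", "labels": ["a", "b"]},
    }
)

trainer = Trainer(
    pipeline=pipeline,
    train_dataset=dataset,
    trainer_config=TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0),
)
# Vocabulary creation happens automatically before the training starts
trainer.fit("quickstart_output")
```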
21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ## To Reproduce 14 | Steps to reproduce the behavior: 15 | 1. Code snippet or gist. 16 | 2. Error message(s) if applicable 17 | 18 | ## Expected behavior 19 | A clear and concise description of what you expected to happen. 20 | 21 | ## Screenshots 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | ## OS environment 25 | - OS: [e.g. Linux / Windows / macOS] 26 | - biome.text Version [e.g. 1.0.0] 27 | 28 | ## Additional context 29 | Add any other context about the problem here. 30 | -------------------------------------------------------------------------------- /tests/text/test_metrics.py: -------------------------------------------------------------------------------- 1 | from allennlp.data import Vocabulary 2 | 3 | from biome.text.metrics import Metrics 4 | 5 | 6 | def test_metrics(): 7 | metrics = Metrics( 8 | accuracy={"type": "categorical_accuracy"}, 9 | f1={ 10 | "type": "span_f1", 11 | "vocabulary": Vocabulary.empty(), 12 | }, 13 | ) 14 | 15 | # Check that training and validation metrics are different instances 16 | assert ( 17 | metrics.get_dict()["accuracy"] 18 | is not metrics.get_dict(is_train=False)["accuracy"] 19 | ) 20 | # Check if we share the same vocab 21 | assert ( 22 | metrics.get_dict()["f1"]._label_vocabulary 23 | is metrics.get_dict(is_train=False)["f1"]._label_vocabulary 24 | ) 25 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/styles/fonts.styl: -------------------------------------------------------------------------------- 1 | @font-face 2 | font-family: 'Basis Grotesque Pro' 3 | font-style: normal 4 | font-weight: normal 5 | src: local('Basis Grotesque Pro'), url('/biome-text/master/assets/fonts/BasisGrotesquePro-Regular.woff') format('woff') 6 | 7 | 8 | @font-face 9 | font-family: 'Basis Grotesque Pro Bold' 10 | font-style: normal 11 | font-weight: normal 12 | src: local('Basis Grotesque Pro Bold'), url('/biome-text/master/assets/fonts/BasisGrotesquePro-Bold.woff') format('woff') 13 | 14 | 15 | @font-face 16 | font-family: 'Basis Grotesque Pro Light' 17 | font-style: normal 18 | font-weight: normal 19 | src: local('Basis Grotesque Pro Light'), url('/biome-text/master/assets/fonts/BasisGrotesquePro-Light.woff') format('woff') 20 | -------------------------------------------------------------------------------- /docs/docs/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | home: true 3 | navImage: /assets/img/biome.svg 4 | heroImage: /assets/img/biome-isotype.svg 5 | heroText: biome. 6 | heroSubText: text 7 | tagline: Practical NLP open source library 8 | actionText: Get Started 9 | actionLink: /documentation/ 10 | features: 11 | - title: Easy to use 12 | details: Create natural language processing custom models with powerful building blocks and simple workflows. 13 | - title: Powerful 14 | details: Benefit from the latest research and models in NLP powered by PyTorch, AllenNLP and Huggingface. 
15 | img1: /assets/img/pytorch.svg 16 | img2: /assets/img/allennlp.png 17 | img3: /assets/img/hugging.png 18 | - title: Industry-ready 19 | details: Easily package and serve your models in production. 20 | footer: Maintained by 21 | --- 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | # Options for py.test: 3 | # Specify command line options as you would do when invoking py.test directly. 4 | # e.g. --cov-report html (or xml) for html/xml output or --junitxml junit.xml 5 | # in order to write a coverage file that can be read by Jenkins. 6 | testpaths = tests 7 | addopts = --color yes --cov biome --cov-report html --verbose 8 | 9 | [flake8] 10 | # Some sane defaults for the code style checker flake8 11 | exclude = 12 | .tox 13 | build 14 | dist 15 | .eggs 16 | docs/conf.py 17 | 18 | [pylint] 19 | max-line-length = 120 20 | disable = C0330,C0111,C0303,C0415,R0801 21 | skip = docs/*.py 22 | output-format = colorized 23 | generated-members = numpy.*,torch.* 24 | score = y 25 | reports = n 26 | 27 | [isort] 28 | profile = black 29 | force_single_line = True 30 | -------------------------------------------------------------------------------- /tests/text/modules/configuration/test_component_configuration.py: -------------------------------------------------------------------------------- 1 | from biome.text import helpers 2 | from biome.text.modules.heads.classification.text_classification import ( 3 | TextClassification, 4 | ) 5 | from biome.text.modules.heads.classification.text_classification import ( 6 | TextClassificationConfiguration, 7 | ) 8 | 9 | 10 | def test_component_spec_config_with_type(): 11 | head = TextClassificationConfiguration( 12 | pooler="boe", 13 | labels=[ 14 | "toxic", 15 | "severe_toxic", 16 | "obscene", 17 | "threat", 18 | "insult", 19 | "identity_hate", 20 | ], 21 | multilabel=True, 22 | ) 23 | 24 | assert "type" in head.config 25 | assert head.config["type"] == helpers.get_full_class_name(TextClassification) 26 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_save.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from numpy.testing import assert_allclose 3 | 4 | from biome.text import Pipeline 5 | 6 | 7 | @pytest.fixture 8 | def pipeline(): 9 | return Pipeline.from_config( 10 | { 11 | "name": "test_pipeline_copy", 12 | "head": {"type": "TextClassification", "labels": ["a", "b"]}, 13 | } 14 | ) 15 | 16 | 17 | def test_save(pipeline, tmp_path): 18 | pipeline.save(tmp_path) 19 | 20 | assert (tmp_path / "model.tar.gz").is_file() 21 | 22 | expected_prediction = pipeline.predict("test") 23 | prediction = Pipeline.from_pretrained(tmp_path / "model.tar.gz").predict("test") 24 | 25 | assert prediction["labels"] == expected_prediction["labels"] 26 | assert_allclose(prediction["probabilities"], expected_prediction["probabilities"]) 27 | -------------------------------------------------------------------------------- /src/biome/text/mlflow_model.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import pandas as pd 3 | 4 | 5 | class BiomeTextModel(mlflow.pyfunc.PythonModel): 6 | """A custom MLflow model with the 'python_function' flavor for biome.text pipelines. 7 | 8 | This class is used by the `Pipeline.to_mlflow()` method. 
9 | """ 10 | 11 | ARTIFACT_CONTEXT = "model" 12 | 13 | def __init__(self): 14 | self.pipeline = None 15 | 16 | def load_context(self, context): 17 | from biome.text import Pipeline 18 | 19 | self.pipeline = Pipeline.from_pretrained( 20 | context.artifacts[self.ARTIFACT_CONTEXT] 21 | ) 22 | 23 | def predict(self, context, dataframe: pd.DataFrame): 24 | batch = dataframe.to_dict(orient="records") 25 | predictions = self.pipeline.predict(batch=batch) 26 | 27 | return pd.DataFrame(predictions) 28 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/styles/palette.styl: -------------------------------------------------------------------------------- 1 | // font family 2 | @import url('https://fonts.googleapis.com/css2?family=Open+Sans&display=swap') 3 | @require "./fonts.styl" 4 | $primaryFontFamily = 'Basis Grotesque Pro' 5 | $secondaryFontFamily = 'Open Sans' 6 | $handMadeFontFamily = 'Just Me Again Down Here', cursive 7 | 8 | // colors 9 | $accentColor = #F38959 10 | $textColorLight = #686A6D 11 | $textColor = #4A4A4A 12 | $borderColor = #D8D8D8 13 | $codeBgColor = #4A4A4A 14 | $arrowBgColor = #ccc 15 | $badgeTipColor = #9013FE 16 | $badgeWarningColor = darken(#ffe564, 35%) 17 | $badgeErrorColor = #DA5961 18 | $sidebarBgColor = #F5F5F6 19 | $codePillColor = #F0E7FF 20 | $yellow = #F8D11C 21 | $green = #6ACE91 22 | $red = #FF1E5E 23 | 24 | // layout 25 | $navbarHeight = 3.6rem 26 | $sidebarWidth = 20rem 27 | $contentWidth = 740px 28 | $homePageWidth = 960px 29 | 30 | // responsive breakpoints 31 | $MQNarrow = 959px 32 | $MQMobile = 719px 33 | $MQMobileNarrow = 419px 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default dev check test ui build_ui docs build_docs dist 2 | default: help 3 | 4 | dev: ## install package in development mode 5 | @pip install --upgrade -e . 
6 | @pre-commit install 7 | 8 | check: ## applies a code pylint with autopep8 reformating 9 | @pre-commit run --all-files 10 | @pylint --exit-zero --rcfile=setup.cfg --unsafe-load-any-extension=y src 11 | 12 | test: ## launch package tests 13 | @python -m pytest 14 | @python -m pytest --doctest-modules src/biome/text 15 | 16 | docs: ## serve the documentation for development 17 | @cd docs && npm install && npm run dev:site 18 | 19 | build_docs: ## build the documentation 20 | @cd docs && npm install && npm run build:site 21 | 22 | dist: ## build a package distribution 23 | @python setup.py sdist bdist_wheel 24 | 25 | 26 | .PHONY: help 27 | help: 28 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 29 | -------------------------------------------------------------------------------- /src/biome/text/modules/encoders/time_distributed_encoder.py: -------------------------------------------------------------------------------- 1 | from allennlp.modules import Seq2SeqEncoder 2 | from allennlp.modules import TimeDistributed 3 | 4 | 5 | class TimeDistributedEncoder(Seq2SeqEncoder): 6 | """Wraps a Seq2SeqEncoder into a TimeDistributed module and implements the Seq2SeqEncoder API""" 7 | 8 | def __init__(self, encoder: Seq2SeqEncoder): 9 | super(TimeDistributedEncoder, self).__init__() 10 | 11 | self._input_dim = encoder.get_input_dim() 12 | self._output_dim = encoder.get_output_dim() 13 | self._is_bidirectional = ( 14 | hasattr(encoder, "is_bidirectional") and encoder.is_bidirectional() 15 | ) 16 | 17 | self._encoder = TimeDistributed(encoder) 18 | 19 | def forward(self, *input, **inputs): 20 | return self._encoder(*input, **inputs) 21 | 22 | def is_bidirectional(self) -> bool: 23 | return self._is_bidirectional 24 | 25 | def get_output_dim(self) -> int: 26 | return self._output_dim 27 | 28 | def get_input_dim(self): 29 | return self._input_dim 30 | -------------------------------------------------------------------------------- /docs/docs/documentation/readme.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | For the installation we recommend setting up a fresh [conda](https://docs.conda.io/en/latest/miniconda.html) environment: 4 | 5 | ```shell script 6 | conda create -n biome python~=3.7.0 pip>=20.3.0 7 | conda activate biome 8 | ``` 9 | 10 | Once the conda environment is activated, you can install the latest release or the development version via pip. 11 | 12 | ## Latest release (recommended) 13 | 14 | To install the latest release of *biome.text* type in: 15 | 16 | ````shell script 17 | pip install -U biome-text 18 | ```` 19 | 20 | After installing *biome.text*, the best way to test your installation is by running the *biome.text* cli command: 21 | 22 | ```shell script 23 | biome --help 24 | ``` 25 | 26 | ## Master branch 27 | 28 | The *master branch* contains the latest features, but is less well tested. 
29 | If you are looking for a specific feature that has not been released yet, you can install the package from our master branch with: 30 | 31 | ````shell script 32 | pip install -U git+https://github.com/recognai/biome-text.git 33 | ```` 34 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/styles/code-colors.styl: -------------------------------------------------------------------------------- 1 | :not(pre)>code[class*=language-], pre[class*=language-], div[class*="language-"], .theme-default-content pre, .theme-default-content pre[class*=language-] 2 | background #F5F5F6 3 | 4 | div[class*="language-"] .highlight-lines .highlighted 5 | background-color rgba(0, 0, 0, 0.08) 6 | 7 | .theme-default-content pre code, .theme-default-content pre[class*="language-"] code 8 | color #000000 !important 9 | 10 | .token.atrule, .token.builtin, .token.important, .token.keyword, .token.selector 11 | color #A9261B !important 12 | 13 | div[class*="language-"]::before 14 | color #999 !important 15 | 16 | .token.punctuation 17 | color #C58D09 !important 18 | 19 | .token.entity, .token.operator, .token.url 20 | color #004898 !important 21 | 22 | .token.boolean, .token.number, .token.function 23 | color #4C10BC !important 24 | 25 | .token.string, .token.char, .token.attr-value, .token.regex, .token.variable 26 | color #429E9E !important 27 | 28 | .token.property, .token.class-name, .token.constant, .token.symbol 29 | color #67BF89 !important 30 | -------------------------------------------------------------------------------- /src/biome/text/modules/configuration/allennlp_configuration.py: -------------------------------------------------------------------------------- 1 | from allennlp.modules import BiMpmMatching 2 | from allennlp.modules import Embedding 3 | from allennlp.modules import FeedForward 4 | from allennlp.modules import Seq2SeqEncoder 5 | from allennlp.modules import Seq2VecEncoder 6 | 7 | from .defs import ComponentConfiguration 8 | 9 | 10 | class Seq2VecEncoderConfiguration(ComponentConfiguration[Seq2VecEncoder]): 11 | """Layer spec for Seq2VecEncoder components""" 12 | 13 | pass 14 | 15 | 16 | class Seq2SeqEncoderConfiguration(ComponentConfiguration[Seq2SeqEncoder]): 17 | """Layer spec for Seq2SeqEncoder components""" 18 | 19 | pass 20 | 21 | 22 | class FeedForwardConfiguration(ComponentConfiguration[FeedForward]): 23 | """Layer spec for FeedForward components""" 24 | 25 | pass 26 | 27 | 28 | class BiMpmMatchingConfiguration(ComponentConfiguration[BiMpmMatching]): 29 | """Layer spec for BiMpmMatching components""" 30 | 31 | pass 32 | 33 | 34 | class EmbeddingConfiguration(ComponentConfiguration[Embedding]): 35 | """Layer spec for Embedding components""" 36 | 37 | pass 38 | -------------------------------------------------------------------------------- /src/biome/text/__init__.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | try: 4 | __version__ = pkg_resources.get_distribution("biome-text").version 5 | except pkg_resources.DistributionNotFound: 6 | # package is not installed 7 | pass 8 | 9 | import logging 10 | 11 | # configure basic 'biome.text' logging 12 | _handler = logging.StreamHandler() 13 | _handler.setFormatter( 14 | logging.Formatter("%(levelname)s:%(name)s: %(message)s") 15 | ) # "%(levelname)s: %(message)s")) 16 | _LOGGER = logging.getLogger(__name__) 17 | _LOGGER.addHandler(_handler) 18 | _LOGGER.setLevel("INFO") 19 | # configure 'allennlp' logging 
20 | _ALLENNLP_LOGGER = logging.getLogger("allennlp") 21 | _ALLENNLP_LOGGER.addHandler(_handler) 22 | _ALLENNLP_LOGGER.setLevel("WARNING") 23 | 24 | # TODO: Remove this hack when allennlp 1.8.0 is out 25 | import transformers 26 | 27 | transformers.__spec__ = "" 28 | 29 | from .configuration import PipelineConfiguration 30 | from .configuration import TrainerConfiguration 31 | from .configuration import VocabularyConfiguration 32 | from .dataset import Dataset 33 | from .pipeline import Pipeline 34 | from .trainer import Trainer 35 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | 7 | def pytest_configure(config): 8 | # It's really hard to do testing with wandb enabled ... 9 | os.environ["WANDB_MODE"] = "disabled" 10 | 11 | 12 | @pytest.fixture 13 | def resources_path() -> Path: 14 | return Path(__file__).parent / "resources" 15 | 16 | 17 | @pytest.fixture 18 | def resources_data_path(resources_path) -> Path: 19 | return resources_path / "data" 20 | 21 | 22 | @pytest.fixture 23 | def tutorials_path() -> Path: 24 | repo_root = Path(__file__).parent.parent 25 | return repo_root / "docs" / "docs" / "documentation" / "tutorials" 26 | 27 | 28 | @pytest.fixture 29 | def configurations_path() -> Path: 30 | repo_root = Path(__file__).parent.parent 31 | return ( 32 | repo_root 33 | / "docs" 34 | / "docs" 35 | / "documentation" 36 | / "user-guides" 37 | / "2-configuration.md" 38 | ) 39 | 40 | 41 | @pytest.fixture 42 | def change_to_tmp_working_dir(tmp_path) -> Path: 43 | cwd = os.getcwd() 44 | os.chdir(tmp_path) 45 | yield tmp_path 46 | os.chdir(cwd) 47 | -------------------------------------------------------------------------------- /tests/resources/data/dataset_source.csv: -------------------------------------------------------------------------------- 1 | age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y 2 | 44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,210,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0 3 | 53,technician,married,unknown,no,no,no,cellular,nov,fri,138,1,999,0,nonexistent,-0.1,93.2,-42,4.021,5195.8,0 4 | 28,management,single,university.degree,no,yes,no,cellular,jun,thu,339,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1 5 | 39,services,married,high.school,no,no,no,cellular,apr,fri,185,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0 6 | 55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,137,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1 7 | 30,management,divorced,basic.4y,no,yes,no,cellular,jul,tue,68,8,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,0 8 | 37,blue-collar,married,basic.4y,no,yes,no,cellular,may,thu,204,1,999,0,nonexistent,-1.8,92.893,-46.2,1.327,5099.1,0 9 | 39,blue-collar,divorced,basic.9y,no,yes,no,cellular,may,fri,191,1,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,0 10 | 36,admin.,married,university.degree,no,no,no,cellular,jun,mon,174,1,3,1,success,-2.9,92.963,-40.8,1.266,5076.2,1 11 | -------------------------------------------------------------------------------- /tests/resources/data/dataset_sequence.jsonl: -------------------------------------------------------------------------------- 1 | {"hypothesis": "Irmalotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt 
Irmalotte-S92@freemail.de 22.4.1992 01636496234", "premise": " DE Frau Dr. Iramlotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 17. Juli 1967 01636496234", "label": "duplicate"} 2 | {"hypothesis": "Irmalotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 22.4.1992 01636496234", "premise": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", "label": "not_duplicate"} 3 | {"hypothesis": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", "premise": "Frau Dr. Iramlotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 17. Juli 1967 01636496234", "label": "not_duplicate"} 4 | {"hypothesis": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", "premise": " DE Herr Karlheinz Hofamnn Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3. October 52 0152359493301", "label": "duplicate"} 5 | -------------------------------------------------------------------------------- /docs/prepare_versioned_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | print_help(){ 4 | echo "Usage: bash" "$0" 5 | echo "" 6 | echo " Small bash script to prepare the docs for a _versioned_ build." 7 | echo "" 8 | echo " The environment variable BIOME_TEXT_DOC_VERSION must be set!" 9 | echo " This env variable must match the release tag (e.g. v2.2.0 or v2.2.0rc1)." 10 | } 11 | 12 | if [ "$1" == "--help" ] || [ "$1" == "-h" ]; then 13 | print_help 14 | exit 0 15 | fi 16 | 17 | if [ -z "$BIOME_TEXT_DOC_VERSION" ]; then 18 | echo "ERROR: BIOME_TEXT_DOC_VERSION not set!" 19 | print_help 20 | exit 1 21 | fi 22 | 23 | 24 | echo " - Modifying font urls ..." 25 | 26 | if ! sed -i "s|/biome-text/master/|/biome-text/$BIOME_TEXT_DOC_VERSION/|g" ./docs/.vuepress/theme/styles/fonts.styl; then 27 | echo "ERROR: Could not modify 'fonts.styl'!" 28 | exit 1 29 | fi 30 | 31 | 32 | echo " - Modifying tutorials ..." 33 | 34 | modified=$(find ./docs/documentation/tutorials -maxdepth 1 -name "*.ipynb" \ 35 | -exec sed -i -e "s|pip install -U git+https://github.com/recognai/biome-text.git|pip install -U biome-text|g" \ 36 | -e "s|/biome-text/master/|/biome-text/$BIOME_TEXT_DOC_VERSION/|g" \ 37 | -e "s|/biome-text/blob/master/|/biome-text/blob/$BIOME_TEXT_DOC_VERSION/|g" {} \; \ 38 | -exec echo {} \; | wc -l) 39 | if [ "$modified" -eq 0 ]; then 40 | echo "ERROR: No tutorials modified!" 41 | exit 1 42 | fi 43 | 44 | 45 | echo " - Done!" 46 | 47 | exit 0 48 | -------------------------------------------------------------------------------- /src/biome/text/errors.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException 2 | 3 | 4 | class BaseError(Exception): 5 | """Base error. 
This class could include common error attributes or methods""" 6 | 7 | pass 8 | 9 | 10 | class ValidationError(BaseError): 11 | """Base error for data validation""" 12 | 13 | pass 14 | 15 | 16 | class WrongInputError(ValidationError): 17 | """Error related with input params""" 18 | 19 | def __init__(self, arg_name: str): 20 | super(WrongInputError, self).__init__() 21 | self.arg_name = arg_name 22 | 23 | def __str__(self) -> str: 24 | return f"Wrong model input '{self.arg_name}'" 25 | 26 | 27 | class ActionNotSupportedError(ValidationError): 28 | """Raised when an action is not supported for a given component state""" 29 | 30 | 31 | class EmptyVocabError(ValidationError): 32 | """Error related with using empty vocabs for a training""" 33 | 34 | pass 35 | 36 | 37 | class WrongValueError(ValidationError): 38 | """Wrong value error""" 39 | 40 | 41 | class http_error_handling: 42 | """Error handling for http error transcription""" 43 | 44 | def __enter__(self): 45 | pass 46 | 47 | def __exit__(self, exc_type, exc_val, exc_tb): 48 | if isinstance(exc_val, ValidationError): 49 | raise HTTPException(status_code=400, detail=str(exc_val)) 50 | if isinstance(exc_val, Exception): 51 | # Common http error handling 52 | raise HTTPException(status_code=500, detail=str(exc_val)) 53 | -------------------------------------------------------------------------------- /src/biome/text/modules/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification.doc_classification import DocumentClassification 2 | from .classification.doc_classification import DocumentClassificationConfiguration 3 | from .classification.record_classification import RecordClassification 4 | from .classification.record_classification import RecordClassificationConfiguration 5 | from .classification.record_pair_classification import RecordPairClassification 6 | from .classification.record_pair_classification import ( 7 | RecordPairClassificationConfiguration, 8 | ) 9 | from .classification.relation_classification import RelationClassification 10 | from .classification.relation_classification import RelationClassificationConfiguration 11 | from .classification.text_classification import TextClassification 12 | from .classification.text_classification import TextClassificationConfiguration 13 | from .language_modelling import LanguageModelling 14 | from .language_modelling import LanguageModellingConfiguration 15 | from .task_head import TaskHead 16 | from .task_head import TaskHeadConfiguration 17 | from .task_head import TaskName 18 | from .task_head import TaskPrediction 19 | from .token_classification import TokenClassification 20 | from .token_classification import TokenClassificationConfiguration 21 | 22 | for head in [ 23 | TextClassification, 24 | TokenClassification, 25 | DocumentClassification, 26 | RecordClassification, 27 | LanguageModelling, 28 | RecordPairClassification, 29 | RelationClassification, 30 | ]: 31 | head.register(overrides=True) 32 | -------------------------------------------------------------------------------- /tests/resources/data/dataset_sequence.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.1.0", 3 | "data": [ 4 | { 5 | "hypothesis": "Irmalotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 22.4.1992 01636496234", 6 | "premise": " DE Frau Dr. Iramlotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 17. 
Juli 1967 01636496234", 7 | "label": "duplicate" 8 | }, 9 | { 10 | "hypothesis": "Irmalotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 22.4.1992 01636496234", 11 | "premise": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", 12 | "label": "not_duplicate" 13 | }, 14 | { 15 | "hypothesis": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", 16 | "premise": "Frau Dr. Iramlotte Schneider Poggenburg 4 48485 Neuenkirchen, Kreis Steinfurt Irmalotte-S92@freemail.de 17. Juli 1967 01636496234", 17 | "label": "not_duplicate" 18 | }, 19 | { 20 | "hypothesis": "Herr Karlheinz Hofmann Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3.10.1952 0152359493301", 21 | "premise": " DE Herr Karlheinz Hofamnn Seglerweg 5 48485 Neuenkirchen, Kreis Steinfurt karlheinz-hofmann@hotmail.de 3. October 52 0152359493301", 22 | "label": "duplicate" 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /docs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "docs", 3 | "version": "1.0.0", 4 | "description": "Documentation", 5 | "private": true, 6 | "env": { 7 | "apipath": "docs/api", 8 | "package": "biome.text", 9 | "templates_path": ".templates/" 10 | }, 11 | "scripts": { 12 | "clean:api": "rm -rf $npm_package_env_apipath/biome", 13 | "build:api": "pdoc -o $npm_package_env_apipath $npm_package_env_package --force --template-dir $npm_package_env_templates_path --html", 14 | "rename:index": "find $npm_package_env_apipath | renamer --find index.md --replace README.md", 15 | "rename:html": "find $npm_package_env_apipath | renamer --find .html --replace .md", 16 | "rename": "npm run rename:html && npm run rename:index", 17 | "build:tutorials": "find docs/documentation/tutorials -iname *.ipynb -maxdepth 1 -exec jupyter nbconvert --to markdown {} \\;", 18 | "build:docs": "npm run clean:api && npm run build:api && npm run build:tutorials && npm run rename", 19 | "build:site": "npm run build:docs && vuepress build docs", 20 | "dev:site": "npm run build:docs && vuepress dev docs", 21 | "docs:dev": "npm run dev:site", 22 | "docs:svgo": "vuepress svgo docs" 23 | }, 24 | "devDependencies": { 25 | "@goy/vuepress-plugin-svg-icons": "^4.1.0", 26 | "@vuepress/plugin-active-header-links": "^1.4.1", 27 | "@vuepress/plugin-back-to-top": "^1.4.1", 28 | "renamer": "^2.0.0", 29 | "vuepress": "^1.4.1" 30 | }, 31 | "dependencies": { 32 | "axios": ">=0.21.1", 33 | "v-click-outside": "^3.1.2", 34 | "vuepress-bar": "^0.3.0" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_copy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from numpy.testing import assert_allclose 3 | 4 | from biome.text import Dataset 5 | from biome.text import Pipeline 6 | from biome.text import Trainer 7 | from biome.text import TrainerConfiguration 8 | 9 | 10 | @pytest.fixture 11 | def pipeline(): 12 | return Pipeline.from_config( 13 | { 14 | "name": "test_pipeline_copy", 15 | "head": { 16 | "type": "TextClassification", 17 | "labels": ["a", "b"], 18 | }, 19 | } 20 | ) 21 | 22 | 23 | @pytest.fixture 24 | def dataset(): 25 | return Dataset.from_dict( 26 | { 27 | "text": ["this is", "a test"], 28 | "label": ["a", 
"b"], 29 | } 30 | ) 31 | 32 | 33 | def test_copy(pipeline): 34 | prediction = pipeline.predict("check this") 35 | pipeline_copy = pipeline.copy() 36 | prediction_copy = pipeline_copy.predict("check this") 37 | 38 | assert_allclose(prediction["probabilities"], prediction_copy["probabilities"]) 39 | 40 | 41 | def test_train_from_pretrained(pipeline, dataset, tmp_path): 42 | output_path = tmp_path / "test_train_from_pretrained_output" 43 | trainer_config = TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0) 44 | trainer = Trainer( 45 | pipeline=pipeline, train_dataset=dataset, trainer_config=trainer_config 46 | ) 47 | trainer.fit(output_path) 48 | 49 | prediction = pipeline.predict("a test") 50 | pipeline_loaded = Pipeline.from_pretrained(output_path / "model.tar.gz") 51 | prediction_loaded = pipeline_loaded.predict("a test") 52 | 53 | assert_allclose(prediction["probabilities"], prediction_loaded["probabilities"]) 54 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/Sidebar.vue: -------------------------------------------------------------------------------- 1 | 19 | 20 | 33 | 34 | 71 | -------------------------------------------------------------------------------- /src/biome/text/metrics.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Any 3 | from typing import Dict 4 | 5 | from allennlp.common import Params 6 | from allennlp.training.metrics import Metric 7 | 8 | 9 | class Metrics: 10 | """Stores two dictionaries of identical metrics, one for training and one for validation. 11 | 12 | Parameters 13 | ---------- 14 | **kwargs 15 | The key defines the name of the metric, the value must be a dictionary that can be used to instantiate a 16 | child class of `allennlp.training.metrics.Metric` via its `from_params` method. 17 | 18 | Examples 19 | -------- 20 | >>> from allennlp.training.metrics import Metric 21 | >>> metrics = Metrics(accuracy={"type": "categorical_accuracy"}, f1={"type": "fbeta"}) 22 | >>> for metric in metrics.get_dict(is_train=False).values(): 23 | ... 
assert isinstance(metric, Metric)
24 |     """
25 |
26 |     def __init__(self, **kwargs: Dict[str, Any]):
27 |         self.training_metrics = {}
28 |         self.validation_metrics = {}
29 |         for name, metric_kwargs in kwargs.items():
30 |             # We need special logic for the vocabulary: we do not want to deep-copy it,
31 |             # and it cannot be used in Params
32 |             vocab = metric_kwargs.pop("vocabulary", None)
33 |             self.training_metrics[name] = Metric.from_params(
34 |                 Params(copy.deepcopy(metric_kwargs)),
35 |                 **{} if vocab is None else {"vocabulary": vocab}
36 |             )
37 |             self.validation_metrics[name] = Metric.from_params(
38 |                 Params(metric_kwargs), **{} if vocab is None else {"vocabulary": vocab}
39 |             )
40 |
41 |     def get_dict(self, is_train: bool = True) -> Dict[str, Metric]:
42 |         if is_train:
43 |             return self.training_metrics
44 |         return self.validation_metrics
45 |
--------------------------------------------------------------------------------
/src/biome/text/cli/evaluate.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import click
4 |
5 | from biome.text import Pipeline
6 | from biome.text.cli.train import dataset_from_path
7 |
8 |
9 | @click.command()
10 | @click.argument(
11 |     "pipeline_path",
12 |     type=click.Path(exists=True),
13 | )
14 | @click.option(
15 |     "--output",
16 |     "-o",
17 |     type=click.Path(),
18 |     required=True,
19 |     help="Path to write the evaluation metrics to.",
20 | )
21 | @click.option(
22 |     "--dataset",
23 |     "-ds",
24 |     type=click.Path(exists=True),
25 |     required=True,
26 |     help="Path to the dataset.",
27 | )
28 | @click.option(
29 |     "--batch_size",
30 |     "-bs",
31 |     type=int,
32 |     default=16,
33 |     show_default=True,
34 |     help="Batch size during evaluation.",
35 | )
36 | @click.option(
37 |     "--lazy",
38 |     "-l",
39 |     type=bool,
40 |     default=False,
41 |     show_default=True,
42 |     help="If true, data is lazily loaded from disk; otherwise it is loaded into memory.",
43 | )
44 | @click.option(
45 |     "--prediction_output",
46 |     "-po",
47 |     type=click.Path(),
48 |     default=None,
49 |     help="Write batch predictions to this file.",
50 | )
51 | def evaluate(
52 |     pipeline_path: str,
53 |     output: str,
54 |     dataset: str,
55 |     batch_size: int = 16,
56 |     lazy: bool = False,
57 |     prediction_output: Optional[str] = None,
58 | ) -> None:
59 |     """Evaluate a pipeline on a given dataset.
60 |
61 |     PIPELINE_PATH is the path to a pretrained pipeline (model.tar.gz file).
62 | """ 63 | pipeline = Pipeline.from_pretrained(pipeline_path) 64 | dataset = dataset_from_path(dataset) 65 | 66 | pipeline.evaluate( 67 | dataset, 68 | batch_size=batch_size, 69 | lazy=lazy, 70 | predictions_output_file=prediction_output, 71 | metrics_output_file=output, 72 | ) 73 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_to_mlflow.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import mlflow 4 | import pandas as pd 5 | import pytest 6 | import yaml 7 | from numpy.testing import assert_allclose 8 | 9 | from biome.text import Pipeline 10 | from biome.text import __version__ 11 | 12 | 13 | @pytest.fixture 14 | def pipeline(): 15 | return Pipeline.from_config( 16 | { 17 | "name": "test_pipeline_copy", 18 | "head": {"type": "TextClassification", "labels": ["a", "b"]}, 19 | } 20 | ) 21 | 22 | 23 | def test_to_mlflow(pipeline, tmp_path): 24 | test_str_for_prediction = "test this prediction" 25 | expected_prediction = pipeline.predict(text=test_str_for_prediction) 26 | 27 | model_uri = pipeline.to_mlflow( 28 | tracking_uri=str(tmp_path / "to_mlflow_test"), experiment_id=0 29 | ) 30 | 31 | df = mlflow.search_runs(experiment_ids=["0"]) 32 | assert len(df) == 1 and df["tags.mlflow.runName"][0] == "log_biometext_model" 33 | 34 | # load MLFlow model and make predictions 35 | model = mlflow.pyfunc.load_model(model_uri=model_uri) 36 | prediction: pd.DataFrame = model.predict( 37 | pd.DataFrame([{"text": test_str_for_prediction}]) 38 | ) 39 | 40 | assert len(prediction) == 1 41 | assert expected_prediction["labels"] == prediction["labels"][0] 42 | assert_allclose( 43 | expected_prediction["probabilities"], prediction["probabilities"][0] 44 | ) 45 | with (Path(model_uri) / "conda.yaml").open() as file: 46 | conda_env = yaml.load(file) 47 | assert conda_env == { 48 | "name": "mlflow-dev", 49 | "channels": ["defaults", "conda-forge"], 50 | "dependencies": [ 51 | "python=3.7.9", 52 | "pip>=20.3.0", 53 | {"pip": ["mlflow", f"biome-text=={__version__}"]}, 54 | ], 55 | } 56 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_with_optional_inputs.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from typing import List 3 | from typing import Optional 4 | from typing import Union 5 | 6 | from astroid import Instance 7 | 8 | from biome.text import Pipeline 9 | from biome.text import PipelineConfiguration 10 | from biome.text.configuration import FeaturesConfiguration 11 | from biome.text.modules.heads import TaskHeadConfiguration 12 | from biome.text.modules.heads import TextClassification 13 | 14 | 15 | class MyCustomHead(TextClassification): 16 | """Just a head renaming the original TextClassification head""" 17 | 18 | def inputs(self) -> Optional[List[str]]: 19 | return ["text", "second_text"] 20 | 21 | def featurize( 22 | self, 23 | text: Any, 24 | second_text: Optional[Any] = None, 25 | label: Optional[Union[int, str, List[Union[int, str]]]] = None, 26 | ) -> Optional[Instance]: 27 | instance = self.backbone.featurizer( 28 | {"text": text, "text-2": second_text}, 29 | to_field=self.forward_arg_name, 30 | aggregate=True, 31 | exclude_record_keys=True, 32 | ) 33 | return self._add_label(instance, label, to_field=self.label_name) 34 | 35 | 36 | def test_check_pipeline_inputs_and_output(): 37 | config = PipelineConfiguration( 38 | "test-pipeline", 39 
| head=TaskHeadConfiguration( 40 | type=MyCustomHead, 41 | labels=[ 42 | "blue-collar", 43 | "technician", 44 | "management", 45 | "services", 46 | "retired", 47 | "admin.", 48 | ], 49 | ), 50 | features=FeaturesConfiguration(), 51 | ) 52 | 53 | pipeline = Pipeline.from_config(config) 54 | 55 | assert pipeline.inputs == ["text", "second_text"] 56 | assert pipeline.output == ["label"] 57 | -------------------------------------------------------------------------------- /docs/docs/documentation/community/1-contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We are open and very happy to receive contributions to make *biome.text* more useful for you and others. 4 | 5 | If you want to start contributing to *biome.text*, this is the right place to start. 6 | There are basically three ways you can contribute to *biome.text*: 7 | 8 | 1. report a bug 9 | 2. make a feature request 10 | 3. submit a pull request 11 | 12 | ## Report a bug 13 | 14 | To report a bug in the library or point out an error in the documentation, please open an [issue on GitHub](https://github.com/recognai/biome-text/issues/new/choose). 15 | 16 | ## Make a feature request 17 | 18 | If you are missing some feature in the library, please let us know in a [GitHub issue](https://github.com/recognai/biome-text/issues/new/choose). 19 | It is always helpful if you describe a concrete use case for the feature. 20 | 21 | ## Submit a pull request 22 | 23 | You can contribute to the code base via [Pull Requests (PRs)](https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/about-pull-requests). 24 | Here is a quick guide on how to [set up your system](./3-developer_guides.md#setting-up-for-development) for *biome.text* development. 25 | 26 | A PR should always reference an issue. 27 | So before starting to work on some bugfix or new feature, make sure to open a corresponding GitHub issue. 28 | If a corresponding issue already exists, please leave a quick comment that you are working on it. 29 | 30 | **For example**: you find an error in the documentation and open a new issue, #13, describing the error. 31 | You want to fix the error and create a new branch in your forked repo with a meaningful name, such as `documentation/#13`. 32 | You work on this branch, make the necessary changes, test them, push them and create a PR against our repo. 33 | This PR should include the text "Closes #13" at the end of its description.
34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import glob 4 | 5 | from setuptools import setup 6 | 7 | try: 8 | from setuptools import find_namespace_packages 9 | except ImportError as error: 10 | raise ImportError("Make sure you have setuptools >= 40.1.0 installed!") from error 11 | 12 | 13 | if __name__ == "__main__": 14 | setup( 15 | name="biome-text", 16 | use_scm_version=True, 17 | setup_requires=["setuptools_scm"], 18 | description="Biome-text is a light-weight open source Natural Language Processing toolbox" 19 | " built with AllenNLP", 20 | author="Recognai", 21 | author_email="francisco@recogn.ai", 22 | url="https://www.recogn.ai/", 23 | long_description=open("README.md").read(), 24 | long_description_content_type="text/markdown", 25 | packages=find_namespace_packages("src"), 26 | package_dir={"": "src"}, 27 | install_requires=[ 28 | "allennlp~=2.7.0", 29 | "beautifulsoup4~=4.9.0", 30 | "captum~=0.2.0", 31 | "click~=7.1.0", 32 | "datasets>=1.10.0,<1.12.0", 33 | "flatdict~=4.0.0", 34 | "lxml~=4.6.2", 35 | "mlflow>=1.13.1,<1.21.0", 36 | "numpy", 37 | "pandas", 38 | "pytorch-lightning~=1.4.0", 39 | "ray[tune]>=1.3.0,<1.7.0", 40 | "spacy>=2.3.0,<3.2.0", 41 | "torch", # the version is defined by allennlp 42 | "transformers", # the version is defined by allennlp 43 | "tqdm>=4.49.0", 44 | "fastapi~=0.63.0", # newer versions brings pydantic conflicts with spaCy 3.0.x 45 | "uvicorn>=0.13.0", 46 | "pyyaml", 47 | ], 48 | entry_points={"console_scripts": ["biome=biome.text.cli:main"]}, 49 | python_requires=">=3.6.1", # taken from AllenNLP 50 | zip_safe=False, 51 | ) 52 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/NavLink.vue: -------------------------------------------------------------------------------- 1 | 25 | 26 | 90 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from allennlp.data.token_indexers import PretrainedTransformerIndexer 3 | from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer 4 | 5 | from biome.text import Pipeline 6 | from biome.text.configuration import TokenizerConfiguration 7 | from biome.text.tokenizer import Tokenizer 8 | from biome.text.tokenizer import TransformersTokenizer 9 | 10 | 11 | @pytest.fixture 12 | def pipeline_dict(request) -> dict: 13 | """Pipeline config dict. 
You need to update the labels!""" 14 | pipeline_dict = { 15 | "name": "transformers_tokenizer_test", 16 | "features": { 17 | "transformers": {"model_name": "sshleifer/tiny-distilroberta-base"} 18 | }, 19 | "head": { 20 | "type": "TextClassification", 21 | "labels": ["a", "b"], 22 | }, 23 | } 24 | return pipeline_dict 25 | 26 | 27 | def test_pipeline_transformers_tokenizer(pipeline_dict): 28 | pipeline_dict["tokenizer"] = {"truncate_input": 1} 29 | pl = Pipeline.from_config(pipeline_dict) 30 | 31 | assert pl.config.tokenizer_config.transformers_kwargs == { 32 | "model_name": "sshleifer/tiny-distilroberta-base" 33 | } 34 | assert pl.config.features.transformers.mismatched is False 35 | assert ( 36 | type(pl.backbone.featurizer.indexer["transformers"]) 37 | is PretrainedTransformerIndexer 38 | ) 39 | assert type(pl.backbone.tokenizer) is TransformersTokenizer 40 | 41 | # test max_sequence_length, only , t, should survive 42 | assert ( 43 | len(pl.backbone.tokenizer.tokenize_text("this is a multi token text")[0]) == 3 44 | ) 45 | 46 | assert pl.predict("Test this!") 47 | 48 | 49 | def test_pipeline_default_tokenizer(pipeline_dict): 50 | pipeline_dict["features"].update({"word": {"embedding_dim": 2}}) 51 | pl = Pipeline.from_config(pipeline_dict) 52 | 53 | assert pl.config.tokenizer_config == TokenizerConfiguration() 54 | assert pl.config.features.transformers.mismatched is True 55 | assert ( 56 | type(pl.backbone.featurizer.indexer["transformers"]) 57 | is PretrainedTransformerMismatchedIndexer 58 | ) 59 | assert type(pl.backbone.tokenizer) is Tokenizer 60 | 61 | prediction = pl.predict("Test this!") 62 | -------------------------------------------------------------------------------- /tests/text/modules/heads/test_language_modelling.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import pytest 4 | 5 | from biome.text import Dataset 6 | from biome.text import Pipeline 7 | from biome.text import Trainer 8 | from biome.text import TrainerConfiguration 9 | 10 | 11 | @pytest.fixture 12 | def training_dataset() -> Dataset: 13 | """Creating the dataframe.""" 14 | data = { 15 | "text": [ 16 | "this is a text", 17 | "my name is dani", 18 | "this is a table", 19 | "my name is paco", 20 | ], 21 | } 22 | return Dataset.from_dict(data) 23 | 24 | 25 | @pytest.fixture 26 | def pipeline_dict() -> Dict: 27 | """Creating the pipeline dictionary""" 28 | 29 | pipeline_dict = { 30 | "name": "lm", 31 | "features": { 32 | "word": {"embedding_dim": 50, "lowercase_tokens": True, "trainable": True}, 33 | "char": { 34 | "embedding_dim": 50, 35 | "dropout": 0.1, 36 | "encoder": { 37 | "type": "gru", 38 | "hidden_size": 10, 39 | "num_layers": 1, 40 | "bidirectional": True, 41 | }, 42 | }, 43 | }, 44 | "encoder": { 45 | "type": "gru", 46 | "num_layers": 1, 47 | "hidden_size": 10, 48 | "bidirectional": True, 49 | }, 50 | "head": {"type": "LanguageModelling", "dropout": 0.1, "bidirectional": True}, 51 | } 52 | 53 | return pipeline_dict 54 | 55 | 56 | @pytest.fixture 57 | def trainer_config() -> TrainerConfiguration: 58 | return TrainerConfiguration( 59 | max_epochs=2, 60 | optimizer={"type": "adam", "amsgrad": True, "lr": 0.002}, 61 | gpus=0, 62 | ) 63 | 64 | 65 | def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path): 66 | """Testing the correct working of prediction, vocab creating and training""" 67 | 68 | pipeline = Pipeline.from_config(pipeline_dict) 69 | pipeline.predict(text="my name is juan") 70 | 71 | trainer = Trainer( 72 | 
pipeline=pipeline, 73 | train_dataset=training_dataset, 74 | valid_dataset=training_dataset, 75 | trainer_config=trainer_config, 76 | ) 77 | trainer.fit(tmp_path / "lm") 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .esrunner 2 | 3 | # docs 4 | docs/docs/api/biome/text/ 5 | docs/docs/documentation/tutorials/* 6 | !docs/docs/documentation/tutorials/*.ipynb 7 | !docs/docs/documentation/tutorials/img/ 8 | docs/site 9 | **/node_modules 10 | **/yarn.lock 11 | **/package-lock.json 12 | 13 | **/*.th 14 | **/*.tar.gz 15 | **/metrics*.json 16 | **/config.json 17 | **/events.out* 18 | **/vocabulary/*.txt 19 | 20 | **/webapp 21 | 22 | examples/**/experiment*/ 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # C extensions 30 | *.so 31 | 32 | # Distribution / packaging 33 | .Python 34 | env/ 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | dist/ 46 | var/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *,cover 70 | .hypothesis/ 71 | 72 | # Translations 73 | *.mo 74 | *.pot 75 | 76 | # Django stuff: 77 | *.log 78 | local_settings.py 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # IPython Notebook 94 | .ipynb_checkpoints 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # celery beat schedule file 100 | celerybeat-schedule 101 | 102 | # dotenv 103 | .env 104 | 105 | # virtualenv 106 | .venv/ 107 | venv/ 108 | ENV/ 109 | .virtualenv/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | 118 | .idea 119 | 120 | .history 121 | 122 | .vscode 123 | **/dask-worker-space 124 | 125 | tools 126 | 127 | .generated* 128 | generated* 129 | 130 | *venv 131 | 132 | .DS_Store 133 | 134 | .dask 135 | 136 | **/mlruns/ 137 | **/runs/ 138 | **/.yalc/ 139 | 140 | #pylint 141 | .pylintrc 142 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_datasets.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pytest 5 | import torch 6 | 7 | from biome.text import Dataset 8 | from biome.text import Pipeline 9 | from biome.text import PipelineConfiguration 10 | from biome.text import Trainer 11 | from biome.text import TrainerConfiguration 12 | from biome.text.backbone import ModelBackbone 13 | from biome.text.modules.heads import TextClassification 14 | from biome.text.modules.heads import TextClassificationConfiguration 15 | 16 | 17 | class TestHead(TextClassification): 18 | def __init__(self, backbone: ModelBackbone): 19 | super(TestHead, self).__init__(backbone, labels=["test", "notest"]) 20 | 21 | 22 | @pytest.fixture 23 | def dataset(tmp_path) -> Dataset: 24 | data 
= { 25 | "text": ["A common text", "This is why you get", "Seriosly?, I'm not sure"], 26 | "label": ["one", "zero", "zero"], 27 | } 28 | ds = Dataset.from_dict(data) 29 | 30 | # we save and load it here to be able to lazily read from it 31 | ds_path = tmp_path / "test_pipeline_datasets" / "dataset" 32 | ds.save_to_disk(str(ds_path)) 33 | 34 | return Dataset.load_from_disk(str(ds_path)) 35 | 36 | 37 | @pytest.fixture 38 | def pipeline() -> Pipeline: 39 | config = PipelineConfiguration( 40 | name="test-classifier", 41 | head=TextClassificationConfiguration(labels=["one", "zero"]), 42 | ) 43 | return Pipeline.from_config(config) 44 | 45 | 46 | def test_training_from_pretrained_with_head_replace(pipeline, dataset, tmp_path): 47 | trainer_config = TrainerConfiguration( 48 | batch_size=2, 49 | max_epochs=5, 50 | gpus=0, 51 | ) 52 | 53 | trainer = Trainer(pipeline, train_dataset=dataset, trainer_config=trainer_config) 54 | trainer.fit(tmp_path / "output") 55 | 56 | pipeline.set_head(TestHead) 57 | pipeline.config.tokenizer_config.max_nr_of_sentences = 3 58 | copied = pipeline.copy() 59 | assert isinstance(copied.head, TestHead) 60 | assert copied.num_parameters == pipeline.num_parameters 61 | assert copied.num_trainable_parameters == pipeline.num_trainable_parameters 62 | copied_model_state = copied._model.state_dict() 63 | original_model_state = pipeline._model.state_dict() 64 | for key, value in copied_model_state.items(): 65 | if "backbone" in key: 66 | assert torch.all(torch.eq(value, original_model_state[key])) 67 | assert copied.backbone.featurizer.tokenizer.config.max_nr_of_sentences == 3 68 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_with_custom_head.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from tempfile import mkdtemp 4 | 5 | import pytest 6 | 7 | from biome.text import Dataset 8 | from biome.text import Pipeline 9 | from biome.text import PipelineConfiguration 10 | from biome.text.configuration import FeaturesConfiguration 11 | from biome.text.configuration import VocabularyConfiguration 12 | from biome.text.modules.heads import TaskHeadConfiguration 13 | from biome.text.modules.heads import TextClassification 14 | 15 | 16 | class MyCustomHead(TextClassification): 17 | """Just a head renaming the original TextClassification head""" 18 | 19 | pass 20 | 21 | 22 | @pytest.fixture 23 | def training_dataset() -> Dataset: 24 | """Creates the training dataset and gives the structure""" 25 | resources_path = ( 26 | Path(__file__).parent.parent.parent / "tests" / "resources" / "data" 27 | ) 28 | training_ds = Dataset.from_csv(paths=str(resources_path / "dataset_source.csv")) 29 | 30 | # Keeping just 'label' and text 'category' 31 | training_ds = training_ds.map( 32 | lambda x: {"label": x["job"], "text": x["education"] + " " + x["marital"]}, 33 | ) 34 | 35 | return training_ds 36 | 37 | 38 | def test_load_pipeline_with_custom_head(training_dataset, tmp_path): 39 | """Testing a model training inserting a class as custom heard""" 40 | 41 | # Pipeline configuration dict with custom head 42 | config = PipelineConfiguration( 43 | "test-pipeline", 44 | head=TaskHeadConfiguration( 45 | type=MyCustomHead, 46 | labels=[ 47 | "blue-collar", 48 | "technician", 49 | "management", 50 | "services", 51 | "retired", 52 | "admin.", 53 | ], 54 | ), 55 | features=FeaturesConfiguration(), 56 | ) 57 | 58 | # Asserting that pipeline.head is an instance of 
MyCustomHead 59 | pipeline = Pipeline.from_config(config) 60 | assert isinstance(pipeline.head, MyCustomHead) 61 | 62 | # Saving the pipeline to output 63 | output = tmp_path / "pipeline" 64 | pipeline.save(output) 65 | 66 | # Loading model from output 67 | trained_pl = Pipeline.from_pretrained(os.path.join(str(output), "model.tar.gz")) 68 | trained_pl.predict("Oh yeah") 69 | 70 | # Asserting that the pipeline head is recognized as `MyCustomHead` instance after loading from a model.tar.gz 71 | assert isinstance(trained_pl.head, MyCustomHead) 72 | -------------------------------------------------------------------------------- /src/biome/text/backbone.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from allennlp.data import TextFieldTensors 5 | from allennlp.data import Vocabulary 6 | from allennlp.modules import TextFieldEmbedder 7 | from allennlp.modules.seq2seq_encoders import PassThroughEncoder 8 | 9 | from .featurizer import InputFeaturizer 10 | from .modules.encoders import Encoder 11 | from .tokenizer import Tokenizer 12 | 13 | 14 | class ModelBackbone(torch.nn.Module): 15 | """The backbone of the model. 16 | 17 | It is composed of a tokenizer, featurizer and an encoder. 18 | This component of the model can be pretrained and used with different task heads. 19 | 20 | Attributes 21 | ---------- 22 | vocab 23 | The vocabulary of the pipeline 24 | featurizer 25 | Defines the input features of the tokens and indexes 26 | embedder 27 | The embedding layer 28 | encoder 29 | Outputs an encoded sequence of the tokens 30 | """ 31 | 32 | def __init__( 33 | self, 34 | vocab: Vocabulary, 35 | featurizer: InputFeaturizer, 36 | embedder: TextFieldEmbedder, 37 | encoder: Optional[Encoder] = None, 38 | ): 39 | super(ModelBackbone, self).__init__() 40 | 41 | self.vocab = vocab 42 | self.featurizer = featurizer 43 | self.embedder = embedder 44 | self.encoder = ( 45 | encoder.input_dim(self.embedder.get_output_dim()).compile() 46 | if encoder 47 | else PassThroughEncoder(self.embedder.get_output_dim()) 48 | ) 49 | 50 | @property 51 | def tokenizer(self) -> Tokenizer: 52 | return self.featurizer.tokenizer 53 | 54 | def forward( 55 | self, text: TextFieldTensors, mask: torch.Tensor, num_wrapping_dims: int = 0 56 | ) -> torch.Tensor: 57 | """Applies the embedding and encoding layer 58 | 59 | Parameters 60 | ---------- 61 | text 62 | Output of the `batch.as_tensor_dict()` method, basically the indices of the indexed tokens 63 | mask 64 | A mask indicating which one of the tokens are padding tokens 65 | num_wrapping_dims 66 | 0 if `text` is the output of a `TextField`, 1 if it is the output of a `ListField` 67 | 68 | Returns 69 | ------- 70 | tensor 71 | Encoded representation of the input 72 | """ 73 | embeddings = self.embedder(text, num_wrapping_dims=num_wrapping_dims) 74 | return self.encoder(embeddings, mask=mask) 75 | -------------------------------------------------------------------------------- /src/biome/text/cli/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import click 6 | 7 | from biome.text import Dataset 8 | from biome.text import Pipeline 9 | from biome.text import Trainer 10 | from biome.text import TrainerConfiguration 11 | from biome.text.helpers import yaml_to_dict 12 | 13 | 14 | @click.command() 15 | @click.argument( 16 | "pipeline_path", 17 | 
type=click.Path(exists=True), 18 | required=True, 19 | ) 20 | @click.option( 21 | "--output", 22 | "-o", 23 | type=click.Path(), 24 | required=True, 25 | help="Path of the training output.", 26 | ) 27 | @click.option( 28 | "--trainer_config", 29 | type=click.Path(exists=True), 30 | required=True, 31 | help="Path to the trainer configuration YAML file.", 32 | ) 33 | @click.option( 34 | "--train_data", 35 | type=click.Path(exists=True), 36 | required=True, 37 | help="Path to the training data.", 38 | ) 39 | @click.option( 40 | "--valid_data", 41 | type=click.Path(exists=True), 42 | required=False, 43 | help="Path to the validation data.", 44 | ) 45 | def train( 46 | pipeline_path: str, 47 | output: str, 48 | trainer_config: str, 49 | train_data: str, 50 | valid_data: Optional[str] = None, 51 | ) -> None: 52 | """Train a pipeline. 53 | 54 | PIPELINE_PATH is either the path to a pretrained pipeline (model.tar.gz file), 55 | or the path to a pipeline configuration (YAML file). 56 | """ 57 | _, extension = os.path.splitext(pipeline_path) 58 | extension = extension[1:].lower() 59 | pipeline = ( 60 | Pipeline.from_yaml(pipeline_path) 61 | if extension in ["yaml", "yml"] 62 | else Pipeline.from_pretrained(pipeline_path) 63 | ) 64 | 65 | datasets = { 66 | "train": dataset_from_path(train_data), 67 | "validation": dataset_from_path(valid_data) if valid_data else None, 68 | } 69 | 70 | trainer = Trainer( 71 | pipeline=pipeline, 72 | train_dataset=datasets["train"], 73 | valid_dataset=datasets["validation"], 74 | trainer_config=TrainerConfiguration(**yaml_to_dict(trainer_config)), 75 | ) 76 | trainer.fit(output_dir=output) 77 | 78 | 79 | def dataset_from_path(path: str) -> Dataset: 80 | file_extension = Path(path).suffix 81 | if file_extension in [".csv"]: 82 | return Dataset.from_csv(path) 83 | elif file_extension in [".json", ".jsonl"]: 84 | return Dataset.from_json(path) 85 | else: 86 | raise ValueError( 87 | f"Could not create a Dataset from '{path}'. 
" 88 | f"We only support following formats: [csv, json, jsonl]" 89 | ) 90 | -------------------------------------------------------------------------------- /tests/text/test_model_predict.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from biome.text import Pipeline 4 | from biome.text.configuration import PredictionConfiguration 5 | from biome.text.model import PipelineModel 6 | from biome.text.modules.heads.task_prediction import TaskPrediction 7 | 8 | 9 | @pytest.fixture 10 | def model() -> PipelineModel: 11 | pipeline = Pipeline.from_config( 12 | { 13 | "name": "test_predict", 14 | "head": {"type": "TextClassification", "labels": ["a"]}, 15 | } 16 | ) 17 | return pipeline._model 18 | 19 | 20 | def test_activate_eval_mode(model): 21 | model.train() 22 | model.predict([{"text": "test"}], PredictionConfiguration) 23 | assert model.training is False 24 | 25 | 26 | def test_forward_pass_error(model, monkeypatch, caplog): 27 | def mock_text_to_instance(**kwargs): 28 | return "mock instance" 29 | 30 | def mock_forward_on_instances(*args, **kwargs): 31 | raise Exception("mock Exception") 32 | 33 | monkeypatch.setattr(model, "text_to_instance", mock_text_to_instance) 34 | monkeypatch.setattr(model, "forward_on_instances", mock_forward_on_instances) 35 | 36 | predictions = model.predict( 37 | [{"text": "Some value that breaks the forward pass"}], PredictionConfiguration 38 | ) 39 | 40 | assert predictions == [model.head.empty_prediction] 41 | assert len(caplog.record_tuples) == 2 42 | assert caplog.record_tuples[0] == ("biome.text.model", 40, "mock Exception") 43 | assert caplog.record_tuples[1] == ( 44 | "biome.text.model", 45 | 30, 46 | "Failed to make a forward pass for '[{'text': 'Some value that breaks the forward pass'}]'", 47 | ) 48 | 49 | 50 | def test_return_type(model, monkeypatch): 51 | def mock_make_task_prediction(*args, **kwargs): 52 | return TaskPrediction() 53 | 54 | monkeypatch.setattr(model.head, "make_task_prediction", mock_make_task_prediction) 55 | 56 | predictions = model.predict( 57 | [{"text": "test"}, {"text": "test2"}], PredictionConfiguration() 58 | ) 59 | assert isinstance(predictions, list) 60 | assert all([isinstance(pred, TaskPrediction) for pred in predictions]) 61 | 62 | 63 | def test_text_to_instance(model, caplog): 64 | with pytest.raises(TypeError): 65 | model.text_to_instance(wrong_kwarg="wrong argument") 66 | 67 | with pytest.raises(TypeError): 68 | model.text_to_instance(label="missing required argument") 69 | 70 | model.text_to_instance(text="") 71 | assert caplog.record_tuples[0] == ( 72 | "biome.text.model", 73 | 30, 74 | "The provided input data contains empty strings/tokens: ", 75 | ) 76 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/config.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const glob = require("glob"); 3 | 4 | // The env variable is set in our GitHub Action CI when building the docs. 5 | // It must be the same as the release tag or 'master', that is e.g. "v2.0.0" or "v2.1.0rc1" or "master" 6 | const basePath = process.env.BIOME_TEXT_DOC_VERSION 7 | ? 
`/biome-text/${process.env.BIOME_TEXT_DOC_VERSION}/` 8 | : "/biome-text/master/" 9 | 10 | function getSidebarChildren(location, replacement) { 11 | if (!replacement) { 12 | replacement = location 13 | } 14 | return glob.sync( 15 | location + '/**/*.md').map( 16 | f => f.replace(replacement + '/','')).filter(s => s.toLowerCase().indexOf("readme.md") == -1 17 | ) 18 | } 19 | 20 | module.exports = { 21 | dest: 'site', 22 | title: 'biome.text', 23 | description: 'biome.text practical NLP open source library.', 24 | head: [ 25 | ['meta', { name: 'viewport', content: 'width=device-width, initial-scale=1.0' }], 26 | ['link', { rel: "shortcut icon", href: "/favicon.ico"}], 27 | ['meta', { property: 'og:image', content: 'https://www.recogn.ai/images/biome_og.png' }], 28 | ], 29 | base: basePath, 30 | plugins: [ 31 | '@goy/svg-icons', 32 | '@vuepress/back-to-top' 33 | ], 34 | themeConfig: { 35 | sidebarDepth: 1, 36 | displayAllHeaders: false, 37 | searchPlaceholder: 'Search', 38 | nav: [ 39 | { text: 'API', link: '/api/'}, 40 | { text: 'Documentation', link: '/documentation/'}, 41 | { text: 'Github', link: 'https://github.com/recognai/biome-text' }, 42 | { text: 'Recognai', link: 'https://recogn.ai' }, 43 | ], 44 | sidebar: { 45 | '/api/': [{ 46 | title: 'API', 47 | children: getSidebarChildren('docs/api'), 48 | collapsable: false, 49 | }], 50 | '/documentation/': [ 51 | { 52 | title: 'Get started', 53 | children: ['', 'basics.md'], 54 | collapsable: false 55 | }, 56 | { 57 | title: 'Tutorials', 58 | children:getSidebarChildren('docs/documentation/tutorials', 'docs/documentation'), 59 | collapsable: false 60 | }, 61 | { 62 | title: 'User Guides', 63 | children:getSidebarChildren('docs/documentation/user-guides', 'docs/documentation'), 64 | collapsable: false 65 | }, 66 | { 67 | title: 'Community', 68 | children:getSidebarChildren('docs/documentation/community', 'docs/documentation'), 69 | collapsable: false 70 | }] 71 | }, 72 | algolia: { 73 | apiKey: '4f8d6b27d633951bde8c33e391ea6a4d', 74 | indexName: 'recogn_biome-text' 75 | }, 76 | plugins: ['@vuepress/active-header-links'], 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_predict.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from biome.text import Pipeline 4 | from biome.text.modules.heads.task_prediction import TextClassificationPrediction 5 | 6 | 7 | @pytest.fixture 8 | def pipeline() -> Pipeline: 9 | return Pipeline.from_config( 10 | { 11 | "name": "test_predict", 12 | "head": {"type": "TextClassification", "labels": ["a"]}, 13 | } 14 | ) 15 | 16 | 17 | def test_return_empty_prediction_for_failed_prediction(pipeline): 18 | empty_prediction = {"labels": [], "probabilities": []} 19 | assert pipeline.predict("") == empty_prediction 20 | assert ( 21 | pipeline.predict(batch=[{"text": ""}, {"text": ""}]) == [empty_prediction] * 2 22 | ) 23 | 24 | 25 | def test_batch_parameter_gets_ignored(pipeline): 26 | prediction = pipeline.predict("testtt", batch=[{"text": "test"}], add_tokens=True) 27 | assert prediction["tokens"][0]["text"] == "testtt" 28 | 29 | prediction = pipeline.predict( 30 | text="testtt", batch=[{"text": "test"}], add_tokens=True 31 | ) 32 | assert prediction["tokens"][0]["text"] == "testtt" 33 | 34 | 35 | def test_map_args_kwargs_to_input(): 36 | class MockPipeline: 37 | def __init__(self, inputs): 38 | self._inputs = inputs 39 | 40 | @property 41 | def inputs(self): 42 | return self._inputs 43 
| 44 | assert Pipeline._map_args_kwargs_to_input(MockPipeline(["text"]), "test") == { 45 | "text": "test" 46 | } 47 | assert Pipeline._map_args_kwargs_to_input(MockPipeline(["text"]), text="test") == { 48 | "text": "test" 49 | } 50 | assert Pipeline._map_args_kwargs_to_input( 51 | MockPipeline(["text", "text2"]), "test", text2="test2" 52 | ) == {"text": "test", "text2": "test2"} 53 | 54 | 55 | def test_return_single_or_list(pipeline, monkeypatch): 56 | def mock_predict(batch, prediction_config): 57 | return [ 58 | TextClassificationPrediction(labels=["a"], probabilities=[1]) 59 | if i % 2 == 0 60 | else pipeline.head.empty_prediction 61 | for i, _ in enumerate(batch) 62 | ] 63 | 64 | monkeypatch.setattr(pipeline._model, "predict", mock_predict) 65 | 66 | assert isinstance(pipeline.predict("test"), dict) 67 | 68 | batch_prediction = pipeline.predict(batch=[{"text": "test"}]) 69 | assert isinstance(batch_prediction, list) and len(batch_prediction) == 1 70 | assert isinstance(batch_prediction[0], dict) 71 | 72 | batch_prediction = pipeline.predict( 73 | batch=[{"text": "test"}, {"text": "no instance for this input"}] 74 | ) 75 | assert isinstance(batch_prediction, list) and len(batch_prediction) == 2 76 | assert ( 77 | isinstance(batch_prediction[0], dict) 78 | and batch_prediction[1] == pipeline.head.empty_prediction.as_dict() 79 | ) 80 | -------------------------------------------------------------------------------- /tests/docs/test_configurations.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | from typing import Dict 4 | 5 | import pytorch_lightning 6 | import torch.nn as nn 7 | from allennlp.common import Params 8 | from allennlp.training.learning_rate_schedulers import LearningRateScheduler 9 | from allennlp.training.optimizers import Optimizer 10 | 11 | from biome.text import Dataset 12 | from biome.text import Pipeline 13 | from biome.text import Trainer 14 | from biome.text import TrainerConfiguration 15 | from biome.text import VocabularyConfiguration 16 | 17 | 18 | def _read_configs(configurations_path: Path, section: str) -> Dict[str, Any]: 19 | code_blocks = {} 20 | with configurations_path.open() as file: 21 | in_section = False 22 | in_new_config = False 23 | 24 | for line in file.readlines(): 25 | if line.startswith(f"## {section}"): 26 | in_section = True 27 | elif line.startswith("### ") and in_section: 28 | code_blocks[line.split(maxsplit=1)[1]] = "" 29 | elif line.startswith("```python") and in_section: 30 | in_new_config = True 31 | elif line.startswith("```") and in_new_config: 32 | in_new_config = False 33 | elif line.startswith("## ") and in_section: 34 | in_section = False 35 | 36 | elif in_section and in_new_config: 37 | key = list(code_blocks.keys())[-1] 38 | code_blocks[key] += line 39 | 40 | configurations = {} 41 | for name, code in code_blocks.items(): 42 | config = {} 43 | exec(code, globals(), config) 44 | configurations[name] = config[list(config.keys())[-1]] 45 | 46 | return configurations 47 | 48 | 49 | def test_pipeline_configs(configurations_path): 50 | configs = _read_configs(configurations_path, "Pipeline") 51 | for config_name, config in configs.items(): 52 | Pipeline.from_config(config) 53 | 54 | 55 | def test_trainer_configs(configurations_path): 56 | configs = _read_configs(configurations_path, "Trainer") 57 | pipeline = Pipeline.from_config( 58 | { 59 | "name": "test", 60 | "head": {"type": "TextClassification", "labels": ["pos", "neg"]}, 61 | } 62 | ) 
63 | dataset = Dataset.from_dict({"text": ["test"], "label": ["pos"]}) 64 | linear = nn.Linear(2, 2) 65 | for config_name, config in configs.items(): 66 | assert isinstance(config, TrainerConfiguration) 67 | 68 | trainer = Trainer( 69 | pipeline=pipeline, train_dataset=dataset, trainer_config=config 70 | ) 71 | assert isinstance(trainer.trainer, pytorch_lightning.Trainer) 72 | 73 | 74 | def test_vocab_configs(configurations_path): 75 | configs = _read_configs(configurations_path, "Vocabulary") 76 | for config_name, config in configs.items(): 77 | assert isinstance(config, VocabularyConfiguration) 78 | -------------------------------------------------------------------------------- /docs/.templates/config.mako: -------------------------------------------------------------------------------- 1 | <%! 2 | # Template configuration. Copy over in your template directory 3 | # (used with `--template-dir`) and adapt as necessary. 4 | # Note, defaults are loaded from this distribution file, so your 5 | # config.mako only needs to contain values you want overridden. 6 | # You can also run pdoc with `--config KEY=VALUE` to override 7 | # individual values. 8 | 9 | html_lang = 'en' 10 | show_inherited_members = False 11 | extract_module_toc_into_sidebar = True 12 | list_class_variables_in_index = True 13 | sort_identifiers = False 14 | show_type_annotations = True 15 | 16 | # The default docstring format 17 | docformat = 'numpy' 18 | 19 | # Show collapsed source code block next to each item. 20 | # Disabling this can improve rendering speed of large modules. 21 | show_source_code = False 22 | 23 | # If set, format links to objects in online source code repository 24 | # according to this template. Supported keywords for interpolation 25 | # are: commit, path, start_line, end_line. 26 | #git_link_template = 'https://github.com/USER/PROJECT/blob/{commit}/{path}#L{start_line}-L{end_line}' 27 | #git_link_template = 'https://gitlab.com/USER/PROJECT/blob/{commit}/{path}#L{start_line}-L{end_line}' 28 | #git_link_template = 'https://bitbucket.org/USER/PROJECT/src/{commit}/{path}#lines-{start_line}:{end_line}' 29 | #git_link_template = 'https://CGIT_HOSTNAME/PROJECT/tree/{path}?id={commit}#n{start-line}' 30 | git_link_template = None 31 | 32 | # A prefix to use for every HTML hyperlink in the generated documentation. 33 | # No prefix results in all links being relative. 34 | link_prefix = '' 35 | 36 | # Enable syntax highlighting for code/source blocks by including Highlight.js 37 | syntax_highlighting = True 38 | 39 | # Set the style keyword such as 'atom-one-light' or 'github-gist' 40 | # Options: https://github.com/highlightjs/highlight.js/tree/master/src/styles 41 | # Demo: https://highlightjs.org/static/demo/ 42 | hljs_style = 'github' 43 | 44 | # If set, insert Google Analytics tracking code. Value is GA 45 | # tracking id (UA-XXXXXX-Y). 46 | google_analytics = '' 47 | 48 | # If set, insert Google Custom Search search bar widget above the sidebar index. 49 | # The whitespace-separated tokens represent arbitrary extra queries (at least one 50 | # must match) passed to regular Google search. Example: 51 | #search_query = 'inurl:github.com/USER/PROJECT site:PROJECT.github.io site:PROJECT.website' 52 | search_query = '' 53 | 54 | # If set, render LaTeX math syntax within \(...\) (inline equations), 55 | # or within \[...\] or $$...$$ or `.. math::` (block equations) 56 | # as nicely-formatted math formulas using MathJax. 
57 | # Note: in Python docstrings, either all backslashes need to be escaped (\\) 58 | # or you need to use raw r-strings. 59 | latex_math = False 60 | %> 61 | -------------------------------------------------------------------------------- /tests/resources/data/emotions_with_transformers.txt: -------------------------------------------------------------------------------- 1 | i didnt feel humiliated;sadness 2 | i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake;sadness 3 | im grabbing a minute to post i feel greedy wrong;anger 4 | i am ever feeling nostalgic about the fireplace i will know that it is still on the property;love 5 | i am feeling grouchy;anger 6 | ive been feeling a little burdened lately wasnt sure why that was;sadness 7 | ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny;surprise 8 | i feel as confused about life as a teenager or as jaded as a year old man;fear 9 | i have been with petronas for years i feel that petronas has performed well and made a huge profit;joy 10 | i feel romantic too;love 11 | i feel like i have to make the suffering i m seeing mean something;sadness 12 | i do feel that running is a divine experience and that i can expect to have some type of spiritual encounter;joy 13 | i think it s the easiest time of year to feel dissatisfied;anger 14 | i feel low energy i m just thirsty;sadness 15 | i have immense sympathy with the general point but as a possible proto writer trying to find time to write in the corners of life and with no sign of an agent let alone a publishing contract this feels a little precious;joy 16 | i do not feel reassured anxiety is on each side;joy 17 | i didnt really feel that embarrassed;sadness 18 | i feel pretty pathetic most of the time;sadness 19 | i started feeling sentimental about dolls i had as a child and so began a collection of vintage barbie dolls from the sixties;sadness 20 | i now feel compromised and skeptical of the value of every unit of work i put in;fear 21 | i feel irritated and rejected without anyone doing anything or saying anything;anger 22 | i am feeling completely overwhelmed i have two strategies that help me to feel grounded pour my heart out in my journal in the form of a letter to god and then end with a list of five things i am most grateful for;fear 23 | i have the feeling she was amused and delighted;joy 24 | i was able to help chai lifeline with your support and encouragement is a great feeling and i am so glad you were able to help me;joy 25 | i already feel like i fucked up though because i dont usually eat at all in the morning;anger 26 | i still love my so and wish the best for him i can no longer tolerate the effect that bm has on our lives and the fact that is has turned my so into a bitter angry person who is not always particularly kind to the people around him when he is feeling stressed;sadness 27 | i feel so inhibited in someone elses kitchen like im painting on someone elses picture;sadness 28 | i become overwhelmed and feel defeated;sadness 29 | i feel kinda appalled that she feels like she needs to explain in wide and lenghth her body measures etc pp;anger 30 | i feel more superior dead chicken or grieving child;joy 31 | i get giddy over feeling elegant in a perfectly fitted pencil skirt;joy 32 | i remember feeling acutely distressed for a few days;fear 33 | -------------------------------------------------------------------------------- 
/docs/docs/.vuepress/public/assets/img/bg.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Created with Sketch. 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/biome-isotype.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Biome text. 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [project logo and badges: CI, GitHub, Documentation, GitHub release]
25 | Natural Language Processing library built with AllenNLP
26 |
27 | 28 | ## Quick Links 29 | - [Documentation](https://recognai.github.io/biome-text/) 30 | 31 | 32 | ## Features 33 | * State-of-the-art and not so state-of-the-art models trained with **your own data** with simple workflows. 34 | 35 | * **Efficient data reading** for (large) datasets in multiple formats and sources (CSV, Parquet, JSON, etc.). 36 | 37 | * **Modular configuration and extensibility** of models, datasets and training runs programmatically or via config files. 38 | 39 | * Use via **`cli`** or as plain Python (e.g., inside a Jupyter Notebook) 40 | 41 | * **Compatible with AllenNLP** 42 | 43 | ## Installation 44 | 45 | For the installation we recommend setting up a fresh [conda](https://docs.conda.io/en/latest/miniconda.html) environment: 46 | 47 | ```shell script 48 | conda create -n biome python~=3.7.0 pip>=20.3.0 49 | conda activate biome 50 | ``` 51 | 52 | Once the conda environment is activated, you can install the latest release via pip: 53 | 54 | ````shell script 55 | pip install -U biome-text 56 | ```` 57 | 58 | After installing *biome.text*, the best way to test your installation is by running the *biome.text* cli command: 59 | 60 | ```shell script 61 | biome --help 62 | ``` 63 | 64 | ## Get started 65 | 66 | The best way to see how *biome.text* works is to go through our [first tutorial](https://recognai.github.io/biome-text/master/documentation/tutorials/1-Training_a_text_classifier.html). 67 | 68 | Please refer to our [documentation](https://recognai.github.io/biome-text) for more tutorials, detailed user guides and how you can [contribute](https://recognai.github.io/biome-text/master/documentation/community/1-contributing.html) to *biome.text*. 69 | 70 | ## Licensing 71 | 72 | The code in this project is licensed under Apache 2 license. 
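
## Quick example

A minimal end-to-end sketch, assembled from the library's own test suite; the inline dataset, labels and output directory below are illustrative only:

```python
from biome.text import Dataset, Pipeline, Trainer, TrainerConfiguration

# A tiny in-memory dataset; in practice you would use Dataset.from_csv or Dataset.from_json
dataset = Dataset.from_dict(
    {
        "text": ["this movie was great", "this movie was terrible"],
        "label": ["pos", "neg"],
    }
)

# A text classification pipeline with default features
pipeline = Pipeline.from_config(
    {
        "name": "quickstart",
        "head": {"type": "TextClassification", "labels": ["pos", "neg"]},
    }
)

# Train for one epoch on CPU and write the results (including model.tar.gz) to ./quickstart_output
trainer = Trainer(
    pipeline=pipeline,
    train_dataset=dataset,
    trainer_config=TrainerConfiguration(max_epochs=1, batch_size=2, gpus=0),
)
trainer.fit("quickstart_output")

# Predict with the trained pipeline
print(pipeline.predict("a movie I really enjoyed"))
```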
73 | -------------------------------------------------------------------------------- /tests/text/modules/heads/classification/test_relation_classifier.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import pytest 4 | 5 | from biome.text import Dataset 6 | from biome.text import Pipeline 7 | from biome.text import Trainer 8 | from biome.text import TrainerConfiguration 9 | 10 | 11 | @pytest.fixture 12 | def training_dataset() -> Dataset: 13 | """Creating the dataframe.""" 14 | data = { 15 | "text": [ 16 | "The most common audits were about waste and recycling.", 17 | "The company fabricates plastic chairs.", 18 | ], 19 | "entities": [ 20 | [ 21 | {"start": 34, "end": 39, "label": "PN", "text": "waste"}, 22 | {"start": 16, "end": 22, "label": "QTY", "text": "audits"}, 23 | ], 24 | [ 25 | {"start": 4, "end": 11, "label": "OBJECT", "text": "company"}, 26 | {"start": 31, "end": 37, "label": "SUBJECT", "text": "chairs"}, 27 | ], 28 | ], 29 | "label": ["Message-Topic(e1,e2)", "Product-Producer(e2,e1)"], 30 | } 31 | 32 | return Dataset.from_dict(data) 33 | 34 | 35 | @pytest.fixture 36 | def pipeline_dict() -> Dict: 37 | """Creating the pipeline dictionary""" 38 | 39 | pipeline_dict = { 40 | "name": "biome-rele", 41 | "features": { 42 | "word": {"embedding_dim": 2}, 43 | "char": { 44 | "embedding_dim": 2, 45 | "dropout": 0.1, 46 | "encoder": { 47 | "type": "gru", 48 | "hidden_size": 2, 49 | }, 50 | }, 51 | }, 52 | "head": { 53 | "type": "RelationClassification", 54 | "labels": ["Message-Topic(e1,e2)", "Product-Producer(e2,e1)"], 55 | "entities_embedder": {"num_embeddings": 12, "embedding_dim": 50}, 56 | "feedforward": { 57 | "num_layers": 1, 58 | "hidden_dims": [4], 59 | "activations": ["relu"], 60 | "dropout": [0.1], 61 | }, 62 | }, 63 | } 64 | 65 | return pipeline_dict 66 | 67 | 68 | @pytest.fixture 69 | def trainer_config() -> TrainerConfiguration: 70 | return TrainerConfiguration( 71 | max_epochs=1, 72 | optimizer={"type": "adamw", "lr": 0.002}, 73 | gpus=0, 74 | ) 75 | 76 | 77 | def test_train(pipeline_dict, training_dataset, trainer_config, tmp_path): 78 | """Testing a classifier made from scratch""" 79 | 80 | pipeline = Pipeline.from_config(pipeline_dict) 81 | pipeline.predict( 82 | text="The most common audits were about waste and recycling", 83 | entities=[ 84 | {"start": 34, "end": 39, "label": "OBJECT", "text": "waste"}, 85 | {"start": 16, "end": 22, "label": "SUBJECT", "text": "audits"}, 86 | ], 87 | ) 88 | 89 | trainer = Trainer( 90 | pipeline=pipeline, 91 | train_dataset=training_dataset, 92 | valid_dataset=training_dataset, 93 | trainer_config=trainer_config, 94 | ) 95 | trainer.fit(tmp_path / "relation_classifier") 96 | 97 | # test loading 98 | Pipeline.from_pretrained(tmp_path / "relation_classifier" / "model.tar.gz") 99 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/Versions.vue: -------------------------------------------------------------------------------- 1 | 13 | 14 | 56 | 106 | -------------------------------------------------------------------------------- /src/biome/text/text_cleaning.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import re 3 | from typing import Callable 4 | from typing import Dict 5 | from typing import List 6 | 7 | from allennlp.common import Registrable 8 | from bs4 import BeautifulSoup 9 | 10 | 11 | class TextCleaning(Registrable): 12 | """Defines 
rules that can be applied to the text before it gets tokenized. 13 | 14 | Each rule is a simple python function that receives and returns a `str`. 15 | 16 | Parameters 17 | ---------- 18 | rules: `List[str]` 19 | A list of registered rule method names to be applied to text inputs 20 | """ 21 | 22 | default_implementation = "default" 23 | 24 | def __init__(self, rules: List[str] = None): 25 | self.rules = rules or [] 26 | for rule in self.rules: 27 | if rule not in TextCleaningRule.registered_rules(): 28 | raise AttributeError( 29 | f"No rule '{rule}' registered" 30 | f"Available rules are [{[k for k in TextCleaningRule.registered_rules().keys()]}]" 31 | ) 32 | 33 | def __call__(self, text: str) -> str: 34 | for rule in self.rules: 35 | text = TextCleaningRule.registered_rules()[rule](text) 36 | return text 37 | 38 | 39 | TextCleaning.register(TextCleaning.default_implementation)(TextCleaning) 40 | 41 | 42 | class TextCleaningRule: 43 | """Registers a function as a rule for the text cleaning implementation 44 | 45 | Use the decorator `@TextCleaningRule` for creating custom text cleaning and pre-processing rules. 46 | 47 | An example function to strip spaces would be: 48 | 49 | ```python 50 | @TextCleaningRule 51 | def strip_spaces(text: str) -> str: 52 | return text.strip() 53 | ``` 54 | 55 | You can query available rules via `TextCleaningRule.registered_rules()`. 56 | 57 | Parameters 58 | ---------- 59 | func: `Callable[[str]` 60 | The function to register 61 | """ 62 | 63 | __REGISTERED_RULES = {} 64 | 65 | def __init__(self, func: Callable[[str], str]): 66 | self.__callable__ = func 67 | self.__REGISTERED_RULES[func.__name__] = func 68 | 69 | @classmethod 70 | def registered_rules(cls) -> Dict[str, Callable[[str], str]]: 71 | """Registered rules dictionary""" 72 | return copy.deepcopy(cls.__REGISTERED_RULES) 73 | 74 | def __call__(self, *args, **kwargs) -> str: 75 | """Enables call single rule""" 76 | return self.__callable__(*args, **kwargs) 77 | 78 | 79 | @TextCleaningRule 80 | def strip_spaces(text: str) -> str: 81 | """Strips leading and trailing spaces/new lines""" 82 | return text.strip() 83 | 84 | 85 | @TextCleaningRule 86 | def rm_useless_spaces(text: str) -> str: 87 | """Removes multiple spaces in `str`""" 88 | return re.sub(" {2,}", " ", text) 89 | 90 | 91 | @TextCleaningRule 92 | def fix_html(text: str) -> str: 93 | """Replaces some special HTML characters: ` `, `
<br>`, etc.""" 94 | text = ( 95 | # non breakable space -> space 96 | text.replace("&nbsp;", " ") 97 | .replace("&#160;", " ") 98 | .replace("&#xa0;", " ") 99 | #
html single line breaks -> unicode line breaks 100 | .replace("<br>
", "\n") 101 | ) 102 | 103 | return text 104 | 105 | 106 | @TextCleaningRule 107 | def html_to_text(text: str) -> str: 108 | """Extracts text from an HTML document""" 109 | return BeautifulSoup(text, "lxml").get_text() 110 | -------------------------------------------------------------------------------- /tests/resources/data/dataset_source.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewerID": "A2HD75EMZR8QLN", "asin": "0700099867", "reviewerName": "123", "helpful": [8, 12], "reviewText": "Installing the game was a struggle (because of games for windows live bugs).Some championship races and cars can only be \"unlocked\" by buying them as an addon to the game. I paid nearly 30 dollars when the game was new. I don't like the idea that I have to keep paying to keep playing.I noticed no improvement in the physics or graphics compared to Dirt 2.I tossed it in the garbage and vowed never to buy another codemasters game. I'm really tired of arcade style rally/racing games anyway.I'll continue to get my fix from Richard Burns Rally, and you should to. :)http://www.amazon.com/Richard-Burns-Rally-PC/dp/B000C97156/ref=sr_1_1?ie=UTF8&qid;=1341886844&sr;=8-1&keywords;=richard+burns+rallyThank you for reading my review! If you enjoyed it, be sure to rate it as helpful.", "overall": 1.0, "summary": "Pay to unlock content? I don't think so.", "unixReviewTime": 1341792000, "reviewTime": "07 9, 2012"} 2 | {"reviewerID": "A3UR8NLLY1ZHCX", "asin": "0700099867", "reviewerName": "Alejandro Henao \"Electronic Junky\"", "helpful": [0, 0], "reviewText": "If you like rally cars get this game you will have fun.It is more oriented to "European market" since here in America there isn't a huge rally fan party. Music it is very European and even the voices from the game very "English" accent.The multiplayer isn't the best but it works just ok.", "overall": 4.0, "summary": "Good rally game", "unixReviewTime": 1372550400, "reviewTime": "06 30, 2013"} 3 | {"reviewerID": "A1INA0F5CWW3J4", "asin": "0700099867", "reviewerName": "Amazon Shopper \"Mr.Repsol\"", "helpful": [0, 0], "reviewText": "1st shipment received a book instead of the game.2nd shipment got a FAKE one. Game arrived with a wrong key inside on sealed box. I got in contact with codemasters and send them pictures of the DVD and the content. They said nothing they can do its a fake DVD.Returned it good bye.!", "overall": 1.0, "summary": "Wrong key", "unixReviewTime": 1403913600, "reviewTime": "06 28, 2014"} 4 | {"reviewerID": "A2HD75EMZR8QLN", "asin": "0700099867", "reviewerName": "123", "helpful": [8, 12], "reviewText": "Installing the game was a struggle (because of games for windows live bugs).Some championship races and cars can only be \"unlocked\" by buying them as an addon to the game. I paid nearly 30 dollars when the game was new. I don't like the idea that I have to keep paying to keep playing.I noticed no improvement in the physics or graphics compared to Dirt 2.I tossed it in the garbage and vowed never to buy another codemasters game. I'm really tired of arcade style rally/racing games anyway.I'll continue to get my fix from Richard Burns Rally, and you should to. :)http://www.amazon.com/Richard-Burns-Rally-PC/dp/B000C97156/ref=sr_1_1?ie=UTF8&qid;=1341886844&sr;=8-1&keywords;=richard+burns+rallyThank you for reading my review! If you enjoyed it, be sure to rate it as helpful.", "overall": 1.0, "summary": "Pay to unlock content? 
I don't think so.", "unixReviewTime": 1341792000, "reviewTime": "07 9, 2012"} 5 | {"reviewerID": "A3UR8NLLY1ZHCX", "asin": "0700099867", "reviewerName": "Alejandro Henao \"Electronic Junky\"", "helpful": [0, 0], "reviewText": "If you like rally cars get this game you will have fun.It is more oriented to "European market" since here in America there isn't a huge rally fan party. Music it is very European and even the voices from the game very "English" accent.The multiplayer isn't the best but it works just ok.", "overall": 4.0, "summary": "Good rally game", "unixReviewTime": 1372550400, "reviewTime": "06 30, 2013"} 6 | -------------------------------------------------------------------------------- /tests/text/modules/heads/classification/test_document_classification.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from numpy.testing import assert_allclose 3 | 4 | from biome.text import Pipeline 5 | from biome.text.modules.heads.task_prediction import Attribution 6 | from biome.text.modules.heads.task_prediction import DocumentClassificationPrediction 7 | 8 | 9 | @pytest.fixture 10 | def pipeline() -> Pipeline: 11 | labels = ["a", "b", "c", "d", "e", "f"] 12 | return Pipeline.from_config( 13 | { 14 | "name": "test_document_classification", 15 | "tokenizer": {"segment_sentences": False}, 16 | "head": { 17 | "type": "DocumentClassification", 18 | "labels": labels, 19 | "dropout": 0.1, 20 | }, 21 | } 22 | ) 23 | 24 | 25 | @pytest.mark.parametrize( 26 | "segment_sentences, input, output", 27 | [ 28 | (False, "one sentence. two sentence", (1, 5)), 29 | (True, "one sentence. two sentence", (2, 3)), 30 | (False, ["one sentence. two sentence", "test"], (2, 5)), 31 | (True, ["one sentence. two sentence", "test"], (3, 3)), 32 | (False, {"one": "one sentence. two sentence", "two": "test"}, (2, 5)), 33 | (True, {"one": "one sentence. 
two sentence", "two": "test"}, (3, 3)), 34 | ], 35 | ) 36 | def test_tokenization_of_different_input(segment_sentences, input, output): 37 | pipeline = Pipeline.from_config( 38 | { 39 | "name": "test_document_classification", 40 | "tokenizer": {"segment_sentences": segment_sentences}, 41 | "head": {"type": "DocumentClassification", "labels": "a"}, 42 | } 43 | ) 44 | instance = pipeline.head.featurize(input) 45 | tokens = pipeline.head._extract_tokens(instance) 46 | 47 | assert len(tokens) == output[0] 48 | assert len(tokens[0]) == output[1] 49 | 50 | 51 | def test_make_task_prediction(pipeline): 52 | instance = pipeline.head.featurize("test this sentence") 53 | forward_output = pipeline.model.forward_on_instances([instance]) 54 | 55 | prediction = pipeline.head._make_task_prediction(forward_output[0], None) 56 | 57 | assert isinstance(prediction, DocumentClassificationPrediction) 58 | assert isinstance(prediction.labels, list) and isinstance( 59 | prediction.probabilities, list 60 | ) 61 | assert len(prediction.labels) == len(prediction.probabilities) == 6 62 | # check descending order 63 | assert_allclose( 64 | sorted(prediction.probabilities, reverse=True), prediction.probabilities 65 | ) 66 | assert all([isinstance(label, str) for label in prediction.labels]) 67 | assert set(pipeline.head.labels) == set(prediction.labels) 68 | assert all([isinstance(prob, float) for prob in prediction.probabilities]) 69 | 70 | 71 | def test_compute_attributions(pipeline): 72 | instance = pipeline.head.featurize("test this sentence") 73 | pipeline.model.eval() 74 | forward_output = pipeline.model.forward_on_instances([instance]) 75 | 76 | attributions = pipeline.head._compute_attributions( 77 | forward_output[0], instance, n_steps=1 78 | ) 79 | 80 | assert isinstance(attributions, list) and isinstance(attributions[0], list) 81 | assert len(attributions) == 1 and len(attributions[0]) == 3 82 | assert all( 83 | [isinstance(attribution, Attribution) for attribution in attributions[0]] 84 | ) 85 | assert all([attr.field == "text" for attr in attributions[0]]) 86 | assert all([isinstance(attr.attribution, float) for attr in attributions[0]]) 87 | assert attributions[0][1].start == 5 and attributions[0][1].end == 9 88 | -------------------------------------------------------------------------------- /src/biome/text/modules/configuration/defs.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import inspect 3 | from typing import Any 4 | from typing import Dict 5 | from typing import Generic 6 | from typing import Optional 7 | from typing import Type 8 | from typing import TypeVar 9 | from typing import Union 10 | 11 | from allennlp.common import FromParams 12 | from allennlp.common import Params 13 | from allennlp.modules.bimpm_matching import BiMpmMatching 14 | from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper 15 | from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper 16 | 17 | from biome.text import helpers 18 | 19 | 20 | def _find_input_attribute(component: Type[Any]) -> str: 21 | """Find the properly input dimension attribute name for a given component""" 22 | input_dim_attribute = None 23 | if issubclass(component, (PytorchSeq2SeqWrapper, PytorchSeq2VecWrapper)): 24 | input_dim_attribute = "input_size" 25 | elif component is BiMpmMatching: 26 | input_dim_attribute = "hidden_dim" 27 | else: 28 | init_method_keys = inspect.signature(component.__init__).parameters.keys() 29 | for param_name in ["embedding_dim", 
"input_dim"]: 30 | if param_name in init_method_keys: 31 | input_dim_attribute = param_name 32 | break 33 | return input_dim_attribute 34 | 35 | 36 | T = TypeVar("T") 37 | 38 | 39 | class ComponentConfiguration(Generic[T], FromParams): 40 | """ 41 | The layer spec component allows create Pytorch modules lazily, 42 | and instantiate them inside a context (Model or other component) dimension layer chain. 43 | 44 | The layer spec wraps a component params and will generate an instance of type T once the input_dim is set. 45 | 46 | """ 47 | 48 | @classmethod 49 | def from_params(cls: Type[T], params: Params, **extras) -> T: 50 | return cls(**params.as_dict()) 51 | 52 | def __resolve_layer_class( 53 | self, type_name: Optional[Union[Type, str]] = None 54 | ) -> Type[T]: 55 | if isinstance(type_name, Type): 56 | return type_name 57 | 58 | layer_class = getattr(self.__class__, "__orig_bases__")[0].__args__[0] 59 | return layer_class.by_name(type_name) if type_name else layer_class 60 | 61 | def __init__(self, **config): 62 | self._layer_class = self.__resolve_layer_class(config.get("type")) 63 | config["type"] = helpers.get_full_class_name(self._layer_class) 64 | self._config = config or {} 65 | 66 | def input_dim(self, input_dim: int) -> "ComponentConfiguration": 67 | """Sets the input dimension attribute for this layer configuration""" 68 | self.__update_config_with_input_dim(input_dim) 69 | return self 70 | 71 | def __update_config_with_input_dim(self, input_dim: int): 72 | input_dim_attribute = _find_input_attribute(self._layer_class) 73 | 74 | if input_dim_attribute: 75 | self._config[input_dim_attribute] = input_dim 76 | 77 | @property 78 | def config(self) -> Dict[str, Any]: 79 | """Component read-only configuration""" 80 | return copy.deepcopy(self._config) 81 | 82 | def compile(self, **extras) -> T: 83 | """ 84 | Using the wrapped configuration and the input dimension, generates a 85 | instance of type T representing the layer configuration 86 | """ 87 | if not self.config: 88 | raise ValueError(f"No configuration found for {self}") 89 | 90 | config = self.config 91 | if "type" in config: 92 | config.pop("type") 93 | 94 | return self._layer_class.from_params(Params(config), **extras) 95 | -------------------------------------------------------------------------------- /tests/text/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | from allennlp.data import Token as AllennlpToken 2 | from spacy.tokens.token import Token as SpacyToken 3 | 4 | from biome.text.configuration import TokenizerConfiguration 5 | from biome.text.tokenizer import Tokenizer 6 | 7 | html_text = """ 8 | 9 | 10 | 11 | 12 |

<h1>My First Heading</h1> 13 | <p>My first paragraph.</p> 14 | <p>My second paragraph.</p>

15 | 16 | 17 | """ 18 | 19 | 20 | def test_text_cleaning_with_sentence_segmentation(): 21 | tokenizer = Tokenizer( 22 | TokenizerConfiguration( 23 | text_cleaning={"rules": ["html_to_text", "strip_spaces"]}, 24 | segment_sentences=True, 25 | ) 26 | ) 27 | 28 | tokenized = tokenizer.tokenize_text(html_text) 29 | assert len(tokenized) == 2 30 | assert ( 31 | len(tokenized[0]) == 7 32 | ), "Expected [My, First, Heading, My, first, paragraph, .]" 33 | assert len(tokenized[1]) == 4, "Expected [My, second, paragraph, .]" 34 | 35 | 36 | def test_text_cleaning_with_sentence_segmentation_and_max_sequence(): 37 | tokenizer = Tokenizer( 38 | TokenizerConfiguration( 39 | truncate_sentence=8, 40 | text_cleaning={"rules": ["html_to_text", "strip_spaces"]}, 41 | segment_sentences=True, 42 | ) 43 | ) 44 | 45 | tokenized = tokenizer.tokenize_text(html_text) 46 | assert len(tokenized) == 2 47 | assert len(tokenized[0]) == 2, "Expected [My, First]" 48 | assert len(tokenized[1]) == 2, "Expected [My, second]" 49 | 50 | 51 | def test_document_cleaning(): 52 | tokenizer = Tokenizer( 53 | TokenizerConfiguration( 54 | text_cleaning={"rules": ["html_to_text", "strip_spaces"]}, 55 | segment_sentences=True, 56 | ) 57 | ) 58 | 59 | tokenized = tokenizer.tokenize_document([html_text]) 60 | assert len(tokenized) == 2 61 | assert ( 62 | len(tokenized[0]) == 7 63 | ), "Expected [My, First, Heading, My, first, paragraph, .]" 64 | assert len(tokenized[1]) == 4, "Expected [My, second, paragraph, .]" 65 | 66 | 67 | def test_using_spacy_tokens(): 68 | tokenizer = Tokenizer(TokenizerConfiguration(use_spacy_tokens=True)) 69 | tokenized = tokenizer.tokenize_text("This is a text") 70 | assert len(tokenized) == 1 71 | assert len(tokenized[0]) == 4 72 | assert all(map(lambda t: isinstance(t, SpacyToken), tokenized[0])) 73 | 74 | 75 | def test_using_allennlp_tokens(): 76 | tokenizer = Tokenizer(TokenizerConfiguration(use_spacy_tokens=False)) 77 | tokenized = tokenizer.tokenize_text("This is a text") 78 | assert len(tokenized) == 1 79 | assert len(tokenized[0]) == 4 80 | assert all(map(lambda t: isinstance(t, AllennlpToken), tokenized[0])) 81 | 82 | 83 | def test_set_sentence_segmentation_with_max_number_of_sentences(): 84 | tokenizer = Tokenizer(TokenizerConfiguration(max_nr_of_sentences=2)) 85 | tokenized = tokenizer.tokenize_document( 86 | [ 87 | "This is a sentence. This is another sentence.", 88 | "One more sentence here.", 89 | "Last sentence here.", 90 | ] 91 | ) 92 | assert len(tokenized) == 2 93 | 94 | 95 | def test_min_max_sentence_length(): 96 | tokenizer = Tokenizer( 97 | TokenizerConfiguration( 98 | segment_sentences=True, min_sentence_length=10, max_sentence_length=15 99 | ) 100 | ) 101 | tokenized = tokenizer.tokenize_text("short. A very long sentence. 
This is fine") 102 | 103 | assert len(tokenized) == 1 104 | assert len(tokenized[0]) == 3 105 | -------------------------------------------------------------------------------- /tests/text/test_hpo.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | from ray import tune 5 | 6 | from biome.text import Pipeline 7 | from biome.text import TrainerConfiguration 8 | from biome.text import VocabularyConfiguration 9 | from biome.text.dataset import Dataset 10 | from biome.text.hpo import TuneExperiment 11 | 12 | 13 | @pytest.fixture 14 | def dataset(): 15 | return Dataset.from_dict({"text": ["a", "b"], "label": ["a", "b"]}) 16 | 17 | 18 | @pytest.fixture 19 | def pipeline_config(): 20 | return { 21 | "name": "test_ray_tune_trainable", 22 | "features": { 23 | "word": {"embedding_dim": 2}, 24 | }, 25 | "head": {"type": "TextClassification", "labels": ["a", "b"]}, 26 | } 27 | 28 | 29 | @pytest.fixture 30 | def trainer_config() -> TrainerConfiguration: 31 | return TrainerConfiguration( 32 | max_epochs=1, 33 | batch_size=2, 34 | add_wandb_logger=False, 35 | ) 36 | 37 | 38 | def test_tune_exp_default_trainable(tmp_path, dataset, pipeline_config, trainer_config): 39 | pipeline_config["features"]["word"]["embedding_dim"] = tune.choice([2, 4]) 40 | trainer_config.optimizer["lr"] = tune.loguniform(0.001, 0.01) 41 | 42 | my_exp = TuneExperiment( 43 | pipeline_config=pipeline_config, 44 | trainer_config=trainer_config, 45 | train_dataset=dataset, 46 | valid_dataset=dataset, 47 | num_samples=1, 48 | local_dir=str(tmp_path), 49 | ) 50 | 51 | assert my_exp._name.startswith("HPO on") 52 | assert my_exp.name == my_exp._name 53 | assert my_exp._run_identifier == "_default_trainable" 54 | 55 | analysis = tune.run(my_exp) 56 | assert len(analysis.trials) == 1 57 | 58 | 59 | def test_tune_exp_save_dataset_and_vocab( 60 | dataset, pipeline_config, trainer_config, monkeypatch 61 | ): 62 | pl = Pipeline.from_config(pipeline_config) 63 | 64 | my_exp = TuneExperiment( 65 | pipeline_config=pipeline_config, 66 | trainer_config=trainer_config, 67 | train_dataset=dataset, 68 | valid_dataset=dataset, 69 | ) 70 | 71 | config = my_exp.config 72 | 73 | assert dataset[:] == Dataset.load_from_disk(config["train_dataset_path"])[:] 74 | assert dataset[:] == Dataset.load_from_disk(config["valid_dataset_path"])[:] 75 | 76 | 77 | def test_tune_exp_custom_trainable( 78 | dataset, 79 | pipeline_config, 80 | trainer_config, 81 | ): 82 | def my_trainable(config): 83 | pass 84 | 85 | my_exp = TuneExperiment( 86 | pipeline_config=pipeline_config, 87 | trainer_config=trainer_config, 88 | train_dataset=dataset, 89 | valid_dataset=dataset, 90 | name="custom trainable", 91 | trainable=my_trainable, 92 | ) 93 | 94 | assert my_exp.name == "custom trainable" 95 | assert my_exp.trainable == my_trainable 96 | assert my_exp._run_identifier == "my_trainable" 97 | 98 | 99 | def test_vocab_config(tmp_path, pipeline_config, trainer_config, dataset): 100 | vocab_config = VocabularyConfiguration(max_vocab_size=1) 101 | 102 | my_exp = TuneExperiment( 103 | pipeline_config=pipeline_config, 104 | trainer_config=trainer_config, 105 | train_dataset=dataset, 106 | valid_dataset=dataset, 107 | vocab_config=vocab_config, 108 | name="test_vocab_config", 109 | local_dir=str(tmp_path), 110 | ) 111 | 112 | analysis = tune.run(my_exp) 113 | pl = Pipeline.from_pretrained( 114 | Path(analysis.get_best_logdir("validation_loss", "min")) 115 | / "output" 116 | / "model.tar.gz" 117 | ) 118 | 
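# max_vocab_size=1 should keep a single word token, plus the default padding and OOV entries (1 + 2 = 3)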
119 | assert pl.vocab.get_vocab_size("word") == 3 120 | -------------------------------------------------------------------------------- /tests/text/modules/heads/classification/test_text_classification.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from allennlp.data import Batch 3 | from numpy.testing import assert_allclose 4 | 5 | from biome.text import Pipeline 6 | from biome.text.modules.heads.task_prediction import Attribution 7 | from biome.text.modules.heads.task_prediction import TextClassificationPrediction 8 | 9 | 10 | @pytest.fixture 11 | def pipeline() -> Pipeline: 12 | labels = ["a", "b", "c", "d", "e", "f"] 13 | return Pipeline.from_config( 14 | { 15 | "name": "test_text_classification", 16 | "head": {"type": "TextClassification", "labels": labels, "dropout": 0.1}, 17 | } 18 | ) 19 | 20 | 21 | def test_make_task_prediction(pipeline): 22 | instance = pipeline.head.featurize("test this sentence") 23 | forward_output = pipeline._model.forward_on_instances([instance]) 24 | 25 | prediction = pipeline.head._make_task_prediction(forward_output[0], None) 26 | 27 | assert isinstance(prediction, TextClassificationPrediction) 28 | assert isinstance(prediction.labels, list) and isinstance( 29 | prediction.probabilities, list 30 | ) 31 | assert len(prediction.labels) == len(prediction.probabilities) == 6 32 | # check descending order 33 | assert_allclose( 34 | sorted(prediction.probabilities, reverse=True), prediction.probabilities 35 | ) 36 | assert all([isinstance(label, str) for label in prediction.labels]) 37 | assert set(pipeline.head.labels) == set(prediction.labels) 38 | assert all([isinstance(prob, float) for prob in prediction.probabilities]) 39 | 40 | 41 | def test_compute_attributions(pipeline): 42 | instance = pipeline.head.featurize("test this sentence") 43 | pipeline.model.eval() 44 | forward_output = pipeline.model.forward_on_instances([instance]) 45 | 46 | attributions = pipeline.head._compute_attributions( 47 | forward_output[0], instance, n_steps=1 48 | ) 49 | 50 | assert all([isinstance(attribution, Attribution) for attribution in attributions]) 51 | assert len(attributions) == 3 52 | assert all([attr.field == "text" for attr in attributions]) 53 | assert all([isinstance(attr.attribution, float) for attr in attributions]) 54 | assert attributions[1].start == 5 and attributions[1].end == 9 55 | 56 | 57 | def test_metrics(pipeline): 58 | instance = pipeline.head.featurize(text="test this", label="a") 59 | batch = Batch([instance]) 60 | batch.index_instances(pipeline.vocab) 61 | 62 | pipeline.head.forward(**batch.as_tensor_dict()) 63 | # validation metric should have never been called 64 | assert pipeline.head._metrics.get_dict()["accuracy"].total_count == 1 65 | assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 0 66 | 67 | train_metrics = pipeline.head.get_metrics(reset=True) 68 | expected_metric_names = ( 69 | ["accuracy"] 70 | + [ 71 | f"{label}/{metric}" 72 | for label in ["micro", "macro"] 73 | for metric in ["precision", "recall", "fscore"] 74 | ] 75 | + [ 76 | f"_{metric}/{label}" 77 | for metric in ["precision", "recall", "fscore"] 78 | for label in ["a", "b", "c", "d", "e", "f"] 79 | ] 80 | ) 81 | assert all(name in train_metrics for name in expected_metric_names) 82 | 83 | pipeline.head.training = False 84 | pipeline.head.forward(**batch.as_tensor_dict()) 85 | # training metric should have never been called after its reset 86 | assert 
pipeline.head._metrics.get_dict()["accuracy"].total_count == 0 87 | assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 1 88 | 89 | valid_metrics = pipeline.head.get_metrics() 90 | assert all(name in valid_metrics for name in expected_metric_names) 91 | -------------------------------------------------------------------------------- /docs/docs/documentation/community/3-developer_guides.md: -------------------------------------------------------------------------------- 1 | # Developer guides 2 | 3 | ## Setting up for development 4 | To set up your system for *biome.text* development, you first of all have to [fork](https://guides.github.com/activities/forking/) 5 | our repository and clone your fork to your computer: 6 | 7 | ````shell script 8 | git clone https://github.com/[your-github-username]/biome-text.git 9 | cd biome-text 10 | ```` 11 | 12 | To keep your fork's master branch up to date with our repo you should add it as an [upstream remote branch](https://dev.to/louhayes3/git-add-an-upstream-to-a-forked-repo-1mik): 13 | 14 | ````shell script 15 | git remote add upstream https://github.com/recognai/biome-text.git 16 | ```` 17 | 18 | Now go ahead and create a new conda environment in which the development will take place and activate it: 19 | 20 | ````shell script 21 | conda env create -f environment_dev.yml 22 | conda activate biometext 23 | ```` 24 | 25 | Once you activated the conda environment, it is time to install *biome.text* in editable mode with all its development dependencies. 26 | The best way to do this is to take advantage of the make directive: 27 | 28 | ````shell script 29 | make dev 30 | ```` 31 | 32 | After installing *biome.text*, the best way to test your installation is by running the *biome.text* cli command: 33 | 34 | ```shell script 35 | biome --help 36 | ``` 37 | 38 | ### Running tests locally 39 | 40 | *Biome.text* uses [pytest](https://docs.pytest.org/en/latest/) for its unit and integration tests. 41 | If you are working on the code base we advise you to run our tests locally before submitting a Pull Request (see below) to make sure your changes did not break and existing functionality. 42 | To achieve this you can simply run: 43 | 44 | ````shell script 45 | make test 46 | ```` 47 | 48 | If you open a Pull Request, the test suite will be run automatically via a GitHub Action. 49 | 50 | ### Serving docs locally 51 | 52 | If you are working on the documentation and want to check out the results locally on your machine, you can simply run: 53 | 54 | ````shell script 55 | make docs 56 | ```` 57 | 58 | The docs will be built and deployed automatically via a GitHub Action when our master branch is updated. 59 | If for some reason you want to build them locally, you can do so with: 60 | 61 | ````shell script 62 | make build_docs 63 | ```` 64 | 65 | ## Make a release 66 | 67 | To make a release you have to follow 4 steps: 68 | 69 | 1. Run the `prepare_versioned_build.sh` script inside the `docs` folder and commit the changes to the master branch. 70 | The commit message should say something like: "v2.2.0 release". 71 | 72 | 2. Create a new [GitHub release](https://docs.github.com/en/free-pro-team@latest/github/administering-a-repository/managing-releases-in-a-repository#creating-a-release). 73 | 74 | The version tags should be `v1.1.0` or for release candidates `v1.1.0rc1`. 75 | Major and minor releases should always be made against the master branch, bugfix releases against the corresponding minor release tag. 
76 | 77 | After publishing the release, the CI is triggered and if everything goes well the release gets published on PyPi. 78 | The CI does: 79 | - run tests & build docs 80 | - build package 81 | - upload to testpypi 82 | - install from testpypi 83 | - upload to pypi 84 | 85 | 3. Revert the last commit in which you changed the docs, the commit message should read something like: 86 | "back to master release". 87 | 88 | 4. **Docs**: In order for the Algolia Search to work, you need to add the new version number of the docs to our 89 | algolia [config file](https://github.com/algolia/docsearch-configs/blob/master/configs/recogn_biome-text.json) and submit a PR. 90 | 91 | 92 | Under the hood the versioning of our package is managed by [`setuptools_scm`](https://github.com/pypa/setuptools_scm), 93 | that basically works with the git tags in a repo. 94 | -------------------------------------------------------------------------------- /tests/docs/test_tutorials.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | from pytest_notebook.nb_regression import NBRegressionFixture 5 | from pytest_notebook.notebook import dump_notebook 6 | from pytest_notebook.notebook import load_notebook 7 | 8 | pytestmark = pytest.mark.skip( 9 | reason="The pytest-notebook package is not actively maintained and " 10 | "the tutorial tests are quite heavy on resources. " 11 | "The idea is to run those tests locally and manually from time to time." 12 | "THESE TESTS ARE ALSO OUT OF DATE ... :/" 13 | ) 14 | 15 | 16 | def test_text_classifier_tutorial(tmp_path, tutorials_path): 17 | notebook_path = tutorials_path / "Training_a_text_classifier.ipynb" 18 | 19 | # adapt notebook to CI (make its execution quicker + comment lines) 20 | notebook = load_notebook(str(notebook_path)) 21 | for cell in notebook["cells"]: 22 | if cell["source"].startswith("!pip install"): 23 | cell["source"] = re.sub(r"!pip install", r"#!pip install", cell["source"]) 24 | if cell["source"].startswith("trainer_config ="): 25 | cell["source"] = re.sub( 26 | r"num_epochs=[0-9][0-9]?", r"num_epochs=1", cell["source"] 27 | ) 28 | if cell["source"].startswith("pl.train("): 29 | cell["source"] = re.sub( 30 | r"training=train_ds", r"training=valid_ds", cell["source"] 31 | ) 32 | # dump adapted notebook 33 | mod_notebook_path = tmp_path / notebook_path.name 34 | with mod_notebook_path.open("w") as file: 35 | file.write(str(dump_notebook(notebook))) 36 | 37 | # test adapted notebook 38 | fixture = NBRegressionFixture(exec_timeout=100) 39 | fixture.check(str(mod_notebook_path)) 40 | 41 | 42 | def test_slot_filling_tutorial(tmp_path, tutorials_path): 43 | notebook_path = tutorials_path / "Training_a_sequence_tagger_for_Slot_Filling.ipynb" 44 | 45 | # adapt notebook to CI (make its execution quicker + comment lines) 46 | notebook = load_notebook(str(notebook_path)) 47 | for cell in notebook["cells"]: 48 | if cell["source"].startswith("!pip install"): 49 | cell["source"] = re.sub(r"!pip install", r"#!pip install", cell["source"]) 50 | if cell["source"].startswith( 51 | "from biome.text.configuration import FeaturesConfiguration" 52 | ): 53 | cell["source"] = re.sub( 54 | r"https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip", 55 | r"https://biome-tutorials-data.s3-eu-west-1.amazonaws.com/token_classifier/wiki-news-300d-1M.head.vec", 56 | cell["source"], 57 | ) 58 | if cell["source"].startswith("trainer_config ="): 59 | cell["source"] = re.sub( 60 | 
r"TrainerConfiguration\(\)", 61 | r"TrainerConfiguration(num_epochs=1)", 62 | cell["source"], 63 | ) 64 | if cell["source"].startswith("pl.train("): 65 | cell["source"] = re.sub( 66 | r"pl.train", 67 | r"from biome.text.configuration import TrainerConfiguration\npl.train", 68 | cell["source"], 69 | ) 70 | cell["source"] = re.sub( 71 | r"training=train_ds", 72 | r"training=valid_ds", 73 | cell["source"], 74 | ) 75 | cell["source"] = re.sub( 76 | r"test=test_ds,", 77 | r"test=test_ds, trainer=TrainerConfiguration(num_epochs=1)", 78 | cell["source"], 79 | ) 80 | 81 | # dump adapted notebook 82 | mod_notebook_path = tmp_path / notebook_path.name 83 | with mod_notebook_path.open("w") as file: 84 | file.write(str(dump_notebook(notebook))) 85 | 86 | # test adapted notebook 87 | fixture = NBRegressionFixture(exec_timeout=200) 88 | fixture.check(str(mod_notebook_path)) 89 | -------------------------------------------------------------------------------- /tests/text/test_pretrained_word_vectors.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import torch 5 | from torch.testing import assert_allclose 6 | 7 | from biome.text import Dataset 8 | from biome.text import Pipeline 9 | from biome.text import Trainer 10 | from biome.text import TrainerConfiguration 11 | 12 | 13 | @pytest.fixture 14 | def pretrained_word_vectors(tmp_path) -> Path: 15 | file_path = tmp_path / "pretrained_word_vectors.txt" 16 | file_path.write_text("2 2\ntest 0.66 0.33\nthis 0.25 0.75") 17 | 18 | return file_path 19 | 20 | 21 | @pytest.fixture 22 | def dataset() -> Dataset: 23 | data = {"text": ["test"], "label": ["good"]} 24 | return Dataset.from_dict(data) 25 | 26 | 27 | @pytest.fixture 28 | def dataset2() -> Dataset: 29 | data = {"text": ["this"], "label": ["good"]} 30 | return Dataset.from_dict(data) 31 | 32 | 33 | @pytest.fixture 34 | def pipeline_config(pretrained_word_vectors) -> dict: 35 | config = { 36 | "name": "pretrained_word_vectors_test", 37 | "features": { 38 | "word": { 39 | "embedding_dim": 2, 40 | "weights_file": str(pretrained_word_vectors.absolute()), 41 | } 42 | }, 43 | "head": {"type": "TextClassification", "labels": ["good"]}, 44 | } 45 | return config 46 | 47 | 48 | def test_create_pipeline_with_weights_file(pipeline_config, dataset, tmp_path): 49 | pipeline = Pipeline.from_config(pipeline_config) 50 | 51 | output = tmp_path / "pretrained_word_vector_output" 52 | trainer = Trainer( 53 | pipeline=pipeline, 54 | train_dataset=dataset, 55 | trainer_config=TrainerConfiguration(max_epochs=1, gpus=0), 56 | ) 57 | trainer.fit(output) 58 | 59 | instance = pipeline.head.featurize("test") 60 | instance.index_fields(pipeline.vocab) 61 | 62 | assert_allclose( 63 | pipeline.backbone.embedder(instance.as_tensor_dict()["text"], 0), 64 | torch.tensor([[0.66, 0.33]]), 65 | ) 66 | 67 | # Loading a pretrained model without the weights file should work 68 | Path(pipeline_config["features"]["word"]["weights_file"]).unlink() 69 | assert isinstance(Pipeline.from_pretrained(str(output / "model.tar.gz")), Pipeline) 70 | 71 | 72 | def test_extending_vocab_with_weights_file( 73 | pipeline_config, dataset, dataset2, capsys, caplog 74 | ): 75 | pipeline = Pipeline.from_config(pipeline_config) 76 | # create vocab 77 | pipeline.create_vocab([dataset.to_instances(pipeline)]) 78 | 79 | # extending the vocab with the weights file available should apply the pretrained weights 80 | pipeline.create_vocab([dataset2.to_instances(pipeline)]) 81 | 82 | 
instance = pipeline.head.featurize("this") 83 | instance.index_fields(pipeline.vocab) 84 | 85 | assert_allclose( 86 | pipeline.backbone.embedder(instance.as_tensor_dict()["text"]), 87 | torch.tensor([[0.25, 0.75]]), 88 | ) 89 | 90 | # extending the vocab with the weights file deleted should trigger a warning 91 | Path(pipeline_config["features"]["word"]["weights_file"]).unlink() 92 | ds = Dataset.from_dict({"text": ["that"], "label": ["good"]}) 93 | pipeline.create_vocab([ds.to_instances(pipeline)]) 94 | 95 | assert caplog.record_tuples[-1][0] == "allennlp.modules.token_embedders.embedding" 96 | assert caplog.record_tuples[-1][1] == 30 97 | assert ( 98 | "Embedding at model_path, " 99 | "_head.backbone.embedder.token_embedder_word cannot locate the pretrained_file." 100 | in caplog.record_tuples[-1][2] 101 | ) 102 | 103 | 104 | def test_raise_filenotfound_error(pipeline_config, dataset): 105 | Path(pipeline_config["features"]["word"]["weights_file"]).unlink() 106 | pipeline = Pipeline.from_config(pipeline_config) 107 | 108 | with pytest.raises(FileNotFoundError): 109 | pipeline.create_vocab([dataset.to_instances(pipeline)]) 110 | -------------------------------------------------------------------------------- /tests/text/modules/heads/test_task_head.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from allennlp.data import Instance 3 | from allennlp.data.fields import ListField 4 | from allennlp.data.fields import TextField 5 | from spacy.tokenizer import Tokenizer 6 | from spacy.vocab import Vocab 7 | 8 | from biome.text.configuration import PredictionConfiguration 9 | from biome.text.helpers import spacy_to_allennlp_token 10 | from biome.text.modules.heads import TaskHead 11 | from biome.text.modules.heads import TaskPrediction 12 | from biome.text.modules.heads.task_prediction import Token 13 | 14 | 15 | @pytest.fixture 16 | def task_head() -> TaskHead: 17 | return TaskHead(backbone="mock_backbone") 18 | 19 | 20 | def test_prediction_not_implemented(task_head): 21 | with pytest.raises(NotImplementedError): 22 | task_head.make_task_prediction("mock", "mock", "mock") 23 | 24 | 25 | def test_attributions_not_implemented(task_head, monkeypatch): 26 | def mock_make_task_prediction(*args, **kwargs): 27 | return TaskPrediction() 28 | 29 | monkeypatch.setattr(task_head, "_make_task_prediction", mock_make_task_prediction) 30 | 31 | with pytest.raises(NotImplementedError): 32 | task_head.make_task_prediction( 33 | "mock", "mock", PredictionConfiguration(add_attributions=True) 34 | ) 35 | 36 | 37 | def test_make_task_prediction(monkeypatch, task_head): 38 | def mock_make_task_prediction(*args, **kwargs): 39 | return TaskPrediction() 40 | 41 | def mock_compute_attributions(*args, **kwargs): 42 | return kwargs 43 | 44 | def mock_extract_tokens(*args, **kwargs): 45 | return "tokens" 46 | 47 | monkeypatch.setattr(task_head, "_make_task_prediction", mock_make_task_prediction) 48 | monkeypatch.setattr(task_head, "_compute_attributions", mock_compute_attributions) 49 | monkeypatch.setattr(task_head, "_extract_tokens", mock_extract_tokens) 50 | 51 | prediction = task_head.make_task_prediction( 52 | "mock_forward_output", 53 | "mock_instance", 54 | PredictionConfiguration( 55 | add_tokens=True, 56 | add_attributions=True, 57 | attributions_kwargs={"test": "kwarg"}, 58 | ), 59 | ) 60 | 61 | assert isinstance(prediction, TaskPrediction) 62 | assert hasattr(prediction, "tokens") and hasattr(prediction, "attributions") 63 | assert prediction.tokens == 
"tokens" 64 | assert prediction.attributions == {"test": "kwarg"} 65 | 66 | 67 | @pytest.mark.parametrize("allennlp_tokens", [False, True]) 68 | def test_extract_tokens(task_head, allennlp_tokens): 69 | tokenizer = Tokenizer(Vocab()) 70 | input_tokens = list(tokenizer("test this sentence.")) 71 | if allennlp_tokens: 72 | input_tokens = [spacy_to_allennlp_token(tok) for tok in input_tokens] 73 | 74 | tf = TextField(input_tokens, None) 75 | instance = Instance({"test": tf}) 76 | 77 | tokens = task_head._extract_tokens(instance) 78 | 79 | assert all([isinstance(tok, Token) for tok in tokens]) 80 | assert all(itok.text == otok.text for itok, otok in zip(input_tokens, tokens)) 81 | assert all(itok.idx == otok.start for itok, otok in zip(input_tokens, tokens)) 82 | if allennlp_tokens: 83 | assert all(itok.idx_end == otok.end for itok, otok in zip(input_tokens, tokens)) 84 | else: 85 | assert all( 86 | itok.idx + len(itok.text) == otok.end 87 | for itok, otok in zip(input_tokens, tokens) 88 | ) 89 | assert all([tok.field == "test" for tok in tokens]) 90 | 91 | 92 | def test_extract_tokens_listfield(task_head): 93 | tokenizer = Tokenizer(Vocab()) 94 | input_tokens = list(tokenizer("test this sentence.")) 95 | 96 | tf = TextField(input_tokens, None) 97 | instance = Instance({"test": ListField([tf, tf])}) 98 | 99 | tokens = task_head._extract_tokens(instance) 100 | 101 | assert len(tokens) == 2 and len(tokens[0]) == 3 and len(tokens[1]) == 3 102 | assert all( 103 | [all([isinstance(tok, Token) for tok in tf_tokens] for tf_tokens in tokens)] 104 | ) 105 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/Navbar.vue: -------------------------------------------------------------------------------- 1 | 38 | 39 | 94 | 95 | 141 | -------------------------------------------------------------------------------- /tests/text/test_pipeline_vocab.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from biome.text import Dataset 4 | from biome.text import Pipeline 5 | from biome.text import Trainer 6 | from biome.text import VocabularyConfiguration 7 | from biome.text.errors import EmptyVocabError 8 | from biome.text.features import CharFeatures 9 | from biome.text.features import TransformersFeatures 10 | from biome.text.features import WordFeatures 11 | 12 | 13 | @pytest.fixture 14 | def pipeline(): 15 | config = { 16 | "name": "vocab_test", 17 | "features": { 18 | "transformers": {"model_name": "sshleifer/tiny-distilbert-base-cased"}, 19 | "word": {"embedding_dim": 2}, 20 | "char": { 21 | "embedding_dim": 2, 22 | "dropout": 0.1, 23 | "encoder": { 24 | "type": "gru", 25 | "hidden_size": 2, 26 | "num_layers": 1, 27 | "bidirectional": False, 28 | }, 29 | }, 30 | }, 31 | "head": { 32 | "type": "TextClassification", 33 | "labels": ["good", "bad"], 34 | }, 35 | } 36 | 37 | return Pipeline.from_config(config) 38 | 39 | 40 | @pytest.fixture 41 | def train_dataset(): 42 | data = {"text": ["this is a test", "and another one"], "label": ["good", "bad"]} 43 | return Dataset.from_dict(data) 44 | 45 | 46 | @pytest.fixture 47 | def valid_dataset(): 48 | data = { 49 | "text": ["and what about the validation", "do not forget this one"], 50 | "label": ["bad", "good"], 51 | } 52 | return Dataset.from_dict(data) 53 | 54 | 55 | def test_default_vocab(pipeline, train_dataset, valid_dataset): 56 | # Transformer vocab is added on pipeline creation 57 | assert 
pipeline.vocab.get_vocab_size(TransformersFeatures.namespace) == 28996 58 | # While word and char vocab should be empty (except for the oov and padding token) 59 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 2 60 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 2 61 | 62 | # Training should build a default vocab with only the training dataset 63 | Trainer(pipeline, train_dataset=train_dataset) 64 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 9 65 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 12 66 | assert pipeline.vocab.get_vocab_size(TransformersFeatures.namespace) == 28996 67 | 68 | # Pretrained pipelines should extend the vocab by default 69 | Trainer(pipeline, train_dataset=valid_dataset) 70 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 16 71 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 19 72 | assert pipeline.vocab.get_vocab_size(TransformersFeatures.namespace) == 28996 73 | 74 | 75 | def test_specific_vocab_config(pipeline, train_dataset, valid_dataset): 76 | vocab_config = VocabularyConfiguration(include_valid_data=True) 77 | 78 | Trainer( 79 | pipeline, 80 | train_dataset=train_dataset, 81 | valid_dataset=valid_dataset, 82 | vocab_config=vocab_config, 83 | ) 84 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 16 85 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 19 86 | assert pipeline.vocab.get_vocab_size(TransformersFeatures.namespace) == 28996 87 | 88 | 89 | def test_not_touching_vocab(pipeline, train_dataset, valid_dataset): 90 | # vocab_config=None leaves the pipeline's vocab empty from an unpretrained pipeline 91 | with pytest.raises(EmptyVocabError): 92 | Trainer(pipeline, train_dataset=train_dataset, vocab_config=None) 93 | 94 | # vocab_config=None should not extend the vocab for a pretrained pipeline 95 | Trainer(pipeline, train_dataset=train_dataset) 96 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 9 97 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 12 98 | Trainer(pipeline, train_dataset=valid_dataset, vocab_config=None) 99 | assert pipeline.vocab.get_vocab_size(WordFeatures.namespace) == 9 100 | assert pipeline.vocab.get_vocab_size(CharFeatures.namespace) == 12 101 | -------------------------------------------------------------------------------- /docs/docs/documentation/user-guides/1-nlp-tasks.md: -------------------------------------------------------------------------------- 1 | # NLP Tasks 2 | 3 | In *biome.text* NLP tasks are defined via ``TaskHead`` classes. 4 | 5 | This section gives a summary of the library's main heads and tasks. 6 | 7 | ## TextClassification 8 | 9 | **Tutorials**: [Training a short text classifier of German business names](../tutorials/Training_a_text_classifier.md) 10 | 11 | **NLP tasks**: text classification, sentiment analysis, entity typing, relation classification. 12 | 13 | **Input**: `text`: a single field or a concatenation of input fields. 14 | 15 | **Output**: `label` by default, a probability distribution over labels except if `multilabel` is enabled for multi-label classification problems. 16 | 17 | **Main parameters**: 18 | 19 | `pooler`: a `Seq2VecEncoderConfiguration` to pool a sequence of encoded word/char vectors into a single vector representing the input text 20 | 21 | 22 | See [TextClassification API](../../api/biome/text/modules/heads/classification/text_classification.md#textclassification) for more details. 
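For orientation, a minimal pipeline configuration for this head might look like the sketch below; the pipeline name, labels, embedding size and GRU pooler are illustrative values, not recommended settings:

```python
from biome.text import Pipeline

# Sketch of a TextClassification pipeline; all names and sizes are illustrative.
pipeline = Pipeline.from_config(
    {
        "name": "my_text_classifier",
        "features": {"word": {"embedding_dim": 16}},
        "head": {
            "type": "TextClassification",
            "labels": ["positive", "negative"],
            "pooler": {
                "type": "gru",
                "num_layers": 1,
                "hidden_size": 16,
                "bidirectional": True,
            },
        },
    }
)

prediction = pipeline.predict(text="A text to classify")
```

The `pooler` entry corresponds to the `Seq2VecEncoderConfiguration` mentioned above.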
23 | 24 | ## RecordClassification 25 | 26 | **NLP tasks**: text classification, sentiment analysis, entity typing, relation classification and semi-structured data classification problems with product, customer data, etc. 27 | 28 | **Input**: `document`: a list of fields. 29 | 30 | **Output**: `labels` by default, a probability distribution over labels except if `multilabel` is enabled for multi-label classification problems. 31 | 32 | **Main parameters**: 33 | 34 | `record_keys`: field keys to be used as input features to the model, e.g., name, first_name, body, subject, etc. 35 | 36 | `tokens_pooler`: a `Seq2VecEncoderConfiguration` to pool a sequence of encoded word/char vectors **for each field** into a single vector representing the field. 37 | 38 | `fields_encoder`: a `Seq2SeqEncoderConfiguration` to encode a sequence of field vectors. 39 | 40 | `fields_pooler`: a `Seq2VecEncoderConfiguration` to pool a sequence of encoded field vectors into a single vector representing the whole document/record. 41 | 42 | See [RecordClassification API](../../api/biome/text/modules/heads/classification/record_classification.md#recordclassification) for more details. 43 | 44 | ## RecordPairClassification 45 | 46 | **NLP tasks**: Classify the relation between a pair of structured data. For example, do two sets of customer data belong to the same customer or not. 47 | 48 | **Input**: `record1`, `record2`. Two dictionaries that should share the same keys, preferably in the same order. 49 | 50 | **Output**: `labels`. By default, a probability distribution over labels except if `multilabel` is enabled for multi-label classification problems. 51 | 52 | **Main parameters**: 53 | 54 | `field_encoder`: A `Seq2VecEncoder` to encode and pool the single dictionary items of both inputs. It takes both, the key and the value, into account. 55 | 56 | `record_encoder`: A `Seq2SeqEncoder` to contextualize the encoded dictionary items within its record. 57 | 58 | `matcher_forward`: A `BiMPMMatching` layer for the (optionally only forward) record encoder layer. 59 | 60 | `aggregator`: A `Seq2VecEncoder` to pool the output of the matching layers. 61 | 62 | See the [RecordPairClassification API](../../api/biome/text/modules/heads/classification/record_pair_classification.md) for more details. 63 | 64 | ## TokenClassification 65 | 66 | **Tutorials**: [Training a sequence tagger for Slot Filling](../tutorials/Training_a_sequence_tagger_for_Slot_Filling.md) 67 | 68 | **NLP tasks**: NER, Slot filling, Part of speech tagging. 69 | 70 | **Input**: `text`: **pretokenized text** as a list of tokens. 71 | 72 | **Output**: `labels`: one label for each token according to the `label_encoding` scheme defined in the head (e.g., BIO). 73 | 74 | **Main parameters**: 75 | 76 | `feedforward`: feed-forward layer to be applied after token encoding. 77 | 78 | See [TokenClassification API](../../api/biome/text/modules/heads/token_classification.md#tokenclassification) for more details. 79 | 80 | ## LanguageModelling 81 | 82 | **NLP tasks**: Pre-training, word-level next token language model. 83 | 84 | **Input**: `text`: a single field or a concatenation of input fields. 85 | 86 | **Output**: contextualized word vectors. 87 | 88 | **Main parameters**: 89 | 90 | `dropout` to be applied after token encoding. 91 | 92 | 93 | See [LanguageModelling API](../../api/biome/text/modules/heads/language_modelling.md#languagemodelling) for more details. 
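Whichever head you choose, the surrounding workflow is the same: build a `Pipeline` from a configuration, pass it to a `Trainer` together with your datasets, and call `fit`. The following is a minimal sketch with a tiny in-memory dataset and toy hyperparameters, only meant to illustrate the call pattern:

```python
from biome.text import Dataset
from biome.text import Pipeline
from biome.text import Trainer
from biome.text import TrainerConfiguration

# Toy in-memory dataset, for illustration only
train_ds = Dataset.from_dict(
    {"text": ["good service", "terrible support"], "label": ["positive", "negative"]}
)

pipeline = Pipeline.from_config(
    {
        "name": "quickstart",
        "features": {"word": {"embedding_dim": 16}},
        "head": {"type": "TextClassification", "labels": ["positive", "negative"]},
    }
)

trainer = Trainer(
    pipeline=pipeline,
    train_dataset=train_ds,
    trainer_config=TrainerConfiguration(max_epochs=1, batch_size=2),
)
trainer.fit(output_dir="output")  # saves model.tar.gz into the output directory
```

From there, `Pipeline.from_pretrained("output/model.tar.gz")` reloads the trained model for inference.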
94 | -------------------------------------------------------------------------------- /src/biome/text/modules/heads/classification/record_classification.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from typing import List 3 | from typing import Optional 4 | from typing import Union 5 | 6 | import numpy 7 | from allennlp.data import Instance 8 | 9 | from biome.text.backbone import ModelBackbone 10 | from biome.text.modules.configuration import ComponentConfiguration 11 | from biome.text.modules.configuration import FeedForwardConfiguration 12 | from biome.text.modules.configuration import Seq2SeqEncoderConfiguration 13 | from biome.text.modules.configuration import Seq2VecEncoderConfiguration 14 | from biome.text.modules.heads import DocumentClassification 15 | from biome.text.modules.heads.task_prediction import RecordClassificationPrediction 16 | 17 | 18 | class RecordClassification(DocumentClassification): 19 | """ 20 | Task head for data record classification. 21 | Accepts a variable data inputs and apply featuring over defined record keys. 22 | 23 | This head applies a doc2vec architecture from a structured record data input 24 | 25 | Parameters 26 | ---------- 27 | backbone 28 | The backbone of your model. Must not be provided when initiating with `Pipeline.from_config`. 29 | labels 30 | A list of labels for your classification task. 31 | token_pooler 32 | The pooler at token level to provide one vector per record field. Default: `BagOfEmbeddingsEncoder`. 33 | fields_encoder 34 | An optional sequence to sequence encoder that contextualizes the record field representations. Default: None. 35 | fields_pooler 36 | The pooler at sentence level to provide a vector for the whole record. Default: `BagOfEmbeddingsEncoder`. 37 | feedforward 38 | An optional feedforward layer applied to the output of the fields pooler. Default: None. 39 | multilabel 40 | Is this a multi label classification task? Default: False 41 | label_weights 42 | A list of weights for each label. The weights must be in the same order as the `labels`. 43 | You can also provide a dictionary that maps the label to its weight. Default: None. 
44 | """ 45 | 46 | def __init__( 47 | self, 48 | backbone: ModelBackbone, 49 | labels: List[str], 50 | record_keys: List[str], 51 | token_pooler: Optional[Seq2VecEncoderConfiguration] = None, 52 | fields_encoder: Optional[Seq2SeqEncoderConfiguration] = None, 53 | fields_pooler: Optional[Seq2VecEncoderConfiguration] = None, 54 | feedforward: Optional[FeedForwardConfiguration] = None, 55 | multilabel: Optional[bool] = False, 56 | label_weights: Optional[Union[List[float], Dict[str, float]]] = None, 57 | ) -> None: 58 | 59 | super().__init__( 60 | backbone, 61 | labels=labels, 62 | token_pooler=token_pooler, 63 | sentence_encoder=fields_encoder, 64 | sentence_pooler=fields_pooler, 65 | feedforward=feedforward, 66 | multilabel=multilabel, 67 | label_weights=label_weights, 68 | ) 69 | 70 | self._empty_prediction = RecordClassificationPrediction( 71 | labels=[], probabilities=[] 72 | ) 73 | 74 | self._inputs = record_keys 75 | 76 | def inputs(self) -> Optional[List[str]]: 77 | """The inputs names are determined by configured record keys""" 78 | return self._inputs 79 | 80 | def featurize( 81 | self, label: Optional[Union[str, List[str]]] = None, **inputs 82 | ) -> Instance: 83 | 84 | record = {input_key: inputs[input_key] for input_key in self._inputs} 85 | instance = self.backbone.featurizer(record, to_field=self.forward_arg_name) 86 | 87 | return self._add_label(instance, label) 88 | 89 | def _make_task_prediction( 90 | self, 91 | single_forward_output: Dict[str, numpy.ndarray], 92 | instance: Instance, 93 | ) -> RecordClassificationPrediction: 94 | labels, probabilities = self._compute_labels_and_probabilities( 95 | single_forward_output 96 | ) 97 | 98 | return RecordClassificationPrediction( 99 | labels=labels, probabilities=probabilities 100 | ) 101 | 102 | 103 | class RecordClassificationConfiguration(ComponentConfiguration[RecordClassification]): 104 | """Lazy initialization for document classification head components""" 105 | 106 | pass 107 | -------------------------------------------------------------------------------- /tests/text_classification_integration_test.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import pytest 4 | from pytorch_lightning import seed_everything 5 | 6 | from biome.text import Dataset 7 | from biome.text import Pipeline 8 | from biome.text import Trainer 9 | from biome.text import VocabularyConfiguration 10 | from biome.text.configuration import CharFeatures 11 | from biome.text.configuration import TrainerConfiguration 12 | from biome.text.configuration import WordFeatures 13 | 14 | 15 | @pytest.fixture 16 | def train_valid_dataset(resources_data_path) -> Tuple[Dataset, Dataset]: 17 | """Returns both training and validation datasets""" 18 | 19 | training_ds = Dataset.from_csv( 20 | paths=str(resources_data_path / "business.cat.2k.train.csv") 21 | ) 22 | validation_ds = Dataset.from_csv( 23 | paths=str(resources_data_path / "business.cat.2k.valid.csv") 24 | ) 25 | 26 | return training_ds, validation_ds 27 | 28 | 29 | @pytest.fixture 30 | def pipeline_dict() -> dict: 31 | return { 32 | "name": "german_business_names", 33 | "features": { 34 | "word": {"embedding_dim": 16, "lowercase_tokens": True}, 35 | "char": { 36 | "embedding_dim": 16, 37 | "encoder": { 38 | "type": "gru", 39 | "num_layers": 1, 40 | "hidden_size": 32, 41 | "bidirectional": True, 42 | }, 43 | "dropout": 0.1, 44 | }, 45 | }, 46 | "head": { 47 | "type": "TextClassification", 48 | "labels": [ 49 | "Unternehmensberatungen", 
50 | "Friseure", 51 | "Tiefbau", 52 | "Dienstleistungen", 53 | "Gebrauchtwagen", 54 | "Restaurants", 55 | "Architekturbüros", 56 | "Elektriker", 57 | "Vereine", 58 | "Versicherungsvermittler", 59 | "Sanitärinstallationen", 60 | "Edv", 61 | "Maler", 62 | "Physiotherapie", 63 | "Werbeagenturen", 64 | "Apotheken", 65 | "Vermittlungen", 66 | "Hotels", 67 | "Autowerkstätten", 68 | "Elektrotechnik", 69 | "Allgemeinärzte", 70 | "Handelsvermittler Und -vertreter", 71 | ], 72 | "pooler": { 73 | "type": "gru", 74 | "num_layers": 1, 75 | "hidden_size": 16, 76 | "bidirectional": True, 77 | }, 78 | "feedforward": { 79 | "num_layers": 1, 80 | "hidden_dims": [16], 81 | "activations": ["relu"], 82 | "dropout": [0.1], 83 | }, 84 | }, 85 | } 86 | 87 | 88 | def test_text_classification(tmp_path, pipeline_dict, train_valid_dataset): 89 | """Apart from a well specified training, this also tests the vocab creation!""" 90 | seed_everything(43) 91 | 92 | pl = Pipeline.from_config(pipeline_dict) 93 | train_ds = train_valid_dataset[0] 94 | valid_ds = train_valid_dataset[1] 95 | 96 | vocab_config = VocabularyConfiguration(max_vocab_size={"word": 50}) 97 | trainer_config = TrainerConfiguration( 98 | batch_size=64, 99 | optimizer={"type": "adam", "lr": 0.01}, 100 | max_epochs=5, 101 | default_root_dir=str(tmp_path), 102 | gpus=0, # turn off gpus even if available 103 | ) 104 | 105 | trainer = Trainer( 106 | pipeline=pl, 107 | train_dataset=train_ds, 108 | valid_dataset=valid_ds, 109 | trainer_config=trainer_config, 110 | vocab_config=vocab_config, 111 | ) 112 | 113 | trainer.fit(tmp_path / "output") 114 | 115 | assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52 116 | assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83 117 | 118 | assert pl.num_trainable_parameters == 22070 119 | 120 | evaluation = trainer.test(valid_ds, batch_size=16) 121 | 122 | # Reminder: the value depends on the batch_size! 
123 | assert evaluation["test_loss"] == pytest.approx(0.7404146790504456, abs=0.003) 124 | 125 | Pipeline.from_pretrained(str(tmp_path / "output" / "model.tar.gz")) 126 | 127 | assert pl.vocab.get_vocab_size(WordFeatures.namespace) == 52 128 | assert pl.vocab.get_vocab_size(CharFeatures.namespace) == 83 129 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/layouts/Layout.vue: -------------------------------------------------------------------------------- 1 | 53 | 54 | 160 | 175 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/public/assets/img/pytorch.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/styles/index.styl: -------------------------------------------------------------------------------- 1 | @require "./code-colors.styl" 2 | 3 | body 4 | font-family: $primaryFontFamily !important 5 | h1, h2, h3, h4 6 | font-family: $secondaryFontFamily 7 | h2 8 | border-bottom none 9 | dd, dt 10 | line-height 1.7em 11 | pre.title 12 | line-height: 0 !important 13 | padding: 0 !important 14 | margin: 0 !important 15 | background-color: transparent !important 16 | overflow: visible !important 17 | h2, h3 18 | margin-bottom: 0 !important 19 | .badge.green, .badge.tip 20 | background: $accentColor 21 | .custom-block.danger, .custom-block.tip, .custom-block.warning 22 | border-width 1px !important 23 | border-style solid !important 24 | background #FFFFFF !important 25 | color $textColor !important 26 | .custom-block.danger 27 | border-color $red 28 | .custom-block-title 29 | color $red 30 | .custom-block.tip 31 | border-color $green 32 | .custom-block-title 33 | color $green 34 | .custom-block.warning 35 | border-color $yellow 36 | .custom-block-title 37 | color $yellow 38 | .table-of-contents 39 | ul 40 | list-style none 41 | li 42 | padding-left 1em 43 | position relative 44 | li:before 45 | content "" 46 | height 4px 47 | width 4px 48 | border-radius 50% 49 | position absolute 50 | background $accentColor 51 | padding 1px 52 | margin-right 0.5em 53 | left 0 54 | top 12px 55 | .no-sidebar 56 | .navbar 57 | display: none 58 | .page p img:not(.icon) 59 | border 1px solid $borderColor 60 | .navbar .site-name span 61 | font-family 'Basis Grotesque Pro Light' !important 62 | font-weight lighter 63 | .sidebar 64 | background: $sidebarBgColor 65 | &__link 66 | display: block !important 67 | a 68 | display: block !important 69 | &__img 70 | width auto 71 | max-width 180px 72 | margin: 2em auto 1em auto 73 | display: block 74 | @media screen and (max-width: $MQNarrow) 75 | max-width 130px 76 | &-heading 77 | pointer-events: none 78 | font-family: $secondaryFontFamily 79 | &:after 80 | content: '' 81 | width: 35px 82 | height: 2px 83 | background: none 84 | display: block 85 | &.open 86 | color: $accentColor !important 87 | &:after 88 | background: $accentColor 89 | &-link 90 | border-left: none !important 91 | font-family: $secondaryFontFamily 92 | .theme-default-content code 93 | background-color: $codePillColor 94 | .go-to-top 95 | background: white 96 | height: 1.2rem !important 97 | width: 1.2rem !important 98 | border-radius: 13px 99 | padding: 0.8rem 100 | box-shadow: 0 2px 4px #929292 101 | .nav-links 102 | .nav-item:first-child 103 | text-transform: uppercase 104 | 
.nav-link 105 | font-weight: 600 !important 106 | font-family: $secondaryFontFamily 107 | .external__icon 108 | margin-left 0.3em 109 | .search-box input 110 | border-radius 0 !important 111 | .footer 112 | //position: absolute 113 | bottom: 1rem 114 | margin-top: 5rem 115 | text-align: center 116 | &__img 117 | max-width: 100px 118 | 119 | @media screen and (min-width: $MQNarrow) 120 | .navbar 121 | position: relative 122 | border-bottom: none 123 | // line-height: 4.4em !important 124 | margin-left: 320px 125 | .nav-links 126 | display flex 127 | width 100% 128 | align-items center 129 | margin-left 3em 130 | margin-right -1em 131 | .nav-item 132 | width: auto 133 | margin-left auto 134 | .nav-link.external 135 | font-size 12px 136 | .no-sidebar & 137 | max-width: 960px 138 | padding: 0 2.5rem 139 | margin: auto 140 | box-sizing: content-box; 141 | .links 142 | max-width: 100% 143 | margin-left: 0 144 | padding-left: 0 !important 145 | .home-link 146 | display: none 147 | .links 148 | position: relative !important 149 | right: auto !important 150 | max-width: 740px !important; 151 | margin: 1.4em auto 0.7em 4.5em; 152 | .page-nav 153 | margin-left: 4.5rem !important 154 | .sidebar 155 | top: 0 156 | z-index: 21 157 | overflow: visible 158 | border-right: none 159 | > ul 160 | overflow-y: auto 161 | max-height: calc(100% - 170px) 162 | > li:last-child 163 | margin-bottom 2em 164 | .sidebar-sub-headers .sidebar-sub-headers 165 | display none 166 | .active + .sidebar-sub-headers .sidebar-sub-headers 167 | display block 168 | .search-box input 169 | border-width: 2px !important 170 | border-radius: 2px !important 171 | color $accentColor !important 172 | min-width: 250px !important 173 | padding-left 0.5rem !important 174 | background-position calc(100% - 1rem) !important 175 | height 2.5rem !important 176 | border-color $sidebarBgColor !important 177 | &:focus 178 | border-color $accentColor !important 179 | .theme-default-content:not(.custom) 180 | max-width: 740px 181 | margin: 0 auto 182 | padding: 2rem 2.5rem 183 | margin-left: 4.5rem 184 | -------------------------------------------------------------------------------- /tests/text/test_features_transformers.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | from numpy.testing import assert_allclose 5 | 6 | from biome.text import Dataset 7 | from biome.text import Pipeline 8 | from biome.text import Trainer 9 | from biome.text import TrainerConfiguration 10 | from biome.text.features import TransformersFeatures 11 | 12 | 13 | @pytest.fixture 14 | def train_dataset() -> Dataset: 15 | """Creates the training dataset""" 16 | source = ( 17 | Path(__file__).parent.parent 18 | / "resources" 19 | / "data" 20 | / "emotions_with_transformers.txt" 21 | ) 22 | 23 | train_dataset = Dataset.from_csv( 24 | paths=str(source), delimiter=";", column_names=["text", "label"] 25 | ) 26 | return train_dataset 27 | 28 | 29 | @pytest.fixture 30 | def pipeline_dict() -> dict: 31 | """Creation of pipeline dictionary""" 32 | 33 | pipeline_dict = { 34 | "name": "emotions_with_transformers", 35 | "features": { 36 | "transformers": {"model_name": "sshleifer/tiny-distilbert-base-cased"} 37 | }, 38 | "head": { 39 | "type": "TextClassification", 40 | "labels": [ 41 | "anger", 42 | "fear", 43 | "joy", 44 | "love", 45 | "sadness", 46 | "surprise", 47 | ], 48 | "pooler": { 49 | "type": "bert_pooler", 50 | "pretrained_model": "sshleifer/tiny-distilbert-base-cased", 51 | 
"requires_grad": True, 52 | "dropout": 0.1, 53 | }, 54 | }, 55 | } 56 | 57 | return pipeline_dict 58 | 59 | 60 | @pytest.fixture 61 | def trainer_config(tmp_path) -> TrainerConfiguration: 62 | return TrainerConfiguration( 63 | batch_size=16, 64 | max_epochs=1, 65 | optimizer={ 66 | "type": "adam", 67 | "lr": 0.0001, 68 | }, 69 | gpus=0, 70 | default_root_dir=str(tmp_path), 71 | ) 72 | 73 | 74 | def test_pure_transformers(tmp_path, pipeline_dict, trainer_config, train_dataset): 75 | """Testing a Transformer training process and a model load""" 76 | 77 | pl = Pipeline.from_config(pipeline_dict) 78 | 79 | # Check a fixed vocabulary size for the model 80 | assert pl.backbone.vocab.get_vocab_size("transformers") == 28996 81 | 82 | pl.predict(text="test") 83 | 84 | output = tmp_path / "output" 85 | trainer = Trainer( 86 | pipeline=pl, train_dataset=train_dataset, trainer_config=trainer_config 87 | ) 88 | trainer.fit(output_dir=output) 89 | 90 | # Test vocabulary from a pretrained file 91 | pl = Pipeline.from_pretrained(str(output / "model.tar.gz")) 92 | 93 | # Check a fixed vocabulary size for the model after loading 94 | assert pl.backbone.vocab.get_vocab_size("transformers") == 28996 95 | 96 | 97 | def test_transformers_and_word(tmp_path, pipeline_dict, trainer_config, train_dataset): 98 | """Testing Transformer pipeline with an added word feature layer""" 99 | # Changing the pipeline to delete the BERT pooler and add a word feature 100 | del pipeline_dict["head"]["pooler"] 101 | pipeline_dict["features"].update( 102 | {"word": {"embedding_dim": 16, "lowercase_tokens": True}} 103 | ) 104 | 105 | pl = Pipeline.from_config(pipeline_dict) 106 | pl.predict(text="test") 107 | 108 | output = tmp_path / "output" 109 | trainer = Trainer( 110 | pipeline=pl, train_dataset=train_dataset, trainer_config=trainer_config 111 | ) 112 | trainer.fit(output_dir=output) 113 | 114 | # Check a fixed vocabulary size for the transformer and the word feature 115 | assert pl.backbone.vocab.get_vocab_size("transformers") == 28996 116 | assert pl.backbone.vocab.get_vocab_size("word") == 273 117 | 118 | # Test vocab from a pretrained file 119 | pl = Pipeline.from_pretrained(str(output / "model.tar.gz")) 120 | 121 | # Check a fixed vocabulary size for the transformer and the word feature after loading 122 | assert pl.backbone.vocab.get_vocab_size("transformers") == 28996 123 | assert pl.backbone.vocab.get_vocab_size("word") == 273 124 | 125 | 126 | def test_max_length_not_affecting_shorter_sequences(pipeline_dict): 127 | """Max length change should not affect at all previous shorter-length models""" 128 | 129 | pl = Pipeline.from_config(pipeline_dict) 130 | state_dict = pl._model.state_dict() # dict with the whole state of the module 131 | probs = pl.predict("Test this")["probabilities"] # probabilities of the test input 132 | 133 | pipeline_dict["features"]["transformers"]["max_length"] = 100 # changing max length 134 | pl = Pipeline.from_config(pipeline_dict) 135 | pl._model.load_state_dict(state_dict) # loading previous state from dict 136 | probs_max_length = pl.predict("Test this")["probabilities"] 137 | 138 | assert_allclose(probs, probs_max_length) 139 | 140 | 141 | def test_serialization(pipeline_dict): 142 | """Testing object saving. 
Model from the pipeline must be equal to the model from .json""" 143 | 144 | feature = TransformersFeatures(**pipeline_dict["features"]["transformers"]) 145 | assert feature == TransformersFeatures(**feature.to_json()) 146 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [master] 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | tests_docs: 12 | name: Run Tests & Build Docs 13 | runs-on: ubuntu-latest 14 | # make sure commands run in a bash shell 15 | defaults: 16 | run: 17 | shell: bash -l {0} 18 | steps: 19 | - name: Set BIOME_TEXT_DOC_VERSION for Release 🥦 20 | if: ${{ github.event_name == 'release' }} 21 | run: echo BIOME_TEXT_DOC_VERSION=${{ github.event.release.tag_name }} >> $GITHUB_ENV 22 | - name: Checkout Code 🛎 23 | uses: actions/checkout@v2 24 | - name: Setup Conda Env 🐍 25 | uses: conda-incubator/setup-miniconda@v2 26 | with: 27 | environment-file: environment_dev.yml 28 | activate-environment: biometext 29 | - name: Cache pip 👜 30 | uses: actions/cache@v2 31 | env: 32 | # Increase this value to reset cache if setup.py has not changed 33 | CACHE_NUMBER: 0 34 | with: 35 | path: ~/.cache/pip 36 | key: ${{ runner.os }}-pip-${{ env.CACHE_NUMBER }}-${{ hashFiles('setup.py') }} 37 | - name: Install Biome 🌿 38 | run: make dev 39 | - name: Linting 🍩 40 | # TODO: there is an issue with pylint and our CI, for now we only run our pre-commit hooks 41 | run: pre-commit run --all-files 42 | - name: Run Tests 📈 43 | run: make test 44 | - name: Build Docs 📘 45 | # build and zip the docs 46 | run: | 47 | make build_docs 48 | tar -czf docs_build_output.tar.gz docs/site 49 | - name: Upload Build Output 🍕 50 | if: ${{ github.event_name == 'push' || github.event_name == 'release' }} 51 | uses: actions/upload-artifact@v2 52 | with: 53 | name: docs_build_output 54 | path: docs_build_output.tar.gz 55 | 56 | deploy_docs: 57 | name: Deploy Docs 58 | runs-on: ubuntu-latest 59 | needs: tests_docs 60 | if: ${{ github.event_name == 'push' || github.event_name == 'release' }} 61 | env: 62 | BIOME_TEXT_DOC_VERSION: master 63 | # make sure commands run in a bash shell 64 | defaults: 65 | run: 66 | shell: bash -l {0} 67 | steps: 68 | - name: Set BIOME_TEXT_DOC_VERSION for Release 🥦 69 | if: ${{ github.event_name == 'release' }} 70 | run: echo BIOME_TEXT_DOC_VERSION=${{ github.event.release.tag_name }} >> $GITHUB_ENV 71 | - name: Checkout Code 🛎 72 | # Recommended and required by JamesIves/github-pages-deploy-action 73 | uses: actions/checkout@v2 74 | with: 75 | persist-credentials: false 76 | - name: Download Build Output 🧀 77 | uses: actions/download-artifact@v2 78 | with: 79 | name: docs_build_output 80 | - name: Extract Build Output 🍗 81 | run: tar -xzf docs_build_output.tar.gz 82 | - name: Deploy Docs 🚀 83 | uses: JamesIves/github-pages-deploy-action@3.7.1 84 | with: 85 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 86 | BRANCH: gh-pages # The branch the action should deploy to. 87 | FOLDER: docs/site # The folder the action should deploy. 
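# TARGET_FOLDER below places the docs in a per-version subfolder of gh-pages (e.g. /master/ or /v2.2.0/)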
88 | TARGET_FOLDER: /${{ env.BIOME_TEXT_DOC_VERSION }}/ 89 | CLEAN: true # Automatically remove deleted files from the deploy branch 90 | - name: Checkout gh-pages for Release 🛎 91 | if: ${{ github.event_name == 'release' }} 92 | uses: actions/checkout@v2 93 | with: 94 | ref: gh-pages 95 | - name: Update Versions and Index for Release 🍗 96 | if: ${{ github.event_name == 'release' }} 97 | run: | 98 | sed -i 's/master/master\n${{ env.BIOME_TEXT_DOC_VERSION }}/' versions.txt 99 | sed -i 's/biome-text\/.*\//biome-text\/${{ env.BIOME_TEXT_DOC_VERSION }}\//' index.html 100 | git config user.name github-actions 101 | git config user.email github-actions@github.com 102 | git add versions.txt index.html 103 | git commit -m "Update versions.txt and index.html due to new release" 104 | git push 105 | 106 | deploy_release: 107 | name: Deploy Release 108 | runs-on: ubuntu-latest 109 | if: ${{ github.event_name == 'release' }} 110 | needs: tests_docs 111 | defaults: 112 | run: 113 | shell: bash -l {0} 114 | steps: 115 | - name: Checkout Code 🛎 116 | uses: actions/checkout@v2 117 | - name: Setup Conda Env 🐍 118 | uses: conda-incubator/setup-miniconda@v2 119 | with: 120 | environment-file: environment_dev.yml 121 | activate-environment: biome 122 | - name: Build Package 🍟 123 | run: make dist 124 | - name: Publish Package to TestPyPI 🥪 125 | uses: pypa/gh-action-pypi-publish@master 126 | with: 127 | user: __token__ 128 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 129 | repository_url: https://test.pypi.org/legacy/ 130 | - name: Test Installing 🍿 131 | run: pip install --index-url https://test.pypi.org/simple --no-deps biome-text==${GITHUB_REF#refs/*/v} 132 | - name: Publish Package to PyPI 🥩 133 | uses: pypa/gh-action-pypi-publish@master 134 | with: 135 | user: __token__ 136 | password: ${{ secrets.PYPI_API_TOKEN }} 137 | -------------------------------------------------------------------------------- /src/biome/text/vocabulary.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manages vocabulary tasks and fetches vocabulary information 3 | 4 | Provides utilities for getting information from a given vocabulary. 5 | 6 | Provides management actions such as extending the labels, setting new labels or creating an "empty" vocab. 
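A minimal usage sketch (illustrative only; it assumes an existing `allennlp.data.Vocabulary` instance named `vocab` whose label namespace is still empty):

    from biome.text import vocabulary

    vocabulary.extend_labels(vocab, ["negative", "positive"])
    vocabulary.get_labels(vocab)  # -> ["negative", "positive"]
    vocabulary.index_for_label(vocab, "positive")  # -> 1, given the fresh namespace assumed above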
7 | """ 8 | import logging 9 | from typing import Dict 10 | from typing import List 11 | 12 | from allennlp.data import Vocabulary 13 | from allennlp.data.vocabulary import DEFAULT_NON_PADDED_NAMESPACES 14 | 15 | from biome.text.features import TransformersFeatures 16 | from biome.text.features import WordFeatures 17 | 18 | LABELS_NAMESPACE = "gold_labels" 19 | 20 | _LOGGER = logging.getLogger(__name__) 21 | 22 | 23 | def get_labels(vocab: Vocabulary) -> List[str]: 24 | """Gets list of labels in the vocabulary 25 | 26 | Parameters 27 | ---------- 28 | vocab: `allennlp.data.Vocabulary` 29 | 30 | Returns 31 | ------- 32 | labels: `List[str]` 33 | A list of label strings 34 | """ 35 | return [k for k in vocab.get_token_to_index_vocabulary(namespace=LABELS_NAMESPACE)] 36 | 37 | 38 | def label_for_index(vocab: Vocabulary, idx: int) -> str: 39 | """Gets label string for a label `int` id 40 | 41 | Parameters 42 | ---------- 43 | vocab: `allennlp.data.Vocabulary` 44 | idx: `int 45 | the token index 46 | 47 | Returns 48 | ------- 49 | label: `str` 50 | The string for a label id 51 | """ 52 | return vocab.get_token_from_index(idx, namespace=LABELS_NAMESPACE) 53 | 54 | 55 | def index_for_label(vocab: Vocabulary, label: str) -> int: 56 | """Gets the label `int` id for label string 57 | 58 | Parameters 59 | ---------- 60 | vocab: `allennlp.data.Vocabulary`` 61 | label: `str` 62 | the label 63 | 64 | Returns 65 | ------- 66 | label_idx: `int` 67 | The label id for label string 68 | """ 69 | return vocab.get_token_index(label, namespace=LABELS_NAMESPACE) 70 | 71 | 72 | def get_index_to_labels_dictionary(vocab: Vocabulary) -> Dict[int, str]: 73 | """Gets a dictionary for turning label `int` ids into label strings 74 | 75 | Parameters 76 | ---------- 77 | vocab: `allennlp.data.Vocabulary` 78 | 79 | Returns 80 | ------- 81 | labels: `Dict[int, str]` 82 | A dictionary to get fetch label strings from ids 83 | """ 84 | return vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE) 85 | 86 | 87 | def words_vocab_size(vocab: Vocabulary) -> int: 88 | """Fetches the vocabulary size for the `words` namespace 89 | 90 | Parameters 91 | ---------- 92 | vocab: `allennlp.data.Vocabulary` 93 | 94 | Returns 95 | ------- 96 | size: `int` 97 | The vocabulary size for the words namespace 98 | """ 99 | return vocab.get_vocab_size(WordFeatures.namespace) 100 | 101 | 102 | def extend_labels(vocab: Vocabulary, labels: List[str]): 103 | """Adds a list of label strings to the vocabulary 104 | 105 | Use this to add new labels to your vocabulary (e.g., useful for reusing the weights of an existing classifier) 106 | 107 | Parameters 108 | ---------- 109 | vocab: `allennlp.data.Vocabulary` 110 | labels: `List[str]` 111 | A list of strings containing the labels to add to an existing vocabulary 112 | """ 113 | vocab.add_tokens_to_namespace(labels, namespace=LABELS_NAMESPACE) 114 | 115 | 116 | def set_labels(vocab: Vocabulary, new_labels: List[str]): 117 | """Resets the labels in the vocabulary with a given labels string list 118 | 119 | Parameters 120 | ---------- 121 | vocab: `allennlp.data.Vocabulary` 122 | new_labels: `List[str]` 123 | The label strings to add to the vocabulary 124 | """ 125 | for namespace_vocab in [ 126 | vocab.get_token_to_index_vocabulary(LABELS_NAMESPACE), 127 | vocab.get_index_to_token_vocabulary(LABELS_NAMESPACE), 128 | ]: 129 | tokens = list(namespace_vocab.keys()) 130 | for token in tokens: 131 | del namespace_vocab[token] 132 | 133 | extend_labels(vocab, new_labels) 134 | 135 | 136 | def 
create_empty_vocabulary() -> Vocabulary: 137 | """Creates an empty Vocabulary with configured namespaces 138 | 139 | Returns 140 | ------- 141 | empty_vocab 142 | The transformers namespace is added to the `non_padded_namespaces`. 143 | """ 144 | # The following is a hack, because AllenNLP handles the Transformers vocab differently! 145 | # The transformer vocab has its own padding and oov token, so we add it to the non_padded_namespaces. 146 | # AllenNLP gives its "transformer vocab" by default the "tags" namespace, which is a non_padded_namespace ... 147 | # If we do not do this, then writing the vocab to a file and loading it will fail, since AllenNLP will 148 | # look for its default OOV token in the vocab unless it is flagged as a non_padded_namespace. 149 | # (see the doc string of `allennlp.data.token_indexers.PretrainedTransformerIndexer`) 150 | return Vocabulary( 151 | non_padded_namespaces=DEFAULT_NON_PADDED_NAMESPACES 152 | + (TransformersFeatures.namespace,) 153 | ) 154 | 155 | 156 | def is_empty(vocab: Vocabulary, namespaces: List[str]) -> bool: 157 | """Checks if at least one of the given namespaces has an empty vocab. 158 | 159 | Parameters 160 | ---------- 161 | vocab 162 | The vocabulary 163 | namespaces 164 | Namespaces to check in the vocabulary 165 | 166 | Returns 167 | ------- 168 | True if one or more namespaces have an empty vocab 169 | """ 170 | # If a namespace does not exist in the vocab, a default one is created on the fly with a padding and oov token 171 | # We must drop the padding and out of vocab (oov) tokens -> 2 tokens 172 | return any([vocab.get_vocab_size(namespace) < 3 for namespace in namespaces]) 173 | -------------------------------------------------------------------------------- /src/biome/text/cli/serve.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import List 3 | 4 | import click 5 | import uvicorn 6 | from allennlp.common.util import sanitize 7 | from click import Path 8 | from fastapi import FastAPI 9 | from fastapi import HTTPException 10 | from fastapi.exceptions import RequestValidationError 11 | from fastapi.responses import PlainTextResponse 12 | from pydantic import BaseConfig 13 | from pydantic import create_model 14 | from starlette.exceptions import HTTPException as StarletteHTTPException 15 | 16 | from biome.text import Pipeline 17 | 18 | 19 | @click.command() 20 | @click.argument("pipeline_path", type=Path(exists=True)) 21 | @click.option( 22 | "--port", 23 | "-p", 24 | type=int, 25 | default=9999, 26 | show_default=True, 27 | help="Port on which to serve the REST API.", 28 | ) 29 | @click.option( 30 | "--predictions_dir", 31 | "-pd", 32 | type=click.Path(), 33 | default=None, 34 | help="Path to log raw predictions from the service.", 35 | ) 36 | @click.option( 37 | "--host", 38 | type=str, 39 | default="0.0.0.0", 40 | help="Host of the underlying uvicorn server.", 41 | ) 42 | def serve(pipeline_path: str, port: int, predictions_dir: str, host: str) -> None: 43 | """Serves the pipeline predictions as a REST API 44 | 45 | PIPELINE_PATH is the path to a pretrained pipeline (model.tar.gz file).
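The service can also be started programmatically; a minimal sketch (the model path below is a placeholder, not a file shipped with this repository):

    from biome.text import Pipeline
    from biome.text.cli.serve import _serve

    _serve(Pipeline.from_pretrained("path/to/output/model.tar.gz"), port=9999)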
46 | """ 47 | pipeline = Pipeline.from_pretrained(pipeline_path) 48 | pipeline._model.eval() 49 | 50 | if predictions_dir: 51 | pipeline.init_prediction_logger(predictions_dir) 52 | 53 | return _serve(pipeline, port, host) 54 | 55 | 56 | def _serve(pipeline: Pipeline, port: int = 9999, host: str = "0.0.0.0"): 57 | """Serves an pipeline as rest api""" 58 | predict_parameters = inspect.signature(pipeline.predict).parameters 59 | model_parameters = { 60 | name: ( 61 | par.annotation, 62 | None, # We need a default value to allow for batch predictions! 63 | ) 64 | for name, par in predict_parameters.items() 65 | if par.default == inspect.Parameter.empty 66 | } 67 | optional_parameters = { 68 | name: (par.annotation, par.default) 69 | for name, par in predict_parameters.items() 70 | # The batch parameter needs an extra logic to allow for a proper BaseModel for it 71 | if par.default != inspect.Parameter.empty and name != "batch" 72 | } 73 | 74 | class Config(BaseConfig): 75 | extra = "forbid" 76 | 77 | ModelInput = create_model("ModelInput", **model_parameters, __config__=Config) 78 | PredictInput = create_model( 79 | "PredictInput", 80 | **model_parameters, 81 | batch=(List[ModelInput], None), 82 | **optional_parameters, 83 | __config__=Config, 84 | ) 85 | 86 | class http_error_handling: 87 | """Error handling for http error transcription""" 88 | 89 | def __enter__(self): 90 | pass 91 | 92 | def __exit__(self, exc_type, exc_val, exc_tb): 93 | if isinstance(exc_val, Exception): 94 | # Common http error handling 95 | raise HTTPException(status_code=500, detail=str(exc_val)) 96 | 97 | def make_app() -> FastAPI: 98 | app = FastAPI() 99 | 100 | error_msg = f"\nCheck the docs at '0.0.0.0:{port}/docs' for an example of a valid request body." 101 | 102 | @app.exception_handler(RequestValidationError) 103 | async def validation_exception_handler(request, exc): 104 | return PlainTextResponse(str(exc) + error_msg, status_code=400) 105 | 106 | @app.exception_handler(StarletteHTTPException) 107 | async def http_exception_handler(request, exc): 108 | if exc.status_code == 400: 109 | return PlainTextResponse( 110 | str(exc.detail) + error_msg, status_code=exc.status_code 111 | ) 112 | else: 113 | return PlainTextResponse(str(exc.detail), status_code=exc.status_code) 114 | 115 | @app.post("/predict", tags=["Pipeline"]) 116 | async def predict(predict_input: PredictInput): 117 | """Returns a prediction given some input data 118 | 119 | Parameters 120 | ---------- 121 | - **args/kwargs:** See the Example Value for the Request body below. 122 | If provided, the **batch** parameter will be ignored. 123 | - **batch:** A list of dictionaries that represents a batch of inputs. 124 | The dictionary keys must comply with the **args/kwargs**. 125 | Predicting batches should typically be faster than repeated calls with **args/kwargs**. 126 | - **add_tokens:** If true, adds a 'tokens' key in the prediction that contains the tokenized input. 127 | - **add_attributions:** If true, adds a 'attributions' key that contains attributions of the input to the prediction. 128 | - **attributions_kwargs:** This dict is directly passed on to the `TaskHead.compute_attributions()`. 129 | 130 | Returns 131 | ------- 132 | - **predictions:** A dictionary or a list of dictionaries containing the predictions and additional information. 
133 | """ 134 | with http_error_handling(): 135 | return sanitize( 136 | pipeline.predict(**predict_input.dict(skip_defaults=True)) 137 | ) 138 | 139 | @app.get("/config", tags=["Pipeline"]) 140 | async def config(): 141 | """The configuration of the pipeline""" 142 | with http_error_handling(): 143 | return pipeline.config.as_dict() 144 | 145 | @app.get("/_status", tags=["REST service"]) 146 | async def status(): 147 | with http_error_handling(): 148 | return {"ok": True} 149 | 150 | return app 151 | 152 | uvicorn.run(make_app(), host=host, port=port) 153 | -------------------------------------------------------------------------------- /tests/text/test_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pytest 5 | from pytorch_lightning.loggers import CSVLogger 6 | from pytorch_lightning.loggers import LoggerCollection 7 | from pytorch_lightning.loggers import MLFlowLogger 8 | from pytorch_lightning.loggers import TensorBoardLogger 9 | from pytorch_lightning.loggers import WandbLogger 10 | 11 | from biome.text import Dataset 12 | from biome.text import Pipeline 13 | from biome.text import Trainer 14 | from biome.text.configuration import TrainerConfiguration 15 | 16 | 17 | @pytest.fixture 18 | def dataset(resources_data_path) -> Dataset: 19 | return Dataset.from_csv( 20 | paths=str(resources_data_path / "business.cat.2k.valid.csv") 21 | ) 22 | 23 | 24 | @pytest.fixture 25 | def pipeline_dict() -> dict: 26 | return { 27 | "name": "german_business_names", 28 | "features": { 29 | "word": {"embedding_dim": 16, "lowercase_tokens": True}, 30 | }, 31 | "head": { 32 | "type": "TextClassification", 33 | "labels": [ 34 | "Unternehmensberatungen", 35 | "Friseure", 36 | "Tiefbau", 37 | "Dienstleistungen", 38 | "Gebrauchtwagen", 39 | "Restaurants", 40 | "Architekturbüros", 41 | "Elektriker", 42 | "Vereine", 43 | "Versicherungsvermittler", 44 | "Sanitärinstallationen", 45 | "Edv", 46 | "Maler", 47 | "Physiotherapie", 48 | "Werbeagenturen", 49 | "Apotheken", 50 | "Vermittlungen", 51 | "Hotels", 52 | "Autowerkstätten", 53 | "Elektrotechnik", 54 | "Allgemeinärzte", 55 | "Handelsvermittler Und -vertreter", 56 | ], 57 | }, 58 | } 59 | 60 | 61 | def test_default_root_dir(change_to_tmp_working_dir, pipeline_dict, dataset): 62 | pl = Pipeline.from_config(pipeline_dict) 63 | trainer = Trainer(pl, train_dataset=dataset) 64 | assert trainer.trainer.default_root_dir == str( 65 | change_to_tmp_working_dir / "training_logs" 66 | ) 67 | 68 | 69 | def test_deep_copy_of_trainer_config(pipeline_dict, dataset): 70 | pl = Pipeline.from_config(pipeline_dict) 71 | trainer_config = TrainerConfiguration() 72 | trainer = Trainer(pl, train_dataset=dataset, trainer_config=trainer_config) 73 | assert trainer_config is not trainer._trainer_config 74 | 75 | 76 | @pytest.mark.parametrize( 77 | "input_kwargs,expected_loggers", 78 | [ 79 | ({}, ["csv", "tensorboard", "wandb"]), 80 | ({"logger": False}, []), 81 | ( 82 | { 83 | "logger": MLFlowLogger( 84 | tracking_uri=os.path.join(tempfile.gettempdir(), "mlruns") 85 | ), 86 | "add_wandb_logger": False, 87 | }, 88 | ["csv", "tensorboard", "mlflow"], 89 | ), 90 | ( 91 | { 92 | "logger": [ 93 | MLFlowLogger( 94 | tracking_uri=os.path.join(tempfile.gettempdir(), "mlruns") 95 | ), 96 | CSVLogger(save_dir=tempfile.gettempdir()), 97 | ], 98 | "add_wandb_logger": False, 99 | "add_tensorboard_logger": False, 100 | }, 101 | ["csv", "mlflow"], 102 | ), 103 | ], 104 | ) 105 | def test_add_default_loggers( 
106 | input_kwargs, expected_loggers, pipeline_dict, dataset, tmp_path 107 | ): 108 | trainer_config = TrainerConfiguration( 109 | **input_kwargs, default_root_dir=str(tmp_path) 110 | ) 111 | trainer = Trainer( 112 | Pipeline.from_config(pipeline_dict), 113 | train_dataset=dataset, 114 | trainer_config=trainer_config, 115 | ) 116 | if input_kwargs.get("logger") is not False: 117 | assert isinstance(trainer.trainer.logger, LoggerCollection) 118 | assert len(trainer.trainer.logger.experiment) == len(expected_loggers) 119 | else: 120 | assert trainer._trainer_config.logger is False 121 | 122 | def loggers_include(logger_type) -> bool: 123 | return any( 124 | [ 125 | isinstance(logger, logger_type) 126 | for logger in trainer._trainer_config.logger 127 | ] 128 | ) 129 | 130 | for logger in expected_loggers: 131 | if logger == "csv": 132 | assert loggers_include(CSVLogger) 133 | if logger == "tensorboard": 134 | assert loggers_include(TensorBoardLogger) 135 | if logger == "wandb": 136 | assert loggers_include(WandbLogger) 137 | assert (tmp_path / "wandb").is_dir() 138 | if logger == "mlflow": 139 | assert loggers_include(MLFlowLogger) 140 | 141 | 142 | def test_pipeline_test(pipeline_dict, dataset, tmp_path): 143 | import json 144 | 145 | pl = Pipeline.from_config(pipeline_dict) 146 | trainer = Trainer(pl) 147 | first_metrics = trainer.test(dataset, output_dir=tmp_path, batch_size=16) 148 | assert "test_loss" in first_metrics 149 | 150 | assert (tmp_path / "metrics.json").is_file() 151 | with (tmp_path / "metrics.json").open() as file: 152 | assert "test_loss" in json.load(file) 153 | 154 | assert pl.evaluate(dataset)["test_loss"] == pytest.approx( 155 | first_metrics["test_loss"] 156 | ) 157 | 158 | 159 | def test_create_output_dir(pipeline_dict, dataset, tmp_path): 160 | config = TrainerConfiguration( 161 | logger=False, fast_dev_run=True, batch_size=1, max_epochs=1, gpus=0 162 | ) 163 | pipeline = Pipeline.from_config(pipeline_dict) 164 | trainer = Trainer(pipeline, train_dataset=dataset, trainer_config=config) 165 | 166 | output_dir = tmp_path / "test_this_non_existing_parent_dir" / "output" 167 | trainer.fit(output_dir=output_dir) 168 | 169 | assert output_dir.is_dir() 170 | -------------------------------------------------------------------------------- /docs/docs/.vuepress/theme/components/PageNav.vue: -------------------------------------------------------------------------------- --------------------------------------------------------------------------------